backend: initial commit

This commit is contained in:
Davide Scaini
2026-03-28 13:57:12 +01:00
commit 38c5423aeb
36 changed files with 2463 additions and 0 deletions
View File
+271
View File
@@ -0,0 +1,271 @@
"""bincio extract — CLI command."""
import json
import sys
from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path
from typing import Optional
import click
from rich.console import Console
from rich.progress import BarColumn, MofNCompleteColumn, Progress, TextColumn, TimeElapsedColumn
from bincio.extract.config import ExtractConfig, default_config, load_config
from bincio.extract.dedup import ActivityRecord, DedupIndex
from bincio.extract.metrics import compute
from bincio.extract.models import ParsedActivity
from bincio.extract.parsers.factory import is_supported, parse_file
from bincio.extract.strava_csv import StravaMetadata
from bincio.extract.writer import build_summary, make_activity_id, write_activity, write_index
# Shared rich console: every status line, warning and run summary printed by
# this module goes through it.
console = Console()
@click.command()
@click.option("--config", "config_path", type=click.Path(exists=True), default=None,
              help="Path to extract_config.yaml (default: ./extract_config.yaml).")
@click.option("--input", "input_dir", type=click.Path(exists=True), default=None,
              help="Input directory (overrides config).")
@click.option("--output", "output_dir", type=click.Path(), default=None,
              help="Output directory (overrides config).")
@click.option("--file", "single_file", type=click.Path(exists=True), default=None,
              help="Process a single file and print JSON to stdout.")
@click.option("--since", default=None, metavar="YYYY-MM-DD",
              help="Only process files modified after this date.")
@click.option("--workers", default=4, show_default=True,
              help="Number of parallel worker processes.")
def extract(
    config_path: Optional[str],
    input_dir: Optional[str],
    output_dir: Optional[str],
    single_file: Optional[str],
    since: Optional[str],
    workers: int,
) -> None:
    """Parse GPX/FIT/TCX files and write BAS JSON data store."""
    # NOTE: the docstring above doubles as the click help text, so the longer
    # description lives in comments.
    #
    # Flow: resolve the configuration, collect the input files, parse them in a
    # process pool, then (in the parent process) enrich, compute metrics,
    # deduplicate and write each activity.  Finally index.json is merged with
    # the summaries of earlier runs and the dedup index is saved.

    # ── single file mode ─────────────────────────────────────────────────────
    if single_file:
        _process_single(Path(single_file))
        return

    # ── load config ──────────────────────────────────────────────────────────
    cfg = _resolve_config(config_path, input_dir, output_dir)
    cfg.output_dir.mkdir(parents=True, exist_ok=True)

    # ── gather files ─────────────────────────────────────────────────────────
    files = _collect_files(cfg, since)
    if not files:
        console.print("[yellow]No supported files found.[/yellow]")
        return
    console.print(f"Found [bold]{len(files)}[/bold] activity files.")

    # ── Strava metadata ──────────────────────────────────────────────────────
    strava_meta: Optional[StravaMetadata] = None
    if cfg.metadata_csv and cfg.metadata_csv.exists():
        strava_meta = StravaMetadata(cfg.metadata_csv)
        console.print(f"Loaded Strava metadata from [cyan]{cfg.metadata_csv.name}[/cyan].")

    # ── dedup index ──────────────────────────────────────────────────────────
    dedup = DedupIndex(output_dir=cfg.output_dir)

    # ── process ──────────────────────────────────────────────────────────────
    summaries: list[dict] = []
    errors: list[tuple[Path, str]] = []
    skipped = 0
    # IDs that lost a canonical contest during this run; their summaries must
    # not survive in index.json.
    superseded: set[str] = set()
    owner = {"handle": cfg.owner_handle, "display_name": cfg.owner_display_name}

    with Progress(
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
        TimeElapsedColumn(),
        console=console,
    ) as progress:
        task = progress.add_task("Processing...", total=len(files))
        with ProcessPoolExecutor(max_workers=workers) as pool:
            futures = {pool.submit(_parse_worker, f): f for f in files}
            for future in as_completed(futures):
                path = futures[future]
                progress.advance(task)
                try:
                    activity = future.result()
                except Exception as exc:
                    # A file that fails to parse is reported at the end; it
                    # must not abort the whole run.
                    errors.append((path, str(exc)))
                    continue

                # ── incremental skip ──────────────────────────────────────
                if cfg.incremental and dedup.is_exact_duplicate(activity.source_hash):
                    skipped += 1
                    continue

                # ── enrich from Strava CSV ────────────────────────────────
                if strava_meta:
                    strava_meta.enrich(activity.source_file, activity)

                # ── compute metrics ───────────────────────────────────────
                metrics = compute(activity)

                # ── deduplication ─────────────────────────────────────────
                activity_id = make_activity_id(activity)
                source = _infer_source(activity)  # computed once, used twice below
                duplicate_of: Optional[str] = None
                near_dup_id = dedup.find_near_duplicate(
                    activity.started_at, metrics.distance_m
                )
                # When incremental mode is off, a re-processed file matches the
                # record it left in the index on a previous run.  Without the
                # inequality check it would be marked a duplicate of itself and
                # disappear from index.json.
                if near_dup_id and near_dup_id != activity_id:
                    if dedup.pick_canonical(near_dup_id, source) == "__new__":
                        # New one is better — mark existing as duplicate
                        dedup._records[near_dup_id].duplicate_of = activity_id
                        superseded.add(near_dup_id)
                    else:
                        duplicate_of = near_dup_id

                # ── write files ───────────────────────────────────────────
                written_id = write_activity(
                    activity, metrics, cfg.output_dir,
                    privacy=cfg.default_privacy,
                    duplicate_of=duplicate_of,
                    rdp_epsilon=cfg.track.rdp_epsilon,
                )

                # Register in dedup index
                dedup.register(ActivityRecord(
                    id=written_id,
                    source_hash=activity.source_hash,
                    started_at=activity.started_at,
                    distance_m=metrics.distance_m,
                    source=source,
                ))

                # Only canonical activities are listed in the index.
                if duplicate_of is None:
                    summaries.append(
                        build_summary(activity, metrics, written_id, cfg.default_privacy)
                    )

    # ── write index.json ──────────────────────────────────────────────────────
    # Merge with any existing summaries from previous incremental runs
    existing_index = _load_existing_summaries(cfg.output_dir)
    all_summaries = {s["id"]: s for s in existing_index}
    for s in summaries:
        all_summaries[s["id"]] = s
    # Records demoted to duplicates during this run are dropped from the index,
    # matching how new duplicates are never added to it.
    # NOTE(review): the demoted activity's own JSON file on disk is not
    # rewritten here — confirm whether the writer should update it.
    for stale_id in superseded:
        all_summaries.pop(stale_id, None)
    write_index(list(all_summaries.values()), cfg.output_dir, owner)
    dedup.save()

    # ── summary ───────────────────────────────────────────────────────────────
    console.print(
        f"\n[green]Done.[/green] "
        f"Processed [bold]{len(summaries)}[/bold] activities, "
        f"skipped [bold]{skipped}[/bold] (already up to date), "
        f"errors [bold]{len(errors)}[/bold]."
    )
    if errors:
        console.print("\n[red]Errors:[/red]")
        for path, msg in errors[:20]:
            console.print(f"  {path.name}: {msg}")
        if len(errors) > 20:
            console.print(f"  ... and {len(errors) - 20} more.")
# ── helpers ───────────────────────────────────────────────────────────────────
def _parse_worker(path: Path) -> ParsedActivity:
    """Parse one activity file; this is the callable submitted to the pool.

    The parser factory is imported inside the function body, so the import
    happens in whichever process executes the call.
    """
    from bincio.extract.parsers import factory

    return factory.parse_file(path)
def _process_single(path: Path) -> None:
    """Parse one file and print its summary as JSON on stdout.

    Nothing is written to the data store.  On any failure the error message is
    printed to stderr, so stdout stays either empty or valid JSON, and the
    process exits with status 1.

    Args:
        path: Activity file to parse.
    """
    try:
        activity = parse_file(path)
        metrics = compute(activity)
        summary = build_summary(activity, metrics, make_activity_id(activity))
        payload = json.dumps(summary, indent=2)
    except Exception as exc:
        # Top-level CLI boundary: report and exit instead of showing a traceback.
        Console(stderr=True).print(f"[red]Error:[/red] {exc}")
        sys.exit(1)
    click.echo(payload)
def _resolve_config(
    config_path: Optional[str],
    input_dir: Optional[str],
    output_dir: Optional[str],
) -> ExtractConfig:
    """Build the effective configuration from the CLI options.

    Source precedence: an explicit ``--config`` file, then
    ``./extract_config.yaml`` if it exists, then a default config built from
    ``--input``.  Whatever the source, ``--input`` and ``--output`` override
    the corresponding settings afterwards.

    Raises:
        click.UsageError: if no configuration source is available.
    """
    cli_input = Path(input_dir).expanduser() if input_dir else None
    cli_output = Path(output_dir).expanduser() if output_dir else None
    local_yaml = Path("extract_config.yaml")

    if config_path:
        cfg = load_config(Path(config_path))
    elif local_yaml.exists():
        cfg = load_config(local_yaml)
    elif cli_input is not None:
        fallback_output = Path("./bincio_data").expanduser()
        cfg = default_config(
            cli_input,
            cli_output if cli_output is not None else fallback_output,
        )
    else:
        raise click.UsageError(
            "Provide --config, --input, or an extract_config.yaml in the current directory."
        )

    # Command-line values always win over the loaded file.
    if cli_input is not None:
        cfg.input_dirs = [cli_input]
    if cli_output is not None:
        cfg.output_dir = cli_output
    return cfg
def _collect_files(cfg: ExtractConfig, since: Optional[str]) -> list[Path]:
    """Recursively collect supported activity files from the input directories.

    Args:
        cfg: Configuration providing ``input_dirs``.
        since: Optional ``YYYY-MM-DD`` date; files whose modification time is
            earlier than local midnight of that date are left out.

    Returns:
        Paths of all supported regular files, in directory-walk order.

    Raises:
        click.BadParameter: if ``since`` is not a valid ``YYYY-MM-DD`` date.
    """
    from datetime import datetime

    since_ts: Optional[float] = None
    if since:
        try:
            since_ts = datetime.strptime(since, "%Y-%m-%d").timestamp()
        except ValueError as exc:
            # Turn the parse failure into a normal CLI usage error.
            raise click.BadParameter(
                f"expected YYYY-MM-DD, got {since!r}", param_hint="--since"
            ) from exc

    files = []
    for d in cfg.input_dirs:
        if not d.exists():
            console.print(f"[yellow]Warning:[/yellow] input dir not found: {d}")
            continue
        for path in d.rglob("*"):
            if not path.is_file() or not is_supported(path):
                continue
            # `is not None` rather than truthiness: 0.0 is a valid timestamp.
            if since_ts is not None and path.stat().st_mtime < since_ts:
                continue
            files.append(path)
    return files
def _load_existing_summaries(output_dir: Path) -> list[dict]:
    """Return the activity summaries stored in ``<output_dir>/index.json``.

    Loading is best-effort: a missing, unreadable or malformed index yields an
    empty list, so the caller simply rebuilds the index from the current run.
    Entries that are not dicts or have no ``"id"`` key are dropped, because the
    caller keys the merged index on ``"id"``.

    Args:
        output_dir: Data-store directory that may contain ``index.json``.

    Returns:
        The usable summary dicts, possibly empty.
    """
    index_path = output_dir / "index.json"
    try:
        data = json.loads(index_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: missing or unreadable file.  ValueError: invalid JSON or
        # undecodable bytes (JSONDecodeError and UnicodeDecodeError).
        return []

    activities = data.get("activities") if isinstance(data, dict) else None
    if not isinstance(activities, list):
        return []
    return [entry for entry in activities if isinstance(entry, dict) and "id" in entry]
def _infer_source(activity: ParsedActivity) -> Optional[str]:
    """Guess the source label of an activity for dedup quality ranking.

    Checks run in order: a Strava ID means ``"strava_export"``; a file name
    containing "activity" with at least two dots means ``"karoo"``; otherwise
    the label follows the extension (optionally ``.gz``-compressed).  Returns
    ``None`` when nothing matches.
    """
    if activity.strava_id:
        return "strava_export"

    filename = activity.source_file.lower()
    if "activity" in filename and filename.count(".") >= 2:
        return "karoo"

    by_extension = (
        (".fit", "fit_file"),
        (".gpx", "gpx_file"),
        (".tcx", "tcx_file"),
    )
    for extension, label in by_extension:
        if filename.endswith((extension, extension + ".gz")):
            return label
    return None
+88
View File
@@ -0,0 +1,88 @@
"""Extract stage configuration — loaded from extract_config.yaml."""
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional
import yaml
@dataclass
class TrackConfig:
    """Track settings, filled from the ``track`` section of the config file."""

    # Name of the simplification method.
    simplify: str = "rdp"
    # Epsilon forwarded to the activity writer as ``rdp_epsilon``.
    rdp_epsilon: float = 0.0001
    # NOTE(review): presumably the timeseries sampling rate in Hz — no reader
    # of this field is visible here; confirm.
    timeseries_hz: int = 1
@dataclass
class SensorsConfig:
    """Per-sensor switches from the ``sensors`` config section; all on by default.

    NOTE(review): presumably these choose which channels are exported — the
    consumer is not visible here; confirm.
    """

    heart_rate: bool = True
    cadence: bool = True
    temperature: bool = True
    power: bool = True
@dataclass
class ClassifierConfig:
    """Settings from the ``classifier`` config section."""

    # Opt-in: the classifier stays off unless the config enables it.
    enabled: bool = False
@dataclass
class ExtractConfig:
    """Complete configuration of the extract stage."""

    # Directories to scan for activity files.
    input_dirs: list[Path]
    # Directory that receives the generated data store.
    output_dir: Path
    # Optional CSV file with extra activity metadata.
    metadata_csv: Optional[Path] = None
    # Privacy value applied to written activities.
    default_privacy: str = "public"
    sensors: SensorsConfig = field(default_factory=SensorsConfig)
    track: TrackConfig = field(default_factory=TrackConfig)
    classifier: ClassifierConfig = field(default_factory=ClassifierConfig)
    # When true, files whose hash is already in the dedup index are skipped.
    incremental: bool = True
    # Owner identity written alongside the index.
    owner_handle: str = "me"
    owner_display_name: str = "Me"
def load_config(path: Path) -> ExtractConfig:
    """Load an :class:`ExtractConfig` from a YAML file.

    Every section and key is optional; anything missing falls back to the same
    default the dataclasses declare.  An empty file, or a section header with
    nothing under it (which YAML parses as ``None``), is treated as "no
    settings" rather than crashing.

    Args:
        path: Location of the YAML configuration file.

    Returns:
        The populated configuration.

    Raises:
        ValueError: if the top-level YAML value is not a mapping.
    """
    raw = yaml.safe_load(path.read_text()) or {}
    if not isinstance(raw, dict):
        raise ValueError(f"{path}: top-level YAML value must be a mapping")

    def section(name: str) -> dict:
        # `input:` with no children parses as None; normalise it to {}.
        return raw.get(name) or {}

    inp = section("input")
    dirs = [Path(d).expanduser() for d in inp.get("dirs") or []]
    csv_path = inp.get("metadata_csv")
    out = Path(section("output").get("dir", "./bincio_data")).expanduser()
    owner = section("owner")

    sensors_raw = section("sensors")
    sensors = SensorsConfig(
        heart_rate=sensors_raw.get("heart_rate", True),
        cadence=sensors_raw.get("cadence", True),
        temperature=sensors_raw.get("temperature", True),
        power=sensors_raw.get("power", True),
    )

    track_raw = section("track")
    track = TrackConfig(
        simplify=track_raw.get("simplify", "rdp"),
        rdp_epsilon=track_raw.get("rdp_epsilon", 0.0001),
        timeseries_hz=track_raw.get("timeseries_hz", 1),
    )

    classifier = ClassifierConfig(enabled=section("classifier").get("enabled", False))

    return ExtractConfig(
        input_dirs=dirs,
        output_dir=out,
        metadata_csv=Path(csv_path).expanduser() if csv_path else None,
        default_privacy=raw.get("default_privacy", "public"),
        sensors=sensors,
        track=track,
        classifier=classifier,
        incremental=raw.get("incremental", True),
        owner_handle=owner.get("handle", "me"),
        owner_display_name=owner.get("display_name", "Me"),
    )
def default_config(input_dir: Path, output_dir: Path) -> ExtractConfig:
    """Return a config with one input directory and defaults for everything else."""
    cfg = ExtractConfig(output_dir=output_dir, input_dirs=[input_dir])
    return cfg
+127
View File
@@ -0,0 +1,127 @@
"""Duplicate activity detection.
Two kinds of duplicates:
1. Exact duplicate — same source_hash. Skip entirely.
2. Near-duplicate — same ride recorded by two devices / exported from two
platforms. Detected by (started_at ± 5 min) AND (distance ± 5%).
The "better" source wins; the other gets duplicate_of set.
The deduplication index is a JSON file persisted in the output directory so
that incremental runs don't re-evaluate already-resolved pairs.
"""
import json
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional
# File name of the persisted dedup index inside the output directory.
_INDEX_FILE = ".bincio_cache.json"

# Source quality ranking (higher = preferred when deduplicating).
# Listed from most to least preferred; unknown sources are ranked 0 by
# DedupIndex.pick_canonical.
_SOURCE_QUALITY: dict[str, int] = {
    "karoo": 5,
    "fit_file": 4,
    "garmin_connect": 4,
    "strava_export": 3,
    "wahoo": 3,
    "gpx_file": 2,
    "komoot": 2,
    "tcx_file": 1,
    "manual": 0,
}
@dataclass
class ActivityRecord:
    """Minimal record stored in the dedup index."""

    # Activity ID; the index is keyed on it.
    id: str
    # Hash of the source file, used for exact-duplicate lookup.
    source_hash: str
    # Start time, compared against other records for near-duplicate detection.
    started_at: datetime
    # Total distance in metres, or None when unknown.
    distance_m: Optional[float]
    # Source label (a key of the source-quality ranking), or None.
    source: Optional[str]
    # ID of the canonical activity when this record is a duplicate.
    duplicate_of: Optional[str] = None
@dataclass
class DedupIndex:
    """Index of known activities, persisted as JSON in the output directory.

    Records are loaded on construction and written back by :meth:`save`.  The
    index answers two questions: "has this exact file been processed?" (by
    source hash) and "is there a record of the same ride?" (by start time and
    distance).
    """

    output_dir: Path
    # activity id → record
    _records: dict[str, ActivityRecord] = field(default_factory=dict)
    # source_hash → id, for exact-duplicate lookup
    _by_hash: dict[str, str] = field(default_factory=dict)

    def __post_init__(self) -> None:
        self._load()

    def _load(self) -> None:
        """Populate the index from the index file, if it exists."""
        p = self.output_dir / _INDEX_FILE
        if not p.exists():
            return
        data = json.loads(p.read_text())
        for item in data.get("activities", []):
            started_at = datetime.fromisoformat(item["started_at"])
            r = ActivityRecord(
                id=item["id"],
                source_hash=item["source_hash"],
                started_at=started_at,
                distance_m=item.get("distance_m"),
                source=item.get("source"),
                duplicate_of=item.get("duplicate_of"),
            )
            self._records[r.id] = r
            self._by_hash[r.source_hash] = r.id

    def save(self) -> None:
        """Write every record to the index file in the output directory."""
        p = self.output_dir / _INDEX_FILE
        data = {
            "activities": [
                {
                    "id": r.id,
                    "source_hash": r.source_hash,
                    "started_at": r.started_at.isoformat(),
                    "distance_m": r.distance_m,
                    "source": r.source,
                    "duplicate_of": r.duplicate_of,
                }
                for r in self._records.values()
            ]
        }
        p.write_text(json.dumps(data, indent=2))

    def is_exact_duplicate(self, source_hash: str) -> Optional[str]:
        """Return existing activity ID if hash is already in the index."""
        return self._by_hash.get(source_hash)

    def find_near_duplicate(
        self,
        started_at: datetime,
        distance_m: Optional[float],
        exclude_id: Optional[str] = None,
    ) -> Optional[str]:
        """Return ID of a near-duplicate if one exists.

        A record matches when it is not itself marked as a duplicate, starts
        within 5 minutes of ``started_at`` and its distance is within 5% of
        ``distance_m`` (relative to the larger of the two).  Two zero-length
        activities count as equal in distance.

        Args:
            started_at: Start time of the candidate activity.
            distance_m: Distance of the candidate; ``None`` never matches.
            exclude_id: Optional record ID to ignore, so a re-processed
                activity does not match its own earlier record.

        Returns:
            The ID of the first matching record, or ``None``.
        """
        if distance_m is None:
            return None
        for r in self._records.values():
            if r.id == exclude_id:
                continue
            if r.duplicate_of is not None:
                continue  # skip already-marked duplicates
            if r.distance_m is None:
                continue
            if abs((r.started_at - started_at).total_seconds()) > 5 * 60:
                continue
            ref = max(distance_m, r.distance_m)
            # ref == 0 means both distances are zero: identical, and dividing
            # by it would raise ZeroDivisionError.
            if ref <= 0 or abs(distance_m - r.distance_m) / ref < 0.05:
                return r.id
        return None

    def register(self, record: ActivityRecord) -> None:
        """Add ``record`` to the index, replacing any record with the same ID."""
        self._records[record.id] = record
        self._by_hash[record.source_hash] = record.id

    def pick_canonical(self, existing_id: str, new_source: Optional[str]) -> str:
        """Return the ID of whichever record should be canonical.

        Returns the sentinel ``"__new__"`` when the new source ranks strictly
        higher than the existing record's source; otherwise ``existing_id``.
        Unknown or missing sources rank 0.
        """
        existing = self._records[existing_id]
        existing_q = _SOURCE_QUALITY.get(existing.source or "", 0)
        new_q = _SOURCE_QUALITY.get(new_source or "", 0)
        # New record is strictly better → existing becomes the duplicate
        if new_q > existing_q:
            return "__new__"
        return existing_id
+210
View File
@@ -0,0 +1,210 @@
"""Compute aggregated metrics from a ParsedActivity.
All calculations are self-contained — no external state needed.
"""
import math
from dataclasses import dataclass
from datetime import datetime
from typing import Optional
from geopy.distance import geodesic
from bincio.extract.models import DataPoint, ParsedActivity
# Speed below which we consider the athlete stopped (km/h).
# _moving_stats only counts intervals at or above this speed towards moving
# time and moving distance.
_STOPPED_THRESHOLD_KMH = 1.0
@dataclass
class ComputedMetrics:
    """Aggregated metrics of one activity; every field is None when unknown."""

    # Totals
    distance_m: Optional[float]
    duration_s: Optional[int]
    moving_time_s: Optional[int]
    elevation_gain_m: Optional[float]
    elevation_loss_m: Optional[float]
    # Speed (avg is taken over moving time)
    avg_speed_kmh: Optional[float]
    max_speed_kmh: Optional[float]
    # Sensors
    avg_hr_bpm: Optional[int]
    max_hr_bpm: Optional[int]
    avg_cadence_rpm: Optional[int]
    avg_power_w: Optional[int]
    max_power_w: Optional[int]
    # Geography
    bbox: Optional[tuple[float, float, float, float]]  # min_lon, min_lat, max_lon, max_lat
    start_latlng: Optional[tuple[float, float]]
    end_latlng: Optional[tuple[float, float]]
def compute(activity: ParsedActivity) -> ComputedMetrics:
    """Aggregate the samples of ``activity`` into a :class:`ComputedMetrics`.

    An activity without samples yields a record whose fields are all None.
    Elevation figures are rounded to 0.1 m (loss is reported as a positive
    number) and speeds to 0.01 km/h.
    """
    samples = activity.points
    if not samples:
        return _empty()

    moving_time_s, moving_kmh = _moving_stats(samples)
    climbed, descended = _elevation(samples)
    top_speed = _max_speed(samples)
    mean_hr, peak_hr = _hr_stats(samples)
    first_fix, last_fix = _endpoints(samples)

    if climbed is not None:
        climbed = round(climbed, 1)
    if descended is not None:
        descended = round(abs(descended), 1)

    return ComputedMetrics(
        distance_m=_distance(samples),
        duration_s=_duration(samples),
        moving_time_s=moving_time_s,
        elevation_gain_m=climbed,
        elevation_loss_m=descended,
        avg_speed_kmh=round(moving_kmh, 2) if moving_kmh else None,
        max_speed_kmh=round(top_speed, 2) if top_speed else None,
        avg_hr_bpm=mean_hr,
        max_hr_bpm=peak_hr,
        avg_cadence_rpm=_avg_nonnull([p.cadence_rpm for p in samples]),
        avg_power_w=_avg_nonnull([p.power_w for p in samples]),
        max_power_w=_max_nonnull([p.power_w for p in samples]),
        bbox=_bbox(samples),
        start_latlng=first_fix,
        end_latlng=last_fix,
    )
# ── helpers ──────────────────────────────────────────────────────────────────
def _duration(pts: list[DataPoint]) -> Optional[int]:
    """Return whole seconds between the first and last sample, or None if fewer than two."""
    if len(pts) >= 2:
        first, last = pts[0], pts[-1]
        elapsed = last.timestamp - first.timestamp
        return int(elapsed.total_seconds())
    return None
def _distance(pts: list[DataPoint]) -> Optional[float]:
    """Prefer device-recorded cumulative distance; fall back to GPS geodesic.

    The device value is the last non-null cumulative ``distance_m``.  The GPS
    fallback sums geodesic distances between consecutive *located* samples:
    samples without a fix are dropped first, so a GPS dropout is bridged
    instead of silently losing the distance on both sides of the gap.

    Returns:
        Distance in metres rounded to 0.1, or None when there is neither a
        device distance nor at least two GPS fixes.
    """
    for p in reversed(pts):
        if p.distance_m is not None:
            return round(p.distance_m, 1)

    # GPS fallback
    fixes = [(p.lat, p.lon) for p in pts if p.lat is not None and p.lon is not None]
    if len(fixes) < 2:
        return None
    total = 0.0
    for here, there in zip(fixes, fixes[1:]):
        total += geodesic(here, there).meters
    return round(total, 1)
def _moving_stats(pts: list[DataPoint]) -> tuple[Optional[int], Optional[float]]:
    """Return (moving_time_s, avg_speed_kmh_over_moving_time).

    Each interval between consecutive samples gets a speed: from the GPS
    positions when both samples have a fix, otherwise from the device speed of
    the first sample.  Intervals at or above the stopped threshold count as
    moving.  Interval durations are accumulated as floats and truncated once
    at the end, so recordings sampled faster than 1 Hz are not rounded down to
    zero and the average speed is not inflated.

    Returns:
        ``(None, None)`` when less than one whole second of movement was
        found; otherwise the moving time in whole seconds and the average
        speed in km/h over the un-truncated moving time.
    """
    moving_s = 0.0
    moving_dist_m = 0.0

    for a, b in zip(pts, pts[1:]):
        dt = (b.timestamp - a.timestamp).total_seconds()
        if dt <= 0:
            continue  # duplicate or out-of-order timestamps carry no duration

        if a.lat is not None and a.lon is not None and b.lat is not None and b.lon is not None:
            # Compute speed for this interval from GPS
            seg_m = geodesic((a.lat, a.lon), (b.lat, b.lon)).meters
            seg_kmh = (seg_m / dt) * 3.6
        elif a.speed_kmh is not None:
            # No fix: fall back to the device-reported speed
            seg_kmh = a.speed_kmh
            seg_m = (seg_kmh / 3.6) * dt
        else:
            continue

        if seg_kmh >= _STOPPED_THRESHOLD_KMH:
            moving_s += dt
            moving_dist_m += seg_m

    whole_seconds = int(moving_s)
    if whole_seconds == 0:
        return None, None
    avg_kmh = (moving_dist_m / moving_s) * 3.6
    return whole_seconds, avg_kmh
def _elevation(pts: list[DataPoint]) -> tuple[Optional[float], Optional[float]]:
    """Return (total ascent, total descent) in metres over the elevation samples.

    Samples without elevation are ignored.  Descent is accumulated as a
    non-positive number.  Fewer than two usable samples gives ``(None, None)``.
    """
    profile = [p.elevation_m for p in pts if p.elevation_m is not None]
    if len(profile) < 2:
        return None, None

    climbed = 0.0
    dropped = 0.0
    previous = profile[0]
    for current in profile[1:]:
        delta = current - previous
        if delta > 0:
            climbed += delta
        else:
            dropped += delta
        previous = current
    return climbed, dropped
def _max_speed(pts: list[DataPoint]) -> Optional[float]:
    """Return the maximum speed in km/h, or None when it cannot be determined.

    Device-reported speeds are preferred.  Without any, the speed is derived
    from consecutive samples that both have a complete GPS fix (latitude *and*
    longitude) and a positive time difference.
    """
    # Prefer device speed; fall back to GPS-derived
    device_speeds = [p.speed_kmh for p in pts if p.speed_kmh is not None]
    if device_speeds:
        return max(device_speeds)

    # GPS-derived max
    gps_speeds = []
    for a, b in zip(pts, pts[1:]):
        # A fix is only usable when both coordinates are present.
        if a.lat is None or a.lon is None or b.lat is None or b.lon is None:
            continue
        dt = (b.timestamp - a.timestamp).total_seconds()
        if dt <= 0:
            continue
        m = geodesic((a.lat, a.lon), (b.lat, b.lon)).meters
        gps_speeds.append((m / dt) * 3.6)
    return max(gps_speeds) if gps_speeds else None
def _hr_stats(pts: list[DataPoint]) -> tuple[Optional[int], Optional[int]]:
    """Return (average, maximum) heart rate; the average is truncated to int."""
    rates = [p.hr_bpm for p in pts if p.hr_bpm is not None]
    if rates:
        mean = int(sum(rates) / len(rates))
        return mean, max(rates)
    return None, None
def _avg_nonnull(values: list) -> Optional[int]:
    """Return the mean of the non-None values truncated to int, or None if there are none."""
    present = [item for item in values if item is not None]
    if not present:
        return None
    return int(sum(present) / len(present))
def _max_nonnull(values: list) -> Optional[int]:
    """Return the largest non-None value, or None if there are none."""
    present = [item for item in values if item is not None]
    if not present:
        return None
    return max(present)
def _bbox(pts: list[DataPoint]) -> Optional[tuple[float, float, float, float]]:
    """Return the bounding box as (min_lon, min_lat, max_lon, max_lat).

    Only samples with both latitude and longitude contribute, so a sample
    carrying just one coordinate can neither skew the box nor cause ``min()``
    to be called on an empty sequence.  Returns None when no sample has a
    complete fix.
    """
    fixes = [(p.lat, p.lon) for p in pts if p.lat is not None and p.lon is not None]
    if not fixes:
        return None
    lats, lons = zip(*fixes)
    return (min(lons), min(lats), max(lons), max(lats))
def _endpoints(
    pts: list[DataPoint],
) -> tuple[Optional[tuple[float, float]], Optional[tuple[float, float]]]:
    """Return the first and last (lat, lon) fix, or (None, None) without any fix."""
    fixes = [(p.lat, p.lon) for p in pts if p.lat is not None and p.lon is not None]
    return (fixes[0], fixes[-1]) if fixes else (None, None)
def _empty() -> ComputedMetrics:
    """Return a metrics record with every field set to None."""
    field_names = (
        "distance_m", "duration_s", "moving_time_s",
        "elevation_gain_m", "elevation_loss_m",
        "avg_speed_kmh", "max_speed_kmh",
        "avg_hr_bpm", "max_hr_bpm",
        "avg_cadence_rpm", "avg_power_w", "max_power_w",
        "bbox", "start_latlng", "end_latlng",
    )
    return ComputedMetrics(**dict.fromkeys(field_names))
+58
View File
@@ -0,0 +1,58 @@
"""Core data models for the extract stage.
ParsedActivity is the internal representation produced by parsers.
It gets fed into metrics computation and the BAS JSON writer.
"""
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional
@dataclass
class DataPoint:
    """One measurement sample from a GPS/sensor recording.

    Only the timestamp is mandatory; every other channel is None when the
    recording did not provide it.
    """

    timestamp: datetime
    # Position and altitude
    lat: Optional[float] = None
    lon: Optional[float] = None
    elevation_m: Optional[float] = None
    # Sensors
    hr_bpm: Optional[int] = None
    cadence_rpm: Optional[int] = None
    # Speed from device (km/h). May be absent; we compute it from GPS if so.
    speed_kmh: Optional[float] = None
    power_w: Optional[int] = None
    temperature_c: Optional[float] = None
    # Cumulative distance from device (metres), if recorded.
    distance_m: Optional[float] = None
@dataclass
class LapData:
    """Summary of one lap; optional figures are None when not available."""

    # Zero-based position of the lap within the activity.
    index: int
    started_at: datetime
    duration_s: Optional[int] = None
    distance_m: Optional[float] = None
    elevation_gain_m: Optional[float] = None
    avg_speed_kmh: Optional[float] = None
    avg_hr_bpm: Optional[int] = None
    avg_power_w: Optional[int] = None
@dataclass
class ParsedActivity:
    """Raw activity data as produced by a parser, before metric computation."""

    # Required
    points: list[DataPoint]
    sport: str  # normalised to BAS sport enum
    started_at: datetime
    source_file: str  # basename of original file
    source_hash: str  # "sha256:{hex}"
    # Optional descriptive metadata
    sub_sport: Optional[str] = None
    device: Optional[str] = None
    title: Optional[str] = None
    description: Optional[str] = None
    gear: Optional[str] = None
    strava_id: Optional[str] = None
    # Each instance gets its own list.
    laps: list[LapData] = field(default_factory=list)
View File
+34
View File
@@ -0,0 +1,34 @@
"""Abstract base class for all activity parsers."""
import gzip
import hashlib
from abc import ABC, abstractmethod
from pathlib import Path
from bincio.extract.models import ParsedActivity
class BaseParser(ABC):
    """Common interface for activity parsers, plus file-reading helpers."""

    @abstractmethod
    def parse(self, path: Path, raw_bytes: bytes) -> ParsedActivity:
        """Parse activity from raw file bytes.

        The bytes are read (and decompressed) before this is called, so the
        factory can hash the file once and parsers never deal with gzip.
        """

    @staticmethod
    def _sha256(data: bytes) -> str:
        """Return the SHA-256 of ``data`` as ``"sha256:<hex>"``."""
        digest = hashlib.sha256(data).hexdigest()
        return f"sha256:{digest}"

    @staticmethod
    def _read_file(path: Path) -> tuple[bytes, bytes]:
        """Return (raw_bytes, decompressed_bytes).

        The first element is the file content exactly as stored (used for
        hashing); the second is what parsers should parse.  They are the same
        object unless the file name ends in ``.gz``.
        """
        original = path.read_bytes()
        payload = gzip.decompress(original) if path.suffix == ".gz" else original
        return original, payload
+46
View File
@@ -0,0 +1,46 @@
"""Parser factory — selects the right parser based on file extension."""
from pathlib import Path
from bincio.extract.models import ParsedActivity
from bincio.extract.parsers.base import BaseParser
from bincio.extract.parsers.fit import FitParser
from bincio.extract.parsers.gpx import GpxParser
from bincio.extract.parsers.tcx import TcxParser
# Parser class for each base (uncompressed) extension.
_PARSERS: dict[str, type[BaseParser]] = {
    ".fit": FitParser,
    ".gpx": GpxParser,
    ".tcx": TcxParser,
}

# Supported extensions (including .gz variants)
SUPPORTED = {ext + tail for ext in _PARSERS for tail in ("", ".gz")}
def _base_ext(path: Path) -> str:
    """Return the meaningful extension in lower case, stripping .gz if present.

    ``ride.fit.gz`` gives ``".fit"`` and ``RIDE.FIT`` also gives ``".fit"``:
    devices and exporters are not consistent about letter case, so the result
    is normalised before it is used as a parser lookup key.  Only a lower-case
    ``.gz`` suffix is stripped, which is the same spelling the file reader
    treats as gzip.
    """
    target = path
    if target.suffix == ".gz":
        target = Path(target.stem)  # e.g. "ride.fit" from "ride.fit.gz"
    return target.suffix.lower()
def is_supported(path: Path) -> bool:
    """Return True when ``parse_file`` has a parser for this file name.

    The decision is delegated to ``_base_ext`` and the parser table, i.e. the
    very lookup ``parse_file`` performs, so the two can never disagree about
    which files are accepted.
    """
    return _base_ext(path) in _PARSERS
def parse_file(path: Path) -> ParsedActivity:
    """Parse an activity file, handling .gz transparently.

    Raises:
        ValueError: if no parser is registered for the file's extension.
    """
    parser_cls = _PARSERS.get(_base_ext(path))
    if parser_cls is None:
        raise ValueError(f"Unsupported file type: {path.name!r}")

    original, payload = BaseParser._read_file(path)
    activity = parser_cls().parse(path, payload)

    # Identify the activity by its file: the hash covers the *original* bytes
    # (still compressed for .gz files) and is what dedup compares.
    activity.source_hash = BaseParser._sha256(original)
    activity.source_file = path.name
    return activity
+133
View File
@@ -0,0 +1,133 @@
"""FIT file parser (Garmin binary format)."""
from datetime import timezone
from pathlib import Path
from typing import Any
import fitdecode
from bincio.extract.models import DataPoint, LapData, ParsedActivity
from bincio.extract.sport import normalise_sport
class FitParser:
    """Parser for FIT (Garmin binary format) activity files."""

    def parse(self, path: Path, raw_bytes: bytes) -> ParsedActivity:
        """Decode a FIT file into a :class:`ParsedActivity`.

        Reads ``sport``, ``device_info``, ``record`` and ``lap`` messages;
        everything else is ignored.  ``source_hash`` is left empty.

        Args:
            path: Original file path; only its name is used.
            raw_bytes: Decompressed FIT file content.

        Raises:
            ValueError: if the file contains no timestamped record messages.
        """
        import io

        points: list[DataPoint] = []
        laps: list[LapData] = []
        sport: str = "cycling"
        sub_sport: str | None = None
        device: str | None = None

        with fitdecode.FitReader(io.BytesIO(raw_bytes)) as fit:
            for frame in fit:
                if not isinstance(frame, fitdecode.FitDataMessage):
                    continue

                if frame.name == "sport":
                    sport = normalise_sport(_get(frame, "sport", "cycling"))
                    sub_sport = _normalise_sub_sport(_get(frame, "sub_sport"))
                elif frame.name == "device_info":
                    # Keep the previous value when this message names no product.
                    device = self._device_name(frame) or device
                elif frame.name == "record":
                    point = self._record_to_point(frame)
                    if point is not None:
                        points.append(point)
                elif frame.name == "lap":
                    lap = self._lap_to_data(frame, index=len(laps))
                    if lap is not None:
                        laps.append(lap)

        if not points:
            raise ValueError(f"No record messages found in {path.name}")

        return ParsedActivity(
            points=points,
            sport=sport,
            sub_sport=sub_sport,
            started_at=points[0].timestamp,
            device=device,
            laps=laps,
            source_file=path.name,
            source_hash="",
        )

    @staticmethod
    def _as_utc(ts: Any) -> Any:
        """Attach UTC to a naive datetime; return anything else unchanged."""
        if hasattr(ts, "tzinfo") and ts.tzinfo is None:
            return ts.replace(tzinfo=timezone.utc)
        return ts

    @staticmethod
    def _device_name(frame: Any) -> str | None:
        """Return "<manufacturer> <product>" (or just the product) from a device_info message."""
        mfr = _get(frame, "manufacturer")
        prod = _get(frame, "product_name") or _get(frame, "garmin_product")
        if mfr and prod:
            return f"{mfr} {prod}"
        if prod:
            return str(prod)
        return None

    @staticmethod
    def _record_to_point(frame: Any) -> DataPoint | None:
        """Convert a record message to a DataPoint; None when it has no timestamp."""
        ts = _get(frame, "timestamp")
        if ts is None:
            return None
        speed_raw = _get(frame, "speed")  # m/s
        return DataPoint(
            timestamp=FitParser._as_utc(ts),
            lat=_semicircles_to_deg(_get(frame, "position_lat")),
            lon=_semicircles_to_deg(_get(frame, "position_long")),
            elevation_m=_get(frame, "altitude"),
            hr_bpm=_get(frame, "heart_rate"),
            cadence_rpm=_get(frame, "cadence"),
            speed_kmh=speed_raw * 3.6 if speed_raw is not None else None,
            power_w=_get(frame, "power"),
            temperature_c=_get(frame, "temperature"),
            distance_m=_get(frame, "distance"),
        )

    @staticmethod
    def _lap_to_data(frame: Any, index: int) -> LapData | None:
        """Convert a lap message to LapData; None when it has no start time."""
        ts = _get(frame, "start_time")
        if ts is None:
            return None
        elapsed = _get(frame, "total_elapsed_time")
        speed_raw = _get(frame, "avg_speed")  # m/s
        return LapData(
            index=index,
            started_at=FitParser._as_utc(ts),
            # `is not None`, as for record speed: a measured 0 is data, not
            # a missing value.
            duration_s=int(elapsed) if elapsed is not None else None,
            distance_m=_get(frame, "total_distance"),
            elevation_gain_m=_get(frame, "total_ascent"),
            avg_speed_kmh=speed_raw * 3.6 if speed_raw is not None else None,
            avg_hr_bpm=_get(frame, "avg_heart_rate"),
            avg_power_w=_get(frame, "avg_power"),
        )
def _get(frame: fitdecode.FitDataMessage, field: str, default: Any = None) -> Any:
    """Return the value of ``field`` in ``frame``, or ``default`` when the lookup raises KeyError."""
    try:
        value = frame.get_value(field)
    except KeyError:
        value = default
    return value
def _semicircles_to_deg(value: Any) -> float | None:
    """Convert a FIT semicircle coordinate to degrees.

    Returns None for a missing value, for one that cannot be converted to
    float, and for a result outside ±180°.
    """
    if value is None:
        return None
    try:
        degrees = float(value) * (180.0 / 2**31)
    except (TypeError, ValueError):
        return None
    # Sanity check: invalid semicircle values often come out as ±180+
    return None if abs(degrees) > 180 else degrees
def _normalise_sub_sport(value: Any) -> str | None:
    """Normalise a FIT sub-sport value to a short lower-case label.

    The value is stringified, lower-cased and spaces become underscores.  A
    few names are renamed; everything else (including "road", "mountain",
    "trail" and "track") passes through unchanged.  None and the empty string
    give None.
    """
    if value is None:
        return None
    key = str(value).lower().replace(" ", "_")
    renamed = {
        "gravel_cycling": "gravel",
        "cyclocross": "gravel",
        "indoor_cycling": "indoor",
    }
    label = renamed.get(key, key)
    return label if label else None
+82
View File
@@ -0,0 +1,82 @@
"""GPX file parser."""
from datetime import timezone
from pathlib import Path
import gpxpy
import gpxpy.gpx
from bincio.extract.models import DataPoint, ParsedActivity
from bincio.extract.parsers.base import BaseParser
from bincio.extract.sport import normalise_sport
# Known GPX extension namespaces
# NOTE(review): neither constant is referenced in the visible part of this
# module — extension tags are matched by local name after stripping the
# namespace; confirm whether they are used elsewhere.
_NS_GARMIN = "http://www.garmin.com/xmlschemas/TrackPointExtension/v1"
_NS_GARMIN_V2 = "http://www.garmin.com/xmlschemas/TrackPointExtension/v2"
class GpxParser(BaseParser):
    """Parser for GPX files, read with gpxpy."""

    def parse(self, path: Path, raw_bytes: bytes) -> ParsedActivity:
        """Build a ParsedActivity from every timestamped track point.

        Points of all tracks and segments are concatenated in document order;
        points without a time are skipped and naive times are taken as UTC.
        The sport comes from the first track's type, defaulting to "cycling".

        Raises:
            ValueError: if no track point carries a timestamp.
        """
        document = gpxpy.parse(raw_bytes.decode("utf-8", errors="replace"))

        timed_points = (
            pt
            for track in document.tracks
            for segment in track.segments
            for pt in segment.points
            if pt.time is not None
        )
        points: list[DataPoint] = []
        for pt in timed_points:
            stamp = pt.time
            if stamp.tzinfo is None:
                stamp = stamp.replace(tzinfo=timezone.utc)
            sample = DataPoint(
                timestamp=stamp,
                lat=pt.latitude,
                lon=pt.longitude,
                elevation_m=pt.elevation,
            )
            _apply_extensions(pt, sample)
            points.append(sample)

        if not points:
            raise ValueError(f"No trackpoints found in {path.name}")

        track_type = document.tracks[0].type if document.tracks else None
        return ParsedActivity(
            points=points,
            sport=normalise_sport(track_type or "cycling"),
            started_at=points[0].timestamp,
            source_file=path.name,
            source_hash="",  # set by factory
        )
def _apply_extensions(pt: gpxpy.gpx.GPXTrackPoint, dp: DataPoint) -> None:
    """Copy HR, cadence, temperature and speed from TrackPointExtension into ``dp``.

    Elements are matched by local name, whatever their namespace.  Children
    with no text are ignored; ``dp`` is modified in place.
    """
    if pt.extensions is None:
        return

    for ext in pt.extensions:
        if _strip_ns(ext.tag) != "TrackPointExtension":
            continue
        for child in ext:
            name = _strip_ns(child.tag)
            text = child.text
            if text is None:
                continue
            if name == "hr":
                dp.hr_bpm = int(float(text))
            elif name == "cad":
                dp.cadence_rpm = int(float(text))
            elif name == "atemp":
                dp.temperature_c = float(text)
            elif name == "speed":
                dp.speed_kmh = float(text) * 3.6  # m/s → km/h
def _strip_ns(tag: str) -> str:
    """Drop the namespace prefix: '{namespace}localname' → 'localname'."""
    _, _, local_name = tag.rpartition("}")
    return local_name
+89
View File
@@ -0,0 +1,89 @@
"""TCX (Training Center XML) file parser."""
from datetime import datetime, timezone
from pathlib import Path
from lxml import etree
from bincio.extract.models import DataPoint, ParsedActivity
from bincio.extract.sport import normalise_sport
# Prefix → namespace URI map passed to every find()/findall() query below.
_NS = dict(
    tcx="http://www.garmin.com/xmlschemas/TrainingCenterDatabase/v2",
    ext="http://www.garmin.com/xmlschemas/ActivityExtension/v2",
)
class TcxParser:
    """Parser for TCX (Training Center XML) files, read with lxml."""

    def parse(self, path: Path, raw_bytes: bytes) -> ParsedActivity:
        """Build a ParsedActivity from the first Activity element of the file.

        Trackpoints without a Time element are skipped.  ``source_hash`` is
        left empty.

        Raises:
            ValueError: if the file has no Activity element, no usable
                trackpoint, or an unparseable timestamp or number.
        """
        # Some exporters (e.g. Garmin) prepend whitespace before the XML
        # declaration, which is technically invalid. Strip it.
        root = etree.fromstring(raw_bytes.lstrip())

        activities = root.findall(".//tcx:Activity", _NS)
        if not activities:
            raise ValueError(f"No Activity elements found in {path.name}")
        # Use the first activity
        act = activities[0]
        sport = normalise_sport(act.get("Sport", "Biking"))

        points: list[DataPoint] = []
        for tp in act.findall(".//tcx:Trackpoint", _NS):
            sample = self._trackpoint(tp)
            if sample is not None:
                points.append(sample)

        if not points:
            raise ValueError(f"No trackpoints found in {path.name}")

        return ParsedActivity(
            points=points,
            sport=sport,
            started_at=points[0].timestamp,
            source_file=path.name,
            source_hash="",
        )

    @staticmethod
    def _text(node, query: str):
        """Return the non-empty text of the first match of ``query`` under ``node``, else None."""
        found = node.find(query, _NS)
        if found is None or not found.text:
            return None
        return found.text

    @staticmethod
    def _to_float(text):
        """Convert ``text`` to float, passing None through."""
        return None if text is None else float(text)

    @staticmethod
    def _to_int(text):
        """Convert ``text`` to int via float (accepts "72.0"), passing None through."""
        return None if text is None else int(float(text))

    @classmethod
    def _trackpoint(cls, tp):
        """Convert one Trackpoint element to a DataPoint; None when it has no time."""
        stamp_text = cls._text(tp, "tcx:Time")
        if stamp_text is None:
            return None
        stamp = _parse_ts(stamp_text)

        lat = lon = None
        pos = tp.find("tcx:Position", _NS)
        if pos is not None:
            lat = cls._to_float(cls._text(pos, "tcx:LatitudeDegrees"))
            lon = cls._to_float(cls._text(pos, "tcx:LongitudeDegrees"))

        # Extensions (speed, watts); speed is converted m/s → km/h
        speed_ms = cls._to_float(cls._text(tp, ".//ext:Speed"))

        return DataPoint(
            timestamp=stamp,
            lat=lat,
            lon=lon,
            elevation_m=cls._to_float(cls._text(tp, "tcx:AltitudeMeters")),
            hr_bpm=cls._to_int(cls._text(tp, ".//tcx:HeartRateBpm/tcx:Value")),
            cadence_rpm=cls._to_int(cls._text(tp, "tcx:Cadence")),
            distance_m=cls._to_float(cls._text(tp, "tcx:DistanceMeters")),
            speed_kmh=speed_ms * 3.6 if speed_ms is not None else None,
            power_w=cls._to_int(cls._text(tp, ".//ext:Watts")),
        )
def _parse_ts(s: str) -> datetime:
    """Parse an ISO 8601 timestamp into a timezone-aware UTC datetime.

    Accepts timestamps with or without fractional seconds, with a trailing
    ``Z``, with no zone designator (taken as UTC), or with an explicit UTC
    offset such as ``+02:00`` (converted to UTC).

    Raises:
        ValueError: if the text is not a recognisable timestamp.
    """
    text = s.strip()

    # Common case first: UTC, with or without fractional seconds.
    utc_text = text.rstrip("Z")
    for fmt in ("%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S"):
        try:
            return datetime.strptime(utc_text, fmt).replace(tzinfo=timezone.utc)
        except ValueError:
            continue

    # Otherwise the timestamp may carry an explicit offset.
    iso_text = text[:-1] + "+00:00" if text.endswith("Z") else text
    try:
        parsed = datetime.fromisoformat(iso_text)
    except ValueError:
        raise ValueError(f"Cannot parse timestamp: {s!r}") from None
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed.astimezone(timezone.utc)
+60
View File
@@ -0,0 +1,60 @@
"""GPS track simplification using the Ramer-Douglas-Peucker algorithm."""
from typing import Optional
from rdp import rdp
from bincio.extract.models import DataPoint
def simplify_track(
    points: list[DataPoint],
    epsilon: float = 0.0001,
) -> list[DataPoint]:
    """Reduce a track to its RDP-significant points.

    Points lacking a latitude or longitude are discarded up front. When
    fewer than two located points remain they are returned unchanged,
    since there is nothing to simplify.

    epsilon is the RDP tolerance in degrees (~11m at equator for 0.0001).
    """
    located = [pt for pt in points if pt.lat is not None and pt.lon is not None]
    if len(located) < 2:
        return located

    # rdp receives [lon, lat] pairs and hands back one keep/drop flag per pair.
    lon_lat_pairs = [[pt.lon, pt.lat] for pt in located]
    keep_flags = rdp(lon_lat_pairs, epsilon=epsilon, return_mask=True)

    survivors = []
    for pt, keep in zip(located, keep_flags):
        if keep:
            survivors.append(pt)
    return survivors
def build_geojson(
    points: list[DataPoint],
    activity_id: str,
    epsilon: float = 0.0001,
    original_count: Optional[int] = None,
) -> dict:
    """Build a GeoJSON Feature for the simplified track.

    Args:
        points: Full-resolution track points.
        activity_id: Stored under ``properties.id``.
        epsilon: RDP tolerance passed through to ``simplify_track``.
        original_count: Point count to report as ``point_count_original``.
            Defaults to ``len(points)`` when not given; an explicit ``0``
            is reported as ``0``.

    Returns:
        A dict with a ``LineString`` geometry. Each position is
        ``[lon, lat, elevation]`` when the point has an elevation, otherwise
        ``[lon, lat]``. ``properties.speeds`` holds one entry per position.
    """
    simplified = simplify_track(points, epsilon=epsilon)
    # Filter once so the coordinate list and the speed list are derived from
    # the same points and are guaranteed to stay index-aligned.
    track = [p for p in simplified if p.lon is not None and p.lat is not None]

    coordinates = [
        [p.lon, p.lat, p.elevation_m] if p.elevation_m is not None else [p.lon, p.lat]
        for p in track
    ]
    # Parallel speed array for gradient coloring
    speeds = [round(p.speed_kmh, 2) if p.speed_kmh is not None else None for p in track]

    # ``is None`` rather than ``or``: a caller-supplied 0 must not be replaced.
    reported_original = len(points) if original_count is None else original_count

    return {
        "type": "Feature",
        "geometry": {
            "type": "LineString",
            "coordinates": coordinates,
        },
        "properties": {
            "id": activity_id,
            "speeds": speeds,
            "simplification": "rdp",
            "rdp_epsilon": epsilon,
            "point_count_original": reported_original,
            "point_count_simplified": len(coordinates),
        },
    }
+40
View File
@@ -0,0 +1,40 @@
"""Sport name normalisation."""
# Keys are in canonical form: lower case, words joined by single underscores.
# normalise_sport() converts raw input to that form before looking it up.
_MAPPING: dict[str, str] = {
    # cycling variants
    "cycling": "cycling",
    "biking": "cycling",
    "bike": "cycling",
    "road_biking": "cycling",
    "mountain_biking": "cycling",
    "gravel_cycling": "cycling",
    "cyclocross": "cycling",
    "indoor_cycling": "cycling",
    "virtual_ride": "cycling",
    "e_biking": "cycling",
    # running
    "running": "running",
    "run": "running",
    "trail_running": "running",
    "treadmill_running": "running",
    "virtual_run": "running",
    # hiking
    "hiking": "hiking",
    "hike": "hiking",
    # walking
    "walking": "walking",
    "walk": "walking",
    # swimming
    "swimming": "swimming",
    "swim": "swimming",
    "open_water_swimming": "swimming",
}

BAS_SPORTS = {"cycling", "running", "hiking", "walking", "swimming", "other"}


def normalise_sport(raw: object) -> str:
    """Map a raw sport label to one of the BAS sport names.

    The label is stringified, lower-cased, and every run of whitespace,
    hyphens or underscores is collapsed to a single underscore, so
    ``"E-Biking"``, ``"e biking"`` and ``"e_biking"`` all resolve to the
    same key.

    Args:
        raw: Sport label from a parsed file; any object with a ``str()``
            form, or ``None``.

    Returns:
        The mapped sport name, or ``"other"`` for ``None`` and for labels
        with no entry in ``_MAPPING``.
    """
    if raw is None:
        return "other"
    words = str(raw).lower().replace("-", " ").replace("_", " ").split()
    return _MAPPING.get("_".join(words), "other")
+55
View File
@@ -0,0 +1,55 @@
"""Import metadata from Strava's activities.csv bulk export.
Strava export columns we care about:
Activity ID, Activity Date, Activity Name, Activity Type,
Activity Description, Filename
"""
import csv
import re
from pathlib import Path
from typing import Optional
# strptime layouts for Strava date strings (presumably the "Activity Date"
# column listed in the module docstring — TODO confirm against the caller).
# NOTE(review): not referenced by any code in the visible part of this module.
_STRAVA_DATE_FMTS = (
    "%b %d, %Y, %I:%M:%S %p", # "Jun 1, 2024, 7:30:12 AM"
    "%Y-%m-%d %H:%M:%S",  # e.g. "2024-06-01 07:30:12"
)
class StravaMetadata:
    """Maps original filename → Strava metadata.

    Rows of Strava's ``activities.csv`` are indexed by the basename of their
    ``Filename`` column; rows without a filename are ignored.
    """

    def __init__(self, csv_path: Path) -> None:
        """Load and index the CSV at ``csv_path``."""
        self._by_filename: dict[str, dict] = {}
        self._load(csv_path)

    def _load(self, path: Path) -> None:
        """Read the CSV and fill the basename → row index."""
        # utf-8-sig strips a leading byte-order mark if the export has one.
        with path.open(newline="", encoding="utf-8-sig") as f:
            for row in csv.DictReader(f):
                # DictReader fills missing trailing fields with None on short
                # rows, so guard before calling .strip().
                filename = (row.get("Filename") or "").strip()
                if not filename:
                    continue
                # Strava stores paths like "activities/12345.fit.gz"
                self._by_filename[Path(filename).name] = row

    def lookup(self, source_file: str) -> Optional[dict]:
        """Return the Strava CSV row for a given source filename, or None.

        The exact string is tried first, then its basename, then the basename
        with a ``.gz`` suffix added or removed, so an unpacked ``12345.fit``
        still matches a CSV entry of ``activities/12345.fit.gz`` and vice
        versa.
        """
        basename = Path(source_file).name
        if basename.endswith(".gz"):
            gz_variant = basename[: -len(".gz")]
        else:
            gz_variant = basename + ".gz"
        for candidate in (source_file, basename, gz_variant):
            row = self._by_filename.get(candidate)
            if row is not None:
                return row
        return None

    def enrich(self, source_file: str, activity: object) -> None:
        """Mutate a ParsedActivity with Strava metadata if found.

        Only attributes that are currently empty on ``activity`` are filled
        in, and only from CSV cells that are non-blank after stripping.
        """
        row = self.lookup(source_file)
        if row is None:
            return
        fields = (
            ("title", "Activity Name"),
            ("description", "Activity Description"),
            ("strava_id", "Activity ID"),
        )
        for attr, column in fields:
            if getattr(activity, attr):
                continue  # never overwrite data parsed from the file itself
            value = (row.get(column) or "").strip()
            if value:
                setattr(activity, attr, value)
+58
View File
@@ -0,0 +1,58 @@
"""Downsample a list of DataPoints to at most 1 sample/second and build
the BAS timeseries object (parallel arrays)."""
from datetime import datetime
from typing import Optional
from bincio.extract.models import DataPoint
def build_timeseries(
    points: list[DataPoint],
    started_at: datetime,
    privacy: str = "public",
) -> dict:
    """Assemble the BAS `timeseries` object as parallel arrays.

    Each kept point contributes one entry to every array. ``t`` is the
    whole-second offset from ``started_at``. Points before the start are
    dropped, as is any point whose offset equals that of the previously
    kept point.

    With privacy 'no_gps' or 'private' the ``lat`` and ``lon`` entries are
    null instead of arrays. An empty ``points`` list yields ``{"t": []}``.
    """
    if not points:
        return {"t": []}

    def _rounded(value, digits):
        # Missing samples stay None; present ones are rounded for storage.
        return None if value is None else round(value, digits)

    # Collect (offset, point) pairs so the offset is computed only once.
    kept = []
    for point in points:
        offset = int((point.timestamp - started_at).total_seconds())
        if offset < 0:
            continue
        if kept and kept[-1][0] == offset:
            continue  # skip sub-second duplicates
        kept.append((offset, point))

    samples = [point for _, point in kept]

    if privacy in ("no_gps", "private"):
        latitudes = None
        longitudes = None
    else:
        latitudes = [_rounded(point.lat, 7) for point in samples]
        longitudes = [_rounded(point.lon, 7) for point in samples]

    return {
        "t": [offset for offset, _ in kept],
        "lat": latitudes,
        "lon": longitudes,
        "elevation_m": [_rounded(point.elevation_m, 1) for point in samples],
        "speed_kmh": [_rounded(point.speed_kmh, 2) for point in samples],
        "hr_bpm": [point.hr_bpm for point in samples],
        "cadence_rpm": [point.cadence_rpm for point in samples],
        "power_w": [point.power_w for point in samples],
        "temperature_c": [_rounded(point.temperature_c, 1) for point in samples],
    }
+198
View File
@@ -0,0 +1,198 @@
"""Write a processed activity to BAS JSON files."""
import json
import re
import unicodedata
from pathlib import Path
from bincio.extract.metrics import ComputedMetrics
from bincio.extract.models import LapData, ParsedActivity
from bincio.extract.simplify import build_geojson
from bincio.extract.timeseries import build_timeseries
def make_activity_id(activity: ParsedActivity) -> str:
    """Derive the BAS activity ID: compact start timestamp plus title slug.

    The timestamp part looks like ``2024-06-01T073012+0200``; when the start
    time carries no UTC offset a literal ``Z`` is appended instead. A slug of
    the title is added after a hyphen whenever the title produces one.
    """
    started = activity.started_at
    offset = started.strftime("%z")  # e.g. "+0200", or "" for a naive datetime
    stamp = started.strftime("%Y-%m-%dT%H%M%S") + (offset if offset else "Z")

    if not activity.title:
        return stamp
    slug = _slugify(activity.title)
    if not slug:
        return stamp
    return f"{stamp}-{slug}"
def write_activity(
    activity: ParsedActivity,
    metrics: ComputedMetrics,
    output_dir: Path,
    privacy: str = "public",
    duplicate_of: str | None = None,
    rdp_epsilon: float = 0.0001,
) -> str:
    """Write {id}.json and (if GPS available) {id}.geojson. Returns the ID.

    Args:
        activity: Parsed activity to serialise.
        metrics: Metrics computed for ``activity``.
        output_dir: Data-store root; files go to ``output_dir/activities``,
            which is created if missing.
        privacy: With ``"no_gps"`` or ``"private"`` no location data is
            written: no GeoJSON file, null ``lat``/``lon`` in the timeseries,
            and null ``bbox``/``start_latlng``/``end_latlng``.
        duplicate_of: ID of the activity this one duplicates, if any; stored
            verbatim in the detail JSON.
        rdp_epsilon: Simplification tolerance passed to ``build_geojson``.

    Returns:
        The generated activity ID.
    """
    activity_id = make_activity_id(activity)
    acts_dir = output_dir / "activities"
    acts_dir.mkdir(parents=True, exist_ok=True)

    source = _infer_source(activity)
    gps_allowed = privacy not in ("no_gps", "private")
    has_gps = gps_allowed and metrics.bbox is not None

    # ── detail JSON ──────────────────────────────────────────────────────────
    detail: dict = {
        "bas_version": "1.0",
        "id": activity_id,
        "title": activity.title or _auto_title(activity),
        "description": activity.description,
        "sport": activity.sport,
        "sub_sport": activity.sub_sport,
        "started_at": activity.started_at.isoformat(),
        "distance_m": metrics.distance_m,
        "duration_s": metrics.duration_s,
        "moving_time_s": metrics.moving_time_s,
        "elevation_gain_m": metrics.elevation_gain_m,
        "elevation_loss_m": metrics.elevation_loss_m,
        "avg_speed_kmh": metrics.avg_speed_kmh,
        "max_speed_kmh": metrics.max_speed_kmh,
        "avg_hr_bpm": metrics.avg_hr_bpm,
        "max_hr_bpm": metrics.max_hr_bpm,
        "avg_cadence_rpm": metrics.avg_cadence_rpm,
        "avg_power_w": metrics.avg_power_w,
        "max_power_w": metrics.max_power_w,
        "gear": activity.gear,
        "device": activity.device,
        # Location summaries are as revealing as the track itself, so they
        # follow the same privacy gate as the timeseries lat/lon arrays.
        "bbox": list(metrics.bbox) if gps_allowed and metrics.bbox else None,
        "start_latlng": (
            list(metrics.start_latlng) if gps_allowed and metrics.start_latlng else None
        ),
        "end_latlng": (
            list(metrics.end_latlng) if gps_allowed and metrics.end_latlng else None
        ),
        "laps": [_serialise_lap(lap) for lap in activity.laps],
        "timeseries": build_timeseries(activity.points, activity.started_at, privacy),
        "source": source,
        "source_file": activity.source_file,
        "source_hash": activity.source_hash,
        "strava_id": activity.strava_id,
        "duplicate_of": duplicate_of,
        "privacy": privacy,
        "custom": {},
    }

    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # must be pinned to UTF-8 instead of depending on the platform default.
    json_path = acts_dir / f"{activity_id}.json"
    json_path.write_text(
        json.dumps(detail, indent=2, ensure_ascii=False), encoding="utf-8"
    )

    # ── GeoJSON track ────────────────────────────────────────────────────────
    if has_gps:
        geojson = build_geojson(activity.points, activity_id, epsilon=rdp_epsilon)
        geojson_path = acts_dir / f"{activity_id}.geojson"
        geojson_path.write_text(
            json.dumps(geojson, indent=2, ensure_ascii=False), encoding="utf-8"
        )

    return activity_id
def build_summary(
    activity: ParsedActivity,
    metrics: ComputedMetrics,
    activity_id: str,
    privacy: str = "public",
) -> dict:
    """Assemble the per-activity summary entry stored in index.json.

    ``track_url`` is null when the metrics carry no bounding box or when
    ``privacy`` is 'no_gps' or 'private'; otherwise it points at the
    activity's GeoJSON file.
    """
    gps_hidden = privacy in ("no_gps", "private")
    track_url = None
    if metrics.bbox is not None and not gps_hidden:
        track_url = f"activities/{activity_id}.geojson"

    title = activity.title
    if not title:
        title = _auto_title(activity)

    summary: dict = {
        "id": activity_id,
        "title": title,
        "sport": activity.sport,
        "sub_sport": activity.sub_sport,
        "started_at": activity.started_at.isoformat(),
    }
    # These summary keys are copied straight from same-named metric attributes.
    metric_fields = (
        "distance_m",
        "duration_s",
        "moving_time_s",
        "elevation_gain_m",
        "avg_speed_kmh",
        "max_speed_kmh",
        "avg_hr_bpm",
        "max_hr_bpm",
        "avg_cadence_rpm",
        "avg_power_w",
    )
    for field in metric_fields:
        summary[field] = getattr(metrics, field)

    summary["source"] = _infer_source(activity)
    summary["privacy"] = privacy
    summary["detail_url"] = f"activities/{activity_id}.json"
    summary["track_url"] = track_url
    return summary
def write_index(summaries: list[dict], output_dir: Path, owner: dict) -> None:
    """Write index.json (sorted newest first).

    Args:
        summaries: Activity summary dicts; each must have a ``started_at``
            ISO 8601 string.
        output_dir: Directory that receives ``index.json``.
        owner: Stored verbatim under the ``owner`` key.

    Raises:
        KeyError: If a summary has no ``started_at`` entry.
    """
    from datetime import datetime, timezone

    def _started(summary: dict) -> datetime:
        # Compare real instants: sorting the ISO strings lexicographically
        # mis-orders activities recorded with different UTC offsets.
        raw = summary["started_at"]
        try:
            moment = datetime.fromisoformat(raw)
        except (TypeError, ValueError):
            # Unparseable values sort as the oldest instead of aborting.
            return datetime(1970, 1, 1, tzinfo=timezone.utc)
        if moment.tzinfo is None:
            # Naive and aware datetimes cannot be compared; treat naive as UTC.
            moment = moment.replace(tzinfo=timezone.utc)
        return moment

    sorted_summaries = sorted(summaries, key=_started, reverse=True)
    index = {
        "bas_version": "1.0",
        "owner": owner,
        "generated_at": _now_iso(),
        "shards": [],
        "activities": sorted_summaries,
    }
    # ensure_ascii=False emits raw non-ASCII characters, so pin the encoding
    # rather than relying on the platform default.
    (output_dir / "index.json").write_text(
        json.dumps(index, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
# ── helpers ──────────────────────────────────────────────────────────────────
def _now_iso() -> str:
    """Return the current UTC time as an ISO 8601 string."""
    import datetime as _dt

    current = _dt.datetime.now(tz=_dt.timezone.utc)
    return current.isoformat()
def _auto_title(activity: ParsedActivity) -> str:
    """Build a fallback title such as "Morning Cycling".

    The time-of-day word comes from the hour of ``started_at``: 5-11 Morning,
    12-16 Afternoon, 17-20 Evening, anything else Night. The second word is
    the capitalised sport.
    """
    hour = activity.started_at.hour
    # (first hour inclusive, end hour exclusive, label)
    bands = (
        (5, 12, "Morning"),
        (12, 17, "Afternoon"),
        (17, 21, "Evening"),
    )
    label = "Night"
    for first, end, name in bands:
        if first <= hour < end:
            label = name
            break
    return f"{label} {activity.sport.capitalize()}"
def _infer_source(activity: ParsedActivity) -> str | None:
if activity.strava_id:
return "strava_export"
name = activity.source_file.lower()
# Karoo uses UUID-style names
if "activity" in name and len(name.split(".")) >= 3:
return "karoo"
if name.endswith(".fit") or name.endswith(".fit.gz"):
return "fit_file"
if name.endswith(".gpx") or name.endswith(".gpx.gz"):
return "gpx_file"
if name.endswith(".tcx") or name.endswith(".tcx.gz"):
return "tcx_file"
return None
def _slugify(text: str) -> str:
    """Turn free text into a lower-case ASCII slug of at most 60 characters.

    Accents are removed via NFKD decomposition, remaining non-ASCII characters
    are dropped, and each run of characters other than a-z / 0-9 becomes one
    hyphen. Leading and trailing hyphens are stripped before truncation.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    ascii_only = decomposed.encode("ascii", "ignore").decode("ascii")
    hyphenated = re.sub(r"[^a-z0-9]+", "-", ascii_only.lower())
    trimmed = hyphenated.strip("-")
    return trimmed[:60]
def _serialise_lap(lap: LapData) -> dict:
    """Convert a lap into a plain dict for the detail JSON.

    ``started_at`` is rendered with ``isoformat()``; every other field is
    copied from the same-named attribute unchanged.
    """
    serialised = {
        "index": lap.index,
        "started_at": lap.started_at.isoformat(),
    }
    copied_fields = (
        "duration_s",
        "distance_m",
        "elevation_gain_m",
        "avg_speed_kmh",
        "avg_hr_bpm",
        "avg_power_w",
    )
    for field in copied_fields:
        serialised[field] = getattr(lap, field)
    return serialised