backend: initial commit

2026-03-28 13:57:12 +01:00
commit 38c5423aeb
36 changed files with 2463 additions and 0 deletions
@@ -0,0 +1,271 @@
+"""bincio extract — CLI command."""
+
+import json
+import sys
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from pathlib import Path
+from typing import Optional
+
+import click
+from rich.console import Console
+from rich.progress import BarColumn, MofNCompleteColumn, Progress, TextColumn, TimeElapsedColumn
+
+from bincio.extract.config import ExtractConfig, default_config, load_config
+from bincio.extract.dedup import ActivityRecord, DedupIndex
+from bincio.extract.metrics import compute
+from bincio.extract.models import ParsedActivity
+from bincio.extract.parsers.factory import is_supported, parse_file
+from bincio.extract.strava_csv import StravaMetadata
+from bincio.extract.writer import build_summary, make_activity_id, write_activity, write_index
+
+# Shared Rich console: every status line, warning and error of this command is
+# printed through it, and the progress bar in extract() renders on it as well.
+console = Console()
+
+
+# CLI entry point. Two modes: --file prints one activity's summary as JSON and
+# returns; otherwise every supported file under the configured input directories
+# is parsed in worker processes, then deduplicated, written and indexed here.
+@click.command()
+@click.option("--config", "config_path", type=click.Path(exists=True), default=None,
+              help="Path to extract_config.yaml (default: ./extract_config.yaml).")
+@click.option("--input", "input_dir", type=click.Path(exists=True), default=None,
+              help="Input directory (overrides config).")
+@click.option("--output", "output_dir", type=click.Path(), default=None,
+              help="Output directory (overrides config).")
+@click.option("--file", "single_file", type=click.Path(exists=True), default=None,
+              help="Process a single file and print JSON to stdout.")
+@click.option("--since", default=None, metavar="YYYY-MM-DD",
+              help="Only process files modified after this date.")
+@click.option("--workers", default=4, show_default=True,
+              help="Number of parallel worker processes.")
+def extract(
+    config_path: Optional[str],
+    input_dir: Optional[str],
+    output_dir: Optional[str],
+    single_file: Optional[str],
+    since: Optional[str],
+    workers: int,
+) -> None:
+    """Parse GPX/FIT/TCX files and write BAS JSON data store."""
+    # NOTE: the docstring above is what click shows as the command's --help text.
+
+    # ── single file mode ─────────────────────────────────────────────────────
+    # Takes precedence over every other option; no config is loaded and nothing
+    # is written to disk.
+    if single_file:
+        _process_single(Path(single_file))
+        return
+
+    # ── load config ──────────────────────────────────────────────────────────
+    cfg = _resolve_config(config_path, input_dir, output_dir)
+    cfg.output_dir.mkdir(parents=True, exist_ok=True)
+
+    # ── gather files ─────────────────────────────────────────────────────────
+    files = _collect_files(cfg, since)
+    if not files:
+        console.print("[yellow]No supported files found.[/yellow]")
+        return
+    console.print(f"Found [bold]{len(files)}[/bold] activity files.")
+
+    # ── Strava metadata ──────────────────────────────────────────────────────
+    # Optional: only loaded when the config names a CSV that actually exists.
+    strava_meta: Optional[StravaMetadata] = None
+    if cfg.metadata_csv and cfg.metadata_csv.exists():
+        strava_meta = StravaMetadata(cfg.metadata_csv)
+        console.print(f"Loaded Strava metadata from [cyan]{cfg.metadata_csv.name}[/cyan].")
+
+    # ── dedup index ──────────────────────────────────────────────────────────
+    # Constructing the index also loads any cache left in the output directory
+    # by a previous run.
+    dedup = DedupIndex(output_dir=cfg.output_dir)
+
+    # ── process ──────────────────────────────────────────────────────────────
+    summaries: list[dict] = []           # index entries produced by this run
+    errors: list[tuple[Path, str]] = []  # (file, message) for files that failed to parse
+    skipped = 0                          # files whose source_hash was already indexed
+
+    owner = {"handle": cfg.owner_handle, "display_name": cfg.owner_display_name}
+
+    with Progress(
+        TextColumn("[progress.description]{task.description}"),
+        BarColumn(),
+        MofNCompleteColumn(),
+        TimeElapsedColumn(),
+        console=console,
+    ) as progress:
+        task = progress.add_task("Processing...", total=len(files))
+
+        with ProcessPoolExecutor(max_workers=workers) as pool:
+            # Only parsing runs in the pool; everything below executes in this
+            # process, in the order the futures finish (not in input order).
+            futures = {pool.submit(_parse_worker, f): f for f in files}
+            for future in as_completed(futures):
+                path = futures[future]
+                progress.advance(task)
+                try:
+                    activity = future.result()
+                except Exception as exc:
+                    # A file that fails to parse is recorded and reported at the
+                    # end; the run continues with the remaining files.
+                    errors.append((path, str(exc)))
+                    continue
+
+                # ── incremental skip ──────────────────────────────────────
+                # Same source_hash already in the dedup index → nothing to redo.
+                if cfg.incremental:
+                    existing_id = dedup.is_exact_duplicate(activity.source_hash)
+                    if existing_id:
+                        skipped += 1
+                        continue
+
+                # ── enrich from Strava CSV ────────────────────────────────
+                if strava_meta:
+                    strava_meta.enrich(activity.source_file, activity)
+
+                # ── compute metrics ───────────────────────────────────────
+                metrics = compute(activity)
+
+                # ── deduplication ─────────────────────────────────────────
+                activity_id = make_activity_id(activity)
+                duplicate_of: Optional[str] = None
+
+                near_dup_id = dedup.find_near_duplicate(
+                    activity.started_at, metrics.distance_m
+                )
+                if near_dup_id:
+                    source = _infer_source(activity)
+                    canonical = dedup.pick_canonical(near_dup_id, source)
+                    if canonical == "__new__":
+                        # New one is better — mark existing as duplicate
+                        # NOTE(review): this reads DedupIndex's private _records mapping,
+                        # and stores activity_id from make_activity_id() while the record
+                        # registered below uses the id returned by write_activity() —
+                        # confirm the two are always the same.
+                        # NOTE(review): only the cached record is updated here; nothing in
+                        # this function rewrites the older activity's file or removes its
+                        # entry from the summaries / index.json.
+                        existing = dedup._records[near_dup_id]
+                        existing.duplicate_of = activity_id
+                    else:
+                        duplicate_of = near_dup_id
+
+                # ── write files ───────────────────────────────────────────
+                # Duplicates are still written; they just carry duplicate_of.
+                written_id = write_activity(
+                    activity, metrics, cfg.output_dir,
+                    privacy=cfg.default_privacy,
+                    duplicate_of=duplicate_of,
+                    rdp_epsilon=cfg.track.rdp_epsilon,
+                )
+
+                # Register in dedup index
+                # NOTE(review): duplicate_of is not passed to the record, so the cache
+                # entry of a losing duplicate keeps duplicate_of=None — confirm intended.
+                dedup.register(ActivityRecord(
+                    id=written_id,
+                    source_hash=activity.source_hash,
+                    started_at=activity.started_at,
+                    distance_m=metrics.distance_m,
+                    source=_infer_source(activity),
+                ))
+
+                # Only activities not marked as a duplicate are listed in the index.
+                if duplicate_of is None:
+                    summaries.append(
+                        build_summary(activity, metrics, written_id, cfg.default_privacy)
+                    )
+
+    # ── write index.json ──────────────────────────────────────────────────────
+    # Merge with any existing summaries from previous incremental runs
+    # Keyed by id, so an entry produced in this run replaces an older one.
+    existing_index = _load_existing_summaries(cfg.output_dir)
+    all_summaries = {s["id"]: s for s in existing_index}
+    for s in summaries:
+        all_summaries[s["id"]] = s
+    write_index(list(all_summaries.values()), cfg.output_dir, owner)
+    dedup.save()
+
+    # ── summary ───────────────────────────────────────────────────────────────
+    # "Processed" counts the summaries added in this run, so activities written
+    # as near-duplicates are not included in that number.
+    console.print(
+        f"\n[green]Done.[/green] "
+        f"Processed [bold]{len(summaries)}[/bold] activities, "
+        f"skipped [bold]{skipped}[/bold] (already up to date), "
+        f"errors [bold]{len(errors)}[/bold]."
+    )
+    if errors:
+        console.print("\n[red]Errors:[/red]")
+        # Cap the listing at 20 entries to keep the output readable.
+        for path, msg in errors[:20]:
+            console.print(f"  {path.name}: {msg}")
+        if len(errors) > 20:
+            console.print(f"  ... and {len(errors) - 20} more.")
+
+
+# ── helpers ───────────────────────────────────────────────────────────────────
+
+def _parse_worker(path: Path) -> ParsedActivity:
+    """Run in worker process — imports are isolated."""
+    from bincio.extract.parsers.factory import parse_file
+    return parse_file(path)
+
+
+def _process_single(path: Path) -> None:
+    from bincio.extract.parsers.factory import parse_file
+    try:
+        activity = parse_file(path)
+        metrics = compute(activity)
+        activity_id = make_activity_id(activity)
+        from bincio.extract.writer import build_summary
+        result = build_summary(activity, metrics, activity_id)
+        click.echo(json.dumps(result, indent=2))
+    except Exception as exc:
+        console.print(f"[red]Error:[/red] {exc}")
+        sys.exit(1)
+
+
+def _resolve_config(
+    config_path: Optional[str],
+    input_dir: Optional[str],
+    output_dir: Optional[str],
+) -> ExtractConfig:
+    if config_path:
+        cfg = load_config(Path(config_path))
+    elif Path("extract_config.yaml").exists():
+        cfg = load_config(Path("extract_config.yaml"))
+    elif input_dir:
+        cfg = default_config(
+            Path(input_dir).expanduser(),
+            Path(output_dir or "./bincio_data").expanduser(),
+        )
+    else:
+        raise click.UsageError(
+            "Provide --config, --input, or an extract_config.yaml in the current directory."
+        )
+    if input_dir:
+        cfg.input_dirs = [Path(input_dir).expanduser()]
+    if output_dir:
+        cfg.output_dir = Path(output_dir).expanduser()
+    return cfg
+
+
+def _collect_files(cfg: ExtractConfig, since: Optional[str]) -> list[Path]:
+    from bincio.extract.parsers.factory import is_supported
+    import os
+    from datetime import datetime
+
+    since_ts: Optional[float] = None
+    if since:
+        since_ts = datetime.strptime(since, "%Y-%m-%d").timestamp()
+
+    files = []
+    for d in cfg.input_dirs:
+        if not d.exists():
+            console.print(f"[yellow]Warning:[/yellow] input dir not found: {d}")
+            continue
+        for path in d.rglob("*"):
+            if not path.is_file():
+                continue
+            if not is_supported(path):
+                continue
+            if since_ts and path.stat().st_mtime < since_ts:
+                continue
+            files.append(path)
+    return files
+
+
+def _load_existing_summaries(output_dir: Path) -> list[dict]:
+    index_path = output_dir / "index.json"
+    if not index_path.exists():
+        return []
+    try:
+        data = json.loads(index_path.read_text())
+        return data.get("activities", [])
+    except Exception:
+        return []
+
+
+def _infer_source(activity: ParsedActivity) -> Optional[str]:
+    if activity.strava_id:
+        return "strava_export"
+    name = activity.source_file.lower()
+    if "activity" in name and len(name.split(".")) >= 3:
+        return "karoo"
+    if name.endswith((".fit", ".fit.gz")):
+        return "fit_file"
+    if name.endswith((".gpx", ".gpx.gz")):
+        return "gpx_file"
+    if name.endswith((".tcx", ".tcx.gz")):
+        return "tcx_file"
+    return None
@@ -0,0 +1,88 @@
+"""Extract stage configuration — loaded from extract_config.yaml."""
+
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Optional
+
+import yaml
+
+
+@dataclass
+class TrackConfig:
+    """Track-processing options, read from the ``track`` section of the config."""
+
+    # Name of the simplification method; "rdp" presumably means
+    # Ramer-Douglas-Peucker — TODO confirm against the writer.
+    simplify: str = "rdp"
+    # Tolerance handed to the activity writer as ``rdp_epsilon``
+    # (unit not visible here — presumably degrees; verify).
+    rdp_epsilon: float = 0.0001
+    # Sampling rate for stored time series (consumer not visible in this file).
+    timeseries_hz: int = 1
+
+
+@dataclass
+class SensorsConfig:
+    """Per-sensor on/off switches, read from the ``sensors`` section of the config.
+
+    All default to True. NOTE(review): presumably these control which sensor
+    streams are written; the consumer is not visible in this file.
+    """
+
+    heart_rate: bool = True
+    cadence: bool = True
+    temperature: bool = True
+    power: bool = True
+
+
+@dataclass
+class ClassifierConfig:
+    """Switch for the classifier, read from the ``classifier`` section of the config."""
+
+    enabled: bool = False  # off by default; opt-in
+
+
+@dataclass
+class ExtractConfig:
+    """Complete configuration of the extract stage.
+
+    Built by load_config() from a YAML file or by default_config() from two
+    paths; only ``input_dirs`` and ``output_dir`` are required.
+    """
+
+    # Directories searched for activity files.
+    input_dirs: list[Path]
+    # Directory that receives the generated data store.
+    output_dir: Path
+    # Optional CSV with extra activity metadata (``input.metadata_csv`` in YAML).
+    metadata_csv: Optional[Path] = None
+    # Privacy value applied to activities by default.
+    default_privacy: str = "public"
+    sensors: SensorsConfig = field(default_factory=SensorsConfig)
+    track: TrackConfig = field(default_factory=TrackConfig)
+    classifier: ClassifierConfig = field(default_factory=ClassifierConfig)
+    # Whether runs are incremental (top-level ``incremental`` key in YAML).
+    incremental: bool = True
+    # Owner identity (``owner.handle`` / ``owner.display_name`` in YAML).
+    owner_handle: str = "me"
+    owner_display_name: str = "Me"
+
+
+def load_config(path: Path) -> ExtractConfig:
+    raw = yaml.safe_load(path.read_text())
+
+    inp = raw.get("input", {})
+    dirs = [Path(d).expanduser() for d in inp.get("dirs", [])]
+    csv_path = inp.get("metadata_csv")
+
+    out = Path(raw.get("output", {}).get("dir", "./bincio_data")).expanduser()
+
+    owner = raw.get("owner", {})
+
+    sensors_raw = raw.get("sensors", {})
+    sensors = SensorsConfig(
+        heart_rate=sensors_raw.get("heart_rate", True),
+        cadence=sensors_raw.get("cadence", True),
+        temperature=sensors_raw.get("temperature", True),
+        power=sensors_raw.get("power", True),
+    )
+
+    track_raw = raw.get("track", {})
+    track = TrackConfig(
+        simplify=track_raw.get("simplify", "rdp"),
+        rdp_epsilon=track_raw.get("rdp_epsilon", 0.0001),
+        timeseries_hz=track_raw.get("timeseries_hz", 1),
+    )
+
+    cls_raw = raw.get("classifier", {})
+    classifier = ClassifierConfig(enabled=cls_raw.get("enabled", False))
+
+    return ExtractConfig(
+        input_dirs=dirs,
+        output_dir=out,
+        metadata_csv=Path(csv_path).expanduser() if csv_path else None,
+        default_privacy=raw.get("default_privacy", "public"),
+        sensors=sensors,
+        track=track,
+        classifier=classifier,
+        incremental=raw.get("incremental", True),
+        owner_handle=owner.get("handle", "me"),
+        owner_display_name=owner.get("display_name", "Me"),
+    )
+
+
+def default_config(input_dir: Path, output_dir: Path) -> ExtractConfig:
+    return ExtractConfig(input_dirs=[input_dir], output_dir=output_dir)
@@ -0,0 +1,127 @@
+"""Duplicate activity detection.
+
+Two kinds of duplicates:
+
+1. Exact duplicate — same source_hash. Skip entirely.
+2. Near-duplicate — same ride recorded by two devices / exported from two
+   platforms. Detected by (started_at ± 5 min) AND (distance ± 5%).
+   The "better" source wins; the other gets duplicate_of set.
+
+The deduplication index is a JSON file persisted in the output directory so
+that incremental runs don't re-evaluate already-resolved pairs.
+"""
+
+import json
+from dataclasses import dataclass, field
+from datetime import datetime, timedelta
+from pathlib import Path
+from typing import Optional
+
+# File name of the persisted index, stored directly inside the output directory.
+_INDEX_FILE = ".bincio_cache.json"
+
+# Source quality ranking (higher = preferred when deduplicating)
+# Sources missing from this table are ranked 0 by DedupIndex.pick_canonical().
+_SOURCE_QUALITY: dict[str, int] = {
+    "karoo": 5,
+    "fit_file": 4,
+    "garmin_connect": 4,
+    "strava_export": 3,
+    "gpx_file": 2,
+    "tcx_file": 1,
+    "wahoo": 3,
+    "komoot": 2,
+    "manual": 0,
+}
+
+
+@dataclass
+class ActivityRecord:
+    """Minimal record stored in the dedup index."""
+
+    # Activity id; key of the index.
+    id: str
+    # Hash of the source file, used for exact-duplicate lookup.
+    source_hash: str
+    # Start time, compared within a ±5 minute window for near-duplicates.
+    started_at: datetime
+    # Total distance in metres; None disables near-duplicate matching for this record.
+    distance_m: Optional[float]
+    # Source label looked up in _SOURCE_QUALITY (None ranks as 0).
+    source: Optional[str]
+    # Id of the canonical activity when this record is a duplicate, else None.
+    duplicate_of: Optional[str] = None
+
+
+@dataclass
+class DedupIndex:
+    output_dir: Path
+    _records: dict[str, ActivityRecord] = field(default_factory=dict)
+    # source_hash → id, for exact-duplicate lookup
+    _by_hash: dict[str, str] = field(default_factory=dict)
+
+    def __post_init__(self) -> None:
+        self._load()
+
+    def _load(self) -> None:
+        p = self.output_dir / _INDEX_FILE
+        if not p.exists():
+            return
+        data = json.loads(p.read_text())
+        for item in data.get("activities", []):
+            started_at = datetime.fromisoformat(item["started_at"])
+            r = ActivityRecord(
+                id=item["id"],
+                source_hash=item["source_hash"],
+                started_at=started_at,
+                distance_m=item.get("distance_m"),
+                source=item.get("source"),
+                duplicate_of=item.get("duplicate_of"),
+            )
+            self._records[r.id] = r
+            self._by_hash[r.source_hash] = r.id
+
+    def save(self) -> None:
+        p = self.output_dir / _INDEX_FILE
+        data = {
+            "activities": [
+                {
+                    "id": r.id,
+                    "source_hash": r.source_hash,
+                    "started_at": r.started_at.isoformat(),
+                    "distance_m": r.distance_m,
+                    "source": r.source,
+                    "duplicate_of": r.duplicate_of,
+                }
+                for r in self._records.values()
+            ]
+        }
+        p.write_text(json.dumps(data, indent=2))
+
+    def is_exact_duplicate(self, source_hash: str) -> Optional[str]:
+        """Return existing activity ID if hash is already in the index."""
+        return self._by_hash.get(source_hash)
+
+    def find_near_duplicate(
+        self,
+        started_at: datetime,
+        distance_m: Optional[float],
+    ) -> Optional[str]:
+        """Return ID of a near-duplicate if one exists."""
+        for r in self._records.values():
+            if r.duplicate_of is not None:
+                continue  # skip already-marked duplicates
+            if abs((r.started_at - started_at).total_seconds()) > 5 * 60:
+                continue
+            if distance_m is None or r.distance_m is None:
+                continue
+            ref = max(distance_m, r.distance_m)
+            if abs(distance_m - r.distance_m) / ref < 0.05:
+                return r.id
+        return None
+
+    def register(self, record: ActivityRecord) -> None:
+        self._records[record.id] = record
+        self._by_hash[record.source_hash] = record.id
+
+    def pick_canonical(self, existing_id: str, new_source: Optional[str]) -> str:
+        """Return the ID of whichever record should be canonical."""
+        existing = self._records[existing_id]
+        existing_q = _SOURCE_QUALITY.get(existing.source or "", 0)
+        new_q = _SOURCE_QUALITY.get(new_source or "", 0)
+        # New record is strictly better → existing becomes the duplicate
+        if new_q > existing_q:
+            return "__new__"
+        return existing_id
@@ -0,0 +1,210 @@
+"""Compute aggregated metrics from a ParsedActivity.
+
+All calculations are self-contained — no external state needed.
+"""
+
+import math
+from dataclasses import dataclass
+from datetime import datetime
+from typing import Optional
+
+from geopy.distance import geodesic
+
+from bincio.extract.models import DataPoint, ParsedActivity
+
+# Speed below which we consider the athlete stopped (km/h)
+# Intervals at or above this speed count towards moving time in _moving_stats().
+_STOPPED_THRESHOLD_KMH = 1.0
+
+
+@dataclass
+class ComputedMetrics:
+    """Aggregated values derived from an activity's data points by compute().
+
+    Every field is None when the input lacks the data needed to derive it.
+    """
+
+    distance_m: Optional[float]        # total distance, metres
+    duration_s: Optional[int]          # first to last sample, whole seconds
+    moving_time_s: Optional[int]       # time spent at or above the stopped threshold
+    elevation_gain_m: Optional[float]  # sum of positive elevation steps
+    elevation_loss_m: Optional[float]  # sum of negative steps, stored as a positive number
+    avg_speed_kmh: Optional[float]     # average over moving time only
+    max_speed_kmh: Optional[float]
+    avg_hr_bpm: Optional[int]
+    max_hr_bpm: Optional[int]
+    avg_cadence_rpm: Optional[int]
+    avg_power_w: Optional[int]
+    max_power_w: Optional[int]
+    bbox: Optional[tuple[float, float, float, float]]  # min_lon, min_lat, max_lon, max_lat
+    start_latlng: Optional[tuple[float, float]]  # (lat, lon) of the first GPS fix
+    end_latlng: Optional[tuple[float, float]]    # (lat, lon) of the last GPS fix
+
+
+def compute(activity: ParsedActivity) -> ComputedMetrics:
+    pts = activity.points
+    if not pts:
+        return _empty()
+
+    duration_s = _duration(pts)
+    distance_m = _distance(pts)
+    moving_time_s, moving_speed_kmh = _moving_stats(pts)
+    gain, loss = _elevation(pts)
+    max_speed = _max_speed(pts)
+    avg_hr, max_hr = _hr_stats(pts)
+    avg_cad = _avg_nonnull([p.cadence_rpm for p in pts])
+    avg_pow = _avg_nonnull([p.power_w for p in pts])
+    max_pow = _max_nonnull([p.power_w for p in pts])
+    bbox = _bbox(pts)
+    start_ll, end_ll = _endpoints(pts)
+
+    return ComputedMetrics(
+        distance_m=distance_m,
+        duration_s=duration_s,
+        moving_time_s=moving_time_s,
+        elevation_gain_m=round(gain, 1) if gain is not None else None,
+        elevation_loss_m=round(abs(loss), 1) if loss is not None else None,
+        avg_speed_kmh=round(moving_speed_kmh, 2) if moving_speed_kmh else None,
+        max_speed_kmh=round(max_speed, 2) if max_speed else None,
+        avg_hr_bpm=avg_hr,
+        max_hr_bpm=max_hr,
+        avg_cadence_rpm=avg_cad,
+        avg_power_w=avg_pow,
+        max_power_w=max_pow,
+        bbox=bbox,
+        start_latlng=start_ll,
+        end_latlng=end_ll,
+    )
+
+
+# ── helpers ──────────────────────────────────────────────────────────────────
+
+def _duration(pts: list[DataPoint]) -> Optional[int]:
+    if len(pts) < 2:
+        return None
+    return int((pts[-1].timestamp - pts[0].timestamp).total_seconds())
+
+
+def _distance(pts: list[DataPoint]) -> Optional[float]:
+    """Prefer device-recorded cumulative distance; fall back to GPS geodesic."""
+    # If the last point has a device distance, use it
+    last_dist = next(
+        (p.distance_m for p in reversed(pts) if p.distance_m is not None), None
+    )
+    if last_dist is not None:
+        return round(last_dist, 1)
+
+    # GPS fallback
+    total = 0.0
+    has_gps = False
+    for a, b in zip(pts, pts[1:]):
+        if a.lat is None or a.lon is None or b.lat is None or b.lon is None:
+            continue
+        has_gps = True
+        total += geodesic((a.lat, a.lon), (b.lat, b.lon)).meters
+    return round(total, 1) if has_gps else None
+
+
+def _moving_stats(pts: list[DataPoint]) -> tuple[Optional[int], Optional[float]]:
+    """Return (moving_time_s, avg_speed_kmh_over_moving_time)."""
+    moving_s = 0
+    moving_dist_m = 0.0
+    has_gps = False
+
+    for a, b in zip(pts, pts[1:]):
+        dt = (b.timestamp - a.timestamp).total_seconds()
+        if dt <= 0:
+            continue
+
+        # Compute speed for this interval from GPS
+        if a.lat is not None and a.lon is not None and b.lat is not None and b.lon is not None:
+            has_gps = True
+            seg_m = geodesic((a.lat, a.lon), (b.lat, b.lon)).meters
+            seg_kmh = (seg_m / dt) * 3.6
+        elif a.speed_kmh is not None:
+            seg_kmh = a.speed_kmh
+            seg_m = (seg_kmh / 3.6) * dt
+            has_gps = True  # speed data present
+        else:
+            continue
+
+        if seg_kmh >= _STOPPED_THRESHOLD_KMH:
+            moving_s += int(dt)
+            moving_dist_m += seg_m
+
+    if not has_gps or moving_s == 0:
+        return None, None
+
+    avg_kmh = (moving_dist_m / moving_s) * 3.6
+    return moving_s, avg_kmh
+
+
+def _elevation(pts: list[DataPoint]) -> tuple[Optional[float], Optional[float]]:
+    elevations = [p.elevation_m for p in pts if p.elevation_m is not None]
+    if len(elevations) < 2:
+        return None, None
+    gain = loss = 0.0
+    for a, b in zip(elevations, elevations[1:]):
+        diff = b - a
+        if diff > 0:
+            gain += diff
+        else:
+            loss += diff
+    return gain, loss
+
+
+def _max_speed(pts: list[DataPoint]) -> Optional[float]:
+    # Prefer device speed; fall back to GPS-derived
+    device_speeds = [p.speed_kmh for p in pts if p.speed_kmh is not None]
+    if device_speeds:
+        return max(device_speeds)
+    # GPS-derived max
+    gps_speeds = []
+    for a, b in zip(pts, pts[1:]):
+        if a.lat is None or b.lat is None:
+            continue
+        dt = (b.timestamp - a.timestamp).total_seconds()
+        if dt <= 0:
+            continue
+        m = geodesic((a.lat, a.lon), (b.lat, b.lon)).meters
+        gps_speeds.append((m / dt) * 3.6)
+    return max(gps_speeds) if gps_speeds else None
+
+
+def _hr_stats(pts: list[DataPoint]) -> tuple[Optional[int], Optional[int]]:
+    hrs = [p.hr_bpm for p in pts if p.hr_bpm is not None]
+    if not hrs:
+        return None, None
+    return int(sum(hrs) / len(hrs)), max(hrs)
+
+
+def _avg_nonnull(values: list) -> Optional[int]:
+    v = [x for x in values if x is not None]
+    return int(sum(v) / len(v)) if v else None
+
+
+def _max_nonnull(values: list) -> Optional[int]:
+    v = [x for x in values if x is not None]
+    return max(v) if v else None
+
+
+def _bbox(pts: list[DataPoint]) -> Optional[tuple[float, float, float, float]]:
+    lats = [p.lat for p in pts if p.lat is not None]
+    lons = [p.lon for p in pts if p.lon is not None]
+    if not lats:
+        return None
+    return (min(lons), min(lats), max(lons), max(lats))
+
+
+def _endpoints(
+    pts: list[DataPoint],
+) -> tuple[Optional[tuple[float, float]], Optional[tuple[float, float]]]:
+    gps = [(p.lat, p.lon) for p in pts if p.lat is not None and p.lon is not None]
+    if not gps:
+        return None, None
+    return gps[0], gps[-1]
+
+
+def _empty() -> ComputedMetrics:
+    return ComputedMetrics(
+        distance_m=None, duration_s=None, moving_time_s=None,
+        elevation_gain_m=None, elevation_loss_m=None,
+        avg_speed_kmh=None, max_speed_kmh=None,
+        avg_hr_bpm=None, max_hr_bpm=None,
+        avg_cadence_rpm=None, avg_power_w=None, max_power_w=None,
+        bbox=None, start_latlng=None, end_latlng=None,
+    )
@@ -0,0 +1,58 @@
+"""Core data models for the extract stage.
+
+ParsedActivity is the internal representation produced by parsers.
+It gets fed into metrics computation and the BAS JSON writer.
+"""
+
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import Optional
+
+
+@dataclass
+class DataPoint:
+    """One measurement sample from a GPS/sensor recording."""
+
+    # Sample time; the FIT and GPX parsers tag naive timestamps as UTC.
+    timestamp: datetime
+    # Position in decimal degrees; None when the sample has no GPS fix.
+    lat: Optional[float] = None
+    lon: Optional[float] = None
+    elevation_m: Optional[float] = None
+    hr_bpm: Optional[int] = None
+    cadence_rpm: Optional[int] = None
+    # Speed from device (km/h). May be absent; we compute it from GPS if so.
+    speed_kmh: Optional[float] = None
+    power_w: Optional[int] = None
+    temperature_c: Optional[float] = None
+    # Cumulative distance from device (metres), if recorded.
+    distance_m: Optional[float] = None
+
+
+@dataclass
+class LapData:
+    """Summary of one lap as reported by the recording device."""
+
+    # Zero-based position of the lap within the activity.
+    index: int
+    started_at: datetime
+    duration_s: Optional[int] = None
+    distance_m: Optional[float] = None
+    elevation_gain_m: Optional[float] = None
+    avg_speed_kmh: Optional[float] = None
+    avg_hr_bpm: Optional[int] = None
+    avg_power_w: Optional[int] = None
+
+
+@dataclass
+class ParsedActivity:
+    """Raw activity data as produced by a parser, before metric computation."""
+
+    # Samples in recording order; parsers raise instead of returning an empty list.
+    points: list[DataPoint]
+    sport: str                         # normalised to BAS sport enum
+    # Parsers set this to the timestamp of the first point.
+    started_at: datetime
+    source_file: str                   # basename of original file
+    # Parsers leave this empty; the parser factory fills it in after parsing.
+    source_hash: str                   # "sha256:{hex}"
+
+    sub_sport: Optional[str] = None
+    device: Optional[str] = None
+    title: Optional[str] = None
+    description: Optional[str] = None
+    gear: Optional[str] = None
+    # When set, the activity is treated as coming from a Strava export.
+    strava_id: Optional[str] = None
+    laps: list[LapData] = field(default_factory=list)
@@ -0,0 +1,34 @@
+"""Abstract base class for all activity parsers."""
+
+import gzip
+import hashlib
+from abc import ABC, abstractmethod
+from pathlib import Path
+
+from bincio.extract.models import ParsedActivity
+
+
+class BaseParser(ABC):
+    @abstractmethod
+    def parse(self, path: Path, raw_bytes: bytes) -> ParsedActivity:
+        """Parse activity from raw file bytes.
+
+        Receives pre-read bytes so the factory can compute the hash once and
+        handle decompression transparently before dispatching.
+        """
+
+    @staticmethod
+    def _sha256(data: bytes) -> str:
+        return "sha256:" + hashlib.sha256(data).hexdigest()
+
+    @staticmethod
+    def _read_file(path: Path) -> tuple[bytes, bytes]:
+        """Return (raw_bytes, decompressed_bytes).
+
+        raw_bytes is the original file content (used for hashing).
+        decompressed_bytes is what parsers should actually parse.
+        """
+        raw = path.read_bytes()
+        if path.suffix == ".gz":
+            return raw, gzip.decompress(raw)
+        return raw, raw
@@ -0,0 +1,46 @@
+"""Parser factory — selects the right parser based on file extension."""
+
+from pathlib import Path
+
+from bincio.extract.models import ParsedActivity
+from bincio.extract.parsers.base import BaseParser
+from bincio.extract.parsers.fit import FitParser
+from bincio.extract.parsers.gpx import GpxParser
+from bincio.extract.parsers.tcx import TcxParser
+
+# Supported extensions (including .gz variants)
+# Matching is case-sensitive: is_supported() compares suffixes as-is.
+SUPPORTED = {".fit", ".gpx", ".tcx", ".fit.gz", ".gpx.gz", ".tcx.gz"}
+
+# Parser class per base extension (i.e. after a trailing .gz has been stripped).
+_PARSERS: dict[str, type[BaseParser]] = {
+    ".fit": FitParser,
+    ".gpx": GpxParser,
+    ".tcx": TcxParser,
+}
+
+
+def _base_ext(path: Path) -> str:
+    """Return the meaningful extension, stripping .gz if present."""
+    if path.suffix == ".gz":
+        return Path(path.stem).suffix  # e.g. ".fit" from "ride.fit.gz"
+    return path.suffix
+
+
+def is_supported(path: Path) -> bool:
+    suffix = "".join(path.suffixes[-2:]) if path.suffix == ".gz" else path.suffix
+    return suffix in SUPPORTED
+
+
+def parse_file(path: Path) -> ParsedActivity:
+    """Parse an activity file, handling .gz transparently."""
+    ext = _base_ext(path)
+    parser_cls = _PARSERS.get(ext)
+    if parser_cls is None:
+        raise ValueError(f"Unsupported file type: {path.name!r}")
+
+    raw_bytes, content_bytes = BaseParser._read_file(path)
+    parser = parser_cls()
+    activity = parser.parse(path, content_bytes)
+    # Attach hash of the *original* bytes (compressed if .gz) for dedup
+    activity.source_hash = BaseParser._sha256(raw_bytes)
+    activity.source_file = path.name
+    return activity
@@ -0,0 +1,133 @@
+"""FIT file parser (Garmin binary format)."""
+
+from datetime import timezone
+from pathlib import Path
+from typing import Any
+
+import fitdecode
+
+from bincio.extract.models import DataPoint, LapData, ParsedActivity
+from bincio.extract.sport import normalise_sport
+
+
+# NOTE(review): unlike GpxParser this class does not derive from BaseParser,
+# although the factory's registry is typed as type[BaseParser] — confirm intended.
+class FitParser:
+    """Parser for FIT files, decoded frame by frame with fitdecode."""
+
+    def parse(self, path: Path, raw_bytes: bytes) -> ParsedActivity:
+        """Build a ParsedActivity from the content of a FIT file.
+
+        Reads the ``sport``, ``device_info``, ``record`` and ``lap`` messages
+        and ignores everything else.
+
+        Raises:
+            ValueError: the file contains no usable ``record`` message.
+        """
+        import io
+
+        points: list[DataPoint] = []
+        laps: list[LapData] = []
+        # Used when the file carries no sport message.
+        sport: str = "cycling"
+        sub_sport: str | None = None
+        device: str | None = None
+
+        with fitdecode.FitReader(io.BytesIO(raw_bytes)) as fit:
+            for frame in fit:
+                # Only data messages carry values; skip every other frame type.
+                if not isinstance(frame, fitdecode.FitDataMessage):
+                    continue
+
+                if frame.name == "sport":
+                    sport = normalise_sport(_get(frame, "sport", "cycling"))
+                    sub_sport = _normalise_sub_sport(_get(frame, "sub_sport"))
+
+                elif frame.name == "device_info":
+                    # Each device_info message with a product overwrites the
+                    # previous value, so the last such message wins.
+                    mfr = _get(frame, "manufacturer")
+                    prod = _get(frame, "product_name") or _get(frame, "garmin_product")
+                    if mfr and prod:
+                        device = f"{mfr} {prod}"
+                    elif prod:
+                        device = str(prod)
+
+                elif frame.name == "record":
+                    ts = _get(frame, "timestamp")
+                    if ts is None:
+                        continue
+                    # Naive timestamps are labelled as UTC (no conversion is applied).
+                    if hasattr(ts, "tzinfo") and ts.tzinfo is None:
+                        ts = ts.replace(tzinfo=timezone.utc)
+
+                    lat = _semicircles_to_deg(_get(frame, "position_lat"))
+                    lon = _semicircles_to_deg(_get(frame, "position_long"))
+                    speed_raw = _get(frame, "speed")  # m/s
+
+                    dp = DataPoint(
+                        timestamp=ts,
+                        lat=lat,
+                        lon=lon,
+                        elevation_m=_get(frame, "altitude"),
+                        hr_bpm=_get(frame, "heart_rate"),
+                        cadence_rpm=_get(frame, "cadence"),
+                        speed_kmh=speed_raw * 3.6 if speed_raw is not None else None,
+                        power_w=_get(frame, "power"),
+                        temperature_c=_get(frame, "temperature"),
+                        distance_m=_get(frame, "distance"),
+                    )
+                    points.append(dp)
+
+                elif frame.name == "lap":
+                    # Laps without a start time are dropped.
+                    ts = _get(frame, "start_time")
+                    if ts is not None:
+                        if hasattr(ts, "tzinfo") and ts.tzinfo is None:
+                            ts = ts.replace(tzinfo=timezone.utc)
+                        elapsed = _get(frame, "total_elapsed_time")
+                        speed_raw = _get(frame, "avg_speed")
+                        # NOTE(review): the truthiness checks below turn an elapsed
+                        # time or average speed of 0 into None, whereas the record
+                        # branch uses `is not None` — confirm this is intended.
+                        laps.append(
+                            LapData(
+                                index=len(laps),
+                                started_at=ts,
+                                duration_s=int(elapsed) if elapsed else None,
+                                distance_m=_get(frame, "total_distance"),
+                                elevation_gain_m=_get(frame, "total_ascent"),
+                                avg_speed_kmh=speed_raw * 3.6 if speed_raw else None,
+                                avg_hr_bpm=_get(frame, "avg_heart_rate"),
+                                avg_power_w=_get(frame, "avg_power"),
+                            )
+                        )
+
+        if not points:
+            raise ValueError(f"No record messages found in {path.name}")
+
+        return ParsedActivity(
+            points=points,
+            sport=sport,
+            sub_sport=sub_sport,
+            started_at=points[0].timestamp,
+            device=device,
+            laps=laps,
+            source_file=path.name,
+            # Left empty here; the parser factory assigns the real hash afterwards.
+            source_hash="",
+        )
+
+
+def _get(frame: fitdecode.FitDataMessage, field: str, default: Any = None) -> Any:
+    try:
+        return frame.get_value(field)
+    except KeyError:
+        return default
+
+
+def _semicircles_to_deg(value: Any) -> float | None:
+    if value is None:
+        return None
+    try:
+        deg = float(value) * (180.0 / 2**31)
+        # Sanity check: invalid semicircle values often come out as ±180+
+        if abs(deg) > 180:
+            return None
+        return deg
+    except (TypeError, ValueError):
+        return None
+
+
+def _normalise_sub_sport(value: Any) -> str | None:
+    if value is None:
+        return None
+    s = str(value).lower().replace(" ", "_")
+    mapping = {
+        "road": "road",
+        "mountain": "mountain",
+        "gravel_cycling": "gravel",
+        "cyclocross": "gravel",
+        "indoor_cycling": "indoor",
+        "trail": "trail",
+        "track": "track",
+    }
+    return mapping.get(s, s) or None
@@ -0,0 +1,82 @@
+"""GPX file parser."""
+
+from datetime import timezone
+from pathlib import Path
+
+import gpxpy
+import gpxpy.gpx
+
+from bincio.extract.models import DataPoint, ParsedActivity
+from bincio.extract.parsers.base import BaseParser
+from bincio.extract.sport import normalise_sport
+
+# Known GPX extension namespaces (Garmin TrackPointExtension schema, v1 and v2).
+# NOTE(review): neither constant is referenced by the code visible in this
+# module; _apply_extensions matches elements by local tag name instead.
+_NS_GARMIN = "http://www.garmin.com/xmlschemas/TrackPointExtension/v1"
+_NS_GARMIN_V2 = "http://www.garmin.com/xmlschemas/TrackPointExtension/v2"
+
+
+class GpxParser(BaseParser):
+    """Parser that turns the track data of a GPX file into a ``ParsedActivity``."""
+
+    def parse(self, path: Path, raw_bytes: bytes) -> ParsedActivity:
+        """Parse *raw_bytes* as GPX and collect every timestamped trackpoint.
+
+        Trackpoints without a time are skipped; naive timestamps are taken to
+        be UTC. The sport comes from the first track's ``type`` and falls back
+        to "cycling" when it is missing or empty.
+
+        Raises:
+            ValueError: if no timestamped trackpoint is found.
+        """
+        document = gpxpy.parse(raw_bytes.decode("utf-8", errors="replace"))
+
+        # Flatten tracks → segments → points into one stream.
+        trackpoints = (
+            point
+            for track in document.tracks
+            for segment in track.segments
+            for point in segment.points
+        )
+
+        samples: list[DataPoint] = []
+        for point in trackpoints:
+            stamp = point.time
+            if stamp is None:
+                continue
+            if stamp.tzinfo is None:
+                stamp = stamp.replace(tzinfo=timezone.utc)
+
+            sample = DataPoint(
+                timestamp=stamp,
+                lat=point.latitude,
+                lon=point.longitude,
+                elevation_m=point.elevation,
+            )
+            _apply_extensions(point, sample)
+            samples.append(sample)
+
+        if not samples:
+            raise ValueError(f"No trackpoints found in {path.name}")
+
+        raw_type = document.tracks[0].type if document.tracks else None
+
+        return ParsedActivity(
+            points=samples,
+            sport=normalise_sport(raw_type or "cycling"),
+            started_at=samples[0].timestamp,
+            source_file=path.name,
+            source_hash="",  # set by factory
+        )
+
+
+def _apply_extensions(pt: gpxpy.gpx.GPXTrackPoint, dp: DataPoint) -> None:
+    """Copy HR, cadence, temperature and speed from ``TrackPointExtension`` onto *dp*.
+
+    Elements are matched by local tag name, so the namespace version does not
+    matter. Recognised children are ``hr``, ``cad``, ``atemp`` and ``speed``
+    (m/s, stored as km/h). *dp* is mutated in place; nothing is returned.
+
+    A child whose text is not a usable number is skipped, so one corrupt
+    sensor reading does not make the whole file unparseable.
+    """
+    if pt.extensions is None:
+        return
+    for ext in pt.extensions:
+        if _strip_ns(ext.tag) != "TrackPointExtension":
+            continue
+        for child in ext:
+            raw = child.text
+            if raw is None:
+                continue
+            tag = _strip_ns(child.tag)
+            if tag not in ("hr", "cad", "atemp", "speed"):
+                continue
+            try:
+                number = float(raw)
+                if tag == "hr":
+                    dp.hr_bpm = int(number)
+                elif tag == "cad":
+                    dp.cadence_rpm = int(number)
+                elif tag == "atemp":
+                    dp.temperature_c = number
+                else:
+                    dp.speed_kmh = number * 3.6  # m/s → km/h
+            except (ValueError, OverflowError):
+                # ValueError: non-numeric text, or int() of NaN.
+                # OverflowError: int() of an infinite value.
+                continue
+
+
+def _strip_ns(tag: str) -> str:
+    """Drop a leading ``{namespace}`` from an XML tag: '{ns}local' → 'local'."""
+    # rpartition yields the whole string as the tail when "}" is absent.
+    _, _, local_name = tag.rpartition("}")
+    return local_name
@@ -0,0 +1,89 @@
+"""TCX (Training Center XML) file parser."""
+
+from datetime import datetime, timezone
+from pathlib import Path
+
+from lxml import etree
+
+from bincio.extract.models import DataPoint, ParsedActivity
+from bincio.extract.sport import normalise_sport
+
+# Prefix → namespace URI map passed to every find()/findall() call below.
+# "tcx" qualifies the core TrainingCenterDatabase elements; "ext" qualifies
+# the ActivityExtension elements (Speed, Watts) read from each trackpoint.
+_NS = {
+    "tcx": "http://www.garmin.com/xmlschemas/TrainingCenterDatabase/v2",
+    "ext": "http://www.garmin.com/xmlschemas/ActivityExtension/v2",
+}
+
+
+class TcxParser:
+    """Parser for TCX (Training Center XML) activity files."""
+
+    @staticmethod
+    def _text(node, query: str):
+        """Return the non-empty text of the first element matching *query*, else None."""
+        found = node.find(query, _NS)
+        if found is None or not found.text:
+            return None
+        return found.text
+
+    @staticmethod
+    def _as_float(text):
+        """Convert *text* to float, passing ``None`` through."""
+        return None if text is None else float(text)
+
+    @staticmethod
+    def _as_int(text):
+        """Convert *text* to int via float (accepts "150.0"), passing ``None`` through."""
+        return None if text is None else int(float(text))
+
+    def parse(self, path: Path, raw_bytes: bytes) -> ParsedActivity:
+        """Parse *raw_bytes* as TCX and return the first activity it contains.
+
+        Trackpoints without a ``Time`` element are skipped. The sport comes
+        from the activity's ``Sport`` attribute ("Biking" when absent).
+
+        Raises:
+            ValueError: if there is no Activity element or no usable trackpoint.
+        """
+        # Some exporters (e.g. Garmin) prepend whitespace before the XML
+        # declaration, which is technically invalid. Strip it.
+        document = etree.fromstring(raw_bytes.lstrip())
+
+        found_activities = document.findall(".//tcx:Activity", _NS)
+        if not found_activities:
+            raise ValueError(f"No Activity elements found in {path.name}")
+
+        # Only the first activity in the file is read.
+        first = found_activities[0]
+        sport = normalise_sport(first.get("Sport", "Biking"))
+
+        samples: list[DataPoint] = []
+        for node in first.findall(".//tcx:Trackpoint", _NS):
+            stamp_text = self._text(node, "tcx:Time")
+            if stamp_text is None:
+                continue
+            stamp = _parse_ts(stamp_text)
+
+            latitude = longitude = None
+            position = node.find("tcx:Position", _NS)
+            if position is not None:
+                latitude = self._as_float(self._text(position, "tcx:LatitudeDegrees"))
+                longitude = self._as_float(self._text(position, "tcx:LongitudeDegrees"))
+
+            elevation = self._as_float(self._text(node, "tcx:AltitudeMeters"))
+            heart_rate = self._as_int(self._text(node, ".//tcx:HeartRateBpm/tcx:Value"))
+            cadence = self._as_int(self._text(node, "tcx:Cadence"))
+            distance = self._as_float(self._text(node, "tcx:DistanceMeters"))
+
+            # Extensions (speed in m/s, power in watts)
+            speed_ms = self._as_float(self._text(node, ".//ext:Speed"))
+            power = self._as_int(self._text(node, ".//ext:Watts"))
+
+            samples.append(
+                DataPoint(
+                    timestamp=stamp,
+                    lat=latitude,
+                    lon=longitude,
+                    elevation_m=elevation,
+                    hr_bpm=heart_rate,
+                    cadence_rpm=cadence,
+                    distance_m=distance,
+                    speed_kmh=speed_ms * 3.6 if speed_ms is not None else None,
+                    power_w=power,
+                )
+            )
+
+        if not samples:
+            raise ValueError(f"No trackpoints found in {path.name}")
+
+        return ParsedActivity(
+            points=samples,
+            sport=sport,
+            started_at=samples[0].timestamp,
+            source_file=path.name,
+            source_hash="",
+        )
+
+
+def _parse_ts(s: str) -> datetime:
+    """Parse an ISO 8601 timestamp into a timezone-aware ``datetime``.
+
+    Accepted forms, with or without fractional seconds:
+
+    * a trailing ``Z`` (UTC),
+    * an explicit UTC offset such as ``+02:00`` or ``+0200`` (kept as given),
+    * no zone at all, which is interpreted as UTC.
+
+    Surrounding whitespace is ignored.
+
+    Raises:
+        ValueError: if *s* matches none of the supported forms.
+    """
+    text = s.strip()
+
+    # strptime's %z understands "Z" as well as numeric offsets with or
+    # without a colon, so zone-qualified values need no manual stripping.
+    for fmt in ("%Y-%m-%dT%H:%M:%S.%f%z", "%Y-%m-%dT%H:%M:%S%z"):
+        try:
+            return datetime.strptime(text, fmt)
+        except ValueError:
+            continue
+
+    # No zone designator: treat the value as UTC.
+    for fmt in ("%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S"):
+        try:
+            return datetime.strptime(text, fmt).replace(tzinfo=timezone.utc)
+        except ValueError:
+            continue
+
+    raise ValueError(f"Cannot parse timestamp: {s!r}")
@@ -0,0 +1,60 @@
+"""GPS track simplification using the Ramer-Douglas-Peucker algorithm."""
+
+from typing import Optional
+
+from rdp import rdp
+
+from bincio.extract.models import DataPoint
+
+
+def simplify_track(
+    points: list[DataPoint],
+    epsilon: float = 0.0001,
+) -> list[DataPoint]:
+    """Thin out a track with Ramer-Douglas-Peucker and return the kept points.
+
+    Points lacking a latitude or longitude are discarded first. *epsilon* is
+    expressed in degrees (0.0001 is roughly 11m at the equator). With fewer
+    than two GPS points there is nothing to simplify and they are returned
+    unchanged.
+    """
+    located = [p for p in points if p.lat is not None and p.lon is not None]
+    if len(located) < 2:
+        return located
+
+    # rdp works on [x, y] pairs, i.e. [lon, lat].
+    keep_flags = rdp(
+        [[p.lon, p.lat] for p in located],
+        epsilon=epsilon,
+        return_mask=True,
+    )
+    return [p for p, keep in zip(located, keep_flags) if keep]
+
+
+def build_geojson(
+    points: list[DataPoint],
+    activity_id: str,
+    epsilon: float = 0.0001,
+    original_count: Optional[int] = None,
+) -> dict:
+    """Return a GeoJSON ``Feature`` holding the simplified track as a LineString.
+
+    Each coordinate is ``[lon, lat, elevation]`` when the point has an
+    elevation and ``[lon, lat]`` otherwise. ``properties.speeds`` carries one
+    entry per simplified point (rounded to 2 decimals, or ``None``).
+    ``point_count_original`` is *original_count* when truthy, otherwise
+    ``len(points)``.
+    """
+    kept = simplify_track(points, epsilon=epsilon)
+
+    line: list[list] = []
+    for p in kept:
+        if p.lon is None or p.lat is None:
+            continue
+        position = [p.lon, p.lat]
+        if p.elevation_m is not None:
+            position.append(p.elevation_m)
+        line.append(position)
+
+    # Parallel speed array for gradient coloring
+    speeds = [None if p.speed_kmh is None else round(p.speed_kmh, 2) for p in kept]
+
+    geometry = {"type": "LineString", "coordinates": line}
+    properties = {
+        "id": activity_id,
+        "speeds": speeds,
+        "simplification": "rdp",
+        "rdp_epsilon": epsilon,
+        "point_count_original": original_count or len(points),
+        "point_count_simplified": len(line),
+    }
+    return {"type": "Feature", "geometry": geometry, "properties": properties}
@@ -0,0 +1,40 @@
+"""Sport name normalisation."""
+
+# Lookup table: normalised raw sport name → BAS sport.
+# Keys are lower-case with words joined by underscores, which is the form
+# normalise_sport() reduces its input to before looking it up.
+_MAPPING: dict[str, str] = {
+    # cycling variants
+    "cycling": "cycling",
+    "biking": "cycling",
+    "bike": "cycling",
+    "road_biking": "cycling",
+    "mountain_biking": "cycling",
+    "gravel_cycling": "cycling",
+    "cyclocross": "cycling",
+    "indoor_cycling": "cycling",
+    "virtual_ride": "cycling",
+    # Reached by "e-biking", "e biking" and "e_biking" alike.
+    "e_biking": "cycling",
+    # running
+    "running": "running",
+    "run": "running",
+    "trail_running": "running",
+    "treadmill_running": "running",
+    "virtual_run": "running",
+    # hiking
+    "hiking": "hiking",
+    "hike": "hiking",
+    # walking
+    "walking": "walking",
+    "walk": "walking",
+    # swimming
+    "swimming": "swimming",
+    "swim": "swimming",
+    "open_water_swimming": "swimming",
+}
+
+# Every value normalise_sport() can return: the targets above plus "other".
+BAS_SPORTS = {"cycling", "running", "hiking", "walking", "swimming", "other"}
+
+
+def normalise_sport(raw: object) -> str:
+    """Map a raw sport name onto one of ``BAS_SPORTS``.
+
+    The input is stringified, lower-cased and trimmed; spaces and hyphens are
+    both turned into underscores so that "Trail Running", "trail-running" and
+    "trail_running" all resolve identically. ``None`` and unrecognised names
+    give "other".
+    """
+    if raw is None:
+        return "other"
+    key = str(raw).lower().strip().replace(" ", "_").replace("-", "_")
+    return _MAPPING.get(key, "other")
@@ -0,0 +1,55 @@
+"""Import metadata from Strava's activities.csv bulk export.
+
+Strava export columns we care about:
+  Activity ID, Activity Date, Activity Name, Activity Type,
+  Activity Description, Filename
+"""
+
+import csv
+import re
+from pathlib import Path
+from typing import Optional
+
+
+# strptime patterns, presumably for the "Activity Date" column — TODO confirm.
+# NOTE(review): not referenced by any code visible in this module.
+_STRAVA_DATE_FMTS = (
+    "%b %d, %Y, %I:%M:%S %p",  # "Jun 1, 2024, 7:30:12 AM"
+    "%Y-%m-%d %H:%M:%S",
+)
+
+
+class StravaMetadata:
+    """Maps original filename → Strava metadata.
+
+    Rows of the export CSV are indexed by the basename of their ``Filename``
+    column; rows without a filename are ignored.
+    """
+
+    # (activity attribute, CSV column) pairs copied over by enrich().
+    _FIELDS = (
+        ("title", "Activity Name"),
+        ("description", "Activity Description"),
+        ("strava_id", "Activity ID"),
+    )
+
+    def __init__(self, csv_path: Path) -> None:
+        """Load and index the CSV at *csv_path*."""
+        self._by_filename: dict[str, dict] = {}
+        self._load(csv_path)
+
+    def _load(self, path: Path) -> None:
+        """Read *path* (UTF-8, optional BOM) and fill the filename index."""
+        with path.open(newline="", encoding="utf-8-sig") as f:
+            for row in csv.DictReader(f):
+                # DictReader stores None for columns missing from a short
+                # row, so the value cannot be assumed to be a string.
+                filename = (row.get("Filename") or "").strip()
+                if not filename:
+                    continue
+                # Strava stores paths like "activities/12345.fit.gz"
+                self._by_filename[Path(filename).name] = row
+
+    def lookup(self, source_file: str) -> Optional[dict]:
+        """Return the Strava CSV row for a given source filename, or None."""
+        return self._by_filename.get(source_file)
+
+    def enrich(self, source_file: str, activity: object) -> None:
+        """Mutate a ParsedActivity with Strava metadata if found.
+
+        Only attributes that are currently empty on *activity* are filled;
+        values already present are never overwritten.
+        """
+        row = self.lookup(source_file)
+        if row is None:
+            return
+
+        for attr, column in self._FIELDS:
+            if getattr(activity, attr):
+                continue
+            value = (row.get(column) or "").strip()
+            if value:
+                setattr(activity, attr, value)
@@ -0,0 +1,58 @@
+"""Downsample a list of DataPoints to at most 1 sample/second and build
+the BAS timeseries object (parallel arrays)."""
+
+from datetime import datetime
+from typing import Optional
+
+from bincio.extract.models import DataPoint
+
+
+def build_timeseries(
+    points: list[DataPoint],
+    started_at: datetime,
+    privacy: str = "public",
+) -> dict:
+    """Build the BAS `timeseries` object as a dict of parallel arrays.
+
+    "t" holds whole seconds since *started_at*. Points before *started_at*
+    are dropped, and a point falling in the same second as the previously
+    kept one is skipped, so at most one sample per second is emitted.
+    For privacy 'no_gps' or 'private' the "lat" and "lon" entries are null.
+    An empty *points* list yields ``{"t": []}``.
+    """
+    if not points:
+        return {"t": []}
+
+    with_gps = privacy not in ("no_gps", "private")
+
+    def rounded(value, digits):
+        """round() that lets None through."""
+        return None if value is None else round(value, digits)
+
+    # Downsample while recording each kept point's second offset.
+    kept: list[DataPoint] = []
+    offsets: list[int] = []
+    for p in points:
+        second = int((p.timestamp - started_at).total_seconds())
+        if second < 0:
+            continue
+        if offsets and offsets[-1] == second:
+            continue  # skip sub-second duplicates
+        kept.append(p)
+        offsets.append(second)
+
+    latitudes = [rounded(p.lat, 7) for p in kept] if with_gps else None
+    longitudes = [rounded(p.lon, 7) for p in kept] if with_gps else None
+
+    return {
+        "t":             offsets,
+        "lat":           latitudes,
+        "lon":           longitudes,
+        "elevation_m":   [rounded(p.elevation_m, 1) for p in kept],
+        "speed_kmh":     [rounded(p.speed_kmh, 2) for p in kept],
+        "hr_bpm":        [p.hr_bpm for p in kept],
+        "cadence_rpm":   [p.cadence_rpm for p in kept],
+        "power_w":       [p.power_w for p in kept],
+        "temperature_c": [rounded(p.temperature_c, 1) for p in kept],
+    }
@@ -0,0 +1,198 @@
+"""Write a processed activity to BAS JSON files."""
+
+import json
+import re
+import unicodedata
+from pathlib import Path
+
+from bincio.extract.metrics import ComputedMetrics
+from bincio.extract.models import LapData, ParsedActivity
+from bincio.extract.simplify import build_geojson
+from bincio.extract.timeseries import build_timeseries
+
+
+def make_activity_id(activity: ParsedActivity) -> str:
+    """Derive the BAS activity ID: compact start timestamp plus optional title slug.
+
+    The timestamp part looks like ``2024-06-01T073012+0200``; when the start
+    time carries no UTC offset, ``Z`` is appended instead. A non-empty slug
+    of the title, if any, is joined on with a hyphen.
+    """
+    start = activity.started_at
+    offset = start.strftime("%z")  # e.g. "+0200" or ""
+    stamp = start.strftime("%Y-%m-%dT%H%M%S") + (offset if offset else "Z")
+
+    if not activity.title:
+        return stamp
+
+    slug = _slugify(activity.title)
+    if not slug:
+        return stamp
+    return f"{stamp}-{slug}"
+
+
+def write_activity(
+    activity: ParsedActivity,
+    metrics: ComputedMetrics,
+    output_dir: Path,
+    privacy: str = "public",
+    duplicate_of: str | None = None,
+    rdp_epsilon: float = 0.0001,
+) -> str:
+    """Write {id}.json and (if GPS available) {id}.geojson. Returns the ID.
+
+    Files go to ``output_dir / "activities"``, which is created on demand.
+    The GeoJSON track is written only when the metrics contain a bounding
+    box and *privacy* is neither 'no_gps' nor 'private'.
+
+    Both files are written as UTF-8: the JSON is dumped with
+    ``ensure_ascii=False``, so relying on the platform's default encoding
+    would fail or corrupt non-ASCII titles on non-UTF-8 locales.
+    """
+    activity_id = make_activity_id(activity)
+    acts_dir = output_dir / "activities"
+    acts_dir.mkdir(parents=True, exist_ok=True)
+
+    source = _infer_source(activity)
+    has_gps = metrics.bbox is not None and privacy not in ("no_gps", "private")
+
+    # ── detail JSON ──────────────────────────────────────────────────────────
+    detail: dict = {
+        "bas_version": "1.0",
+        "id": activity_id,
+        "title": activity.title or _auto_title(activity),
+        "description": activity.description,
+        "sport": activity.sport,
+        "sub_sport": activity.sub_sport,
+        "started_at": activity.started_at.isoformat(),
+        "distance_m": metrics.distance_m,
+        "duration_s": metrics.duration_s,
+        "moving_time_s": metrics.moving_time_s,
+        "elevation_gain_m": metrics.elevation_gain_m,
+        "elevation_loss_m": metrics.elevation_loss_m,
+        "avg_speed_kmh": metrics.avg_speed_kmh,
+        "max_speed_kmh": metrics.max_speed_kmh,
+        "avg_hr_bpm": metrics.avg_hr_bpm,
+        "max_hr_bpm": metrics.max_hr_bpm,
+        "avg_cadence_rpm": metrics.avg_cadence_rpm,
+        "avg_power_w": metrics.avg_power_w,
+        "max_power_w": metrics.max_power_w,
+        "gear": activity.gear,
+        "device": activity.device,
+        "bbox": list(metrics.bbox) if metrics.bbox else None,
+        "start_latlng": list(metrics.start_latlng) if metrics.start_latlng else None,
+        "end_latlng": list(metrics.end_latlng) if metrics.end_latlng else None,
+        "laps": [_serialise_lap(lap) for lap in activity.laps],
+        "timeseries": build_timeseries(activity.points, activity.started_at, privacy),
+        "source": source,
+        "source_file": activity.source_file,
+        "source_hash": activity.source_hash,
+        "strava_id": activity.strava_id,
+        "duplicate_of": duplicate_of,
+        "privacy": privacy,
+        "custom": {},
+    }
+
+    json_path = acts_dir / f"{activity_id}.json"
+    json_path.write_text(
+        json.dumps(detail, indent=2, ensure_ascii=False),
+        encoding="utf-8",
+    )
+
+    # ── GeoJSON track ────────────────────────────────────────────────────────
+    if has_gps:
+        geojson = build_geojson(activity.points, activity_id, epsilon=rdp_epsilon)
+        geojson_path = acts_dir / f"{activity_id}.geojson"
+        geojson_path.write_text(
+            json.dumps(geojson, indent=2, ensure_ascii=False),
+            encoding="utf-8",
+        )
+
+    return activity_id
+
+
+def build_summary(
+    activity: ParsedActivity,
+    metrics: ComputedMetrics,
+    activity_id: str,
+    privacy: str = "public",
+) -> dict:
+    """Assemble the Activity Summary entry that goes into index.json.
+
+    ``track_url`` is set only when the metrics contain a bounding box and
+    *privacy* is neither 'no_gps' nor 'private'; otherwise it is ``None``.
+    """
+    track_allowed = privacy not in ("no_gps", "private")
+    has_track = metrics.bbox is not None and track_allowed
+
+    summary: dict = {
+        "id": activity_id,
+        "title": activity.title or _auto_title(activity),
+        "sport": activity.sport,
+        "sub_sport": activity.sub_sport,
+        "started_at": activity.started_at.isoformat(),
+    }
+
+    # Metric fields are copied over under their own attribute names.
+    metric_fields = (
+        "distance_m",
+        "duration_s",
+        "moving_time_s",
+        "elevation_gain_m",
+        "avg_speed_kmh",
+        "max_speed_kmh",
+        "avg_hr_bpm",
+        "max_hr_bpm",
+        "avg_cadence_rpm",
+        "avg_power_w",
+    )
+    for field_name in metric_fields:
+        summary[field_name] = getattr(metrics, field_name)
+
+    summary["source"] = _infer_source(activity)
+    summary["privacy"] = privacy
+    summary["detail_url"] = f"activities/{activity_id}.json"
+    summary["track_url"] = f"activities/{activity_id}.geojson" if has_track else None
+    return summary
+
+
+def write_index(summaries: list[dict], output_dir: Path, owner: dict) -> None:
+    """Write index.json (sorted newest first).
+
+    Summaries are ordered by the instant their ``started_at`` denotes, not by
+    the raw string: ISO strings carrying different UTC offsets do not sort
+    chronologically as text. A ``started_at`` that cannot be parsed sorts
+    last. *output_dir* is created if missing and the file is written as
+    UTF-8 (the JSON is dumped with ``ensure_ascii=False``).
+    """
+    from datetime import datetime, timezone
+
+    def _instant(summary: dict) -> float:
+        """POSIX timestamp of the summary's start; -inf when unparseable."""
+        try:
+            started = datetime.fromisoformat(summary["started_at"])
+        except (TypeError, ValueError):
+            return float("-inf")
+        if started.tzinfo is None:
+            # Naive values are compared as UTC so they can be ordered
+            # against timezone-aware ones.
+            started = started.replace(tzinfo=timezone.utc)
+        return started.timestamp()
+
+    sorted_summaries = sorted(summaries, key=_instant, reverse=True)
+    index = {
+        "bas_version": "1.0",
+        "owner": owner,
+        "generated_at": _now_iso(),
+        "shards": [],
+        "activities": sorted_summaries,
+    }
+    output_dir.mkdir(parents=True, exist_ok=True)
+    (output_dir / "index.json").write_text(
+        json.dumps(index, indent=2, ensure_ascii=False),
+        encoding="utf-8",
+    )
+
+
+# ── helpers ──────────────────────────────────────────────────────────────────
+
+def _now_iso() -> str:
+    """Return the current UTC time as an ISO 8601 string."""
+    import datetime as _dt
+
+    current = _dt.datetime.now(tz=_dt.timezone.utc)
+    return current.isoformat()
+
+
+def _auto_title(activity: ParsedActivity) -> str:
+    """Build a fallback title such as "Morning Cycling" from start hour and sport.
+
+    Hours 5-11 are "Morning", 12-16 "Afternoon", 17-20 "Evening", and the
+    rest "Night". The hour is read from ``started_at`` as stored.
+    """
+    hour = activity.started_at.hour
+    if hour < 5 or hour >= 21:
+        part_of_day = "Night"
+    elif hour < 12:
+        part_of_day = "Morning"
+    elif hour < 17:
+        part_of_day = "Afternoon"
+    else:
+        part_of_day = "Evening"
+    return f"{part_of_day} {activity.sport.capitalize()}"
+
+
+def _infer_source(activity: ParsedActivity) -> str | None:
+    """Guess where an activity came from, using its Strava ID and file name.
+
+    Checked in order: a Strava ID gives "strava_export"; a name containing
+    "activity" with at least two dots gives "karoo"; otherwise the file
+    extension (optionally followed by ".gz") picks "fit_file", "gpx_file" or
+    "tcx_file". Returns ``None`` when nothing matches.
+    """
+    if activity.strava_id:
+        return "strava_export"
+
+    filename = activity.source_file.lower()
+
+    # Karoo heuristic: "activity" somewhere in the name plus two or more dots.
+    if "activity" in filename and filename.count(".") >= 2:
+        return "karoo"
+
+    by_extension = (
+        (".fit", "fit_file"),
+        (".gpx", "gpx_file"),
+        (".tcx", "tcx_file"),
+    )
+    for extension, label in by_extension:
+        if filename.endswith((extension, extension + ".gz")):
+            return label
+    return None
+
+
+def _slugify(text: str) -> str:
+    """Turn *text* into a lower-case ASCII slug of at most 60 characters.
+
+    Accented characters are reduced to their ASCII base letters, every run
+    of other characters becomes a single hyphen, and the result never starts
+    or ends with a hyphen. May return an empty string.
+    """
+    text = unicodedata.normalize("NFKD", text)
+    text = text.encode("ascii", "ignore").decode("ascii")
+    text = text.lower()
+    text = re.sub(r"[^a-z0-9]+", "-", text)
+    # Truncation can cut right after a separator, so strip trailing hyphens
+    # again once the length limit has been applied.
+    return text.strip("-")[:60].rstrip("-")
+
+
+def _serialise_lap(lap: LapData) -> dict:
+    """Convert a lap into a plain dict, with ``started_at`` as an ISO 8601 string."""
+    serialised: dict = {
+        "index": lap.index,
+        "started_at": lap.started_at.isoformat(),
+    }
+    # The remaining fields are copied verbatim under their attribute names.
+    passthrough = (
+        "duration_s",
+        "distance_m",
+        "elevation_gain_m",
+        "avg_speed_kmh",
+        "avg_hr_bpm",
+        "avg_power_w",
+    )
+    for field_name in passthrough:
+        serialised[field_name] = getattr(lap, field_name)
+    return serialised