Indoor detection: title-based inference in merge layer + fix _merge_all_locked

- Add _INDOOR_TITLE_RE / _infer_indoor_title() to writer.py (matches zwift, ftp-builder, turbo-trainer, rodillo); this replaces the narrower zwift-only regex that was local to write_athlete_json.
- _is_outdoor now delegates to _infer_indoor_title, so all four keywords are excluded from records and MMP aggregation.
- apply_sidecar and _apply_sidecar_summary both set sub_sport=indoor when the title matches and no explicit sub_sport is already present.
- _merge_one_locked: detect title-inferred activities as needs_merge and call apply_sidecar(detail, {}, "") so the _merged copy gets sub_sport=indoor written.
- _merge_all_locked: read the index up front to populate to_merge with title-inferred IDs; call apply_sidecar(detail, {}, "") for activities in to_merge that have no sidecar; apply _apply_sidecar_summary to ALL summary entries (not only those with sidecars).
2026-05-15 01:03:17 +02:00
parent 0fbb7822df
commit 4ea2292e2b
2 changed files with 49 additions and 12 deletions
@@ -10,6 +10,18 @@ from bincio.extract.models import LapData, ParsedActivity
 from bincio.extract.simplify import build_geojson, preview_coords
 from bincio.extract.timeseries import build_timeseries
 # Titles that reliably identify indoor/virtual activities regardless of sub_sport metadata.
 # Strava imports from Zwift and FTP-builder platforms lose sub_sport on export.
 _INDOOR_TITLE_RE = re.compile(
    r'\b(zwift|ftp[\s\-]builder|turbo[\s\-]?trainer|rodillo)\b',
    re.IGNORECASE,
 )
 def _infer_indoor_title(title: str) -> bool:
    """Return True if the title identifies an indoor/virtual activity.

    The title is searched case-insensitively for one of the whole-word
    keywords in ``_INDOOR_TITLE_RE``: "zwift", "ftp builder" / "ftp-builder",
    "turbo trainer" / "turbo-trainer" / "turbotrainer", or "rodillo".

    Args:
        title: Activity title. A missing or non-string value (e.g. ``None``
            from a record without a title) is treated as "no match" instead
            of raising ``TypeError``, so callers need not pre-sanitize it.

    Returns:
        True when a keyword is found, False otherwise.
    """
    # Guard here once rather than relying on every caller to pass `x or ""`.
    if not isinstance(title, str):
        return False
    return _INDOOR_TITLE_RE.search(title) is not None
 def make_activity_id(activity: ParsedActivity) -> str:
    """Generate a BAS activity ID from started_at + optional title slug.
@@ -278,14 +290,11 @@ def write_athlete_json(summaries: list[dict], output_dir: Path, athlete_config:
        return [[d, w] for d, w in sorted(best.items())]
    _INDOOR_SUB_SPORTS = {"indoor", "treadmill", "virtual"}
    _INDOOR_TITLE_RE = re.compile(r'\bzwift\b', re.IGNORECASE)
    def _is_outdoor(s: dict) -> bool:
        if s.get("sub_sport") in _INDOOR_SUB_SPORTS:
            return False
-        if _INDOOR_TITLE_RE.search(s.get("title") or ""):
+        return not _infer_indoor_title(s.get("title") or "")
            return False
        return True
    all_mmps = [s["mmp"] for s in summaries if s.get("mmp") and _is_outdoor(s)]
    mmps_365 = [s["mmp"] for s in summaries if s.get("mmp") and _is_outdoor(s) and s["started_at"] >= cutoff_365]
@@ -71,6 +71,7 @@ def parse_sidecar(path: Path) -> tuple[dict, str]:
 def apply_sidecar(detail: dict, fm: dict, body: str) -> dict:
    """Apply sidecar overrides to a detail JSON dict. Returns a modified copy."""
    from bincio.extract.writer import _infer_indoor_title
    d = dict(detail)
    d.setdefault("custom", {})
    d["custom"] = dict(d["custom"])  # don't mutate original
@@ -81,6 +82,9 @@ def apply_sidecar(detail: dict, fm: dict, body: str) -> dict:
        d["sport"] = str(fm["sport"])
    if "sub_sport" in fm:
        d["sub_sport"] = str(fm["sub_sport"]) if fm["sub_sport"] else None
    # Infer indoor from title when sub_sport is still absent after sidecar
    if not d.get("sub_sport") and _infer_indoor_title(d.get("title") or ""):
        d["sub_sport"] = "indoor"
    if "gear" in fm:
        d["gear"] = str(fm["gear"]) if fm["gear"] else d.get("gear")
    if body:
@@ -99,6 +103,7 @@ def apply_sidecar(detail: dict, fm: dict, body: str) -> dict:
 def _apply_sidecar_summary(summary: dict, fm: dict) -> dict:
    """Apply sidecar overrides to an index summary entry."""
    from bincio.extract.writer import _infer_indoor_title
    s = dict(summary)
    s.setdefault("custom", {})
    s["custom"] = dict(s["custom"])
@@ -113,6 +118,9 @@ def _apply_sidecar_summary(summary: dict, fm: dict) -> dict:
        s["custom"]["highlight"] = bool(fm["highlight"])
    if "private" in fm:
        s["privacy"] = "unlisted" if fm["private"] else summary.get("privacy", "public")
    # Infer indoor from title when sub_sport is still absent
    if not s.get("sub_sport") and _infer_indoor_title(s.get("title") or ""):
        s["sub_sport"] = "indoor"
    return s
@@ -156,6 +164,12 @@ def _merge_one_locked(data_dir: Path, activity_id: str) -> None:
        )
    needs_merge = has_sidecar or bool(image_files)
    # Also need a real file (not symlink) when title inference would change sub_sport
    if not needs_merge and not has_sidecar:
        from bincio.extract.writer import _infer_indoor_title
        _peek = json.loads(src.read_text(encoding="utf-8"))
        if not _peek.get("sub_sport") and _infer_indoor_title(_peek.get("title") or ""):
            needs_merge = True
    # Symlink the timeseries file (never merged — always points to the extract output)
    ts_src = acts_dir / f"{activity_id}.timeseries.json"
@@ -170,10 +184,13 @@ def _merge_one_locked(data_dir: Path, activity_id: str) -> None:
        dest.unlink()
    if needs_merge:
-        detail = json.loads(src.read_text(encoding="utf-8"))
+        detail = locals().get("_peek") or json.loads(src.read_text(encoding="utf-8"))
        if has_sidecar:
            fm, body = parse_sidecar(sidecar_path)  # type: ignore[arg-type]
            detail = apply_sidecar(detail, fm, body)
        else:
            # No sidecar — still apply title inference
            detail = apply_sidecar(detail, {}, "")
        if image_files:
            detail["custom"] = dict(detail.get("custom") or {})
            detail["custom"]["images"] = image_files
@@ -195,9 +212,8 @@ def _merge_one_locked(data_dir: Path, activity_id: str) -> None:
    activities = []
    for s in index.get("activities", []):
        aid = s.get("id", "")
-        if aid in all_sidecars:
+        fm, _ = all_sidecars[aid] if aid in all_sidecars else ({}, "")
-            fm, _ = all_sidecars[aid]
+        s = _apply_sidecar_summary(s, fm)
            s = _apply_sidecar_summary(s, fm)
        activities.append(s)
    activities.sort(key=lambda a: a.get("started_at", ""), reverse=True)
@@ -243,6 +259,17 @@ def _merge_all_locked(data_dir: Path) -> int:
    to_merge = set(sidecars) | set(image_lists)
    # Also include activities whose title implies indoor (no sidecar required)
    _index_path = data_dir / "index.json"
    _cached_index: dict | None = None
    if _index_path.exists():
        from bincio.extract.writer import _infer_indoor_title
        _cached_index = json.loads(_index_path.read_text(encoding="utf-8"))
        for _s in _cached_index.get("activities", []):
            _aid = _s.get("id", "")
            if _aid and not _s.get("sub_sport") and _infer_indoor_title(_s.get("title") or ""):
                to_merge.add(_aid)
    # Wipe and recreate _merged/activities/
    shutil.rmtree(merged_acts, ignore_errors=True)
    merged_acts.mkdir(parents=True, exist_ok=True)
@@ -259,6 +286,8 @@ def _merge_all_locked(data_dir: Path) -> int:
                if activity_id in sidecars:
                    fm, body = sidecars[activity_id]
                    detail = apply_sidecar(detail, fm, body)
                else:
                    detail = apply_sidecar(detail, {}, "")
                if activity_id in image_lists:
                    detail["custom"] = dict(detail.get("custom") or {})
                    detail["custom"]["images"] = image_lists[activity_id]
@@ -303,13 +332,12 @@ def _merge_all_locked(data_dir: Path) -> int:
    # Write merged index.json (private filtered, highlight sorted)
    index_path = data_dir / "index.json"
    if index_path.exists():
-        index = json.loads(index_path.read_text(encoding="utf-8"))
+        index = _cached_index or json.loads(index_path.read_text(encoding="utf-8"))
        activities = []
        for s in index.get("activities", []):
            aid = s.get("id", "")
-            if aid in sidecars:
+            fm, _ = sidecars[aid] if aid in sidecars else ({}, "")
-                fm, _ = sidecars[aid]
+            s = _apply_sidecar_summary(s, fm)
                s = _apply_sidecar_summary(s, fm)
            activities.append(s)
        # "unlisted" (and legacy "private") activities are kept in the index so