Indoor detection: title-based inference in merge layer + fix _merge_all_locked

- Add _INDOOR_TITLE_RE / _infer_indoor_title() to writer.py (matches zwift,
  ftp-builder, turbo-trainer, rodillo); replaces the narrower zwift-only regex
  that was local to write_athlete_json
- _is_outdoor now delegates to _infer_indoor_title so all four keywords are
  excluded from records and MMP aggregation
- apply_sidecar and _apply_sidecar_summary both set sub_sport=indoor when the
  title matches and no explicit sub_sport is already present
- _merge_one_locked: detect title-inferred activities as needs_merge and call
  apply_sidecar(detail, {}, "") so the _merged copy gets sub_sport=indoor written
- _merge_all_locked: read index upfront to populate to_merge with title-inferred
  IDs; call apply_sidecar(detail, {}, "") for activities in to_merge without
  sidecars; apply _apply_sidecar_summary to ALL summary entries (not only
  sidecar ones)
This commit is contained in:
Davide Scaini
2026-05-15 01:03:17 +02:00
parent 0fbb7822df
commit 4ea2292e2b
2 changed files with 49 additions and 12 deletions
+13 -4
View File
@@ -10,6 +10,18 @@ from bincio.extract.models import LapData, ParsedActivity
from bincio.extract.simplify import build_geojson, preview_coords from bincio.extract.simplify import build_geojson, preview_coords
from bincio.extract.timeseries import build_timeseries from bincio.extract.timeseries import build_timeseries
# Titles that reliably identify indoor/virtual activities regardless of sub_sport metadata.
# Strava imports from Zwift and FTP-builder platforms lose sub_sport on export.
_INDOOR_TITLE_RE = re.compile(
r'\b(zwift|ftp[\s\-]builder|turbo[\s\-]?trainer|rodillo)\b',
re.IGNORECASE,
)
def _infer_indoor_title(title: str) -> bool:
"""Return True if the title reliably identifies an indoor/virtual activity."""
return bool(_INDOOR_TITLE_RE.search(title))
def make_activity_id(activity: ParsedActivity) -> str: def make_activity_id(activity: ParsedActivity) -> str:
"""Generate a BAS activity ID from started_at + optional title slug. """Generate a BAS activity ID from started_at + optional title slug.
@@ -278,14 +290,11 @@ def write_athlete_json(summaries: list[dict], output_dir: Path, athlete_config:
return [[d, w] for d, w in sorted(best.items())] return [[d, w] for d, w in sorted(best.items())]
_INDOOR_SUB_SPORTS = {"indoor", "treadmill", "virtual"} _INDOOR_SUB_SPORTS = {"indoor", "treadmill", "virtual"}
_INDOOR_TITLE_RE = re.compile(r'\bzwift\b', re.IGNORECASE)
def _is_outdoor(s: dict) -> bool: def _is_outdoor(s: dict) -> bool:
if s.get("sub_sport") in _INDOOR_SUB_SPORTS: if s.get("sub_sport") in _INDOOR_SUB_SPORTS:
return False return False
if _INDOOR_TITLE_RE.search(s.get("title") or ""): return not _infer_indoor_title(s.get("title") or "")
return False
return True
all_mmps = [s["mmp"] for s in summaries if s.get("mmp") and _is_outdoor(s)] all_mmps = [s["mmp"] for s in summaries if s.get("mmp") and _is_outdoor(s)]
mmps_365 = [s["mmp"] for s in summaries if s.get("mmp") and _is_outdoor(s) and s["started_at"] >= cutoff_365] mmps_365 = [s["mmp"] for s in summaries if s.get("mmp") and _is_outdoor(s) and s["started_at"] >= cutoff_365]
+36 -8
View File
@@ -71,6 +71,7 @@ def parse_sidecar(path: Path) -> tuple[dict, str]:
def apply_sidecar(detail: dict, fm: dict, body: str) -> dict: def apply_sidecar(detail: dict, fm: dict, body: str) -> dict:
"""Apply sidecar overrides to a detail JSON dict. Returns a modified copy.""" """Apply sidecar overrides to a detail JSON dict. Returns a modified copy."""
from bincio.extract.writer import _infer_indoor_title
d = dict(detail) d = dict(detail)
d.setdefault("custom", {}) d.setdefault("custom", {})
d["custom"] = dict(d["custom"]) # don't mutate original d["custom"] = dict(d["custom"]) # don't mutate original
@@ -81,6 +82,9 @@ def apply_sidecar(detail: dict, fm: dict, body: str) -> dict:
d["sport"] = str(fm["sport"]) d["sport"] = str(fm["sport"])
if "sub_sport" in fm: if "sub_sport" in fm:
d["sub_sport"] = str(fm["sub_sport"]) if fm["sub_sport"] else None d["sub_sport"] = str(fm["sub_sport"]) if fm["sub_sport"] else None
# Infer indoor from title when sub_sport is still absent after sidecar
if not d.get("sub_sport") and _infer_indoor_title(d.get("title") or ""):
d["sub_sport"] = "indoor"
if "gear" in fm: if "gear" in fm:
d["gear"] = str(fm["gear"]) if fm["gear"] else d.get("gear") d["gear"] = str(fm["gear"]) if fm["gear"] else d.get("gear")
if body: if body:
@@ -99,6 +103,7 @@ def apply_sidecar(detail: dict, fm: dict, body: str) -> dict:
def _apply_sidecar_summary(summary: dict, fm: dict) -> dict: def _apply_sidecar_summary(summary: dict, fm: dict) -> dict:
"""Apply sidecar overrides to an index summary entry.""" """Apply sidecar overrides to an index summary entry."""
from bincio.extract.writer import _infer_indoor_title
s = dict(summary) s = dict(summary)
s.setdefault("custom", {}) s.setdefault("custom", {})
s["custom"] = dict(s["custom"]) s["custom"] = dict(s["custom"])
@@ -113,6 +118,9 @@ def _apply_sidecar_summary(summary: dict, fm: dict) -> dict:
s["custom"]["highlight"] = bool(fm["highlight"]) s["custom"]["highlight"] = bool(fm["highlight"])
if "private" in fm: if "private" in fm:
s["privacy"] = "unlisted" if fm["private"] else summary.get("privacy", "public") s["privacy"] = "unlisted" if fm["private"] else summary.get("privacy", "public")
# Infer indoor from title when sub_sport is still absent
if not s.get("sub_sport") and _infer_indoor_title(s.get("title") or ""):
s["sub_sport"] = "indoor"
return s return s
@@ -156,6 +164,12 @@ def _merge_one_locked(data_dir: Path, activity_id: str) -> None:
) )
needs_merge = has_sidecar or bool(image_files) needs_merge = has_sidecar or bool(image_files)
# Also need a real file (not symlink) when title inference would change sub_sport
if not needs_merge and not has_sidecar:
from bincio.extract.writer import _infer_indoor_title
_peek = json.loads(src.read_text(encoding="utf-8"))
if not _peek.get("sub_sport") and _infer_indoor_title(_peek.get("title") or ""):
needs_merge = True
# Symlink the timeseries file (never merged — always points to the extract output) # Symlink the timeseries file (never merged — always points to the extract output)
ts_src = acts_dir / f"{activity_id}.timeseries.json" ts_src = acts_dir / f"{activity_id}.timeseries.json"
@@ -170,10 +184,13 @@ def _merge_one_locked(data_dir: Path, activity_id: str) -> None:
dest.unlink() dest.unlink()
if needs_merge: if needs_merge:
detail = json.loads(src.read_text(encoding="utf-8")) detail = locals().get("_peek") or json.loads(src.read_text(encoding="utf-8"))
if has_sidecar: if has_sidecar:
fm, body = parse_sidecar(sidecar_path) # type: ignore[arg-type] fm, body = parse_sidecar(sidecar_path) # type: ignore[arg-type]
detail = apply_sidecar(detail, fm, body) detail = apply_sidecar(detail, fm, body)
else:
# No sidecar — still apply title inference
detail = apply_sidecar(detail, {}, "")
if image_files: if image_files:
detail["custom"] = dict(detail.get("custom") or {}) detail["custom"] = dict(detail.get("custom") or {})
detail["custom"]["images"] = image_files detail["custom"]["images"] = image_files
@@ -195,9 +212,8 @@ def _merge_one_locked(data_dir: Path, activity_id: str) -> None:
activities = [] activities = []
for s in index.get("activities", []): for s in index.get("activities", []):
aid = s.get("id", "") aid = s.get("id", "")
if aid in all_sidecars: fm, _ = all_sidecars[aid] if aid in all_sidecars else ({}, "")
fm, _ = all_sidecars[aid] s = _apply_sidecar_summary(s, fm)
s = _apply_sidecar_summary(s, fm)
activities.append(s) activities.append(s)
activities.sort(key=lambda a: a.get("started_at", ""), reverse=True) activities.sort(key=lambda a: a.get("started_at", ""), reverse=True)
@@ -243,6 +259,17 @@ def _merge_all_locked(data_dir: Path) -> int:
to_merge = set(sidecars) | set(image_lists) to_merge = set(sidecars) | set(image_lists)
# Also include activities whose title implies indoor (no sidecar required)
_index_path = data_dir / "index.json"
_cached_index: dict | None = None
if _index_path.exists():
from bincio.extract.writer import _infer_indoor_title
_cached_index = json.loads(_index_path.read_text(encoding="utf-8"))
for _s in _cached_index.get("activities", []):
_aid = _s.get("id", "")
if _aid and not _s.get("sub_sport") and _infer_indoor_title(_s.get("title") or ""):
to_merge.add(_aid)
# Wipe and recreate _merged/activities/ # Wipe and recreate _merged/activities/
shutil.rmtree(merged_acts, ignore_errors=True) shutil.rmtree(merged_acts, ignore_errors=True)
merged_acts.mkdir(parents=True, exist_ok=True) merged_acts.mkdir(parents=True, exist_ok=True)
@@ -259,6 +286,8 @@ def _merge_all_locked(data_dir: Path) -> int:
if activity_id in sidecars: if activity_id in sidecars:
fm, body = sidecars[activity_id] fm, body = sidecars[activity_id]
detail = apply_sidecar(detail, fm, body) detail = apply_sidecar(detail, fm, body)
else:
detail = apply_sidecar(detail, {}, "")
if activity_id in image_lists: if activity_id in image_lists:
detail["custom"] = dict(detail.get("custom") or {}) detail["custom"] = dict(detail.get("custom") or {})
detail["custom"]["images"] = image_lists[activity_id] detail["custom"]["images"] = image_lists[activity_id]
@@ -303,13 +332,12 @@ def _merge_all_locked(data_dir: Path) -> int:
# Write merged index.json (private filtered, highlight sorted) # Write merged index.json (private filtered, highlight sorted)
index_path = data_dir / "index.json" index_path = data_dir / "index.json"
if index_path.exists(): if index_path.exists():
index = json.loads(index_path.read_text(encoding="utf-8")) index = _cached_index or json.loads(index_path.read_text(encoding="utf-8"))
activities = [] activities = []
for s in index.get("activities", []): for s in index.get("activities", []):
aid = s.get("id", "") aid = s.get("id", "")
if aid in sidecars: fm, _ = sidecars[aid] if aid in sidecars else ({}, "")
fm, _ = sidecars[aid] s = _apply_sidecar_summary(s, fm)
s = _apply_sidecar_summary(s, fm)
activities.append(s) activities.append(s)
# "unlisted" (and legacy "private") activities are kept in the index so # "unlisted" (and legacy "private") activities are kept in the index so