"""Audit elevation accuracy vs Strava. Friends add a note with the Strava elevation to their activity descriptions. Supported formats (case-insensitive): - "strava 1323md+" most common - "strava 1323 m d+" - "Strava 1625 m d+" - "Strava Elevation 1173m" - "1038 m d+ Strava" number before the word strava - "Strava 207 metri di dislivello" Descriptions live in _merged/activities/ (sidecar merge). Computed elevation_gain_m is read from activities/ (main file). Usage: uv run scripts/strava_elevation_audit.py [--data-dir /var/bincio/data] [--out elevation_audit.csv] """ from __future__ import annotations import argparse import csv import json import re import sys from pathlib import Path from bincio.extract.metrics import elevation_params # Patterns tried in order; first match wins. # Each pattern must have exactly one capturing group for the numeric value. _PATTERNS: list[re.Pattern] = [ # "strava NNN m ..." or "strava NNNmd+" re.compile(r'\bstrava\b\s*([0-9][0-9.,]*)\s*m', re.IGNORECASE), # "Strava Elevation NNNm" or "Strava ... NNNm" (one word between) re.compile(r'\bstrava\b\s+\w+\s+([0-9][0-9.,]*)\s*m', re.IGNORECASE), # "NNN m ... strava" (number comes first, up to 20 chars before strava) re.compile(r'([0-9][0-9.,]*)\s*m\b.{0,20}?\bstrava\b', re.IGNORECASE), # "Strava NNN metri di dislivello" (Italian) re.compile(r'\bstrava\b.*?([0-9][0-9.,]*)\s+metr', re.IGNORECASE), ] def _find_strava_elevation(description: str) -> float | None: for pat in _PATTERNS: m = pat.search(description) if m: raw = m.group(1).replace(',', '.') try: return float(raw) except ValueError: continue return None def audit(data_dir: Path, out_path: Path) -> list[dict]: rows: list[dict] = [] unmatched: list[tuple[str, str]] = [] # (path, desc) couldn't parse elevation for merged_path in sorted(data_dir.glob("*/_merged/activities/*.json")): if merged_path.suffix != ".json": continue if ".timeseries." in merged_path.name or ".geojson" in merged_path.name: continue try: merged = json.loads(merged_path.read_text(encoding="utf-8")) except (json.JSONDecodeError, OSError): continue description = merged.get("description") or "" if not description or "strava" not in description.lower(): continue # Skip strava:// athlete-mention links (not elevation notes) if re.search(r'strava://', description, re.IGNORECASE): continue strava_elev = _find_strava_elevation(description) if strava_elev is None: unmatched.append((str(merged_path), description)) continue # Read computed elevation from main activity file main_path = ( merged_path.parents[3] # data_dir / merged_path.parents[2].name # user / "activities" / merged_path.name ) try: main = json.loads(main_path.read_text(encoding="utf-8")) except (json.JSONDecodeError, OSError): main = merged # fall back to merged values our_elev = main.get("elevation_gain_m") title = main.get("title") or merged.get("title") or merged_path.stem user = merged_path.parents[2].name altitude_source = main.get("altitude_source") or "unknown" source = main.get("source") or "" device = main.get("device") or "unknown" ma_window, threshold = elevation_params(altitude_source, source) delta = round(our_elev - strava_elev, 1) if our_elev is not None else None pct = ( round((our_elev - strava_elev) / strava_elev * 100, 1) if our_elev is not None and strava_elev != 0 else None ) rows.append({ "file": merged_path.name, "user": user, "title": title, "device": device, "altitude_source": altitude_source, "source": source, "ma_window_s": ma_window, "threshold_m": threshold, "our_elevation_m": our_elev, "strava_elevation_m": strava_elev, "delta_m": delta, "delta_pct": pct, "description": description[:120].replace("\n", " ").replace("\r", ""), }) rows.sort(key=lambda r: abs(r["delta_m"] or 0), reverse=True) if rows: with out_path.open("w", newline="", encoding="utf-8") as f: writer = csv.DictWriter(f, fieldnames=list(rows[0].keys())) writer.writeheader() writer.writerows(rows) if unmatched: print(f"\nCould not parse elevation from {len(unmatched)} description(s):") for path, desc in unmatched: print(f" {Path(path).name} {desc[:80]!r}") return rows def main() -> None: ap = argparse.ArgumentParser(description="Audit elevation accuracy vs Strava notes") ap.add_argument("--data-dir", default="/var/bincio/data", type=Path) ap.add_argument("--out", default="elevation_audit.csv", type=Path) args = ap.parse_args() if not args.data_dir.exists(): print(f"ERROR: data dir not found: {args.data_dir}", file=sys.stderr) sys.exit(1) print(f"Scanning {args.data_dir} …") rows = audit(args.data_dir, args.out) if not rows: print("No activities found with a parseable Strava elevation note.") return print(f"\nFound {len(rows)} activit{'y' if len(rows)==1 else 'ies'}:\n") header = ( f"{'File':<50} {'User':<15} {'Source':<16} {'AltSrc':<12}" f" {'MA':>4} {'Thr':>5} {'Ours':>8} {'Strava':>8} {'Delta':>8} {'Delta%':>7}" ) print(header) print("-" * len(header)) for r in rows: delta_str = f"{r['delta_m']:+.0f}" if r['delta_m'] is not None else "n/a" pct_str = f"{r['delta_pct']:+.1f}%" if r['delta_pct'] is not None else "n/a" our_str = f"{r['our_elevation_m']:.0f}" if r['our_elevation_m'] is not None else "n/a" print( f"{r['file']:<50} {r['user']:<15} {r['source']:<16} {r['altitude_source']:<12}" f" {r['ma_window_s']:>4} {r['threshold_m']:>5.1f}" f" {our_str:>8} {r['strava_elevation_m']:>8.0f}" f" {delta_str:>8} {pct_str:>7}" ) n = len(rows) pcts = [r["delta_pct"] for r in rows if r["delta_pct"] is not None] deltas = [r["delta_m"] for r in rows if r["delta_m"] is not None] if pcts: avg_pct = sum(pcts) / len(pcts) sorted_pcts = sorted(pcts) median_pct = sorted_pcts[len(sorted_pcts) // 2] within_10 = sum(1 for p in pcts if abs(p) <= 10) within_15 = sum(1 for p in pcts if abs(p) <= 15) avg_d = sum(deltas) / len(deltas) if deltas else 0 print( f"\n n={n} avg={avg_pct:+.1f}% median={median_pct:+.1f}%" f" avg delta={avg_d:+.0f} m" f" within ±10%: {within_10}/{n} within ±15%: {within_15}/{n}" ) print(f"\nCSV saved to: {args.out}") if __name__ == "__main__": main()