# bincio-activity/scripts/strava_elevation_audit.py

"""Audit elevation accuracy vs Strava.

Friends add a note with the Strava elevation to their activity descriptions.
Supported formats (case-insensitive):
  - "strava 1323md+"            most common
  - "strava 1323 m d+"
  - "Strava 1625 m d+"
  - "Strava Elevation 1173m"
  - "1038 m d+ Strava"          number before the word strava
  - "Strava 207 metri di dislivello"

Descriptions live in _merged/activities/ (sidecar merge).
Computed elevation_gain_m is read from activities/ (main file).

Usage:
    uv run scripts/strava_elevation_audit.py [--data-dir /var/bincio/data] [--out elevation_audit.csv]
"""

from __future__ import annotations

import argparse
import csv
import json
import re
import sys
from pathlib import Path

from bincio.extract.metrics import elevation_params

# Patterns tried in order; first match wins.
# Each pattern must have exactly one capturing group for the numeric value.
_PATTERNS: list[re.Pattern] = [
    # "strava NNN m ..."  or  "strava NNNmd+"
    re.compile(r'\bstrava\b\s*([0-9][0-9.,]*)\s*m', re.IGNORECASE),
    # "Strava Elevation NNNm"  or  "Strava ... NNNm"  (one word between)
    re.compile(r'\bstrava\b\s+\w+\s+([0-9][0-9.,]*)\s*m', re.IGNORECASE),
    # "NNN m ... strava"  (number comes first, up to 20 chars before strava)
    re.compile(r'([0-9][0-9.,]*)\s*m\b.{0,20}?\bstrava\b', re.IGNORECASE),
    # "Strava NNN metri di dislivello"  (Italian)
    re.compile(r'\bstrava\b.*?([0-9][0-9.,]*)\s+metr', re.IGNORECASE),
]


def _find_strava_elevation(description: str) -> float | None:
    for pat in _PATTERNS:
        m = pat.search(description)
        if m:
            raw = m.group(1).replace(',', '.')
            try:
                return float(raw)
            except ValueError:
                continue
    return None


def _read_json_object(path: Path) -> dict | None:
    """Load *path* as JSON and return it only if the root is an object.

    Returns ``None`` when the file cannot be read, is not valid UTF-8/JSON
    (``UnicodeDecodeError`` and ``json.JSONDecodeError`` are both
    ``ValueError`` subclasses), or holds something other than a JSON object.
    """
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        return None
    return data if isinstance(data, dict) else None


def audit(data_dir: Path, out_path: Path) -> list[dict]:
    """Compare computed elevation gain with the Strava value noted by users.

    Scans ``<data_dir>/<user>/_merged/activities/*.json`` for descriptions
    mentioning "strava", parses the elevation from the note, and pairs it
    with ``elevation_gain_m`` from ``<data_dir>/<user>/activities/<same
    name>`` (falling back to the merged file when the main file is missing
    or unreadable).

    Side effects:
        * Writes all rows as CSV to *out_path* (only when at least one row
          was produced).
        * Prints the descriptions that mention Strava but could not be
          parsed.

    Args:
        data_dir: Root directory containing one sub-directory per user.
        out_path: Destination CSV file.

    Returns:
        One dict per audited activity, sorted by absolute ``delta_m``
        (largest first; rows without a delta sort last).
    """
    rows: list[dict] = []
    unmatched: list[tuple[str, str]] = []  # (path, desc) couldn't parse elevation

    for merged_path in sorted(data_dir.glob("*/_merged/activities/*.json")):
        file_name = merged_path.name
        if merged_path.suffix != ".json":
            continue
        # Sidecar artefacts living next to the activity files are not activities.
        if ".timeseries." in file_name or ".geojson" in file_name:
            continue

        merged = _read_json_object(merged_path)
        if merged is None:
            continue

        description = merged.get("description") or ""
        # A non-string description cannot hold a note; skip instead of crashing.
        if not isinstance(description, str) or "strava" not in description.lower():
            continue

        # Skip strava:// athlete-mention links (not elevation notes)
        if re.search(r'strava://', description, re.IGNORECASE):
            continue

        strava_elev = _find_strava_elevation(description)
        if strava_elev is None:
            unmatched.append((str(merged_path), description))
            continue

        # Layout: <data_dir>/<user>/_merged/activities/<file>, so parents[2]
        # is the user directory holding the main "activities" folder.
        user_dir = merged_path.parents[2]
        activity = _read_json_object(user_dir / "activities" / file_name)
        if activity is None:
            activity = merged  # fall back to merged values

        our_elev = activity.get("elevation_gain_m")
        altitude_source = activity.get("altitude_source") or "unknown"
        source = activity.get("source") or ""
        # elevation_params is a project helper; its two results are only
        # reported here, not interpreted.
        ma_window, threshold = elevation_params(altitude_source, source)

        if our_elev is None:
            delta = None
            pct = None
        else:
            delta = round(our_elev - strava_elev, 1)
            # A relative error is undefined against a zero reference value.
            pct = (
                round((our_elev - strava_elev) / strava_elev * 100, 1)
                if strava_elev != 0
                else None
            )

        rows.append({
            "file": file_name,
            "user": user_dir.name,
            "title": activity.get("title") or merged.get("title") or merged_path.stem,
            "device": activity.get("device") or "unknown",
            "altitude_source": altitude_source,
            "source": source,
            "ma_window_s": ma_window,
            "threshold_m": threshold,
            "our_elevation_m": our_elev,
            "strava_elevation_m": strava_elev,
            "delta_m": delta,
            "delta_pct": pct,
            "description": description[:120].replace("\n", " ").replace("\r", ""),
        })

    # Largest disagreement first; rows without a delta count as 0.
    rows.sort(key=lambda r: abs(r["delta_m"] or 0), reverse=True)

    if rows:
        with out_path.open("w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=list(rows[0].keys()))
            writer.writeheader()
            writer.writerows(rows)

    if unmatched:
        print(f"\nCould not parse elevation from {len(unmatched)} description(s):")
        for path, desc in unmatched:
            print(f"  {Path(path).name}  {desc[:80]!r}")

    return rows


def main() -> None:
    """Command-line entry point: run the audit and print a report.

    Parses ``--data-dir`` and ``--out``, exits with status 1 if the data
    directory does not exist, then prints one table line per audited
    activity followed by summary statistics (mean and median percentage
    error, mean absolute-metre delta, share of activities within ±10 % and
    ±15 %).
    """
    ap = argparse.ArgumentParser(description="Audit elevation accuracy vs Strava notes")
    ap.add_argument("--data-dir", default="/var/bincio/data", type=Path)
    ap.add_argument("--out", default="elevation_audit.csv", type=Path)
    args = ap.parse_args()

    if not args.data_dir.exists():
        print(f"ERROR: data dir not found: {args.data_dir}", file=sys.stderr)
        sys.exit(1)

    print(f"Scanning {args.data_dir} …")
    rows = audit(args.data_dir, args.out)

    if not rows:
        print("No activities found with a parseable Strava elevation note.")
        return

    print(f"\nFound {len(rows)} activit{'y' if len(rows)==1 else 'ies'}:\n")
    header = (
        f"{'File':<50} {'User':<15} {'Source':<16} {'AltSrc':<12}"
        f" {'MA':>4} {'Thr':>5} {'Ours':>8} {'Strava':>8} {'Delta':>8} {'Delta%':>7}"
    )
    print(header)
    print("-" * len(header))
    for r in rows:
        # Values that could not be computed are shown as "n/a".
        delta_str = f"{r['delta_m']:+.0f}" if r['delta_m'] is not None else "n/a"
        pct_str   = f"{r['delta_pct']:+.1f}%" if r['delta_pct'] is not None else "n/a"
        our_str   = f"{r['our_elevation_m']:.0f}" if r['our_elevation_m'] is not None else "n/a"
        print(
            f"{r['file']:<50} {r['user']:<15} {r['source']:<16} {r['altitude_source']:<12}"
            f" {r['ma_window_s']:>4} {r['threshold_m']:>5.1f}"
            f" {our_str:>8} {r['strava_elevation_m']:>8.0f}"
            f" {delta_str:>8} {pct_str:>7}"
        )

    n = len(rows)
    pcts = [r["delta_pct"] for r in rows if r["delta_pct"] is not None]
    deltas = [r["delta_m"] for r in rows if r["delta_m"] is not None]
    if pcts:
        # Only rows with a percentage take part in the statistics, so they
        # (not the total row count) are the denominator of the fractions.
        compared = len(pcts)
        avg_pct = sum(pcts) / compared

        # True median: mean of the two central values for an even count.
        ordered = sorted(pcts)
        mid = compared // 2
        if compared % 2:
            median_pct = ordered[mid]
        else:
            median_pct = (ordered[mid - 1] + ordered[mid]) / 2

        within_10 = sum(1 for p in pcts if abs(p) <= 10)
        within_15 = sum(1 for p in pcts if abs(p) <= 15)
        avg_d = sum(deltas) / len(deltas) if deltas else 0
        print(
            f"\n  n={n}  avg={avg_pct:+.1f}%  median={median_pct:+.1f}%"
            f"  avg delta={avg_d:+.0f} m"
            f"  within ±10%: {within_10}/{compared}  within ±15%: {within_15}/{compared}"
        )

    print(f"\nCSV saved to: {args.out}")


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()