second pass. low

2026-04-01 19:00:28 +02:00
parent 3d364c3992
commit bd5831c2fd
11 changed files with 277 additions and 62 deletions
@@ -554,10 +554,15 @@ async def upload_activity(file: UploadFile = File(...)) -> JSONResponse:
    if suffix not in _SUPPORTED_SUFFIXES:
        raise HTTPException(400, f"Unsupported file type '{Path(name).suffix}'. Expected FIT, GPX, or TCX.")
    _MAX_UPLOAD_BYTES = 50 * 1024 * 1024  # 50 MB
    contents = await file.read()
    if len(contents) > _MAX_UPLOAD_BYTES:
        raise HTTPException(413, f"File too large ({len(contents)} bytes). Maximum is 50 MB.")
    staging = dd / "_uploads"
    staging.mkdir(exist_ok=True)
    staged = staging / name
-    staged.write_bytes(await file.read())
+    staged.write_bytes(contents)
    try:
        from bincio.extract.metrics import compute
@@ -592,7 +597,7 @@ async def upload_activity(file: UploadFile = File(...)) -> JSONResponse:
    except HTTPException:
        raise
    except Exception as exc:
-        raise HTTPException(422, str(exc))
+        raise HTTPException(422, f"Failed to process activity file: {type(exc).__name__}")
    finally:
        staged.unlink(missing_ok=True)
@@ -46,6 +46,9 @@ def _process_file(path: Path) -> dict:
    """Runs inside a worker process. Only receives a Path (tiny pickle).
    All heavy shared data (_known_hashes, _strava_lookup, etc.) is already
    in the worker's memory from the initializer — zero per-task overhead.
    Writes to pending files (not final paths) so the main process can
    arbitrate collisions and pick the best version.
    """
    from bincio.extract.metrics import compute
    from bincio.extract.parsers.factory import parse_file
@@ -80,11 +83,17 @@ def _process_file(path: Path) -> dict:
            activity, metrics, _output_dir,
            privacy=_privacy,
            rdp_epsilon=_rdp_epsilon,
            pending=True,
        )
        summary = build_summary(activity, metrics, activity_id, _privacy)
    except Exception as exc:
        return {"status": "error", "path": str(path), "error": str(exc)}
    # Quality signals for the main process to compare competing results
    sensor_channels = sum(1 for v in [
        metrics.avg_hr_bpm, metrics.avg_power_w, metrics.avg_cadence_rpm,
    ] if v is not None)
    return {
        "status": "ok",
        "summary": summary,
@@ -94,6 +103,8 @@ def _process_file(path: Path) -> dict:
        "distance_m": metrics.distance_m,
        "source": summary.get("source"),
        "mmp": metrics.mmp,
        "point_count": len(activity.points),
        "sensor_channels": sensor_channels,
    }
@@ -177,6 +188,8 @@ def extract(
    summaries: list[dict] = []
    errors: list[tuple[str, str]] = []
    skipped = 0
    # Collect all pending results, grouped by activity_id for collision arbitration
    pending_by_id: dict[str, list[dict]] = {}
    with Progress(
        TextColumn("[progress.description]{task.description}"),
@@ -202,30 +215,61 @@ def extract(
                elif result["status"] == "error":
                    errors.append((result["path"], result["error"]))
                else:
-                    # Near-duplicate check — must be sequential (stateful)
+                    pending_by_id.setdefault(result["id"], []).append(result)
                    from datetime import datetime
                    started_at = datetime.fromisoformat(result["started_at"])
                    near_id = dedup.find_near_duplicate(started_at, result["distance_m"])
-                    if near_id:
+    # ── Arbitrate collisions and finalize pending files ───────────────────────
-                        canonical = dedup.pick_canonical(near_id, result.get("source"))
+    from bincio.extract.writer import (
-                        if canonical != "__new__":
+        activity_quality, cleanup_pending, finalize_pending, write_athlete_json, write_index,
-                            _patch_duplicate_of(cfg.output_dir, result["id"], near_id)
+    )
                            skipped += 1
                            continue
                        _patch_duplicate_of(cfg.output_dir, near_id, result["id"])
                        dedup._records[near_id].duplicate_of = result["id"]
-                    dedup.register(ActivityRecord(
+    for activity_id, candidates in pending_by_id.items():
-                        id=result["id"],
+        # Pick the best candidate by quality score
-                        source_hash=result["hash"],
+        candidates.sort(key=activity_quality, reverse=True)
-                        started_at=started_at,
+        winner = candidates[0]
-                        distance_m=result["distance_m"],
+
-                        source=result.get("source"),
+        # Clean up losing candidates' pending files
-                    ))
+        for loser in candidates[1:]:
-                    summaries.append(result["summary"])
+            cleanup_pending(cfg.output_dir, activity_id, loser["hash"])
            skipped += 1
        # Near-duplicate check against already-known activities
        from datetime import datetime
        started_at = datetime.fromisoformat(winner["started_at"])
        near_id = dedup.find_near_duplicate(started_at, winner["distance_m"])
        if near_id:
            canonical = dedup.pick_canonical(near_id, winner.get("source"))
            if canonical != "__new__":
                # Existing is better — finalize winner as duplicate, then patch it
                final_id = finalize_pending(cfg.output_dir, activity_id, winner["hash"])
                _patch_duplicate_of(cfg.output_dir, final_id, near_id)
                skipped += 1
                continue
            # New is better — patch the existing one as duplicate
            final_id = finalize_pending(cfg.output_dir, activity_id, winner["hash"])
            _patch_duplicate_of(cfg.output_dir, near_id, final_id)
            dedup._records[near_id].duplicate_of = final_id
        else:
            final_id = finalize_pending(cfg.output_dir, activity_id, winner["hash"])
        # Update summary with the finalized ID (may include hash suffix)
        summary = winner["summary"]
        if final_id != activity_id:
            summary = dict(summary)
            summary["id"] = final_id
            summary["detail_url"] = f"activities/{final_id}.json"
            if summary.get("track_url"):
                summary["track_url"] = f"activities/{final_id}.geojson"
        dedup.register(ActivityRecord(
            id=final_id,
            source_hash=winner["hash"],
            started_at=started_at,
            distance_m=winner["distance_m"],
            source=winner.get("source"),
        ))
        summaries.append(summary)
    from bincio.extract.writer import write_athlete_json, write_index
    existing = _load_existing_summaries(cfg.output_dir)
    merged = {s["id"]: s for s in existing}
    for s in summaries:
@@ -76,6 +76,8 @@ def _apply_extensions(pt: gpxpy.gpx.GPXTrackPoint, dp: DataPoint) -> None:
                    dp.temperature_c = float(val)
                elif tag == "speed":
                    dp.speed_kmh = float(val) * 3.6  # m/s → km/h
                elif tag in ("pwr", "power", "watts"):
                    dp.power_w = int(float(val))
 def _strip_ns(tag: str) -> str:
@@ -97,8 +97,8 @@ def _parse_ts(s: str) -> datetime:
                return datetime.strptime(s, fmt).replace(tzinfo=timezone.utc)
            except ValueError:
                continue
-    # Numeric offset like +02:00 or -05:30 — parse with %z then convert to UTC
+    # Numeric offset like +02:00, -05:30, or +0200 — parse with %z then convert to UTC
-    m = _re.match(r"^(.+)([+-]\d{2}:\d{2})$", s)
+    m = _re.match(r"^(.+)([+-]\d{2}:?\d{2})$", s)
    if m:
        body, off = m.group(1), m.group(2).replace(":", "")
        for fmt in ("%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S"):
@@ -67,8 +67,12 @@ def build_geojson(
        if p.lon is not None and p.lat is not None
    ]
-    # Parallel speed array for gradient coloring
+    # Parallel speed array for gradient coloring — same filter as coordinates
-    speeds = [round(p.speed_kmh, 2) if p.speed_kmh is not None else None for p in simplified]
+    speeds = [
        round(p.speed_kmh, 2) if p.speed_kmh is not None else None
        for p in simplified
        if p.lon is not None and p.lat is not None
    ]
    return {
        "type": "Feature",
@@ -33,8 +33,16 @@ def write_activity(
    privacy: str = "public",
    duplicate_of: str | None = None,
    rdp_epsilon: float = 0.0001,
    pending: bool = False,
 ) -> str:
-    """Write {id}.json and (if GPS available) {id}.geojson. Returns the ID."""
+    """Write {id}.json and (if GPS available) {id}.geojson. Returns the ID.
    When pending=True, writes to a uniquely-named pending file
    ({id}.{hash[:8]}.pending.json) instead of the final path. This avoids
    race conditions when multiple workers process activities with the same ID.
    The main process is responsible for promoting pending files to final paths
    via finalize_pending().
    """
    activity_id = make_activity_id(activity)
    acts_dir = output_dir / "activities"
    acts_dir.mkdir(parents=True, exist_ok=True)
@@ -82,26 +90,108 @@ def write_activity(
        "custom": {},
    }
-    json_path = acts_dir / f"{activity_id}.json"
+    if pending:
-    # Collision guard: if a *different* activity already has this ID, append a
+        # Write to a unique pending file — no collision possible
-    # short hash suffix to disambiguate (same hash = idempotent re-extract).
+        tag = activity.source_hash[-8:] if activity.source_hash else "unknown"
-    if json_path.exists():
+        json_path = acts_dir / f"{activity_id}.{tag}.pending.json"
-        existing = json.loads(json_path.read_text(encoding="utf-8"))
+    else:
-        if existing.get("source_hash") != activity.source_hash:
+        json_path = acts_dir / f"{activity_id}.json"
-            activity_id = f"{activity_id}-{activity.source_hash[-6:]}"
+        # Legacy non-pending path: collision guard for callers that don't use
-            json_path = acts_dir / f"{activity_id}.json"
+        # the pending workflow (e.g. edit server upload_activity)
-            detail["id"] = activity_id
+        if json_path.exists():
            existing = json.loads(json_path.read_text(encoding="utf-8"))
            if existing.get("source_hash") != activity.source_hash:
                activity_id = f"{activity_id}-{activity.source_hash[-6:]}"
                json_path = acts_dir / f"{activity_id}.json"
                detail["id"] = activity_id
    json_path.write_text(json.dumps(detail, indent=2, ensure_ascii=False))
    # ── GeoJSON track ────────────────────────────────────────────────────────
    if has_gps:
        geojson = build_geojson(activity.points, activity_id, epsilon=rdp_epsilon)
-        geojson_path = acts_dir / f"{activity_id}.geojson"
+        if pending:
            geojson_path = acts_dir / f"{activity_id}.{tag}.pending.geojson"
        else:
            geojson_path = acts_dir / f"{activity_id}.geojson"
        geojson_path.write_text(json.dumps(geojson, indent=2, ensure_ascii=False))
    return activity_id
 def activity_quality(result: dict) -> int:
    """Score a worker result so competing versions of one activity can be ranked.

    The score is the sum of three weighted terms read from ``result``:

    * the ``_SOURCE_QUALITY`` rank of ``result["source"]``, times 100
      (a missing or empty source, or one absent from the table, ranks 0);
    * ``result["sensor_channels"]``, times 10;
    * ``result["point_count"]`` capped at 50000, integer-divided by 100.

    Returns the total; a higher value means a better candidate. The main
    process uses it to choose between results that share an activity ID.
    """
    from bincio.extract.dedup import _SOURCE_QUALITY

    # Source type dominates the ranking (FIT > GPX > TCX per the dedup table).
    source_rank = _SOURCE_QUALITY.get(result.get("source") or "", 0)
    # Each recorded sensor channel is worth ten points.
    channel_count = result.get("sensor_channels", 0)
    # Point count acts as a tie-breaker; the cap keeps it below the other terms.
    capped_points = min(result.get("point_count", 0), 50000)
    return source_rank * 100 + channel_count * 10 + capped_points // 100
 def finalize_pending(output_dir: Path, activity_id: str, source_hash: str) -> str:
    """Promote a pending activity file (and its GeoJSON, if any) to its final path.

    The pending files are looked up as
    ``activities/{activity_id}.{tag}.pending.json`` / ``.pending.geojson``,
    where ``tag`` is the last 8 characters of ``source_hash`` (or
    ``"unknown"`` when the hash is empty).

    If ``activities/{activity_id}.json`` already exists and its stored
    ``source_hash`` differs, the new activity is disambiguated by appending
    ``-{source_hash[-6:]}`` to the ID, and the ``id`` recorded inside the
    pending JSON / GeoJSON is rewritten before promotion. When the stored
    hash is the same, the existing file is simply overwritten.

    Args:
        output_dir: Root output directory containing ``activities/``.
        activity_id: ID the pending files were written under.
        source_hash: Source hash of the activity being finalized.

    Returns:
        The final activity ID (may include the hash suffix).

    Raises:
        json.JSONDecodeError: If an existing or pending file is not valid JSON.
    """
    acts_dir = output_dir / "activities"
    tag = source_hash[-8:] if source_hash else "unknown"
    pending_json = acts_dir / f"{activity_id}.{tag}.pending.json"
    pending_geojson = acts_dir / f"{activity_id}.{tag}.pending.geojson"

    final_id = activity_id
    final_json = acts_dir / f"{final_id}.json"
    # Check for ID collision with a different activity
    if final_json.exists():
        existing = json.loads(final_json.read_text(encoding="utf-8"))
        if existing.get("source_hash") != source_hash:
            final_id = f"{activity_id}-{source_hash[-6:]}"
            final_json = acts_dir / f"{final_id}.json"
    id_changed = final_id != activity_id

    if pending_json.exists():
        if id_changed:
            # Keep the ID stored inside the document in sync with the filename.
            detail = json.loads(pending_json.read_text(encoding="utf-8"))
            detail["id"] = final_id
            # Written as UTF-8 explicitly: the payload is dumped with
            # ensure_ascii=False and is read back as UTF-8 above.
            pending_json.write_text(
                json.dumps(detail, indent=2, ensure_ascii=False),
                encoding="utf-8",
            )
        # replace() overwrites an existing target on every platform, whereas
        # rename() raises FileExistsError on Windows (same-hash re-extract).
        pending_json.replace(final_json)

    if pending_geojson.exists():
        if id_changed:
            # Update the ID in GeoJSON properties too
            geo = json.loads(pending_geojson.read_text(encoding="utf-8"))
            geo["properties"]["id"] = final_id
            pending_geojson.write_text(
                json.dumps(geo, indent=2, ensure_ascii=False),
                encoding="utf-8",
            )
        pending_geojson.replace(acts_dir / f"{final_id}.geojson")

    return final_id
 def cleanup_pending(output_dir: Path, activity_id: str, source_hash: str) -> None:
    """Delete the pending JSON and GeoJSON files of a candidate that lost arbitration.

    The files are addressed as ``activities/{activity_id}.{tag}.pending.*``
    where ``tag`` is the last 8 characters of ``source_hash`` (``"unknown"``
    for an empty hash). Files that do not exist are silently ignored.
    """
    tag = "unknown" if not source_hash else source_hash[-8:]
    pending_dir = output_dir / "activities"
    doomed = [
        pending_dir / f"{activity_id}.{tag}{suffix}"
        for suffix in (".pending.json", ".pending.geojson")
    ]
    for path in doomed:
        path.unlink(missing_ok=True)
 def build_summary(
    activity: ParsedActivity,
    metrics: ComputedMetrics,
@@ -1,6 +1,6 @@
 <script lang="ts">
  import * as Plot from '@observablehq/plot';
-  import { onMount } from 'svelte';
+  import { onMount, onDestroy } from 'svelte';
  import type { Timeseries, AthleteZones } from '../lib/types';
  export let timeseries: Timeseries;
@@ -82,8 +82,15 @@
  // Range handles — reset whenever the metric or chart type changes
  let trimMin = 0;
  let trimMax = 100;
-  $: if (dataMin !== undefined) resetTrim(dataMin, dataMax);
+  let lastResetTab: Tab | null = null;
-  function resetTrim(lo: number, hi: number) { trimMin = lo; trimMax = hi; }
+  $: {
    // Reset trim on tab change OR when data range changes
    if (activeTab !== lastResetTab || trimMin < dataMin || trimMax > dataMax) {
      trimMin = dataMin;
      trimMax = dataMax;
      lastResetTab = activeTab;
    }
  }
  $: step = (dataMax - dataMin) / 200 || 1;
@@ -116,6 +123,7 @@
  // ── Rendering ────────────────────────────────────────────────────────────
  onMount(() => { renderChart(); });
  onDestroy(() => { chart?.remove(); chart = null; });
  $: if (chartEl) {
    activeTab; xMode; chartType; histData; histThresholds; alignZones;
@@ -27,7 +27,9 @@
  }
  onMount(async () => {
-    activeTab = (new URLSearchParams(window.location.search).get('tab') as Tab) ?? 'power';
+    const TABS: Tab[] = ['power', 'records', 'profile'];
    const rawTab = new URLSearchParams(window.location.search).get('tab');
    activeTab = TABS.includes(rawTab as Tab) ? (rawTab as Tab) : 'power';
    mounted = true;
    try {
      const [athleteRes, indexRes] = await Promise.all([
@@ -55,7 +55,7 @@ def test_parse_sidecar_frontmatter_only(tmp_path):
 # ── apply_sidecar ─────────────────────────────────────────────────────────────
 BASE_DETAIL = {
-    "id": "2024-01-01T08:00:00Z_cycling",
+    "id": "2024-01-01T080000Z-morning-ride",
    "title": "Morning Ride",
    "sport": "cycling",
    "started_at": "2024-01-01T08:00:00Z",
@@ -118,21 +118,21 @@ def data_dir(tmp_path):
    acts = tmp_path / "activities"
    acts.mkdir()
    # Two activities
-    for act_id, title in [
+    for act_id, title, sport, started_at in [
-        ("2024-01-01T08:00:00Z_cycling", "Morning Ride"),
+        ("2024-01-01T080000Z-morning-ride", "Morning Ride", "cycling", "2024-01-01T08:00:00Z"),
-        ("2024-01-02T09:00:00Z_running", "Easy Run"),
+        ("2024-01-02T090000Z-easy-run", "Easy Run", "running", "2024-01-02T09:00:00Z"),
    ]:
        detail = {
-            "id": act_id, "title": title, "sport": act_id.split("_")[1],
+            "id": act_id, "title": title, "sport": sport,
-            "started_at": act_id.split("_")[0],
+            "started_at": started_at,
            "description": "", "privacy": "public", "custom": {},
        }
        (acts / f"{act_id}.json").write_text(json.dumps(detail))
    # Index
    index = {"activities": [
-        {"id": "2024-01-01T08:00:00Z_cycling", "title": "Morning Ride",
+        {"id": "2024-01-01T080000Z-morning-ride", "title": "Morning Ride",
         "sport": "cycling", "started_at": "2024-01-01T08:00:00Z", "privacy": "public", "custom": {}},
-        {"id": "2024-01-02T09:00:00Z_running", "title": "Easy Run",
+        {"id": "2024-01-02T090000Z-easy-run", "title": "Easy Run",
         "sport": "running", "started_at": "2024-01-02T09:00:00Z", "privacy": "public", "custom": {}},
    ]}
    (tmp_path / "index.json").write_text(json.dumps(index))
@@ -145,20 +145,20 @@ def test_merge_all_no_sidecars(data_dir):
    merged = data_dir / "_merged"
    assert merged.exists()
    # Unmodified files are symlinked
-    detail_link = merged / "activities" / "2024-01-01T08:00:00Z_cycling.json"
+    detail_link = merged / "activities" / "2024-01-01T080000Z-morning-ride.json"
    assert detail_link.is_symlink()
 def test_merge_all_applies_sidecar(data_dir):
    edits = data_dir / "edits"
    edits.mkdir()
-    (edits / "2024-01-01T08:00:00Z_cycling.md").write_text(
+    (edits / "2024-01-01T080000Z-morning-ride.md").write_text(
        "---\ntitle: Epic Ride\nhighlight: true\n---\n\nWhat a day!"
    )
    n = merge_all(data_dir)
    assert n == 1
-    merged_json = data_dir / "_merged" / "activities" / "2024-01-01T08:00:00Z_cycling.json"
+    merged_json = data_dir / "_merged" / "activities" / "2024-01-01T080000Z-morning-ride.json"
    assert not merged_json.is_symlink()
    data = json.loads(merged_json.read_text())
    assert data["title"] == "Epic Ride"
@@ -166,41 +166,41 @@ def test_merge_all_applies_sidecar(data_dir):
    assert data["description"] == "What a day!"
    # Untouched activity is still a symlink
-    run_link = data_dir / "_merged" / "activities" / "2024-01-02T09:00:00Z_running.json"
+    run_link = data_dir / "_merged" / "activities" / "2024-01-02T090000Z-easy-run.json"
    assert run_link.is_symlink()
 def test_merge_all_private_filtered_from_index(data_dir):
    edits = data_dir / "edits"
    edits.mkdir()
-    (edits / "2024-01-01T08:00:00Z_cycling.md").write_text("---\nprivate: true\n---\n")
+    (edits / "2024-01-01T080000Z-morning-ride.md").write_text("---\nprivate: true\n---\n")
    merge_all(data_dir)
    index = json.loads((data_dir / "_merged" / "index.json").read_text())
    ids = [a["id"] for a in index["activities"]]
-    assert "2024-01-01T08:00:00Z_cycling" not in ids
+    assert "2024-01-01T080000Z-morning-ride" not in ids
-    assert "2024-01-02T09:00:00Z_running" in ids
+    assert "2024-01-02T090000Z-easy-run" in ids
 def test_merge_all_highlight_sorts_first(data_dir):
    edits = data_dir / "edits"
    edits.mkdir()
    # Highlight the older activity — it should appear first
-    (edits / "2024-01-01T08:00:00Z_cycling.md").write_text("---\nhighlight: true\n---\n")
+    (edits / "2024-01-01T080000Z-morning-ride.md").write_text("---\nhighlight: true\n---\n")
    merge_all(data_dir)
    index = json.loads((data_dir / "_merged" / "index.json").read_text())
    ids = [a["id"] for a in index["activities"]]
-    assert ids[0] == "2024-01-01T08:00:00Z_cycling"
+    assert ids[0] == "2024-01-01T080000Z-morning-ride"
 def test_merge_all_idempotent(data_dir):
    edits = data_dir / "edits"
    edits.mkdir()
-    (edits / "2024-01-01T08:00:00Z_cycling.md").write_text("---\ntitle: Renamed\n---\n")
+    (edits / "2024-01-01T080000Z-morning-ride.md").write_text("---\ntitle: Renamed\n---\n")
    merge_all(data_dir)
    merge_all(data_dir)  # second run should not error or double-apply
    data = json.loads(
-        (data_dir / "_merged" / "activities" / "2024-01-01T08:00:00Z_cycling.json").read_text()
+        (data_dir / "_merged" / "activities" / "2024-01-01T080000Z-morning-ride.json").read_text()
    )
    assert data["title"] == "Renamed"
@@ -11,6 +11,16 @@ def test_running_variants():
        assert normalise_sport(raw) == "running", raw
 def test_skiing_variants():
    """Every listed skiing spelling must normalise to ``"skiing"``."""
    skiing_names = ("skiing", "alpine_skiing", "nordic_skiing", "backcountry_ski")
    for raw in skiing_names:
        normalised = normalise_sport(raw)
        assert normalised == "skiing", raw
 def test_swimming_variants():
    """Every listed swimming spelling must normalise to ``"swimming"``."""
    swimming_names = ("swimming", "swim", "open_water_swimming", "lap_swimming")
    for raw in swimming_names:
        normalised = normalise_sport(raw)
        assert normalised == "swimming", raw
 def test_unknown_falls_back_to_other():
    """An unrecognised sport name and ``None`` both normalise to ``"other"``."""
    for unrecognised in ("yoga", None):
        assert normalise_sport(unrecognised) == "other"
@@ -1,4 +1,5 @@
-from bincio.extract.writer import make_activity_id, _slugify
+from bincio.extract.writer import make_activity_id, build_summary, _slugify
 from bincio.extract.metrics import ComputedMetrics
 from bincio.extract.models import ParsedActivity, DataPoint
 from datetime import datetime, timezone
@@ -31,3 +32,52 @@ def test_slugify():
    assert _slugify("Morning Ride!") == "morning-ride"
    assert _slugify("  Vélo  ") == "velo"   # é → e via NFKD + ASCII
    assert _slugify("") == ""
 def test_id_utc_conversion():
    """A start time carrying a +02:00 offset must be rendered in UTC in the ID."""
    from datetime import timedelta

    # 09:30:12 at UTC+2 is 07:30:12 UTC.
    local_start = datetime(2024, 6, 1, 9, 30, 12, tzinfo=timezone(timedelta(hours=2)))
    activity = ParsedActivity(
        points=[DataPoint(timestamp=local_start)],
        sport="cycling",
        started_at=local_start,
        source_file="test.fit",
        source_hash="sha256:abc",
    )
    expected_id = "2024-06-01T073012Z"
    assert make_activity_id(activity) == expected_id
 def test_build_summary_required_fields():
    """build_summary should include all fields needed by the schema."""
    act = _dummy_activity("Test Ride")
    metrics = ComputedMetrics(
        distance_m=10000.0,
        duration_s=3600,
        moving_time_s=3500,
        elevation_gain_m=100.0,
        elevation_loss_m=95.0,
        avg_speed_kmh=10.0,
        max_speed_kmh=20.0,
        avg_hr_bpm=None,
        max_hr_bpm=None,
        avg_cadence_rpm=None,
        avg_power_w=None,
        max_power_w=None,
        bbox=None,
        start_latlng=None,
        end_latlng=None,
        mmp=None,
        best_efforts=None,
        best_climb_m=None,
    )
    summary = build_summary(act, metrics, "2024-06-01T073012Z-test-ride")
    # Required fields per schema
    assert summary["id"] == "2024-06-01T073012Z-test-ride"
    assert summary["title"] == "Test Ride"
    assert summary["sport"] == "cycling"
    assert "started_at" in summary
    assert "privacy" in summary
    assert "detail_url" in summary
    assert "track_url" in summary