second pass. low

2026-04-01 19:00:28 +02:00
parent 3d364c3992
commit bd5831c2fd
11 changed files with 277 additions and 62 deletions
@@ -554,10 +554,15 @@ async def upload_activity(file: UploadFile = File(...)) -> JSONResponse:
    if suffix not in _SUPPORTED_SUFFIXES:
        raise HTTPException(400, f"Unsupported file type '{Path(name).suffix}'. Expected FIT, GPX, or TCX.")

+    _MAX_UPLOAD_BYTES = 50 * 1024 * 1024  # 50 MB
+    contents = await file.read()
+    if len(contents) > _MAX_UPLOAD_BYTES:
+        raise HTTPException(413, f"File too large ({len(contents)} bytes). Maximum is 50 MB.")
+
    staging = dd / "_uploads"
    staging.mkdir(exist_ok=True)
    staged = staging / name
-    staged.write_bytes(await file.read())
+    staged.write_bytes(contents)

    try:
        from bincio.extract.metrics import compute
@@ -592,7 +597,7 @@ async def upload_activity(file: UploadFile = File(...)) -> JSONResponse:
    except HTTPException:
        raise
    except Exception as exc:
-        raise HTTPException(422, str(exc))
+        raise HTTPException(422, f"Failed to process activity file: {type(exc).__name__}")
    finally:
        staged.unlink(missing_ok=True)

@@ -46,6 +46,9 @@ def _process_file(path: Path) -> dict:
    """Runs inside a worker process. Only receives a Path (tiny pickle).
    All heavy shared data (_known_hashes, _strava_lookup, etc.) is already
    in the worker's memory from the initializer — zero per-task overhead.
+
+    Writes to pending files (not final paths) so the main process can
+    arbitrate collisions and pick the best version.
    """
    from bincio.extract.metrics import compute
    from bincio.extract.parsers.factory import parse_file
@@ -80,11 +83,17 @@ def _process_file(path: Path) -> dict:
            activity, metrics, _output_dir,
            privacy=_privacy,
            rdp_epsilon=_rdp_epsilon,
+            pending=True,
        )
        summary = build_summary(activity, metrics, activity_id, _privacy)
    except Exception as exc:
        return {"status": "error", "path": str(path), "error": str(exc)}

+    # Quality signals for the main process to compare competing results
+    sensor_channels = sum(1 for v in [
+        metrics.avg_hr_bpm, metrics.avg_power_w, metrics.avg_cadence_rpm,
+    ] if v is not None)
+
    return {
        "status": "ok",
        "summary": summary,
@@ -94,6 +103,8 @@ def _process_file(path: Path) -> dict:
        "distance_m": metrics.distance_m,
        "source": summary.get("source"),
        "mmp": metrics.mmp,
+        "point_count": len(activity.points),
+        "sensor_channels": sensor_channels,
    }


@@ -177,6 +188,8 @@ def extract(
    summaries: list[dict] = []
    errors: list[tuple[str, str]] = []
    skipped = 0
+    # Collect all pending results, grouped by activity_id for collision arbitration
+    pending_by_id: dict[str, list[dict]] = {}

    with Progress(
        TextColumn("[progress.description]{task.description}"),
@@ -202,30 +215,61 @@ def extract(
                elif result["status"] == "error":
                    errors.append((result["path"], result["error"]))
                else:
-                    # Near-duplicate check — must be sequential (stateful)
+                    pending_by_id.setdefault(result["id"], []).append(result)
+
+    # ── Arbitrate collisions and finalize pending files ───────────────────────
+    from bincio.extract.writer import (
+        activity_quality, cleanup_pending, finalize_pending, write_athlete_json, write_index,
+    )
+
+    for activity_id, candidates in pending_by_id.items():
+        # Pick the best candidate by quality score
+        candidates.sort(key=activity_quality, reverse=True)
+        winner = candidates[0]
+
+        # Clean up losing candidates' pending files
+        for loser in candidates[1:]:
+            cleanup_pending(cfg.output_dir, activity_id, loser["hash"])
+            skipped += 1
+
+        # Near-duplicate check against already-known activities
        from datetime import datetime
-                    started_at = datetime.fromisoformat(result["started_at"])
-                    near_id = dedup.find_near_duplicate(started_at, result["distance_m"])
+        started_at = datetime.fromisoformat(winner["started_at"])
+        near_id = dedup.find_near_duplicate(started_at, winner["distance_m"])

        if near_id:
-                        canonical = dedup.pick_canonical(near_id, result.get("source"))
+            canonical = dedup.pick_canonical(near_id, winner.get("source"))
            if canonical != "__new__":
-                            _patch_duplicate_of(cfg.output_dir, result["id"], near_id)
+                # Existing is better — finalize winner as duplicate, then patch it
+                final_id = finalize_pending(cfg.output_dir, activity_id, winner["hash"])
+                _patch_duplicate_of(cfg.output_dir, final_id, near_id)
                skipped += 1
                continue
-                        _patch_duplicate_of(cfg.output_dir, near_id, result["id"])
-                        dedup._records[near_id].duplicate_of = result["id"]
+            # New is better — patch the existing one as duplicate
+            final_id = finalize_pending(cfg.output_dir, activity_id, winner["hash"])
+            _patch_duplicate_of(cfg.output_dir, near_id, final_id)
+            dedup._records[near_id].duplicate_of = final_id
+        else:
+            final_id = finalize_pending(cfg.output_dir, activity_id, winner["hash"])
+
+        # Update summary with the finalized ID (may include hash suffix)
+        summary = winner["summary"]
+        if final_id != activity_id:
+            summary = dict(summary)
+            summary["id"] = final_id
+            summary["detail_url"] = f"activities/{final_id}.json"
+            if summary.get("track_url"):
+                summary["track_url"] = f"activities/{final_id}.geojson"

        dedup.register(ActivityRecord(
-                        id=result["id"],
-                        source_hash=result["hash"],
+            id=final_id,
+            source_hash=winner["hash"],
            started_at=started_at,
-                        distance_m=result["distance_m"],
-                        source=result.get("source"),
+            distance_m=winner["distance_m"],
+            source=winner.get("source"),
        ))
-                    summaries.append(result["summary"])
+        summaries.append(summary)

-    from bincio.extract.writer import write_athlete_json, write_index
    existing = _load_existing_summaries(cfg.output_dir)
    merged = {s["id"]: s for s in existing}
    for s in summaries:
@@ -76,6 +76,8 @@ def _apply_extensions(pt: gpxpy.gpx.GPXTrackPoint, dp: DataPoint) -> None:
                    dp.temperature_c = float(val)
                elif tag == "speed":
                    dp.speed_kmh = float(val) * 3.6  # m/s → km/h
+                elif tag in ("pwr", "power", "watts"):
+                    dp.power_w = int(float(val))


 def _strip_ns(tag: str) -> str:
@@ -97,8 +97,8 @@ def _parse_ts(s: str) -> datetime:
                return datetime.strptime(s, fmt).replace(tzinfo=timezone.utc)
            except ValueError:
                continue
-    # Numeric offset like +02:00 or -05:30 — parse with %z then convert to UTC
-    m = _re.match(r"^(.+)([+-]\d{2}:\d{2})$", s)
+    # Numeric offset like +02:00, -05:30, or +0200 — parse with %z then convert to UTC
+    m = _re.match(r"^(.+)([+-]\d{2}:?\d{2})$", s)
    if m:
        body, off = m.group(1), m.group(2).replace(":", "")
        for fmt in ("%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S"):
@@ -67,8 +67,12 @@ def build_geojson(
        if p.lon is not None and p.lat is not None
    ]

-    # Parallel speed array for gradient coloring
-    speeds = [round(p.speed_kmh, 2) if p.speed_kmh is not None else None for p in simplified]
+    # Parallel speed array for gradient coloring — same filter as coordinates
+    speeds = [
+        round(p.speed_kmh, 2) if p.speed_kmh is not None else None
+        for p in simplified
+        if p.lon is not None and p.lat is not None
+    ]

    return {
        "type": "Feature",
@@ -33,8 +33,16 @@ def write_activity(
    privacy: str = "public",
    duplicate_of: str | None = None,
    rdp_epsilon: float = 0.0001,
+    pending: bool = False,
 ) -> str:
-    """Write {id}.json and (if GPS available) {id}.geojson. Returns the ID."""
+    """Write {id}.json and (if GPS available) {id}.geojson. Returns the ID.
+
+    When pending=True, writes to a uniquely-named pending file
+    ({id}.{hash[-8:]}.pending.json) instead of the final path. This avoids
+    race conditions when multiple workers process activities with the same ID.
+    The main process is responsible for promoting pending files to final paths
+    via finalize_pending().
+    """
    activity_id = make_activity_id(activity)
    acts_dir = output_dir / "activities"
    acts_dir.mkdir(parents=True, exist_ok=True)
@@ -82,26 +90,108 @@ def write_activity(
        "custom": {},
    }

+    if pending:
+        # Write to a unique pending file — no collision possible
+        tag = activity.source_hash[-8:] if activity.source_hash else "unknown"
+        json_path = acts_dir / f"{activity_id}.{tag}.pending.json"
+    else:
        json_path = acts_dir / f"{activity_id}.json"
-    # Collision guard: if a *different* activity already has this ID, append a
-    # short hash suffix to disambiguate (same hash = idempotent re-extract).
+        # Legacy non-pending path: collision guard for callers that don't use
+        # the pending workflow (e.g. edit server upload_activity)
        if json_path.exists():
            existing = json.loads(json_path.read_text(encoding="utf-8"))
            if existing.get("source_hash") != activity.source_hash:
                activity_id = f"{activity_id}-{activity.source_hash[-6:]}"
                json_path = acts_dir / f"{activity_id}.json"
                detail["id"] = activity_id
+
    json_path.write_text(json.dumps(detail, indent=2, ensure_ascii=False))

    # ── GeoJSON track ────────────────────────────────────────────────────────
    if has_gps:
        geojson = build_geojson(activity.points, activity_id, epsilon=rdp_epsilon)
+        if pending:
+            geojson_path = acts_dir / f"{activity_id}.{tag}.pending.geojson"
+        else:
            geojson_path = acts_dir / f"{activity_id}.geojson"
        geojson_path.write_text(json.dumps(geojson, indent=2, ensure_ascii=False))

    return activity_id


+def activity_quality(result: dict) -> int:
+    """Score a worker result so competing versions of one activity can be ranked.
+
+    The score is the sum of three weighted components:
+
+    * the rank of ``result["source"]`` in ``_SOURCE_QUALITY``, times 100
+      (a missing, empty or unlisted source ranks 0);
+    * ``result["sensor_channels"]``, times 10 (0 when absent);
+    * ``result["point_count"]`` capped at 50000, floor-divided by 100
+      (0 when absent).
+
+    Higher is better. Used by the main process to pick the best version
+    when multiple workers produce results for the same activity ID.
+    """
+    from bincio.extract.dedup import _SOURCE_QUALITY
+
+    source_rank = _SOURCE_QUALITY.get(result.get("source") or "", 0)
+    channel_count = result.get("sensor_channels", 0)
+    # Cap the point count so it cannot grow without bound relative to the other terms
+    capped_points = min(result.get("point_count", 0), 50000)
+    return source_rank * 100 + channel_count * 10 + capped_points // 100
+
+
+def finalize_pending(output_dir: Path, activity_id: str, source_hash: str) -> str:
+    """Promote a pending file to its final path.
+
+    Looks under ``output_dir / "activities"`` for
+    ``{activity_id}.{tag}.pending.json`` and ``{activity_id}.{tag}.pending.geojson``,
+    where ``tag`` is the last 8 characters of ``source_hash`` (``"unknown"``
+    when the hash is empty), and moves whichever of them exist to
+    ``{final_id}.json`` / ``{final_id}.geojson``. A missing pending file is
+    skipped silently.
+
+    If ``{activity_id}.json`` already exists and records a different
+    ``source_hash``, the final ID is disambiguated with a hash suffix and the
+    ``id`` stored inside the pending JSON / GeoJSON is rewritten to match
+    before the move. If the existing file has the same hash it is overwritten.
+
+    Returns the final activity_id (may include suffix).
+    """
+    acts_dir = output_dir / "activities"
+    tag = source_hash[-8:] if source_hash else "unknown"
+
+    pending_json = acts_dir / f"{activity_id}.{tag}.pending.json"
+    pending_geojson = acts_dir / f"{activity_id}.{tag}.pending.geojson"
+
+    final_id = activity_id
+    final_json = acts_dir / f"{final_id}.json"
+
+    # Check for ID collision with a different activity
+    if final_json.exists():
+        existing = json.loads(final_json.read_text(encoding="utf-8"))
+        if existing.get("source_hash") != source_hash:
+            # Fall back to the tag for an empty hash so the suffix is never
+            # blank and slicing a missing hash cannot raise.
+            suffix = source_hash[-6:] if source_hash else tag
+            final_id = f"{activity_id}-{suffix}"
+            final_json = acts_dir / f"{final_id}.json"
+
+    id_changed = final_id != activity_id
+
+    if pending_json.exists():
+        # Update the ID inside the JSON if it changed
+        if id_changed:
+            detail = json.loads(pending_json.read_text(encoding="utf-8"))
+            detail["id"] = final_id
+            # Written as UTF-8 explicitly: the payload is dumped with
+            # ensure_ascii=False and is read back as UTF-8.
+            pending_json.write_text(
+                json.dumps(detail, indent=2, ensure_ascii=False), encoding="utf-8"
+            )
+        # replace() overwrites an existing target on every platform, whereas
+        # rename() raises FileExistsError on Windows when the target exists.
+        pending_json.replace(final_json)
+
+    final_geojson = acts_dir / f"{final_id}.geojson"
+    if pending_geojson.exists():
+        # Update the ID in GeoJSON properties too
+        if id_changed:
+            geo = json.loads(pending_geojson.read_text(encoding="utf-8"))
+            geo["properties"]["id"] = final_id
+            pending_geojson.write_text(
+                json.dumps(geo, indent=2, ensure_ascii=False), encoding="utf-8"
+            )
+        pending_geojson.replace(final_geojson)
+
+    return final_id
+
+
+def cleanup_pending(output_dir: Path, activity_id: str, source_hash: str) -> None:
+    """Delete the pending JSON and GeoJSON of a candidate that was not chosen.
+
+    The files are located under ``output_dir / "activities"`` and named
+    ``{activity_id}.{tag}.pending.json`` / ``.pending.geojson``, where ``tag``
+    is the last 8 characters of ``source_hash`` (``"unknown"`` when the hash
+    is empty). Files that do not exist are ignored.
+    """
+    if source_hash:
+        tag = source_hash[-8:]
+    else:
+        tag = "unknown"
+
+    activities = output_dir / "activities"
+    pending_stem = f"{activity_id}.{tag}.pending"
+    (activities / f"{pending_stem}.json").unlink(missing_ok=True)
+    (activities / f"{pending_stem}.geojson").unlink(missing_ok=True)
+
+
 def build_summary(
    activity: ParsedActivity,
    metrics: ComputedMetrics,
@@ -1,6 +1,6 @@
 <script lang="ts">
  import * as Plot from '@observablehq/plot';
-  import { onMount } from 'svelte';
+  import { onMount, onDestroy } from 'svelte';
  import type { Timeseries, AthleteZones } from '../lib/types';

  export let timeseries: Timeseries;
@@ -82,8 +82,15 @@
  // Range handles — reset whenever the metric or chart type changes
  let trimMin = 0;
  let trimMax = 100;
-  $: if (dataMin !== undefined) resetTrim(dataMin, dataMax);
-  function resetTrim(lo: number, hi: number) { trimMin = lo; trimMax = hi; }
+  let lastResetTab: Tab | null = null;
+  $: {
+    // Reset trim on tab change OR when data range changes
+    if (activeTab !== lastResetTab || trimMin < dataMin || trimMax > dataMax) {
+      trimMin = dataMin;
+      trimMax = dataMax;
+      lastResetTab = activeTab;
+    }
+  }

  $: step = (dataMax - dataMin) / 200 || 1;

@@ -116,6 +123,7 @@

  // ── Rendering ────────────────────────────────────────────────────────────
  onMount(() => { renderChart(); });
+  onDestroy(() => { chart?.remove(); chart = null; });

  $: if (chartEl) {
    activeTab; xMode; chartType; histData; histThresholds; alignZones;
@@ -27,7 +27,9 @@
  }

  onMount(async () => {
-    activeTab = (new URLSearchParams(window.location.search).get('tab') as Tab) ?? 'power';
+    const TABS: Tab[] = ['power', 'records', 'profile'];
+    const rawTab = new URLSearchParams(window.location.search).get('tab');
+    activeTab = TABS.includes(rawTab as Tab) ? (rawTab as Tab) : 'power';
    mounted = true;
    try {
      const [athleteRes, indexRes] = await Promise.all([
@@ -55,7 +55,7 @@ def test_parse_sidecar_frontmatter_only(tmp_path):
 # ── apply_sidecar ─────────────────────────────────────────────────────────────

 BASE_DETAIL = {
-    "id": "2024-01-01T08:00:00Z_cycling",
+    "id": "2024-01-01T080000Z-morning-ride",
    "title": "Morning Ride",
    "sport": "cycling",
    "started_at": "2024-01-01T08:00:00Z",
@@ -118,21 +118,21 @@ def data_dir(tmp_path):
    acts = tmp_path / "activities"
    acts.mkdir()
    # Two activities
-    for act_id, title in [
-        ("2024-01-01T08:00:00Z_cycling", "Morning Ride"),
-        ("2024-01-02T09:00:00Z_running", "Easy Run"),
+    for act_id, title, sport, started_at in [
+        ("2024-01-01T080000Z-morning-ride", "Morning Ride", "cycling", "2024-01-01T08:00:00Z"),
+        ("2024-01-02T090000Z-easy-run", "Easy Run", "running", "2024-01-02T09:00:00Z"),
    ]:
        detail = {
-            "id": act_id, "title": title, "sport": act_id.split("_")[1],
-            "started_at": act_id.split("_")[0],
+            "id": act_id, "title": title, "sport": sport,
+            "started_at": started_at,
            "description": "", "privacy": "public", "custom": {},
        }
        (acts / f"{act_id}.json").write_text(json.dumps(detail))
    # Index
    index = {"activities": [
-        {"id": "2024-01-01T08:00:00Z_cycling", "title": "Morning Ride",
+        {"id": "2024-01-01T080000Z-morning-ride", "title": "Morning Ride",
         "sport": "cycling", "started_at": "2024-01-01T08:00:00Z", "privacy": "public", "custom": {}},
-        {"id": "2024-01-02T09:00:00Z_running", "title": "Easy Run",
+        {"id": "2024-01-02T090000Z-easy-run", "title": "Easy Run",
         "sport": "running", "started_at": "2024-01-02T09:00:00Z", "privacy": "public", "custom": {}},
    ]}
    (tmp_path / "index.json").write_text(json.dumps(index))
@@ -145,20 +145,20 @@ def test_merge_all_no_sidecars(data_dir):
    merged = data_dir / "_merged"
    assert merged.exists()
    # Unmodified files are symlinked
-    detail_link = merged / "activities" / "2024-01-01T08:00:00Z_cycling.json"
+    detail_link = merged / "activities" / "2024-01-01T080000Z-morning-ride.json"
    assert detail_link.is_symlink()


 def test_merge_all_applies_sidecar(data_dir):
    edits = data_dir / "edits"
    edits.mkdir()
-    (edits / "2024-01-01T08:00:00Z_cycling.md").write_text(
+    (edits / "2024-01-01T080000Z-morning-ride.md").write_text(
        "---\ntitle: Epic Ride\nhighlight: true\n---\n\nWhat a day!"
    )
    n = merge_all(data_dir)
    assert n == 1

-    merged_json = data_dir / "_merged" / "activities" / "2024-01-01T08:00:00Z_cycling.json"
+    merged_json = data_dir / "_merged" / "activities" / "2024-01-01T080000Z-morning-ride.json"
    assert not merged_json.is_symlink()
    data = json.loads(merged_json.read_text())
    assert data["title"] == "Epic Ride"
@@ -166,41 +166,41 @@ def test_merge_all_applies_sidecar(data_dir):
    assert data["description"] == "What a day!"

    # Untouched activity is still a symlink
-    run_link = data_dir / "_merged" / "activities" / "2024-01-02T09:00:00Z_running.json"
+    run_link = data_dir / "_merged" / "activities" / "2024-01-02T090000Z-easy-run.json"
    assert run_link.is_symlink()


 def test_merge_all_private_filtered_from_index(data_dir):
    edits = data_dir / "edits"
    edits.mkdir()
-    (edits / "2024-01-01T08:00:00Z_cycling.md").write_text("---\nprivate: true\n---\n")
+    (edits / "2024-01-01T080000Z-morning-ride.md").write_text("---\nprivate: true\n---\n")
    merge_all(data_dir)

    index = json.loads((data_dir / "_merged" / "index.json").read_text())
    ids = [a["id"] for a in index["activities"]]
-    assert "2024-01-01T08:00:00Z_cycling" not in ids
-    assert "2024-01-02T09:00:00Z_running" in ids
+    assert "2024-01-01T080000Z-morning-ride" not in ids
+    assert "2024-01-02T090000Z-easy-run" in ids


 def test_merge_all_highlight_sorts_first(data_dir):
    edits = data_dir / "edits"
    edits.mkdir()
    # Highlight the older activity — it should appear first
-    (edits / "2024-01-01T08:00:00Z_cycling.md").write_text("---\nhighlight: true\n---\n")
+    (edits / "2024-01-01T080000Z-morning-ride.md").write_text("---\nhighlight: true\n---\n")
    merge_all(data_dir)

    index = json.loads((data_dir / "_merged" / "index.json").read_text())
    ids = [a["id"] for a in index["activities"]]
-    assert ids[0] == "2024-01-01T08:00:00Z_cycling"
+    assert ids[0] == "2024-01-01T080000Z-morning-ride"


 def test_merge_all_idempotent(data_dir):
    edits = data_dir / "edits"
    edits.mkdir()
-    (edits / "2024-01-01T08:00:00Z_cycling.md").write_text("---\ntitle: Renamed\n---\n")
+    (edits / "2024-01-01T080000Z-morning-ride.md").write_text("---\ntitle: Renamed\n---\n")
    merge_all(data_dir)
    merge_all(data_dir)  # second run should not error or double-apply
    data = json.loads(
-        (data_dir / "_merged" / "activities" / "2024-01-01T08:00:00Z_cycling.json").read_text()
+        (data_dir / "_merged" / "activities" / "2024-01-01T080000Z-morning-ride.json").read_text()
    )
    assert data["title"] == "Renamed"
@@ -11,6 +11,16 @@ def test_running_variants():
        assert normalise_sport(raw) == "running", raw


+def test_skiing_variants():
+    """Each listed skiing alias must normalise to "skiing"."""
+    aliases = ("skiing", "alpine_skiing", "nordic_skiing", "backcountry_ski")
+    for alias in aliases:
+        # The alias is the assertion message, so a failure names the offending input
+        assert normalise_sport(alias) == "skiing", alias
+
+
+def test_swimming_variants():
+    """Each listed swimming alias must normalise to "swimming"."""
+    aliases = ("swimming", "swim", "open_water_swimming", "lap_swimming")
+    for alias in aliases:
+        # The alias is the assertion message, so a failure names the offending input
+        assert normalise_sport(alias) == "swimming", alias
+
+
 def test_unknown_falls_back_to_other():
    assert normalise_sport("yoga") == "other"
    assert normalise_sport(None) == "other"
@@ -1,4 +1,5 @@
-from bincio.extract.writer import make_activity_id, _slugify
+from bincio.extract.writer import make_activity_id, build_summary, _slugify
+from bincio.extract.metrics import ComputedMetrics
 from bincio.extract.models import ParsedActivity, DataPoint
 from datetime import datetime, timezone

@@ -31,3 +32,52 @@ def test_slugify():
    assert _slugify("Morning Ride!") == "morning-ride"
    assert _slugify("  Vélo  ") == "velo"   # é → e via NFKD + ASCII
    assert _slugify("") == ""
+
+
+def test_id_utc_conversion():
+    """An aware, non-UTC start time must appear in the ID as its UTC equivalent."""
+    from datetime import timedelta
+    # 09:30:12 at UTC+02:00 is 07:30:12 UTC
+    local_start = datetime(2024, 6, 1, 9, 30, 12, tzinfo=timezone(timedelta(hours=2)))
+    activity = ParsedActivity(
+        points=[DataPoint(timestamp=local_start)],
+        sport="cycling",
+        started_at=local_start,
+        source_file="test.fit",
+        source_hash="sha256:abc",
+    )
+    expected_id = "2024-06-01T073012Z"
+    assert make_activity_id(activity) == expected_id
+
+
+def test_build_summary_required_fields():
+    """build_summary should include all fields needed by the schema."""
+    act = _dummy_activity("Test Ride")
+    metrics = ComputedMetrics(
+        distance_m=10000.0,
+        duration_s=3600,
+        moving_time_s=3500,
+        elevation_gain_m=100.0,
+        elevation_loss_m=95.0,
+        avg_speed_kmh=10.0,
+        max_speed_kmh=20.0,
+        avg_hr_bpm=None,
+        max_hr_bpm=None,
+        avg_cadence_rpm=None,
+        avg_power_w=None,
+        max_power_w=None,
+        bbox=None,
+        start_latlng=None,
+        end_latlng=None,
+        mmp=None,
+        best_efforts=None,
+        best_climb_m=None,
+    )
+    summary = build_summary(act, metrics, "2024-06-01T073012Z-test-ride")
+    # Required fields per schema
+    assert summary["id"] == "2024-06-01T073012Z-test-ride"
+    assert summary["title"] == "Test Ride"
+    assert summary["sport"] == "cycling"
+    assert "started_at" in summary
+    assert "privacy" in summary
+    assert "detail_url" in summary
+    assert "track_url" in summary