second pass. low

This commit is contained in:
Davide Scaini
2026-04-01 19:00:28 +02:00
parent 3d364c3992
commit bd5831c2fd
11 changed files with 277 additions and 62 deletions
+7 -2
View File
@@ -554,10 +554,15 @@ async def upload_activity(file: UploadFile = File(...)) -> JSONResponse:
if suffix not in _SUPPORTED_SUFFIXES:
raise HTTPException(400, f"Unsupported file type '{Path(name).suffix}'. Expected FIT, GPX, or TCX.")
_MAX_UPLOAD_BYTES = 50 * 1024 * 1024 # 50 MB
contents = await file.read()
if len(contents) > _MAX_UPLOAD_BYTES:
raise HTTPException(413, f"File too large ({len(contents)} bytes). Maximum is 50 MB.")
staging = dd / "_uploads"
staging.mkdir(exist_ok=True)
staged = staging / name
staged.write_bytes(await file.read())
staged.write_bytes(contents)
try:
from bincio.extract.metrics import compute
@@ -592,7 +597,7 @@ async def upload_activity(file: UploadFile = File(...)) -> JSONResponse:
except HTTPException:
raise
except Exception as exc:
raise HTTPException(422, str(exc))
raise HTTPException(422, f"Failed to process activity file: {type(exc).__name__}")
finally:
staged.unlink(missing_ok=True)
+57 -13
View File
@@ -46,6 +46,9 @@ def _process_file(path: Path) -> dict:
"""Runs inside a worker process. Only receives a Path (tiny pickle).
All heavy shared data (_known_hashes, _strava_lookup, etc.) is already
in the worker's memory from the initializer — zero per-task overhead.
Writes to pending files (not final paths) so the main process can
arbitrate collisions and pick the best version.
"""
from bincio.extract.metrics import compute
from bincio.extract.parsers.factory import parse_file
@@ -80,11 +83,17 @@ def _process_file(path: Path) -> dict:
activity, metrics, _output_dir,
privacy=_privacy,
rdp_epsilon=_rdp_epsilon,
pending=True,
)
summary = build_summary(activity, metrics, activity_id, _privacy)
except Exception as exc:
return {"status": "error", "path": str(path), "error": str(exc)}
# Quality signals for the main process to compare competing results
sensor_channels = sum(1 for v in [
metrics.avg_hr_bpm, metrics.avg_power_w, metrics.avg_cadence_rpm,
] if v is not None)
return {
"status": "ok",
"summary": summary,
@@ -94,6 +103,8 @@ def _process_file(path: Path) -> dict:
"distance_m": metrics.distance_m,
"source": summary.get("source"),
"mmp": metrics.mmp,
"point_count": len(activity.points),
"sensor_channels": sensor_channels,
}
@@ -177,6 +188,8 @@ def extract(
summaries: list[dict] = []
errors: list[tuple[str, str]] = []
skipped = 0
# Collect all pending results, grouped by activity_id for collision arbitration
pending_by_id: dict[str, list[dict]] = {}
with Progress(
TextColumn("[progress.description]{task.description}"),
@@ -202,30 +215,61 @@ def extract(
elif result["status"] == "error":
errors.append((result["path"], result["error"]))
else:
# Near-duplicate check — must be sequential (stateful)
pending_by_id.setdefault(result["id"], []).append(result)
# ── Arbitrate collisions and finalize pending files ───────────────────────
from bincio.extract.writer import (
activity_quality, cleanup_pending, finalize_pending, write_athlete_json, write_index,
)
for activity_id, candidates in pending_by_id.items():
# Pick the best candidate by quality score
candidates.sort(key=activity_quality, reverse=True)
winner = candidates[0]
# Clean up losing candidates' pending files
for loser in candidates[1:]:
cleanup_pending(cfg.output_dir, activity_id, loser["hash"])
skipped += 1
# Near-duplicate check against already-known activities
from datetime import datetime
started_at = datetime.fromisoformat(result["started_at"])
near_id = dedup.find_near_duplicate(started_at, result["distance_m"])
started_at = datetime.fromisoformat(winner["started_at"])
near_id = dedup.find_near_duplicate(started_at, winner["distance_m"])
if near_id:
canonical = dedup.pick_canonical(near_id, result.get("source"))
canonical = dedup.pick_canonical(near_id, winner.get("source"))
if canonical != "__new__":
_patch_duplicate_of(cfg.output_dir, result["id"], near_id)
# Existing is better — finalize winner as duplicate, then patch it
final_id = finalize_pending(cfg.output_dir, activity_id, winner["hash"])
_patch_duplicate_of(cfg.output_dir, final_id, near_id)
skipped += 1
continue
_patch_duplicate_of(cfg.output_dir, near_id, result["id"])
dedup._records[near_id].duplicate_of = result["id"]
# New is better — patch the existing one as duplicate
final_id = finalize_pending(cfg.output_dir, activity_id, winner["hash"])
_patch_duplicate_of(cfg.output_dir, near_id, final_id)
dedup._records[near_id].duplicate_of = final_id
else:
final_id = finalize_pending(cfg.output_dir, activity_id, winner["hash"])
# Update summary with the finalized ID (may include hash suffix)
summary = winner["summary"]
if final_id != activity_id:
summary = dict(summary)
summary["id"] = final_id
summary["detail_url"] = f"activities/{final_id}.json"
if summary.get("track_url"):
summary["track_url"] = f"activities/{final_id}.geojson"
dedup.register(ActivityRecord(
id=result["id"],
source_hash=result["hash"],
id=final_id,
source_hash=winner["hash"],
started_at=started_at,
distance_m=result["distance_m"],
source=result.get("source"),
distance_m=winner["distance_m"],
source=winner.get("source"),
))
summaries.append(result["summary"])
summaries.append(summary)
from bincio.extract.writer import write_athlete_json, write_index
existing = _load_existing_summaries(cfg.output_dir)
merged = {s["id"]: s for s in existing}
for s in summaries:
+2
View File
@@ -76,6 +76,8 @@ def _apply_extensions(pt: gpxpy.gpx.GPXTrackPoint, dp: DataPoint) -> None:
dp.temperature_c = float(val)
elif tag == "speed":
dp.speed_kmh = float(val) * 3.6 # m/s → km/h
elif tag in ("pwr", "power", "watts"):
dp.power_w = int(float(val))
def _strip_ns(tag: str) -> str:
+2 -2
View File
@@ -97,8 +97,8 @@ def _parse_ts(s: str) -> datetime:
return datetime.strptime(s, fmt).replace(tzinfo=timezone.utc)
except ValueError:
continue
# Numeric offset like +02:00 or -05:30 — parse with %z then convert to UTC
m = _re.match(r"^(.+)([+-]\d{2}:\d{2})$", s)
# Numeric offset like +02:00, -05:30, or +0200 — parse with %z then convert to UTC
m = _re.match(r"^(.+)([+-]\d{2}:?\d{2})$", s)
if m:
body, off = m.group(1), m.group(2).replace(":", "")
for fmt in ("%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S"):
+6 -2
View File
@@ -67,8 +67,12 @@ def build_geojson(
if p.lon is not None and p.lat is not None
]
# Parallel speed array for gradient coloring
speeds = [round(p.speed_kmh, 2) if p.speed_kmh is not None else None for p in simplified]
# Parallel speed array for gradient coloring — same filter as coordinates
speeds = [
round(p.speed_kmh, 2) if p.speed_kmh is not None else None
for p in simplified
if p.lon is not None and p.lat is not None
]
return {
"type": "Feature",
+93 -3
View File
@@ -33,8 +33,16 @@ def write_activity(
privacy: str = "public",
duplicate_of: str | None = None,
rdp_epsilon: float = 0.0001,
pending: bool = False,
) -> str:
"""Write {id}.json and (if GPS available) {id}.geojson. Returns the ID."""
"""Write {id}.json and (if GPS available) {id}.geojson. Returns the ID.
When pending=True, writes to a uniquely-named pending file
({id}.{hash[:8]}.pending.json) instead of the final path. This avoids
race conditions when multiple workers process activities with the same ID.
The main process is responsible for promoting pending files to final paths
via finalize_pending().
"""
activity_id = make_activity_id(activity)
acts_dir = output_dir / "activities"
acts_dir.mkdir(parents=True, exist_ok=True)
@@ -82,26 +90,108 @@ def write_activity(
"custom": {},
}
if pending:
# Write to a unique pending file — no collision possible
tag = activity.source_hash[-8:] if activity.source_hash else "unknown"
json_path = acts_dir / f"{activity_id}.{tag}.pending.json"
else:
json_path = acts_dir / f"{activity_id}.json"
# Collision guard: if a *different* activity already has this ID, append a
# short hash suffix to disambiguate (same hash = idempotent re-extract).
# Legacy non-pending path: collision guard for callers that don't use
# the pending workflow (e.g. edit server upload_activity)
if json_path.exists():
existing = json.loads(json_path.read_text(encoding="utf-8"))
if existing.get("source_hash") != activity.source_hash:
activity_id = f"{activity_id}-{activity.source_hash[-6:]}"
json_path = acts_dir / f"{activity_id}.json"
detail["id"] = activity_id
json_path.write_text(json.dumps(detail, indent=2, ensure_ascii=False))
# ── GeoJSON track ────────────────────────────────────────────────────────
if has_gps:
geojson = build_geojson(activity.points, activity_id, epsilon=rdp_epsilon)
if pending:
geojson_path = acts_dir / f"{activity_id}.{tag}.pending.geojson"
else:
geojson_path = acts_dir / f"{activity_id}.geojson"
geojson_path.write_text(json.dumps(geojson, indent=2, ensure_ascii=False))
return activity_id
def activity_quality(result: dict) -> int:
    """Score a worker result so competing versions of one activity can be ranked.

    The score is the sum of three weighted signals read from *result*:

    * the rank of ``result["source"]`` in ``_SOURCE_QUALITY`` (unknown or
      missing sources rank 0), weighted by 100;
    * ``result["sensor_channels"]`` (default 0), weighted by 10;
    * ``result["point_count"]`` (default 0), capped at 50000 and divided by
      100 with floor division.

    Args:
        result: The dict returned by a worker for one processed file.

    Returns:
        The quality score; a higher value means a better candidate.
    """
    from bincio.extract.dedup import _SOURCE_QUALITY

    # Source type dominates the ranking.
    source_rank = _SOURCE_QUALITY.get(result.get("source") or "", 0)
    # Each populated sensor channel adds a fixed bonus.
    channels = result.get("sensor_channels", 0)
    # More recorded points is better, but only up to the cap.
    capped_points = min(result.get("point_count", 0), 50000)

    return source_rank * 100 + channels * 10 + capped_points // 100
def finalize_pending(output_dir: Path, activity_id: str, source_hash: str) -> str:
    """Promote a pending activity (JSON and optional GeoJSON) to its final path.

    The pending files are looked up as
    ``activities/{activity_id}.{tag}.pending.json`` / ``.pending.geojson``,
    where ``tag`` is the last 8 characters of *source_hash* (or ``"unknown"``
    when the hash is empty).

    If ``activities/{activity_id}.json`` already exists and records a
    different ``source_hash``, the new activity is disambiguated by appending
    ``-`` plus the last 6 characters of *source_hash* to the ID, and the
    ``id`` stored inside the JSON (and the GeoJSON ``properties``) is updated
    to match. If the existing file has the same hash it is simply overwritten.

    Args:
        output_dir: Root output directory containing ``activities/``.
        activity_id: ID the worker assigned to the activity.
        source_hash: Hash of the source file that produced the pending files.

    Returns:
        The final activity ID, which may carry a hash suffix.

    Raises:
        json.JSONDecodeError: If an existing or pending file is not valid JSON.
    """
    acts_dir = output_dir / "activities"
    tag = source_hash[-8:] if source_hash else "unknown"
    pending_json = acts_dir / f"{activity_id}.{tag}.pending.json"
    pending_geojson = acts_dir / f"{activity_id}.{tag}.pending.geojson"

    final_id = activity_id
    final_json = acts_dir / f"{final_id}.json"

    # Check for an ID collision with a different activity.
    if final_json.exists():
        existing = json.loads(final_json.read_text(encoding="utf-8"))
        if existing.get("source_hash") != source_hash:
            # Same fallback as the pending tag, so an empty hash never yields
            # a dangling "-" suffix (or a TypeError for None).
            suffix = source_hash[-6:] if source_hash else "unknown"
            final_id = f"{activity_id}-{suffix}"
            final_json = acts_dir / f"{final_id}.json"
    renamed = final_id != activity_id

    if pending_json.exists():
        if renamed:
            # Keep the embedded ID in sync with the file name.
            detail = json.loads(pending_json.read_text(encoding="utf-8"))
            detail["id"] = final_id
            # Write with the same encoding used for reading: the payload is
            # dumped with ensure_ascii=False and may contain non-ASCII text.
            pending_json.write_text(
                json.dumps(detail, indent=2, ensure_ascii=False),
                encoding="utf-8",
            )
        # replace() overwrites an existing target on every platform, whereas
        # rename() raises on Windows when the target exists (re-extract case).
        pending_json.replace(final_json)

    if pending_geojson.exists():
        if renamed:
            geo = json.loads(pending_geojson.read_text(encoding="utf-8"))
            geo.setdefault("properties", {})["id"] = final_id
            pending_geojson.write_text(
                json.dumps(geo, indent=2, ensure_ascii=False),
                encoding="utf-8",
            )
        pending_geojson.replace(acts_dir / f"{final_id}.geojson")

    return final_id
def cleanup_pending(output_dir: Path, activity_id: str, source_hash: str) -> None:
    """Delete the pending JSON and GeoJSON files of a candidate that lost arbitration.

    Files that do not exist are ignored, so the call is safe to repeat.

    Args:
        output_dir: Root output directory containing ``activities/``.
        activity_id: ID the worker assigned to the losing candidate.
        source_hash: Hash of its source file; the last 8 characters (or
            ``"unknown"`` when empty) form the tag in the pending file names.
    """
    if source_hash:
        tag = source_hash[-8:]
    else:
        tag = "unknown"

    pending_dir = output_dir / "activities"
    stem = f"{activity_id}.{tag}"
    leftovers = [
        pending_dir / f"{stem}{ending}"
        for ending in (".pending.json", ".pending.geojson")
    ]
    for leftover in leftovers:
        leftover.unlink(missing_ok=True)
def build_summary(
activity: ParsedActivity,
metrics: ComputedMetrics,
+11 -3
View File
@@ -1,6 +1,6 @@
<script lang="ts">
import * as Plot from '@observablehq/plot';
import { onMount } from 'svelte';
import { onMount, onDestroy } from 'svelte';
import type { Timeseries, AthleteZones } from '../lib/types';
export let timeseries: Timeseries;
@@ -82,8 +82,15 @@
// Range handles — reset whenever the metric or chart type changes
let trimMin = 0;
let trimMax = 100;
$: if (dataMin !== undefined) resetTrim(dataMin, dataMax);
function resetTrim(lo: number, hi: number) { trimMin = lo; trimMax = hi; }
let lastResetTab: Tab | null = null;
$: {
// Reset trim on tab change OR when data range changes
if (activeTab !== lastResetTab || trimMin < dataMin || trimMax > dataMax) {
trimMin = dataMin;
trimMax = dataMax;
lastResetTab = activeTab;
}
}
$: step = (dataMax - dataMin) / 200 || 1;
@@ -116,6 +123,7 @@
// ── Rendering ────────────────────────────────────────────────────────────
onMount(() => { renderChart(); });
onDestroy(() => { chart?.remove(); chart = null; });
$: if (chartEl) {
activeTab; xMode; chartType; histData; histThresholds; alignZones;
+3 -1
View File
@@ -27,7 +27,9 @@
}
onMount(async () => {
activeTab = (new URLSearchParams(window.location.search).get('tab') as Tab) ?? 'power';
const TABS: Tab[] = ['power', 'records', 'profile'];
const rawTab = new URLSearchParams(window.location.search).get('tab');
activeTab = TABS.includes(rawTab as Tab) ? (rawTab as Tab) : 'power';
mounted = true;
try {
const [athleteRes, indexRes] = await Promise.all([
+19 -19
View File
@@ -55,7 +55,7 @@ def test_parse_sidecar_frontmatter_only(tmp_path):
# ── apply_sidecar ─────────────────────────────────────────────────────────────
BASE_DETAIL = {
"id": "2024-01-01T08:00:00Z_cycling",
"id": "2024-01-01T080000Z-morning-ride",
"title": "Morning Ride",
"sport": "cycling",
"started_at": "2024-01-01T08:00:00Z",
@@ -118,21 +118,21 @@ def data_dir(tmp_path):
acts = tmp_path / "activities"
acts.mkdir()
# Two activities
for act_id, title in [
("2024-01-01T08:00:00Z_cycling", "Morning Ride"),
("2024-01-02T09:00:00Z_running", "Easy Run"),
for act_id, title, sport, started_at in [
("2024-01-01T080000Z-morning-ride", "Morning Ride", "cycling", "2024-01-01T08:00:00Z"),
("2024-01-02T090000Z-easy-run", "Easy Run", "running", "2024-01-02T09:00:00Z"),
]:
detail = {
"id": act_id, "title": title, "sport": act_id.split("_")[1],
"started_at": act_id.split("_")[0],
"id": act_id, "title": title, "sport": sport,
"started_at": started_at,
"description": "", "privacy": "public", "custom": {},
}
(acts / f"{act_id}.json").write_text(json.dumps(detail))
# Index
index = {"activities": [
{"id": "2024-01-01T08:00:00Z_cycling", "title": "Morning Ride",
{"id": "2024-01-01T080000Z-morning-ride", "title": "Morning Ride",
"sport": "cycling", "started_at": "2024-01-01T08:00:00Z", "privacy": "public", "custom": {}},
{"id": "2024-01-02T09:00:00Z_running", "title": "Easy Run",
{"id": "2024-01-02T090000Z-easy-run", "title": "Easy Run",
"sport": "running", "started_at": "2024-01-02T09:00:00Z", "privacy": "public", "custom": {}},
]}
(tmp_path / "index.json").write_text(json.dumps(index))
@@ -145,20 +145,20 @@ def test_merge_all_no_sidecars(data_dir):
merged = data_dir / "_merged"
assert merged.exists()
# Unmodified files are symlinked
detail_link = merged / "activities" / "2024-01-01T08:00:00Z_cycling.json"
detail_link = merged / "activities" / "2024-01-01T080000Z-morning-ride.json"
assert detail_link.is_symlink()
def test_merge_all_applies_sidecar(data_dir):
edits = data_dir / "edits"
edits.mkdir()
(edits / "2024-01-01T08:00:00Z_cycling.md").write_text(
(edits / "2024-01-01T080000Z-morning-ride.md").write_text(
"---\ntitle: Epic Ride\nhighlight: true\n---\n\nWhat a day!"
)
n = merge_all(data_dir)
assert n == 1
merged_json = data_dir / "_merged" / "activities" / "2024-01-01T08:00:00Z_cycling.json"
merged_json = data_dir / "_merged" / "activities" / "2024-01-01T080000Z-morning-ride.json"
assert not merged_json.is_symlink()
data = json.loads(merged_json.read_text())
assert data["title"] == "Epic Ride"
@@ -166,41 +166,41 @@ def test_merge_all_applies_sidecar(data_dir):
assert data["description"] == "What a day!"
# Untouched activity is still a symlink
run_link = data_dir / "_merged" / "activities" / "2024-01-02T09:00:00Z_running.json"
run_link = data_dir / "_merged" / "activities" / "2024-01-02T090000Z-easy-run.json"
assert run_link.is_symlink()
def test_merge_all_private_filtered_from_index(data_dir):
edits = data_dir / "edits"
edits.mkdir()
(edits / "2024-01-01T08:00:00Z_cycling.md").write_text("---\nprivate: true\n---\n")
(edits / "2024-01-01T080000Z-morning-ride.md").write_text("---\nprivate: true\n---\n")
merge_all(data_dir)
index = json.loads((data_dir / "_merged" / "index.json").read_text())
ids = [a["id"] for a in index["activities"]]
assert "2024-01-01T08:00:00Z_cycling" not in ids
assert "2024-01-02T09:00:00Z_running" in ids
assert "2024-01-01T080000Z-morning-ride" not in ids
assert "2024-01-02T090000Z-easy-run" in ids
def test_merge_all_highlight_sorts_first(data_dir):
edits = data_dir / "edits"
edits.mkdir()
# Highlight the older activity — it should appear first
(edits / "2024-01-01T08:00:00Z_cycling.md").write_text("---\nhighlight: true\n---\n")
(edits / "2024-01-01T080000Z-morning-ride.md").write_text("---\nhighlight: true\n---\n")
merge_all(data_dir)
index = json.loads((data_dir / "_merged" / "index.json").read_text())
ids = [a["id"] for a in index["activities"]]
assert ids[0] == "2024-01-01T08:00:00Z_cycling"
assert ids[0] == "2024-01-01T080000Z-morning-ride"
def test_merge_all_idempotent(data_dir):
edits = data_dir / "edits"
edits.mkdir()
(edits / "2024-01-01T08:00:00Z_cycling.md").write_text("---\ntitle: Renamed\n---\n")
(edits / "2024-01-01T080000Z-morning-ride.md").write_text("---\ntitle: Renamed\n---\n")
merge_all(data_dir)
merge_all(data_dir) # second run should not error or double-apply
data = json.loads(
(data_dir / "_merged" / "activities" / "2024-01-01T08:00:00Z_cycling.json").read_text()
(data_dir / "_merged" / "activities" / "2024-01-01T080000Z-morning-ride.json").read_text()
)
assert data["title"] == "Renamed"
+10
View File
@@ -11,6 +11,16 @@ def test_running_variants():
assert normalise_sport(raw) == "running", raw
def test_skiing_variants():
    """Each skiing alias must normalise to the canonical "skiing" sport."""
    aliases = ("skiing", "alpine_skiing", "nordic_skiing", "backcountry_ski")
    for alias in aliases:
        # The alias is the assertion message so a failure names the culprit.
        assert normalise_sport(alias) == "skiing", alias
def test_swimming_variants():
    """Each swimming alias must normalise to the canonical "swimming" sport."""
    aliases = ("swimming", "swim", "open_water_swimming", "lap_swimming")
    for alias in aliases:
        # The alias is the assertion message so a failure names the culprit.
        assert normalise_sport(alias) == "swimming", alias
def test_unknown_falls_back_to_other():
assert normalise_sport("yoga") == "other"
assert normalise_sport(None) == "other"
+51 -1
View File
@@ -1,4 +1,5 @@
from bincio.extract.writer import make_activity_id, _slugify
from bincio.extract.writer import make_activity_id, build_summary, _slugify
from bincio.extract.metrics import ComputedMetrics
from bincio.extract.models import ParsedActivity, DataPoint
from datetime import datetime, timezone
@@ -31,3 +32,52 @@ def test_slugify():
assert _slugify("Morning Ride!") == "morning-ride"
assert _slugify(" Vélo ") == "velo" # é → e via NFKD + ASCII
assert _slugify("") == ""
def test_id_utc_conversion():
    """An aware, non-UTC start time must appear as UTC in the activity ID."""
    from datetime import timedelta

    # 09:30:12 at UTC+02:00 is 07:30:12 UTC.
    local_start = datetime(
        2024, 6, 1, 9, 30, 12,
        tzinfo=timezone(timedelta(hours=2)),
    )
    activity = ParsedActivity(
        points=[DataPoint(timestamp=local_start)],
        sport="cycling",
        started_at=local_start,
        source_file="test.fit",
        source_hash="sha256:abc",
    )

    expected_id = "2024-06-01T073012Z"
    assert make_activity_id(activity) == expected_id
def test_build_summary_required_fields():
"""build_summary should include all fields needed by the schema."""
act = _dummy_activity("Test Ride")
metrics = ComputedMetrics(
distance_m=10000.0,
duration_s=3600,
moving_time_s=3500,
elevation_gain_m=100.0,
elevation_loss_m=95.0,
avg_speed_kmh=10.0,
max_speed_kmh=20.0,
avg_hr_bpm=None,
max_hr_bpm=None,
avg_cadence_rpm=None,
avg_power_w=None,
max_power_w=None,
bbox=None,
start_latlng=None,
end_latlng=None,
mmp=None,
best_efforts=None,
best_climb_m=None,
)
summary = build_summary(act, metrics, "2024-06-01T073012Z-test-ride")
# Required fields per schema
assert summary["id"] == "2024-06-01T073012Z-test-ride"
assert summary["title"] == "Test Ride"
assert summary["sport"] == "cycling"
assert "started_at" in summary
assert "privacy" in summary
assert "detail_url" in summary
assert "track_url" in summary