second pass. low

This commit is contained in:
Davide Scaini
2026-04-01 19:00:28 +02:00
parent 3d364c3992
commit bd5831c2fd
11 changed files with 277 additions and 62 deletions
+101 -11
View File
@@ -33,8 +33,16 @@ def write_activity(
privacy: str = "public",
duplicate_of: str | None = None,
rdp_epsilon: float = 0.0001,
pending: bool = False,
) -> str:
"""Write {id}.json and (if GPS available) {id}.geojson. Returns the ID."""
"""Write {id}.json and (if GPS available) {id}.geojson. Returns the ID.
When pending=True, writes to a uniquely-named pending file
({id}.{hash[:8]}.pending.json) instead of the final path. This avoids
race conditions when multiple workers process activities with the same ID.
The main process is responsible for promoting pending files to final paths
via finalize_pending().
"""
activity_id = make_activity_id(activity)
acts_dir = output_dir / "activities"
acts_dir.mkdir(parents=True, exist_ok=True)
@@ -82,26 +90,108 @@ def write_activity(
"custom": {},
}
json_path = acts_dir / f"{activity_id}.json"
# Collision guard: if a *different* activity already has this ID, append a
# short hash suffix to disambiguate (same hash = idempotent re-extract).
if json_path.exists():
existing = json.loads(json_path.read_text(encoding="utf-8"))
if existing.get("source_hash") != activity.source_hash:
activity_id = f"{activity_id}-{activity.source_hash[-6:]}"
json_path = acts_dir / f"{activity_id}.json"
detail["id"] = activity_id
if pending:
# Write to a unique pending file — no collision possible
tag = activity.source_hash[-8:] if activity.source_hash else "unknown"
json_path = acts_dir / f"{activity_id}.{tag}.pending.json"
else:
json_path = acts_dir / f"{activity_id}.json"
# Legacy non-pending path: collision guard for callers that don't use
# the pending workflow (e.g. edit server upload_activity)
if json_path.exists():
existing = json.loads(json_path.read_text(encoding="utf-8"))
if existing.get("source_hash") != activity.source_hash:
activity_id = f"{activity_id}-{activity.source_hash[-6:]}"
json_path = acts_dir / f"{activity_id}.json"
detail["id"] = activity_id
json_path.write_text(json.dumps(detail, indent=2, ensure_ascii=False))
# ── GeoJSON track ────────────────────────────────────────────────────────
if has_gps:
geojson = build_geojson(activity.points, activity_id, epsilon=rdp_epsilon)
geojson_path = acts_dir / f"{activity_id}.geojson"
if pending:
geojson_path = acts_dir / f"{activity_id}.{tag}.pending.geojson"
else:
geojson_path = acts_dir / f"{activity_id}.geojson"
geojson_path.write_text(json.dumps(geojson, indent=2, ensure_ascii=False))
return activity_id
def activity_quality(result: dict) -> int:
    """Score a worker's activity result so competing versions can be ranked.

    The score is a weighted sum of three fields read from ``result``:

    * ``source``: looked up in ``_SOURCE_QUALITY`` (unknown or missing sources
      count as 0) and multiplied by 100. The original note on this table
      reads "FIT > GPX > TCX".
    * ``sensor_channels``: multiplied by 10 (0 when the key is absent).
    * ``point_count``: capped at 50000, then integer-divided by 100
      (0 when the key is absent).

    Args:
        result: Result dictionary produced by a worker.

    Returns:
        The quality score; a higher value means a better version.
    """
    # Imported at call time, exactly as the original does.
    from bincio.extract.dedup import _SOURCE_QUALITY

    source_rank = _SOURCE_QUALITY.get(result.get("source") or "", 0)
    channel_count = result.get("sensor_channels", 0)
    point_total = result.get("point_count", 0)

    # Source type dominates, then sensor richness, then (capped) track density.
    return source_rank * 100 + channel_count * 10 + min(point_total, 50000) // 100
def finalize_pending(output_dir: Path, activity_id: str, source_hash: str) -> str:
    """Promote an activity's pending files to their final paths.

    Looks under ``output_dir / "activities"`` for
    ``{activity_id}.{tag}.pending.json`` and
    ``{activity_id}.{tag}.pending.geojson``, where ``tag`` is the last eight
    characters of ``source_hash`` (or ``"unknown"`` when it is empty), and
    moves whichever of them exist to ``{final_id}.json`` /
    ``{final_id}.geojson``.

    If ``{activity_id}.json`` already exists and records a *different*
    ``source_hash``, the final ID becomes
    ``{activity_id}-{source_hash[-6:]}`` and the ``id`` stored inside the
    pending files is rewritten to match before they are moved. An existing
    final file with the *same* ``source_hash`` is overwritten.

    Args:
        output_dir: Directory that contains the ``activities`` sub-directory.
        activity_id: ID under which the pending files were written.
        source_hash: Source hash of the activity being promoted.

    Returns:
        The final activity ID (carries a hash suffix after a collision).
    """
    acts_dir = output_dir / "activities"
    tag = source_hash[-8:] if source_hash else "unknown"
    pending_json = acts_dir / f"{activity_id}.{tag}.pending.json"
    pending_geojson = acts_dir / f"{activity_id}.{tag}.pending.geojson"

    final_id = activity_id
    final_json = acts_dir / f"{final_id}.json"

    # Check for an ID collision with a different activity.
    if final_json.exists():
        existing = json.loads(final_json.read_text(encoding="utf-8"))
        if existing.get("source_hash") != source_hash:
            final_id = f"{activity_id}-{source_hash[-6:]}"
            final_json = acts_dir / f"{final_id}.json"
    id_changed = final_id != activity_id

    if pending_json.exists():
        if id_changed:
            # Keep the embedded ID in sync with the new file name.
            detail = json.loads(pending_json.read_text(encoding="utf-8"))
            detail["id"] = final_id
            # Written explicitly as UTF-8: the payload is dumped with
            # ensure_ascii=False and is always read back as UTF-8, so the
            # locale's default encoding must not be used here.
            pending_json.write_text(
                json.dumps(detail, indent=2, ensure_ascii=False),
                encoding="utf-8",
            )
        # replace() overwrites an existing target on every platform, whereas
        # rename() raises FileExistsError on Windows when the target exists
        # (e.g. a re-extract of the same activity).
        pending_json.replace(final_json)

    final_geojson = acts_dir / f"{final_id}.geojson"
    if pending_geojson.exists():
        if id_changed:
            # Update the ID in the GeoJSON properties too.
            geo = json.loads(pending_geojson.read_text(encoding="utf-8"))
            geo["properties"]["id"] = final_id
            pending_geojson.write_text(
                json.dumps(geo, indent=2, ensure_ascii=False),
                encoding="utf-8",
            )
        pending_geojson.replace(final_geojson)

    return final_id
def cleanup_pending(output_dir: Path, activity_id: str, source_hash: str) -> None:
    """Delete the pending files of an activity that was not chosen as canonical.

    Removes ``{activity_id}.{tag}.pending.json`` and
    ``{activity_id}.{tag}.pending.geojson`` from ``output_dir / "activities"``,
    where ``tag`` is the last eight characters of ``source_hash`` (or
    ``"unknown"`` when it is empty). Files that do not exist are ignored.

    Args:
        output_dir: Directory that contains the ``activities`` sub-directory.
        activity_id: ID under which the pending files were written.
        source_hash: Source hash of the discarded activity.
    """
    short_hash = "unknown" if not source_hash else source_hash[-8:]
    stem = f"{activity_id}.{short_hash}"
    activities = output_dir / "activities"

    doomed = [activities / f"{stem}{ending}" for ending in (".pending.json", ".pending.geojson")]
    for path in doomed:
        # missing_ok: a pending file may never have been written (e.g. no GPS track).
        path.unlink(missing_ok=True)
def build_summary(
activity: ParsedActivity,
metrics: ComputedMetrics,