backend: initial commit

2026-03-28 13:57:12 +01:00
commit 38c5423aeb
36 changed files with 2463 additions and 0 deletions
@@ -0,0 +1,127 @@
+"""Duplicate activity detection.
+
+Two kinds of duplicates:
+
+1. Exact duplicate — same source_hash. Skip entirely.
+2. Near-duplicate — same ride recorded by two devices / exported from two
+   platforms. Detected by (started_at ± 5 min) AND (distance ± 5%).
+   The "better" source wins; the other gets duplicate_of set.
+
+The deduplication index is a JSON file persisted in the output directory so
+that incremental runs don't re-evaluate already-resolved pairs.
+"""
+
+import json
+from dataclasses import dataclass, field
+from datetime import datetime, timedelta
+from pathlib import Path
+from typing import Optional
+
+_INDEX_FILE = ".bincio_cache.json"
+
+# Source quality ranking (higher = preferred when deduplicating)
+_SOURCE_QUALITY: dict[str, int] = {
+    "karoo": 5,
+    "fit_file": 4,
+    "garmin_connect": 4,
+    "strava_export": 3,
+    "gpx_file": 2,
+    "tcx_file": 1,
+    "wahoo": 3,
+    "komoot": 2,
+    "manual": 0,
+}
+
+
+@dataclass
+class ActivityRecord:
+    """Minimal record stored in the dedup index."""
+
+    id: str
+    source_hash: str
+    started_at: datetime
+    distance_m: Optional[float]
+    source: Optional[str]
+    duplicate_of: Optional[str] = None
+
+
+@dataclass
+class DedupIndex:
+    output_dir: Path
+    _records: dict[str, ActivityRecord] = field(default_factory=dict)
+    # source_hash → id, for exact-duplicate lookup
+    _by_hash: dict[str, str] = field(default_factory=dict)
+
+    def __post_init__(self) -> None:
+        self._load()
+
+    def _load(self) -> None:
+        p = self.output_dir / _INDEX_FILE
+        if not p.exists():
+            return
+        data = json.loads(p.read_text())
+        for item in data.get("activities", []):
+            started_at = datetime.fromisoformat(item["started_at"])
+            r = ActivityRecord(
+                id=item["id"],
+                source_hash=item["source_hash"],
+                started_at=started_at,
+                distance_m=item.get("distance_m"),
+                source=item.get("source"),
+                duplicate_of=item.get("duplicate_of"),
+            )
+            self._records[r.id] = r
+            self._by_hash[r.source_hash] = r.id
+
+    def save(self) -> None:
+        p = self.output_dir / _INDEX_FILE
+        data = {
+            "activities": [
+                {
+                    "id": r.id,
+                    "source_hash": r.source_hash,
+                    "started_at": r.started_at.isoformat(),
+                    "distance_m": r.distance_m,
+                    "source": r.source,
+                    "duplicate_of": r.duplicate_of,
+                }
+                for r in self._records.values()
+            ]
+        }
+        p.write_text(json.dumps(data, indent=2))
+
+    def is_exact_duplicate(self, source_hash: str) -> Optional[str]:
+        """Return existing activity ID if hash is already in the index."""
+        return self._by_hash.get(source_hash)
+
+    def find_near_duplicate(
+        self,
+        started_at: datetime,
+        distance_m: Optional[float],
+    ) -> Optional[str]:
+        """Return ID of a near-duplicate if one exists."""
+        for r in self._records.values():
+            if r.duplicate_of is not None:
+                continue  # skip already-marked duplicates
+            if abs((r.started_at - started_at).total_seconds()) > 5 * 60:
+                continue
+            if distance_m is None or r.distance_m is None:
+                continue
+            ref = max(distance_m, r.distance_m)
+            if abs(distance_m - r.distance_m) / ref < 0.05:
+                return r.id
+        return None
+
+    def register(self, record: ActivityRecord) -> None:
+        self._records[record.id] = record
+        self._by_hash[record.source_hash] = record.id
+
+    def pick_canonical(self, existing_id: str, new_source: Optional[str]) -> str:
+        """Return the ID of whichever record should be canonical."""
+        existing = self._records[existing_id]
+        existing_q = _SOURCE_QUALITY.get(existing.source or "", 0)
+        new_q = _SOURCE_QUALITY.get(new_source or "", 0)
+        # New record is strictly better → existing becomes the duplicate
+        if new_q > existing_q:
+            return "__new__"
+        return existing_id