backend: initial commit

This commit is contained in:
Davide Scaini
2026-03-28 13:57:12 +01:00
commit 38c5423aeb
36 changed files with 2463 additions and 0 deletions
+127
View File
@@ -0,0 +1,127 @@
"""Duplicate activity detection.
Two kinds of duplicates:
1. Exact duplicate — same source_hash. Skip entirely.
2. Near-duplicate — same ride recorded by two devices / exported from two
platforms. Detected by (started_at ± 5 min) AND (distance ± 5%).
The "better" source wins; the other gets duplicate_of set.
The deduplication index is a JSON file persisted in the output directory so
that incremental runs don't re-evaluate already-resolved pairs.
"""
import json
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional
_INDEX_FILE = ".bincio_cache.json"
# Source quality ranking (higher = preferred when deduplicating)
_SOURCE_QUALITY: dict[str, int] = {
"karoo": 5,
"fit_file": 4,
"garmin_connect": 4,
"strava_export": 3,
"gpx_file": 2,
"tcx_file": 1,
"wahoo": 3,
"komoot": 2,
"manual": 0,
}
@dataclass
class ActivityRecord:
"""Minimal record stored in the dedup index."""
id: str
source_hash: str
started_at: datetime
distance_m: Optional[float]
source: Optional[str]
duplicate_of: Optional[str] = None
@dataclass
class DedupIndex:
output_dir: Path
_records: dict[str, ActivityRecord] = field(default_factory=dict)
# source_hash → id, for exact-duplicate lookup
_by_hash: dict[str, str] = field(default_factory=dict)
def __post_init__(self) -> None:
self._load()
def _load(self) -> None:
p = self.output_dir / _INDEX_FILE
if not p.exists():
return
data = json.loads(p.read_text())
for item in data.get("activities", []):
started_at = datetime.fromisoformat(item["started_at"])
r = ActivityRecord(
id=item["id"],
source_hash=item["source_hash"],
started_at=started_at,
distance_m=item.get("distance_m"),
source=item.get("source"),
duplicate_of=item.get("duplicate_of"),
)
self._records[r.id] = r
self._by_hash[r.source_hash] = r.id
def save(self) -> None:
p = self.output_dir / _INDEX_FILE
data = {
"activities": [
{
"id": r.id,
"source_hash": r.source_hash,
"started_at": r.started_at.isoformat(),
"distance_m": r.distance_m,
"source": r.source,
"duplicate_of": r.duplicate_of,
}
for r in self._records.values()
]
}
p.write_text(json.dumps(data, indent=2))
def is_exact_duplicate(self, source_hash: str) -> Optional[str]:
"""Return existing activity ID if hash is already in the index."""
return self._by_hash.get(source_hash)
def find_near_duplicate(
self,
started_at: datetime,
distance_m: Optional[float],
) -> Optional[str]:
"""Return ID of a near-duplicate if one exists."""
for r in self._records.values():
if r.duplicate_of is not None:
continue # skip already-marked duplicates
if abs((r.started_at - started_at).total_seconds()) > 5 * 60:
continue
if distance_m is None or r.distance_m is None:
continue
ref = max(distance_m, r.distance_m)
if abs(distance_m - r.distance_m) / ref < 0.05:
return r.id
return None
def register(self, record: ActivityRecord) -> None:
self._records[record.id] = record
self._by_hash[record.source_hash] = record.id
def pick_canonical(self, existing_id: str, new_source: Optional[str]) -> str:
"""Return the ID of whichever record should be canonical."""
existing = self._records[existing_id]
existing_q = _SOURCE_QUALITY.get(existing.source or "", 0)
new_q = _SOURCE_QUALITY.get(new_source or "", 0)
# New record is strictly better → existing becomes the duplicate
if new_q > existing_q:
return "__new__"
return existing_id