fix: validate activity file exists before treating cached hash as known

The dedup cache (.bincio_cache.json) persists source hashes across runs.
If the activities/ directory is wiped (e.g. by --fresh in another session, or
by macOS clearing /tmp after a restart) while the cache file, which is stored
at the user-directory level, survives, re-running extract skips every file as
"already extracted" and leaves activities/ empty. Now a hash is treated as
known only if its corresponding .json file is present on disk, so activities
whose output is missing are always re-extracted.
This commit is contained in:
Davide Scaini
2026-04-25 09:54:38 +02:00
parent e08b024d15
commit c077fceba6
+7 -1
View File
@@ -169,7 +169,13 @@ def extract(
console.print(f"Loaded Strava metadata from [cyan]{cfg.metadata_csv.name}[/cyan].") console.print(f"Loaded Strava metadata from [cyan]{cfg.metadata_csv.name}[/cyan].")
dedup = DedupIndex(output_dir=cfg.output_dir) dedup = DedupIndex(output_dir=cfg.output_dir)
known_hashes: frozenset = frozenset(dedup._by_hash.keys()) # Only skip files whose output actually exists — the cache can outlive a
# --fresh wipe or manual deletion of the activities directory.
_acts_dir = cfg.output_dir / "activities"
known_hashes: frozenset = frozenset(
h for h, act_id in dedup._by_hash.items()
if (_acts_dir / f"{act_id}.json").exists()
)
n_workers = workers or cfg.workers or os.cpu_count() or 4 n_workers = workers or cfg.workers or os.cpu_count() or 4
console.print(f"Using [bold]{n_workers}[/bold] worker processes.") console.print(f"Using [bold]{n_workers}[/bold] worker processes.")