From c077fceba6a25830fcc3bc59608f2fea4c787fd1 Mon Sep 17 00:00:00 2001
From: Davide Scaini
Date: Sat, 25 Apr 2026 09:54:38 +0200
Subject: [PATCH] fix: validate activity file exists before treating cached hash as known

The dedup cache (.bincio_cache.json) persists source hashes across runs.
If the activities/ directory is wiped (--fresh in another session, or
macOS clearing /tmp after a restart) while the cache file survives at
the user dir level, re-running extract skips all files as "already
extracted" and leaves activities/ empty.

Now only hashes whose corresponding .json file is present on disk are
treated as known, so missing files are always re-extracted.
---
 bincio/extract/cli.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/bincio/extract/cli.py b/bincio/extract/cli.py
index 6ddf179..24fcdc6 100644
--- a/bincio/extract/cli.py
+++ b/bincio/extract/cli.py
@@ -169,7 +169,13 @@ def extract(
         console.print(f"Loaded Strava metadata from [cyan]{cfg.metadata_csv.name}[/cyan].")
 
     dedup = DedupIndex(output_dir=cfg.output_dir)
-    known_hashes: frozenset = frozenset(dedup._by_hash.keys())
+    # Only skip files whose output actually exists — the cache can outlive a
+    # --fresh wipe or manual deletion of the activities directory.
+    _acts_dir = cfg.output_dir / "activities"
+    known_hashes: frozenset = frozenset(
+        h for h, act_id in dedup._by_hash.items()
+        if (_acts_dir / f"{act_id}.json").exists()
+    )
 
     n_workers = workers or cfg.workers or os.cpu_count() or 4
     console.print(f"Using [bold]{n_workers}[/bold] worker processes.")