"""Process a Strava bulk export ZIP file into a BAS data store. The ZIP (downloaded from strava.com/athlete/delete_your_account or the data export page) contains: activities/ ← GPX, FIT, TCX files (plain or .gz variants) activities.csv ← metadata (title, description, gear, strava ID) bikes.csv / shoes.csv / … (ignored here) Processing strategy: stream one activity at a time to keep disk usage low. The ZIP is never fully extracted; each activity file is extracted to a temp path, parsed, ingested, then immediately deleted. The ZIP itself is deleted once done. """ from __future__ import annotations import io import json import tempfile import zipfile from pathlib import Path from typing import Generator, Optional # File extensions recognised as activity files inside the ZIP. _ACTIVITY_SUFFIXES = {".gpx", ".fit", ".tcx", ".gpx.gz", ".fit.gz", ".tcx.gz"} def _is_activity_file(name: str) -> bool: n = name.lower() return any(n.endswith(s) for s in _ACTIVITY_SUFFIXES) def strava_zip_iter( zip_path: Path, data_dir: Path, originals_dir: Optional[Path] = None, privacy: str = "public", ) -> Generator[dict, None, None]: """Process a Strava export ZIP, yielding SSE-style progress dicts. Event types: {"type": "validating"} {"type": "error", "message": str} {"type": "extracting_csv"} {"type": "progress", "n": int, "total": int, "name": str, "status": "imported"|"skipped"|"error"} {"type": "done", "imported": int, "skipped": int, "error_count": int, "errors": list[str]} The zip_path file is deleted after processing regardless of success/failure. """ from bincio.extract.ingest import ingest_parsed from bincio.extract.parsers.factory import parse_file from bincio.extract.strava_csv import StravaMetadata yield {"type": "validating"} try: zf = zipfile.ZipFile(zip_path, "r") except zipfile.BadZipFile as e: zip_path.unlink(missing_ok=True) yield {"type": "error", "message": f"Not a valid ZIP file: {e}"} return try: names = zf.namelist() # Validate structure has_csv = "activities.csv" in names activity_files = [n for n in names if n.startswith("activities/") and _is_activity_file(n)] if not has_csv: yield {"type": "error", "message": "This doesn't look like a Strava export: activities.csv not found"} return if not activity_files: yield {"type": "error", "message": "No activity files found in activities/ folder"} return # Load activities.csv into memory (it's small — ~700 KB) yield {"type": "extracting_csv"} csv_bytes = zf.read("activities.csv") with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as tmp_csv: tmp_csv.write(csv_bytes) tmp_csv_path = Path(tmp_csv.name) try: metadata = StravaMetadata(tmp_csv_path) finally: tmp_csv_path.unlink(missing_ok=True) total = len(activity_files) imported = 0 skipped = 0 errors: list[str] = [] for n, zip_entry in enumerate(activity_files, 1): entry_name = Path(zip_entry).name # e.g. "12345678.fit.gz" # Title from metadata if available; fall back to filename stem meta_row = metadata.lookup(entry_name) display_name = (meta_row or {}).get("Activity Name", "").strip() or entry_name # Determine activity ID from entry to check for duplicates before extracting # (can't do this without parsing, so we extract to a small temp file) suffix = "".join(Path(entry_name).suffixes) # ".fit.gz" or ".gpx" etc. tmp_path: Optional[Path] = None try: with tempfile.NamedTemporaryFile(suffix=suffix, delete=False, dir=data_dir) as tmp: tmp.write(zf.read(zip_entry)) tmp_path = Path(tmp.name) parsed = parse_file(tmp_path) # Enrich with CSV metadata if meta_row: if not parsed.title and meta_row.get("Activity Name"): parsed.title = meta_row["Activity Name"].strip() if not parsed.description and meta_row.get("Activity Description"): parsed.description = meta_row["Activity Description"].strip() if not parsed.strava_id and meta_row.get("Activity ID"): parsed.strava_id = meta_row["Activity ID"].strip() if not parsed.gear and meta_row.get("Gear"): parsed.gear = meta_row["Gear"].strip() if originals_dir is not None: import shutil orig_dest = originals_dir / entry_name shutil.copy2(tmp_path, orig_dest) ingest_parsed(parsed, data_dir, privacy=privacy) imported += 1 yield {"type": "progress", "n": n, "total": total, "name": display_name, "status": "imported"} except FileExistsError: skipped += 1 yield {"type": "progress", "n": n, "total": total, "name": display_name, "status": "skipped"} except Exception as exc: errors.append(f"{entry_name}: {type(exc).__name__}") yield {"type": "progress", "n": n, "total": total, "name": display_name, "status": "error"} finally: if tmp_path is not None: tmp_path.unlink(missing_ok=True) finally: zf.close() zip_path.unlink(missing_ok=True) yield { "type": "done", "imported": imported, "skipped": skipped, "error_count": len(errors), "errors": errors[:5], }