"""bincio extract — CLI command.""" import json import sys from concurrent.futures import ProcessPoolExecutor, as_completed from pathlib import Path from typing import Optional import click from rich.console import Console from rich.progress import BarColumn, MofNCompleteColumn, Progress, TextColumn, TimeElapsedColumn from bincio.extract.config import ExtractConfig, default_config, load_config from bincio.extract.dedup import ActivityRecord, DedupIndex from bincio.extract.metrics import compute from bincio.extract.models import ParsedActivity from bincio.extract.parsers.factory import is_supported, parse_file from bincio.extract.strava_csv import StravaMetadata from bincio.extract.writer import build_summary, make_activity_id, write_activity, write_index console = Console() @click.command() @click.option("--config", "config_path", type=click.Path(exists=True), default=None, help="Path to extract_config.yaml (default: ./extract_config.yaml).") @click.option("--input", "input_dir", type=click.Path(exists=True), default=None, help="Input directory (overrides config).") @click.option("--output", "output_dir", type=click.Path(), default=None, help="Output directory (overrides config).") @click.option("--file", "single_file", type=click.Path(exists=True), default=None, help="Process a single file and print JSON to stdout.") @click.option("--since", default=None, metavar="YYYY-MM-DD", help="Only process files modified after this date.") @click.option("--workers", default=4, show_default=True, help="Number of parallel worker processes.") def extract( config_path: Optional[str], input_dir: Optional[str], output_dir: Optional[str], single_file: Optional[str], since: Optional[str], workers: int, ) -> None: """Parse GPX/FIT/TCX files and write BAS JSON data store.""" # ── single file mode ───────────────────────────────────────────────────── if single_file: _process_single(Path(single_file)) return # ── load config ────────────────────────────────────────────────────────── cfg = _resolve_config(config_path, input_dir, output_dir) cfg.output_dir.mkdir(parents=True, exist_ok=True) # ── gather files ───────────────────────────────────────────────────────── files = _collect_files(cfg, since) if not files: console.print("[yellow]No supported files found.[/yellow]") return console.print(f"Found [bold]{len(files)}[/bold] activity files.") # ── Strava metadata ────────────────────────────────────────────────────── strava_meta: Optional[StravaMetadata] = None if cfg.metadata_csv and cfg.metadata_csv.exists(): strava_meta = StravaMetadata(cfg.metadata_csv) console.print(f"Loaded Strava metadata from [cyan]{cfg.metadata_csv.name}[/cyan].") # ── dedup index ────────────────────────────────────────────────────────── dedup = DedupIndex(output_dir=cfg.output_dir) # ── process ────────────────────────────────────────────────────────────── summaries: list[dict] = [] errors: list[tuple[Path, str]] = [] skipped = 0 owner = {"handle": cfg.owner_handle, "display_name": cfg.owner_display_name} with Progress( TextColumn("[progress.description]{task.description}"), BarColumn(), MofNCompleteColumn(), TimeElapsedColumn(), console=console, ) as progress: task = progress.add_task("Processing...", total=len(files)) with ProcessPoolExecutor(max_workers=workers) as pool: futures = {pool.submit(_parse_worker, f): f for f in files} for future in as_completed(futures): path = futures[future] progress.advance(task) try: activity = future.result() except Exception as exc: errors.append((path, str(exc))) continue # ── incremental skip ────────────────────────────────────── if cfg.incremental: existing_id = dedup.is_exact_duplicate(activity.source_hash) if existing_id: skipped += 1 continue # ── enrich from Strava CSV ──────────────────────────────── if strava_meta: strava_meta.enrich(activity.source_file, activity) # ── compute metrics ─────────────────────────────────────── metrics = compute(activity) # ── deduplication ───────────────────────────────────────── activity_id = make_activity_id(activity) duplicate_of: Optional[str] = None near_dup_id = dedup.find_near_duplicate( activity.started_at, metrics.distance_m ) if near_dup_id: source = _infer_source(activity) canonical = dedup.pick_canonical(near_dup_id, source) if canonical == "__new__": # New one is better — mark existing as duplicate existing = dedup._records[near_dup_id] existing.duplicate_of = activity_id else: duplicate_of = near_dup_id # ── write files ─────────────────────────────────────────── written_id = write_activity( activity, metrics, cfg.output_dir, privacy=cfg.default_privacy, duplicate_of=duplicate_of, rdp_epsilon=cfg.track.rdp_epsilon, ) # Register in dedup index dedup.register(ActivityRecord( id=written_id, source_hash=activity.source_hash, started_at=activity.started_at, distance_m=metrics.distance_m, source=_infer_source(activity), )) if duplicate_of is None: summaries.append( build_summary(activity, metrics, written_id, cfg.default_privacy) ) # ── write index.json ────────────────────────────────────────────────────── # Merge with any existing summaries from previous incremental runs existing_index = _load_existing_summaries(cfg.output_dir) all_summaries = {s["id"]: s for s in existing_index} for s in summaries: all_summaries[s["id"]] = s write_index(list(all_summaries.values()), cfg.output_dir, owner) dedup.save() # ── summary ─────────────────────────────────────────────────────────────── console.print( f"\n[green]Done.[/green] " f"Processed [bold]{len(summaries)}[/bold] activities, " f"skipped [bold]{skipped}[/bold] (already up to date), " f"errors [bold]{len(errors)}[/bold]." ) if errors: console.print("\n[red]Errors:[/red]") for path, msg in errors[:20]: console.print(f" {path.name}: {msg}") if len(errors) > 20: console.print(f" ... and {len(errors) - 20} more.") # ── helpers ─────────────────────────────────────────────────────────────────── def _parse_worker(path: Path) -> ParsedActivity: """Run in worker process — imports are isolated.""" from bincio.extract.parsers.factory import parse_file return parse_file(path) def _process_single(path: Path) -> None: from bincio.extract.parsers.factory import parse_file try: activity = parse_file(path) metrics = compute(activity) activity_id = make_activity_id(activity) from bincio.extract.writer import build_summary result = build_summary(activity, metrics, activity_id) click.echo(json.dumps(result, indent=2)) except Exception as exc: console.print(f"[red]Error:[/red] {exc}") sys.exit(1) def _resolve_config( config_path: Optional[str], input_dir: Optional[str], output_dir: Optional[str], ) -> ExtractConfig: if config_path: cfg = load_config(Path(config_path)) elif Path("extract_config.yaml").exists(): cfg = load_config(Path("extract_config.yaml")) elif input_dir: cfg = default_config( Path(input_dir).expanduser(), Path(output_dir or "./bincio_data").expanduser(), ) else: raise click.UsageError( "Provide --config, --input, or an extract_config.yaml in the current directory." ) if input_dir: cfg.input_dirs = [Path(input_dir).expanduser()] if output_dir: cfg.output_dir = Path(output_dir).expanduser() return cfg def _collect_files(cfg: ExtractConfig, since: Optional[str]) -> list[Path]: from bincio.extract.parsers.factory import is_supported import os from datetime import datetime since_ts: Optional[float] = None if since: since_ts = datetime.strptime(since, "%Y-%m-%d").timestamp() files = [] for d in cfg.input_dirs: if not d.exists(): console.print(f"[yellow]Warning:[/yellow] input dir not found: {d}") continue for path in d.rglob("*"): if not path.is_file(): continue if not is_supported(path): continue if since_ts and path.stat().st_mtime < since_ts: continue files.append(path) return files def _load_existing_summaries(output_dir: Path) -> list[dict]: index_path = output_dir / "index.json" if not index_path.exists(): return [] try: data = json.loads(index_path.read_text()) return data.get("activities", []) except Exception: return [] def _infer_source(activity: ParsedActivity) -> Optional[str]: if activity.strava_id: return "strava_export" name = activity.source_file.lower() if "activity" in name and len(name.split(".")) >= 3: return "karoo" if name.endswith((".fit", ".fit.gz")): return "fit_file" if name.endswith((".gpx", ".gpx.gz")): return "gpx_file" if name.endswith((".tcx", ".tcx.gz")): return "tcx_file" return None