Files
bincio-activity/bincio/extract/cli.py
T
2026-03-28 13:59:36 +01:00

272 lines
12 KiB
Python

"""bincio extract — CLI command."""
import json
import sys
from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path
from typing import Optional
import click
from rich.console import Console
from rich.progress import BarColumn, MofNCompleteColumn, Progress, TextColumn, TimeElapsedColumn
from bincio.extract.config import ExtractConfig, default_config, load_config
from bincio.extract.dedup import ActivityRecord, DedupIndex
from bincio.extract.metrics import compute
from bincio.extract.models import ParsedActivity
from bincio.extract.parsers.factory import is_supported, parse_file
from bincio.extract.strava_csv import StravaMetadata
from bincio.extract.writer import build_summary, make_activity_id, write_activity, write_index
# Module-wide Rich console: the status, warning and error messages emitted by
# this file are printed through it (the single-file JSON goes via click.echo).
console = Console()
@click.command()
@click.option("--config", "config_path", type=click.Path(exists=True), default=None,
              help="Path to extract_config.yaml (default: ./extract_config.yaml).")
@click.option("--input", "input_dir", type=click.Path(exists=True), default=None,
              help="Input directory (overrides config).")
@click.option("--output", "output_dir", type=click.Path(), default=None,
              help="Output directory (overrides config).")
@click.option("--file", "single_file", type=click.Path(exists=True), default=None,
              help="Process a single file and print JSON to stdout.")
@click.option("--since", default=None, metavar="YYYY-MM-DD",
              help="Only process files modified after this date.")
@click.option("--workers", default=4, show_default=True, type=click.IntRange(min=1),
              help="Number of parallel worker processes.")
def extract(
    config_path: Optional[str],
    input_dir: Optional[str],
    output_dir: Optional[str],
    single_file: Optional[str],
    since: Optional[str],
    workers: int,
) -> None:
    """Parse GPX/FIT/TCX files and write BAS JSON data store."""
    # The docstring above is what click shows as the command's --help text, so
    # maintainer notes are kept in comments.
    #
    # Flow: resolve config -> collect input files -> parse them in a process
    # pool -> for each parsed activity (in the parent process): optional
    # incremental skip, optional Strava enrichment, metrics, near-duplicate
    # resolution, write to disk, register in the dedup index -> merge the new
    # summaries into index.json and persist the dedup index -> print a report.
    #
    # --workers is validated as >= 1 because ProcessPoolExecutor rejects
    # non-positive max_workers with a bare ValueError.

    # Local import: user-controlled text (file names, exception messages) must
    # be escaped before being embedded in Rich markup, otherwise bracketed
    # fragments such as "[Errno 2]" are interpreted as style tags.
    from rich.markup import escape

    # ── single file mode ─────────────────────────────────────────────────────
    if single_file:
        _process_single(Path(single_file))
        return

    # ── load config ──────────────────────────────────────────────────────────
    cfg = _resolve_config(config_path, input_dir, output_dir)
    cfg.output_dir.mkdir(parents=True, exist_ok=True)

    # ── gather files ─────────────────────────────────────────────────────────
    files = _collect_files(cfg, since)
    if not files:
        console.print("[yellow]No supported files found.[/yellow]")
        return
    console.print(f"Found [bold]{len(files)}[/bold] activity files.")

    # ── Strava metadata ──────────────────────────────────────────────────────
    strava_meta: Optional[StravaMetadata] = None
    if cfg.metadata_csv and cfg.metadata_csv.exists():
        strava_meta = StravaMetadata(cfg.metadata_csv)
        console.print(
            f"Loaded Strava metadata from [cyan]{escape(cfg.metadata_csv.name)}[/cyan]."
        )

    # ── dedup index ──────────────────────────────────────────────────────────
    dedup = DedupIndex(output_dir=cfg.output_dir)

    # ── process ──────────────────────────────────────────────────────────────
    summaries: list[dict] = []
    errors: list[tuple[Path, str]] = []
    skipped = 0
    owner = {"handle": cfg.owner_handle, "display_name": cfg.owner_display_name}

    with Progress(
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
        TimeElapsedColumn(),
        console=console,
    ) as progress:
        task = progress.add_task("Processing...", total=len(files))
        with ProcessPoolExecutor(max_workers=workers) as pool:
            # Only parsing runs in the workers; everything below runs in the
            # parent as each future completes (completion order, not input order).
            futures = {pool.submit(_parse_worker, f): f for f in files}
            for future in as_completed(futures):
                path = futures[future]
                progress.advance(task)
                try:
                    activity = future.result()
                except Exception as exc:
                    # A file that fails to parse is reported at the end and
                    # does not abort the batch.
                    errors.append((path, str(exc)))
                    continue

                # ── incremental skip ──────────────────────────────────────
                if cfg.incremental:
                    existing_id = dedup.is_exact_duplicate(activity.source_hash)
                    if existing_id:
                        skipped += 1
                        continue

                # ── enrich from Strava CSV ────────────────────────────────
                if strava_meta:
                    strava_meta.enrich(activity.source_file, activity)

                # ── compute metrics ───────────────────────────────────────
                metrics = compute(activity)

                # Inferred once, after enrichment, and reused for both the
                # canonical pick and the dedup registration below.
                source = _infer_source(activity)

                # ── deduplication ─────────────────────────────────────────
                activity_id = make_activity_id(activity)
                duplicate_of: Optional[str] = None
                near_dup_id = dedup.find_near_duplicate(
                    activity.started_at, metrics.distance_m
                )
                if near_dup_id:
                    canonical = dedup.pick_canonical(near_dup_id, source)
                    if canonical == "__new__":
                        # New one is better — mark existing as duplicate
                        # NOTE(review): reaches into DedupIndex's private
                        # `_records`; only the in-memory record is updated here.
                        existing = dedup._records[near_dup_id]
                        existing.duplicate_of = activity_id
                    else:
                        duplicate_of = near_dup_id

                # ── write files ───────────────────────────────────────────
                written_id = write_activity(
                    activity, metrics, cfg.output_dir,
                    privacy=cfg.default_privacy,
                    duplicate_of=duplicate_of,
                    rdp_epsilon=cfg.track.rdp_epsilon,
                )

                # Register in dedup index
                dedup.register(ActivityRecord(
                    id=written_id,
                    source_hash=activity.source_hash,
                    started_at=activity.started_at,
                    distance_m=metrics.distance_m,
                    source=source,
                ))

                # Activities flagged as duplicates are written to disk but
                # kept out of the index summaries.
                if duplicate_of is None:
                    summaries.append(
                        build_summary(activity, metrics, written_id, cfg.default_privacy)
                    )

    # ── write index.json ──────────────────────────────────────────────────────
    # Merge with any existing summaries from previous incremental runs;
    # summaries produced in this run win on id collisions.
    existing_index = _load_existing_summaries(cfg.output_dir)
    all_summaries = {s["id"]: s for s in existing_index}
    for s in summaries:
        all_summaries[s["id"]] = s
    write_index(list(all_summaries.values()), cfg.output_dir, owner)
    dedup.save()

    # ── summary ───────────────────────────────────────────────────────────────
    console.print(
        f"\n[green]Done.[/green] "
        f"Processed [bold]{len(summaries)}[/bold] activities, "
        f"skipped [bold]{skipped}[/bold] (already up to date), "
        f"errors [bold]{len(errors)}[/bold]."
    )
    if errors:
        console.print("\n[red]Errors:[/red]")
        # Cap the listing at 20 entries so a bad batch does not flood the terminal.
        for path, msg in errors[:20]:
            console.print(f" {escape(path.name)}: {escape(msg)}")
        if len(errors) > 20:
            console.print(f" ... and {len(errors) - 20} more.")
# ── helpers ───────────────────────────────────────────────────────────────────
def _parse_worker(path: Path) -> ParsedActivity:
    """Parse a single activity file; the parser module is imported inside the call."""
    # Function-scope import so the parser factory is resolved within whichever
    # process executes this function.
    from bincio.extract.parsers import factory

    return factory.parse_file(path)
def _process_single(path: Path) -> None:
    """Parse one file and print its activity summary as JSON on stdout.

    Nothing is written to the data store. Any failure is reported through the
    console and the process exits with status 1.
    """
    # Local import: the exception text is user/OS-controlled and may contain
    # bracketed fragments (e.g. "[Errno 2] ...") that Rich would otherwise
    # treat as markup tags and drop from the output.
    from rich.markup import escape

    # parse_file and build_summary are already imported at module level, so
    # the function-local re-imports were redundant and have been dropped.
    try:
        activity = parse_file(path)
        metrics = compute(activity)
        activity_id = make_activity_id(activity)
        result = build_summary(activity, metrics, activity_id)
        click.echo(json.dumps(result, indent=2))
    except Exception as exc:
        # Top-level CLI boundary: turn any failure into a message + exit code.
        console.print(f"[red]Error:[/red] {escape(str(exc))}")
        sys.exit(1)
def _resolve_config(
    config_path: Optional[str],
    input_dir: Optional[str],
    output_dir: Optional[str],
) -> ExtractConfig:
    """Build the effective ExtractConfig from the CLI options.

    Source of the base config, first match wins:
      1. the file given by ``config_path``;
      2. ``extract_config.yaml`` in the current directory;
      3. ``default_config`` built from ``input_dir`` (output falls back to
         ``./bincio_data`` when ``output_dir`` is not given).

    ``input_dir`` / ``output_dir``, when provided, then override the
    corresponding fields of whichever config was loaded.

    Raises:
        click.UsageError: if none of the three sources is available.
    """
    cli_input = Path(input_dir).expanduser() if input_dir else None
    cli_output = Path(output_dir).expanduser() if output_dir else None

    if config_path:
        resolved = load_config(Path(config_path))
    elif Path("extract_config.yaml").exists():
        resolved = load_config(Path("extract_config.yaml"))
    elif cli_input is not None:
        fallback_output = cli_output
        if fallback_output is None:
            fallback_output = Path("./bincio_data").expanduser()
        resolved = default_config(cli_input, fallback_output)
    else:
        raise click.UsageError(
            "Provide --config, --input, or an extract_config.yaml in the current directory."
        )

    # Explicit command-line paths take precedence over the loaded values.
    if cli_input is not None:
        resolved.input_dirs = [cli_input]
    if cli_output is not None:
        resolved.output_dir = cli_output
    return resolved
def _collect_files(cfg: ExtractConfig, since: Optional[str]) -> list[Path]:
    """Recursively gather the supported activity files under ``cfg.input_dirs``.

    Args:
        cfg: configuration providing ``input_dirs``.
        since: optional ``YYYY-MM-DD`` date; files whose modification time is
            earlier than that date are left out. The date is parsed as a naive
            datetime, so it is interpreted in the local timezone.

    Returns:
        The matching paths, in the order the directory walk yields them.

    Raises:
        click.BadParameter: if ``since`` is not a valid ``YYYY-MM-DD`` date.
    """
    from datetime import datetime

    from rich.markup import escape

    from bincio.extract.parsers.factory import is_supported

    since_ts: Optional[float] = None
    if since:
        try:
            since_ts = datetime.strptime(since, "%Y-%m-%d").timestamp()
        except ValueError as err:
            # Surface a normal CLI usage error instead of a raw traceback.
            raise click.BadParameter(
                f"expected a date in YYYY-MM-DD format, got {since!r}",
                param_hint="--since",
            ) from err

    files: list[Path] = []
    for d in cfg.input_dirs:
        if not d.exists():
            # Escape the path so bracketed directory names are not parsed as
            # Rich markup.
            console.print(
                f"[yellow]Warning:[/yellow] input dir not found: {escape(str(d))}"
            )
            continue
        for path in d.rglob("*"):
            if not path.is_file():
                continue
            if not is_supported(path):
                continue
            # `is not None` rather than truthiness: a cutoff timestamp of 0.0
            # is still a real cutoff.
            if since_ts is not None and path.stat().st_mtime < since_ts:
                continue
            files.append(path)
    return files
def _load_existing_summaries(output_dir: Path) -> list[dict]:
    """Return the ``activities`` list stored in ``output_dir/index.json``.

    Loading is best-effort: a missing, unreadable or malformed index yields an
    empty list rather than an error. The result is always a list, so callers
    can iterate it unconditionally — an index whose top level is not an object,
    or whose ``activities`` value is ``null`` / not a list, also yields ``[]``.
    """
    index_path = output_dir / "index.json"
    try:
        data = json.loads(index_path.read_text())
    except (OSError, ValueError):
        # OSError: file missing or unreadable.
        # ValueError: invalid JSON (JSONDecodeError) or undecodable bytes.
        return []

    if not isinstance(data, dict):
        return []
    activities = data.get("activities")
    return activities if isinstance(activities, list) else []
def _infer_source(activity: ParsedActivity) -> Optional[str]:
    """Classify where an activity came from, based on its id and file name.

    Checks, in order:
      * a truthy ``strava_id``                      -> ``"strava_export"``
      * lower-cased ``source_file`` contains "activity" and has at least
        two dots                                    -> ``"karoo"``
      * ``.fit`` / ``.gpx`` / ``.tcx`` suffix, optionally followed by ``.gz``
                                                    -> ``"<ext>_file"``

    Returns ``None`` when nothing matches.
    """
    if activity.strava_id:
        return "strava_export"

    lowered = activity.source_file.lower()

    # Two or more dots is the same as splitting on "." into three or more parts.
    if lowered.count(".") >= 2 and "activity" in lowered:
        return "karoo"

    labels_by_extension = (
        ("fit", "fit_file"),
        ("gpx", "gpx_file"),
        ("tcx", "tcx_file"),
    )
    for extension, label in labels_by_extension:
        if lowered.endswith((f".{extension}", f".{extension}.gz")):
            return label
    return None