parallelizing extraction, fix tcx files

This commit is contained in:
Davide Scaini
2026-03-28 14:24:16 +01:00
parent 38c5423aeb
commit 5d58126d2f
6 changed files with 226 additions and 192 deletions
+149 -122
View File
@@ -1,6 +1,7 @@
"""bincio extract — CLI command.""" """bincio extract — CLI command."""
import json import json
import os
import sys import sys
from concurrent.futures import ProcessPoolExecutor, as_completed from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path from pathlib import Path
@@ -12,15 +13,88 @@ from rich.progress import BarColumn, MofNCompleteColumn, Progress, TextColumn, T
from bincio.extract.config import ExtractConfig, default_config, load_config from bincio.extract.config import ExtractConfig, default_config, load_config
from bincio.extract.dedup import ActivityRecord, DedupIndex from bincio.extract.dedup import ActivityRecord, DedupIndex
from bincio.extract.metrics import compute from bincio.extract.parsers.factory import is_supported
from bincio.extract.models import ParsedActivity
from bincio.extract.parsers.factory import is_supported, parse_file
from bincio.extract.strava_csv import StravaMetadata
from bincio.extract.writer import build_summary, make_activity_id, write_activity, write_index
console = Console() console = Console()
# ── per-worker state (set once via initializer, never re-pickled) ─────────────
# These module globals are overwritten by _worker_init() and afterwards only
# read by _process_file(); the values below are placeholders until then.

# Source hashes treated as exact duplicates (membership-tested per file).
_known_hashes: frozenset = frozenset()
# Strava CSV rows, looked up by an activity's ``source_file``.
_strava_lookup: dict = {}
# Directory handed to write_activity().
_output_dir: Path = Path(".")
# Privacy value forwarded to write_activity() and build_summary().
_privacy: str = "public"
# Forwarded to write_activity() as ``rdp_epsilon``.
_rdp_epsilon: float = 0.0001
def _worker_init(
    known_hashes: frozenset,
    strava_lookup: dict,
    output_dir: Path,
    privacy: str,
    rdp_epsilon: float,
) -> None:
    """Store the shared run settings in this process's module globals.

    Intended to be used as the ``initializer`` of the process pool, so each
    worker receives these values once and ``_process_file`` can read them
    without them being passed along with every task.

    Args:
        known_hashes: Source hashes that count as exact duplicates.
        strava_lookup: Strava CSV rows looked up by source file name.
        output_dir: Directory the activity files are written to.
        privacy: Privacy value applied to written activities and summaries.
        rdp_epsilon: Value forwarded to the writer as ``rdp_epsilon``.
    """
    global _known_hashes, _strava_lookup, _output_dir, _privacy, _rdp_epsilon
    (
        _known_hashes,
        _strava_lookup,
        _output_dir,
        _privacy,
        _rdp_epsilon,
    ) = (known_hashes, strava_lookup, output_dir, privacy, rdp_epsilon)
def _process_file(path: Path) -> dict:
    """Parse, enrich, measure and write one activity file inside a worker.

    Only ``path`` travels with the task; everything else is read from the
    module globals that ``_worker_init`` filled in (``_known_hashes``,
    ``_strava_lookup``, ``_output_dir``, ``_privacy``, ``_rdp_epsilon``).

    Args:
        path: Activity file to process.

    Returns:
        A plain dict whose ``"status"`` key is one of:

        * ``"error"`` -- with ``"path"`` (str) and ``"error"`` (message);
        * ``"duplicate"`` -- the file's ``source_hash`` is in ``_known_hashes``;
        * ``"ok"`` -- with ``"summary"``, ``"id"``, ``"hash"``,
          ``"started_at"`` (ISO 8601 string), ``"distance_m"`` and ``"source"``.

    Failures while parsing, enriching, computing metrics or writing are
    reported as an ``"error"`` result instead of being raised, so one bad file
    (or one malformed CSV row) cannot abort the caller's result loop.
    """
    from bincio.extract.metrics import compute
    from bincio.extract.parsers.factory import parse_file
    from bincio.extract.writer import build_summary, make_activity_id, write_activity

    try:
        activity = parse_file(path)
    except Exception as exc:
        return {"status": "error", "path": str(path), "error": str(exc)}

    # Exact-duplicate check (cheap: a set lookup).
    if activity.source_hash in _known_hashes:
        return {"status": "duplicate"}

    # (activity attribute, Strava CSV column) pairs; only fill in what the
    # parsed file did not already provide.
    strava_fields = (
        ("title", "Activity Name"),
        ("description", "Activity Description"),
        ("strava_id", "Activity ID"),
    )

    try:
        row = _strava_lookup.get(activity.source_file)
        if row:
            for attr, column in strava_fields:
                if not getattr(activity, attr):
                    # A CSV cell may be missing or None; treat it as empty
                    # rather than calling .strip() on None.
                    setattr(activity, attr, (row.get(column) or "").strip() or None)

        metrics = compute(activity)
        activity_id = make_activity_id(activity)
        write_activity(
            activity, metrics, _output_dir,
            privacy=_privacy,
            rdp_epsilon=_rdp_epsilon,
        )
        summary = build_summary(activity, metrics, activity_id, _privacy)
        # Serialised here so the result stays a plain, cheaply picklable dict.
        started_at = activity.started_at.isoformat()
    except Exception as exc:
        return {"status": "error", "path": str(path), "error": str(exc)}

    return {
        "status": "ok",
        "summary": summary,
        "id": activity_id,
        "hash": activity.source_hash,
        "started_at": started_at,
        "distance_m": metrics.distance_m,
        "source": summary.get("source"),
    }
# ── CLI ────────────────────────────────────────────────────────────────────────
@click.command() @click.command()
@click.option("--config", "config_path", type=click.Path(exists=True), default=None, @click.option("--config", "config_path", type=click.Path(exists=True), default=None,
help="Path to extract_config.yaml (default: ./extract_config.yaml).") help="Path to extract_config.yaml (default: ./extract_config.yaml).")
@@ -32,49 +106,48 @@ console = Console()
help="Process a single file and print JSON to stdout.") help="Process a single file and print JSON to stdout.")
@click.option("--since", default=None, metavar="YYYY-MM-DD", @click.option("--since", default=None, metavar="YYYY-MM-DD",
help="Only process files modified after this date.") help="Only process files modified after this date.")
@click.option("--workers", default=4, show_default=True, @click.option("--workers", default=None, type=int,
help="Number of parallel worker processes.") help="Parallel worker processes (default: CPU count).")
def extract( def extract(
config_path: Optional[str], config_path: Optional[str],
input_dir: Optional[str], input_dir: Optional[str],
output_dir: Optional[str], output_dir: Optional[str],
single_file: Optional[str], single_file: Optional[str],
since: Optional[str], since: Optional[str],
workers: int, workers: Optional[int],
) -> None: ) -> None:
"""Parse GPX/FIT/TCX files and write BAS JSON data store.""" """Parse GPX/FIT/TCX files and write BAS JSON data store."""
# ── single file mode ─────────────────────────────────────────────────────
if single_file: if single_file:
_process_single(Path(single_file)) _process_single(Path(single_file))
return return
# ── load config ──────────────────────────────────────────────────────────
cfg = _resolve_config(config_path, input_dir, output_dir) cfg = _resolve_config(config_path, input_dir, output_dir)
cfg.output_dir.mkdir(parents=True, exist_ok=True) cfg.output_dir.mkdir(parents=True, exist_ok=True)
# ── gather files ─────────────────────────────────────────────────────────
files = _collect_files(cfg, since) files = _collect_files(cfg, since)
if not files: if not files:
console.print("[yellow]No supported files found.[/yellow]") console.print("[yellow]No supported files found.[/yellow]")
return return
console.print(f"Found [bold]{len(files)}[/bold] activity files.") console.print(f"Found [bold]{len(files)}[/bold] activity files.")
# ── Strava metadata ────────────────────────────────────────────────────── # Build strava lookup once (serialised dict, sent to workers via initializer)
strava_meta: Optional[StravaMetadata] = None strava_lookup: dict = {}
if cfg.metadata_csv and cfg.metadata_csv.exists(): if cfg.metadata_csv and cfg.metadata_csv.exists():
strava_meta = StravaMetadata(cfg.metadata_csv) from bincio.extract.strava_csv import StravaMetadata
strava_lookup = StravaMetadata(cfg.metadata_csv)._by_filename
console.print(f"Loaded Strava metadata from [cyan]{cfg.metadata_csv.name}[/cyan].") console.print(f"Loaded Strava metadata from [cyan]{cfg.metadata_csv.name}[/cyan].")
# ── dedup index ──────────────────────────────────────────────────────────
dedup = DedupIndex(output_dir=cfg.output_dir) dedup = DedupIndex(output_dir=cfg.output_dir)
known_hashes: frozenset = frozenset(dedup._by_hash.keys())
# ── process ────────────────────────────────────────────────────────────── n_workers = workers or os.cpu_count() or 4
summaries: list[dict] = [] console.print(f"Using [bold]{n_workers}[/bold] worker processes.")
errors: list[tuple[Path, str]] = []
skipped = 0
owner = {"handle": cfg.owner_handle, "display_name": cfg.owner_display_name} owner = {"handle": cfg.owner_handle, "display_name": cfg.owner_display_name}
summaries: list[dict] = []
errors: list[tuple[str, str]] = []
skipped = 0
with Progress( with Progress(
TextColumn("[progress.description]{task.description}"), TextColumn("[progress.description]{task.description}"),
@@ -85,80 +158,52 @@ def extract(
) as progress: ) as progress:
task = progress.add_task("Processing...", total=len(files)) task = progress.add_task("Processing...", total=len(files))
with ProcessPoolExecutor(max_workers=workers) as pool: with ProcessPoolExecutor(
futures = {pool.submit(_parse_worker, f): f for f in files} max_workers=n_workers,
initializer=_worker_init,
initargs=(known_hashes, strava_lookup, cfg.output_dir, cfg.default_privacy, cfg.track.rdp_epsilon),
) as pool:
futures = {pool.submit(_process_file, f): f for f in files}
for future in as_completed(futures): for future in as_completed(futures):
path = futures[future]
progress.advance(task) progress.advance(task)
try: result = future.result()
activity = future.result()
except Exception as exc:
errors.append((path, str(exc)))
continue
# ── incremental skip ────────────────────────────────────── if result["status"] == "duplicate":
if cfg.incremental: skipped += 1
existing_id = dedup.is_exact_duplicate(activity.source_hash) elif result["status"] == "error":
if existing_id: errors.append((result["path"], result["error"]))
skipped += 1 else:
continue # Near-duplicate check — must be sequential (stateful)
from datetime import datetime
started_at = datetime.fromisoformat(result["started_at"])
near_id = dedup.find_near_duplicate(started_at, result["distance_m"])
# ── enrich from Strava CSV ──────────────────────────────── if near_id:
if strava_meta: canonical = dedup.pick_canonical(near_id, result.get("source"))
strava_meta.enrich(activity.source_file, activity) if canonical != "__new__":
_patch_duplicate_of(cfg.output_dir, result["id"], near_id)
skipped += 1
continue
_patch_duplicate_of(cfg.output_dir, near_id, result["id"])
dedup._records[near_id].duplicate_of = result["id"]
# ── compute metrics ─────────────────────────────────────── dedup.register(ActivityRecord(
metrics = compute(activity) id=result["id"],
source_hash=result["hash"],
started_at=started_at,
distance_m=result["distance_m"],
source=result.get("source"),
))
summaries.append(result["summary"])
# ── deduplication ───────────────────────────────────────── from bincio.extract.writer import write_index
activity_id = make_activity_id(activity) existing = _load_existing_summaries(cfg.output_dir)
duplicate_of: Optional[str] = None merged = {s["id"]: s for s in existing}
near_dup_id = dedup.find_near_duplicate(
activity.started_at, metrics.distance_m
)
if near_dup_id:
source = _infer_source(activity)
canonical = dedup.pick_canonical(near_dup_id, source)
if canonical == "__new__":
# New one is better — mark existing as duplicate
existing = dedup._records[near_dup_id]
existing.duplicate_of = activity_id
else:
duplicate_of = near_dup_id
# ── write files ───────────────────────────────────────────
written_id = write_activity(
activity, metrics, cfg.output_dir,
privacy=cfg.default_privacy,
duplicate_of=duplicate_of,
rdp_epsilon=cfg.track.rdp_epsilon,
)
# Register in dedup index
dedup.register(ActivityRecord(
id=written_id,
source_hash=activity.source_hash,
started_at=activity.started_at,
distance_m=metrics.distance_m,
source=_infer_source(activity),
))
if duplicate_of is None:
summaries.append(
build_summary(activity, metrics, written_id, cfg.default_privacy)
)
# ── write index.json ──────────────────────────────────────────────────────
# Merge with any existing summaries from previous incremental runs
existing_index = _load_existing_summaries(cfg.output_dir)
all_summaries = {s["id"]: s for s in existing_index}
for s in summaries: for s in summaries:
all_summaries[s["id"]] = s merged[s["id"]] = s
write_index(list(all_summaries.values()), cfg.output_dir, owner) write_index(list(merged.values()), cfg.output_dir, owner)
dedup.save() dedup.save()
# ── summary ───────────────────────────────────────────────────────────────
console.print( console.print(
f"\n[green]Done.[/green] " f"\n[green]Done.[/green] "
f"Processed [bold]{len(summaries)}[/bold] activities, " f"Processed [bold]{len(summaries)}[/bold] activities, "
@@ -168,28 +213,22 @@ def extract(
if errors: if errors:
console.print("\n[red]Errors:[/red]") console.print("\n[red]Errors:[/red]")
for path, msg in errors[:20]: for path, msg in errors[:20]:
console.print(f" {path.name}: {msg}") console.print(f" {Path(path).name}: {msg}")
if len(errors) > 20: if len(errors) > 20:
console.print(f" ... and {len(errors) - 20} more.") console.print(f" ... and {len(errors) - 20} more.")
# ── helpers ─────────────────────────────────────────────────────────────────── # ── helpers ───────────────────────────────────────────────────────────────────
def _parse_worker(path: Path) -> ParsedActivity:
"""Run in worker process — imports are isolated."""
from bincio.extract.parsers.factory import parse_file
return parse_file(path)
def _process_single(path: Path) -> None: def _process_single(path: Path) -> None:
from bincio.extract.metrics import compute
from bincio.extract.parsers.factory import parse_file from bincio.extract.parsers.factory import parse_file
from bincio.extract.writer import build_summary, make_activity_id
try: try:
activity = parse_file(path) activity = parse_file(path)
metrics = compute(activity) metrics = compute(activity)
activity_id = make_activity_id(activity) activity_id = make_activity_id(activity)
from bincio.extract.writer import build_summary click.echo(json.dumps(build_summary(activity, metrics, activity_id), indent=2))
result = build_summary(activity, metrics, activity_id)
click.echo(json.dumps(result, indent=2))
except Exception as exc: except Exception as exc:
console.print(f"[red]Error:[/red] {exc}") console.print(f"[red]Error:[/red] {exc}")
sys.exit(1) sys.exit(1)
@@ -221,51 +260,39 @@ def _resolve_config(
def _collect_files(cfg: ExtractConfig, since: Optional[str]) -> list[Path]: def _collect_files(cfg: ExtractConfig, since: Optional[str]) -> list[Path]:
from bincio.extract.parsers.factory import is_supported
import os
from datetime import datetime from datetime import datetime
since_ts: Optional[float] = None since_ts: Optional[float] = None
if since: if since:
since_ts = datetime.strptime(since, "%Y-%m-%d").timestamp() since_ts = datetime.strptime(since, "%Y-%m-%d").timestamp()
files = [] files = []
for d in cfg.input_dirs: for d in cfg.input_dirs:
if not d.exists(): if not d.exists():
console.print(f"[yellow]Warning:[/yellow] input dir not found: {d}") console.print(f"[yellow]Warning:[/yellow] input dir not found: {d}")
continue continue
for path in d.rglob("*"): for path in d.rglob("*"):
if not path.is_file(): if path.is_file() and is_supported(path):
continue if not since_ts or path.stat().st_mtime >= since_ts:
if not is_supported(path): files.append(path)
continue
if since_ts and path.stat().st_mtime < since_ts:
continue
files.append(path)
return files return files
def _load_existing_summaries(output_dir: Path) -> list[dict]: def _load_existing_summaries(output_dir: Path) -> list[dict]:
index_path = output_dir / "index.json" p = output_dir / "index.json"
if not index_path.exists(): if not p.exists():
return [] return []
try: try:
data = json.loads(index_path.read_text()) return json.loads(p.read_text()).get("activities", [])
return data.get("activities", [])
except Exception: except Exception:
return [] return []
def _infer_source(activity: ParsedActivity) -> Optional[str]: def _patch_duplicate_of(output_dir: Path, activity_id: str, canonical_id: str) -> None:
if activity.strava_id: p = output_dir / "activities" / f"{activity_id}.json"
return "strava_export" if not p.exists():
name = activity.source_file.lower() return
if "activity" in name and len(name.split(".")) >= 3: try:
return "karoo" data = json.loads(p.read_text())
if name.endswith((".fit", ".fit.gz")): data["duplicate_of"] = canonical_id
return "fit_file" p.write_text(json.dumps(data, indent=2, ensure_ascii=False))
if name.endswith((".gpx", ".gpx.gz")): except Exception:
return "gpx_file" pass
if name.endswith((".tcx", ".tcx.gz")):
return "tcx_file"
return None
+2
View File
@@ -108,6 +108,8 @@ class DedupIndex:
if distance_m is None or r.distance_m is None: if distance_m is None or r.distance_m is None:
continue continue
ref = max(distance_m, r.distance_m) ref = max(distance_m, r.distance_m)
if ref < 1.0:
continue # both near-zero (indoor/manual) — skip distance check
if abs(distance_m - r.distance_m) / ref < 0.05: if abs(distance_m - r.distance_m) / ref < 0.05:
return r.id return r.id
return None return None
+59 -61
View File
@@ -1,19 +1,28 @@
"""Compute aggregated metrics from a ParsedActivity. """Compute aggregated metrics from a ParsedActivity.
All calculations are self-contained — no external state needed. All calculations are self-contained — no external state needed.
Uses inline haversine rather than geopy.geodesic to keep the hot path fast.
""" """
import math import math
from dataclasses import dataclass from dataclasses import dataclass
from datetime import datetime
from typing import Optional from typing import Optional
from geopy.distance import geodesic
from bincio.extract.models import DataPoint, ParsedActivity from bincio.extract.models import DataPoint, ParsedActivity
# Speed below which we consider the athlete stopped (km/h) # Speed below which we consider the athlete stopped (km/h)
_STOPPED_THRESHOLD_KMH = 1.0 _STOPPED_THRESHOLD_KMH = 1.0
_EARTH_R = 6_371_000.0  # metres


def _haversine_m(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
    """Return the great-circle distance between two lat/lon points, in metres.

    Coordinates are given in degrees (they are converted with
    ``math.radians``) and the Earth is modelled as a sphere of radius
    ``_EARTH_R``. The haversine term is clamped to 1.0 before the square root
    so rounding error cannot push ``asin`` outside its domain.
    """
    lat1_rad = math.radians(lat1)
    lat2_rad = math.radians(lat2)
    half_dlat = (lat2_rad - lat1_rad) * 0.5
    half_dlon = math.radians(lon2 - lon1) * 0.5

    hav = math.sin(half_dlat) ** 2
    hav += math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(half_dlon) ** 2

    central_angle = 2.0 * math.asin(math.sqrt(min(hav, 1.0)))
    return central_angle * _EARTH_R
@dataclass @dataclass
@@ -30,7 +39,7 @@ class ComputedMetrics:
avg_cadence_rpm: Optional[int] avg_cadence_rpm: Optional[int]
avg_power_w: Optional[int] avg_power_w: Optional[int]
max_power_w: Optional[int] max_power_w: Optional[int]
bbox: Optional[tuple[float, float, float, float]] # min_lon, min_lat, max_lon, max_lat bbox: Optional[tuple[float, float, float, float]] # min_lon, min_lat, max_lon, max_lat
start_latlng: Optional[tuple[float, float]] start_latlng: Optional[tuple[float, float]]
end_latlng: Optional[tuple[float, float]] end_latlng: Optional[tuple[float, float]]
@@ -41,10 +50,8 @@ def compute(activity: ParsedActivity) -> ComputedMetrics:
return _empty() return _empty()
duration_s = _duration(pts) duration_s = _duration(pts)
distance_m = _distance(pts) distance_m, moving_time_s, avg_speed_kmh, max_speed_kmh = _gps_stats(pts)
moving_time_s, moving_speed_kmh = _moving_stats(pts)
gain, loss = _elevation(pts) gain, loss = _elevation(pts)
max_speed = _max_speed(pts)
avg_hr, max_hr = _hr_stats(pts) avg_hr, max_hr = _hr_stats(pts)
avg_cad = _avg_nonnull([p.cadence_rpm for p in pts]) avg_cad = _avg_nonnull([p.cadence_rpm for p in pts])
avg_pow = _avg_nonnull([p.power_w for p in pts]) avg_pow = _avg_nonnull([p.power_w for p in pts])
@@ -58,8 +65,8 @@ def compute(activity: ParsedActivity) -> ComputedMetrics:
moving_time_s=moving_time_s, moving_time_s=moving_time_s,
elevation_gain_m=round(gain, 1) if gain is not None else None, elevation_gain_m=round(gain, 1) if gain is not None else None,
elevation_loss_m=round(abs(loss), 1) if loss is not None else None, elevation_loss_m=round(abs(loss), 1) if loss is not None else None,
avg_speed_kmh=round(moving_speed_kmh, 2) if moving_speed_kmh else None, avg_speed_kmh=round(avg_speed_kmh, 2) if avg_speed_kmh else None,
max_speed_kmh=round(max_speed, 2) if max_speed else None, max_speed_kmh=round(max_speed_kmh, 2) if max_speed_kmh else None,
avg_hr_bpm=avg_hr, avg_hr_bpm=avg_hr,
max_hr_bpm=max_hr, max_hr_bpm=max_hr,
avg_cadence_rpm=avg_cad, avg_cadence_rpm=avg_cad,
@@ -71,66 +78,75 @@ def compute(activity: ParsedActivity) -> ComputedMetrics:
) )
# ── helpers ────────────────────────────────────────────────────────────────── # ── single-pass GPS stats ──────────────────────────────────────────────────────
# distance, moving time, avg speed, and max speed are all derived from the same
# per-segment loop, so we compute them in one pass instead of four.
def _duration(pts: list[DataPoint]) -> Optional[int]: def _gps_stats(
if len(pts) < 2: pts: list[DataPoint],
return None ) -> tuple[Optional[float], Optional[int], Optional[float], Optional[float]]:
return int((pts[-1].timestamp - pts[0].timestamp).total_seconds()) """Return (distance_m, moving_time_s, avg_speed_kmh, max_speed_kmh)."""
# Prefer device-recorded cumulative distance (FIT files always have this)
def _distance(pts: list[DataPoint]) -> Optional[float]: device_dist = next(
"""Prefer device-recorded cumulative distance; fall back to GPS geodesic."""
# If the last point has a device distance, use it
last_dist = next(
(p.distance_m for p in reversed(pts) if p.distance_m is not None), None (p.distance_m for p in reversed(pts) if p.distance_m is not None), None
) )
if last_dist is not None:
return round(last_dist, 1)
# GPS fallback
total = 0.0
has_gps = False
for a, b in zip(pts, pts[1:]):
if a.lat is None or a.lon is None or b.lat is None or b.lon is None:
continue
has_gps = True
total += geodesic((a.lat, a.lon), (b.lat, b.lon)).meters
return round(total, 1) if has_gps else None
def _moving_stats(pts: list[DataPoint]) -> tuple[Optional[int], Optional[float]]:
"""Return (moving_time_s, avg_speed_kmh_over_moving_time)."""
moving_s = 0 moving_s = 0
moving_dist_m = 0.0 moving_dist_m = 0.0
has_gps = False total_dist_m = 0.0
max_seg_kmh = 0.0
has_data = False
# Device speed values (used for max if present)
device_max_kmh: Optional[float] = None
if any(p.speed_kmh is not None for p in pts):
device_max_kmh = max(p.speed_kmh for p in pts if p.speed_kmh is not None)
for a, b in zip(pts, pts[1:]): for a, b in zip(pts, pts[1:]):
dt = (b.timestamp - a.timestamp).total_seconds() dt = (b.timestamp - a.timestamp).total_seconds()
if dt <= 0: if dt <= 0:
continue continue
# Compute speed for this interval from GPS
if a.lat is not None and a.lon is not None and b.lat is not None and b.lon is not None: if a.lat is not None and a.lon is not None and b.lat is not None and b.lon is not None:
has_gps = True seg_m = _haversine_m(a.lat, a.lon, b.lat, b.lon)
seg_m = geodesic((a.lat, a.lon), (b.lat, b.lon)).meters
seg_kmh = (seg_m / dt) * 3.6 seg_kmh = (seg_m / dt) * 3.6
has_data = True
elif a.speed_kmh is not None: elif a.speed_kmh is not None:
seg_kmh = a.speed_kmh seg_kmh = a.speed_kmh
seg_m = (seg_kmh / 3.6) * dt seg_m = (seg_kmh / 3.6) * dt
has_gps = True # speed data present has_data = True
else: else:
continue continue
total_dist_m += seg_m
if seg_kmh > max_seg_kmh:
max_seg_kmh = seg_kmh
if seg_kmh >= _STOPPED_THRESHOLD_KMH: if seg_kmh >= _STOPPED_THRESHOLD_KMH:
moving_s += int(dt) moving_s += int(dt)
moving_dist_m += seg_m moving_dist_m += seg_m
if not has_gps or moving_s == 0: if not has_data:
return None, None return device_dist, None, None, None
avg_kmh = (moving_dist_m / moving_s) * 3.6 distance_m = device_dist if device_dist is not None else round(total_dist_m, 1)
return moving_s, avg_kmh moving_time_s = moving_s if moving_s > 0 else None
avg_speed_kmh = (moving_dist_m / moving_s) * 3.6 if moving_s > 0 else None
# Prefer device speed for max (more stable than GPS-derived per-second spikes)
max_speed_kmh = device_max_kmh if device_max_kmh is not None else (
max_seg_kmh if max_seg_kmh > 0 else None
)
return distance_m, moving_time_s, avg_speed_kmh, max_speed_kmh
# ── remaining helpers ──────────────────────────────────────────────────────────
def _duration(pts: list[DataPoint]) -> Optional[int]:
if len(pts) < 2:
return None
return int((pts[-1].timestamp - pts[0].timestamp).total_seconds())
def _elevation(pts: list[DataPoint]) -> tuple[Optional[float], Optional[float]]: def _elevation(pts: list[DataPoint]) -> tuple[Optional[float], Optional[float]]:
@@ -147,24 +163,6 @@ def _elevation(pts: list[DataPoint]) -> tuple[Optional[float], Optional[float]]:
return gain, loss return gain, loss
def _max_speed(pts: list[DataPoint]) -> Optional[float]:
# Prefer device speed; fall back to GPS-derived
device_speeds = [p.speed_kmh for p in pts if p.speed_kmh is not None]
if device_speeds:
return max(device_speeds)
# GPS-derived max
gps_speeds = []
for a, b in zip(pts, pts[1:]):
if a.lat is None or b.lat is None:
continue
dt = (b.timestamp - a.timestamp).total_seconds()
if dt <= 0:
continue
m = geodesic((a.lat, a.lon), (b.lat, b.lon)).meters
gps_speeds.append((m / dt) * 3.6)
return max(gps_speeds) if gps_speeds else None
def _hr_stats(pts: list[DataPoint]) -> tuple[Optional[int], Optional[int]]: def _hr_stats(pts: list[DataPoint]) -> tuple[Optional[int], Optional[int]]:
hrs = [p.hr_bpm for p in pts if p.hr_bpm is not None] hrs = [p.hr_bpm for p in pts if p.hr_bpm is not None]
if not hrs: if not hrs:
+9 -3
View File
@@ -8,18 +8,24 @@ from lxml import etree
from bincio.extract.models import DataPoint, ParsedActivity from bincio.extract.models import DataPoint, ParsedActivity
from bincio.extract.sport import normalise_sport from bincio.extract.sport import normalise_sport
_NS = { _NS_HTTP = {
"tcx": "http://www.garmin.com/xmlschemas/TrainingCenterDatabase/v2", "tcx": "http://www.garmin.com/xmlschemas/TrainingCenterDatabase/v2",
"ext": "http://www.garmin.com/xmlschemas/ActivityExtension/v2", "ext": "http://www.garmin.com/xmlschemas/ActivityExtension/v2",
} }
_NS_HTTPS = {
"tcx": "https://www.garmin.com/xmlschemas/TrainingCenterDatabase/v2",
"ext": "https://www.garmin.com/xmlschemas/ActivityExtension/v2",
}
class TcxParser: class TcxParser:
def parse(self, path: Path, raw_bytes: bytes) -> ParsedActivity: def parse(self, path: Path, raw_bytes: bytes) -> ParsedActivity:
# Some exporters (e.g. Garmin) prepend whitespace before the XML # Some exporters prepend whitespace before the XML declaration. Strip it.
# declaration, which is technically invalid. Strip it.
root = etree.fromstring(raw_bytes.lstrip()) root = etree.fromstring(raw_bytes.lstrip())
# Garmin sometimes uses https:// instead of http:// in the namespace URI.
_NS = _NS_HTTPS if b"https://www.garmin.com" in raw_bytes else _NS_HTTP
activities = root.findall(".//tcx:Activity", _NS) activities = root.findall(".//tcx:Activity", _NS)
if not activities: if not activities:
raise ValueError(f"No Activity elements found in {path.name}") raise ValueError(f"No Activity elements found in {path.name}")
+7 -5
View File
@@ -12,11 +12,13 @@ from bincio.extract.timeseries import build_timeseries
def make_activity_id(activity: ParsedActivity) -> str: def make_activity_id(activity: ParsedActivity) -> str:
"""Generate a BAS activity ID from started_at + optional title slug.""" """Generate a BAS activity ID from started_at + optional title slug.
ts = activity.started_at
# Compact ISO format: 2024-06-01T073012+0200 Always uses UTC with Z suffix so IDs are URL-safe (no + chars).
tz_str = ts.strftime("%z") # e.g. "+0200" or "" """
ts_part = ts.strftime("%Y-%m-%dT%H%M%S") + (tz_str or "Z") from datetime import timezone
ts = activity.started_at.astimezone(timezone.utc)
ts_part = ts.strftime("%Y-%m-%dT%H%M%SZ")
if activity.title: if activity.title:
slug = _slugify(activity.title) slug = _slugify(activity.title)
-1
View File
@@ -19,7 +19,6 @@ dependencies = [
# Data # Data
"pandas>=2.2", "pandas>=2.2",
# Geo # Geo
"geopy>=2.4",
"rdp>=0.8", "rdp>=0.8",
# Config & CLI # Config & CLI
"pyyaml>=6.0", "pyyaml>=6.0",