backend: initial commit
This commit is contained in:
@@ -0,0 +1,271 @@
|
||||
"""bincio extract — CLI command."""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from concurrent.futures import ProcessPoolExecutor, as_completed
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import click
|
||||
from rich.console import Console
|
||||
from rich.progress import BarColumn, MofNCompleteColumn, Progress, TextColumn, TimeElapsedColumn
|
||||
|
||||
from bincio.extract.config import ExtractConfig, default_config, load_config
|
||||
from bincio.extract.dedup import ActivityRecord, DedupIndex
|
||||
from bincio.extract.metrics import compute
|
||||
from bincio.extract.models import ParsedActivity
|
||||
from bincio.extract.parsers.factory import is_supported, parse_file
|
||||
from bincio.extract.strava_csv import StravaMetadata
|
||||
from bincio.extract.writer import build_summary, make_activity_id, write_activity, write_index
|
||||
|
||||
# Shared rich console used by the helpers in this module for status, warning
# and error output.
console = Console()
|
||||
|
||||
|
||||
@click.command()
@click.option("--config", "config_path", type=click.Path(exists=True), default=None,
              help="Path to extract_config.yaml (default: ./extract_config.yaml).")
@click.option("--input", "input_dir", type=click.Path(exists=True), default=None,
              help="Input directory (overrides config).")
@click.option("--output", "output_dir", type=click.Path(), default=None,
              help="Output directory (overrides config).")
@click.option("--file", "single_file", type=click.Path(exists=True), default=None,
              help="Process a single file and print JSON to stdout.")
@click.option("--since", default=None, metavar="YYYY-MM-DD",
              help="Only process files modified after this date.")
@click.option("--workers", default=4, show_default=True,
              help="Number of parallel worker processes.")
def extract(
    config_path: Optional[str],
    input_dir: Optional[str],
    output_dir: Optional[str],
    single_file: Optional[str],
    since: Optional[str],
    workers: int,
) -> None:
    """Parse GPX/FIT/TCX files and write BAS JSON data store."""
    # NOTE: the docstring above is shown by click as the command help text, so
    # implementation notes live in comments instead.
    #
    # Pipeline: resolve config -> collect files -> parse in a process pool ->
    # (per activity, in the parent process) skip exact duplicates, enrich from
    # the Strava CSV, compute metrics, resolve near-duplicates, write the
    # activity and register it -> merge summaries into index.json -> save the
    # dedup index -> print a report.

    # ── single file mode ─────────────────────────────────────────────────────
    if single_file:
        _process_single(Path(single_file))
        return

    # ── load config ──────────────────────────────────────────────────────────
    cfg = _resolve_config(config_path, input_dir, output_dir)
    cfg.output_dir.mkdir(parents=True, exist_ok=True)

    # ── gather files ─────────────────────────────────────────────────────────
    files = _collect_files(cfg, since)
    if not files:
        console.print("[yellow]No supported files found.[/yellow]")
        return
    console.print(f"Found [bold]{len(files)}[/bold] activity files.")

    # ── Strava metadata ──────────────────────────────────────────────────────
    strava_meta: Optional[StravaMetadata] = None
    if cfg.metadata_csv and cfg.metadata_csv.exists():
        strava_meta = StravaMetadata(cfg.metadata_csv)
        console.print(f"Loaded Strava metadata from [cyan]{cfg.metadata_csv.name}[/cyan].")

    # ── dedup index ──────────────────────────────────────────────────────────
    dedup = DedupIndex(output_dir=cfg.output_dir)

    # ── process ──────────────────────────────────────────────────────────────
    summaries: list[dict] = []
    errors: list[tuple[Path, str]] = []
    skipped = 0
    # IDs of formerly canonical activities that a better recording replaced in
    # this run; they must not remain listed in index.json.
    superseded: set[str] = set()

    owner = {"handle": cfg.owner_handle, "display_name": cfg.owner_display_name}

    with Progress(
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
        TimeElapsedColumn(),
        console=console,
    ) as progress:
        task = progress.add_task("Processing...", total=len(files))

        with ProcessPoolExecutor(max_workers=workers) as pool:
            futures = {pool.submit(_parse_worker, f): f for f in files}
            for future in as_completed(futures):
                path = futures[future]
                progress.advance(task)
                try:
                    activity = future.result()
                except Exception as exc:
                    # A file that fails to parse is reported at the end; it
                    # must not abort the whole batch.
                    errors.append((path, str(exc)))
                    continue

                # ── incremental skip ──────────────────────────────────────
                if cfg.incremental:
                    existing_id = dedup.is_exact_duplicate(activity.source_hash)
                    if existing_id:
                        skipped += 1
                        continue

                # ── enrich from Strava CSV ────────────────────────────────
                if strava_meta:
                    strava_meta.enrich(activity.source_file, activity)

                # ── compute metrics ───────────────────────────────────────
                metrics = compute(activity)

                # ── deduplication ─────────────────────────────────────────
                activity_id = make_activity_id(activity)
                source = _infer_source(activity)
                duplicate_of: Optional[str] = None

                near_dup_id = dedup.find_near_duplicate(
                    activity.started_at, metrics.distance_m
                )
                # When a file is re-processed (incremental mode off) the index
                # already holds a record for this very activity. Matching
                # against itself would flag it as its own duplicate and drop
                # it from the index, so a self-match is ignored.
                if near_dup_id and near_dup_id != activity_id:
                    canonical = dedup.pick_canonical(near_dup_id, source)
                    if canonical == "__new__":
                        # New one is better — mark existing as duplicate
                        existing = dedup._records[near_dup_id]
                        existing.duplicate_of = activity_id
                        superseded.add(near_dup_id)
                    else:
                        duplicate_of = near_dup_id

                # ── write files ───────────────────────────────────────────
                written_id = write_activity(
                    activity, metrics, cfg.output_dir,
                    privacy=cfg.default_privacy,
                    duplicate_of=duplicate_of,
                    rdp_epsilon=cfg.track.rdp_epsilon,
                )

                # Register in dedup index
                dedup.register(ActivityRecord(
                    id=written_id,
                    source_hash=activity.source_hash,
                    started_at=activity.started_at,
                    distance_m=metrics.distance_m,
                    source=source,
                ))

                # Only canonical activities are listed in the index.
                if duplicate_of is None:
                    summaries.append(
                        build_summary(activity, metrics, written_id, cfg.default_privacy)
                    )

    # ── write index.json ──────────────────────────────────────────────────────
    # Merge with any existing summaries from previous incremental runs
    existing_index = _load_existing_summaries(cfg.output_dir)
    all_summaries = {s["id"]: s for s in existing_index}
    for s in summaries:
        all_summaries[s["id"]] = s
    # Drop activities that were demoted to duplicates during this run, whether
    # their summary came from an earlier run or from earlier in this one.
    for stale_id in superseded:
        all_summaries.pop(stale_id, None)
    write_index(list(all_summaries.values()), cfg.output_dir, owner)
    dedup.save()

    # ── summary ───────────────────────────────────────────────────────────────
    console.print(
        f"\n[green]Done.[/green] "
        f"Processed [bold]{len(summaries)}[/bold] activities, "
        f"skipped [bold]{skipped}[/bold] (already up to date), "
        f"errors [bold]{len(errors)}[/bold]."
    )
    if errors:
        console.print("\n[red]Errors:[/red]")
        # Cap the listing so a badly broken input directory stays readable.
        for path, msg in errors[:20]:
            console.print(f"  {path.name}: {msg}")
        if len(errors) > 20:
            console.print(f"  ... and {len(errors) - 20} more.")
|
||||
|
||||
|
||||
# ── helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
def _parse_worker(path: Path) -> ParsedActivity:
    """Parse one activity file; submitted to the process pool by ``extract``.

    The parser factory is imported at call time, inside the function body.
    """
    import bincio.extract.parsers.factory as parser_factory

    return parser_factory.parse_file(path)
|
||||
|
||||
|
||||
def _process_single(path: Path) -> None:
    """Parse a single file and echo its summary as indented JSON.

    Nothing is written to the data store. Any failure is printed through the
    shared console and the process exits with status 1.
    """
    try:
        parsed = parse_file(path)
        summary = build_summary(parsed, compute(parsed), make_activity_id(parsed))
        click.echo(json.dumps(summary, indent=2))
    except Exception as exc:
        console.print(f"[red]Error:[/red] {exc}")
        sys.exit(1)
|
||||
|
||||
|
||||
def _resolve_config(
    config_path: Optional[str],
    input_dir: Optional[str],
    output_dir: Optional[str],
) -> ExtractConfig:
    """Build the effective configuration from CLI options and config files.

    Precedence for the base configuration: an explicit ``--config`` file, then
    ``extract_config.yaml`` in the current directory, then built-in defaults
    around ``--input``. Whatever the base, ``--input`` and ``--output``
    override the corresponding settings afterwards.

    Raises:
        click.UsageError: if no configuration source is available.
    """
    local_yaml = Path("extract_config.yaml")

    if config_path:
        cfg = load_config(Path(config_path))
    elif local_yaml.exists():
        cfg = load_config(local_yaml)
    elif not input_dir:
        raise click.UsageError(
            "Provide --config, --input, or an extract_config.yaml in the current directory."
        )
    else:
        source_dir = Path(input_dir).expanduser()
        target_dir = Path(output_dir or "./bincio_data").expanduser()
        cfg = default_config(source_dir, target_dir)

    # Command-line paths always win over whatever the config file said.
    if input_dir:
        cfg.input_dirs = [Path(input_dir).expanduser()]
    if output_dir:
        cfg.output_dir = Path(output_dir).expanduser()
    return cfg
|
||||
|
||||
|
||||
def _collect_files(cfg: ExtractConfig, since: Optional[str]) -> list[Path]:
    """Return every supported activity file found under the configured input dirs.

    Args:
        cfg: Configuration whose ``input_dirs`` are searched recursively.
        since: Optional ``YYYY-MM-DD`` date. Files whose modification time is
            earlier than midnight of that day are skipped. The date is parsed
            as a naive datetime, so it is interpreted in local time.

    Returns:
        Paths of the matching files, in directory-walk order.

    Raises:
        click.BadParameter: if ``since`` is not a valid ``YYYY-MM-DD`` date.
    """
    from datetime import datetime

    since_ts: Optional[float] = None
    if since:
        try:
            since_ts = datetime.strptime(since, "%Y-%m-%d").timestamp()
        except ValueError as exc:
            # Surface a usage error instead of a raw traceback.
            raise click.BadParameter(
                f"expected YYYY-MM-DD, got {since!r}", param_hint="--since"
            ) from exc

    files: list[Path] = []
    for d in cfg.input_dirs:
        if not d.exists():
            console.print(f"[yellow]Warning:[/yellow] input dir not found: {d}")
            continue
        for path in d.rglob("*"):
            if not path.is_file() or not is_supported(path):
                continue
            # Compare against None explicitly: a cutoff timestamp of 0.0 is a
            # valid value and must not disable the filter.
            if since_ts is not None and path.stat().st_mtime < since_ts:
                continue
            files.append(path)
    return files
|
||||
|
||||
|
||||
def _load_existing_summaries(output_dir: Path) -> list[dict]:
|
||||
index_path = output_dir / "index.json"
|
||||
if not index_path.exists():
|
||||
return []
|
||||
try:
|
||||
data = json.loads(index_path.read_text())
|
||||
return data.get("activities", [])
|
||||
except Exception:
|
||||
return []
|
||||
|
||||
|
||||
def _infer_source(activity: ParsedActivity) -> Optional[str]:
|
||||
if activity.strava_id:
|
||||
return "strava_export"
|
||||
name = activity.source_file.lower()
|
||||
if "activity" in name and len(name.split(".")) >= 3:
|
||||
return "karoo"
|
||||
if name.endswith((".fit", ".fit.gz")):
|
||||
return "fit_file"
|
||||
if name.endswith((".gpx", ".gpx.gz")):
|
||||
return "gpx_file"
|
||||
if name.endswith((".tcx", ".tcx.gz")):
|
||||
return "tcx_file"
|
||||
return None
|
||||
@@ -0,0 +1,88 @@
|
||||
"""Extract stage configuration — loaded from extract_config.yaml."""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import yaml
|
||||
|
||||
|
||||
@dataclass
class TrackConfig:
    """Track settings, populated from the ``track`` section of the config file."""

    # Name of the simplification method.
    # NOTE(review): "rdp" presumably means Ramer-Douglas-Peucker — confirm in the writer.
    simplify: str = "rdp"
    # Passed to write_activity() as ``rdp_epsilon``; its unit is not visible
    # in this module (TODO confirm).
    rdp_epsilon: float = 0.0001
    # NOTE(review): presumably the sampling rate of the exported time series — confirm.
    timeseries_hz: int = 1
|
||||
|
||||
|
||||
@dataclass
class SensorsConfig:
    """Per-sensor switches, populated from the ``sensors`` section of the config file.

    All switches default to True.
    NOTE(review): how they are consumed is not visible here — presumably they
    control which channels are exported; confirm against the writer.
    """

    heart_rate: bool = True
    cadence: bool = True
    temperature: bool = True
    power: bool = True
|
||||
|
||||
|
||||
@dataclass
class ClassifierConfig:
    """Classifier switch, populated from the ``classifier`` section of the config file."""

    enabled: bool = False  # off by default; opt-in
|
||||
|
||||
|
||||
@dataclass
class ExtractConfig:
    """Complete configuration of the extract stage."""

    # Directories searched recursively for activity files.
    input_dirs: list[Path]
    # Data store directory; the CLI creates it if it does not exist.
    output_dir: Path
    # Optional Strava metadata CSV; the CLI loads it only when the file exists.
    metadata_csv: Optional[Path] = None
    # Privacy value handed to the writer for each activity and its summary.
    default_privacy: str = "public"
    sensors: SensorsConfig = field(default_factory=SensorsConfig)
    track: TrackConfig = field(default_factory=TrackConfig)
    classifier: ClassifierConfig = field(default_factory=ClassifierConfig)
    # When True, files whose source hash is already in the dedup index are skipped.
    incremental: bool = True
    # Owner identity written into the index.
    owner_handle: str = "me"
    owner_display_name: str = "Me"
|
||||
|
||||
|
||||
def load_config(path: Path) -> ExtractConfig:
    """Load an :class:`ExtractConfig` from a YAML file.

    Every section and key is optional; missing values fall back to the
    dataclass defaults. An empty file, or a section written with no content
    (for example a bare ``input:``), is treated the same as an absent one.

    Args:
        path: Location of the YAML configuration file.

    Returns:
        The populated configuration. ``~`` in paths is expanded.

    Raises:
        ValueError: if the document, or one of its sections, is not a mapping.
    """
    # safe_load returns None for an empty document.
    raw = yaml.safe_load(path.read_text()) or {}
    if not isinstance(raw, dict):
        raise ValueError(f"{path}: expected a mapping at the top level of the config")

    def section(name: str) -> dict:
        """Return the named sub-mapping, treating a missing/empty one as {}."""
        value = raw.get(name)
        if value is None:
            return {}
        if not isinstance(value, dict):
            raise ValueError(f"{path}: section {name!r} must be a mapping")
        return value

    inp = section("input")
    dirs = [Path(d).expanduser() for d in inp.get("dirs") or []]
    csv_path = inp.get("metadata_csv")

    out = Path(section("output").get("dir", "./bincio_data")).expanduser()

    owner = section("owner")

    sensors_raw = section("sensors")
    sensors = SensorsConfig(
        heart_rate=sensors_raw.get("heart_rate", True),
        cadence=sensors_raw.get("cadence", True),
        temperature=sensors_raw.get("temperature", True),
        power=sensors_raw.get("power", True),
    )

    track_raw = section("track")
    track = TrackConfig(
        simplify=track_raw.get("simplify", "rdp"),
        rdp_epsilon=track_raw.get("rdp_epsilon", 0.0001),
        timeseries_hz=track_raw.get("timeseries_hz", 1),
    )

    classifier = ClassifierConfig(enabled=section("classifier").get("enabled", False))

    return ExtractConfig(
        input_dirs=dirs,
        output_dir=out,
        metadata_csv=Path(csv_path).expanduser() if csv_path else None,
        default_privacy=raw.get("default_privacy", "public"),
        sensors=sensors,
        track=track,
        classifier=classifier,
        incremental=raw.get("incremental", True),
        owner_handle=owner.get("handle", "me"),
        owner_display_name=owner.get("display_name", "Me"),
    )
|
||||
|
||||
|
||||
def default_config(input_dir: Path, output_dir: Path) -> ExtractConfig:
    """Return a configuration with one input dir, the given output dir and all other defaults."""
    return ExtractConfig(input_dirs=[input_dir], output_dir=output_dir)
|
||||
@@ -0,0 +1,127 @@
|
||||
"""Duplicate activity detection.
|
||||
|
||||
Two kinds of duplicates:
|
||||
|
||||
1. Exact duplicate — same source_hash. Skip entirely.
|
||||
2. Near-duplicate — same ride recorded by two devices / exported from two
|
||||
platforms. Detected by (started_at ± 5 min) AND (distance ± 5%).
|
||||
The "better" source wins; the other gets duplicate_of set.
|
||||
|
||||
The deduplication index is a JSON file persisted in the output directory so
|
||||
that incremental runs don't re-evaluate already-resolved pairs.
|
||||
"""
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
# File name of the persisted dedup index, stored inside the output directory.
_INDEX_FILE = ".bincio_cache.json"

# Source quality ranking (higher = preferred when deduplicating).
# Consulted by DedupIndex.pick_canonical(); a source that is missing from the
# table, or unknown (None), ranks as 0.
_SOURCE_QUALITY: dict[str, int] = {
    "karoo": 5,
    "fit_file": 4,
    "garmin_connect": 4,
    "strava_export": 3,
    "gpx_file": 2,
    "tcx_file": 1,
    "wahoo": 3,
    "komoot": 2,
    "manual": 0,
}
|
||||
|
||||
|
||||
@dataclass
class ActivityRecord:
    """Minimal record stored in the dedup index."""

    # Activity ID; key of the record inside the index.
    id: str
    # Hash of the source file, used for exact-duplicate lookup.
    source_hash: str
    # Start time; persisted as an ISO-8601 string.
    started_at: datetime
    # Total distance in metres, or None when unknown (such records never
    # match as near-duplicates).
    distance_m: Optional[float]
    # Source label looked up in _SOURCE_QUALITY; None when unknown.
    source: Optional[str]
    # ID of the canonical activity when this record is a duplicate, else None.
    duplicate_of: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
class DedupIndex:
    """In-memory dedup index backed by a JSON file in ``output_dir``.

    The index is loaded from disk on construction and only written back when
    :meth:`save` is called.
    """

    output_dir: Path
    # activity id → record
    _records: dict[str, ActivityRecord] = field(default_factory=dict)
    # source_hash → id, for exact-duplicate lookup
    _by_hash: dict[str, str] = field(default_factory=dict)

    def __post_init__(self) -> None:
        self._load()

    def _load(self) -> None:
        """Populate the index from the cache file, if one exists.

        A cache file that is not valid JSON, or lacks required keys, raises.
        """
        p = self.output_dir / _INDEX_FILE
        if not p.exists():
            return
        data = json.loads(p.read_text())
        for item in data.get("activities", []):
            started_at = datetime.fromisoformat(item["started_at"])
            r = ActivityRecord(
                id=item["id"],
                source_hash=item["source_hash"],
                started_at=started_at,
                distance_m=item.get("distance_m"),
                source=item.get("source"),
                duplicate_of=item.get("duplicate_of"),
            )
            self._records[r.id] = r
            self._by_hash[r.source_hash] = r.id

    def save(self) -> None:
        """Write every record to the cache file as indented JSON."""
        p = self.output_dir / _INDEX_FILE
        data = {
            "activities": [
                {
                    "id": r.id,
                    "source_hash": r.source_hash,
                    "started_at": r.started_at.isoformat(),
                    "distance_m": r.distance_m,
                    "source": r.source,
                    "duplicate_of": r.duplicate_of,
                }
                for r in self._records.values()
            ]
        }
        p.write_text(json.dumps(data, indent=2))

    def is_exact_duplicate(self, source_hash: str) -> Optional[str]:
        """Return existing activity ID if hash is already in the index."""
        return self._by_hash.get(source_hash)

    def find_near_duplicate(
        self,
        started_at: datetime,
        distance_m: Optional[float],
        *,
        max_start_delta_s: float = 5 * 60,
        distance_tolerance: float = 0.05,
    ) -> Optional[str]:
        """Return ID of a near-duplicate if one exists.

        A record matches when it is not itself marked as a duplicate, starts
        within ``max_start_delta_s`` seconds of ``started_at`` and its distance
        differs by less than ``distance_tolerance`` (a fraction of the larger
        of the two distances). Two zero-length activities count as equal in
        distance.

        Args:
            started_at: Start time of the candidate activity.
            distance_m: Its distance in metres; ``None`` never matches.
            max_start_delta_s: Largest allowed start-time difference in seconds.
            distance_tolerance: Largest allowed relative distance difference.

        Returns:
            The ID of the first matching record, or ``None``.
        """
        if distance_m is None:
            return None  # distance is part of the match criterion
        for r in self._records.values():
            if r.duplicate_of is not None:
                continue  # skip already-marked duplicates
            if r.distance_m is None:
                continue
            if abs((r.started_at - started_at).total_seconds()) > max_start_delta_s:
                continue
            diff = abs(distance_m - r.distance_m)
            ref = max(distance_m, r.distance_m)
            # ``diff == 0`` covers two zero-distance activities, for which the
            # relative difference would divide by zero.
            if diff == 0 or (ref > 0 and diff / ref < distance_tolerance):
                return r.id
        return None

    def register(self, record: ActivityRecord) -> None:
        """Add ``record`` to the index, replacing any record with the same ID."""
        self._records[record.id] = record
        self._by_hash[record.source_hash] = record.id

    def pick_canonical(self, existing_id: str, new_source: Optional[str]) -> str:
        """Return the ID of whichever record should be canonical.

        Returns the sentinel ``"__new__"`` when the incoming source ranks
        strictly higher than the existing record's source; on a tie the
        existing record stays canonical.

        Raises:
            KeyError: if ``existing_id`` is not in the index.
        """
        existing = self._records[existing_id]
        existing_q = _SOURCE_QUALITY.get(existing.source or "", 0)
        new_q = _SOURCE_QUALITY.get(new_source or "", 0)
        # New record is strictly better → existing becomes the duplicate
        if new_q > existing_q:
            return "__new__"
        return existing_id
|
||||
@@ -0,0 +1,210 @@
|
||||
"""Compute aggregated metrics from a ParsedActivity.
|
||||
|
||||
All calculations are self-contained — no external state needed.
|
||||
"""
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
from geopy.distance import geodesic
|
||||
|
||||
from bincio.extract.models import DataPoint, ParsedActivity
|
||||
|
||||
# Speed below which we consider the athlete stopped (km/h).
# Intervals slower than this do not count towards moving time or moving distance.
_STOPPED_THRESHOLD_KMH = 1.0
|
||||
|
||||
|
||||
@dataclass
class ComputedMetrics:
    """Aggregated metrics produced by compute(); any field may be None when unavailable."""

    # Device-recorded distance when present, otherwise summed from GPS; rounded to 0.1 m.
    distance_m: Optional[float]
    # Seconds between the first and last sample.
    duration_s: Optional[int]
    # Seconds spent at or above the stopped threshold.
    moving_time_s: Optional[int]
    # Both elevation totals are stored as non-negative values rounded to 0.1 m.
    elevation_gain_m: Optional[float]
    elevation_loss_m: Optional[float]
    # Average speed over moving time only.
    avg_speed_kmh: Optional[float]
    max_speed_kmh: Optional[float]
    avg_hr_bpm: Optional[int]
    max_hr_bpm: Optional[int]
    avg_cadence_rpm: Optional[int]
    avg_power_w: Optional[int]
    max_power_w: Optional[int]
    bbox: Optional[tuple[float, float, float, float]]  # min_lon, min_lat, max_lon, max_lat
    # (lat, lon) of the first and last samples that have a position.
    start_latlng: Optional[tuple[float, float]]
    end_latlng: Optional[tuple[float, float]]
|
||||
|
||||
|
||||
def compute(activity: ParsedActivity) -> ComputedMetrics:
    """Derive the aggregated metrics for one parsed activity.

    An activity without samples yields a record with every field set to None.
    Elevation totals are rounded to 0.1 m (loss as a positive number) and
    speeds to 0.01 km/h; a speed of zero is reported as None.
    """
    samples = activity.points
    if not samples:
        return _empty()

    elapsed_s = _duration(samples)
    total_m = _distance(samples)
    moving_s, moving_kmh = _moving_stats(samples)
    climbed, dropped = _elevation(samples)
    top_kmh = _max_speed(samples)
    hr_mean, hr_peak = _hr_stats(samples)
    cadence_mean = _avg_nonnull([s.cadence_rpm for s in samples])
    power_mean = _avg_nonnull([s.power_w for s in samples])
    power_peak = _max_nonnull([s.power_w for s in samples])
    bounds = _bbox(samples)
    first_fix, last_fix = _endpoints(samples)

    gain_out = None if climbed is None else round(climbed, 1)
    loss_out = None if dropped is None else round(abs(dropped), 1)
    avg_speed_out = round(moving_kmh, 2) if moving_kmh else None
    max_speed_out = round(top_kmh, 2) if top_kmh else None

    return ComputedMetrics(
        distance_m=total_m,
        duration_s=elapsed_s,
        moving_time_s=moving_s,
        elevation_gain_m=gain_out,
        elevation_loss_m=loss_out,
        avg_speed_kmh=avg_speed_out,
        max_speed_kmh=max_speed_out,
        avg_hr_bpm=hr_mean,
        max_hr_bpm=hr_peak,
        avg_cadence_rpm=cadence_mean,
        avg_power_w=power_mean,
        max_power_w=power_peak,
        bbox=bounds,
        start_latlng=first_fix,
        end_latlng=last_fix,
    )
|
||||
|
||||
|
||||
# ── helpers ──────────────────────────────────────────────────────────────────
|
||||
|
||||
def _duration(pts: list[DataPoint]) -> Optional[int]:
|
||||
if len(pts) < 2:
|
||||
return None
|
||||
return int((pts[-1].timestamp - pts[0].timestamp).total_seconds())
|
||||
|
||||
|
||||
def _distance(pts: list[DataPoint]) -> Optional[float]:
|
||||
"""Prefer device-recorded cumulative distance; fall back to GPS geodesic."""
|
||||
# If the last point has a device distance, use it
|
||||
last_dist = next(
|
||||
(p.distance_m for p in reversed(pts) if p.distance_m is not None), None
|
||||
)
|
||||
if last_dist is not None:
|
||||
return round(last_dist, 1)
|
||||
|
||||
# GPS fallback
|
||||
total = 0.0
|
||||
has_gps = False
|
||||
for a, b in zip(pts, pts[1:]):
|
||||
if a.lat is None or a.lon is None or b.lat is None or b.lon is None:
|
||||
continue
|
||||
has_gps = True
|
||||
total += geodesic((a.lat, a.lon), (b.lat, b.lon)).meters
|
||||
return round(total, 1) if has_gps else None
|
||||
|
||||
|
||||
def _moving_stats(pts: list[DataPoint]) -> tuple[Optional[int], Optional[float]]:
|
||||
"""Return (moving_time_s, avg_speed_kmh_over_moving_time)."""
|
||||
moving_s = 0
|
||||
moving_dist_m = 0.0
|
||||
has_gps = False
|
||||
|
||||
for a, b in zip(pts, pts[1:]):
|
||||
dt = (b.timestamp - a.timestamp).total_seconds()
|
||||
if dt <= 0:
|
||||
continue
|
||||
|
||||
# Compute speed for this interval from GPS
|
||||
if a.lat is not None and a.lon is not None and b.lat is not None and b.lon is not None:
|
||||
has_gps = True
|
||||
seg_m = geodesic((a.lat, a.lon), (b.lat, b.lon)).meters
|
||||
seg_kmh = (seg_m / dt) * 3.6
|
||||
elif a.speed_kmh is not None:
|
||||
seg_kmh = a.speed_kmh
|
||||
seg_m = (seg_kmh / 3.6) * dt
|
||||
has_gps = True # speed data present
|
||||
else:
|
||||
continue
|
||||
|
||||
if seg_kmh >= _STOPPED_THRESHOLD_KMH:
|
||||
moving_s += int(dt)
|
||||
moving_dist_m += seg_m
|
||||
|
||||
if not has_gps or moving_s == 0:
|
||||
return None, None
|
||||
|
||||
avg_kmh = (moving_dist_m / moving_s) * 3.6
|
||||
return moving_s, avg_kmh
|
||||
|
||||
|
||||
def _elevation(pts: list[DataPoint]) -> tuple[Optional[float], Optional[float]]:
|
||||
elevations = [p.elevation_m for p in pts if p.elevation_m is not None]
|
||||
if len(elevations) < 2:
|
||||
return None, None
|
||||
gain = loss = 0.0
|
||||
for a, b in zip(elevations, elevations[1:]):
|
||||
diff = b - a
|
||||
if diff > 0:
|
||||
gain += diff
|
||||
else:
|
||||
loss += diff
|
||||
return gain, loss
|
||||
|
||||
|
||||
def _max_speed(pts: list[DataPoint]) -> Optional[float]:
|
||||
# Prefer device speed; fall back to GPS-derived
|
||||
device_speeds = [p.speed_kmh for p in pts if p.speed_kmh is not None]
|
||||
if device_speeds:
|
||||
return max(device_speeds)
|
||||
# GPS-derived max
|
||||
gps_speeds = []
|
||||
for a, b in zip(pts, pts[1:]):
|
||||
if a.lat is None or b.lat is None:
|
||||
continue
|
||||
dt = (b.timestamp - a.timestamp).total_seconds()
|
||||
if dt <= 0:
|
||||
continue
|
||||
m = geodesic((a.lat, a.lon), (b.lat, b.lon)).meters
|
||||
gps_speeds.append((m / dt) * 3.6)
|
||||
return max(gps_speeds) if gps_speeds else None
|
||||
|
||||
|
||||
def _hr_stats(pts: list[DataPoint]) -> tuple[Optional[int], Optional[int]]:
|
||||
hrs = [p.hr_bpm for p in pts if p.hr_bpm is not None]
|
||||
if not hrs:
|
||||
return None, None
|
||||
return int(sum(hrs) / len(hrs)), max(hrs)
|
||||
|
||||
|
||||
def _avg_nonnull(values: list) -> Optional[int]:
|
||||
v = [x for x in values if x is not None]
|
||||
return int(sum(v) / len(v)) if v else None
|
||||
|
||||
|
||||
def _max_nonnull(values: list) -> Optional[int]:
|
||||
v = [x for x in values if x is not None]
|
||||
return max(v) if v else None
|
||||
|
||||
|
||||
def _bbox(pts: list[DataPoint]) -> Optional[tuple[float, float, float, float]]:
|
||||
lats = [p.lat for p in pts if p.lat is not None]
|
||||
lons = [p.lon for p in pts if p.lon is not None]
|
||||
if not lats:
|
||||
return None
|
||||
return (min(lons), min(lats), max(lons), max(lats))
|
||||
|
||||
|
||||
def _endpoints(
|
||||
pts: list[DataPoint],
|
||||
) -> tuple[Optional[tuple[float, float]], Optional[tuple[float, float]]]:
|
||||
gps = [(p.lat, p.lon) for p in pts if p.lat is not None and p.lon is not None]
|
||||
if not gps:
|
||||
return None, None
|
||||
return gps[0], gps[-1]
|
||||
|
||||
|
||||
def _empty() -> ComputedMetrics:
    """Return a metrics record in which every field is None."""
    unset_fields = (
        "distance_m",
        "duration_s",
        "moving_time_s",
        "elevation_gain_m",
        "elevation_loss_m",
        "avg_speed_kmh",
        "max_speed_kmh",
        "avg_hr_bpm",
        "max_hr_bpm",
        "avg_cadence_rpm",
        "avg_power_w",
        "max_power_w",
        "bbox",
        "start_latlng",
        "end_latlng",
    )
    # dict.fromkeys maps every name to None.
    return ComputedMetrics(**dict.fromkeys(unset_fields))
|
||||
@@ -0,0 +1,58 @@
|
||||
"""Core data models for the extract stage.
|
||||
|
||||
ParsedActivity is the internal representation produced by parsers.
|
||||
It gets fed into metrics computation and the BAS JSON writer.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
|
||||
@dataclass
class DataPoint:
    """One measurement sample from a GPS/sensor recording."""

    # Sample time; the FIT and GPX parsers attach UTC when the source value is naive.
    timestamp: datetime
    # Position in decimal degrees; either coordinate may be missing.
    lat: Optional[float] = None
    lon: Optional[float] = None
    elevation_m: Optional[float] = None
    hr_bpm: Optional[int] = None
    cadence_rpm: Optional[int] = None
    # Speed from device (km/h). May be absent; we compute it from GPS if so.
    speed_kmh: Optional[float] = None
    power_w: Optional[int] = None
    temperature_c: Optional[float] = None
    # Cumulative distance from device (metres), if recorded.
    distance_m: Optional[float] = None
|
||||
|
||||
|
||||
@dataclass
class LapData:
    """Summary of one lap as reported by the source file."""

    # Zero-based position of the lap within the activity.
    index: int
    started_at: datetime
    duration_s: Optional[int] = None
    distance_m: Optional[float] = None
    elevation_gain_m: Optional[float] = None
    avg_speed_kmh: Optional[float] = None
    avg_hr_bpm: Optional[int] = None
    avg_power_w: Optional[int] = None
|
||||
|
||||
|
||||
@dataclass
class ParsedActivity:
    """Raw activity data as produced by a parser, before metric computation."""

    # Samples in recording order.
    points: list[DataPoint]
    sport: str  # normalised to BAS sport enum
    started_at: datetime
    # Both source fields are (re)assigned by the parser factory after parsing.
    source_file: str  # basename of original file
    source_hash: str  # "sha256:{hex}"

    sub_sport: Optional[str] = None
    device: Optional[str] = None
    title: Optional[str] = None
    description: Optional[str] = None
    gear: Optional[str] = None
    # When set, the CLI labels the activity's source as a Strava export.
    strava_id: Optional[str] = None
    laps: list[LapData] = field(default_factory=list)
|
||||
@@ -0,0 +1,34 @@
|
||||
"""Abstract base class for all activity parsers."""
|
||||
|
||||
import gzip
|
||||
import hashlib
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
|
||||
from bincio.extract.models import ParsedActivity
|
||||
|
||||
|
||||
class BaseParser(ABC):
    """Common interface and file helpers shared by the activity parsers."""

    @abstractmethod
    def parse(self, path: Path, raw_bytes: bytes) -> ParsedActivity:
        """Parse activity from raw file bytes.

        Receives pre-read bytes so the factory can compute the hash once and
        handle decompression transparently before dispatching. The factory
        passes the already-decompressed content here.
        """

    @staticmethod
    def _sha256(data: bytes) -> str:
        """Return the SHA-256 digest of ``data`` as ``"sha256:<hex>"``."""
        return "sha256:" + hashlib.sha256(data).hexdigest()

    @staticmethod
    def _read_file(path: Path) -> tuple[bytes, bytes]:
        """Return (raw_bytes, decompressed_bytes).

        raw_bytes is the original file content (used for hashing).
        decompressed_bytes is what parsers should actually parse.

        A ``.gz`` suffix is recognised regardless of case, so ``RIDE.FIT.GZ``
        is decompressed just like ``ride.fit.gz``.
        """
        raw = path.read_bytes()
        if path.suffix.lower() == ".gz":
            return raw, gzip.decompress(raw)
        return raw, raw
|
||||
@@ -0,0 +1,46 @@
|
||||
"""Parser factory — selects the right parser based on file extension."""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from bincio.extract.models import ParsedActivity
|
||||
from bincio.extract.parsers.base import BaseParser
|
||||
from bincio.extract.parsers.fit import FitParser
|
||||
from bincio.extract.parsers.gpx import GpxParser
|
||||
from bincio.extract.parsers.tcx import TcxParser
|
||||
|
||||
# Supported extensions (including .gz variants); consulted by is_supported().
SUPPORTED = {".fit", ".gpx", ".tcx", ".fit.gz", ".gpx.gz", ".tcx.gz"}

# Parser class for each base extension, i.e. the extension that remains after
# a trailing ".gz" has been stripped.
_PARSERS: dict[str, type[BaseParser]] = {
    ".fit": FitParser,
    ".gpx": GpxParser,
    ".tcx": TcxParser,
}
|
||||
|
||||
|
||||
def _base_ext(path: Path) -> str:
|
||||
"""Return the meaningful extension, stripping .gz if present."""
|
||||
if path.suffix == ".gz":
|
||||
return Path(path.stem).suffix # e.g. ".fit" from "ride.fit.gz"
|
||||
return path.suffix
|
||||
|
||||
|
||||
def is_supported(path: Path) -> bool:
    """Return True when the file's extension is one of ``SUPPORTED``.

    For gzip files the last two suffixes are considered together (".fit.gz").
    The comparison ignores case, so upper-case extensions are accepted too.
    """
    if path.suffix.lower() == ".gz":
        suffix = "".join(path.suffixes[-2:])
    else:
        suffix = path.suffix
    return suffix.lower() in SUPPORTED
|
||||
|
||||
|
||||
def parse_file(path: Path) -> ParsedActivity:
    """Parse an activity file, handling .gz transparently.

    The parser is chosen from the base extension. After parsing, the
    activity's ``source_hash`` and ``source_file`` are set from the file
    itself.

    Raises:
        ValueError: if no parser is registered for the file's extension.
    """
    parser_cls = _PARSERS.get(_base_ext(path))
    if parser_cls is None:
        raise ValueError(f"Unsupported file type: {path.name!r}")

    original_bytes, payload = BaseParser._read_file(path)
    activity = parser_cls().parse(path, payload)

    # Attach hash of the *original* bytes (compressed if .gz) for dedup
    activity.source_hash = BaseParser._sha256(original_bytes)
    activity.source_file = path.name
    return activity
|
||||
@@ -0,0 +1,133 @@
|
||||
"""FIT file parser (Garmin binary format)."""
|
||||
|
||||
from datetime import timezone
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import fitdecode
|
||||
|
||||
from bincio.extract.models import DataPoint, LapData, ParsedActivity
|
||||
from bincio.extract.sport import normalise_sport
|
||||
|
||||
|
||||
class FitParser:
    """Parser for FIT files, built on ``fitdecode``."""

    def parse(self, path: Path, raw_bytes: bytes) -> ParsedActivity:
        """Decode FIT content into a ParsedActivity.

        Data messages named "sport", "device_info", "record" and "lap" are
        consumed; everything else is ignored. The sport defaults to "cycling"
        when the file carries no sport message.

        Raises:
            ValueError: if the file contains no usable record messages.
        """
        import io

        points: list[DataPoint] = []
        laps: list[LapData] = []
        sport: str = "cycling"
        sub_sport: str | None = None
        device: str | None = None

        with fitdecode.FitReader(io.BytesIO(raw_bytes)) as fit:
            for frame in fit:
                if not isinstance(frame, fitdecode.FitDataMessage):
                    continue

                kind = frame.name
                if kind == "sport":
                    sport = normalise_sport(_get(frame, "sport", "cycling"))
                    sub_sport = _normalise_sub_sport(_get(frame, "sub_sport"))
                elif kind == "device_info":
                    label = self._device_label(frame)
                    if label is not None:
                        # Later device_info messages replace earlier ones.
                        device = label
                elif kind == "record":
                    point = self._point_from(frame)
                    if point is not None:
                        points.append(point)
                elif kind == "lap":
                    lap = self._lap_from(frame, len(laps))
                    if lap is not None:
                        laps.append(lap)

        if not points:
            raise ValueError(f"No record messages found in {path.name}")

        return ParsedActivity(
            points=points,
            sport=sport,
            sub_sport=sub_sport,
            started_at=points[0].timestamp,
            device=device,
            laps=laps,
            source_file=path.name,
            source_hash="",
        )

    @staticmethod
    def _as_utc(ts: Any) -> Any:
        """Attach UTC to a naive datetime; return anything else unchanged."""
        if hasattr(ts, "tzinfo") and ts.tzinfo is None:
            return ts.replace(tzinfo=timezone.utc)
        return ts

    @staticmethod
    def _device_label(frame: Any) -> str | None:
        """Return a device name from a device_info message, or None without a product."""
        mfr = _get(frame, "manufacturer")
        prod = _get(frame, "product_name") or _get(frame, "garmin_product")
        if mfr and prod:
            return f"{mfr} {prod}"
        if prod:
            return str(prod)
        return None

    @classmethod
    def _point_from(cls, frame: Any) -> DataPoint | None:
        """Build a DataPoint from a record message; None when it has no timestamp."""
        ts = _get(frame, "timestamp")
        if ts is None:
            return None
        ts = cls._as_utc(ts)

        lat = _semicircles_to_deg(_get(frame, "position_lat"))
        lon = _semicircles_to_deg(_get(frame, "position_long"))
        speed_raw = _get(frame, "speed")  # m/s

        return DataPoint(
            timestamp=ts,
            lat=lat,
            lon=lon,
            elevation_m=_get(frame, "altitude"),
            hr_bpm=_get(frame, "heart_rate"),
            cadence_rpm=_get(frame, "cadence"),
            speed_kmh=None if speed_raw is None else speed_raw * 3.6,
            power_w=_get(frame, "power"),
            temperature_c=_get(frame, "temperature"),
            distance_m=_get(frame, "distance"),
        )

    @classmethod
    def _lap_from(cls, frame: Any, index: int) -> LapData | None:
        """Build a LapData from a lap message; None when it has no start time."""
        ts = _get(frame, "start_time")
        if ts is None:
            return None
        ts = cls._as_utc(ts)

        elapsed = _get(frame, "total_elapsed_time")
        speed_raw = _get(frame, "avg_speed")
        return LapData(
            index=index,
            started_at=ts,
            # A zero or missing duration / average speed is stored as None.
            duration_s=int(elapsed) if elapsed else None,
            distance_m=_get(frame, "total_distance"),
            elevation_gain_m=_get(frame, "total_ascent"),
            avg_speed_kmh=speed_raw * 3.6 if speed_raw else None,
            avg_hr_bpm=_get(frame, "avg_heart_rate"),
            avg_power_w=_get(frame, "avg_power"),
        )
|
||||
|
||||
|
||||
def _get(frame: fitdecode.FitDataMessage, field: str, default: Any = None) -> Any:
    """Read ``field`` from a FIT data message.

    Returns whatever ``frame.get_value(field)`` yields; when that call
    raises ``KeyError``, ``default`` is returned instead.
    """
    try:
        value = frame.get_value(field)
    except KeyError:
        return default
    else:
        return value
|
||||
|
||||
|
||||
def _semicircles_to_deg(value: Any) -> float | None:
|
||||
if value is None:
|
||||
return None
|
||||
try:
|
||||
deg = float(value) * (180.0 / 2**31)
|
||||
# Sanity check: invalid semicircle values often come out as ±180+
|
||||
if abs(deg) > 180:
|
||||
return None
|
||||
return deg
|
||||
except (TypeError, ValueError):
|
||||
return None
|
||||
|
||||
|
||||
def _normalise_sub_sport(value: Any) -> str | None:
|
||||
if value is None:
|
||||
return None
|
||||
s = str(value).lower().replace(" ", "_")
|
||||
mapping = {
|
||||
"road": "road",
|
||||
"mountain": "mountain",
|
||||
"gravel_cycling": "gravel",
|
||||
"cyclocross": "gravel",
|
||||
"indoor_cycling": "indoor",
|
||||
"trail": "trail",
|
||||
"track": "track",
|
||||
}
|
||||
return mapping.get(s, s) or None
|
||||
@@ -0,0 +1,82 @@
|
||||
"""GPX file parser."""
|
||||
|
||||
from datetime import timezone
|
||||
from pathlib import Path
|
||||
|
||||
import gpxpy
|
||||
import gpxpy.gpx
|
||||
|
||||
from bincio.extract.models import DataPoint, ParsedActivity
|
||||
from bincio.extract.parsers.base import BaseParser
|
||||
from bincio.extract.sport import normalise_sport
|
||||
|
||||
# Known GPX extension namespaces
|
||||
_NS_GARMIN = "http://www.garmin.com/xmlschemas/TrackPointExtension/v1"
|
||||
_NS_GARMIN_V2 = "http://www.garmin.com/xmlschemas/TrackPointExtension/v2"
|
||||
|
||||
|
||||
class GpxParser(BaseParser):
    """Parser turning GPX documents into ``ParsedActivity`` objects."""

    def parse(self, path: Path, raw_bytes: bytes) -> ParsedActivity:
        """Parse the GPX payload in ``raw_bytes``.

        The bytes are decoded as UTF-8 (undecodable bytes are replaced) and
        handed to ``gpxpy``. Track points without a time are ignored; naive
        timestamps are tagged as UTC. The sport comes from the first track's
        ``type`` and falls back to ``"cycling"``.

        Raises:
            ValueError: if no timed track point is present.
        """
        document = gpxpy.parse(raw_bytes.decode("utf-8", errors="replace"))

        timed_points = (
            pt
            for track in document.tracks
            for segment in track.segments
            for pt in segment.points
            if pt.time is not None
        )

        points: list[DataPoint] = []
        for pt in timed_points:
            when = pt.time
            if when.tzinfo is None:
                when = when.replace(tzinfo=timezone.utc)
            sample = DataPoint(
                timestamp=when,
                lat=pt.latitude,
                lon=pt.longitude,
                elevation_m=pt.elevation,
            )
            _apply_extensions(pt, sample)
            points.append(sample)

        if not points:
            raise ValueError(f"No trackpoints found in {path.name}")

        track_type = document.tracks[0].type if document.tracks else None

        return ParsedActivity(
            points=points,
            sport=normalise_sport(track_type or "cycling"),
            started_at=points[0].timestamp,
            source_file=path.name,
            source_hash="",  # set by factory
        )
|
||||
|
||||
|
||||
def _apply_extensions(pt: gpxpy.gpx.GPXTrackPoint, dp: DataPoint) -> None:
    """Copy sensor values from a ``TrackPointExtension`` element onto ``dp``.

    Looks at every extension element of ``pt`` whose local tag name is
    ``TrackPointExtension`` and reads these children (namespaces ignored):

    * ``hr``    -> ``dp.hr_bpm`` (int)
    * ``cad``   -> ``dp.cadence_rpm`` (int)
    * ``atemp`` -> ``dp.temperature_c`` (float)
    * ``speed`` -> ``dp.speed_kmh`` (value multiplied by 3.6, m/s → km/h)

    Children with no text, or with text that is not a usable number, are
    skipped so a single malformed value does not abort the whole file.
    ``dp`` is mutated in place; nothing is returned.
    """
    if pt.extensions is None:
        return

    for ext in pt.extensions:
        if _strip_ns(ext.tag) != "TrackPointExtension":
            continue
        for child in ext:
            raw = child.text
            if raw is None:
                continue
            tag = _strip_ns(child.tag)
            if tag not in ("hr", "cad", "atemp", "speed"):
                continue
            try:
                number = float(raw)
                if tag in ("hr", "cad"):
                    # int() raises on NaN (ValueError) / infinity (OverflowError).
                    number = int(number)
            except (ValueError, OverflowError):
                continue

            if tag == "hr":
                dp.hr_bpm = number
            elif tag == "cad":
                dp.cadence_rpm = number
            elif tag == "atemp":
                dp.temperature_c = number
            else:
                dp.speed_kmh = number * 3.6  # m/s → km/h
|
||||
|
||||
|
||||
def _strip_ns(tag: str) -> str:
|
||||
"""'{namespace}localname' → 'localname'."""
|
||||
return tag.split("}")[-1] if "}" in tag else tag
|
||||
@@ -0,0 +1,89 @@
|
||||
"""TCX (Training Center XML) file parser."""
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from bincio.extract.models import DataPoint, ParsedActivity
|
||||
from bincio.extract.sport import normalise_sport
|
||||
|
||||
_NS = {
|
||||
"tcx": "http://www.garmin.com/xmlschemas/TrainingCenterDatabase/v2",
|
||||
"ext": "http://www.garmin.com/xmlschemas/ActivityExtension/v2",
|
||||
}
|
||||
|
||||
|
||||
def _tcx_float(element):
    """Return ``float(element.text)``, or None if the element is missing or empty."""
    if element is None or not element.text:
        return None
    return float(element.text)


def _tcx_int(element):
    """Return ``int(float(element.text))``, or None if the element is missing or empty."""
    if element is None or not element.text:
        return None
    return int(float(element.text))


class TcxParser:
    """Parser turning TCX documents into ``ParsedActivity`` objects."""

    def parse(self, path: Path, raw_bytes: bytes) -> ParsedActivity:
        """Parse the TCX payload in ``raw_bytes``.

        Only the first ``Activity`` element is read. Its ``Sport`` attribute
        (default ``"Biking"``) is normalised into the sport name. Trackpoints
        lacking a ``Time`` element are skipped.

        Raises:
            ValueError: if the document has no ``Activity`` element or the
                first activity yields no trackpoints.
        """
        # Some exporters (e.g. Garmin) prepend whitespace before the XML
        # declaration, which is technically invalid. Strip it.
        root = etree.fromstring(raw_bytes.lstrip())

        activities = root.findall(".//tcx:Activity", _NS)
        if not activities:
            raise ValueError(f"No Activity elements found in {path.name}")

        # Use the first activity
        first_activity = activities[0]
        sport = normalise_sport(first_activity.get("Sport", "Biking"))

        points: list[DataPoint] = []
        for trackpoint in first_activity.findall(".//tcx:Trackpoint", _NS):
            time_el = trackpoint.find("tcx:Time", _NS)
            if time_el is None or not time_el.text:
                continue
            when = _parse_ts(time_el.text)

            position = trackpoint.find("tcx:Position", _NS)
            if position is None:
                lat = lon = None
            else:
                lat = _tcx_float(position.find("tcx:LatitudeDegrees", _NS))
                lon = _tcx_float(position.find("tcx:LongitudeDegrees", _NS))

            altitude_el = trackpoint.find("tcx:AltitudeMeters", _NS)
            heart_rate_el = trackpoint.find(".//tcx:HeartRateBpm/tcx:Value", _NS)
            cadence_el = trackpoint.find("tcx:Cadence", _NS)
            distance_el = trackpoint.find("tcx:DistanceMeters", _NS)

            # Extensions (speed, watts)
            speed_el = trackpoint.find(".//ext:Speed", _NS)
            watts_el = trackpoint.find(".//ext:Watts", _NS)

            elevation = _tcx_float(altitude_el)
            heart_rate = _tcx_int(heart_rate_el)
            cadence = _tcx_int(cadence_el)
            distance = _tcx_float(distance_el)
            speed_ms = _tcx_float(speed_el)
            power = _tcx_int(watts_el)

            points.append(
                DataPoint(
                    timestamp=when,
                    lat=lat,
                    lon=lon,
                    elevation_m=elevation,
                    hr_bpm=heart_rate,
                    cadence_rpm=cadence,
                    distance_m=distance,
                    speed_kmh=None if speed_ms is None else speed_ms * 3.6,
                    power_w=power,
                )
            )

        if not points:
            raise ValueError(f"No trackpoints found in {path.name}")

        return ParsedActivity(
            points=points,
            sport=sport,
            started_at=points[0].timestamp,
            source_file=path.name,
            source_hash="",
        )
|
||||
|
||||
|
||||
def _parse_ts(s: str) -> datetime:
|
||||
# ISO 8601 with or without fractional seconds
|
||||
s = s.rstrip("Z")
|
||||
for fmt in ("%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S"):
|
||||
try:
|
||||
return datetime.strptime(s, fmt).replace(tzinfo=timezone.utc)
|
||||
except ValueError:
|
||||
continue
|
||||
raise ValueError(f"Cannot parse timestamp: {s!r}")
|
||||
@@ -0,0 +1,60 @@
|
||||
"""GPS track simplification using the Ramer-Douglas-Peucker algorithm."""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from rdp import rdp
|
||||
|
||||
from bincio.extract.models import DataPoint
|
||||
|
||||
|
||||
def simplify_track(
    points: list[DataPoint],
    epsilon: float = 0.0001,
) -> list[DataPoint]:
    """Thin out a track with the Ramer-Douglas-Peucker algorithm.

    epsilon is in degrees (~11m at equator for 0.0001).
    Points lacking a latitude or longitude are discarded before
    simplification. With fewer than two located points the located points
    are returned as they are.
    """
    located = [p for p in points if p.lat is not None and p.lon is not None]
    if len(located) < 2:
        return located

    keep_flags = rdp(
        [[p.lon, p.lat] for p in located],
        epsilon=epsilon,
        return_mask=True,
    )
    return [point for point, keep in zip(located, keep_flags) if keep]
|
||||
|
||||
|
||||
def build_geojson(
    points: list[DataPoint],
    activity_id: str,
    epsilon: float = 0.0001,
    original_count: Optional[int] = None,
) -> dict:
    """Produce a GeoJSON ``Feature`` holding the simplified track.

    The geometry is a ``LineString`` of ``[lon, lat]`` positions, with the
    elevation appended as a third element when the point has one. The
    properties carry the activity id, a speed list built from the same
    simplified points, and the simplification parameters.
    ``point_count_original`` is ``original_count`` when that is truthy,
    otherwise ``len(points)``.
    """
    kept = simplify_track(points, epsilon=epsilon)

    coordinates = []
    for p in kept:
        if p.lon is None or p.lat is None:
            continue
        position = [p.lon, p.lat]
        if p.elevation_m is not None:
            position.append(p.elevation_m)
        coordinates.append(position)

    # Parallel speed array for gradient coloring
    speeds = [None if p.speed_kmh is None else round(p.speed_kmh, 2) for p in kept]

    geometry = {
        "type": "LineString",
        "coordinates": coordinates,
    }
    properties = {
        "id": activity_id,
        "speeds": speeds,
        "simplification": "rdp",
        "rdp_epsilon": epsilon,
        "point_count_original": original_count or len(points),
        "point_count_simplified": len(coordinates),
    }
    return {
        "type": "Feature",
        "geometry": geometry,
        "properties": properties,
    }
|
||||
@@ -0,0 +1,40 @@
|
||||
"""Sport name normalisation."""
|
||||
|
||||
_MAPPING: dict[str, str] = {
|
||||
# cycling variants
|
||||
"cycling": "cycling",
|
||||
"biking": "cycling",
|
||||
"bike": "cycling",
|
||||
"road_biking": "cycling",
|
||||
"mountain_biking": "cycling",
|
||||
"gravel_cycling": "cycling",
|
||||
"cyclocross": "cycling",
|
||||
"indoor_cycling": "cycling",
|
||||
"virtual_ride": "cycling",
|
||||
"e-biking": "cycling",
|
||||
# running
|
||||
"running": "running",
|
||||
"run": "running",
|
||||
"trail_running": "running",
|
||||
"treadmill_running": "running",
|
||||
"virtual_run": "running",
|
||||
# hiking
|
||||
"hiking": "hiking",
|
||||
"hike": "hiking",
|
||||
# walking
|
||||
"walking": "walking",
|
||||
"walk": "walking",
|
||||
# swimming
|
||||
"swimming": "swimming",
|
||||
"swim": "swimming",
|
||||
"open_water_swimming": "swimming",
|
||||
}
|
||||
|
||||
BAS_SPORTS = {"cycling", "running", "hiking", "walking", "swimming", "other"}
|
||||
|
||||
|
||||
def normalise_sport(raw: object) -> str:
|
||||
if raw is None:
|
||||
return "other"
|
||||
key = str(raw).lower().strip().replace(" ", "_")
|
||||
return _MAPPING.get(key, "other")
|
||||
@@ -0,0 +1,55 @@
|
||||
"""Import metadata from Strava's activities.csv bulk export.
|
||||
|
||||
Strava export columns we care about:
|
||||
Activity ID, Activity Date, Activity Name, Activity Type,
|
||||
Activity Description, Filename
|
||||
"""
|
||||
|
||||
import csv
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
|
||||
_STRAVA_DATE_FMTS = (
|
||||
"%b %d, %Y, %I:%M:%S %p", # "Jun 1, 2024, 7:30:12 AM"
|
||||
"%Y-%m-%d %H:%M:%S",
|
||||
)
|
||||
|
||||
|
||||
class StravaMetadata:
    """Maps original filename → Strava metadata.

    The rows of Strava's ``activities.csv`` are indexed by the base name of
    their ``Filename`` column.
    """

    def __init__(self, csv_path: Path) -> None:
        """Read ``csv_path`` and build the filename index."""
        self._by_filename: dict[str, dict] = {}
        self._load(csv_path)

    def _load(self, path: Path) -> None:
        """Populate the index from the CSV at ``path``.

        Rows with a missing or blank ``Filename`` are skipped. When several
        rows share a base name, the last one wins.
        """
        with path.open(newline="", encoding="utf-8-sig") as f:
            for row in csv.DictReader(f):
                # DictReader fills columns missing from a short row with
                # None, so a .get() default alone does not protect .strip().
                filename = (row.get("Filename") or "").strip()
                if not filename:
                    continue
                # Strava stores paths like "activities/12345.fit.gz"
                self._by_filename[Path(filename).name] = row

    def lookup(self, source_file: str) -> Optional[dict]:
        """Return the Strava CSV row for a given source filename, or None."""
        return self._by_filename.get(source_file)

    def enrich(self, source_file: str, activity: object) -> None:
        """Mutate a ParsedActivity with Strava metadata if found.

        ``title``, ``description`` and ``strava_id`` are filled from the
        "Activity Name", "Activity Description" and "Activity ID" columns,
        but only where the activity's current value is falsy and the CSV
        value is non-empty. Values are stripped of surrounding whitespace.
        """
        row = self.lookup(source_file)
        if row is None:
            return

        columns = (
            ("title", "Activity Name"),
            ("description", "Activity Description"),
            ("strava_id", "Activity ID"),
        )
        for attr, column in columns:
            value = row.get(column)
            if not getattr(activity, attr) and value:
                setattr(activity, attr, value.strip())
|
||||
@@ -0,0 +1,58 @@
|
||||
"""Downsample a list of DataPoints to at most 1 sample/second and build
|
||||
the BAS timeseries object (parallel arrays)."""
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
from bincio.extract.models import DataPoint
|
||||
|
||||
|
||||
def build_timeseries(
    points: list[DataPoint],
    started_at: datetime,
    privacy: str = "public",
) -> dict:
    """Assemble the BAS `timeseries` object as parallel arrays.

    Each point's offset is its whole-second distance from ``started_at``.
    Points with a negative offset are dropped, and a point whose offset
    equals that of the previously kept point is skipped, so consecutive
    samples never share a second.

    With privacy 'no_gps' or 'private' the "lat" and "lon" entries are
    ``None`` instead of lists. An empty ``points`` list gives ``{"t": []}``.
    """
    if not points:
        return {"t": []}

    gps_allowed = privacy not in ("no_gps", "private")

    def rounded(value, digits):
        return None if value is None else round(value, digits)

    # Downsample: keep at most one point per second
    kept: list[tuple[int, DataPoint]] = []
    previous_offset: Optional[int] = None
    for point in points:
        offset = int((point.timestamp - started_at).total_seconds())
        if offset < 0 or offset == previous_offset:
            continue  # before the start, or a sub-second duplicate
        kept.append((offset, point))
        previous_offset = offset

    samples = [point for _, point in kept]

    return {
        "t": [offset for offset, _ in kept],
        "lat": [rounded(p.lat, 7) for p in samples] if gps_allowed else None,
        "lon": [rounded(p.lon, 7) for p in samples] if gps_allowed else None,
        "elevation_m": [rounded(p.elevation_m, 1) for p in samples],
        "speed_kmh": [rounded(p.speed_kmh, 2) for p in samples],
        "hr_bpm": [p.hr_bpm for p in samples],
        "cadence_rpm": [p.cadence_rpm for p in samples],
        "power_w": [p.power_w for p in samples],
        "temperature_c": [rounded(p.temperature_c, 1) for p in samples],
    }
|
||||
@@ -0,0 +1,198 @@
|
||||
"""Write a processed activity to BAS JSON files."""
|
||||
|
||||
import json
|
||||
import re
|
||||
import unicodedata
|
||||
from pathlib import Path
|
||||
|
||||
from bincio.extract.metrics import ComputedMetrics
|
||||
from bincio.extract.models import LapData, ParsedActivity
|
||||
from bincio.extract.simplify import build_geojson
|
||||
from bincio.extract.timeseries import build_timeseries
|
||||
|
||||
|
||||
def make_activity_id(activity: ParsedActivity) -> str:
    """Build the BAS activity ID: a compact start timestamp plus a title slug.

    The timestamp part looks like ``2024-06-01T073012+0200``; when the start
    time has no UTC offset, ``Z`` is appended instead. The slug is added
    after a hyphen only when the activity has a title that slugifies to a
    non-empty string.
    """
    started = activity.started_at
    offset = started.strftime("%z") or "Z"  # "%z" is "" for naive datetimes
    stamp = f"{started.strftime('%Y-%m-%dT%H%M%S')}{offset}"

    slug = _slugify(activity.title) if activity.title else ""
    if not slug:
        return stamp
    return f"{stamp}-{slug}"
|
||||
|
||||
|
||||
def write_activity(
    activity: ParsedActivity,
    metrics: ComputedMetrics,
    output_dir: Path,
    privacy: str = "public",
    duplicate_of: str | None = None,
    rdp_epsilon: float = 0.0001,
) -> str:
    """Write {id}.json and (if GPS available) {id}.geojson. Returns the ID.

    Both files go into ``output_dir / "activities"``, which is created when
    missing. The GeoJSON track is only written when ``metrics.bbox`` is set
    and ``privacy`` is neither ``"no_gps"`` nor ``"private"``.

    Files are always written as UTF-8: the JSON is serialised with
    ``ensure_ascii=False``, so relying on the platform's default encoding
    could fail or produce non-UTF-8 output for non-ASCII titles.

    Args:
        activity: The parsed activity to serialise.
        metrics: Metrics computed for ``activity``.
        output_dir: Root of the output tree.
        privacy: Privacy level stored in the file and passed on to the
            timeseries builder.
        duplicate_of: Stored verbatim under ``"duplicate_of"``.
        rdp_epsilon: Tolerance handed to the track simplification.
    """
    activity_id = make_activity_id(activity)
    acts_dir = output_dir / "activities"
    acts_dir.mkdir(parents=True, exist_ok=True)

    source = _infer_source(activity)
    has_gps = metrics.bbox is not None and privacy not in ("no_gps", "private")

    # ── detail JSON ──────────────────────────────────────────────────────────
    detail: dict = {
        "bas_version": "1.0",
        "id": activity_id,
        "title": activity.title or _auto_title(activity),
        "description": activity.description,
        "sport": activity.sport,
        "sub_sport": activity.sub_sport,
        "started_at": activity.started_at.isoformat(),
        "distance_m": metrics.distance_m,
        "duration_s": metrics.duration_s,
        "moving_time_s": metrics.moving_time_s,
        "elevation_gain_m": metrics.elevation_gain_m,
        "elevation_loss_m": metrics.elevation_loss_m,
        "avg_speed_kmh": metrics.avg_speed_kmh,
        "max_speed_kmh": metrics.max_speed_kmh,
        "avg_hr_bpm": metrics.avg_hr_bpm,
        "max_hr_bpm": metrics.max_hr_bpm,
        "avg_cadence_rpm": metrics.avg_cadence_rpm,
        "avg_power_w": metrics.avg_power_w,
        "max_power_w": metrics.max_power_w,
        "gear": activity.gear,
        "device": activity.device,
        "bbox": list(metrics.bbox) if metrics.bbox else None,
        "start_latlng": list(metrics.start_latlng) if metrics.start_latlng else None,
        "end_latlng": list(metrics.end_latlng) if metrics.end_latlng else None,
        "laps": [_serialise_lap(lap) for lap in activity.laps],
        "timeseries": build_timeseries(activity.points, activity.started_at, privacy),
        "source": source,
        "source_file": activity.source_file,
        "source_hash": activity.source_hash,
        "strava_id": activity.strava_id,
        "duplicate_of": duplicate_of,
        "privacy": privacy,
        "custom": {},
    }

    json_path = acts_dir / f"{activity_id}.json"
    json_path.write_text(
        json.dumps(detail, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )

    # ── GeoJSON track ────────────────────────────────────────────────────────
    if has_gps:
        geojson = build_geojson(activity.points, activity_id, epsilon=rdp_epsilon)
        geojson_path = acts_dir / f"{activity_id}.geojson"
        geojson_path.write_text(
            json.dumps(geojson, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )

    return activity_id
|
||||
|
||||
|
||||
def build_summary(
    activity: ParsedActivity,
    metrics: ComputedMetrics,
    activity_id: str,
    privacy: str = "public",
) -> dict:
    """Assemble the Activity Summary entry that goes into index.json.

    ``track_url`` is only set when ``metrics.bbox`` is present and the
    privacy level is neither "no_gps" nor "private"; otherwise it is
    ``None``. A missing title is replaced by an automatically generated one.
    """
    gps_hidden = privacy in ("no_gps", "private")
    has_gps = metrics.bbox is not None and not gps_hidden

    metric_fields = (
        "distance_m",
        "duration_s",
        "moving_time_s",
        "elevation_gain_m",
        "avg_speed_kmh",
        "max_speed_kmh",
        "avg_hr_bpm",
        "max_hr_bpm",
        "avg_cadence_rpm",
        "avg_power_w",
    )

    summary: dict = {
        "id": activity_id,
        "title": activity.title or _auto_title(activity),
        "sport": activity.sport,
        "sub_sport": activity.sub_sport,
        "started_at": activity.started_at.isoformat(),
    }
    for field_name in metric_fields:
        summary[field_name] = getattr(metrics, field_name)

    summary["source"] = _infer_source(activity)
    summary["privacy"] = privacy
    summary["detail_url"] = f"activities/{activity_id}.json"
    summary["track_url"] = f"activities/{activity_id}.geojson" if has_gps else None
    return summary
|
||||
|
||||
|
||||
def write_index(summaries: list[dict], output_dir: Path, owner: dict) -> None:
    """Write index.json (sorted newest first).

    Summaries are ordered by their ``"started_at"`` value, descending.
    ``output_dir`` is created if it does not exist, so an index can be
    written even when no activity file was produced beforehand. The file is
    written as UTF-8 because the JSON is serialised with
    ``ensure_ascii=False``.

    Args:
        summaries: Activity summary dicts, each with a ``"started_at"`` key.
        output_dir: Directory that receives ``index.json``.
        owner: Stored verbatim under ``"owner"``.
    """
    sorted_summaries = sorted(
        summaries,
        key=lambda s: s["started_at"],
        reverse=True,
    )
    index = {
        "bas_version": "1.0",
        "owner": owner,
        "generated_at": _now_iso(),
        "shards": [],
        "activities": sorted_summaries,
    }
    output_dir.mkdir(parents=True, exist_ok=True)
    (output_dir / "index.json").write_text(
        json.dumps(index, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
|
||||
|
||||
|
||||
# ── helpers ──────────────────────────────────────────────────────────────────
|
||||
|
||||
def _now_iso() -> str:
|
||||
from datetime import datetime, timezone
|
||||
return datetime.now(timezone.utc).isoformat()
|
||||
|
||||
|
||||
def _auto_title(activity: ParsedActivity) -> str:
    """Make a fallback title such as "Morning Cycling".

    The time of day is taken from the hour of ``activity.started_at``:
    05-11 Morning, 12-16 Afternoon, 17-20 Evening, anything else Night.
    It is followed by the capitalised sport name.
    """
    hour = activity.started_at.hour
    if hour < 5 or hour >= 21:
        part_of_day = "Night"
    elif hour < 12:
        part_of_day = "Morning"
    elif hour < 17:
        part_of_day = "Afternoon"
    else:
        part_of_day = "Evening"
    return f"{part_of_day} {activity.sport.capitalize()}"
|
||||
|
||||
|
||||
def _infer_source(activity: ParsedActivity) -> str | None:
    """Guess where an activity came from.

    Checked in order: a Strava ID means "strava_export"; a file name that
    contains "activity" and at least two dots means "karoo"; otherwise the
    extension (optionally followed by ".gz") decides between "fit_file",
    "gpx_file" and "tcx_file". Returns ``None`` when nothing matches.
    """
    if activity.strava_id:
        return "strava_export"

    filename = activity.source_file.lower()

    # Karoo uses UUID-style names
    if "activity" in filename and filename.count(".") >= 2:
        return "karoo"

    by_extension = (
        (".fit", "fit_file"),
        (".gpx", "gpx_file"),
        (".tcx", "tcx_file"),
    )
    for extension, label in by_extension:
        if filename.endswith((extension, extension + ".gz")):
            return label
    return None
|
||||
|
||||
|
||||
def _slugify(text: str) -> str:
|
||||
text = unicodedata.normalize("NFKD", text)
|
||||
text = text.encode("ascii", "ignore").decode("ascii")
|
||||
text = text.lower()
|
||||
text = re.sub(r"[^a-z0-9]+", "-", text)
|
||||
return text.strip("-")[:60]
|
||||
|
||||
|
||||
def _serialise_lap(lap: LapData) -> dict:
    """Convert a lap into a JSON-ready dict.

    All fields are copied as they are, except ``started_at`` which is
    rendered with ``isoformat()``.
    """
    field_names = (
        "index",
        "started_at",
        "duration_s",
        "distance_m",
        "elevation_gain_m",
        "avg_speed_kmh",
        "avg_hr_bpm",
        "avg_power_w",
    )
    record = {name: getattr(lap, name) for name in field_names}
    record["started_at"] = lap.started_at.isoformat()
    return record
|
||||
Reference in New Issue
Block a user