commit 38c5423aeb98ddf3469bfa26dd020cd3d348579d Author: Davide Scaini Date: Sat Mar 28 13:57:12 2026 +0100 backend: initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..968f5e8 --- /dev/null +++ b/.gitignore @@ -0,0 +1,28 @@ +# Python +__pycache__/ +*.py[cod] +*.egg-info/ +.venv/ +dist/ +build/ +.mypy_cache/ +.ruff_cache/ +.pytest_cache/ +htmlcov/ +.coverage +.idea* + +# uv +uv.lock + +# Node / Astro +site/node_modules/ +site/dist/ +site/.astro/ + +# BAS data stores (user data, not committed to the tool repo) +bincio_data/ +*.bincio_cache.json + +# OS +.DS_Store diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..e4fba21 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.12 diff --git a/README.md b/README.md new file mode 100644 index 0000000..e69de29 diff --git a/SCHEMA.md b/SCHEMA.md new file mode 100644 index 0000000..cb0f3b8 --- /dev/null +++ b/SCHEMA.md @@ -0,0 +1,369 @@ +# BincioActivity Schema (BAS) — v1.0 + +The BincioActivity Schema defines how activity data is stored and shared as +plain JSON files. It is the **federation protocol**: if you publish a +BAS-compliant data store, any BincioActivity instance can read it. + +Any tool — in any language — can produce BAS-compliant JSON without using the +`bincio` Python package. The schema is the contract; the package is one +implementation. + +--- + +## Files + +A BAS data store is a directory (or URL prefix) with this structure: + +``` +{store_root}/ + index.json ← user manifest and activity feed + index_{year}.json ← optional yearly shards (large datasets) + activities/ + {id}.json ← full activity detail + {id}.geojson ← simplified GPS track (optional) +``` + +All files are UTF-8 JSON. All timestamps are ISO 8601 with timezone offset. +All distances are in metres. All speeds are in km/h. All durations are in +seconds. `null` means "not recorded / not available". + +--- + +## `index.json` + +The entry point for a data store. 
+ +```json +{ + "bas_version": "1.0", + "owner": { + "handle": "brutsalvadi", + "display_name": "Bru", + "avatar_url": null + }, + "generated_at": "2026-03-28T10:00:00Z", + "shards": [ + { "year": 2024, "url": "index_2024.json", "count": 312 } + ], + "activities": [ ... ] +} +``` + +### Fields + +| Field | Type | Required | Description | +|---|---|---|---| +| `bas_version` | string | yes | Schema version. Currently `"1.0"`. | +| `owner.handle` | string | yes | URL-safe identifier, e.g. `"brutsalvadi"`. | +| `owner.display_name` | string | yes | Human-readable name. | +| `owner.avatar_url` | string\|null | no | Absolute URL to an avatar image. | +| `generated_at` | string | yes | ISO 8601 timestamp of when this file was generated. | +| `shards` | array | no | Pointers to yearly shard files. See below. | +| `activities` | array | yes | Array of **Activity Summary** objects. May be empty. | + +`index.json` should contain all activities when the total count is under ~5,000. +Above that, use yearly shards and keep only the most recent 200 activities +inline in `index.json` for fast feed rendering. + +### Shard object + +| Field | Type | Description | +|---|---|---| +| `year` | integer | Calendar year covered by this shard. | +| `url` | string | Relative or absolute URL to the shard file. | +| `count` | integer | Number of activities in the shard. | + +--- + +## Activity Summary object + +Appears in `index.json` (and yearly shard files). Contains only the fields +needed to render an activity card in a feed — no timeseries, no full track. 
+ +```json +{ + "id": "2024-06-01T073012+0200-morning-ride", + "title": "Morning Ride", + "sport": "cycling", + "sub_sport": "road", + "started_at": "2024-06-01T07:30:12+02:00", + "distance_m": 42300.0, + "duration_s": 5400, + "moving_time_s": 5100, + "elevation_gain_m": 620.0, + "avg_speed_kmh": 28.2, + "max_speed_kmh": 52.1, + "avg_hr_bpm": 148, + "max_hr_bpm": 178, + "avg_cadence_rpm": 88, + "avg_power_w": null, + "source": "strava_export", + "privacy": "public", + "detail_url": "activities/2024-06-01T073012+0200-morning-ride.json", + "track_url": "activities/2024-06-01T073012+0200-morning-ride.geojson" +} +``` + +### Fields + +| Field | Type | Required | Description | +|---|---|---|---| +| `id` | string | yes | Unique identifier. See **Activity ID** section. | +| `title` | string | yes | Human-readable name. May be auto-generated if not in source. | +| `sport` | string | yes | One of: `cycling`, `running`, `hiking`, `walking`, `swimming`, `other`. | +| `sub_sport` | string\|null | no | e.g. `road`, `mountain`, `gravel`, `indoor`, `trail`. | +| `started_at` | string | yes | ISO 8601 timestamp with timezone. | +| `distance_m` | number\|null | no | Total distance in metres. | +| `duration_s` | integer\|null | no | Total elapsed time in seconds. | +| `moving_time_s` | integer\|null | no | Time in motion (stopped periods excluded). | +| `elevation_gain_m` | number\|null | no | Cumulative positive elevation in metres. | +| `avg_speed_kmh` | number\|null | no | Average speed over moving time. | +| `max_speed_kmh` | number\|null | no | Maximum instantaneous speed. | +| `avg_hr_bpm` | integer\|null | no | Average heart rate. | +| `max_hr_bpm` | integer\|null | no | Maximum heart rate. | +| `avg_cadence_rpm` | integer\|null | no | Average cadence (rpm for cycling, spm for running). | +| `avg_power_w` | integer\|null | no | Average power in watts. | +| `source` | string\|null | no | Origin of data. See **Source values**. 
| +| `privacy` | string | yes | One of: `public`, `blur_start`, `no_gps`, `private`. | +| `detail_url` | string\|null | no | Relative or absolute URL to the full activity JSON. | +| `track_url` | string\|null | no | Relative or absolute URL to the GeoJSON track. `null` if `privacy` is `no_gps`. | + +### Activity ID + +The canonical ID format is: + +``` +{started_at_compact}[-{slug}] +``` + +Where `started_at_compact` is the start timestamp with special characters +removed: `2024-06-01T073012+0200`, and `slug` is an optional URL-safe +lowercase title (spaces → hyphens, non-ASCII stripped). + +Example: `2024-06-01T073012+0200-morning-ride` + +IDs must be unique within a data store. When a title is unavailable, the +timestamp alone is sufficient: `2024-06-01T073012+0200`. + +### Source values + +| Value | Description | +|---|---| +| `strava_export` | Strava bulk data export | +| `garmin_connect` | Garmin Connect bulk export | +| `wahoo` | Wahoo ELEMNT / SYSTM export | +| `komoot` | Komoot GPX export | +| `gpx_file` | Generic GPX file | +| `fit_file` | Generic FIT file | +| `tcx_file` | Generic TCX file | +| `karoo` | Hammerhead Karoo device export | +| `manual` | Manually created | + +### Privacy levels + +| Level | GPS track published | Timeseries lat/lon | Stats in index | +|---|---|---|---| +| `public` | Full track | Included | Yes | +| `blur_start` | First/last 200 m removed | Trimmed | Yes | +| `no_gps` | Not published | Not included | Yes | +| `private` | Not published | Not included | No (not in index at all) | + +--- + +## `activities/{id}.json` + +Full activity record. Extends the Summary with timeseries and metadata. 
+ +```json +{ + "bas_version": "1.0", + "id": "2024-06-01T073012+0200-morning-ride", + "title": "Morning Ride", + "description": "Easy morning spin before work.", + "sport": "cycling", + "sub_sport": "road", + "started_at": "2024-06-01T07:30:12+02:00", + "distance_m": 42300.0, + "duration_s": 5400, + "moving_time_s": 5100, + "elevation_gain_m": 620.0, + "elevation_loss_m": 615.0, + "avg_speed_kmh": 28.2, + "max_speed_kmh": 52.1, + "avg_hr_bpm": 148, + "max_hr_bpm": 178, + "avg_cadence_rpm": 88, + "avg_power_w": null, + "max_power_w": null, + "gear": "Canyon Ultimate CF SL", + "device": "Hammerhead Karoo 2", + "bbox": [9.1234, 45.4321, 9.5678, 45.8765], + "start_latlng": [45.4321, 9.1234], + "end_latlng": [45.4321, 9.1235], + "laps": [], + "timeseries": { + "t": [0, 1, 2], + "lat": [45.4321, 45.4322, 45.4323], + "lon": [9.1234, 9.1235, 9.1236], + "elevation_m": [120.0, 120.5, 121.0], + "speed_kmh": [0.0, 15.2, 22.4], + "hr_bpm": [null, 142, 145], + "cadence_rpm": [null, 85, 88], + "power_w": [null, null, null], + "temperature_c": [null, null, null] + }, + "source": "karoo", + "source_file": "13957.activity.abc123.fit", + "source_hash": "sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + "strava_id": null, + "privacy": "public", + "custom": {} +} +``` + +### Additional fields (beyond Summary) + +| Field | Type | Required | Description | +|---|---|---|---| +| `description` | string\|null | no | Free-text description. | +| `elevation_loss_m` | number\|null | no | Cumulative negative elevation. | +| `max_power_w` | integer\|null | no | Maximum power in watts. | +| `gear` | string\|null | no | Equipment used (bike name, shoe model…). | +| `device` | string\|null | no | Recording device (e.g. `"Garmin Edge 530"`). | +| `bbox` | array\|null | no | `[min_lon, min_lat, max_lon, max_lat]`. Null if no GPS. | +| `start_latlng` | array\|null | no | `[lat, lon]` of activity start. | +| `end_latlng` | array\|null | no | `[lat, lon]` of activity end. 
| +| `laps` | array | yes | Array of **Lap** objects. Empty array if no laps. | +| `timeseries` | object | yes | Parallel arrays of sensor data. See below. | +| `source_file` | string\|null | no | Original filename (basename only, no path). | +| `source_hash` | string\|null | no | `sha256:{hex}` of the original raw file bytes. Used for deduplication. | +| `strava_id` | string\|null | no | Strava activity ID if origin is a Strava export. | +| `custom` | object | yes | Free dict for plugin-computed fields. Must be present, may be `{}`. | + +### Timeseries object + +Parallel arrays, all the same length. Index `i` corresponds to `t[i]` seconds +after the activity start. + +| Key | Type | Unit | Description | +|---|---|---|---| +| `t` | int[] | seconds | Seconds since `started_at`. Always present. | +| `lat` | float[]\|null | degrees | Latitude. `null` if no GPS or privacy=`no_gps`. | +| `lon` | float[]\|null | degrees | Longitude. `null` if no GPS or privacy=`no_gps`. | +| `elevation_m` | float[] | metres | Elevation. Array of nulls if unavailable. | +| `speed_kmh` | float[] | km/h | Speed. Array of nulls if unavailable. | +| `hr_bpm` | int[] | bpm | Heart rate. Array of nulls if no HR sensor. | +| `cadence_rpm` | int[] | rpm/spm | Cadence. Array of nulls if unavailable. | +| `power_w` | int[] | watts | Power. Array of nulls if no power meter. | +| `temperature_c` | float[] | °C | Temperature. Array of nulls if unavailable. | + +Timeseries are downsampled to at most 1 sample per second. The exact +downsampling strategy is implementation-defined; linear interpolation or +nearest-neighbour are both acceptable. + +`lat` and `lon` arrays are either both present (both non-null arrays) or both +`null`. Treat `null` the same as an array of nulls. 
+ +### Lap object + +```json +{ + "index": 0, + "started_at": "2024-06-01T07:30:12+02:00", + "duration_s": 1800, + "distance_m": 21150.0, + "elevation_gain_m": 310.0, + "avg_speed_kmh": 28.2, + "avg_hr_bpm": 145, + "avg_power_w": null +} +``` + +--- + +## `activities/{id}.geojson` + +Simplified GPS track for map rendering. Omitted entirely when +`privacy` is `no_gps` or `private`. + +```json +{ + "type": "Feature", + "geometry": { + "type": "LineString", + "coordinates": [ + [9.1234, 45.4321, 120.0], + [9.1235, 45.4322, 120.5] + ] + }, + "properties": { + "id": "2024-06-01T073012+0200-morning-ride", + "speeds": [0.0, 15.2], + "simplification": "rdp", + "rdp_epsilon": 0.0001, + "point_count_original": 7200, + "point_count_simplified": 843 + } +} +``` + +Coordinates are `[longitude, latitude, elevation_metres]` per GeoJSON spec. +The `speeds` property is a parallel array to `coordinates` — one speed value +per point — used for gradient coloring on the map. + +--- + +## Deduplication + +Activities from different sources (e.g. a Strava export and a Karoo export) +may represent the same real-world ride. Producers should detect and handle +duplicates before writing the data store. + +### Exact duplicate +Two files with the same `source_hash` are byte-for-byte identical. Only one +should be processed; the other is silently skipped. + +### Near-duplicate (same ride, different source) +Two activities are considered near-duplicates if: +- `|started_at difference|` < 5 minutes, **and** +- `|distance_m difference| / max(distance_m)` < 5% + +When a near-duplicate is detected: +1. One is kept as the **canonical** record (priority: FIT > GPX > TCX, + then prefer the source with more sensor channels). +2. The duplicate is written with `"duplicate_of": "{canonical_id}"` and + `"privacy": "private"` so it is excluded from feeds but remains auditable. 
+ +### Deduplication metadata in detail record + +```json +{ + "source_hash": "sha256:e3b0c...", + "duplicate_of": null +} +``` + +| Field | Type | Description | +|---|---|---| +| `source_hash` | string\|null | `sha256:{hex}` of original file bytes. | +| `duplicate_of` | string\|null | ID of the canonical activity, if this is a duplicate. | + +--- + +## Versioning + +The `bas_version` field allows consumers to handle schema evolution. Consumers +should: +- Reject files with a major version higher than they support. +- Accept and ignore unknown fields (forward compatibility). +- Treat missing optional fields as `null` (backward compatibility). + +Current version: **1.0** + +--- + +## Changelog + +| Version | Date | Changes | +|---|---|---| +| 1.0 | 2026-03-28 | Initial release. | diff --git a/bincio/__init__.py b/bincio/__init__.py new file mode 100644 index 0000000..c5997df --- /dev/null +++ b/bincio/__init__.py @@ -0,0 +1,3 @@ +"""BincioActivity — federated, open-source activity stats.""" + +__version__ = "0.1.0" diff --git a/bincio/cli.py b/bincio/cli.py new file mode 100644 index 0000000..796c628 --- /dev/null +++ b/bincio/cli.py @@ -0,0 +1,18 @@ +"""Top-level CLI entry point: `bincio extract` and `bincio render`.""" + +import click + +from bincio import __version__ + + +@click.group() +@click.version_option(__version__) +def main() -> None: + """BincioActivity — federated, open-source activity stats.""" + + +from bincio.extract.cli import extract # noqa: E402 +from bincio.render.cli import render # noqa: E402 + +main.add_command(extract) +main.add_command(render) diff --git a/bincio/extract/__init__.py b/bincio/extract/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bincio/extract/cli.py b/bincio/extract/cli.py new file mode 100644 index 0000000..d57b03d --- /dev/null +++ b/bincio/extract/cli.py @@ -0,0 +1,271 @@ +"""bincio extract — CLI command.""" + +import json +import sys +from concurrent.futures import ProcessPoolExecutor, as_completed 
@click.command()
@click.option("--config", "config_path", type=click.Path(exists=True), default=None,
              help="Path to extract_config.yaml (default: ./extract_config.yaml).")
@click.option("--input", "input_dir", type=click.Path(exists=True), default=None,
              help="Input directory (overrides config).")
@click.option("--output", "output_dir", type=click.Path(), default=None,
              help="Output directory (overrides config).")
@click.option("--file", "single_file", type=click.Path(exists=True), default=None,
              help="Process a single file and print JSON to stdout.")
@click.option("--since", default=None, metavar="YYYY-MM-DD",
              help="Only process files modified after this date.")
@click.option("--workers", default=4, show_default=True, type=click.IntRange(min=1),
              help="Number of parallel worker processes.")
def extract(
    config_path: Optional[str],
    input_dir: Optional[str],
    output_dir: Optional[str],
    single_file: Optional[str],
    since: Optional[str],
    workers: int,
) -> None:
    """Parse GPX/FIT/TCX files and write BAS JSON data store.

    With ``--file`` a single file is parsed and its summary printed to
    stdout; nothing is written to disk.  Otherwise every supported file in
    the configured input directories is parsed in a process pool, enriched
    with Strava CSV metadata when configured, de-duplicated against the
    persisted dedup index, written to the output directory, and finally
    merged into ``index.json``.
    """
    # ── single file mode ─────────────────────────────────────────────────────
    if single_file:
        _process_single(Path(single_file))
        return

    # ── load config ──────────────────────────────────────────────────────────
    cfg = _resolve_config(config_path, input_dir, output_dir)
    cfg.output_dir.mkdir(parents=True, exist_ok=True)

    # ── gather files ─────────────────────────────────────────────────────────
    files = _collect_files(cfg, since)
    if not files:
        console.print("[yellow]No supported files found.[/yellow]")
        return
    console.print(f"Found [bold]{len(files)}[/bold] activity files.")

    # ── Strava metadata ──────────────────────────────────────────────────────
    strava_meta: Optional[StravaMetadata] = None
    if cfg.metadata_csv and cfg.metadata_csv.exists():
        strava_meta = StravaMetadata(cfg.metadata_csv)
        console.print(f"Loaded Strava metadata from [cyan]{cfg.metadata_csv.name}[/cyan].")

    # ── dedup index ──────────────────────────────────────────────────────────
    dedup = DedupIndex(output_dir=cfg.output_dir)

    # ── process ──────────────────────────────────────────────────────────────
    summaries: list[dict] = []
    errors: list[tuple[Path, str]] = []
    skipped = 0
    # IDs that must not appear in the feed after this run: activities that
    # were superseded by a better source, or that were (re)written as
    # duplicates of another activity.
    hidden_ids: set[str] = set()

    owner = {"handle": cfg.owner_handle, "display_name": cfg.owner_display_name}

    with Progress(
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
        TimeElapsedColumn(),
        console=console,
    ) as progress:
        task = progress.add_task("Processing...", total=len(files))

        with ProcessPoolExecutor(max_workers=workers) as pool:
            futures = {pool.submit(_parse_worker, f): f for f in files}
            for future in as_completed(futures):
                path = futures[future]
                progress.advance(task)
                try:
                    activity = future.result()
                except Exception as exc:
                    # A single unparsable file must not abort the whole run.
                    errors.append((path, str(exc)))
                    continue

                # ── incremental skip ──────────────────────────────────────
                if cfg.incremental and dedup.is_exact_duplicate(activity.source_hash):
                    skipped += 1
                    continue

                # ── enrich from Strava CSV ────────────────────────────────
                if strava_meta:
                    strava_meta.enrich(activity.source_file, activity)

                # ── compute metrics ───────────────────────────────────────
                metrics = compute(activity)

                # ── deduplication ─────────────────────────────────────────
                activity_id = make_activity_id(activity)
                # Inferred after enrichment, and only once per activity.
                source = _infer_source(activity)
                duplicate_of: Optional[str] = None
                superseded_id: Optional[str] = None

                near_dup_id = dedup.find_near_duplicate(
                    activity.started_at, metrics.distance_m
                )
                # An activity that is re-processed (non-incremental run, or
                # changed file bytes) matches its own cached record; that is
                # not a duplicate and must not hide the activity.
                if near_dup_id and near_dup_id != activity_id:
                    if dedup.pick_canonical(near_dup_id, source) == "__new__":
                        # New one is better — existing becomes the duplicate.
                        superseded_id = near_dup_id
                    else:
                        duplicate_of = near_dup_id

                # ── write files ───────────────────────────────────────────
                written_id = write_activity(
                    activity, metrics, cfg.output_dir,
                    privacy=cfg.default_privacy,
                    duplicate_of=duplicate_of,
                    rdp_epsilon=cfg.track.rdp_epsilon,
                )

                if superseded_id is not None:
                    # NOTE: only the dedup index and the feed are updated here;
                    # the superseded activity's detail file on disk is not
                    # rewritten by this command.
                    dedup._records[superseded_id].duplicate_of = written_id
                    hidden_ids.add(superseded_id)

                # Register in dedup index (keeping the duplicate relationship
                # so later lookups skip already-resolved duplicates).
                dedup.register(ActivityRecord(
                    id=written_id,
                    source_hash=activity.source_hash,
                    started_at=activity.started_at,
                    distance_m=metrics.distance_m,
                    source=source,
                    duplicate_of=duplicate_of,
                ))

                if duplicate_of is None:
                    summaries.append(
                        build_summary(activity, metrics, written_id, cfg.default_privacy)
                    )
                else:
                    hidden_ids.add(written_id)

    # ── write index.json ──────────────────────────────────────────────────────
    # Merge with any existing summaries from previous incremental runs
    existing_index = _load_existing_summaries(cfg.output_dir)
    all_summaries = {s["id"]: s for s in existing_index}
    for s in summaries:
        all_summaries[s["id"]] = s
    # Duplicates are excluded from feeds, including entries left over from
    # earlier runs or added earlier in this run.
    for hidden_id in hidden_ids:
        all_summaries.pop(hidden_id, None)
    write_index(list(all_summaries.values()), cfg.output_dir, owner)
    dedup.save()

    # ── summary ───────────────────────────────────────────────────────────────
    console.print(
        f"\n[green]Done.[/green] "
        f"Processed [bold]{len(summaries)}[/bold] activities, "
        f"skipped [bold]{skipped}[/bold] (already up to date), "
        f"errors [bold]{len(errors)}[/bold]."
    )
    if errors:
        console.print("\n[red]Errors:[/red]")
        for path, msg in errors[:20]:
            console.print(f"  {path.name}: {msg}")
        if len(errors) > 20:
            console.print(f"  ... and {len(errors) - 20} more.")
def _infer_source(activity: ParsedActivity) -> Optional[str]:
    """Guess the BAS ``source`` value for a parsed activity.

    Returns ``"strava_export"`` when the activity carries a Strava ID,
    otherwise classifies by the (lower-cased) source file name, and returns
    ``None`` when the name matches no known pattern.
    """
    if activity.strava_id:
        return "strava_export"

    name = activity.source_file.lower()
    # Compression is irrelevant for classification; drop a trailing ".gz" so
    # it neither counts as a name segment nor hides the real extension.
    base = name.removesuffix(".gz")
    parts = base.split(".")

    # Karoo exports look like "<n>.activity.<id>.fit": "activity" is a whole
    # dot-delimited segment.  A substring match would also catch names such
    # as "123_activity.fit.gz" or "my.activity_log.gpx".
    if len(parts) >= 3 and "activity" in parts[:-1]:
        return "karoo"

    by_extension = {
        ".fit": "fit_file",
        ".gpx": "gpx_file",
        ".tcx": "tcx_file",
    }
    for extension, source in by_extension.items():
        if base.endswith(extension):
            return source
    return None
def load_config(path: Path) -> ExtractConfig:
    """Build an ``ExtractConfig`` from the YAML file at *path*.

    Missing keys fall back to the same defaults as the dataclasses.  An empty
    file, or a section that is present but empty (``input:`` with no value),
    is treated as "nothing configured" instead of failing.

    Raises:
        ValueError: if the document or one of its sections is not a mapping.
    """
    # safe_load returns None for an empty document.
    raw = yaml.safe_load(path.read_text(encoding="utf-8"))
    if raw is None:
        raw = {}
    if not isinstance(raw, dict):
        raise ValueError(f"{path}: top-level YAML value must be a mapping")

    def section(name: str) -> dict:
        """Return the mapping stored under *name*, or {} if absent/null."""
        value = raw.get(name)
        if value is None:
            return {}
        if not isinstance(value, dict):
            raise ValueError(f"{path}: '{name}' must be a mapping")
        return value

    inp = section("input")
    raw_dirs = inp.get("dirs") or []
    if isinstance(raw_dirs, str):
        # A scalar "dirs: ~/rides" would otherwise be iterated per character.
        raw_dirs = [raw_dirs]
    dirs = [Path(d).expanduser() for d in raw_dirs]
    csv_path = inp.get("metadata_csv")

    out = Path(section("output").get("dir") or "./bincio_data").expanduser()

    owner = section("owner")

    sensors_raw = section("sensors")
    sensors = SensorsConfig(
        heart_rate=sensors_raw.get("heart_rate", True),
        cadence=sensors_raw.get("cadence", True),
        temperature=sensors_raw.get("temperature", True),
        power=sensors_raw.get("power", True),
    )

    track_raw = section("track")
    track = TrackConfig(
        simplify=track_raw.get("simplify", "rdp"),
        rdp_epsilon=track_raw.get("rdp_epsilon", 0.0001),
        timeseries_hz=track_raw.get("timeseries_hz", 1),
    )

    cls_raw = section("classifier")
    classifier = ClassifierConfig(enabled=cls_raw.get("enabled", False))

    return ExtractConfig(
        input_dirs=dirs,
        output_dir=out,
        metadata_csv=Path(csv_path).expanduser() if csv_path else None,
        default_privacy=raw.get("default_privacy", "public"),
        sensors=sensors,
        track=track,
        classifier=classifier,
        incremental=raw.get("incremental", True),
        owner_handle=owner.get("handle", "me"),
        owner_display_name=owner.get("display_name", "Me"),
    )
@dataclass
class DedupIndex:
    """In-memory duplicate index, persisted as JSON in the output directory.

    Records are keyed by activity ID; a second map from source hash to ID
    serves exact-duplicate lookups.  The index file is read on construction
    and written back by :meth:`save`.
    """

    output_dir: Path
    _records: dict[str, ActivityRecord] = field(default_factory=dict)
    # source_hash → id, for exact-duplicate lookup
    _by_hash: dict[str, str] = field(default_factory=dict)

    def __post_init__(self) -> None:
        self._load()

    def _load(self) -> None:
        """Populate the index from the cache file, if it exists."""
        p = self.output_dir / _INDEX_FILE
        if not p.exists():
            return
        data = json.loads(p.read_text(encoding="utf-8"))
        for item in data.get("activities", []):
            started_at = datetime.fromisoformat(item["started_at"])
            r = ActivityRecord(
                id=item["id"],
                source_hash=item["source_hash"],
                started_at=started_at,
                distance_m=item.get("distance_m"),
                source=item.get("source"),
                duplicate_of=item.get("duplicate_of"),
            )
            self._records[r.id] = r
            self._by_hash[r.source_hash] = r.id

    def save(self) -> None:
        """Write every record to the cache file in the output directory."""
        p = self.output_dir / _INDEX_FILE
        data = {
            "activities": [
                {
                    "id": r.id,
                    "source_hash": r.source_hash,
                    "started_at": r.started_at.isoformat(),
                    "distance_m": r.distance_m,
                    "source": r.source,
                    "duplicate_of": r.duplicate_of,
                }
                for r in self._records.values()
            ]
        }
        p.write_text(json.dumps(data, indent=2), encoding="utf-8")

    def is_exact_duplicate(self, source_hash: str) -> Optional[str]:
        """Return existing activity ID if hash is already in the index."""
        return self._by_hash.get(source_hash)

    def find_near_duplicate(
        self,
        started_at: datetime,
        distance_m: Optional[float],
    ) -> Optional[str]:
        """Return ID of a near-duplicate if one exists.

        A record matches when it is not itself marked as a duplicate, its
        start time is less than 5 minutes away, and the distances differ by
        less than 5% of the larger one.  Records (or queries) without a
        distance never match.
        """
        for r in self._records.values():
            if r.duplicate_of is not None:
                continue  # skip already-marked duplicates
            if abs((r.started_at - started_at).total_seconds()) >= 5 * 60:
                continue
            if distance_m is None or r.distance_m is None:
                continue
            diff = abs(distance_m - r.distance_m)
            ref = max(distance_m, r.distance_m)
            # Identical distances always match; this also covers the 0 m /
            # 0 m case, where the relative difference would divide by zero.
            if diff == 0 or (ref > 0 and diff / ref < 0.05):
                return r.id
        return None

    def register(self, record: ActivityRecord) -> None:
        """Add *record* to the index, replacing any record with the same ID."""
        self._records[record.id] = record
        self._by_hash[record.source_hash] = record.id

    def pick_canonical(self, existing_id: str, new_source: Optional[str]) -> str:
        """Return the ID of whichever record should be canonical.

        Returns the sentinel ``"__new__"`` when the not-yet-registered record
        has a strictly higher source quality than the existing one; ties keep
        the existing record.  Unknown or missing sources rank as 0.
        """
        existing = self._records[existing_id]
        existing_q = _SOURCE_QUALITY.get(existing.source or "", 0)
        new_q = _SOURCE_QUALITY.get(new_source or "", 0)
        # New record is strictly better → existing becomes the duplicate
        if new_q > existing_q:
            return "__new__"
        return existing_id
+1,210 @@ +"""Compute aggregated metrics from a ParsedActivity. + +All calculations are self-contained — no external state needed. +""" + +import math +from dataclasses import dataclass +from datetime import datetime +from typing import Optional + +from geopy.distance import geodesic + +from bincio.extract.models import DataPoint, ParsedActivity + +# Speed below which we consider the athlete stopped (km/h) +_STOPPED_THRESHOLD_KMH = 1.0 + + +@dataclass +class ComputedMetrics: + distance_m: Optional[float] + duration_s: Optional[int] + moving_time_s: Optional[int] + elevation_gain_m: Optional[float] + elevation_loss_m: Optional[float] + avg_speed_kmh: Optional[float] + max_speed_kmh: Optional[float] + avg_hr_bpm: Optional[int] + max_hr_bpm: Optional[int] + avg_cadence_rpm: Optional[int] + avg_power_w: Optional[int] + max_power_w: Optional[int] + bbox: Optional[tuple[float, float, float, float]] # min_lon, min_lat, max_lon, max_lat + start_latlng: Optional[tuple[float, float]] + end_latlng: Optional[tuple[float, float]] + + +def compute(activity: ParsedActivity) -> ComputedMetrics: + pts = activity.points + if not pts: + return _empty() + + duration_s = _duration(pts) + distance_m = _distance(pts) + moving_time_s, moving_speed_kmh = _moving_stats(pts) + gain, loss = _elevation(pts) + max_speed = _max_speed(pts) + avg_hr, max_hr = _hr_stats(pts) + avg_cad = _avg_nonnull([p.cadence_rpm for p in pts]) + avg_pow = _avg_nonnull([p.power_w for p in pts]) + max_pow = _max_nonnull([p.power_w for p in pts]) + bbox = _bbox(pts) + start_ll, end_ll = _endpoints(pts) + + return ComputedMetrics( + distance_m=distance_m, + duration_s=duration_s, + moving_time_s=moving_time_s, + elevation_gain_m=round(gain, 1) if gain is not None else None, + elevation_loss_m=round(abs(loss), 1) if loss is not None else None, + avg_speed_kmh=round(moving_speed_kmh, 2) if moving_speed_kmh else None, + max_speed_kmh=round(max_speed, 2) if max_speed else None, + avg_hr_bpm=avg_hr, + max_hr_bpm=max_hr, + 
def _moving_stats(
    pts: list[DataPoint],
    stopped_threshold_kmh: Optional[float] = None,
) -> tuple[Optional[int], Optional[float]]:
    """Return ``(moving_time_s, avg_speed_kmh)`` measured over moving intervals only.

    Every pair of consecutive points is one interval.  Its speed is derived
    from the geodesic distance between the two fixes when both carry
    coordinates, and otherwise from the device speed stored on the first
    point.  Intervals slower than the stop threshold are treated as stopped
    and contribute neither time nor distance.

    Args:
        pts: Samples in recording order.
        stopped_threshold_kmh: Speed below which an interval counts as
            stopped.  Defaults to the module constant
            ``_STOPPED_THRESHOLD_KMH``.

    Returns:
        ``(moving_time_s, avg_speed_kmh)``, or ``(None, None)`` when no
        interval has GPS or speed data or the moving time rounds to zero.
    """
    threshold = (
        _STOPPED_THRESHOLD_KMH if stopped_threshold_kmh is None else stopped_threshold_kmh
    )
    moving_s = 0.0
    moving_dist_m = 0.0
    has_data = False

    for a, b in zip(pts, pts[1:]):
        dt = (b.timestamp - a.timestamp).total_seconds()
        if dt <= 0:
            # Duplicate or out-of-order timestamps cannot yield a speed.
            continue

        if a.lat is not None and a.lon is not None and b.lat is not None and b.lon is not None:
            has_data = True
            seg_m = geodesic((a.lat, a.lon), (b.lat, b.lon)).meters
            seg_kmh = (seg_m / dt) * 3.6
        elif a.speed_kmh is not None:
            has_data = True
            seg_kmh = a.speed_kmh
            seg_m = (seg_kmh / 3.6) * dt
        else:
            continue

        if seg_kmh >= threshold:
            # Accumulate fractional seconds.  Truncating each interval with
            # int(dt) discards sub-second samples while their distance is
            # still summed, which shrinks the time and inflates the average.
            moving_s += dt
            moving_dist_m += seg_m

    moving_time_s = round(moving_s)
    if not has_data or moving_time_s == 0:
        return None, None

    avg_kmh = (moving_dist_m / moving_s) * 3.6
    return moving_time_s, avg_kmh
def _bbox(pts: list[DataPoint]) -> Optional[tuple[float, float, float, float]]:
    """Return the bounding box ``(min_lon, min_lat, max_lon, max_lat)`` of the track.

    Only points that carry both a latitude and a longitude are considered,
    the same rule ``_endpoints`` applies.  Filtering the two coordinates
    independently would crash on ``min([])`` when a point has a latitude but
    no longitude, or build a box from coordinates of different points.

    Returns ``None`` when no point has a complete fix.
    """
    fixes = [(p.lat, p.lon) for p in pts if p.lat is not None and p.lon is not None]
    if not fixes:
        return None
    lats = [lat for lat, _ in fixes]
    lons = [lon for _, lon in fixes]
    return (min(lons), min(lats), max(lons), max(lats))
moving_time_s=None, + elevation_gain_m=None, elevation_loss_m=None, + avg_speed_kmh=None, max_speed_kmh=None, + avg_hr_bpm=None, max_hr_bpm=None, + avg_cadence_rpm=None, avg_power_w=None, max_power_w=None, + bbox=None, start_latlng=None, end_latlng=None, + ) diff --git a/bincio/extract/models.py b/bincio/extract/models.py new file mode 100644 index 0000000..253bb38 --- /dev/null +++ b/bincio/extract/models.py @@ -0,0 +1,58 @@ +"""Core data models for the extract stage. + +ParsedActivity is the internal representation produced by parsers. +It gets fed into metrics computation and the BAS JSON writer. +""" + +from dataclasses import dataclass, field +from datetime import datetime +from typing import Optional + + +@dataclass +class DataPoint: + """One measurement sample from a GPS/sensor recording.""" + + timestamp: datetime + lat: Optional[float] = None + lon: Optional[float] = None + elevation_m: Optional[float] = None + hr_bpm: Optional[int] = None + cadence_rpm: Optional[int] = None + # Speed from device (km/h). May be absent; we compute it from GPS if so. + speed_kmh: Optional[float] = None + power_w: Optional[int] = None + temperature_c: Optional[float] = None + # Cumulative distance from device (metres), if recorded. 
+ distance_m: Optional[float] = None + + +@dataclass +class LapData: + index: int + started_at: datetime + duration_s: Optional[int] = None + distance_m: Optional[float] = None + elevation_gain_m: Optional[float] = None + avg_speed_kmh: Optional[float] = None + avg_hr_bpm: Optional[int] = None + avg_power_w: Optional[int] = None + + +@dataclass +class ParsedActivity: + """Raw activity data as produced by a parser, before metric computation.""" + + points: list[DataPoint] + sport: str # normalised to BAS sport enum + started_at: datetime + source_file: str # basename of original file + source_hash: str # "sha256:{hex}" + + sub_sport: Optional[str] = None + device: Optional[str] = None + title: Optional[str] = None + description: Optional[str] = None + gear: Optional[str] = None + strava_id: Optional[str] = None + laps: list[LapData] = field(default_factory=list) diff --git a/bincio/extract/parsers/__init__.py b/bincio/extract/parsers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bincio/extract/parsers/base.py b/bincio/extract/parsers/base.py new file mode 100644 index 0000000..c68f424 --- /dev/null +++ b/bincio/extract/parsers/base.py @@ -0,0 +1,34 @@ +"""Abstract base class for all activity parsers.""" + +import gzip +import hashlib +from abc import ABC, abstractmethod +from pathlib import Path + +from bincio.extract.models import ParsedActivity + + +class BaseParser(ABC): + @abstractmethod + def parse(self, path: Path, raw_bytes: bytes) -> ParsedActivity: + """Parse activity from raw file bytes. + + Receives pre-read bytes so the factory can compute the hash once and + handle decompression transparently before dispatching. + """ + + @staticmethod + def _sha256(data: bytes) -> str: + return "sha256:" + hashlib.sha256(data).hexdigest() + + @staticmethod + def _read_file(path: Path) -> tuple[bytes, bytes]: + """Return (raw_bytes, decompressed_bytes). + + raw_bytes is the original file content (used for hashing). 
def _base_ext(path: Path) -> str:
    """Return the lower-cased meaningful extension, stripping ``.gz`` if present.

    The result is lower-cased so that names with upper-case extensions such
    as ``RIDE.FIT`` select the same parser as ``ride.fit``.  The ``.gz`` test
    itself stays an exact match, because ``BaseParser._read_file`` only
    decompresses files whose suffix is exactly ``.gz``.
    """
    if path.suffix == ".gz":
        return Path(path.stem).suffix.lower()  # e.g. ".fit" from "ride.FIT.gz"
    return path.suffix.lower()


def is_supported(path: Path) -> bool:
    """Return True when *path* has an extension listed in ``SUPPORTED``.

    The comparison ignores the case of the activity extension, matching
    ``_base_ext``, so a file accepted here is also dispatched by
    ``parse_file``.
    """
    suffix = "".join(path.suffixes[-2:]) if path.suffix == ".gz" else path.suffix
    return suffix.lower() in SUPPORTED
def _semicircles_to_deg(value: Any) -> float | None:
    """Convert a FIT position in semicircles to decimal degrees.

    Returns ``None`` for a missing value, for anything that cannot be read
    as a number, and for values outside the usable signed 32-bit range.

    The range check is done on the raw value rather than on the converted
    degrees: every sint32 maps into [-180, 180), so a test such as
    ``abs(deg) > 180`` can never reject anything.  In particular the FIT
    "invalid" sentinel 0x7FFFFFFF would come out as 179.99999992 degrees.
    """
    if value is None:
        return None
    try:
        raw = float(value)
    except (TypeError, ValueError):
        return None
    # Written as a chained comparison so NaN is rejected as well.
    if not -(2**31) <= raw < 2**31 - 1:
        return None
    return raw * (180.0 / 2**31)


def _normalise_sub_sport(value: Any) -> str | None:
    """Map a FIT ``sub_sport`` value onto the BAS sub-sport vocabulary.

    Returns ``None`` for a missing value and for any sub-sport without a BAS
    equivalent (for example ``"generic"``).  The BAS schema only allows
    road, mountain, gravel, indoor, trail, track or null, so passing an
    unmapped name through would produce an invalid document.
    """
    if value is None:
        return None
    key = str(value).lower().replace(" ", "_")
    mapping = {
        "road": "road",
        "mountain": "mountain",
        "gravel_cycling": "gravel",
        "cyclocross": "gravel",
        "indoor_cycling": "indoor",
        "trail": "trail",
        "track": "track",
    }
    return mapping.get(key)
gpxpy.gpx.GPXTrackPoint, dp: DataPoint) -> None: + """Extract HR, cadence, temperature from Garmin TrackPointExtension.""" + if pt.extensions is None: + return + for ext in pt.extensions: + ns = _strip_ns(ext.tag) + if ns == "TrackPointExtension": + for child in ext: + tag = _strip_ns(child.tag) + val = child.text + if val is None: + continue + if tag == "hr": + dp.hr_bpm = int(float(val)) + elif tag == "cad": + dp.cadence_rpm = int(float(val)) + elif tag == "atemp": + dp.temperature_c = float(val) + elif tag == "speed": + dp.speed_kmh = float(val) * 3.6 # m/s → km/h + + +def _strip_ns(tag: str) -> str: + """'{namespace}localname' → 'localname'.""" + return tag.split("}")[-1] if "}" in tag else tag diff --git a/bincio/extract/parsers/tcx.py b/bincio/extract/parsers/tcx.py new file mode 100644 index 0000000..bda8998 --- /dev/null +++ b/bincio/extract/parsers/tcx.py @@ -0,0 +1,89 @@ +"""TCX (Training Center XML) file parser.""" + +from datetime import datetime, timezone +from pathlib import Path + +from lxml import etree + +from bincio.extract.models import DataPoint, ParsedActivity +from bincio.extract.sport import normalise_sport + +_NS = { + "tcx": "http://www.garmin.com/xmlschemas/TrainingCenterDatabase/v2", + "ext": "http://www.garmin.com/xmlschemas/ActivityExtension/v2", +} + + +class TcxParser: + def parse(self, path: Path, raw_bytes: bytes) -> ParsedActivity: + # Some exporters (e.g. Garmin) prepend whitespace before the XML + # declaration, which is technically invalid. Strip it. 
def _parse_ts(s: str) -> datetime:
    """Parse a TCX ``<Time>`` value into a timezone-aware datetime.

    Accepts ISO 8601 timestamps with or without fractional seconds, with a
    trailing ``Z``, with an explicit UTC offset such as ``+02:00``, or with
    no zone at all.  Surrounding whitespace is ignored.  A timestamp without
    zone information is taken to be UTC; an explicit offset is preserved.

    Raises:
        ValueError: if the text is not a recognisable timestamp.
    """
    text = s.strip()
    # Spell "Z" as an explicit offset so fromisoformat accepts it on every
    # Python version, not only 3.11+.
    iso = text[:-1] + "+00:00" if text[-1:] in ("Z", "z") else text
    try:
        parsed = datetime.fromisoformat(iso)
    except ValueError:
        # Fall back to the explicit formats for shapes fromisoformat may
        # refuse, e.g. an unusual number of fractional digits.
        naive = text.rstrip("Zz")
        for fmt in ("%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S"):
            try:
                parsed = datetime.strptime(naive, fmt)
                break
            except ValueError:
                continue
        else:
            raise ValueError(f"Cannot parse timestamp: {s!r}") from None

    if parsed.tzinfo is None:
        return parsed.replace(tzinfo=timezone.utc)
    return parsed
# Keys are lower-case with words joined by underscores; normalise_sport()
# brings its input into that shape before looking it up.
_MAPPING: dict[str, str] = {
    # cycling variants
    "cycling": "cycling",
    "biking": "cycling",
    "bike": "cycling",
    "road_biking": "cycling",
    "mountain_biking": "cycling",
    "gravel_cycling": "cycling",
    "cyclocross": "cycling",
    "indoor_cycling": "cycling",
    "virtual_ride": "cycling",
    "e_biking": "cycling",
    # running
    "running": "running",
    "run": "running",
    "trail_running": "running",
    "treadmill_running": "running",
    "virtual_run": "running",
    # hiking
    "hiking": "hiking",
    "hike": "hiking",
    # walking
    "walking": "walking",
    "walk": "walking",
    # swimming
    "swimming": "swimming",
    "swim": "swimming",
    "open_water_swimming": "swimming",
}

BAS_SPORTS = {"cycling", "running", "hiking", "walking", "swimming", "other"}


def normalise_sport(raw: object) -> str:
    """Map a source-specific sport name onto the BAS sport enum.

    The lookup ignores case and surrounding whitespace, and treats spaces,
    hyphens and underscores as the same separator, so ``"E-Biking"``,
    ``"e_biking"`` and ``"e biking"`` all resolve to the same entry.

    Returns ``"other"`` for ``None`` and for any name without a mapping.
    """
    if raw is None:
        return "other"
    key = str(raw).lower().strip().replace(" ", "_").replace("-", "_")
    return _MAPPING.get(key, "other")
class StravaMetadata:
    """Maps original filename → Strava metadata.

    Rows of the Strava ``activities.csv`` export are indexed by the basename
    of their ``Filename`` column; rows without a filename are ignored.
    """

    def __init__(self, csv_path: Path) -> None:
        self._by_filename: dict[str, dict] = {}
        self._load(csv_path)

    def _load(self, path: Path) -> None:
        """Read the CSV at *path* and index its rows by file basename."""
        # utf-8-sig transparently drops a leading byte-order mark.
        with path.open(newline="", encoding="utf-8-sig") as f:
            for row in csv.DictReader(f):
                # DictReader fills columns missing from a short row with
                # None, so the value cannot be assumed to be a string.
                filename = (row.get("Filename") or "").strip()
                if not filename:
                    continue
                # Strava stores paths like "activities/12345.fit.gz"
                self._by_filename[Path(filename).name] = row

    def lookup(self, source_file: str) -> Optional[dict]:
        """Return the Strava CSV row for a given source filename, or None."""
        return self._by_filename.get(source_file)

    def enrich(self, source_file: str, activity: object) -> None:
        """Fill empty ``title``/``description``/``strava_id`` on *activity*.

        Values already set on the activity are never overwritten, and blank
        CSV cells are ignored.  Does nothing when *source_file* has no row.
        """
        row = self.lookup(source_file)
        if row is None:
            return

        columns = (
            ("title", "Activity Name"),
            ("description", "Activity Description"),
            ("strava_id", "Activity ID"),
        )
        for attr, column in columns:
            value = (row.get(column) or "").strip()
            if value and not getattr(activity, attr, None):
                setattr(activity, attr, value)
def build_timeseries(
    points: list[DataPoint],
    started_at: datetime,
    privacy: str = "public",
) -> dict:
    """Return the BAS `timeseries` object as parallel arrays.

    Samples are reduced to at most one per whole second, and ``t`` is
    strictly increasing: a sample is dropped when it falls before
    *started_at*, in a second that already has a sample, or earlier than the
    last kept sample (out-of-order input).

    Args:
        points: Samples in recording order.
        started_at: Reference instant; ``t`` is seconds elapsed since it.
        privacy: With ``"no_gps"`` or ``"private"`` the ``lat`` and ``lon``
            entries are ``None`` instead of arrays.

    Returns:
        A dict of equally long arrays keyed by channel name, or
        ``{"t": []}`` when *points* is empty.
    """
    if not points:
        return {"t": []}

    include_gps = privacy not in ("no_gps", "private")

    def _rounded(value: Optional[float], ndigits: int) -> Optional[float]:
        return round(value, ndigits) if value is not None else None

    offsets: list[int] = []
    sampled: list[DataPoint] = []
    for p in points:
        t = int((p.timestamp - started_at).total_seconds())
        if t < 0:
            continue
        # "<=" rather than "==": also drops samples that jump back in time,
        # which would otherwise repeat a second or make t non-monotonic.
        if offsets and t <= offsets[-1]:
            continue
        offsets.append(t)
        sampled.append(p)

    return {
        "t": offsets,
        "lat": [_rounded(p.lat, 7) for p in sampled] if include_gps else None,
        "lon": [_rounded(p.lon, 7) for p in sampled] if include_gps else None,
        "elevation_m": [_rounded(p.elevation_m, 1) for p in sampled],
        "speed_kmh": [_rounded(p.speed_kmh, 2) for p in sampled],
        "hr_bpm": [p.hr_bpm for p in sampled],
        "cadence_rpm": [p.cadence_rpm for p in sampled],
        "power_w": [p.power_w for p in sampled],
        "temperature_c": [_rounded(p.temperature_c, 1) for p in sampled],
    }
def _write_json(path: Path, payload: dict) -> None:
    """Serialise *payload* to *path* as pretty-printed UTF-8 JSON."""
    # ensure_ascii=False emits non-ASCII characters verbatim, so the encoding
    # must be pinned: without it write_text() uses the locale encoding and
    # the file is not guaranteed to be the UTF-8 that BAS requires.
    path.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")


def write_activity(
    activity: ParsedActivity,
    metrics: ComputedMetrics,
    output_dir: Path,
    privacy: str = "public",
    duplicate_of: str | None = None,
    rdp_epsilon: float = 0.0001,
) -> str:
    """Write {id}.json and (if GPS available) {id}.geojson. Returns the ID.

    Files go to ``output_dir / "activities"``, which is created on demand.
    The GeoJSON track is written only when the metrics carry a bounding box
    and *privacy* is neither ``"no_gps"`` nor ``"private"``.

    Args:
        activity: Parsed activity to serialise.
        metrics: Aggregates previously computed for *activity*.
        output_dir: Root of the BAS data store.
        privacy: BAS privacy level recorded in the document.
        duplicate_of: ID of the activity this one duplicates, if any.
        rdp_epsilon: Simplification tolerance passed to ``build_geojson``.
    """
    activity_id = make_activity_id(activity)
    acts_dir = output_dir / "activities"
    acts_dir.mkdir(parents=True, exist_ok=True)

    source = _infer_source(activity)
    has_gps = metrics.bbox is not None and privacy not in ("no_gps", "private")

    # ── detail JSON ──────────────────────────────────────────────────────────
    detail: dict = {
        "bas_version": "1.0",
        "id": activity_id,
        "title": activity.title or _auto_title(activity),
        "description": activity.description,
        "sport": activity.sport,
        "sub_sport": activity.sub_sport,
        "started_at": activity.started_at.isoformat(),
        "distance_m": metrics.distance_m,
        "duration_s": metrics.duration_s,
        "moving_time_s": metrics.moving_time_s,
        "elevation_gain_m": metrics.elevation_gain_m,
        "elevation_loss_m": metrics.elevation_loss_m,
        "avg_speed_kmh": metrics.avg_speed_kmh,
        "max_speed_kmh": metrics.max_speed_kmh,
        "avg_hr_bpm": metrics.avg_hr_bpm,
        "max_hr_bpm": metrics.max_hr_bpm,
        "avg_cadence_rpm": metrics.avg_cadence_rpm,
        "avg_power_w": metrics.avg_power_w,
        "max_power_w": metrics.max_power_w,
        "gear": activity.gear,
        "device": activity.device,
        "bbox": list(metrics.bbox) if metrics.bbox else None,
        "start_latlng": list(metrics.start_latlng) if metrics.start_latlng else None,
        "end_latlng": list(metrics.end_latlng) if metrics.end_latlng else None,
        "laps": [_serialise_lap(lap) for lap in activity.laps],
        "timeseries": build_timeseries(activity.points, activity.started_at, privacy),
        "source": source,
        "source_file": activity.source_file,
        "source_hash": activity.source_hash,
        "strava_id": activity.strava_id,
        "duplicate_of": duplicate_of,
        "privacy": privacy,
        "custom": {},
    }
    _write_json(acts_dir / f"{activity_id}.json", detail)

    # ── GeoJSON track ────────────────────────────────────────────────────────
    if has_gps:
        geojson = build_geojson(activity.points, activity_id, epsilon=rdp_epsilon)
        _write_json(acts_dir / f"{activity_id}.geojson", geojson)

    return activity_id


def build_summary(
    activity: ParsedActivity,
    metrics: ComputedMetrics,
    activity_id: str,
    privacy: str = "public",
) -> dict:
    """Build the Activity Summary object for index.json.

    ``track_url`` is ``None`` under the same conditions in which
    ``write_activity`` skips the GeoJSON file.
    """
    has_gps = metrics.bbox is not None and privacy not in ("no_gps", "private")
    return {
        "id": activity_id,
        "title": activity.title or _auto_title(activity),
        "sport": activity.sport,
        "sub_sport": activity.sub_sport,
        "started_at": activity.started_at.isoformat(),
        "distance_m": metrics.distance_m,
        "duration_s": metrics.duration_s,
        "moving_time_s": metrics.moving_time_s,
        "elevation_gain_m": metrics.elevation_gain_m,
        "avg_speed_kmh": metrics.avg_speed_kmh,
        "max_speed_kmh": metrics.max_speed_kmh,
        "avg_hr_bpm": metrics.avg_hr_bpm,
        "max_hr_bpm": metrics.max_hr_bpm,
        "avg_cadence_rpm": metrics.avg_cadence_rpm,
        "avg_power_w": metrics.avg_power_w,
        "source": _infer_source(activity),
        "privacy": privacy,
        "detail_url": f"activities/{activity_id}.json",
        "track_url": f"activities/{activity_id}.geojson" if has_gps else None,
    }


def write_index(summaries: list[dict], output_dir: Path, owner: dict) -> None:
    """Write index.json (sorted newest first).

    Summaries are ordered by their ``started_at`` string, descending.
    *output_dir* must already exist.
    """
    sorted_summaries = sorted(
        summaries,
        key=lambda s: s["started_at"],
        reverse=True,
    )
    index = {
        "bas_version": "1.0",
        "owner": owner,
        "generated_at": _now_iso(),
        "shards": [],
        "activities": sorted_summaries,
    }
    _write_json(output_dir / "index.json", index)


# ── helpers ──────────────────────────────────────────────────────────────────

def _now_iso() -> str:
    """Return the current UTC time as an ISO 8601 string with offset."""
    from datetime import datetime, timezone
    return datetime.now(timezone.utc).isoformat()
def _slugify(text: str) -> str:
    """Return a lower-case ASCII slug of *text*, at most 60 characters long.

    Accented letters are reduced to their base letter, characters with no
    ASCII form are dropped, and every run of other characters becomes a
    single hyphen.  The slug never starts or ends with a hyphen; it is empty
    when *text* contains no ASCII letters or digits.
    """
    # NFKD splits accented letters into base letter + combining mark, so the
    # ASCII encode below keeps the base letter and drops only the mark.
    text = unicodedata.normalize("NFKD", text)
    text = text.encode("ascii", "ignore").decode("ascii")
    text = re.sub(r"[^a-z0-9]+", "-", text.lower())
    # Trim again after truncating: the 60-character cut can land right after
    # a separator and would otherwise leave a trailing hyphen.
    return text.strip("-")[:60].rstrip("-")
help="Deploy target: 'github'.") +def render(config_path: str, out_dir: str, serve: bool, deploy: str | None) -> None: + """Generate static site from BAS data store (Astro stage — coming soon).""" + console.print("[yellow]bincio render is not yet implemented.[/yellow]") + console.print("The web renderer (Astro + MapLibre + Observable Plot) is next.") diff --git a/extract_config.example.yaml b/extract_config.example.yaml new file mode 100644 index 0000000..2f608bc --- /dev/null +++ b/extract_config.example.yaml @@ -0,0 +1,32 @@ +owner: + handle: brutsalvadi + display_name: Bru + +input: + dirs: + - ~/src/cycling_data_davide/activities + - ~/src/cycling_data_davide/Karoo_2026 + - ~/src/cycling_data_davide/Karoo + # Strava bulk export metadata — provides names, descriptions, gear + metadata_csv: ~/src/cycling_data_davide/activities.csv + +output: + dir: ~/bincio_data + +default_privacy: public + +sensors: + heart_rate: true + cadence: true + temperature: true + power: true + +track: + simplify: rdp + rdp_epsilon: 0.0001 # ~11m at equator + timeseries_hz: 1 # 1 sample/second max + +classifier: + enabled: false # ML activity type classifier (requires scikit-learn extra) + +incremental: true # skip files whose hash hasn't changed since last run diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..66a7e88 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,76 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "bincio" +version = "0.1.0" +description = "Federated, open-source, self-hosted activity stats platform" +readme = "README.md" +requires-python = ">=3.12" +license = { text = "MIT" } +authors = [{ name = "Davide Brugali" }] + +dependencies = [ + # Parsing + "gpxpy>=1.6", + "fitdecode>=0.11", + "lxml>=5.0", # TCX (XML) + # Data + "pandas>=2.2", + # Geo + "geopy>=2.4", + "rdp>=0.8", + # Config & CLI + "pyyaml>=6.0", + "click>=8.1", + "rich>=13.0", # pretty console output + # Schema validation + 
"jsonschema>=4.23", +] + +[project.optional-dependencies] +classifier = [ + "scikit-learn>=1.5", +] +dev = [ + "pytest>=9.0", + "pytest-cov>=5.0", + "ruff>=0.9", + "mypy>=1.11", + "types-pyyaml", + "types-jsonschema", +] + +[project.scripts] +bincio = "bincio.cli:main" + +[dependency-groups] +dev = [ + "pytest>=9.0", + "pytest-cov>=5.0", + "ruff>=0.9", + "mypy>=1.11", + "types-pyyaml", + "types-jsonschema", +] + +[tool.ruff] +line-length = 100 +target-version = "py312" + +[tool.ruff.lint] +select = ["E", "F", "I", "UP", "B", "SIM"] +ignore = ["E501"] + +[tool.mypy] +python_version = "3.12" +strict = true +ignore_missing_imports = true + +[tool.pytest.ini_options] +testpaths = ["tests"] +markers = [ + "slow: marks tests as slow (deselect with '-m \"not slow\"')", + "integration: marks tests requiring real activity files", +] diff --git a/schema/bas-v1.schema.json b/schema/bas-v1.schema.json new file mode 100644 index 0000000..add0b8c --- /dev/null +++ b/schema/bas-v1.schema.json @@ -0,0 +1,180 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://github.com/bincio-activity/bincio/blob/main/schema/bas-v1.schema.json", + "title": "BincioActivity Schema v1.0", + "description": "Schema for BincioActivity (BAS) data files.", + "$defs": { + "sport": { + "type": "string", + "enum": ["cycling", "running", "hiking", "walking", "swimming", "other"] + }, + "sub_sport": { + "type": ["string", "null"], + "enum": ["road", "mountain", "gravel", "indoor", "trail", "track", null] + }, + "privacy": { + "type": "string", + "enum": ["public", "blur_start", "no_gps", "private"] + }, + "source": { + "type": ["string", "null"], + "enum": [ + "strava_export", "garmin_connect", "wahoo", "komoot", + "gpx_file", "fit_file", "tcx_file", "karoo", "manual", null + ] + }, + "latlng": { + "type": ["array", "null"], + "items": { "type": "number" }, + "minItems": 2, + "maxItems": 2 + }, + "bbox": { + "type": ["array", "null"], + "items": { "type": "number" }, + 
"minItems": 4, + "maxItems": 4, + "description": "[min_lon, min_lat, max_lon, max_lat]" + }, + "lap": { + "type": "object", + "required": ["index", "started_at", "duration_s"], + "properties": { + "index": { "type": "integer", "minimum": 0 }, + "started_at": { "type": "string", "format": "date-time" }, + "duration_s": { "type": ["integer", "null"] }, + "distance_m": { "type": ["number", "null"] }, + "elevation_gain_m": { "type": ["number", "null"] }, + "avg_speed_kmh": { "type": ["number", "null"] }, + "avg_hr_bpm": { "type": ["integer", "null"] }, + "avg_power_w": { "type": ["integer", "null"] } + }, + "additionalProperties": false + }, + "timeseries": { + "type": "object", + "required": ["t"], + "properties": { + "t": { "type": "array", "items": { "type": "integer" } }, + "lat": { "type": ["array", "null"], "items": { "type": ["number", "null"] } }, + "lon": { "type": ["array", "null"], "items": { "type": ["number", "null"] } }, + "elevation_m": { "type": "array", "items": { "type": ["number", "null"] } }, + "speed_kmh": { "type": "array", "items": { "type": ["number", "null"] } }, + "hr_bpm": { "type": "array", "items": { "type": ["integer", "null"] } }, + "cadence_rpm": { "type": "array", "items": { "type": ["integer", "null"] } }, + "power_w": { "type": "array", "items": { "type": ["integer", "null"] } }, + "temperature_c": { "type": "array", "items": { "type": ["number", "null"] } } + }, + "additionalProperties": false + }, + "activity_summary": { + "type": "object", + "required": ["id", "title", "sport", "started_at", "privacy"], + "properties": { + "id": { "type": "string", "minLength": 1 }, + "title": { "type": "string" }, + "sport": { "$ref": "#/$defs/sport" }, + "sub_sport": { "$ref": "#/$defs/sub_sport" }, + "started_at": { "type": "string", "format": "date-time" }, + "distance_m": { "type": ["number", "null"] }, + "duration_s": { "type": ["integer", "null"] }, + "moving_time_s": { "type": ["integer", "null"] }, + "elevation_gain_m": { "type": 
["number", "null"] }, + "avg_speed_kmh": { "type": ["number", "null"] }, + "max_speed_kmh": { "type": ["number", "null"] }, + "avg_hr_bpm": { "type": ["integer", "null"] }, + "max_hr_bpm": { "type": ["integer", "null"] }, + "avg_cadence_rpm": { "type": ["integer", "null"] }, + "avg_power_w": { "type": ["integer", "null"] }, + "source": { "$ref": "#/$defs/source" }, + "privacy": { "$ref": "#/$defs/privacy" }, + "detail_url": { "type": ["string", "null"] }, + "track_url": { "type": ["string", "null"] } + }, + "additionalProperties": false + } + }, + "oneOf": [ + { + "title": "index.json", + "type": "object", + "required": ["bas_version", "owner", "generated_at", "activities"], + "properties": { + "bas_version": { "type": "string", "const": "1.0" }, + "owner": { + "type": "object", + "required": ["handle", "display_name"], + "properties": { + "handle": { "type": "string", "minLength": 1 }, + "display_name": { "type": "string" }, + "avatar_url": { "type": ["string", "null"] } + }, + "additionalProperties": false + }, + "generated_at": { "type": "string", "format": "date-time" }, + "shards": { + "type": "array", + "items": { + "type": "object", + "required": ["year", "url", "count"], + "properties": { + "year": { "type": "integer" }, + "url": { "type": "string" }, + "count": { "type": "integer" } + }, + "additionalProperties": false + } + }, + "activities": { + "type": "array", + "items": { "$ref": "#/$defs/activity_summary" } + } + }, + "additionalProperties": false + }, + { + "title": "activities/{id}.json", + "type": "object", + "required": [ + "bas_version", "id", "title", "sport", "started_at", + "privacy", "laps", "timeseries", "custom" + ], + "properties": { + "bas_version": { "type": "string", "const": "1.0" }, + "id": { "type": "string", "minLength": 1 }, + "title": { "type": "string" }, + "description": { "type": ["string", "null"] }, + "sport": { "$ref": "#/$defs/sport" }, + "sub_sport": { "$ref": "#/$defs/sub_sport" }, + "started_at": { "type": "string", 
"format": "date-time" }, + "distance_m": { "type": ["number", "null"] }, + "duration_s": { "type": ["integer", "null"] }, + "moving_time_s": { "type": ["integer", "null"] }, + "elevation_gain_m": { "type": ["number", "null"] }, + "elevation_loss_m": { "type": ["number", "null"] }, + "avg_speed_kmh": { "type": ["number", "null"] }, + "max_speed_kmh": { "type": ["number", "null"] }, + "avg_hr_bpm": { "type": ["integer", "null"] }, + "max_hr_bpm": { "type": ["integer", "null"] }, + "avg_cadence_rpm": { "type": ["integer", "null"] }, + "avg_power_w": { "type": ["integer", "null"] }, + "max_power_w": { "type": ["integer", "null"] }, + "gear": { "type": ["string", "null"] }, + "device": { "type": ["string", "null"] }, + "bbox": { "$ref": "#/$defs/bbox" }, + "start_latlng": { "$ref": "#/$defs/latlng" }, + "end_latlng": { "$ref": "#/$defs/latlng" }, + "laps": { "type": "array", "items": { "$ref": "#/$defs/lap" } }, + "timeseries": { "$ref": "#/$defs/timeseries" }, + "source": { "$ref": "#/$defs/source" }, + "source_file": { "type": ["string", "null"] }, + "source_hash": { "type": ["string", "null"] }, + "strava_id": { "type": ["string", "null"] }, + "duplicate_of": { "type": ["string", "null"] }, + "privacy": { "$ref": "#/$defs/privacy" }, + "custom": { "type": "object" } + }, + "additionalProperties": false + } + ] +} diff --git a/site/astro.config.mjs b/site/astro.config.mjs new file mode 100644 index 0000000..6af1db6 --- /dev/null +++ b/site/astro.config.mjs @@ -0,0 +1,10 @@ +import { defineConfig } from "astro/config"; +import svelte from "@astrojs/svelte"; +import tailwind from "@astrojs/tailwind"; + +export default defineConfig({ + integrations: [svelte(), tailwind()], + output: "static", + // When hosting at a subdirectory (e.g. 
GitHub Pages project site), set: + // base: "/repo-name", +}); diff --git a/site/package.json b/site/package.json new file mode 100644 index 0000000..c0f6980 --- /dev/null +++ b/site/package.json @@ -0,0 +1,25 @@ +{ + "name": "bincio-site", + "type": "module", + "version": "0.1.0", + "private": true, + "scripts": { + "dev": "astro dev", + "build": "astro build", + "preview": "astro preview", + "astro": "astro" + }, + "dependencies": { + "@astrojs/svelte": "^7.0.0", + "@astrojs/tailwind": "^5.1.0", + "astro": "^5.0.0", + "maplibre-gl": "^5.0.0", + "@observablehq/plot": "^0.6.0", + "svelte": "^5.0.0", + "tailwindcss": "^3.4.0" + }, + "devDependencies": { + "@types/node": "^22.0.0", + "typescript": "^5.7.0" + } +} diff --git a/site/src/lib/types.ts b/site/src/lib/types.ts new file mode 100644 index 0000000..0198533 --- /dev/null +++ b/site/src/lib/types.ts @@ -0,0 +1,74 @@ +/** TypeScript types mirroring BAS v1.0 schema. */ + +export type Sport = "cycling" | "running" | "hiking" | "walking" | "swimming" | "other"; +export type SubSport = "road" | "mountain" | "gravel" | "indoor" | "trail" | "track" | null; +export type Privacy = "public" | "blur_start" | "no_gps" | "private"; + +export interface ActivitySummary { + id: string; + title: string; + sport: Sport; + sub_sport: SubSport; + started_at: string; // ISO 8601 + distance_m: number | null; + duration_s: number | null; + moving_time_s: number | null; + elevation_gain_m: number | null; + avg_speed_kmh: number | null; + max_speed_kmh: number | null; + avg_hr_bpm: number | null; + max_hr_bpm: number | null; + avg_cadence_rpm: number | null; + avg_power_w: number | null; + source: string | null; + privacy: Privacy; + detail_url: string | null; + track_url: string | null; +} + +export interface BASIndex { + bas_version: string; + owner: { handle: string; display_name: string; avatar_url: string | null }; + generated_at: string; + shards: Array<{ year: number; url: string; count: number }>; + activities: 
ActivitySummary[]; +} + +export interface Timeseries { + t: number[]; + lat: number[] | null; + lon: number[] | null; + elevation_m: (number | null)[]; + speed_kmh: (number | null)[]; + hr_bpm: (number | null)[]; + cadence_rpm: (number | null)[]; + power_w: (number | null)[]; + temperature_c: (number | null)[]; +} + +export interface ActivityDetail extends ActivitySummary { + description: string | null; + elevation_loss_m: number | null; + max_power_w: number | null; + gear: string | null; + device: string | null; + bbox: [number, number, number, number] | null; + start_latlng: [number, number] | null; + end_latlng: [number, number] | null; + laps: Lap[]; + timeseries: Timeseries; + strava_id: string | null; + duplicate_of: string | null; + custom: Record<string, unknown>; +} + +export interface Lap { + index: number; + started_at: string; + duration_s: number | null; + distance_m: number | null; + elevation_gain_m: number | null; + avg_speed_kmh: number | null; + avg_hr_bpm: number | null; + avg_power_w: number | null; +} diff --git a/site/tailwind.config.mjs b/site/tailwind.config.mjs new file mode 100644 index 0000000..0986ab7 --- /dev/null +++ b/site/tailwind.config.mjs @@ -0,0 +1,18 @@ +/** @type {import('tailwindcss').Config} */ +export default { + content: ["./src/**/*.{astro,html,js,jsx,md,mdx,svelte,ts,tsx,vue}"], + darkMode: "class", + theme: { + extend: { + colors: { + // BincioActivity accent — override via CSS variable in site_config + accent: "var(--color-accent, #00c8ff)", + }, + fontFamily: { + sans: ["Inter", "system-ui", "sans-serif"], + mono: ["JetBrains Mono", "monospace"], + }, + }, + }, + plugins: [], +}; diff --git a/site/tsconfig.json b/site/tsconfig.json new file mode 100644 index 0000000..e749c65 --- /dev/null +++ b/site/tsconfig.json @@ -0,0 +1,10 @@ +{ + "extends": "astro/tsconfigs/strict", + "compilerOptions": { + "baseUrl": ".", + "paths": { + "@lib/*": ["src/lib/*"], + "@components/*": ["src/components/*"] + } + } +} diff --git a/tests/__init__.py 
b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_sport.py b/tests/test_sport.py new file mode 100644 index 0000000..7fd615d --- /dev/null +++ b/tests/test_sport.py @@ -0,0 +1,16 @@ +from bincio.extract.sport import normalise_sport + + +def test_cycling_variants(): + for raw in ("cycling", "Biking", "road_biking", "virtual_ride", "e-biking"): + assert normalise_sport(raw) == "cycling", raw + + +def test_running_variants(): + for raw in ("running", "Run", "trail_running", "virtual_run"): + assert normalise_sport(raw) == "running", raw + + +def test_unknown_falls_back_to_other(): + assert normalise_sport("yoga") == "other" + assert normalise_sport(None) == "other" diff --git a/tests/test_writer.py b/tests/test_writer.py new file mode 100644 index 0000000..ae5a371 --- /dev/null +++ b/tests/test_writer.py @@ -0,0 +1,36 @@ +from bincio.extract.writer import make_activity_id, _slugify +from bincio.extract.models import ParsedActivity, DataPoint +from datetime import datetime, timezone + + +def _dummy_activity(title=None): + ts = datetime(2024, 6, 1, 7, 30, 12, tzinfo=timezone.utc) + return ParsedActivity( + points=[DataPoint(timestamp=ts)], + sport="cycling", + started_at=ts, + source_file="test.fit", + source_hash="sha256:abc", + title=title, + ) + + +def test_id_with_title(): + act = _dummy_activity("Morning Ride") + aid = make_activity_id(act) + assert aid.startswith("2024-06-01T") + assert "morning-ride" in aid + + +def test_id_without_title(): + act = _dummy_activity() + aid = make_activity_id(act) + assert "2024-06-01T" in aid + # No trailing dash + assert not aid.endswith("-") + + +def test_slugify(): + assert _slugify("Morning Ride!") == "morning-ride" + assert _slugify(" Vélo ") == "velo" # é → e via NFKD + ASCII + assert _slugify("") == ""