Trying to get the sub label shown properly

This commit is contained in:
Davide Scaini
2026-03-30 20:09:01 +02:00
parent c58bc8f7d5
commit 877472e620
11 changed files with 157 additions and 24 deletions
+25 -1
View File
@@ -113,6 +113,8 @@ def _process_file(path: Path) -> dict:
help="Only process files modified after this date.")
@click.option("--workers", default=None, type=int,
help="Parallel worker processes (default: CPU count).")
@click.option("--dev", "dev_sample", default=None, type=int, metavar="N",
help="Dev mode: sample N files evenly across the full list, output to /tmp/bincio_dev/.")
def extract(
config_path: Optional[str],
input_dir: Optional[str],
@@ -120,6 +122,7 @@ def extract(
single_file: Optional[str],
since: Optional[str],
workers: Optional[int],
dev_sample: Optional[int],
) -> None:
"""Parse GPX/FIT/TCX files and write BAS JSON data store."""
@@ -128,13 +131,25 @@ def extract(
return
cfg = _resolve_config(config_path, input_dir, output_dir)
if dev_sample is not None:
cfg.output_dir = Path("/tmp/bincio_dev")
cfg.incremental = False
console.print(f"[yellow]Dev mode:[/yellow] sampling {dev_sample} files → [cyan]{cfg.output_dir}[/cyan]")
cfg.output_dir.mkdir(parents=True, exist_ok=True)
files = _collect_files(cfg, since)
if not files:
console.print("[yellow]No supported files found.[/yellow]")
return
console.print(f"Found [bold]{len(files)}[/bold] activity files.")
if dev_sample is not None:
total = len(files)
files = _sample_diverse(files, dev_sample)
console.print(f"Sampled [bold]{len(files)}[/bold] files from {total} total.")
else:
console.print(f"Found [bold]{len(files)}[/bold] activity files.")
# Build strava lookup once (serialised dict, sent to workers via initializer)
strava_lookup: dict = {}
@@ -314,6 +329,15 @@ def _load_existing_summaries(output_dir: Path) -> list[dict]:
return []
def _sample_diverse(files: list[Path], n: int) -> list[Path]:
"""Return n files sampled evenly across the sorted list for date/format diversity."""
if len(files) <= n:
return files
files = sorted(files)
step = len(files) / n
return [files[int(i * step)] for i in range(n)]
def _patch_duplicate_of(output_dir: Path, activity_id: str, canonical_id: str) -> None:
p = output_dir / "activities" / f"{activity_id}.json"
if not p.exists():
+5 -4
View File
@@ -8,7 +8,7 @@ import gpxpy.gpx
from bincio.extract.models import DataPoint, ParsedActivity
from bincio.extract.parsers.base import BaseParser
from bincio.extract.sport import normalise_sport
from bincio.extract.sport import normalise_sport, normalise_sub_sport
# Known GPX extension namespaces
_NS_GARMIN = "http://www.garmin.com/xmlschemas/TrackPointExtension/v1"
@@ -41,14 +41,15 @@ class GpxParser(BaseParser):
if not points:
raise ValueError(f"No trackpoints found in {path.name}")
sport = normalise_sport(
(gpx.tracks[0].type if gpx.tracks else None) or "cycling"
)
raw_sport = (gpx.tracks[0].type if gpx.tracks else None) or "cycling"
sport = normalise_sport(raw_sport)
sub_sport = normalise_sub_sport(raw_sport)
started_at = points[0].timestamp
return ParsedActivity(
points=points,
sport=sport,
sub_sport=sub_sport,
started_at=started_at,
source_file=path.name,
source_hash="", # set by factory
+4 -2
View File
@@ -6,7 +6,7 @@ from pathlib import Path
from lxml import etree
from bincio.extract.models import DataPoint, ParsedActivity
from bincio.extract.sport import normalise_sport
from bincio.extract.sport import normalise_sport, normalise_sub_sport
_NS_HTTP = {
"tcx": "http://www.garmin.com/xmlschemas/TrainingCenterDatabase/v2",
@@ -33,7 +33,8 @@ class TcxParser:
# Use the first activity
act = activities[0]
sport_attr = act.get("Sport", "Biking")
sport = normalise_sport(sport_attr)
sport = normalise_sport(sport_attr)
sub_sport = normalise_sub_sport(sport_attr)
points: list[DataPoint] = []
for tp in act.findall(".//tcx:Trackpoint", _NS):
@@ -78,6 +79,7 @@ class TcxParser:
return ParsedActivity(
points=points,
sport=sport,
sub_sport=sub_sport,
started_at=points[0].timestamp,
source_file=path.name,
source_hash="",
+57 -4
View File
@@ -47,10 +47,13 @@ _MAPPING: dict[str, str] = {
"skiing": "skiing",
"cross_country_skiing": "skiing",
"nordic_skiing": "skiing",
"nordic_ski": "skiing",
"downhill_skiing": "skiing",
"alpine_skiing": "skiing",
"alpine_ski": "skiing",
"skate_skiing": "skiing",
"backcountry_skiing": "skiing",
"backcountry_ski": "skiing",
# swimming
"swimming": "swimming",
"swim": "swimming",
@@ -58,13 +61,63 @@ _MAPPING: dict[str, str] = {
"lap_swimming": "swimming",
}
# Lookup table from a normalised raw sport key to the sub_sport label it implies.
# normalise_sub_sport() looks keys up after passing the raw value through
# _normalise_key(); a key that is absent here yields None (no sub_sport).
_SUB_SPORT_MAPPING: dict[str, str] = {
# cycling: road / mountain / gravel / indoor
"road_biking": "road",
"road_cycling": "road",
"mountain_biking": "mountain",
"mountain_bike_ride": "mountain",
"gravel_cycling": "gravel",
"gravel_ride": "gravel",
"cyclocross": "gravel",
"indoor_cycling": "indoor",
"indoor_ride": "indoor",
"virtual_ride": "indoor",
# running: trail / indoor / track
"trail_running": "trail",
"trail_run": "trail",
"treadmill_running": "indoor",
"treadmill": "indoor",
"indoor_run": "indoor",
"virtual_run": "indoor",
"track_run": "track",
# skiing: nordic / alpine (backcountry and skate variants are filed under nordic)
"cross_country_skiing": "nordic",
"nordic_skiing": "nordic",
"nordic_ski": "nordic",
"skate_skiing": "nordic",
"backcountry_skiing": "nordic",
"backcountry_ski": "nordic",
"downhill_skiing": "alpine",
"alpine_skiing": "alpine",
"alpine_ski": "alpine",
# swimming: open_water / pool
"open_water_swimming": "open_water",
"lap_swimming": "pool",
}
BAS_SPORTS = {"cycling", "running", "hiking", "walking", "swimming", "skiing", "other"}
def _normalise_key(raw: object) -> str:
key = str(raw).strip()
# CamelCase → snake_case ("MountainBikeRide" → "mountain_bike_ride")
key = re.sub(r"([A-Z])", r"_\1", key).lower().lstrip("_")
key = key.replace(" ", "_").replace("-", "_")
return re.sub(r"^\d+", "", key)
def normalise_sport(raw: object) -> str:
    """Map a raw sport value to the sport name stored in ``_MAPPING``.

    Args:
        raw: Raw sport/type value from a parsed file or API payload, or None.

    Returns:
        The mapped sport name, or "other" when ``raw`` is None or its
        normalised key has no entry in ``_MAPPING``.
    """
    if raw is None:
        return "other"
    # Use the shared key normaliser so this lookup and normalise_sub_sport()
    # always agree on the key derived from the same raw value.
    return _MAPPING.get(_normalise_key(raw), "other")
def normalise_sub_sport(raw: object) -> str | None:
    """Derive the sub_sport label implied by a raw sport type string.

    Example: 'mountain_bike_ride' -> 'mountain'.

    Returns None when ``raw`` is None or when its normalised key has no entry
    in ``_SUB_SPORT_MAPPING`` (e.g. plain 'ride', 'run').
    """
    return None if raw is None else _SUB_SPORT_MAPPING.get(_normalise_key(raw))
+9 -2
View File
@@ -29,6 +29,8 @@ def import_group() -> None:
help="Only import activities after this date (default: incremental from last sync).")
@click.option("--reauth", is_flag=True, default=False,
help="Force re-authorization even if valid tokens exist.")
@click.option("--dev", "dev_sample", default=None, type=int, metavar="N",
help="Dev mode: import only the N most recent activities, output to /tmp/bincio_dev/.")
def strava_cmd(
client_id: Optional[str],
client_secret: Optional[str],
@@ -36,6 +38,7 @@ def strava_cmd(
config_path: Optional[str],
since: Optional[str],
reauth: bool,
dev_sample: Optional[int],
) -> None:
"""Import activities from Strava.
@@ -90,7 +93,11 @@ def strava_cmd(
"Add them to extract_config.yaml under import.strava, or pass --client-id/--client-secret."
)
out = _resolve_output(output_dir, cfg)
if dev_sample is not None:
out = Path("/tmp/bincio_dev")
console.print(f"[yellow]Dev mode:[/yellow] importing {dev_sample} activities → [cyan]{out}[/cyan]")
else:
out = _resolve_output(output_dir, cfg)
console.print(f"Output dir: [cyan]{out}[/cyan]")
if reauth and TOKENS_FILE.exists():
@@ -108,7 +115,7 @@ def strava_cmd(
except ValueError:
raise click.BadParameter(f"Expected YYYY-MM-DD, got {since!r}", param_hint="--since")
strava_sync(client, out, since_dt, console)
strava_sync(client, out, since_dt, console, limit=dev_sample)
def _load_config(config_path: Optional[str]):
+9 -2
View File
@@ -26,7 +26,7 @@ from rich.console import Console
from rich.progress import BarColumn, MofNCompleteColumn, Progress, TextColumn, TimeElapsedColumn
from bincio.extract.models import DataPoint, ParsedActivity
from bincio.extract.sport import normalise_sport
from bincio.extract.sport import normalise_sport, normalise_sub_sport
STRAVA_AUTH_URL = "https://www.strava.com/oauth/authorize"
STRAVA_TOKEN_URL = "https://www.strava.com/oauth/token"
@@ -214,7 +214,9 @@ def _strava_to_parsed(act: dict, streams: dict[str, list]) -> ParsedActivity:
"""Build a ParsedActivity from a Strava activity dict + its streams."""
started_at = datetime.fromisoformat(act["start_date"].replace("Z", "+00:00"))
sport = normalise_sport(act.get("sport_type") or act.get("type") or "")
raw_sport = act.get("sport_type") or act.get("type") or ""
sport = normalise_sport(raw_sport)
sub_sport = normalise_sub_sport(raw_sport)
times = streams.get("time", []) # seconds since start
latlngs = streams.get("latlng", []) # [[lat, lon], ...]
@@ -244,6 +246,7 @@ def _strava_to_parsed(act: dict, streams: dict[str, list]) -> ParsedActivity:
return ParsedActivity(
points = points,
sport = sport,
sub_sport = sub_sport,
started_at = started_at,
source_file = f"strava_{strava_id}",
source_hash = source_hash,
@@ -287,6 +290,7 @@ def sync(
output_dir: Path,
since: datetime | None,
console: Console,
limit: int | None = None,
) -> None:
"""Fetch new Strava activities and write BAS JSON files.
@@ -323,6 +327,9 @@ def sync(
f"Found [bold]{len(new_acts)}[/bold] new activities "
f"([bold]{len(all_acts) - len(new_acts)}[/bold] already imported)."
)
if limit is not None and len(new_acts) > limit:
new_acts = new_acts[:limit]
console.print(f"[yellow]Dev mode:[/yellow] capped to {limit} activities.")
if not new_acts:
console.print("[green]All up to date.[/green]")
return