backend: initial commit

This commit is contained in:
Davide Scaini
2026-03-28 13:57:12 +01:00
commit 38c5423aeb
36 changed files with 2463 additions and 0 deletions
View File
+34
View File
@@ -0,0 +1,34 @@
"""Abstract base class for all activity parsers."""
import gzip
import hashlib
from abc import ABC, abstractmethod
from pathlib import Path
from bincio.extract.models import ParsedActivity
class BaseParser(ABC):
    """Common interface and byte-level helpers shared by activity parsers."""

    @abstractmethod
    def parse(self, path: Path, raw_bytes: bytes) -> ParsedActivity:
        """Build a ``ParsedActivity`` from bytes that were already read.

        The bytes are handed in (rather than read here) so the factory can
        compute the file hash once and deal with decompression before
        dispatching to a concrete parser.
        """

    @staticmethod
    def _sha256(data: bytes) -> str:
        """Return the SHA-256 hex digest of *data*, prefixed with ``sha256:``."""
        digest = hashlib.sha256(data).hexdigest()
        return f"sha256:{digest}"

    @staticmethod
    def _read_file(path: Path) -> tuple[bytes, bytes]:
        """Read *path* and return ``(raw_bytes, decompressed_bytes)``.

        ``raw_bytes`` is the file content exactly as stored (used for
        hashing); ``decompressed_bytes`` is what parsers should parse. The two
        are the same object unless the file name ends in ``.gz``.
        """
        on_disk = path.read_bytes()
        is_gzipped = path.suffix == ".gz"
        payload = gzip.decompress(on_disk) if is_gzipped else on_disk
        return on_disk, payload
+46
View File
@@ -0,0 +1,46 @@
"""Parser factory — selects the right parser based on file extension."""
from pathlib import Path
from bincio.extract.models import ParsedActivity
from bincio.extract.parsers.base import BaseParser
from bincio.extract.parsers.fit import FitParser
from bincio.extract.parsers.gpx import GpxParser
from bincio.extract.parsers.tcx import TcxParser
# Parser class for each base (uncompressed) extension.
_PARSERS: dict[str, type[BaseParser]] = {
    ".fit": FitParser,
    ".gpx": GpxParser,
    ".tcx": TcxParser,
}

# Supported extensions (including .gz variants), derived from the table above.
SUPPORTED = {ext + gz for ext in _PARSERS for gz in ("", ".gz")}
def _base_ext(path: Path) -> str:
    """Return the extension that identifies the format, ignoring a final ``.gz``.

    ``ride.fit.gz`` yields ``".fit"``; ``ride.gpx`` yields ``".gpx"``.
    """
    # For gzipped files the format suffix is the one left on the stem.
    target = Path(path.stem) if path.suffix == ".gz" else path
    return target.suffix
def is_supported(path: Path) -> bool:
    """Return True if the extension of *path* (with optional ``.gz``) is in SUPPORTED."""
    if path.suffix != ".gz":
        return path.suffix in SUPPORTED
    # Gzipped file: compare the last two suffixes together, e.g. ".fit.gz".
    return "".join(path.suffixes[-2:]) in SUPPORTED
def parse_file(path: Path) -> ParsedActivity:
    """Parse an activity file, handling .gz transparently.

    The parser is chosen from the base extension of *path*. The returned
    activity has ``source_hash`` and ``source_file`` filled in here.

    Raises:
        ValueError: If no parser is registered for the file's extension.
    """
    extension = _base_ext(path)
    if extension not in _PARSERS:
        raise ValueError(f"Unsupported file type: {path.name!r}")

    original, payload = BaseParser._read_file(path)
    activity = _PARSERS[extension]().parse(path, payload)

    # Hash the bytes as stored on disk (still compressed for .gz) for dedup.
    activity.source_hash = BaseParser._sha256(original)
    activity.source_file = path.name
    return activity
+133
View File
@@ -0,0 +1,133 @@
"""FIT file parser (Garmin binary format)."""
from datetime import timezone
from pathlib import Path
from typing import Any
import fitdecode
from bincio.extract.models import DataPoint, LapData, ParsedActivity
from bincio.extract.sport import normalise_sport
class FitParser:
    """Parser for FIT (Garmin binary) activity files, decoded via ``fitdecode``."""

    def parse(self, path: Path, raw_bytes: bytes) -> ParsedActivity:
        """Decode the FIT messages in *raw_bytes* into a ``ParsedActivity``.

        ``sport`` messages set the sport and sub-sport (sport defaults to
        ``"cycling"``), ``device_info`` messages set the device name,
        ``record`` messages become data points and ``lap`` messages become
        laps. All other frames are ignored.

        Args:
            path: Source file; only ``path.name`` is used (error text and
                ``source_file``).
            raw_bytes: FIT payload to decode.

        Returns:
            The parsed activity, starting at the first data point's
            timestamp. ``source_hash`` is left empty.

        Raises:
            ValueError: If no ``record`` message with a timestamp was found.
        """
        import io

        points: list[DataPoint] = []
        laps: list[LapData] = []
        sport: str = "cycling"
        sub_sport: str | None = None
        device: str | None = None

        with fitdecode.FitReader(io.BytesIO(raw_bytes)) as fit:
            for frame in fit:
                if not isinstance(frame, fitdecode.FitDataMessage):
                    continue

                if frame.name == "sport":
                    sport = normalise_sport(_get(frame, "sport", "cycling"))
                    sub_sport = _normalise_sub_sport(_get(frame, "sub_sport"))
                elif frame.name == "device_info":
                    mfr = _get(frame, "manufacturer")
                    prod = _get(frame, "product_name") or _get(frame, "garmin_product")
                    # Each device_info message that names a product overwrites
                    # the previous value, so the last such message wins.
                    if mfr and prod:
                        device = f"{mfr} {prod}"
                    elif prod:
                        device = str(prod)
                elif frame.name == "record":
                    point = self._point_from_record(frame)
                    if point is not None:
                        points.append(point)
                elif frame.name == "lap":
                    lap = self._lap_from_frame(frame, index=len(laps))
                    if lap is not None:
                        laps.append(lap)

        if not points:
            raise ValueError(f"No record messages found in {path.name}")

        return ParsedActivity(
            points=points,
            sport=sport,
            sub_sport=sub_sport,
            started_at=points[0].timestamp,
            device=device,
            laps=laps,
            source_file=path.name,
            source_hash="",
        )

    @staticmethod
    def _as_utc(ts: Any) -> Any:
        """Tag a naive datetime as UTC; return anything else unchanged."""
        if hasattr(ts, "tzinfo") and ts.tzinfo is None:
            return ts.replace(tzinfo=timezone.utc)
        return ts

    @staticmethod
    def _point_from_record(frame: Any) -> DataPoint | None:
        """Convert a ``record`` message to a DataPoint, or None if it has no timestamp."""
        ts = _get(frame, "timestamp")
        if ts is None:
            return None
        ts = FitParser._as_utc(ts)
        lat = _semicircles_to_deg(_get(frame, "position_lat"))
        lon = _semicircles_to_deg(_get(frame, "position_long"))
        speed_raw = _get(frame, "speed")  # m/s
        return DataPoint(
            timestamp=ts,
            lat=lat,
            lon=lon,
            elevation_m=_get(frame, "altitude"),
            hr_bpm=_get(frame, "heart_rate"),
            cadence_rpm=_get(frame, "cadence"),
            speed_kmh=speed_raw * 3.6 if speed_raw is not None else None,
            power_w=_get(frame, "power"),
            temperature_c=_get(frame, "temperature"),
            distance_m=_get(frame, "distance"),
        )

    @staticmethod
    def _lap_from_frame(frame: Any, index: int) -> LapData | None:
        """Convert a ``lap`` message to LapData, or None if it has no start time."""
        ts = _get(frame, "start_time")
        if ts is None:
            return None
        ts = FitParser._as_utc(ts)
        elapsed = _get(frame, "total_elapsed_time")
        speed_raw = _get(frame, "avg_speed")
        return LapData(
            index=index,
            started_at=ts,
            # Compare against None, not truthiness: a value of 0 is a real
            # measurement and must not be turned into "missing".
            duration_s=int(elapsed) if elapsed is not None else None,
            distance_m=_get(frame, "total_distance"),
            elevation_gain_m=_get(frame, "total_ascent"),
            avg_speed_kmh=speed_raw * 3.6 if speed_raw is not None else None,
            avg_hr_bpm=_get(frame, "avg_heart_rate"),
            avg_power_w=_get(frame, "avg_power"),
        )
def _get(frame: fitdecode.FitDataMessage, field: str, default: Any = None) -> Any:
    """Return ``frame.get_value(field)``, or *default* if that raises KeyError."""
    try:
        value = frame.get_value(field)
    except KeyError:
        return default
    else:
        return value
def _semicircles_to_deg(value: Any) -> float | None:
if value is None:
return None
try:
deg = float(value) * (180.0 / 2**31)
# Sanity check: invalid semicircle values often come out as ±180+
if abs(deg) > 180:
return None
return deg
except (TypeError, ValueError):
return None
def _normalise_sub_sport(value: Any) -> str | None:
if value is None:
return None
s = str(value).lower().replace(" ", "_")
mapping = {
"road": "road",
"mountain": "mountain",
"gravel_cycling": "gravel",
"cyclocross": "gravel",
"indoor_cycling": "indoor",
"trail": "trail",
"track": "track",
}
return mapping.get(s, s) or None
+82
View File
@@ -0,0 +1,82 @@
"""GPX file parser."""
from datetime import timezone
from pathlib import Path
import gpxpy
import gpxpy.gpx
from bincio.extract.models import DataPoint, ParsedActivity
from bincio.extract.parsers.base import BaseParser
from bincio.extract.sport import normalise_sport
# Known GPX extension namespaces (Garmin TrackPointExtension, v1 and v2).
# NOTE(review): neither constant is referenced by the code shown in this
# module, which matches extension elements on their local tag name only;
# confirm there are no other users before relying on or removing them.
_NS_GARMIN = "http://www.garmin.com/xmlschemas/TrackPointExtension/v1"
_NS_GARMIN_V2 = "http://www.garmin.com/xmlschemas/TrackPointExtension/v2"
class GpxParser(BaseParser):
    """Parser for GPX track files, read with ``gpxpy``."""

    def parse(self, path: Path, raw_bytes: bytes) -> ParsedActivity:
        """Turn the trackpoints in *raw_bytes* into a ``ParsedActivity``.

        Points from every segment of every track are collected in document
        order; points without a time are dropped and naive times are tagged
        as UTC. The sport comes from the first track's ``type`` and falls
        back to ``"cycling"``.

        Raises:
            ValueError: If no trackpoint with a time was found.
        """
        document = gpxpy.parse(raw_bytes.decode("utf-8", errors="replace"))

        timed_points = (
            pt
            for track in document.tracks
            for segment in track.segments
            for pt in segment.points
            if pt.time is not None
        )

        points: list[DataPoint] = []
        for pt in timed_points:
            stamp = pt.time
            if stamp.tzinfo is None:
                stamp = stamp.replace(tzinfo=timezone.utc)
            sample = DataPoint(
                timestamp=stamp,
                lat=pt.latitude,
                lon=pt.longitude,
                elevation_m=pt.elevation,
            )
            _apply_extensions(pt, sample)
            points.append(sample)

        if not points:
            raise ValueError(f"No trackpoints found in {path.name}")

        track_type = document.tracks[0].type if document.tracks else None
        sport = normalise_sport(track_type or "cycling")

        return ParsedActivity(
            points=points,
            sport=sport,
            started_at=points[0].timestamp,
            source_file=path.name,
            source_hash="",  # set by factory
        )
# Local tag name of a TrackPointExtension child -> (DataPoint attribute,
# conversion applied to the value after it has been parsed as a float).
_EXTENSION_FIELDS = {
    "hr": ("hr_bpm", int),
    "cad": ("cadence_rpm", int),
    "atemp": ("temperature_c", float),
    "speed": ("speed_kmh", lambda metres_per_second: metres_per_second * 3.6),
}


def _apply_extensions(pt: gpxpy.gpx.GPXTrackPoint, dp: DataPoint) -> None:
    """Copy HR, cadence, temperature and speed from TrackPointExtension onto *dp*.

    Extension elements are matched on their local tag name (the namespace is
    ignored). Within a ``TrackPointExtension`` element, ``hr``, ``cad``,
    ``atemp`` and ``speed`` children set ``hr_bpm``, ``cadence_rpm``,
    ``temperature_c`` and ``speed_kmh`` (speed is multiplied by 3.6).
    Children with no text, or with text that is not a usable number, are
    skipped so that one bad sensor reading does not abort the whole file.

    Args:
        pt: Trackpoint whose ``extensions`` are inspected; may be None.
        dp: Data point updated in place.
    """
    if pt.extensions is None:
        return

    for ext in pt.extensions:
        if _strip_ns(ext.tag) != "TrackPointExtension":
            continue
        for child in ext:
            target = _EXTENSION_FIELDS.get(_strip_ns(child.tag))
            if target is None or child.text is None:
                continue
            attribute, convert = target
            try:
                value = convert(float(child.text))
            except (ValueError, OverflowError):
                # Blank or non-numeric text, or NaN/inf for an integer field.
                continue
            setattr(dp, attribute, value)
def _strip_ns(tag: str) -> str:
    """Drop a leading XML namespace: ``'{namespace}localname'`` -> ``'localname'``."""
    if "}" not in tag:
        return tag
    return tag.rsplit("}", 1)[-1]
+89
View File
@@ -0,0 +1,89 @@
"""TCX (Training Center XML) file parser."""
from datetime import datetime, timezone
from pathlib import Path
from lxml import etree
from bincio.extract.models import DataPoint, ParsedActivity
from bincio.extract.sport import normalise_sport
# Prefix -> namespace URI map passed to the find()/findall() calls in
# TcxParser: "tcx" prefixes the core document elements, "ext" the
# Speed/Watts extension elements.
_NS = {
    "tcx": "http://www.garmin.com/xmlschemas/TrainingCenterDatabase/v2",
    "ext": "http://www.garmin.com/xmlschemas/ActivityExtension/v2",
}
class TcxParser:
    """Parser for TCX (Training Center XML) files, read with ``lxml``."""

    def parse(self, path: Path, raw_bytes: bytes) -> ParsedActivity:
        """Turn the first Activity in *raw_bytes* into a ``ParsedActivity``.

        Trackpoints without a ``Time`` element are dropped. The sport comes
        from the Activity's ``Sport`` attribute and falls back to
        ``"Biking"`` before normalisation.

        Raises:
            ValueError: If the document has no Activity element, or the first
                Activity has no trackpoint with a time.
        """
        # Some exporters (e.g. Garmin) prepend whitespace before the XML
        # declaration, which is technically invalid. Strip it.
        root = etree.fromstring(raw_bytes.lstrip())

        # Only the first activity in the document is used.
        activity = root.find(".//tcx:Activity", _NS)
        if activity is None:
            raise ValueError(f"No Activity elements found in {path.name}")
        sport = normalise_sport(activity.get("Sport", "Biking"))

        read = self._read
        points: list[DataPoint] = []
        for trackpoint in activity.findall(".//tcx:Trackpoint", _NS):
            time_el = trackpoint.find("tcx:Time", _NS)
            if time_el is None or not time_el.text:
                continue
            stamp = _parse_ts(time_el.text)

            position = trackpoint.find("tcx:Position", _NS)
            if position is None:
                lat = lon = None
            else:
                lat = read(position.find("tcx:LatitudeDegrees", _NS), float)
                lon = read(position.find("tcx:LongitudeDegrees", _NS), float)

            points.append(
                DataPoint(
                    timestamp=stamp,
                    lat=lat,
                    lon=lon,
                    elevation_m=read(trackpoint.find("tcx:AltitudeMeters", _NS), float),
                    hr_bpm=read(
                        trackpoint.find(".//tcx:HeartRateBpm/tcx:Value", _NS),
                        self._to_int,
                    ),
                    cadence_rpm=read(trackpoint.find("tcx:Cadence", _NS), self._to_int),
                    distance_m=read(trackpoint.find("tcx:DistanceMeters", _NS), float),
                    # Extensions (speed, watts)
                    speed_kmh=read(trackpoint.find(".//ext:Speed", _NS), self._to_kmh),
                    power_w=read(trackpoint.find(".//ext:Watts", _NS), self._to_int),
                )
            )

        if not points:
            raise ValueError(f"No trackpoints found in {path.name}")

        return ParsedActivity(
            points=points,
            sport=sport,
            started_at=points[0].timestamp,
            source_file=path.name,
            source_hash="",
        )

    @staticmethod
    def _read(element, convert):
        """Return ``convert(element.text)``, or None if the element or its text is missing."""
        if element is None or not element.text:
            return None
        return convert(element.text)

    @staticmethod
    def _to_int(text: str) -> int:
        """Parse *text* as a float and truncate it to an int."""
        return int(float(text))

    @staticmethod
    def _to_kmh(text: str) -> float:
        """Parse *text* as a float and multiply it by 3.6."""
        return float(text) * 3.6
def _parse_ts(s: str) -> datetime:
    """Parse an ISO 8601 timestamp and return it as an aware UTC datetime.

    Accepted forms are ``YYYY-MM-DDTHH:MM:SS`` with optional fractional
    seconds, followed by nothing, ``Z``, or a numeric UTC offset such as
    ``+02:00``. Surrounding whitespace is ignored. A timestamp without zone
    information is taken to be UTC; one with an offset is converted to UTC.

    Raises:
        ValueError: If *s* matches none of the accepted forms.
    """
    text = s.strip()
    formats = (
        # %z accepts both a literal "Z" and offsets like "+02:00".
        "%Y-%m-%dT%H:%M:%S.%f%z",
        "%Y-%m-%dT%H:%M:%S%z",
        "%Y-%m-%dT%H:%M:%S.%f",
        "%Y-%m-%dT%H:%M:%S",
    )
    for fmt in formats:
        try:
            parsed = datetime.strptime(text, fmt)
        except ValueError:
            continue
        if parsed.tzinfo is None:
            return parsed.replace(tzinfo=timezone.utc)
        return parsed.astimezone(timezone.utc)
    raise ValueError(f"Cannot parse timestamp: {s!r}")