backend: initial commit
This commit is contained in:
@@ -0,0 +1,34 @@
|
||||
"""Abstract base class for all activity parsers."""
|
||||
|
||||
import gzip
|
||||
import hashlib
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
|
||||
from bincio.extract.models import ParsedActivity
|
||||
|
||||
|
||||
class BaseParser(ABC):
    """Common interface and byte-level helpers for activity parsers."""

    @abstractmethod
    def parse(self, path: Path, raw_bytes: bytes) -> ParsedActivity:
        """Build a ParsedActivity from already-loaded file contents.

        The bytes are read ahead of time so that the factory can hash the
        file once and deal with decompression before a parser is invoked.
        """

    @staticmethod
    def _sha256(data: bytes) -> str:
        """Return the SHA-256 hex digest of *data*, prefixed with ``sha256:``."""
        digest = hashlib.sha256(data).hexdigest()
        return f"sha256:{digest}"

    @staticmethod
    def _read_file(path: Path) -> tuple[bytes, bytes]:
        """Load *path* and return ``(raw_bytes, decompressed_bytes)``.

        ``raw_bytes`` is the file content exactly as stored (used for
        hashing); ``decompressed_bytes`` is what parsers should consume.
        The two are the same object unless the file name ends in ``.gz``.
        """
        on_disk = path.read_bytes()
        payload = gzip.decompress(on_disk) if path.suffix == ".gz" else on_disk
        return on_disk, payload
|
||||
@@ -0,0 +1,46 @@
|
||||
"""Parser factory — selects the right parser based on file extension."""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from bincio.extract.models import ParsedActivity
|
||||
from bincio.extract.parsers.base import BaseParser
|
||||
from bincio.extract.parsers.fit import FitParser
|
||||
from bincio.extract.parsers.gpx import GpxParser
|
||||
from bincio.extract.parsers.tcx import TcxParser
|
||||
|
||||
# Parser class responsible for each base (uncompressed) extension.
_PARSERS: dict[str, type[BaseParser]] = {
    ".fit": FitParser,
    ".gpx": GpxParser,
    ".tcx": TcxParser,
}

# Supported extensions (including .gz variants), derived from the registry
# above so the two collections always list the same formats.
SUPPORTED = {*_PARSERS, *(f"{ext}.gz" for ext in _PARSERS)}
|
||||
|
||||
|
||||
def _base_ext(path: Path) -> str:
|
||||
"""Return the meaningful extension, stripping .gz if present."""
|
||||
if path.suffix == ".gz":
|
||||
return Path(path.stem).suffix # e.g. ".fit" from "ride.fit.gz"
|
||||
return path.suffix
|
||||
|
||||
|
||||
def is_supported(path: Path) -> bool:
    """Report whether *path* carries one of the extensions in ``SUPPORTED``.

    For names ending in ``.gz`` the last two suffixes are taken together
    (``ride.fit.gz`` -> ``.fit.gz``); otherwise only the final suffix counts.
    """
    if path.suffix != ".gz":
        return path.suffix in SUPPORTED
    compound = "".join(path.suffixes[-2:])
    return compound in SUPPORTED
|
||||
|
||||
|
||||
def parse_file(path: Path) -> ParsedActivity:
    """Parse the activity file at *path*, decompressing ``.gz`` input first.

    Raises:
        ValueError: if no parser is registered for the file's extension.
    """
    parser_type = _PARSERS.get(_base_ext(path))
    if parser_type is None:
        raise ValueError(f"Unsupported file type: {path.name!r}")

    original_bytes, payload = BaseParser._read_file(path)
    result = parser_type().parse(path, payload)

    # The hash covers the *original* bytes (still compressed for .gz files)
    # so it can be used for de-duplication.
    result.source_hash = BaseParser._sha256(original_bytes)
    result.source_file = path.name
    return result
|
||||
@@ -0,0 +1,133 @@
|
||||
"""FIT file parser (Garmin binary format)."""
|
||||
|
||||
from datetime import timezone
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import fitdecode
|
||||
|
||||
from bincio.extract.models import DataPoint, LapData, ParsedActivity
|
||||
from bincio.extract.sport import normalise_sport
|
||||
|
||||
|
||||
class FitParser:
    """Parser for FIT (Garmin binary format) activity files, read via fitdecode."""

    def parse(self, path: Path, raw_bytes: bytes) -> ParsedActivity:
        """Decode FIT data messages from *raw_bytes* into a ParsedActivity.

        ``sport``, ``device_info``, ``record`` and ``lap`` messages are
        consumed; every other frame is ignored. Naive timestamps are tagged
        as UTC. ``source_hash`` is left empty here.

        Args:
            path: Source file; only ``path.name`` is used (for the error
                message and ``source_file``).
            raw_bytes: FIT payload to decode.

        Raises:
            ValueError: if no ``record`` message with a timestamp is found.
        """
        import io

        points: list[DataPoint] = []
        laps: list[LapData] = []
        sport: str = "cycling"
        sub_sport: str | None = None
        device: str | None = None

        with fitdecode.FitReader(io.BytesIO(raw_bytes)) as fit:
            for frame in fit:
                if not isinstance(frame, fitdecode.FitDataMessage):
                    continue

                if frame.name == "sport":
                    sport = normalise_sport(_get(frame, "sport", "cycling"))
                    sub_sport = _normalise_sub_sport(_get(frame, "sub_sport"))

                elif frame.name == "device_info":
                    # Each device_info frame that names a product overwrites
                    # the previous one, so the last such frame wins.
                    mfr = _get(frame, "manufacturer")
                    prod = _get(frame, "product_name") or _get(frame, "garmin_product")
                    if mfr and prod:
                        device = f"{mfr} {prod}"
                    elif prod:
                        device = str(prod)

                elif frame.name == "record":
                    ts = _get(frame, "timestamp")
                    if ts is None:
                        continue
                    ts = self._as_utc(ts)

                    speed_raw = _get(frame, "speed")  # m/s
                    points.append(
                        DataPoint(
                            timestamp=ts,
                            lat=_semicircles_to_deg(_get(frame, "position_lat")),
                            lon=_semicircles_to_deg(_get(frame, "position_long")),
                            elevation_m=_get(frame, "altitude"),
                            hr_bpm=_get(frame, "heart_rate"),
                            cadence_rpm=_get(frame, "cadence"),
                            speed_kmh=speed_raw * 3.6 if speed_raw is not None else None,
                            power_w=_get(frame, "power"),
                            temperature_c=_get(frame, "temperature"),
                            distance_m=_get(frame, "distance"),
                        )
                    )

                elif frame.name == "lap":
                    # NOTE(review): indentation was lost in the reviewed
                    # source; this assumes laps without a start_time are
                    # skipped rather than stored with started_at=None - confirm.
                    started = _get(frame, "start_time")
                    if started is None:
                        continue
                    started = self._as_utc(started)

                    elapsed = _get(frame, "total_elapsed_time")
                    avg_speed = _get(frame, "avg_speed")  # m/s
                    laps.append(
                        LapData(
                            index=len(laps),
                            started_at=started,
                            # Compare against None, not truthiness: a value
                            # of 0 is a real measurement and must be kept,
                            # matching how record speeds are handled above.
                            duration_s=int(elapsed) if elapsed is not None else None,
                            distance_m=_get(frame, "total_distance"),
                            elevation_gain_m=_get(frame, "total_ascent"),
                            avg_speed_kmh=avg_speed * 3.6 if avg_speed is not None else None,
                            avg_hr_bpm=_get(frame, "avg_heart_rate"),
                            avg_power_w=_get(frame, "avg_power"),
                        )
                    )

        if not points:
            raise ValueError(f"No record messages found in {path.name}")

        return ParsedActivity(
            points=points,
            sport=sport,
            sub_sport=sub_sport,
            started_at=points[0].timestamp,
            device=device,
            laps=laps,
            source_file=path.name,
            source_hash="",
        )

    @staticmethod
    def _as_utc(ts: Any) -> Any:
        """Tag a naive datetime as UTC; return any other value unchanged."""
        if hasattr(ts, "tzinfo") and ts.tzinfo is None:
            return ts.replace(tzinfo=timezone.utc)
        return ts
|
||||
|
||||
|
||||
def _get(frame: fitdecode.FitDataMessage, field: str, default: Any = None) -> Any:
    """Return the value of *field* in *frame*, or *default* when unavailable.

    *default* is returned both when the message has no such field
    (``get_value`` raises ``KeyError``) and when the field is present but
    holds ``None``, so a caller-supplied fallback such as ``"cycling"`` is
    never silently replaced by ``None``.
    """
    try:
        value = frame.get_value(field)
    except KeyError:
        return default
    # A present-but-None value is presumably how fitdecode reports an
    # invalid/unset field - TODO confirm against the fitdecode docs.
    return default if value is None else value
|
||||
|
||||
|
||||
def _semicircles_to_deg(value: Any) -> float | None:
|
||||
if value is None:
|
||||
return None
|
||||
try:
|
||||
deg = float(value) * (180.0 / 2**31)
|
||||
# Sanity check: invalid semicircle values often come out as ±180+
|
||||
if abs(deg) > 180:
|
||||
return None
|
||||
return deg
|
||||
except (TypeError, ValueError):
|
||||
return None
|
||||
|
||||
|
||||
def _normalise_sub_sport(value: Any) -> str | None:
|
||||
if value is None:
|
||||
return None
|
||||
s = str(value).lower().replace(" ", "_")
|
||||
mapping = {
|
||||
"road": "road",
|
||||
"mountain": "mountain",
|
||||
"gravel_cycling": "gravel",
|
||||
"cyclocross": "gravel",
|
||||
"indoor_cycling": "indoor",
|
||||
"trail": "trail",
|
||||
"track": "track",
|
||||
}
|
||||
return mapping.get(s, s) or None
|
||||
@@ -0,0 +1,82 @@
|
||||
"""GPX file parser."""
|
||||
|
||||
from datetime import timezone
|
||||
from pathlib import Path
|
||||
|
||||
import gpxpy
|
||||
import gpxpy.gpx
|
||||
|
||||
from bincio.extract.models import DataPoint, ParsedActivity
|
||||
from bincio.extract.parsers.base import BaseParser
|
||||
from bincio.extract.sport import normalise_sport
|
||||
|
||||
# Known GPX extension namespaces: Garmin TrackPointExtension schema, v1 and v2.
# NOTE(review): neither constant is referenced in the visible part of this
# module (extension tags are matched by local name) - confirm they are needed.
_NS_GARMIN = "http://www.garmin.com/xmlschemas/TrackPointExtension/v1"
_NS_GARMIN_V2 = "http://www.garmin.com/xmlschemas/TrackPointExtension/v2"
|
||||
|
||||
|
||||
class GpxParser(BaseParser):
    """Parser for GPX track files, read via gpxpy."""

    def parse(self, path: Path, raw_bytes: bytes) -> ParsedActivity:
        """Turn the trackpoints in *raw_bytes* into a ParsedActivity.

        Every point of every segment of every track is collected in document
        order; points without a time are skipped and naive times are tagged
        as UTC. The sport is taken from the first track's ``type`` and falls
        back to ``"cycling"``. ``source_hash`` is left empty here.

        Args:
            path: Source file; only ``path.name`` is used (for the error
                message and ``source_file``).
            raw_bytes: GPX document, expected to be UTF-8 text.

        Raises:
            ValueError: if the document contains no timestamped trackpoint.
        """
        # "utf-8-sig" decodes exactly like "utf-8" but drops a leading
        # byte-order mark, so BOM-prefixed exports do not hand the XML parser
        # a stray U+FEFF character ahead of the XML declaration.
        gpx = gpxpy.parse(raw_bytes.decode("utf-8-sig", errors="replace"))

        points: list[DataPoint] = []
        for track in gpx.tracks:
            for segment in track.segments:
                for pt in segment.points:
                    if pt.time is None:
                        continue
                    ts = pt.time
                    if ts.tzinfo is None:
                        ts = ts.replace(tzinfo=timezone.utc)

                    dp = DataPoint(
                        timestamp=ts,
                        lat=pt.latitude,
                        lon=pt.longitude,
                        elevation_m=pt.elevation,
                    )
                    _apply_extensions(pt, dp)
                    points.append(dp)

        if not points:
            raise ValueError(f"No trackpoints found in {path.name}")

        sport = normalise_sport(
            (gpx.tracks[0].type if gpx.tracks else None) or "cycling"
        )
        started_at = points[0].timestamp

        return ParsedActivity(
            points=points,
            sport=sport,
            started_at=started_at,
            source_file=path.name,
            source_hash="",  # set by factory
        )
|
||||
|
||||
|
||||
def _apply_extensions(pt: gpxpy.gpx.GPXTrackPoint, dp: DataPoint) -> None:
    """Copy sensor readings from a point's TrackPointExtension onto *dp*.

    Recognised child tags (matched by local name, namespace ignored) are
    ``hr``, ``cad``, ``atemp`` and ``speed``; anything else is skipped, as
    are children with no text. *dp* is modified in place.
    """
    extensions = pt.extensions
    if extensions is None:
        return

    for extension in extensions:
        if _strip_ns(extension.tag) != "TrackPointExtension":
            continue

        for child in extension:
            name = _strip_ns(child.tag)
            text = child.text
            if text is None:
                continue

            if name == "hr":
                dp.hr_bpm = int(float(text))
            elif name == "cad":
                dp.cadence_rpm = int(float(text))
            elif name == "atemp":
                dp.temperature_c = float(text)
            elif name == "speed":
                # Stored in m/s; DataPoint expects km/h.
                dp.speed_kmh = float(text) * 3.6
|
||||
|
||||
|
||||
def _strip_ns(tag: str) -> str:
|
||||
"""'{namespace}localname' → 'localname'."""
|
||||
return tag.split("}")[-1] if "}" in tag else tag
|
||||
@@ -0,0 +1,89 @@
|
||||
"""TCX (Training Center XML) file parser."""
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from bincio.extract.models import DataPoint, ParsedActivity
|
||||
from bincio.extract.sport import normalise_sport
|
||||
|
||||
# Namespace prefixes used by the find()/findall() queries in this module:
#   tcx - TrainingCenterDatabase v2 (Activity, Trackpoint and their children)
#   ext - ActivityExtension v2 (queried for Speed and Watts)
_NS = {
    "tcx": "http://www.garmin.com/xmlschemas/TrainingCenterDatabase/v2",
    "ext": "http://www.garmin.com/xmlschemas/ActivityExtension/v2",
}
|
||||
|
||||
|
||||
class TcxParser:
    """Parser for TCX (Training Center XML) activity files, read via lxml."""

    def parse(self, path: Path, raw_bytes: bytes) -> ParsedActivity:
        """Turn the first Activity in *raw_bytes* into a ParsedActivity.

        Trackpoints without a ``Time`` value are skipped. ``source_hash`` is
        left empty here.

        Raises:
            ValueError: if the document has no ``Activity`` element, or the
                first activity yields no timestamped trackpoint.
        """
        # Some exporters (e.g. Garmin) prepend whitespace before the XML
        # declaration, which is technically invalid. Strip it.
        document = etree.fromstring(raw_bytes.lstrip())

        found = document.findall(".//tcx:Activity", _NS)
        if not found:
            raise ValueError(f"No Activity elements found in {path.name}")

        # Only the first activity in the file is used.
        activity = found[0]
        sport = normalise_sport(activity.get("Sport", "Biking"))

        points: list[DataPoint] = []
        for trackpoint in activity.findall(".//tcx:Trackpoint", _NS):
            time_text = self._text(trackpoint, "tcx:Time")
            if time_text is None:
                continue
            stamp = _parse_ts(time_text)

            latitude = longitude = None
            position = trackpoint.find("tcx:Position", _NS)
            if position is not None:
                latitude = self._to_float(self._text(position, "tcx:LatitudeDegrees"))
                longitude = self._to_float(self._text(position, "tcx:LongitudeDegrees"))

            altitude = self._text(trackpoint, "tcx:AltitudeMeters")
            heart_rate = self._text(trackpoint, ".//tcx:HeartRateBpm/tcx:Value")
            cadence = self._text(trackpoint, "tcx:Cadence")
            distance = self._text(trackpoint, "tcx:DistanceMeters")

            # Extensions (speed, watts)
            speed = self._text(trackpoint, ".//ext:Speed")
            power = self._text(trackpoint, ".//ext:Watts")

            points.append(
                DataPoint(
                    timestamp=stamp,
                    lat=latitude,
                    lon=longitude,
                    elevation_m=self._to_float(altitude),
                    hr_bpm=self._to_int(heart_rate),
                    cadence_rpm=self._to_int(cadence),
                    distance_m=self._to_float(distance),
                    # Speed is multiplied by 3.6 to go from m/s to km/h.
                    speed_kmh=None if speed is None else float(speed) * 3.6,
                    power_w=self._to_int(power),
                )
            )

        if not points:
            raise ValueError(f"No trackpoints found in {path.name}")

        return ParsedActivity(
            points=points,
            sport=sport,
            started_at=points[0].timestamp,
            source_file=path.name,
            source_hash="",
        )

    @staticmethod
    def _text(node, query):
        """Return the non-empty text of the first match of *query*, else None."""
        element = node.find(query, _NS)
        if element is None or not element.text:
            return None
        return element.text

    @staticmethod
    def _to_float(text):
        """Convert *text* to float, passing ``None`` through."""
        return None if text is None else float(text)

    @staticmethod
    def _to_int(text):
        """Convert numeric *text* to int (truncating), passing ``None`` through."""
        return None if text is None else int(float(text))
|
||||
|
||||
|
||||
def _parse_ts(s: str) -> datetime:
|
||||
# ISO 8601 with or without fractional seconds
|
||||
s = s.rstrip("Z")
|
||||
for fmt in ("%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S"):
|
||||
try:
|
||||
return datetime.strptime(s, fmt).replace(tzinfo=timezone.utc)
|
||||
except ValueError:
|
||||
continue
|
||||
raise ValueError(f"Cannot parse timestamp: {s!r}")
|
||||
Reference in New Issue
Block a user