Parallelize extraction; fix TCX file parsing

This commit is contained in:
Davide Scaini
2026-03-28 14:24:16 +01:00
parent 38c5423aeb
commit 5d58126d2f
6 changed files with 226 additions and 192 deletions
+9 -3
View File
@@ -8,18 +8,24 @@ from lxml import etree
from bincio.extract.models import DataPoint, ParsedActivity
from bincio.extract.sport import normalise_sport
_NS = {
# Namespace prefix maps handed to lxml's find/findall calls. Two variants are
# kept because TCX documents are seen with either URI scheme in the Garmin
# namespaces; the maps differ only in that scheme.
_NS_HTTP = {
    "tcx": "http://www.garmin.com/xmlschemas/TrainingCenterDatabase/v2",
    "ext": "http://www.garmin.com/xmlschemas/ActivityExtension/v2",
}
# Same prefixes as _NS_HTTP, with the leading "http://" swapped for "https://".
_NS_HTTPS = {
    prefix: uri.replace("http://", "https://", 1)
    for prefix, uri in _NS_HTTP.items()
}
class TcxParser:
def parse(self, path: Path, raw_bytes: bytes) -> ParsedActivity:
# Some exporters (e.g. Garmin) prepend whitespace before the XML
# declaration, which is technically invalid. Strip it.
# Some exporters prepend whitespace before the XML declaration. Strip it.
root = etree.fromstring(raw_bytes.lstrip())
# Garmin sometimes uses https:// instead of http:// in the namespace URI.
_NS = _NS_HTTPS if b"https://www.garmin.com" in raw_bytes else _NS_HTTP
activities = root.findall(".//tcx:Activity", _NS)
if not activities:
raise ValueError(f"No Activity elements found in {path.name}")