Files
bincio-activity/bincio/extract/parsers/base.py
T
Davide Scaini cd1cdca33b extract: auto-detect gzip by magic bytes, not just .gz extension
Files compressed with gzip but named without .gz (e.g. activity.gpx
containing gzip data) now decompress transparently.
2026-04-16 18:49:01 +02:00

41 lines
1.3 KiB
Python

"""Abstract base class for all activity parsers."""
import gzip
import hashlib
from abc import ABC, abstractmethod
from pathlib import Path
from bincio.extract.models import ParsedActivity
class BaseParser(ABC):
@abstractmethod
def parse(self, path: Path, raw_bytes: bytes) -> ParsedActivity:
"""Parse activity from raw file bytes.
Receives pre-read bytes so the factory can compute the hash once and
handle decompression transparently before dispatching.
"""
@staticmethod
def _sha256(data: bytes) -> str:
return "sha256:" + hashlib.sha256(data).hexdigest()
@staticmethod
def _read_file(path: Path) -> tuple[bytes, bytes]:
"""Return (raw_bytes, decompressed_bytes).
raw_bytes is the original file content (used for hashing).
decompressed_bytes is what parsers should actually parse.
Gzip is handled both by extension (.gz) and by magic bytes (0x1f 0x8b),
so files that are gzip-compressed but named without .gz still parse correctly.
"""
raw = path.read_bytes()
if path.suffix == ".gz" or raw[:2] == b'\x1f\x8b':
try:
return raw, gzip.decompress(raw)
except Exception:
pass # not actually gzip despite the magic bytes — fall through
return raw, raw