bincio-activity/bincio/extract/parsers/base.py

"""Abstract base class for all activity parsers."""

import gzip
import hashlib
import zlib
from abc import ABC, abstractmethod
from pathlib import Path

from bincio.extract.models import ParsedActivity


class BaseParser(ABC):
    """Abstract base for activity parsers.

    Subclasses implement :meth:`parse`. The static helpers hash file content
    and read a file from disk with transparent gzip handling.
    """

    @abstractmethod
    def parse(self, path: Path, raw_bytes: bytes) -> ParsedActivity:
        """Parse activity from raw file bytes.

        Receives pre-read bytes so the factory can compute the hash once and
        handle decompression transparently before dispatching.
        """

    @staticmethod
    def _sha256(data: bytes) -> str:
        """Return the SHA-256 hex digest of *data*, prefixed with ``sha256:``."""
        return "sha256:" + hashlib.sha256(data).hexdigest()

    @staticmethod
    def _read_file(path: Path) -> tuple[bytes, bytes]:
        """Return (raw_bytes, decompressed_bytes).

        raw_bytes is the original file content (used for hashing).
        decompressed_bytes is what parsers should actually parse.

        Gzip is handled both by extension (.gz) and by magic bytes (0x1f 0x8b),
        so files that are gzip-compressed but named without .gz still parse correctly.

        If the content looks like gzip but cannot be decompressed (wrong
        header, truncated stream, corrupt deflate data), the raw bytes are
        returned for both elements. Errors from reading the file itself
        propagate to the caller.
        """
        raw = path.read_bytes()
        looks_gzipped = path.suffix == ".gz" or raw[:2] == b'\x1f\x8b'
        if not looks_gzipped:
            return raw, raw
        try:
            return raw, gzip.decompress(raw)
        except (OSError, EOFError, zlib.error):
            # Only the failures gzip.decompress raises for invalid input:
            # gzip.BadGzipFile (an OSError) for a bad header or checksum,
            # EOFError for a truncated stream, zlib.error for corrupt deflate
            # data. Anything else is a real problem and must not be hidden.
            return raw, raw