bincio-activity/bincio/extract/strava_csv.py

"""Import metadata from Strava's activities.csv bulk export.

Strava export columns we care about:
  Activity ID, Activity Date, Activity Name, Activity Description, Filename
"""

import csv
import json
import re
from pathlib import Path
from typing import Iterator, Optional


# Candidate strptime() patterns for timestamps in the export.
# NOTE(review): presumably meant for the "Activity Date" column listed in the
# module docstring; nothing in the visible part of this module reads the tuple,
# so confirm it is still needed.
_STRAVA_DATE_FMTS = (
    "%b %d, %Y, %I:%M:%S %p",  # "Jun 1, 2024, 7:30:12 AM" (12-hour clock with AM/PM)
    "%Y-%m-%d %H:%M:%S",  # ISO-like date with 24-hour time, e.g. "2024-06-01 07:30:12"
)


class StravaMetadata:
    """Index of the rows of a Strava ``activities.csv`` bulk export.

    Rows are keyed by the basename of their ``Filename`` column and,
    secondarily, by their ``Activity ID`` column.  When several rows share
    a key, the last one read wins.
    """

    def __init__(self, csv_path: Path) -> None:
        """Read *csv_path* and build both lookup indexes."""
        self._by_filename: dict[str, dict] = {}
        self._by_strava_id: dict[str, dict] = {}
        self._load(csv_path)

    @staticmethod
    def _cell(row: dict, column: str) -> str:
        """Return the stripped text of *column* in *row*, or ``""``.

        ``csv.DictReader`` fills columns that are missing from a short row
        with ``None`` instead of omitting the key, so ``row.get(col, "")``
        alone can still hand back ``None``.
        """
        value = row.get(column)
        return value.strip() if isinstance(value, str) else ""

    def _load(self, path: Path) -> None:
        """Populate the filename and activity-ID indexes from *path*."""
        # utf-8-sig tolerates a leading BOM, which would otherwise end up
        # glued to the first header name.
        with path.open(newline="", encoding="utf-8-sig") as f:
            for row in csv.DictReader(f):
                filename = self._cell(row, "Filename")
                if filename:
                    # Index by basename only, e.g. "activities/1.fit.gz"
                    # is stored under "1.fit.gz".
                    self._by_filename[Path(filename).name] = row
                strava_id = self._cell(row, "Activity ID")
                if strava_id:
                    self._by_strava_id[strava_id] = row

    def lookup(self, source_file: str) -> Optional[dict]:
        """Return the Strava CSV row for a given source filename, or None.

        *source_file* is tried verbatim first; if that misses, its basename
        is tried, so a path such as ``"activities/1.fit.gz"`` resolves to
        the same row as ``"1.fit.gz"``.
        """
        row = self._by_filename.get(source_file)
        if row is None and source_file:
            row = self._by_filename.get(Path(source_file).name)
        return row

    def lookup_by_strava_id(self, strava_id: str) -> Optional[dict]:
        """Return the Strava CSV row for a given Strava activity ID, or None.

        The ID is converted with ``str()`` first, so integer IDs work too.
        """
        return self._by_strava_id.get(str(strava_id))

    def enrich(self, source_file: str, activity: object) -> None:
        """Fill empty fields of *activity* from the matching CSV row.

        Sets ``title``, ``description`` and ``strava_id`` on *activity*
        (mutating it in place), but only where the attribute is currently
        falsy and the CSV cell is non-blank.  Does nothing when no row
        matches *source_file*.
        """
        row = self.lookup(source_file)
        if row is None:
            return

        fields = (
            ("title", "Activity Name"),
            ("description", "Activity Description"),
            ("strava_id", "Activity ID"),
        )
        for attr, column in fields:
            value = self._cell(row, column)
            # Values already present on the activity take precedence.
            if not getattr(activity, attr) and value:
                setattr(activity, attr, value)


# ── Retroactive sidecar update ────────────────────────────────────────────────

def _parse_sidecar(path: Path) -> tuple[dict, str]:
    """Return (frontmatter_dict, body) from a sidecar .md file."""
    import re as _re
    import yaml
    text = path.read_text(encoding="utf-8")
    if text.startswith("---"):
        parts = _re.split(r"^---[ \t]*$", text, maxsplit=2, flags=_re.MULTILINE)
        if len(parts) >= 3:
            fm = yaml.safe_load(parts[1]) or {}
            return fm, parts[2].strip()
    return {}, text.strip()


def _write_sidecar(path: Path, fm: dict, body: str) -> None:
    """Write *fm* as YAML frontmatter, followed by *body*, to *path*.

    Missing parent directories are created.  The frontmatter is dumped in
    block style between two ``---`` fence lines; a non-empty *body* is
    appended after one blank line and terminated with a newline.
    """
    import yaml

    path.parent.mkdir(parents=True, exist_ok=True)

    dumped = yaml.safe_dump(fm, default_flow_style=False, allow_unicode=True)
    fm_text = dumped.strip()
    header = f"---\n{fm_text}\n---\n"
    # An empty body leaves the file as frontmatter only.
    trailer = f"\n{body}\n" if body else ""
    path.write_text(header + trailer, encoding="utf-8")


def _update_sidecar_from_row(sidecar_path: Path, row: dict) -> bool:
    """Create or update a sidecar with CSV title/description.

    Only fills fields that are not already set in the sidecar.
    Returns True if anything changed.
    """
    title = row.get("Activity Name", "").strip()
    description = row.get("Activity Description", "").strip()
    if not title and not description:
        return False

    fm, body = _parse_sidecar(sidecar_path) if sidecar_path.exists() else ({}, "")

    changed = False
    if title and "title" not in fm:
        fm["title"] = title
        changed = True
    if description and not body:
        body = description
        changed = True

    if not changed:
        return False

    _write_sidecar(sidecar_path, fm, body)
    return True


def apply_csv_to_data_dir(data_dir: Path, metadata: "StravaMetadata") -> int:
    """Retroactively apply CSV metadata to existing activities via sidecars.

    Scans all activity JSONs in data_dir/activities/.  For each activity that
    has a strava_id, looks up the corresponding CSV row and creates/updates
    the sidecar in data_dir/edits/ with any missing title or description.

    Only writes fields not already present in the sidecar — manual edits are
    never overwritten.

    Activity files that cannot be read, are not valid JSON, or whose top
    level is not a JSON object are skipped.

    Returns the count of activities whose sidecars were created or updated.
    """
    activities_dir = data_dir / "activities"
    edits_dir = data_dir / "edits"

    if not activities_dir.exists():
        return 0

    updated = 0
    # Sorted so files are processed in a stable order.
    for json_path in sorted(activities_dir.glob("*.json")):
        try:
            detail = json.loads(json_path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # Unreadable file, bad encoding or malformed JSON (both
            # UnicodeDecodeError and JSONDecodeError are ValueErrors).
            continue

        # Valid JSON need not be an object; a list or scalar has no .get().
        if not isinstance(detail, dict):
            continue

        strava_id = detail.get("strava_id")
        if not strava_id:
            continue

        row = metadata.lookup_by_strava_id(str(strava_id))
        if row is None:
            continue

        # The sidecar shares the activity file's stem: <stem>.json -> <stem>.md
        sidecar_path = edits_dir / f"{json_path.stem}.md"
        if _update_sidecar_from_row(sidecar_path, row):
            updated += 1

    return updated