ingest activities.csv

This commit is contained in:
Davide Scaini
2026-04-11 08:13:27 +02:00
parent cbd5a98cd3
commit 01db4eb9ae
5 changed files with 367 additions and 79 deletions
+109 -9
View File
@@ -1,14 +1,14 @@
"""Import metadata from Strava's activities.csv bulk export.
Strava export columns we care about:
Activity ID, Activity Date, Activity Name, Activity Type,
Activity Description, Filename
Activity ID, Activity Date, Activity Name, Activity Description, Filename
"""
import csv
import json
import re
from pathlib import Path
from typing import Optional
from typing import Iterator, Optional
_STRAVA_DATE_FMTS = (
@@ -18,10 +18,11 @@ _STRAVA_DATE_FMTS = (
class StravaMetadata:
"""Maps original filename → Strava metadata."""
"""Maps original filename → Strava metadata, with secondary strava_id index."""
def __init__(self, csv_path: Path) -> None:
    """Build the filename and Strava-ID indexes from the CSV at *csv_path*."""
    # Rows keyed by the basename of their "Filename" column.
    self._by_filename: dict[str, dict] = {}
    # The same rows keyed by their stripped "Activity ID" column.
    self._by_strava_id: dict[str, dict] = {}
    # Both indexes must exist before _load() starts filling them.
    self._load(csv_path)
def _load(self, path: Path) -> None:
@@ -29,16 +30,21 @@ class StravaMetadata:
reader = csv.DictReader(f)
for row in reader:
filename = row.get("Filename", "").strip()
if not filename:
continue
# Strava stores paths like "activities/12345.fit.gz"
basename = Path(filename).name
self._by_filename[basename] = row
if filename:
basename = Path(filename).name
self._by_filename[basename] = row
strava_id = row.get("Activity ID", "").strip()
if strava_id:
self._by_strava_id[strava_id] = row
def lookup(self, source_file: str) -> Optional[dict]:
    """Fetch the Strava CSV row indexed under *source_file*.

    Returns the row dict, or None when no row is stored for that filename.
    """
    try:
        return self._by_filename[source_file]
    except KeyError:
        return None
def lookup_by_strava_id(self, strava_id: str) -> Optional[dict]:
    """Fetch the Strava CSV row indexed under the given Strava activity ID.

    The ID is coerced with ``str()`` before the lookup, so a non-string
    value such as an int still matches the string keys of the index.
    Returns the row dict, or None when the ID is unknown.
    """
    key = str(strava_id)
    index = self._by_strava_id
    return index[key] if key in index else None
def enrich(self, source_file: str, activity: object) -> None:
"""Mutate a ParsedActivity with Strava metadata if found."""
row = self.lookup(source_file)
@@ -53,3 +59,97 @@ class StravaMetadata:
if not activity.strava_id and row.get("Activity ID"): # type: ignore[attr-defined]
activity.strava_id = row["Activity ID"].strip() # type: ignore[attr-defined]
# ── Retroactive sidecar update ────────────────────────────────────────────────
def _parse_sidecar(path: Path) -> tuple[dict, str]:
    """Return (frontmatter_dict, body) from a sidecar .md file.

    A sidecar may open with a YAML frontmatter block fenced by two lines that
    consist of ``---`` (trailing spaces/tabs allowed); everything after the
    closing fence is the body.  The body is always returned stripped.

    The whole text is returned as the body, with an empty frontmatter dict,
    when any of the following holds, so that no text is ever dropped and the
    first element of the result is always a dict:

    * the file does not open with a fence line (a first line such as
      ``---- heading`` is ordinary text, not a fence);
    * there is no closing fence;
    * the fenced YAML is a non-empty value that is not a mapping
      (e.g. a list or a bare scalar).

    Errors from ``Path.read_text`` and from ``yaml.safe_load`` (malformed
    YAML between the fences) propagate to the caller.
    """
    text = path.read_text(encoding="utf-8")
    if text.startswith("---"):
        parts = re.split(r"^---[ \t]*$", text, maxsplit=2, flags=re.MULTILINE)
        # parts[0] is whatever precedes the first fence.  It must be empty,
        # otherwise the first fence found is not the opening line of the file
        # and treating parts[1] as frontmatter would discard parts[0].
        if len(parts) >= 3 and parts[0] == "":
            import yaml  # lazy: only needed when a frontmatter block exists

            fm = yaml.safe_load(parts[1]) or {}
            if isinstance(fm, dict):
                return fm, parts[2].strip()
    return {}, text.strip()
def _write_sidecar(path: Path, fm: dict, body: str) -> None:
    """Serialize *fm* and *body* into the sidecar file at *path*.

    Layout: a ``---`` fence, the YAML dump of *fm*, a closing ``---`` fence
    and, only when *body* is non-empty, a blank line followed by the body and
    a trailing newline.  Missing parent directories are created and an
    existing file is overwritten.
    """
    import yaml

    path.parent.mkdir(parents=True, exist_ok=True)
    dumped = yaml.safe_dump(fm, default_flow_style=False, allow_unicode=True)
    pieces = ["---\n", dumped.strip(), "\n---\n"]
    if body:
        pieces.extend(("\n", body, "\n"))
    path.write_text("".join(pieces), encoding="utf-8")
def _update_sidecar_from_row(sidecar_path: Path, row: dict) -> bool:
    """Create or update a sidecar with CSV title/description.

    Only fills fields that are not already set in the sidecar: the title is
    stored under the ``title`` frontmatter key when that key is absent, and
    the description becomes the body when the existing body is empty.

    Args:
        sidecar_path: Location of the sidecar .md file; it need not exist yet.
        row: One CSV row.  The "Activity Name" and "Activity Description"
            columns are read; a missing column or a ``None`` value (which
            ``csv.DictReader`` yields for rows shorter than the header) is
            treated as empty.

    Returns:
        True if the sidecar was written, False if there was nothing to add.
    """
    # ``or ""`` covers both a missing key and an explicit None value, neither
    # of which can be stripped.
    title = (row.get("Activity Name") or "").strip()
    description = (row.get("Activity Description") or "").strip()
    if not title and not description:
        return False

    fm, body = _parse_sidecar(sidecar_path) if sidecar_path.exists() else ({}, "")

    changed = False
    if title and "title" not in fm:
        fm["title"] = title
        changed = True
    if description and not body:
        body = description
        changed = True

    # Avoid touching the file when the sidecar already has everything.
    if changed:
        _write_sidecar(sidecar_path, fm, body)
    return changed
def apply_csv_to_data_dir(data_dir: Path, metadata: StravaMetadata) -> int:
    """Retroactively apply CSV metadata to existing activities via sidecars.

    Scans all activity JSONs in data_dir/activities/ in sorted order.  For
    each activity that has a truthy ``strava_id``, looks up the corresponding
    CSV row and creates/updates the sidecar data_dir/edits/<stem>.md with any
    missing title or description.  Only fields not already present in the
    sidecar are written — manual edits are never overwritten.

    Activity files that cannot be read, are not valid JSON, or whose top-level
    value is not a JSON object are skipped.  Errors raised while reading or
    writing a sidecar are not caught here and propagate to the caller.

    Args:
        data_dir: Root directory holding the ``activities`` and ``edits``
            sub-directories.
        metadata: Loaded CSV index providing ``lookup_by_strava_id``.

    Returns:
        The count of activities whose sidecars were created or updated
        (0 when data_dir/activities does not exist).
    """
    activities_dir = data_dir / "activities"
    edits_dir = data_dir / "edits"

    if not activities_dir.exists():
        return 0

    updated = 0
    for json_path in sorted(activities_dir.glob("*.json")):
        try:
            detail = json.loads(json_path.read_text(encoding="utf-8"))
        except (OSError, ValueError, RecursionError):
            # Unreadable file, bad encoding, malformed or absurdly nested
            # JSON (JSONDecodeError and UnicodeDecodeError are ValueErrors):
            # best-effort scan, so skip just this activity.
            continue

        # Valid JSON can still be a list or scalar, which has no .get().
        if not isinstance(detail, dict):
            continue

        strava_id = detail.get("strava_id")
        if not strava_id:
            continue

        row = metadata.lookup_by_strava_id(str(strava_id))
        if row is None:
            continue

        # The sidecar shares the activity JSON's file stem.
        sidecar_path = edits_dir / f"{json_path.stem}.md"
        if _update_sidecar_from_row(sidecar_path, row):
            updated += 1

    return updated