From 01db4eb9ae4f5323d4dd9eaefd83ca77730845fc Mon Sep 17 00:00:00 2001 From: Davide Scaini Date: Sat, 11 Apr 2026 08:13:27 +0200 Subject: [PATCH] ingest activities.csv --- bincio/edit/server.py | 145 ++++++++++++++++++++++------------- bincio/extract/strava_csv.py | 118 +++++++++++++++++++++++++--- bincio/serve/server.py | 55 +++++++++++-- scripts/bulk_private.py | 108 ++++++++++++++++++++++++++ site/src/layouts/Base.astro | 20 +++-- 5 files changed, 367 insertions(+), 79 deletions(-) create mode 100644 scripts/bulk_private.py diff --git a/bincio/edit/server.py b/bincio/edit/server.py index eda65cc..f70faee 100644 --- a/bincio/edit/server.py +++ b/bincio/edit/server.py @@ -538,73 +538,108 @@ def _file_suffix(name: str) -> str: @app.post("/api/upload") async def upload_activity( - file: UploadFile = File(...), + files: list[UploadFile] = File(...), store_original: bool = Form(False), ) -> JSONResponse: - """Accept a FIT/GPX/TCX file, extract it, update index.json, and re-merge.""" + """Accept FIT/GPX/TCX files and/or activities.csv, extract, update index, re-merge. + + activities.csv (Strava export format) can be included in the batch to: + - Enrich activity files being uploaded in the same batch (matched by filename) + - Retroactively update sidecars for existing activities (matched by strava_id) + """ + from bincio.extract.ingest import ingest_parsed + from bincio.extract.parsers.factory import parse_file + from bincio.extract.writer import make_activity_id + from bincio.render.merge import merge_all + dd = _get_data_dir() - - name = Path(file.filename or "upload.fit").name # strip any path components - suffix = _file_suffix(name) - if suffix not in _SUPPORTED_SUFFIXES: - raise HTTPException(400, f"Unsupported file type '{Path(name).suffix}'. Expected FIT, GPX, or TCX.") - - _MAX_UPLOAD_BYTES = 50 * 1024 * 1024 # 50 MB - contents = await file.read() - if len(contents) > _MAX_UPLOAD_BYTES: - raise HTTPException(413, f"File too large ({len(contents)} bytes). Maximum is 50 MB.") - staging = dd / "_uploads" staging.mkdir(exist_ok=True) - staged = staging / name - staged.write_bytes(contents) - kept = False - try: - from bincio.extract.metrics import compute - from bincio.extract.parsers.factory import parse_file - from bincio.extract.writer import build_summary, make_activity_id, write_activity, write_index + _MAX_UPLOAD_BYTES = 50 * 1024 * 1024 # 50 MB - activity = parse_file(staged) - metrics = compute(activity) - activity_id = make_activity_id(activity) - - existing_json = dd / "activities" / f"{activity_id}.json" - if existing_json.exists(): - raise HTTPException(409, f"Activity already exists: {activity_id}") - - write_activity(activity, metrics, dd, privacy="public", rdp_epsilon=0.0001) - summary = build_summary(activity, metrics, activity_id, "public") - - # Read current index to preserve owner + existing summaries - index_path = dd / "index.json" - if index_path.exists(): - index_data = json.loads(index_path.read_text(encoding="utf-8")) + # Separate CSV files from activity files + csv_files: list[UploadFile] = [] + activity_files: list[UploadFile] = [] + for f in files: + name = Path(f.filename or "").name.lower() + if name.endswith(".csv"): + csv_files.append(f) else: - index_data = {"owner": {"handle": "unknown"}, "activities": []} - owner = index_data.get("owner", {}) - existing = {s["id"]: s for s in index_data.get("activities", [])} - existing[activity_id] = summary - write_index(list(existing.values()), dd, owner) + activity_files.append(f) - if store_original: - originals_dir = dd / "originals" - originals_dir.mkdir(exist_ok=True) - staged.rename(originals_dir / name) - kept = True + # Build metadata from the first CSV found (activities.csv from Strava export) + metadata = None + if csv_files: + from bincio.extract.strava_csv import StravaMetadata + import tempfile + csv_upload = csv_files[0] + csv_bytes = await csv_upload.read() + with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as tmp: + tmp.write(csv_bytes) + tmp_path = Path(tmp.name) + try: + metadata = StravaMetadata(tmp_path) + finally: + tmp_path.unlink(missing_ok=True) - from bincio.render.merge import merge_all + results = [] + any_added = False + + for file in activity_files: + name = Path(file.filename or "upload.fit").name + suffix = _file_suffix(name) + if suffix not in _SUPPORTED_SUFFIXES: + results.append({"name": name, "ok": False, "error": f"Unsupported file type '{Path(name).suffix}'"}) + continue + + contents = await file.read() + if len(contents) > _MAX_UPLOAD_BYTES: + results.append({"name": name, "ok": False, "error": "File too large (max 50 MB)"}) + continue + + staged = staging / name + staged.write_bytes(contents) + kept = False + try: + activity = parse_file(staged) + + # Enrich with CSV metadata when available (matched by filename) + if metadata is not None: + metadata.enrich(name, activity) + + activity_id = make_activity_id(activity) + if (dd / "activities" / f"{activity_id}.json").exists(): + results.append({"name": name, "ok": False, "error": "duplicate"}) + continue + + ingest_parsed(activity, dd, privacy="public") + + if store_original: + originals_dir = dd / "originals" + originals_dir.mkdir(exist_ok=True) + staged.rename(originals_dir / name) + kept = True + + results.append({"name": name, "ok": True, "id": activity_id}) + any_added = True + except Exception: + results.append({"name": name, "ok": False, "error": "Processing failed"}) + finally: + if not kept: + staged.unlink(missing_ok=True) + + # Retroactively update sidecars for existing activities matched by strava_id + csv_updates = 0 + if metadata is not None: + from bincio.extract.strava_csv import apply_csv_to_data_dir + csv_updates = apply_csv_to_data_dir(dd, metadata) + + if any_added or csv_updates: merge_all(dd) - except HTTPException: - raise - except Exception as exc: - raise HTTPException(422, f"Failed to process activity file: {type(exc).__name__}") - finally: - if not kept: - staged.unlink(missing_ok=True) - - return JSONResponse({"ok": True, "id": activity_id}) + added = [r for r in results if r["ok"]] + return JSONResponse({"ok": True, "added": len(added), "csv_updates": csv_updates, "results": results}) @app.post("/api/import-bas") diff --git a/bincio/extract/strava_csv.py b/bincio/extract/strava_csv.py index 472d847..a77f0b0 100644 --- a/bincio/extract/strava_csv.py +++ b/bincio/extract/strava_csv.py @@ -1,14 +1,14 @@ """Import metadata from Strava's activities.csv bulk export. Strava export columns we care about: - Activity ID, Activity Date, Activity Name, Activity Type, - Activity Description, Filename + Activity ID, Activity Date, Activity Name, Activity Description, Filename """ import csv +import json import re from pathlib import Path -from typing import Optional +from typing import Iterator, Optional _STRAVA_DATE_FMTS = ( @@ -18,10 +18,11 @@ _STRAVA_DATE_FMTS = ( class StravaMetadata: - """Maps original filename → Strava metadata.""" + """Maps original filename → Strava metadata, with secondary strava_id index.""" def __init__(self, csv_path: Path) -> None: self._by_filename: dict[str, dict] = {} + self._by_strava_id: dict[str, dict] = {} self._load(csv_path) def _load(self, path: Path) -> None: @@ -29,16 +30,21 @@ class StravaMetadata: reader = csv.DictReader(f) for row in reader: filename = row.get("Filename", "").strip() - if not filename: - continue - # Strava stores paths like "activities/12345.fit.gz" - basename = Path(filename).name - self._by_filename[basename] = row + if filename: + basename = Path(filename).name + self._by_filename[basename] = row + strava_id = row.get("Activity ID", "").strip() + if strava_id: + self._by_strava_id[strava_id] = row def lookup(self, source_file: str) -> Optional[dict]: """Return the Strava CSV row for a given source filename, or None.""" return self._by_filename.get(source_file) + def lookup_by_strava_id(self, strava_id: str) -> Optional[dict]: + """Return the Strava CSV row for a given Strava activity ID, or None.""" + return self._by_strava_id.get(str(strava_id)) + def enrich(self, source_file: str, activity: object) -> None: """Mutate a ParsedActivity with Strava metadata if found.""" row = self.lookup(source_file) @@ -53,3 +59,97 @@ class StravaMetadata: if not activity.strava_id and row.get("Activity ID"): # type: ignore[attr-defined] activity.strava_id = row["Activity ID"].strip() # type: ignore[attr-defined] + + +# ── Retroactive sidecar update ──────────────────────────────────────────────── + +def _parse_sidecar(path: Path) -> tuple[dict, str]: + """Return (frontmatter_dict, body) from a sidecar .md file.""" + import re as _re + import yaml + text = path.read_text(encoding="utf-8") + if text.startswith("---"): + parts = _re.split(r"^---[ \t]*$", text, maxsplit=2, flags=_re.MULTILINE) + if len(parts) >= 3: + fm = yaml.safe_load(parts[1]) or {} + return fm, parts[2].strip() + return {}, text.strip() + + +def _write_sidecar(path: Path, fm: dict, body: str) -> None: + import yaml + path.parent.mkdir(parents=True, exist_ok=True) + fm_text = yaml.safe_dump(fm, default_flow_style=False, allow_unicode=True).strip() + content = f"---\n{fm_text}\n---\n" + if body: + content += f"\n{body}\n" + path.write_text(content, encoding="utf-8") + + +def _update_sidecar_from_row(sidecar_path: Path, row: dict) -> bool: + """Create or update a sidecar with CSV title/description. + + Only fills fields that are not already set in the sidecar. + Returns True if anything changed. + """ + title = row.get("Activity Name", "").strip() + description = row.get("Activity Description", "").strip() + if not title and not description: + return False + + fm, body = _parse_sidecar(sidecar_path) if sidecar_path.exists() else ({}, "") + + changed = False + if title and "title" not in fm: + fm["title"] = title + changed = True + if description and not body: + body = description + changed = True + + if not changed: + return False + + _write_sidecar(sidecar_path, fm, body) + return True + + +def apply_csv_to_data_dir(data_dir: Path, metadata: StravaMetadata) -> int: + """Retroactively apply CSV metadata to existing activities via sidecars. + + Scans all activity JSONs in data_dir/activities/. For each activity that + has a strava_id, looks up the corresponding CSV row and creates/updates + the sidecar in data_dir/edits/ with any missing title or description. + + Only writes fields not already present in the sidecar — manual edits are + never overwritten. + + Returns the count of activities whose sidecars were created or updated. + """ + activities_dir = data_dir / "activities" + edits_dir = data_dir / "edits" + + if not activities_dir.exists(): + return 0 + + updated = 0 + for json_path in sorted(activities_dir.glob("*.json")): + try: + detail = json.loads(json_path.read_text(encoding="utf-8")) + except Exception: + continue + + strava_id = detail.get("strava_id") + if not strava_id: + continue + + row = metadata.lookup_by_strava_id(str(strava_id)) + if row is None: + continue + + activity_id = json_path.stem + sidecar_path = edits_dir / f"{activity_id}.md" + if _update_sidecar_from_row(sidecar_path, row): + updated += 1 + + return updated diff --git a/bincio/serve/server.py b/bincio/serve/server.py index c26292d..ef69c02 100644 --- a/bincio/serve/server.py +++ b/bincio/serve/server.py @@ -530,6 +530,12 @@ async def upload_activity( store_original: bool = Form(False), bincio_session: Optional[str] = Cookie(default=None), ) -> JSONResponse: + """Accept FIT/GPX/TCX files and/or activities.csv, extract, update index, re-merge. + + activities.csv (Strava export format) can be included in the batch to: + - Enrich activity files being uploaded in the same batch (matched by filename) + - Retroactively update sidecars for existing activities (matched by strava_id) + """ from bincio.extract.ingest import ingest_parsed from bincio.extract.parsers.factory import parse_file from bincio.extract.writer import make_activity_id @@ -540,13 +546,36 @@ async def upload_activity( staging = dd / "_uploads" staging.mkdir(exist_ok=True) + # Separate CSV files from activity files + csv_files: list[UploadFile] = [] + activity_files: list[UploadFile] = [] + for f in files: + fname = Path(f.filename or "").name.lower() + if fname.endswith(".csv"): + csv_files.append(f) + else: + activity_files.append(f) + + # Build metadata from the first CSV found (activities.csv from Strava export) + metadata = None + if csv_files: + from bincio.extract.strava_csv import StravaMetadata + import tempfile + csv_bytes = await csv_files[0].read() + with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as tmp: + tmp.write(csv_bytes) + tmp_path = Path(tmp.name) + try: + metadata = StravaMetadata(tmp_path) + finally: + tmp_path.unlink(missing_ok=True) + results = [] any_added = False - for file in files: + for file in activity_files: name = Path(file.filename or "upload.fit").name - p = Path(name.lower()) - suffix = (p.stem.rsplit(".", 1)[-1].join([".", ".gz"]) if "." in p.stem else ".gz") if p.suffix == ".gz" else p.suffix + suffix = _file_suffix(name) if suffix not in _SUPPORTED_SUFFIXES: results.append({"name": name, "ok": False, "error": f"Unsupported file type '{suffix}'"}) continue @@ -561,6 +590,11 @@ async def upload_activity( kept = False try: activity = parse_file(staged) + + # Enrich with CSV metadata when available (matched by filename) + if metadata is not None: + metadata.enrich(name, activity) + activity_id = make_activity_id(activity) if (dd / "activities" / f"{activity_id}.json").exists(): results.append({"name": name, "ok": False, "error": "duplicate"}) @@ -573,18 +607,25 @@ async def upload_activity( kept = True results.append({"name": name, "ok": True, "id": activity_id}) any_added = True - except Exception as exc: + except Exception: results.append({"name": name, "ok": False, "error": "Processing failed"}) finally: if not kept: staged.unlink(missing_ok=True) - if any_added: + # Retroactively update sidecars for existing activities matched by strava_id + csv_updates = 0 + if metadata is not None: + from bincio.extract.strava_csv import apply_csv_to_data_dir + csv_updates = apply_csv_to_data_dir(dd, metadata) + + if any_added or csv_updates: merge_all(dd) - _trigger_rebuild(user.handle) + if any_added: + _trigger_rebuild(user.handle) added = [r for r in results if r["ok"]] - return JSONResponse({"ok": True, "added": len(added), "results": results}) + return JSONResponse({"ok": True, "added": len(added), "csv_updates": csv_updates, "results": results}) @app.post("/api/upload/strava-zip") diff --git a/scripts/bulk_private.py b/scripts/bulk_private.py new file mode 100644 index 0000000..7a33941 --- /dev/null +++ b/scripts/bulk_private.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +""" +Bulk-set activities matching a title pattern to private by writing sidecar files. + +Usage: + uv run python scripts/bulk_private.py --data-dir /var/bincio/data/brut --match "morning walk" "afternoon walk" + + --dry-run Print what would be changed without writing anything. + --handle Subdirectory name (if data-dir is the root, not the user dir). +""" +from __future__ import annotations + +import argparse +import json +import re +import sys +from pathlib import Path + +import yaml + + +def parse_sidecar(path: Path) -> tuple[dict, str]: + text = path.read_text(encoding="utf-8") + if text.startswith("---"): + parts = re.split(r"^---[ \t]*$", text, maxsplit=2, flags=re.MULTILINE) + if len(parts) >= 3: + fm = yaml.safe_load(parts[1]) or {} + return fm, parts[2].strip() + return {}, text.strip() + + +def write_sidecar(path: Path, fm: dict, body: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + content = "---\n" + yaml.dump(fm, allow_unicode=True, default_flow_style=False) + "---\n" + if body: + content += "\n" + body + "\n" + path.write_text(content, encoding="utf-8") + + +def main() -> None: + ap = argparse.ArgumentParser() + ap.add_argument("--data-dir", required=True, help="User data directory (e.g. /var/bincio/data/brut)") + ap.add_argument("--handle", default=None, help="Handle subdir if data-dir is the instance root") + ap.add_argument("--match", nargs="+", required=True, help="Title patterns to match (case-insensitive substring)") + ap.add_argument("--dry-run", action="store_true", help="Print changes without writing") + args = ap.parse_args() + + data_dir = Path(args.data_dir) + if args.handle: + data_dir = data_dir / args.handle + + index_path = data_dir / "index.json" + if not index_path.exists(): + sys.exit(f"ERROR: index.json not found at {index_path}") + + index = json.loads(index_path.read_text(encoding="utf-8")) + activities = index.get("activities", []) + + patterns = [p.lower() for p in args.match] + + matched = [ + a for a in activities + if any(pat in (a.get("title") or "").lower() for pat in patterns) + ] + + if not matched: + print("No activities matched.") + return + + print(f"Found {len(matched)} matching activities:") + edits_dir = data_dir / "edits" + changed = 0 + + for act in matched: + aid = act["id"] + title = act.get("title", "(no title)") + date = act.get("started_at", "")[:10] + sidecar_path = edits_dir / f"{aid}.md" + + # Load existing sidecar if present + if sidecar_path.exists(): + fm, body = parse_sidecar(sidecar_path) + else: + fm, body = {}, "" + + if fm.get("private") is True: + print(f" [already private] {date} {title}") + continue + + print(f" {'[DRY RUN] ' if args.dry_run else ''}→ private {date} {title}") + if not args.dry_run: + fm["private"] = True + write_sidecar(sidecar_path, fm, body) + changed += 1 + + if args.dry_run: + print("\nDry run — nothing written. Re-run without --dry-run to apply.") + else: + print(f"\n{changed} sidecar(s) written.") + if changed: + print("Running merge_all …") + from bincio.render.merge import merge_all + n = merge_all(data_dir) + print(f"merge_all done ({n} sidecar(s) applied).") + + +if __name__ == "__main__": + main() diff --git a/site/src/layouts/Base.astro b/site/src/layouts/Base.astro index b531a45..a44c785 100644 --- a/site/src/layouts/Base.astro +++ b/site/src/layouts/Base.astro @@ -275,8 +275,8 @@ try { id="upload-drop" class="border-2 border-dashed border-zinc-700 rounded-lg p-8 text-center text-zinc-500 text-sm cursor-pointer hover:border-zinc-500 hover:text-zinc-300 transition-colors" > -
Drop FIT, GPX, or TCX files
or click to browse
- +
Drop FIT, GPX, TCX, or activities.csv
or click to browse
+