ingest activities.csv

2026-04-11 08:13:27 +02:00
parent cbd5a98cd3
commit 01db4eb9ae
5 changed files with 367 additions and 79 deletions
@@ -538,73 +538,108 @@ def _file_suffix(name: str) -> str:

@app.post("/api/upload")
 async def upload_activity(
-    file: UploadFile = File(...),
+    files: list[UploadFile] = File(...),
    store_original: bool = Form(False),
 ) -> JSONResponse:
-    """Accept a FIT/GPX/TCX file, extract it, update index.json, and re-merge."""
+    """Accept FIT/GPX/TCX files and/or activities.csv, extract, update index, re-merge.
+
+    activities.csv (Strava export format) can be included in the batch to:
+      - Enrich activity files being uploaded in the same batch (matched by filename)
+      - Retroactively update sidecars for existing activities (matched by strava_id)
+    """
+    from bincio.extract.ingest import ingest_parsed
+    from bincio.extract.parsers.factory import parse_file
+    from bincio.extract.writer import make_activity_id
+    from bincio.render.merge import merge_all
+
    dd = _get_data_dir()
-
-    name = Path(file.filename or "upload.fit").name  # strip any path components
-    suffix = _file_suffix(name)
-    if suffix not in _SUPPORTED_SUFFIXES:
-        raise HTTPException(400, f"Unsupported file type '{Path(name).suffix}'. Expected FIT, GPX, or TCX.")
-
-    _MAX_UPLOAD_BYTES = 50 * 1024 * 1024  # 50 MB
-    contents = await file.read()
-    if len(contents) > _MAX_UPLOAD_BYTES:
-        raise HTTPException(413, f"File too large ({len(contents)} bytes). Maximum is 50 MB.")
-
    staging = dd / "_uploads"
    staging.mkdir(exist_ok=True)
-    staged = staging / name
-    staged.write_bytes(contents)

-    kept = False
-    try:
-        from bincio.extract.metrics import compute
-        from bincio.extract.parsers.factory import parse_file
-        from bincio.extract.writer import build_summary, make_activity_id, write_activity, write_index
+    _MAX_UPLOAD_BYTES = 50 * 1024 * 1024  # 50 MB

-        activity = parse_file(staged)
-        metrics = compute(activity)
-        activity_id = make_activity_id(activity)
-
-        existing_json = dd / "activities" / f"{activity_id}.json"
-        if existing_json.exists():
-            raise HTTPException(409, f"Activity already exists: {activity_id}")
-
-        write_activity(activity, metrics, dd, privacy="public", rdp_epsilon=0.0001)
-        summary = build_summary(activity, metrics, activity_id, "public")
-
-        # Read current index to preserve owner + existing summaries
-        index_path = dd / "index.json"
-        if index_path.exists():
-            index_data = json.loads(index_path.read_text(encoding="utf-8"))
+    # Separate CSV files from activity files
+    csv_files: list[UploadFile] = []
+    activity_files: list[UploadFile] = []
+    for f in files:
+        name = Path(f.filename or "").name.lower()
+        if name.endswith(".csv"):
+            csv_files.append(f)
        else:
-            index_data = {"owner": {"handle": "unknown"}, "activities": []}
-        owner = index_data.get("owner", {})
-        existing = {s["id"]: s for s in index_data.get("activities", [])}
-        existing[activity_id] = summary
-        write_index(list(existing.values()), dd, owner)
+            activity_files.append(f)

-        if store_original:
-            originals_dir = dd / "originals"
-            originals_dir.mkdir(exist_ok=True)
-            staged.rename(originals_dir / name)
-            kept = True
+    # Build metadata from the first CSV found (activities.csv from Strava export)
+    metadata = None
+    if csv_files:
+        from bincio.extract.strava_csv import StravaMetadata
+        import tempfile
+        csv_upload = csv_files[0]
+        csv_bytes = await csv_upload.read()
+        with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as tmp:
+            tmp.write(csv_bytes)
+            tmp_path = Path(tmp.name)
+        try:
+            metadata = StravaMetadata(tmp_path)
+        finally:
+            tmp_path.unlink(missing_ok=True)

-        from bincio.render.merge import merge_all
+    results = []
+    any_added = False
+
+    for file in activity_files:
+        name = Path(file.filename or "upload.fit").name
+        suffix = _file_suffix(name)
+        if suffix not in _SUPPORTED_SUFFIXES:
+            results.append({"name": name, "ok": False, "error": f"Unsupported file type '{Path(name).suffix}'"})
+            continue
+
+        contents = await file.read()
+        if len(contents) > _MAX_UPLOAD_BYTES:
+            results.append({"name": name, "ok": False, "error": "File too large (max 50 MB)"})
+            continue
+
+        staged = staging / name
+        staged.write_bytes(contents)
+        kept = False
+        try:
+            activity = parse_file(staged)
+
+            # Enrich with CSV metadata when available (matched by filename)
+            if metadata is not None:
+                metadata.enrich(name, activity)
+
+            activity_id = make_activity_id(activity)
+            if (dd / "activities" / f"{activity_id}.json").exists():
+                results.append({"name": name, "ok": False, "error": "duplicate"})
+                continue
+
+            ingest_parsed(activity, dd, privacy="public")
+
+            if store_original:
+                originals_dir = dd / "originals"
+                originals_dir.mkdir(exist_ok=True)
+                staged.rename(originals_dir / name)
+                kept = True
+
+            results.append({"name": name, "ok": True, "id": activity_id})
+            any_added = True
+        except Exception:
+            results.append({"name": name, "ok": False, "error": "Processing failed"})
+        finally:
+            if not kept:
+                staged.unlink(missing_ok=True)
+
+    # Retroactively update sidecars for existing activities matched by strava_id
+    csv_updates = 0
+    if metadata is not None:
+        from bincio.extract.strava_csv import apply_csv_to_data_dir
+        csv_updates = apply_csv_to_data_dir(dd, metadata)
+
+    if any_added or csv_updates:
        merge_all(dd)

-    except HTTPException:
-        raise
-    except Exception as exc:
-        raise HTTPException(422, f"Failed to process activity file: {type(exc).__name__}")
-    finally:
-        if not kept:
-            staged.unlink(missing_ok=True)
-
-    return JSONResponse({"ok": True, "id": activity_id})
+    added = [r for r in results if r["ok"]]
+    return JSONResponse({"ok": True, "added": len(added), "csv_updates": csv_updates, "results": results})


@app.post("/api/import-bas")
@@ -1,14 +1,14 @@
 """Import metadata from Strava's activities.csv bulk export.

 Strava export columns we care about:
-  Activity ID, Activity Date, Activity Name, Activity Type,
-  Activity Description, Filename
+  Activity ID, Activity Date, Activity Name, Activity Description, Filename
 """

 import csv
+import json
 import re
 from pathlib import Path
-from typing import Optional
+from typing import Iterator, Optional


 _STRAVA_DATE_FMTS = (
@@ -18,10 +18,11 @@ _STRAVA_DATE_FMTS = (


 class StravaMetadata:
-    """Maps original filename → Strava metadata."""
+    """Maps original filename → Strava metadata, with secondary strava_id index."""

    def __init__(self, csv_path: Path) -> None:
        self._by_filename: dict[str, dict] = {}
+        self._by_strava_id: dict[str, dict] = {}
        self._load(csv_path)

    def _load(self, path: Path) -> None:
@@ -29,16 +30,21 @@ class StravaMetadata:
            reader = csv.DictReader(f)
            for row in reader:
                filename = row.get("Filename", "").strip()
-                if not filename:
-                    continue
-                # Strava stores paths like "activities/12345.fit.gz"
-                basename = Path(filename).name
-                self._by_filename[basename] = row
+                if filename:
+                    basename = Path(filename).name
+                    self._by_filename[basename] = row
+                strava_id = row.get("Activity ID", "").strip()
+                if strava_id:
+                    self._by_strava_id[strava_id] = row

    def lookup(self, source_file: str) -> Optional[dict]:
        """Return the Strava CSV row for a given source filename, or None."""
        return self._by_filename.get(source_file)

+    def lookup_by_strava_id(self, strava_id: str) -> Optional[dict]:
+        """Look up the CSV row recorded for a Strava activity ID.
+
+        The ID is converted with ``str()`` before the lookup, so an integer
+        ID finds the same row as its string form.  Returns ``None`` when no
+        row is indexed under that ID.
+        """
+        key = str(strava_id)
+        return self._by_strava_id.get(key)
+
    def enrich(self, source_file: str, activity: object) -> None:
        """Mutate a ParsedActivity with Strava metadata if found."""
        row = self.lookup(source_file)
@@ -53,3 +59,97 @@ class StravaMetadata:

        if not activity.strava_id and row.get("Activity ID"):  # type: ignore[attr-defined]
            activity.strava_id = row["Activity ID"].strip()  # type: ignore[attr-defined]
+
+
+# ── Retroactive sidecar update ────────────────────────────────────────────────
+
+def _parse_sidecar(path: Path) -> tuple[dict, str]:
+    """Split a sidecar ``.md`` file into (front matter, body).
+
+    The file is treated as having front matter when it starts with ``---``
+    and contains a second line consisting of ``---`` (optionally followed by
+    spaces or tabs).  The text between the two markers is parsed with
+    ``yaml.safe_load``; an empty result becomes ``{}``.  In every other case
+    the front matter is ``{}`` and the whole stripped file is the body.
+    """
+    import re as _re
+    import yaml
+
+    raw = path.read_text(encoding="utf-8")
+    if not raw.startswith("---"):
+        return {}, raw.strip()
+
+    # maxsplit=2 yields at most: text before the opening marker, the YAML
+    # block, and everything after the closing marker.
+    pieces = _re.split(r"^---[ \t]*$", raw, maxsplit=2, flags=_re.MULTILINE)
+    if len(pieces) < 3:
+        return {}, raw.strip()
+
+    front_matter = yaml.safe_load(pieces[1]) or {}
+    return front_matter, pieces[2].strip()
+
+
+def _write_sidecar(path: Path, fm: dict, body: str) -> None:
+    """Write a sidecar file: YAML front matter between ``---`` lines, then body.
+
+    Parent directories are created when missing.  The body section (preceded
+    by a blank line) is only emitted when ``body`` is non-empty.
+    """
+    import yaml
+
+    path.parent.mkdir(parents=True, exist_ok=True)
+    front_matter = yaml.safe_dump(fm, default_flow_style=False, allow_unicode=True).strip()
+    sections = [f"---\n{front_matter}\n---\n"]
+    if body:
+        sections.append(f"\n{body}\n")
+    path.write_text("".join(sections), encoding="utf-8")
+
+
+def _update_sidecar_from_row(sidecar_path: Path, row: dict) -> bool:
+    """Create or update a sidecar with the title/description from a CSV row.
+
+    Only fills fields that are not already set in the sidecar: the title is
+    written when the front matter has no ``title`` key, and the description
+    becomes the body when the existing body is empty.
+
+    Args:
+        sidecar_path: Location of the sidecar ``.md`` file; it does not have
+            to exist yet.
+        row: One CSV row; the ``Activity Name`` and ``Activity Description``
+            keys are read.  Missing keys and ``None`` values count as empty.
+
+    Returns:
+        True if the sidecar was written, False if nothing needed to change.
+    """
+    # csv.DictReader fills the missing cells of a short row with None, so a
+    # key can be present without holding a string; normalise before strip().
+    title = (row.get("Activity Name") or "").strip()
+    description = (row.get("Activity Description") or "").strip()
+    if not title and not description:
+        return False
+
+    fm, body = _parse_sidecar(sidecar_path) if sidecar_path.exists() else ({}, "")
+
+    changed = False
+    if title and "title" not in fm:
+        fm["title"] = title
+        changed = True
+    if description and not body:
+        body = description
+        changed = True
+
+    if not changed:
+        return False
+
+    _write_sidecar(sidecar_path, fm, body)
+    return True
+
+
+def apply_csv_to_data_dir(data_dir: Path, metadata: StravaMetadata) -> int:
+    """Retroactively apply CSV metadata to existing activities via sidecars.
+
+    Scans all activity JSONs in data_dir/activities/.  For each activity that
+    has a strava_id, looks up the corresponding CSV row and creates/updates
+    the sidecar in data_dir/edits/ with any missing title or description.
+
+    Only writes fields not already present in the sidecar — manual edits are
+    never overwritten.
+
+    Activity files that cannot be read, are not valid JSON, or do not hold a
+    JSON object at the top level are skipped.
+
+    Args:
+        data_dir: Directory containing ``activities/`` and ``edits/``.
+        metadata: Loaded Strava CSV, queried by Strava activity ID.
+
+    Returns:
+        The count of activities whose sidecars were created or updated;
+        0 when ``activities/`` does not exist.
+    """
+    activities_dir = data_dir / "activities"
+    edits_dir = data_dir / "edits"
+
+    if not activities_dir.exists():
+        return 0
+
+    updated = 0
+    # Sorted so the files are processed in a stable order.
+    for json_path in sorted(activities_dir.glob("*.json")):
+        try:
+            detail = json.loads(json_path.read_text(encoding="utf-8"))
+        except (OSError, ValueError):
+            # Unreadable file, bad encoding or malformed JSON (the decode
+            # errors are ValueError subclasses): best-effort, skip the file.
+            continue
+
+        # A top-level list/string/number has no .get(); without this guard a
+        # single odd file would abort the whole pass with AttributeError.
+        if not isinstance(detail, dict):
+            continue
+
+        strava_id = detail.get("strava_id")
+        if not strava_id:
+            continue
+
+        row = metadata.lookup_by_strava_id(str(strava_id))
+        if row is None:
+            continue
+
+        # The sidecar shares the activity file's stem.
+        activity_id = json_path.stem
+        sidecar_path = edits_dir / f"{activity_id}.md"
+        if _update_sidecar_from_row(sidecar_path, row):
+            updated += 1
+
+    return updated
@@ -530,6 +530,12 @@ async def upload_activity(
    store_original: bool = Form(False),
    bincio_session: Optional[str] = Cookie(default=None),
 ) -> JSONResponse:
+    """Accept FIT/GPX/TCX files and/or activities.csv, extract, update index, re-merge.
+
+    activities.csv (Strava export format) can be included in the batch to:
+      - Enrich activity files being uploaded in the same batch (matched by filename)
+      - Retroactively update sidecars for existing activities (matched by strava_id)
+    """
    from bincio.extract.ingest import ingest_parsed
    from bincio.extract.parsers.factory import parse_file
    from bincio.extract.writer import make_activity_id
@@ -540,13 +546,36 @@ async def upload_activity(
    staging = dd / "_uploads"
    staging.mkdir(exist_ok=True)

+    # Separate CSV files from activity files
+    csv_files: list[UploadFile] = []
+    activity_files: list[UploadFile] = []
+    for f in files:
+        fname = Path(f.filename or "").name.lower()
+        if fname.endswith(".csv"):
+            csv_files.append(f)
+        else:
+            activity_files.append(f)
+
+    # Build metadata from the first CSV found (activities.csv from Strava export)
+    metadata = None
+    if csv_files:
+        from bincio.extract.strava_csv import StravaMetadata
+        import tempfile
+        csv_bytes = await csv_files[0].read()
+        with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as tmp:
+            tmp.write(csv_bytes)
+            tmp_path = Path(tmp.name)
+        try:
+            metadata = StravaMetadata(tmp_path)
+        finally:
+            tmp_path.unlink(missing_ok=True)
+
    results = []
    any_added = False

-    for file in files:
+    for file in activity_files:
        name = Path(file.filename or "upload.fit").name
-        p = Path(name.lower())
-        suffix = (p.stem.rsplit(".", 1)[-1].join([".", ".gz"]) if "." in p.stem else ".gz") if p.suffix == ".gz" else p.suffix
+        suffix = _file_suffix(name)
        if suffix not in _SUPPORTED_SUFFIXES:
            results.append({"name": name, "ok": False, "error": f"Unsupported file type '{suffix}'"})
            continue
@@ -561,6 +590,11 @@ async def upload_activity(
        kept = False
        try:
            activity = parse_file(staged)
+
+            # Enrich with CSV metadata when available (matched by filename)
+            if metadata is not None:
+                metadata.enrich(name, activity)
+
            activity_id = make_activity_id(activity)
            if (dd / "activities" / f"{activity_id}.json").exists():
                results.append({"name": name, "ok": False, "error": "duplicate"})
@@ -573,18 +607,25 @@ async def upload_activity(
                kept = True
            results.append({"name": name, "ok": True, "id": activity_id})
            any_added = True
-        except Exception as exc:
+        except Exception:
            results.append({"name": name, "ok": False, "error": "Processing failed"})
        finally:
            if not kept:
                staged.unlink(missing_ok=True)

-    if any_added:
+    # Retroactively update sidecars for existing activities matched by strava_id
+    csv_updates = 0
+    if metadata is not None:
+        from bincio.extract.strava_csv import apply_csv_to_data_dir
+        csv_updates = apply_csv_to_data_dir(dd, metadata)
+
+    if any_added or csv_updates:
        merge_all(dd)
-        _trigger_rebuild(user.handle)
+        if any_added:
+            _trigger_rebuild(user.handle)

    added = [r for r in results if r["ok"]]
-    return JSONResponse({"ok": True, "added": len(added), "results": results})
+    return JSONResponse({"ok": True, "added": len(added), "csv_updates": csv_updates, "results": results})


@app.post("/api/upload/strava-zip")
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+"""
+Bulk-set activities matching a title pattern to private by writing sidecar files.
+
+Usage:
+    uv run python scripts/bulk_private.py --data-dir /var/bincio/data/brut --match "morning walk" "afternoon walk"
+
+    --dry-run   Print what would be changed without writing anything.
+    --handle    Subdirectory name (if data-dir is the root, not the user dir).
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import sys
+from pathlib import Path
+
+import yaml
+
+
+def parse_sidecar(path: Path) -> tuple[dict, str]:
+    """Return ``(front_matter, body)`` read from a sidecar ``.md`` file.
+
+    Front matter is recognised when the file starts with ``---`` and a later
+    line consisting of ``---`` closes it; the enclosed text is parsed with
+    ``yaml.safe_load`` (an empty result becomes ``{}``).  Otherwise the
+    front matter is ``{}`` and the whole stripped file is the body.
+    """
+    raw = path.read_text(encoding="utf-8")
+    if not raw.startswith("---"):
+        return {}, raw.strip()
+
+    pieces = re.split(r"^---[ \t]*$", raw, maxsplit=2, flags=re.MULTILINE)
+    if len(pieces) < 3:
+        return {}, raw.strip()
+
+    front_matter = yaml.safe_load(pieces[1]) or {}
+    return front_matter, pieces[2].strip()
+
+
+def write_sidecar(path: Path, fm: dict, body: str) -> None:
+    """Write ``fm`` as YAML front matter plus an optional body to ``path``.
+
+    Parent directories are created when missing.  The body section (preceded
+    by a blank line) is only written when ``body`` is non-empty.
+    """
+    path.parent.mkdir(parents=True, exist_ok=True)
+    dumped = yaml.dump(fm, allow_unicode=True, default_flow_style=False)
+    sections = [f"---\n{dumped}---\n"]
+    if body:
+        sections.append(f"\n{body}\n")
+    path.write_text("".join(sections), encoding="utf-8")
+
+
+def main() -> None:
+    """Mark every activity whose title matches a pattern as private.
+
+    Reads ``index.json`` from the data directory, selects the activities
+    whose title contains any ``--match`` pattern (case-insensitive
+    substring), and sets ``private: true`` in each one's sidecar under
+    ``edits/``.  Sidecars that are already private are left alone.  With
+    ``--dry-run`` nothing is written.  When at least one sidecar was
+    written, ``merge_all`` is run on the data directory.
+
+    Exits with an error message when ``index.json`` is missing.
+    """
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--data-dir", required=True, help="User data directory (e.g. /var/bincio/data/brut)")
+    ap.add_argument("--handle", default=None, help="Handle subdir if data-dir is the instance root")
+    ap.add_argument("--match", nargs="+", required=True, help="Title patterns to match (case-insensitive substring)")
+    ap.add_argument("--dry-run", action="store_true", help="Print changes without writing")
+    args = ap.parse_args()
+
+    data_dir = Path(args.data_dir)
+    if args.handle:
+        data_dir = data_dir / args.handle
+
+    index_path = data_dir / "index.json"
+    if not index_path.exists():
+        sys.exit(f"ERROR: index.json not found at {index_path}")
+
+    index = json.loads(index_path.read_text(encoding="utf-8"))
+    activities = index.get("activities", [])
+
+    patterns = [p.lower() for p in args.match]
+
+    matched = [
+        a for a in activities
+        if any(pat in (a.get("title") or "").lower() for pat in patterns)
+    ]
+
+    if not matched:
+        print("No activities matched.")
+        return
+
+    print(f"Found {len(matched)} matching activities:")
+    edits_dir = data_dir / "edits"
+    changed = 0
+
+    for act in matched:
+        aid = act["id"]
+        # A key can be present with a null value, so `.get(key, default)`
+        # is not enough: fall back explicitly, as the title filter above does.
+        title = act.get("title") or "(no title)"
+        date = (act.get("started_at") or "")[:10]
+        sidecar_path = edits_dir / f"{aid}.md"
+
+        # Load existing sidecar if present
+        if sidecar_path.exists():
+            fm, body = parse_sidecar(sidecar_path)
+        else:
+            fm, body = {}, ""
+
+        if fm.get("private") is True:
+            print(f"  [already private] {date}  {title}")
+            continue
+
+        print(f"  {'[DRY RUN] ' if args.dry_run else ''}→ private  {date}  {title}")
+        if not args.dry_run:
+            fm["private"] = True
+            write_sidecar(sidecar_path, fm, body)
+            changed += 1
+
+    if args.dry_run:
+        print("\nDry run — nothing written. Re-run without --dry-run to apply.")
+    else:
+        print(f"\n{changed} sidecar(s) written.")
+        if changed:
+            print("Running merge_all …")
+            # Imported lazily: only needed when something was actually written.
+            from bincio.render.merge import merge_all
+            n = merge_all(data_dir)
+            print(f"merge_all done ({n} sidecar(s) applied).")
+
+
+if __name__ == "__main__":
+    main()
@@ -275,8 +275,8 @@ try {
            id="upload-drop"
            class="border-2 border-dashed border-zinc-700 rounded-lg p-8 text-center text-zinc-500 text-sm cursor-pointer hover:border-zinc-500 hover:text-zinc-300 transition-colors"
          >
-            <div id="upload-label">Drop FIT, GPX, or TCX files<br/>or click to browse</div>
-            <input id="upload-input" type="file" accept=".fit,.gpx,.tcx,.fit.gz,.gpx.gz,.tcx.gz" class="hidden" multiple />
+            <div id="upload-label">Drop FIT, GPX, TCX, or activities.csv<br/>or click to browse</div>
+            <input id="upload-input" type="file" accept=".fit,.gpx,.tcx,.fit.gz,.gpx.gz,.tcx.gz,.csv" class="hidden" multiple />
          </div>
          <label class="flex items-start gap-2 mt-3 cursor-pointer group">
            <input
@@ -525,12 +525,16 @@ try {
          const d = await r.json();
          const dupes = d.results.filter(r => r.error === 'duplicate').length;
          const errors = d.results.filter(r => !r.ok && r.error !== 'duplicate').length;
-          let msg = `${d.added} added`;
-          if (dupes) msg += `, ${dupes} duplicate${dupes > 1 ? 's' : ''}`;
-          if (errors) msg += `, ${errors} failed`;
-          fileStatus.textContent = msg;
-          fileStatus.style.color = d.added > 0 ? '#4ade80' : '#a1a1aa';
-          if (d.added > 0) setTimeout(() => { window.location.reload(); }, 1200);
+          const parts = [];
+          if (d.added > 0) parts.push(`${d.added} added`);
+          if (d.csv_updates > 0) parts.push(`${d.csv_updates} updated from CSV`);
+          if (dupes) parts.push(`${dupes} duplicate${dupes > 1 ? 's' : ''}`);
+          if (errors) parts.push(`${errors} failed`);
+          if (parts.length === 0) parts.push('nothing to add');
+          fileStatus.textContent = parts.join(', ');
+          const anyGood = d.added > 0 || d.csv_updates > 0;
+          fileStatus.style.color = anyGood ? '#4ade80' : '#a1a1aa';
+          if (anyGood) setTimeout(() => { window.location.reload(); }, 1200);
          else drop.style.pointerEvents = '';
        } catch (e) {
          fileStatus.textContent = 'Error: ' + e.message;