ingest activities.csv

This commit is contained in:
Davide Scaini
2026-04-11 08:13:27 +02:00
parent cbd5a98cd3
commit 01db4eb9ae
5 changed files with 367 additions and 79 deletions
+90 -55
View File
@@ -538,73 +538,108 @@ def _file_suffix(name: str) -> str:
@app.post("/api/upload")
async def upload_activity(
file: UploadFile = File(...),
files: list[UploadFile] = File(...),
store_original: bool = Form(False),
) -> JSONResponse:
"""Accept a FIT/GPX/TCX file, extract it, update index.json, and re-merge."""
"""Accept FIT/GPX/TCX files and/or activities.csv, extract, update index, re-merge.
activities.csv (Strava export format) can be included in the batch to:
- Enrich activity files being uploaded in the same batch (matched by filename)
- Retroactively update sidecars for existing activities (matched by strava_id)
"""
from bincio.extract.ingest import ingest_parsed
from bincio.extract.parsers.factory import parse_file
from bincio.extract.writer import make_activity_id
from bincio.render.merge import merge_all
dd = _get_data_dir()
name = Path(file.filename or "upload.fit").name # strip any path components
suffix = _file_suffix(name)
if suffix not in _SUPPORTED_SUFFIXES:
raise HTTPException(400, f"Unsupported file type '{Path(name).suffix}'. Expected FIT, GPX, or TCX.")
_MAX_UPLOAD_BYTES = 50 * 1024 * 1024 # 50 MB
contents = await file.read()
if len(contents) > _MAX_UPLOAD_BYTES:
raise HTTPException(413, f"File too large ({len(contents)} bytes). Maximum is 50 MB.")
staging = dd / "_uploads"
staging.mkdir(exist_ok=True)
staged = staging / name
staged.write_bytes(contents)
kept = False
try:
from bincio.extract.metrics import compute
from bincio.extract.parsers.factory import parse_file
from bincio.extract.writer import build_summary, make_activity_id, write_activity, write_index
_MAX_UPLOAD_BYTES = 50 * 1024 * 1024 # 50 MB
activity = parse_file(staged)
metrics = compute(activity)
activity_id = make_activity_id(activity)
existing_json = dd / "activities" / f"{activity_id}.json"
if existing_json.exists():
raise HTTPException(409, f"Activity already exists: {activity_id}")
write_activity(activity, metrics, dd, privacy="public", rdp_epsilon=0.0001)
summary = build_summary(activity, metrics, activity_id, "public")
# Read current index to preserve owner + existing summaries
index_path = dd / "index.json"
if index_path.exists():
index_data = json.loads(index_path.read_text(encoding="utf-8"))
# Separate CSV files from activity files
csv_files: list[UploadFile] = []
activity_files: list[UploadFile] = []
for f in files:
name = Path(f.filename or "").name.lower()
if name.endswith(".csv"):
csv_files.append(f)
else:
index_data = {"owner": {"handle": "unknown"}, "activities": []}
owner = index_data.get("owner", {})
existing = {s["id"]: s for s in index_data.get("activities", [])}
existing[activity_id] = summary
write_index(list(existing.values()), dd, owner)
activity_files.append(f)
if store_original:
originals_dir = dd / "originals"
originals_dir.mkdir(exist_ok=True)
staged.rename(originals_dir / name)
kept = True
# Build metadata from the first CSV found (activities.csv from Strava export)
metadata = None
if csv_files:
from bincio.extract.strava_csv import StravaMetadata
import tempfile
csv_upload = csv_files[0]
csv_bytes = await csv_upload.read()
with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as tmp:
tmp.write(csv_bytes)
tmp_path = Path(tmp.name)
try:
metadata = StravaMetadata(tmp_path)
finally:
tmp_path.unlink(missing_ok=True)
from bincio.render.merge import merge_all
results = []
any_added = False
for file in activity_files:
name = Path(file.filename or "upload.fit").name
suffix = _file_suffix(name)
if suffix not in _SUPPORTED_SUFFIXES:
results.append({"name": name, "ok": False, "error": f"Unsupported file type '{Path(name).suffix}'"})
continue
contents = await file.read()
if len(contents) > _MAX_UPLOAD_BYTES:
results.append({"name": name, "ok": False, "error": "File too large (max 50 MB)"})
continue
staged = staging / name
staged.write_bytes(contents)
kept = False
try:
activity = parse_file(staged)
# Enrich with CSV metadata when available (matched by filename)
if metadata is not None:
metadata.enrich(name, activity)
activity_id = make_activity_id(activity)
if (dd / "activities" / f"{activity_id}.json").exists():
results.append({"name": name, "ok": False, "error": "duplicate"})
continue
ingest_parsed(activity, dd, privacy="public")
if store_original:
originals_dir = dd / "originals"
originals_dir.mkdir(exist_ok=True)
staged.rename(originals_dir / name)
kept = True
results.append({"name": name, "ok": True, "id": activity_id})
any_added = True
except Exception:
results.append({"name": name, "ok": False, "error": "Processing failed"})
finally:
if not kept:
staged.unlink(missing_ok=True)
# Retroactively update sidecars for existing activities matched by strava_id
csv_updates = 0
if metadata is not None:
from bincio.extract.strava_csv import apply_csv_to_data_dir
csv_updates = apply_csv_to_data_dir(dd, metadata)
if any_added or csv_updates:
merge_all(dd)
except HTTPException:
raise
except Exception as exc:
raise HTTPException(422, f"Failed to process activity file: {type(exc).__name__}")
finally:
if not kept:
staged.unlink(missing_ok=True)
return JSONResponse({"ok": True, "id": activity_id})
added = [r for r in results if r["ok"]]
return JSONResponse({"ok": True, "added": len(added), "csv_updates": csv_updates, "results": results})
@app.post("/api/import-bas")
+109 -9
View File
@@ -1,14 +1,14 @@
"""Import metadata from Strava's activities.csv bulk export.
Strava export columns we care about:
Activity ID, Activity Date, Activity Name, Activity Type,
Activity Description, Filename
Activity ID, Activity Date, Activity Name, Activity Description, Filename
"""
import csv
import json
import re
from pathlib import Path
from typing import Optional
from typing import Iterator, Optional
_STRAVA_DATE_FMTS = (
@@ -18,10 +18,11 @@ _STRAVA_DATE_FMTS = (
class StravaMetadata:
"""Maps original filename → Strava metadata."""
"""Maps original filename → Strava metadata, with secondary strava_id index."""
def __init__(self, csv_path: Path) -> None:
    """Create the two lookup tables and fill them from ``csv_path``."""
    # Basename of the row's "Filename" column -> full CSV row.
    self._by_filename: dict[str, dict] = {}
    # Stripped "Activity ID" column -> full CSV row.
    self._by_strava_id: dict[str, dict] = {}
    # Both tables must exist before _load starts populating them.
    self._load(csv_path)
def _load(self, path: Path) -> None:
@@ -29,16 +30,21 @@ class StravaMetadata:
reader = csv.DictReader(f)
for row in reader:
filename = row.get("Filename", "").strip()
if not filename:
continue
# Strava stores paths like "activities/12345.fit.gz"
basename = Path(filename).name
self._by_filename[basename] = row
if filename:
basename = Path(filename).name
self._by_filename[basename] = row
strava_id = row.get("Activity ID", "").strip()
if strava_id:
self._by_strava_id[strava_id] = row
def lookup(self, source_file: str) -> Optional[dict]:
    """Fetch the CSV row indexed under ``source_file``.

    Returns ``None`` when no row was recorded for that filename.
    """
    try:
        return self._by_filename[source_file]
    except KeyError:
        return None
def lookup_by_strava_id(self, strava_id: str) -> Optional[dict]:
    """Fetch the CSV row indexed under a Strava activity ID.

    The ID is converted with ``str()`` before the lookup, so an integer ID
    finds the same row as its string form.  Returns ``None`` when absent.
    """
    rows = self._by_strava_id
    key = str(strava_id)
    return rows[key] if key in rows else None
def enrich(self, source_file: str, activity: object) -> None:
"""Mutate a ParsedActivity with Strava metadata if found."""
row = self.lookup(source_file)
@@ -53,3 +59,97 @@ class StravaMetadata:
if not activity.strava_id and row.get("Activity ID"): # type: ignore[attr-defined]
activity.strava_id = row["Activity ID"].strip() # type: ignore[attr-defined]
# ── Retroactive sidecar update ────────────────────────────────────────────────
def _parse_sidecar(path: Path) -> tuple[dict, str]:
    """Return (frontmatter_dict, body) from a sidecar .md file.

    A sidecar with front matter looks like::

        ---
        <YAML mapping>
        ---
        <free-text body>

    The body is returned with surrounding whitespace stripped.  When the file
    does not start with ``---``, has no closing ``---`` line, or its front
    matter does not parse to a YAML mapping, the whole stripped text is
    returned as the body together with an empty dict.  The first element is
    therefore always a dict, and no file content is dropped.

    Errors from reading the file or from the YAML parser are not caught here.
    """
    text = path.read_text(encoding="utf-8")
    if text.startswith("---"):
        # maxsplit=2 yields [text before the opening fence, front matter, body].
        parts = re.split(r"^---[ \t]*$", text, maxsplit=2, flags=re.MULTILINE)
        if len(parts) >= 3:
            # Imported lazily: files without front matter never need the parser.
            import yaml

            # Empty front matter loads as None; treat it as an empty mapping.
            fm = yaml.safe_load(parts[1]) or {}
            if isinstance(fm, dict):
                return fm, parts[2].strip()
            # A list or scalar between the fences is not usable front matter.
            # Fall through and keep the complete text as body instead of
            # handing callers something they cannot index by key.
    return {}, text.strip()
def _write_sidecar(path: Path, fm: dict, body: str) -> None:
    """Write ``fm`` and ``body`` to ``path`` as a front-matter sidecar.

    The file opens with a ``---`` fenced YAML block dumped from ``fm``.  A
    non-empty ``body`` follows after one blank line; an empty body leaves the
    file ending right after the closing fence.  Missing parent directories
    are created first.
    """
    import yaml

    path.parent.mkdir(parents=True, exist_ok=True)

    dumped = yaml.safe_dump(fm, default_flow_style=False, allow_unicode=True)
    front_matter = dumped.strip()
    trailer = f"\n{body}\n" if body else ""
    path.write_text(f"---\n{front_matter}\n---\n" + trailer, encoding="utf-8")
def _update_sidecar_from_row(sidecar_path: Path, row: dict) -> bool:
    """Create or update a sidecar with CSV title/description.

    Reads the "Activity Name" and "Activity Description" columns of ``row``.
    The name is stored as the ``title`` front-matter key, but only when the
    sidecar has no ``title`` key yet.  The description becomes the sidecar
    body, but only when the existing body is empty.  Values that are already
    set are never replaced.

    Returns True if the sidecar was written, False if there was nothing to
    add (both columns blank, or everything already present).
    """
    # csv.DictReader fills columns missing from a short row with None, so a
    # present-but-None value must be handled like an empty string.
    title = (row.get("Activity Name") or "").strip()
    description = (row.get("Activity Description") or "").strip()
    if not title and not description:
        return False

    fm, body = _parse_sidecar(sidecar_path) if sidecar_path.exists() else ({}, "")

    changed = False
    # Key presence, not truthiness: an explicitly blank title stays blank.
    if title and "title" not in fm:
        fm["title"] = title
        changed = True
    if description and not body:
        body = description
        changed = True

    if not changed:
        return False
    _write_sidecar(sidecar_path, fm, body)
    return True
def apply_csv_to_data_dir(data_dir: Path, metadata: StravaMetadata) -> int:
    """Retroactively apply CSV metadata to existing activities via sidecars.

    Scans all activity JSONs in data_dir/activities/. For each activity that
    has a strava_id, looks up the corresponding CSV row and creates/updates
    the sidecar in data_dir/edits/ with any missing title or description.

    Only writes fields not already present in the sidecar — manual edits are
    never overwritten.  Activity files that cannot be read, cannot be parsed,
    or do not hold a JSON object are skipped.  Errors raised while reading or
    writing a sidecar are not caught here.

    Returns the count of activities whose sidecars were created or updated;
    0 when data_dir/activities/ does not exist.
    """
    activities_dir = data_dir / "activities"
    edits_dir = data_dir / "edits"
    if not activities_dir.exists():
        return 0

    updated = 0
    # Sorted so activities are visited in a stable, name-based order.
    for json_path in sorted(activities_dir.glob("*.json")):
        try:
            detail = json.loads(json_path.read_text(encoding="utf-8"))
        except Exception:
            # Deliberately best-effort: one bad file must not stop the scan.
            continue
        if not isinstance(detail, dict):
            # Valid JSON that is not an object (list, number, ...) has no
            # strava_id to look up; skip it like any other unusable file.
            continue

        strava_id = detail.get("strava_id")
        if not strava_id:
            continue

        # The JSON may store the ID as a number; the CSV index uses strings.
        row = metadata.lookup_by_strava_id(str(strava_id))
        if row is None:
            continue

        # The sidecar shares the activity JSON's file stem.
        sidecar_path = edits_dir / f"{json_path.stem}.md"
        if _update_sidecar_from_row(sidecar_path, row):
            updated += 1

    return updated
+48 -7
View File
@@ -530,6 +530,12 @@ async def upload_activity(
store_original: bool = Form(False),
bincio_session: Optional[str] = Cookie(default=None),
) -> JSONResponse:
"""Accept FIT/GPX/TCX files and/or activities.csv, extract, update index, re-merge.
activities.csv (Strava export format) can be included in the batch to:
- Enrich activity files being uploaded in the same batch (matched by filename)
- Retroactively update sidecars for existing activities (matched by strava_id)
"""
from bincio.extract.ingest import ingest_parsed
from bincio.extract.parsers.factory import parse_file
from bincio.extract.writer import make_activity_id
@@ -540,13 +546,36 @@ async def upload_activity(
staging = dd / "_uploads"
staging.mkdir(exist_ok=True)
# Separate CSV files from activity files
csv_files: list[UploadFile] = []
activity_files: list[UploadFile] = []
for f in files:
fname = Path(f.filename or "").name.lower()
if fname.endswith(".csv"):
csv_files.append(f)
else:
activity_files.append(f)
# Build metadata from the first CSV found (activities.csv from Strava export)
metadata = None
if csv_files:
from bincio.extract.strava_csv import StravaMetadata
import tempfile
csv_bytes = await csv_files[0].read()
with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as tmp:
tmp.write(csv_bytes)
tmp_path = Path(tmp.name)
try:
metadata = StravaMetadata(tmp_path)
finally:
tmp_path.unlink(missing_ok=True)
results = []
any_added = False
for file in files:
for file in activity_files:
name = Path(file.filename or "upload.fit").name
p = Path(name.lower())
suffix = (p.stem.rsplit(".", 1)[-1].join([".", ".gz"]) if "." in p.stem else ".gz") if p.suffix == ".gz" else p.suffix
suffix = _file_suffix(name)
if suffix not in _SUPPORTED_SUFFIXES:
results.append({"name": name, "ok": False, "error": f"Unsupported file type '{suffix}'"})
continue
@@ -561,6 +590,11 @@ async def upload_activity(
kept = False
try:
activity = parse_file(staged)
# Enrich with CSV metadata when available (matched by filename)
if metadata is not None:
metadata.enrich(name, activity)
activity_id = make_activity_id(activity)
if (dd / "activities" / f"{activity_id}.json").exists():
results.append({"name": name, "ok": False, "error": "duplicate"})
@@ -573,18 +607,25 @@ async def upload_activity(
kept = True
results.append({"name": name, "ok": True, "id": activity_id})
any_added = True
except Exception as exc:
except Exception:
results.append({"name": name, "ok": False, "error": "Processing failed"})
finally:
if not kept:
staged.unlink(missing_ok=True)
if any_added:
# Retroactively update sidecars for existing activities matched by strava_id
csv_updates = 0
if metadata is not None:
from bincio.extract.strava_csv import apply_csv_to_data_dir
csv_updates = apply_csv_to_data_dir(dd, metadata)
if any_added or csv_updates:
merge_all(dd)
_trigger_rebuild(user.handle)
if any_added:
_trigger_rebuild(user.handle)
added = [r for r in results if r["ok"]]
return JSONResponse({"ok": True, "added": len(added), "results": results})
return JSONResponse({"ok": True, "added": len(added), "csv_updates": csv_updates, "results": results})
@app.post("/api/upload/strava-zip")
+108
View File
@@ -0,0 +1,108 @@
#!/usr/bin/env python3
"""
Bulk-set activities matching a title pattern to private by writing sidecar files.
Usage:
uv run python scripts/bulk_private.py --data-dir /var/bincio/data/brut --match "morning walk" "afternoon walk"
--dry-run Print what would be changed without writing anything.
--handle Subdirectory name (if data-dir is the root, not the user dir).
"""
from __future__ import annotations
import argparse
import json
import re
import sys
from pathlib import Path
import yaml
def parse_sidecar(path: Path) -> tuple[dict, str]:
    """Split a sidecar file into ``(front_matter, body)``.

    Front matter is the YAML between an opening ``---`` at the very start of
    the file and the next ``---`` line.  When either fence is missing, the
    result is an empty dict plus the whole stripped text.
    """
    raw = path.read_text(encoding="utf-8")
    if not raw.startswith("---"):
        return {}, raw.strip()

    # maxsplit=2 caps the result at three pieces: the text before the opening
    # fence, the front matter, and the body.
    pieces = re.split(r"^---[ \t]*$", raw, maxsplit=2, flags=re.MULTILINE)
    if len(pieces) < 3:
        return {}, raw.strip()

    _, header, remainder = pieces
    front_matter = yaml.safe_load(header) or {}
    return front_matter, remainder.strip()
def write_sidecar(path: Path, fm: dict, body: str) -> None:
    """Write ``fm`` as fenced YAML front matter, followed by ``body``.

    Missing parent directories are created.  A non-empty body is separated
    from the closing fence by one blank line and ends with a newline.
    """
    path.parent.mkdir(parents=True, exist_ok=True)

    front_matter = yaml.dump(fm, allow_unicode=True, default_flow_style=False)
    chunks = ["---\n", front_matter, "---\n"]
    if body:
        chunks += ["\n", body, "\n"]
    path.write_text("".join(chunks), encoding="utf-8")
def main() -> None:
    """Mark every activity whose title matches a pattern as private.

    Reads ``index.json`` from the data directory, selects activities whose
    title contains any ``--match`` pattern (case-insensitive substring), and
    sets ``private: true`` in each one's sidecar under ``edits/``.  With
    ``--dry-run`` the planned changes are printed and nothing is written.
    After real writes, ``merge_all`` is run on the data directory.

    Exits with an error message when ``index.json`` is missing.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--data-dir", required=True, help="User data directory (e.g. /var/bincio/data/brut)")
    ap.add_argument("--handle", default=None, help="Handle subdir if data-dir is the instance root")
    ap.add_argument("--match", nargs="+", required=True, help="Title patterns to match (case-insensitive substring)")
    ap.add_argument("--dry-run", action="store_true", help="Print changes without writing")
    args = ap.parse_args()

    root = Path(args.data_dir)
    data_dir = root / args.handle if args.handle else root

    index_path = data_dir / "index.json"
    if not index_path.exists():
        sys.exit(f"ERROR: index.json not found at {index_path}")

    index = json.loads(index_path.read_text(encoding="utf-8"))
    needles = [pattern.lower() for pattern in args.match]

    def title_matches(entry) -> bool:
        # A missing or null title is compared as the empty string.
        lowered = (entry.get("title") or "").lower()
        return any(needle in lowered for needle in needles)

    matched = list(filter(title_matches, index.get("activities", [])))
    if not matched:
        print("No activities matched.")
        return

    print(f"Found {len(matched)} matching activities:")

    edits_dir = data_dir / "edits"
    changed = 0
    for act in matched:
        aid = act["id"]
        title = act.get("title", "(no title)")
        date = act.get("started_at", "")[:10]
        sidecar_path = edits_dir / f"{aid}.md"

        # Start from the existing sidecar so other edits are preserved.
        fm, body = parse_sidecar(sidecar_path) if sidecar_path.exists() else ({}, "")

        if fm.get("private") is True:
            print(f" [already private] {date} {title}")
            continue

        print(f" {'[DRY RUN] ' if args.dry_run else ''}→ private {date} {title}")
        if args.dry_run:
            continue
        fm["private"] = True
        write_sidecar(sidecar_path, fm, body)
        changed += 1

    if args.dry_run:
        print("\nDry run — nothing written. Re-run without --dry-run to apply.")
        return

    print(f"\n{changed} sidecar(s) written.")
    if not changed:
        return

    print("Running merge_all …")
    from bincio.render.merge import merge_all
    n = merge_all(data_dir)
    print(f"merge_all done ({n} sidecar(s) applied).")


if __name__ == "__main__":
    main()
+12 -8
View File
@@ -275,8 +275,8 @@ try {
id="upload-drop"
class="border-2 border-dashed border-zinc-700 rounded-lg p-8 text-center text-zinc-500 text-sm cursor-pointer hover:border-zinc-500 hover:text-zinc-300 transition-colors"
>
<div id="upload-label">Drop FIT, GPX, or TCX files<br/>or click to browse</div>
<input id="upload-input" type="file" accept=".fit,.gpx,.tcx,.fit.gz,.gpx.gz,.tcx.gz" class="hidden" multiple />
<div id="upload-label">Drop FIT, GPX, TCX, or activities.csv<br/>or click to browse</div>
<input id="upload-input" type="file" accept=".fit,.gpx,.tcx,.fit.gz,.gpx.gz,.tcx.gz,.csv" class="hidden" multiple />
</div>
<label class="flex items-start gap-2 mt-3 cursor-pointer group">
<input
@@ -525,12 +525,16 @@ try {
const d = await r.json();
const dupes = d.results.filter(r => r.error === 'duplicate').length;
const errors = d.results.filter(r => !r.ok && r.error !== 'duplicate').length;
let msg = `${d.added} added`;
if (dupes) msg += `, ${dupes} duplicate${dupes > 1 ? 's' : ''}`;
if (errors) msg += `, ${errors} failed`;
fileStatus.textContent = msg;
fileStatus.style.color = d.added > 0 ? '#4ade80' : '#a1a1aa';
if (d.added > 0) setTimeout(() => { window.location.reload(); }, 1200);
const parts = [];
if (d.added > 0) parts.push(`${d.added} added`);
if (d.csv_updates > 0) parts.push(`${d.csv_updates} updated from CSV`);
if (dupes) parts.push(`${dupes} duplicate${dupes > 1 ? 's' : ''}`);
if (errors) parts.push(`${errors} failed`);
if (parts.length === 0) parts.push('nothing to add');
fileStatus.textContent = parts.join(', ');
const anyGood = d.added > 0 || d.csv_updates > 0;
fileStatus.style.color = anyGood ? '#4ade80' : '#a1a1aa';
if (anyGood) setTimeout(() => { window.location.reload(); }, 1200);
else drop.style.pointerEvents = '';
} catch (e) {
fileStatus.textContent = 'Error: ' + e.message;