ingest activities.csv
This commit is contained in:
+90
-55
@@ -538,73 +538,108 @@ def _file_suffix(name: str) -> str:
|
||||
|
||||
@app.post("/api/upload")
|
||||
async def upload_activity(
|
||||
file: UploadFile = File(...),
|
||||
files: list[UploadFile] = File(...),
|
||||
store_original: bool = Form(False),
|
||||
) -> JSONResponse:
|
||||
"""Accept a FIT/GPX/TCX file, extract it, update index.json, and re-merge."""
|
||||
"""Accept FIT/GPX/TCX files and/or activities.csv, extract, update index, re-merge.
|
||||
|
||||
activities.csv (Strava export format) can be included in the batch to:
|
||||
- Enrich activity files being uploaded in the same batch (matched by filename)
|
||||
- Retroactively update sidecars for existing activities (matched by strava_id)
|
||||
"""
|
||||
from bincio.extract.ingest import ingest_parsed
|
||||
from bincio.extract.parsers.factory import parse_file
|
||||
from bincio.extract.writer import make_activity_id
|
||||
from bincio.render.merge import merge_all
|
||||
|
||||
dd = _get_data_dir()
|
||||
|
||||
name = Path(file.filename or "upload.fit").name # strip any path components
|
||||
suffix = _file_suffix(name)
|
||||
if suffix not in _SUPPORTED_SUFFIXES:
|
||||
raise HTTPException(400, f"Unsupported file type '{Path(name).suffix}'. Expected FIT, GPX, or TCX.")
|
||||
|
||||
_MAX_UPLOAD_BYTES = 50 * 1024 * 1024 # 50 MB
|
||||
contents = await file.read()
|
||||
if len(contents) > _MAX_UPLOAD_BYTES:
|
||||
raise HTTPException(413, f"File too large ({len(contents)} bytes). Maximum is 50 MB.")
|
||||
|
||||
staging = dd / "_uploads"
|
||||
staging.mkdir(exist_ok=True)
|
||||
staged = staging / name
|
||||
staged.write_bytes(contents)
|
||||
|
||||
kept = False
|
||||
try:
|
||||
from bincio.extract.metrics import compute
|
||||
from bincio.extract.parsers.factory import parse_file
|
||||
from bincio.extract.writer import build_summary, make_activity_id, write_activity, write_index
|
||||
_MAX_UPLOAD_BYTES = 50 * 1024 * 1024 # 50 MB
|
||||
|
||||
activity = parse_file(staged)
|
||||
metrics = compute(activity)
|
||||
activity_id = make_activity_id(activity)
|
||||
|
||||
existing_json = dd / "activities" / f"{activity_id}.json"
|
||||
if existing_json.exists():
|
||||
raise HTTPException(409, f"Activity already exists: {activity_id}")
|
||||
|
||||
write_activity(activity, metrics, dd, privacy="public", rdp_epsilon=0.0001)
|
||||
summary = build_summary(activity, metrics, activity_id, "public")
|
||||
|
||||
# Read current index to preserve owner + existing summaries
|
||||
index_path = dd / "index.json"
|
||||
if index_path.exists():
|
||||
index_data = json.loads(index_path.read_text(encoding="utf-8"))
|
||||
# Separate CSV files from activity files
|
||||
csv_files: list[UploadFile] = []
|
||||
activity_files: list[UploadFile] = []
|
||||
for f in files:
|
||||
name = Path(f.filename or "").name.lower()
|
||||
if name.endswith(".csv"):
|
||||
csv_files.append(f)
|
||||
else:
|
||||
index_data = {"owner": {"handle": "unknown"}, "activities": []}
|
||||
owner = index_data.get("owner", {})
|
||||
existing = {s["id"]: s for s in index_data.get("activities", [])}
|
||||
existing[activity_id] = summary
|
||||
write_index(list(existing.values()), dd, owner)
|
||||
activity_files.append(f)
|
||||
|
||||
if store_original:
|
||||
originals_dir = dd / "originals"
|
||||
originals_dir.mkdir(exist_ok=True)
|
||||
staged.rename(originals_dir / name)
|
||||
kept = True
|
||||
# Build metadata from the first CSV found (activities.csv from Strava export)
|
||||
metadata = None
|
||||
if csv_files:
|
||||
from bincio.extract.strava_csv import StravaMetadata
|
||||
import tempfile
|
||||
csv_upload = csv_files[0]
|
||||
csv_bytes = await csv_upload.read()
|
||||
with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as tmp:
|
||||
tmp.write(csv_bytes)
|
||||
tmp_path = Path(tmp.name)
|
||||
try:
|
||||
metadata = StravaMetadata(tmp_path)
|
||||
finally:
|
||||
tmp_path.unlink(missing_ok=True)
|
||||
|
||||
from bincio.render.merge import merge_all
|
||||
results = []
|
||||
any_added = False
|
||||
|
||||
for file in activity_files:
|
||||
name = Path(file.filename or "upload.fit").name
|
||||
suffix = _file_suffix(name)
|
||||
if suffix not in _SUPPORTED_SUFFIXES:
|
||||
results.append({"name": name, "ok": False, "error": f"Unsupported file type '{Path(name).suffix}'"})
|
||||
continue
|
||||
|
||||
contents = await file.read()
|
||||
if len(contents) > _MAX_UPLOAD_BYTES:
|
||||
results.append({"name": name, "ok": False, "error": "File too large (max 50 MB)"})
|
||||
continue
|
||||
|
||||
staged = staging / name
|
||||
staged.write_bytes(contents)
|
||||
kept = False
|
||||
try:
|
||||
activity = parse_file(staged)
|
||||
|
||||
# Enrich with CSV metadata when available (matched by filename)
|
||||
if metadata is not None:
|
||||
metadata.enrich(name, activity)
|
||||
|
||||
activity_id = make_activity_id(activity)
|
||||
if (dd / "activities" / f"{activity_id}.json").exists():
|
||||
results.append({"name": name, "ok": False, "error": "duplicate"})
|
||||
continue
|
||||
|
||||
ingest_parsed(activity, dd, privacy="public")
|
||||
|
||||
if store_original:
|
||||
originals_dir = dd / "originals"
|
||||
originals_dir.mkdir(exist_ok=True)
|
||||
staged.rename(originals_dir / name)
|
||||
kept = True
|
||||
|
||||
results.append({"name": name, "ok": True, "id": activity_id})
|
||||
any_added = True
|
||||
except Exception:
|
||||
results.append({"name": name, "ok": False, "error": "Processing failed"})
|
||||
finally:
|
||||
if not kept:
|
||||
staged.unlink(missing_ok=True)
|
||||
|
||||
# Retroactively update sidecars for existing activities matched by strava_id
|
||||
csv_updates = 0
|
||||
if metadata is not None:
|
||||
from bincio.extract.strava_csv import apply_csv_to_data_dir
|
||||
csv_updates = apply_csv_to_data_dir(dd, metadata)
|
||||
|
||||
if any_added or csv_updates:
|
||||
merge_all(dd)
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as exc:
|
||||
raise HTTPException(422, f"Failed to process activity file: {type(exc).__name__}")
|
||||
finally:
|
||||
if not kept:
|
||||
staged.unlink(missing_ok=True)
|
||||
|
||||
return JSONResponse({"ok": True, "id": activity_id})
|
||||
added = [r for r in results if r["ok"]]
|
||||
return JSONResponse({"ok": True, "added": len(added), "csv_updates": csv_updates, "results": results})
|
||||
|
||||
|
||||
@app.post("/api/import-bas")
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
"""Import metadata from Strava's activities.csv bulk export.
|
||||
|
||||
Strava export columns we care about:
|
||||
Activity ID, Activity Date, Activity Name, Activity Type,
|
||||
Activity Description, Filename
|
||||
Activity ID, Activity Date, Activity Name, Activity Description, Filename
|
||||
"""
|
||||
|
||||
import csv
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from typing import Iterator, Optional
|
||||
|
||||
|
||||
_STRAVA_DATE_FMTS = (
|
||||
@@ -18,10 +18,11 @@ _STRAVA_DATE_FMTS = (
|
||||
|
||||
|
||||
class StravaMetadata:
|
||||
"""Maps original filename → Strava metadata."""
|
||||
"""Maps original filename → Strava metadata, with secondary strava_id index."""
|
||||
|
||||
def __init__(self, csv_path: Path) -> None:
|
||||
self._by_filename: dict[str, dict] = {}
|
||||
self._by_strava_id: dict[str, dict] = {}
|
||||
self._load(csv_path)
|
||||
|
||||
def _load(self, path: Path) -> None:
|
||||
@@ -29,16 +30,21 @@ class StravaMetadata:
|
||||
reader = csv.DictReader(f)
|
||||
for row in reader:
|
||||
filename = row.get("Filename", "").strip()
|
||||
if not filename:
|
||||
continue
|
||||
# Strava stores paths like "activities/12345.fit.gz"
|
||||
basename = Path(filename).name
|
||||
self._by_filename[basename] = row
|
||||
if filename:
|
||||
basename = Path(filename).name
|
||||
self._by_filename[basename] = row
|
||||
strava_id = row.get("Activity ID", "").strip()
|
||||
if strava_id:
|
||||
self._by_strava_id[strava_id] = row
|
||||
|
||||
def lookup(self, source_file: str) -> Optional[dict]:
    """Return the Strava CSV row for a given source filename, or None.

    The index is keyed by basename only, so any directory components in
    *source_file* (e.g. ``activities/12345.fit.gz``) are dropped before the
    lookup; a bare filename behaves exactly as before.
    """
    return self._by_filename.get(Path(source_file).name)
|
||||
|
||||
def lookup_by_strava_id(self, strava_id: str) -> Optional[dict]:
    """Return the Strava CSV row for a given Strava activity ID, or None.

    The argument is coerced to ``str`` (IDs read back from JSON may be ints)
    and stripped, because the index keys are stored stripped.
    """
    return self._by_strava_id.get(str(strava_id).strip())
|
||||
|
||||
def enrich(self, source_file: str, activity: object) -> None:
|
||||
"""Mutate a ParsedActivity with Strava metadata if found."""
|
||||
row = self.lookup(source_file)
|
||||
@@ -53,3 +59,97 @@ class StravaMetadata:
|
||||
|
||||
if not activity.strava_id and row.get("Activity ID"): # type: ignore[attr-defined]
|
||||
activity.strava_id = row["Activity ID"].strip() # type: ignore[attr-defined]
|
||||
|
||||
|
||||
# ── Retroactive sidecar update ────────────────────────────────────────────────
|
||||
|
||||
def _parse_sidecar(path: Path) -> tuple[dict, str]:
|
||||
"""Return (frontmatter_dict, body) from a sidecar .md file."""
|
||||
import re as _re
|
||||
import yaml
|
||||
text = path.read_text(encoding="utf-8")
|
||||
if text.startswith("---"):
|
||||
parts = _re.split(r"^---[ \t]*$", text, maxsplit=2, flags=_re.MULTILINE)
|
||||
if len(parts) >= 3:
|
||||
fm = yaml.safe_load(parts[1]) or {}
|
||||
return fm, parts[2].strip()
|
||||
return {}, text.strip()
|
||||
|
||||
|
||||
def _write_sidecar(path: Path, fm: dict, body: str) -> None:
    """Write a sidecar file: YAML front matter between ``---`` fences, then body.

    Parent directories are created as needed. The body section (preceded by a
    blank line) is emitted only when *body* is non-empty.
    """
    import yaml

    path.parent.mkdir(parents=True, exist_ok=True)
    fm_text = yaml.safe_dump(fm, default_flow_style=False, allow_unicode=True).strip()
    body_section = f"\n{body}\n" if body else ""
    path.write_text(f"---\n{fm_text}\n---\n" + body_section, encoding="utf-8")
|
||||
|
||||
|
||||
def _update_sidecar_from_row(sidecar_path: Path, row: dict) -> bool:
|
||||
"""Create or update a sidecar with CSV title/description.
|
||||
|
||||
Only fills fields that are not already set in the sidecar.
|
||||
Returns True if anything changed.
|
||||
"""
|
||||
title = row.get("Activity Name", "").strip()
|
||||
description = row.get("Activity Description", "").strip()
|
||||
if not title and not description:
|
||||
return False
|
||||
|
||||
fm, body = _parse_sidecar(sidecar_path) if sidecar_path.exists() else ({}, "")
|
||||
|
||||
changed = False
|
||||
if title and "title" not in fm:
|
||||
fm["title"] = title
|
||||
changed = True
|
||||
if description and not body:
|
||||
body = description
|
||||
changed = True
|
||||
|
||||
if not changed:
|
||||
return False
|
||||
|
||||
_write_sidecar(sidecar_path, fm, body)
|
||||
return True
|
||||
|
||||
|
||||
def apply_csv_to_data_dir(data_dir: Path, metadata: "StravaMetadata") -> int:
    """Retroactively apply CSV metadata to existing activities via sidecars.

    Scans all activity JSONs in data_dir/activities/. For each activity that
    has a strava_id, looks up the corresponding CSV row and creates/updates
    the sidecar in data_dir/edits/ with any missing title or description.

    Only writes fields not already present in the sidecar — manual edits are
    never overwritten.

    Files that cannot be read, are not valid JSON, or whose top-level value
    is not a JSON object are skipped rather than aborting the whole pass.

    Args:
        data_dir: Root data directory containing ``activities/``.
        metadata: Object exposing ``lookup_by_strava_id(str)``, returning a
            CSV row dict or None.

    Returns:
        The count of activities whose sidecars were created or updated.
    """
    activities_dir = data_dir / "activities"
    if not activities_dir.exists():
        return 0

    edits_dir = data_dir / "edits"
    updated = 0

    # Sorted so the pass visits files in a deterministic order.
    for json_path in sorted(activities_dir.glob("*.json")):
        try:
            detail = json.loads(json_path.read_text(encoding="utf-8"))
        except (OSError, ValueError, RecursionError):
            # OSError: unreadable file. ValueError: bad encoding or malformed
            # JSON. RecursionError: pathologically nested JSON.
            continue

        # A valid JSON file may still hold a list or scalar, which has no .get().
        if not isinstance(detail, dict):
            continue

        strava_id = detail.get("strava_id")
        if not strava_id:
            continue

        row = metadata.lookup_by_strava_id(str(strava_id))
        if row is None:
            continue

        # The sidecar shares the activity's id (the JSON file stem).
        sidecar_path = edits_dir / f"{json_path.stem}.md"
        if _update_sidecar_from_row(sidecar_path, row):
            updated += 1

    return updated
|
||||
|
||||
+48
-7
@@ -530,6 +530,12 @@ async def upload_activity(
|
||||
store_original: bool = Form(False),
|
||||
bincio_session: Optional[str] = Cookie(default=None),
|
||||
) -> JSONResponse:
|
||||
"""Accept FIT/GPX/TCX files and/or activities.csv, extract, update index, re-merge.
|
||||
|
||||
activities.csv (Strava export format) can be included in the batch to:
|
||||
- Enrich activity files being uploaded in the same batch (matched by filename)
|
||||
- Retroactively update sidecars for existing activities (matched by strava_id)
|
||||
"""
|
||||
from bincio.extract.ingest import ingest_parsed
|
||||
from bincio.extract.parsers.factory import parse_file
|
||||
from bincio.extract.writer import make_activity_id
|
||||
@@ -540,13 +546,36 @@ async def upload_activity(
|
||||
staging = dd / "_uploads"
|
||||
staging.mkdir(exist_ok=True)
|
||||
|
||||
# Separate CSV files from activity files
|
||||
csv_files: list[UploadFile] = []
|
||||
activity_files: list[UploadFile] = []
|
||||
for f in files:
|
||||
fname = Path(f.filename or "").name.lower()
|
||||
if fname.endswith(".csv"):
|
||||
csv_files.append(f)
|
||||
else:
|
||||
activity_files.append(f)
|
||||
|
||||
# Build metadata from the first CSV found (activities.csv from Strava export)
|
||||
metadata = None
|
||||
if csv_files:
|
||||
from bincio.extract.strava_csv import StravaMetadata
|
||||
import tempfile
|
||||
csv_bytes = await csv_files[0].read()
|
||||
with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as tmp:
|
||||
tmp.write(csv_bytes)
|
||||
tmp_path = Path(tmp.name)
|
||||
try:
|
||||
metadata = StravaMetadata(tmp_path)
|
||||
finally:
|
||||
tmp_path.unlink(missing_ok=True)
|
||||
|
||||
results = []
|
||||
any_added = False
|
||||
|
||||
for file in files:
|
||||
for file in activity_files:
|
||||
name = Path(file.filename or "upload.fit").name
|
||||
p = Path(name.lower())
|
||||
suffix = (p.stem.rsplit(".", 1)[-1].join([".", ".gz"]) if "." in p.stem else ".gz") if p.suffix == ".gz" else p.suffix
|
||||
suffix = _file_suffix(name)
|
||||
if suffix not in _SUPPORTED_SUFFIXES:
|
||||
results.append({"name": name, "ok": False, "error": f"Unsupported file type '{suffix}'"})
|
||||
continue
|
||||
@@ -561,6 +590,11 @@ async def upload_activity(
|
||||
kept = False
|
||||
try:
|
||||
activity = parse_file(staged)
|
||||
|
||||
# Enrich with CSV metadata when available (matched by filename)
|
||||
if metadata is not None:
|
||||
metadata.enrich(name, activity)
|
||||
|
||||
activity_id = make_activity_id(activity)
|
||||
if (dd / "activities" / f"{activity_id}.json").exists():
|
||||
results.append({"name": name, "ok": False, "error": "duplicate"})
|
||||
@@ -573,18 +607,25 @@ async def upload_activity(
|
||||
kept = True
|
||||
results.append({"name": name, "ok": True, "id": activity_id})
|
||||
any_added = True
|
||||
except Exception as exc:
|
||||
except Exception:
|
||||
results.append({"name": name, "ok": False, "error": "Processing failed"})
|
||||
finally:
|
||||
if not kept:
|
||||
staged.unlink(missing_ok=True)
|
||||
|
||||
if any_added:
|
||||
# Retroactively update sidecars for existing activities matched by strava_id
|
||||
csv_updates = 0
|
||||
if metadata is not None:
|
||||
from bincio.extract.strava_csv import apply_csv_to_data_dir
|
||||
csv_updates = apply_csv_to_data_dir(dd, metadata)
|
||||
|
||||
if any_added or csv_updates:
|
||||
merge_all(dd)
|
||||
_trigger_rebuild(user.handle)
|
||||
if any_added:
|
||||
_trigger_rebuild(user.handle)
|
||||
|
||||
added = [r for r in results if r["ok"]]
|
||||
return JSONResponse({"ok": True, "added": len(added), "results": results})
|
||||
return JSONResponse({"ok": True, "added": len(added), "csv_updates": csv_updates, "results": results})
|
||||
|
||||
|
||||
@app.post("/api/upload/strava-zip")
|
||||
|
||||
Reference in New Issue
Block a user