fix reextract: async generator + run_in_executor, imports at endpoint level

The sync generator was failing with a network error because Starlette's
iterate_in_threadpool doesn't properly propagate exceptions from sync
generators — the connection resets with no body.

Fix: convert event_stream to an async generator (Starlette handles these
natively without thread wrapping), move imports to the endpoint function
scope so failures raise HTTPException before the stream starts, and run
CPU-intensive work (parse + write) via loop.run_in_executor so the event
loop is not blocked and the async generator can actually yield a progress
event between activities.
This commit is contained in:
Davide Scaini
2026-04-15 09:05:29 +02:00
parent 378cba85ad
commit 10dd1185b9
+46 -44
View File
@@ -630,12 +630,9 @@ async def admin_reextract_originals(
calls strava_to_parsed, and writes the activity JSON + GeoJSON. calls strava_to_parsed, and writes the activity JSON + GeoJSON.
Skips activities that already have a valid JSON file in activities/. Skips activities that already have a valid JSON file in activities/.
Streams SSE progress; call /rebuild after completion if no --webroot. Streams SSE progress. Calls merge_all + rebuild on completion.
SSE events:
{"type": "progress", "n": N, "total": T, "name": "...", "status": "imported"|"skipped"|"error", "detail": "..."}
{"type": "done", "imported": N, "skipped": N, "errors": N}
""" """
import asyncio
_require_admin(bincio_session) _require_admin(bincio_session)
user_dir = _get_data_dir() / handle user_dir = _get_data_dir() / handle
originals_dir = user_dir / "originals" / "strava" originals_dir = user_dir / "originals" / "strava"
@@ -646,34 +643,29 @@ async def admin_reextract_originals(
total = len(original_files) total = len(original_files)
log.info("reextract[%s]: starting, %d originals found", handle, total) log.info("reextract[%s]: starting, %d originals found", handle, total)
def event_stream(): # Imports at endpoint level so failures are visible before the stream starts
from bincio.extract.strava_api import strava_to_parsed from bincio.extract.strava_api import strava_to_parsed
from bincio.extract.metrics import compute as compute_metrics from bincio.extract.metrics import compute as compute_metrics
from bincio.extract.writer import ( from bincio.extract.writer import (
build_summary, make_activity_id, write_activity, build_summary, make_activity_id, write_activity,
write_index, write_athlete_json, write_index, write_athlete_json,
) )
from bincio.render.merge import merge_all from bincio.render.merge import merge_all
# Immediate heartbeat so the client knows the connection is alive async def event_stream():
yield f"data: {json.dumps({'type': 'status', 'message': f'Found {total} originals, starting extraction…'})}\n\n" yield f"data: {json.dumps({'type': 'status', 'message': f'Found {total} originals, starting extraction…'})}\n\n"
imported = 0 imported = 0
skipped = 0 skipped = 0
errors = 0 errors = 0
# Load existing index once upfront (to preserve any non-Strava activities)
index_path = user_dir / "index.json" index_path = user_dir / "index.json"
if index_path.exists(): try:
try: existing_index = json.loads(index_path.read_text(encoding="utf-8")) if index_path.exists() else {}
existing_index = json.loads(index_path.read_text(encoding="utf-8")) except Exception:
except Exception:
existing_index = {}
else:
existing_index = {} existing_index = {}
owner = existing_index.get("owner", {"handle": handle}) owner = existing_index.get("owner", {"handle": handle})
# Seed summaries from whatever already exists in activities/
summaries: dict[str, Any] = {s["id"]: s for s in existing_index.get("activities", [])} summaries: dict[str, Any] = {s["id"]: s for s in existing_index.get("activities", [])}
_COMPUTED = {"bas_version", "generated_at", "power_curve", "records", "best_climbs"} _COMPUTED = {"bas_version", "generated_at", "power_curve", "records", "best_climbs"}
@@ -686,42 +678,52 @@ async def admin_reextract_originals(
except Exception: except Exception:
pass pass
loop = asyncio.get_event_loop()
for n, orig_path in enumerate(original_files, 1): for n, orig_path in enumerate(original_files, 1):
try: try:
raw = json.loads(orig_path.read_text(encoding="utf-8")) # Run CPU-intensive parsing + file writing in a thread
meta = raw.get("meta", {}) def _process_one(op=orig_path):
streams = raw.get("streams", {}) raw = json.loads(op.read_text(encoding="utf-8"))
name = meta.get("name", orig_path.stem) meta = raw.get("meta", {})
streams = raw.get("streams", {})
name = meta.get("name", op.stem)
parsed = strava_to_parsed(meta, streams)
activity_id = make_activity_id(parsed)
if (user_dir / "activities" / f"{activity_id}.json").exists():
return "skipped", name, activity_id, None
metrics = compute_metrics(parsed)
effective_privacy = parsed.privacy if parsed.privacy is not None else "public"
write_activity(parsed, metrics, user_dir, privacy=effective_privacy, rdp_epsilon=0.0001)
summary = build_summary(parsed, metrics, activity_id, effective_privacy)
return "imported", name, activity_id, summary
parsed = strava_to_parsed(meta, streams) status, name, activity_id, summary = await loop.run_in_executor(None, _process_one)
activity_id = make_activity_id(parsed)
if (user_dir / "activities" / f"{activity_id}.json").exists(): if status == "skipped":
skipped += 1 skipped += 1
yield f"data: {json.dumps({'type': 'progress', 'n': n, 'total': total, 'name': name, 'status': 'skipped'})}\n\n" else:
continue summaries[activity_id] = summary
imported += 1
metrics = compute_metrics(parsed) yield f"data: {json.dumps({'type': 'progress', 'n': n, 'total': total, 'name': name, 'status': status})}\n\n"
effective_privacy = parsed.privacy if parsed.privacy is not None else "public"
write_activity(parsed, metrics, user_dir, privacy=effective_privacy, rdp_epsilon=0.0001)
summary = build_summary(parsed, metrics, activity_id, effective_privacy)
summaries[activity_id] = summary
imported += 1
yield f"data: {json.dumps({'type': 'progress', 'n': n, 'total': total, 'name': name, 'status': 'imported'})}\n\n"
except Exception as exc: except Exception as exc:
errors += 1 errors += 1
log.error("reextract[%s]: failed on %s: %s", handle, orig_path.name, exc, exc_info=True) log.error("reextract[%s]: failed on %s: %s", handle, orig_path.name, exc, exc_info=True)
yield f"data: {json.dumps({'type': 'progress', 'n': n, 'total': total, 'name': orig_path.stem, 'status': 'error', 'detail': str(exc)})}\n\n" yield f"data: {json.dumps({'type': 'progress', 'n': n, 'total': total, 'name': orig_path.stem, 'status': 'error', 'detail': str(exc)})}\n\n"
# Write index and athlete.json once at the end (not once per activity)
if imported > 0: if imported > 0:
yield f"data: {json.dumps({'type': 'status', 'message': 'Writing index and athlete data…'})}\n\n" yield f"data: {json.dumps({'type': 'status', 'message': 'Writing index and athlete data…'})}\n\n"
write_index(list(summaries.values()), user_dir, owner) try:
write_athlete_json(list(summaries.values()), user_dir, athlete_config) await loop.run_in_executor(None, lambda: write_index(list(summaries.values()), user_dir, owner))
yield f"data: {json.dumps({'type': 'status', 'message': 'Running merge and rebuild…'})}\n\n" await loop.run_in_executor(None, lambda: write_athlete_json(list(summaries.values()), user_dir, athlete_config))
merge_all(user_dir) yield f"data: {json.dumps({'type': 'status', 'message': 'Running merge and rebuild…'})}\n\n"
_trigger_rebuild(handle) await loop.run_in_executor(None, lambda: merge_all(user_dir))
_trigger_rebuild(handle)
except Exception as exc:
log.exception("reextract[%s]: error in final write/merge", handle)
yield f"data: {json.dumps({'type': 'error', 'message': f'Final write failed: {exc}'})}\n\n"
log.info("reextract[%s]: done — imported=%d skipped=%d errors=%d", handle, imported, skipped, errors) log.info("reextract[%s]: done — imported=%d skipped=%d errors=%d", handle, imported, skipped, errors)
yield f"data: {json.dumps({'type': 'done', 'imported': imported, 'skipped': skipped, 'errors': errors})}\n\n" yield f"data: {json.dumps({'type': 'done', 'imported': imported, 'skipped': skipped, 'errors': errors})}\n\n"