reextract-originals: run as subprocess to avoid OOM

The in-process approach loaded all 2015 Strava originals into the server
process memory, causing OOM kills. Now spawns `bincio reextract-originals`
as a child process; heavy work runs in an isolated Python interpreter that
exits when done, freeing all memory.

Also adds `bincio reextract-originals` as a standalone CLI command that
prints JSON-lines progress to stdout — useful for running directly on the
VPS via SSH for large backlogs.
This commit is contained in:
Davide Scaini
2026-04-15 09:42:31 +02:00
parent 6890892654
commit 1a563012e2
3 changed files with 146 additions and 106 deletions
+36 -106
View File
@@ -626,122 +626,52 @@ async def admin_reextract_originals(
) -> StreamingResponse:
"""Re-extract activities from stored Strava originals without hitting the API.
Reads each file in originals/strava/{id}.json (containing {"meta", "streams"}),
calls strava_to_parsed, and writes the activity JSON + GeoJSON.
Skips activities that already have a valid JSON file in activities/.
Streams SSE progress. Calls merge_all + rebuild on completion.
Spawns `bincio reextract-originals` as a subprocess so heavy memory use
is isolated from the server process. Streams its JSON-lines output as SSE.
Triggers a full rebuild on completion.
"""
import asyncio
import shutil
_require_admin(bincio_session)
user_dir = _get_data_dir() / handle
originals_dir = user_dir / "originals" / "strava"
if not originals_dir.exists():
raise HTTPException(404, f"No Strava originals directory for '{handle}'")
original_files = sorted(originals_dir.glob("*.json"))
total = len(original_files)
log.info("reextract[%s]: starting, %d originals found", handle, total)
loop = asyncio.get_event_loop()
# Queue carries SSE event strings; None is the sentinel for "done"
q: asyncio.Queue[str | None] = asyncio.Queue()
def _run_extraction() -> None:
    """Re-extract every stored Strava original and report progress through ``q``.

    Runs entirely in a thread-pool worker. Each update is pushed onto ``q``
    as a ready-to-send SSE string via ``loop.call_soon_threadsafe``, since
    the queue is consumed on the event-loop side. ``None`` is always enqueued
    last -- even when extraction aborts on an unexpected error -- so the
    consumer draining the queue can never wait forever.

    Originals whose activity JSON already exists under ``activities/`` are
    counted as skipped. When at least one activity was imported, the index is
    rewritten, ``merge_all`` runs and a rebuild is triggered.
    """

    def _emit(payload: dict) -> None:
        """Serialize *payload* as one SSE event and hand it to the loop thread."""
        loop.call_soon_threadsafe(q.put_nowait, f"data: {json.dumps(payload)}\n\n")

    try:
        # Imported inside the worker so that an import failure is reported
        # to the client as an SSE error event.
        try:
            from bincio.extract.strava_api import strava_to_parsed
            from bincio.extract.metrics import compute as compute_metrics
            from bincio.extract.writer import (
                build_summary, make_activity_id, write_activity,
                write_index,
            )
            from bincio.render.merge import merge_all
        except Exception as exc:
            log.exception("reextract[%s]: import error", handle)
            _emit({'type': 'error', 'message': f'Import error: {exc}'})
            return

        # Best effort: an unreadable or malformed index is treated as empty,
        # but the problem is logged instead of being swallowed silently.
        index_path = user_dir / "index.json"
        try:
            existing_index = json.loads(index_path.read_text(encoding="utf-8")) if index_path.exists() else {}
        except Exception:
            log.warning("reextract[%s]: could not read %s, starting from an empty index",
                        handle, index_path, exc_info=True)
            existing_index = {}
        if not isinstance(existing_index, dict):
            existing_index = {}

        owner = existing_index.get("owner", {"handle": handle})
        # Existing summaries keyed by activity id; newly imported ones are added below.
        summaries: dict[str, Any] = {s["id"]: s for s in existing_index.get("activities", [])}

        imported = skipped = errors = 0
        for n, orig_path in enumerate(original_files, 1):
            try:
                raw = json.loads(orig_path.read_text(encoding="utf-8"))
                meta = raw.get("meta", {})
                streams = raw.get("streams", {})
                name = meta.get("name", orig_path.stem)

                parsed = strava_to_parsed(meta, streams)
                activity_id = make_activity_id(parsed)

                if (user_dir / "activities" / f"{activity_id}.json").exists():
                    skipped += 1
                    status = "skipped"
                else:
                    metrics = compute_metrics(parsed)
                    ep = parsed.privacy if parsed.privacy is not None else "public"
                    write_activity(parsed, metrics, user_dir, privacy=ep, rdp_epsilon=0.0001)
                    summaries[activity_id] = build_summary(parsed, metrics, activity_id, ep)
                    imported += 1
                    status = "imported"

                _emit({'type': 'progress', 'n': n, 'total': total, 'name': name, 'status': status})
            except Exception as exc:
                # One bad original must not abort the whole batch.
                errors += 1
                log.error("reextract[%s]: failed on %s: %s", handle, orig_path.name, exc, exc_info=True)
                _emit({'type': 'progress', 'n': n, 'total': total, 'name': orig_path.stem,
                       'status': 'error', 'detail': str(exc)})

        # The index, merge and rebuild are only needed when something new was written.
        if imported > 0:
            _emit({'type': 'status', 'message': 'Writing index…'})
            try:
                write_index(list(summaries.values()), user_dir, owner)
                _emit({'type': 'status', 'message': 'Running merge and rebuild…'})
                merge_all(user_dir)
                _trigger_rebuild(handle)
            except Exception as exc:
                log.exception("reextract[%s]: error in final write/merge", handle)
                _emit({'type': 'error', 'message': f'Final write failed: {exc}'})

        log.info("reextract[%s]: done — imported=%d skipped=%d errors=%d",
                 handle, imported, skipped, errors)
        _emit({'type': 'done', 'imported': imported, 'skipped': skipped, 'errors': errors})
    except Exception as exc:
        # Anything not anticipated above (e.g. a malformed index entry) is
        # reported to the client rather than silently killing the worker.
        log.exception("reextract[%s]: unexpected failure", handle)
        _emit({'type': 'error', 'message': f'Extraction failed: {exc}'})
    finally:
        # Sentinel: tells the consumer that no further events will arrive.
        loop.call_soon_threadsafe(q.put_nowait, None)
# Locate the `uv` launcher on PATH, falling back to the bare name "uv"; the CLI is started below via `uv run bincio …`
uv_exe = shutil.which("uv") or "uv"
data_dir = str(_get_data_dir())
log.info("reextract[%s]: spawning subprocess via %s", handle, uv_exe)
async def event_stream():
# Async generator that produces the SSE response body: every yielded string
# is one complete "data: …\n\n" event.
# NOTE(review): this span comes from a rendered diff whose +/- markers and
# indentation were lost. The queue-draining prologue and the subprocess
# section below look like the removed and the added variant of the same
# body — confirm which statements are live before changing any logic.
#
# Initial status event announcing how many originals were found.
yield f"data: {json.dumps({'type': 'status', 'message': f'Found {total} originals, starting extraction…'})}\n\n"
# Kick off the extraction in the loop's default executor (a worker thread).
loop.run_in_executor(None, _run_extraction)
# Relay queued SSE strings until the None sentinel signals completion.
while True:
chunk = await q.get()
if chunk is None:
break
yield chunk
# Launch `uv run bincio reextract-originals --data-dir … --handle …` as a
# child process with both stdout and stderr captured through pipes.
proc = await asyncio.create_subprocess_exec(
uv_exe, "run", "bincio", "reextract-originals",
"--data-dir", data_dir,
"--handle", handle,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
# stdout=PIPE was requested above, so the stream is expected to be set.
assert proc.stdout is not None
# Read the child's stdout line by line; undecodable bytes are replaced
# rather than raising, and blank lines are dropped.
async for raw_line in proc.stdout:
line = raw_line.decode(errors="replace").strip()
if not line:
continue
# Forward the line verbatim as an SSE event (the child is expected to
# print one JSON document per line — TODO confirm against the CLI).
yield f"data: {line}\n\n"
# stdout reached EOF; wait for the child to exit, then collect stderr.
# NOTE(review): stderr is not drained while stdout is being read. A child
# that writes more to stderr than the OS pipe buffer holds could block
# before exiting and stall this generator — consider reading stderr
# concurrently.
await proc.wait()
stderr_out = b""
if proc.stderr:
stderr_out = await proc.stderr.read()
# Non-zero exit: log the first 500 characters of stderr and tell the client.
# Zero exit: trigger the rebuild.
if proc.returncode != 0:
log.error("reextract[%s]: subprocess exited %d — stderr: %s",
handle, proc.returncode, stderr_out.decode(errors="replace")[:500])
yield f"data: {json.dumps({'type': 'error', 'message': f'Process exited with code {proc.returncode}'})}\n\n"
else:
log.info("reextract[%s]: subprocess done, triggering rebuild", handle)
_trigger_rebuild(handle)
return StreamingResponse(
event_stream(),