reextract: process in batches of 100 to bound subprocess memory

A single Python process handling all 2,015 activities exhausts all RAM and
swap on a cheap VPS. Split the work into sequential batches of 100: each
subprocess handles at most 100 activities and then exits, returning all of its
memory to the OS before the next batch starts. The server chains the batches
in the SSE event_stream and triggers a single rebuild once all batches have completed.
This commit is contained in:
Davide Scaini
2026-04-15 10:08:55 +02:00
parent a67b237161
commit 25d80c8132
2 changed files with 67 additions and 31 deletions
+16 -5
View File
@@ -33,7 +33,9 @@ _GC_EVERY = 50 # call gc.collect() + malloc_trim every N activities
@click.option("--data-dir", required=True, type=click.Path(), help="BAS data directory")
@click.option("--handle", required=True, help="User handle to re-extract for")
@click.option("--force", is_flag=True, default=False, help="Re-extract even if activity JSON already exists")
def reextract_originals(data_dir: str, handle: str, force: bool) -> None:
@click.option("--offset", default=0, type=int, help="Skip first N originals (for batch processing)")
@click.option("--limit", default=0, type=int, help="Process at most N originals then stop (0 = all)")
def reextract_originals(data_dir: str, handle: str, force: bool, offset: int, limit: int) -> None:
"""Re-extract activities from stored Strava originals (originals/strava/*.json).
Prints one JSON object per line to stdout for streaming progress:
@@ -57,13 +59,22 @@ def reextract_originals(data_dir: str, handle: str, force: bool) -> None:
_emit({"type": "error", "message": f"No Strava originals directory at {originals_dir}"})
sys.exit(1)
original_files = sorted(originals_dir.glob("*.json"))
total = len(original_files)
if total == 0:
all_files = sorted(originals_dir.glob("*.json"))
if not all_files:
_emit({"type": "error", "message": "No Strava originals found"})
sys.exit(1)
_emit({"type": "status", "message": f"Found {total} originals, starting extraction…"})
# Apply offset/limit for batch processing
batch = all_files[offset:] if not limit else all_files[offset: offset + limit]
total_all = len(all_files)
total = len(batch)
original_files = batch
_emit({"type": "status", "message": (
f"Batch {offset + 1}{offset + total} of {total_all}, starting extraction…"
if offset or limit else
f"Found {total_all} originals, starting extraction…"
)})
# Load existing index to get owner info and existing summaries
index_path = user_dir / "index.json"