reextract: reclaim RSS with malloc_trim + gc.collect every 50 activities

Freed memory is often retained by the process instead of being returned to
the OS: CPython's allocator keeps partially used arenas, and glibc's malloc
keeps freed heap pages. As a result, RSS grows throughout the 2015-activity
loop even when each iteration's objects are freed. Call gc.collect() followed
by malloc_trim(0) every 50 activities, and once more before the index write,
to return freed pages to the kernel and keep RSS flat.
This commit is contained in:
Davide Scaini
2026-04-15 09:58:16 +02:00
parent 062ade28d3
commit a67b237161
+30 -2
View File
@@ -2,10 +2,11 @@
from __future__ import annotations from __future__ import annotations
import ctypes
import gc
import json import json
import sys import sys
from pathlib import Path from pathlib import Path
from typing import Optional
import click import click
@@ -15,6 +16,19 @@ def _emit(obj: dict) -> None:
print(json.dumps(obj), flush=True) print(json.dumps(obj), flush=True)
# On Linux, malloc_trim(0) returns freed arenas to the OS, keeping RSS low.
# CPython's allocator otherwise holds onto freed memory indefinitely.
try:
_libc = ctypes.CDLL("libc.so.6")
def _trim_heap() -> None:
_libc.malloc_trim(0)
except Exception:
def _trim_heap() -> None: # type: ignore[misc]
pass
_GC_EVERY = 50 # call gc.collect() + malloc_trim every N activities
@click.command("reextract-originals") @click.command("reextract-originals")
@click.option("--data-dir", required=True, type=click.Path(), help="BAS data directory") @click.option("--data-dir", required=True, type=click.Path(), help="BAS data directory")
@click.option("--handle", required=True, help="User handle to re-extract for") @click.option("--handle", required=True, help="User handle to re-extract for")
@@ -83,13 +97,27 @@ def reextract_originals(data_dir: str, handle: str, force: bool) -> None:
imported += 1 imported += 1
_emit({"type": "progress", "n": n, "total": total, "name": name, "status": "imported"}) _emit({"type": "progress", "n": n, "total": total, "name": name, "status": "imported"})
# Explicitly free large objects to keep memory low # Explicitly free large objects; also free the raw JSON dict and streams
raw = meta = streams = None # type: ignore[assignment]
try:
del parsed, metrics del parsed, metrics
except NameError:
pass
except Exception as exc: except Exception as exc:
errors += 1 errors += 1
_emit({"type": "progress", "n": n, "total": total, "name": orig_path.stem, _emit({"type": "progress", "n": n, "total": total, "name": orig_path.stem,
"status": "error", "detail": str(exc)}) "status": "error", "detail": str(exc)})
# Periodically reclaim freed memory from CPython's allocator arena
if n % _GC_EVERY == 0:
gc.collect()
_trim_heap()
# Final cleanup before the index write (which loads all summaries at once)
gc.collect()
_trim_heap()
if imported > 0: if imported > 0:
_emit({"type": "status", "message": "Writing index…"}) _emit({"type": "status", "message": "Writing index…"})
try: try: