reextract: reclaim RSS with malloc_trim + gc.collect every 50 activities

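Memory freed by Python objects is often retained by the allocator (CPython's
arenas and the C library heap underneath) instead of being returned to the
OS, so RSS grows throughout the 2015-activity loop even when each iteration's
objects are freed. Call gc.collect() followed by malloc_trim(0) every 50
activities to return free pages to the kernel and keep RSS flat. malloc_trim
is loaded from libc.so.6; where that is unavailable the trim step is a no-op.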
This commit is contained in:
Davide Scaini
2026-04-15 09:58:16 +02:00
parent 062ade28d3
commit a67b237161
+30 -2
View File
@@ -2,10 +2,11 @@
from __future__ import annotations
import ctypes
import gc
import json
import sys
from pathlib import Path
from typing import Optional
import click
@@ -15,6 +16,19 @@ def _emit(obj: dict) -> None:
print(json.dumps(obj), flush=True)
# malloc_trim(0) is a glibc extension that asks the C allocator to hand free
# heap pages back to the kernel, which helps keep RSS low in long-running loops.
# The symbol is resolved once, at import time, so that a missing library
# (OSError) or a missing symbol (AttributeError) selects the no-op fallback
# here instead of raising later from inside the processing loop.
try:
    _libc = ctypes.CDLL("libc.so.6")
    _malloc_trim = _libc.malloc_trim
    # Declare the C prototype: int malloc_trim(size_t pad).
    _malloc_trim.argtypes = [ctypes.c_size_t]
    _malloc_trim.restype = ctypes.c_int

    def _trim_heap() -> None:
        """Ask the C allocator to release free heap memory to the OS.

        Best effort: the return value of malloc_trim (whether any memory was
        actually released) is intentionally ignored.
        """
        _malloc_trim(0)

except (OSError, AttributeError):

    def _trim_heap() -> None:  # type: ignore[misc]
        """No-op fallback used when libc.so.6 or malloc_trim is unavailable."""
# Reclamation cadence: gc.collect() and the heap trim run once per this many
# processed activities.
_GC_EVERY: int = 50
@click.command("reextract-originals")
@click.option("--data-dir", required=True, type=click.Path(), help="BAS data directory")
@click.option("--handle", required=True, help="User handle to re-extract for")
@@ -83,13 +97,27 @@ def reextract_originals(data_dir: str, handle: str, force: bool) -> None:
imported += 1
_emit({"type": "progress", "n": n, "total": total, "name": name, "status": "imported"})
# Explicitly free large objects to keep memory low
# Explicitly free large objects; also free the raw JSON dict and streams
raw = meta = streams = None # type: ignore[assignment]
try:
del parsed, metrics
except NameError:
pass
except Exception as exc:
errors += 1
_emit({"type": "progress", "n": n, "total": total, "name": orig_path.stem,
"status": "error", "detail": str(exc)})
# Periodically reclaim freed memory from CPython's allocator arena
if n % _GC_EVERY == 0:
gc.collect()
_trim_heap()
# Final cleanup before the index write (which loads all summaries at once)
gc.collect()
_trim_heap()
if imported > 0:
_emit({"type": "status", "message": "Writing index…"})
try: