reextract: reclaim RSS with malloc_trim + gc.collect every 50 activities
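Memory freed inside the process is often retained by the allocator (CPython's object arenas and the glibc malloc heap) rather than returned to the OS, so RSS keeps growing throughout the 2015-activity loop even though each iteration's objects are freed. Call gc.collect() followed by malloc_trim(0) every 50 activities to return freed pages to the kernel and keep RSS flat. malloc_trim is glibc-specific; on platforms where libc.so.6 cannot be loaded the trim step is a no-op.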
This commit is contained in:
+30
-2
@@ -2,10 +2,11 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import ctypes
|
||||
import gc
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import click
|
||||
|
||||
@@ -15,6 +16,19 @@ def _emit(obj: dict) -> None:
|
||||
print(json.dumps(obj), flush=True)
|
||||
|
||||
|
||||
# On Linux, malloc_trim(0) returns freed arenas to the OS, keeping RSS low.
|
||||
# CPython's allocator otherwise holds onto freed memory indefinitely.
|
||||
try:
|
||||
_libc = ctypes.CDLL("libc.so.6")
|
||||
def _trim_heap() -> None:
|
||||
_libc.malloc_trim(0)
|
||||
except Exception:
|
||||
def _trim_heap() -> None: # type: ignore[misc]
|
||||
pass
|
||||
|
||||
# Interval, counted in processed activities, between periodic memory-reclaim
# passes (gc.collect() followed by _trim_heap()) in the re-extract loop.
_GC_EVERY = 50 # call gc.collect() + malloc_trim every N activities
|
||||
|
||||
|
||||
@click.command("reextract-originals")
|
||||
@click.option("--data-dir", required=True, type=click.Path(), help="BAS data directory")
|
||||
@click.option("--handle", required=True, help="User handle to re-extract for")
|
||||
@@ -83,13 +97,27 @@ def reextract_originals(data_dir: str, handle: str, force: bool) -> None:
|
||||
imported += 1
|
||||
_emit({"type": "progress", "n": n, "total": total, "name": name, "status": "imported"})
|
||||
|
||||
# Explicitly free large objects to keep memory low
|
||||
# Explicitly free large objects; also free the raw JSON dict and streams
|
||||
raw = meta = streams = None # type: ignore[assignment]
|
||||
try:
|
||||
del parsed, metrics
|
||||
except NameError:
|
||||
pass
|
||||
|
||||
except Exception as exc:
|
||||
errors += 1
|
||||
_emit({"type": "progress", "n": n, "total": total, "name": orig_path.stem,
|
||||
"status": "error", "detail": str(exc)})
|
||||
|
||||
# Periodically reclaim freed memory from CPython's allocator arena
|
||||
if n % _GC_EVERY == 0:
|
||||
gc.collect()
|
||||
_trim_heap()
|
||||
|
||||
# Final cleanup before the index write (which loads all summaries at once)
|
||||
gc.collect()
|
||||
_trim_heap()
|
||||
|
||||
if imported > 0:
|
||||
_emit({"type": "status", "message": "Writing index…"})
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user