reextract: reclaim RSS with malloc_trim + gc.collect every 50 activities
CPython's allocator holds freed memory in arenas and does not return it to the OS, so RSS keeps growing throughout the 2015-activity loop even though each iteration's objects are freed. Call gc.collect() followed by malloc_trim(0) every 50 activities to return freed pages to the kernel and keep RSS flat. malloc_trim is only available via libc on Linux; where it cannot be loaded, the trim step is a no-op.
This commit is contained in:
+30
-2
@@ -2,10 +2,11 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import ctypes
|
||||||
|
import gc
|
||||||
import json
|
import json
|
||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
import click
|
import click
|
||||||
|
|
||||||
@@ -15,6 +16,19 @@ def _emit(obj: dict) -> None:
|
|||||||
print(json.dumps(obj), flush=True)
|
print(json.dumps(obj), flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
# On Linux, malloc_trim(0) returns freed arenas to the OS, keeping RSS low.
|
||||||
|
# CPython's allocator otherwise holds onto freed memory indefinitely.
|
||||||
|
try:
|
||||||
|
_libc = ctypes.CDLL("libc.so.6")
|
||||||
|
def _trim_heap() -> None:
|
||||||
|
_libc.malloc_trim(0)
|
||||||
|
except Exception:
|
||||||
|
def _trim_heap() -> None: # type: ignore[misc]
|
||||||
|
pass
|
||||||
|
|
||||||
|
_GC_EVERY = 50 # call gc.collect() + malloc_trim every N activities
|
||||||
|
|
||||||
|
|
||||||
@click.command("reextract-originals")
|
@click.command("reextract-originals")
|
||||||
@click.option("--data-dir", required=True, type=click.Path(), help="BAS data directory")
|
@click.option("--data-dir", required=True, type=click.Path(), help="BAS data directory")
|
||||||
@click.option("--handle", required=True, help="User handle to re-extract for")
|
@click.option("--handle", required=True, help="User handle to re-extract for")
|
||||||
@@ -83,13 +97,27 @@ def reextract_originals(data_dir: str, handle: str, force: bool) -> None:
|
|||||||
imported += 1
|
imported += 1
|
||||||
_emit({"type": "progress", "n": n, "total": total, "name": name, "status": "imported"})
|
_emit({"type": "progress", "n": n, "total": total, "name": name, "status": "imported"})
|
||||||
|
|
||||||
# Explicitly free large objects to keep memory low
|
# Explicitly free large objects; also free the raw JSON dict and streams
|
||||||
|
raw = meta = streams = None # type: ignore[assignment]
|
||||||
|
try:
|
||||||
del parsed, metrics
|
del parsed, metrics
|
||||||
|
except NameError:
|
||||||
|
pass
|
||||||
|
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
errors += 1
|
errors += 1
|
||||||
_emit({"type": "progress", "n": n, "total": total, "name": orig_path.stem,
|
_emit({"type": "progress", "n": n, "total": total, "name": orig_path.stem,
|
||||||
"status": "error", "detail": str(exc)})
|
"status": "error", "detail": str(exc)})
|
||||||
|
|
||||||
|
# Periodically reclaim freed memory from CPython's allocator arena
|
||||||
|
if n % _GC_EVERY == 0:
|
||||||
|
gc.collect()
|
||||||
|
_trim_heap()
|
||||||
|
|
||||||
|
# Final cleanup before the index write (which loads all summaries at once)
|
||||||
|
gc.collect()
|
||||||
|
_trim_heap()
|
||||||
|
|
||||||
if imported > 0:
|
if imported > 0:
|
||||||
_emit({"type": "status", "message": "Writing index…"})
|
_emit({"type": "status", "message": "Writing index…"})
|
||||||
try:
|
try:
|
||||||
|
|||||||
Reference in New Issue
Block a user