diff --git a/bincio/reextract_cmd.py b/bincio/reextract_cmd.py
index c25c328..ff73287 100644
--- a/bincio/reextract_cmd.py
+++ b/bincio/reextract_cmd.py
@@ -2,10 +2,11 @@
 
 from __future__ import annotations
 
+import ctypes
+import gc
 import json
 import sys
 from pathlib import Path
-from typing import Optional
 
 import click
 
@@ -15,6 +16,26 @@ def _emit(obj: dict) -> None:
     print(json.dumps(obj), flush=True)
 
 
+# On Linux (glibc), malloc_trim(0) asks malloc to return free heap memory to
+# the OS, keeping RSS low; freed memory can otherwise stay resident in the
+# process indefinitely.
+try:
+    _libc = ctypes.CDLL("libc.so.6")
+    # ctypes resolves symbols lazily, so look malloc_trim up now: a libc that
+    # lacks it then selects the no-op fallback instead of failing on first call.
+    _malloc_trim = _libc.malloc_trim
+
+    def _trim_heap() -> None:
+        """Release free heap memory back to the OS (best effort)."""
+        _malloc_trim(0)
+except Exception:  # best effort: any load/lookup failure disables trimming
+    def _trim_heap() -> None:  # type: ignore[misc]
+        """No-op fallback used when malloc_trim is unavailable."""
+
+
+_GC_EVERY = 50  # call gc.collect() + malloc_trim every N activities
+
+
 @click.command("reextract-originals")
 @click.option("--data-dir", required=True, type=click.Path(), help="BAS data directory")
 @click.option("--handle", required=True, help="User handle to re-extract for")
@@ -83,13 +104,27 @@ def reextract_originals(data_dir: str, handle: str, force: bool) -> None:
             imported += 1
             _emit({"type": "progress", "n": n, "total": total, "name": name, "status": "imported"})
 
-            # Explicitly free large objects to keep memory low
-            del parsed, metrics
+            # Explicitly free large objects; also free the raw JSON dict and streams
+            raw = meta = streams = None  # type: ignore[assignment]
+            try:
+                del parsed, metrics
+            except NameError:
+                pass
+
         except Exception as exc:
             errors += 1
             _emit({"type": "progress", "n": n, "total": total, "name": orig_path.stem,
                    "status": "error", "detail": str(exc)})
 
+        # Periodically collect garbage and hand freed heap memory back to the OS
+        if n % _GC_EVERY == 0:
+            gc.collect()
+            _trim_heap()
+
+    # Final cleanup before the index write (which loads all summaries at once)
+    gc.collect()
+    _trim_heap()
+
     if imported > 0:
         _emit({"type": "status", "message": "Writing index…"})
         try: