From 1dca00d5e3b99ecc685219fbbaa867db53fd6af9 Mon Sep 17 00:00:00 2001
From: Davide Scaini <davide.scaini@alumni.cern>
Date: Tue, 2 Jun 2026 15:47:55 +0200
Subject: [PATCH] usage_stats: keep a rolling 26-week feature usage history

---
 scripts/usage_stats.py | 69 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 60 insertions(+), 9 deletions(-)

diff --git a/scripts/usage_stats.py b/scripts/usage_stats.py
index 7e22f93..a3f88e7 100644
--- a/scripts/usage_stats.py
+++ b/scripts/usage_stats.py
@@ -32,6 +32,8 @@ import pandas as pd
 LOG_DIR    = Path("/var/log/nginx")
 OUTPUT_DIR = Path("/var/bincio/stats")
 OUTPUT     = OUTPUT_DIR / "latest.png"
+HISTORY    = OUTPUT_DIR / "weekly_history.csv"
+MAX_WEEKS  = 26  # 6 months
 
 # ── Log parsing ───────────────────────────────────────────────────────────────
 
@@ -116,6 +118,22 @@ def _feature(referer: str) -> str | None:
                 return label
     return None
 
+# ── History management ────────────────────────────────────────────────────────
+
+def load_history(history_file: Path) -> pd.DataFrame:
+    """Read the history CSV indexed by week_start; empty frame if absent or unreadable."""
+    if not history_file.exists():
+        return pd.DataFrame()
+    try:
+        return pd.read_csv(history_file, parse_dates=["week_start"]).set_index("week_start")
+    except Exception as e:
+        print(f"Warning: could not load history: {e}", file=sys.stderr)
+        return pd.DataFrame()
+
+def save_history(history: pd.DataFrame, history_file: Path) -> None:
+    history_file.parent.mkdir(parents=True, exist_ok=True)  # make sure the target directory exists
+    history.to_csv(history_file, index_label="week_start")  # column name load_history parses
+
 # ── Loading ───────────────────────────────────────────────────────────────────
 
 def load_logs(log_dir: Path) -> pd.DataFrame:
@@ -176,19 +194,12 @@ def _style_ax(ax: plt.Axes) -> None:
     for spine in ax.spines.values():
         spine.set_edgecolor(GRID)
 
-def make_figure(df: pd.DataFrame, output: Path) -> None:
+def make_figure(df: pd.DataFrame, feat_weekly: pd.DataFrame, output: Path) -> None:
     # ── daily logins ──────────────────────────────────────────────────────────
     login_mask  = (df["method"] == "POST") & (df["path"] == "/api/auth/login") & (df["status"] == 200)
     weekly_logins = df[login_mask].set_index("ts").resample("W-MON").size()
 
     # ── feature usage (weekly) ────────────────────────────────────────────────
-    feat_df = df[df["feature"].notna()].copy()
-    feat_weekly = (
-        feat_df.set_index("ts")
-        .groupby([pd.Grouper(freq="W-MON"), "feature"])
-        .size()
-        .unstack(fill_value=0)
-    )
     feat_order = [f for f in FEATURE_COLORS if f in feat_weekly.columns]
     feat_weekly = feat_weekly[feat_order]
 
@@ -286,6 +297,7 @@ def main() -> None:
     ap = argparse.ArgumentParser(description="Generate bincio usage stats figure.")
     ap.add_argument("--log-dir", type=Path, default=LOG_DIR,  metavar="DIR")
     ap.add_argument("--output",  type=Path, default=OUTPUT,   metavar="FILE")
+    ap.add_argument("--history", type=Path, default=HISTORY,  metavar="FILE")
     args = ap.parse_args()
 
     print("Loading logs…", file=sys.stderr)
@@ -296,7 +308,46 @@ def main() -> None:
     print("  Feature breakdown:", file=sys.stderr)
     for feat, count in feat_counts.items():
         print(f"    {str(feat):12s}  {count:,}", file=sys.stderr)
-    make_figure(df, args.output)
+
+    # Weekly feature usage for every week covered by the current logs
+    feat_df = df[df["feature"].notna()].copy()
+    current_weekly = (
+        feat_df.set_index("ts")
+        .groupby([pd.Grouper(freq="W-MON"), "feature"])
+        .size()
+        .unstack(fill_value=0)
+    )
+
+    # Load historical data and merge
+    history = load_history(args.history)
+    if not history.empty:
+        # Remove any weeks that overlap with current logs (in case of reruns)
+        if len(current_weekly) > 0:
+            latest_week_in_history = history.index.max()
+            earliest_week_in_current = current_weekly.index.min()
+            if latest_week_in_history >= earliest_week_in_current:
+                history = history[history.index < earliest_week_in_current]
+
+        # Concatenate and drop duplicates
+        all_weekly = pd.concat([history, current_weekly])
+        all_weekly = all_weekly[~all_weekly.index.duplicated(keep='first')]
+    else:
+        all_weekly = current_weekly
+
+    # Keep only last MAX_WEEKS weeks
+    if len(all_weekly) > MAX_WEEKS:
+        all_weekly = all_weekly.iloc[-MAX_WEEKS:]
+
+    # Save updated history (all data we kept)
+    save_history(all_weekly, args.history)
+
+    # concat() leaves NaN where a feature column exists on only one side of the
+    # merge (history vs. current logs); count those gaps as zero usage. Absent
+    # features need no padding here: make_figure keeps only the FEATURE_COLORS
+    # entries that are present in the frame it is given.
+    all_weekly = all_weekly.fillna(0)
+
+    make_figure(df, all_weekly, args.output)
 
 
 if __name__ == "__main__":