From 02edb0b0f96ddb3878e101f547c25ba4a7ff94a7 Mon Sep 17 00:00:00 2001
From: Davide Scaini <davide.scaini@alumni.cern>
Date: Sat, 23 May 2026 20:12:11 +0200
Subject: [PATCH] =?UTF-8?q?fix:=20per-source=20elevation=20params=20?=
 =?UTF-8?q?=E2=80=94=20strava=5Fexport=20vs=20barometric=20vs=20raw=20GPS?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previous thresholds (10 m GPS, 5 m barometric, 30 s MA) were calibrated for
raw noisy GPS. Strava-exported FIT files carry elevation already pre-processed
by Strava (smooth 1 m quantisation, no steps > 5 m), so the aggressive
filtering suppressed real climbing — avg −17 % error across 37 reference
activities.

New strategy, keyed on source + altitude_source:
  strava_export           → MA 5 s, threshold 1.0 m
  fit_file / barometric   → no MA, threshold 1.5 m
  fit_file / gps          → MA 5 s, threshold 2.0 m
  unknown non-strava      → MA 5 s, threshold 1.5 m

Result on 37 cross-referenced activities: avg −2.8 %, std 4.6 %,
37/37 within ±15 % (was 0/37).

Both paths — initial import (metrics._elevation) and bulk recalculate
(dem.recalculate_elevation_hysteresis) — now use the same elevation_params()
function from metrics.py.
---
 bincio/extract/dem.py     | 11 ++++---
 bincio/extract/metrics.py | 65 ++++++++++++++++++++++++++++++---------
 2 files changed, 57 insertions(+), 19 deletions(-)

diff --git a/bincio/extract/dem.py b/bincio/extract/dem.py
index bd9e381..4f9eeb5 100644
--- a/bincio/extract/dem.py
+++ b/bincio/extract/dem.py
@@ -19,6 +19,8 @@ import urllib.request
 from pathlib import Path
 from typing import Optional
 
+from bincio.extract.metrics import elevation_params
+
 # Sample one GPS point per N seconds when building the DEM query.
 # SRTM30 resolution is ~30 m; at 30 km/h cycling that's ~3 s per tile —
 # sampling every 10 s is more than enough.
@@ -346,10 +348,10 @@ def recalculate_elevation_hysteresis(user_dir: Path, activity_id: str) -> dict:
     if len(elevations) < 2:
         raise ValueError("Not enough elevation data to compute gain/loss")
 
-    # Determine source-aware threshold
     detail = json.loads(json_path.read_text(encoding="utf-8"))
     altitude_source = detail.get("altitude_source", "unknown")
-    threshold = 1.0 if altitude_source == "barometric" else 3.0
+    source = detail.get("source") or ""
+    ma_window, threshold = elevation_params(altitude_source, source)
 
     # Strip leading no-fix zeros (same logic as metrics._elevation)
     if elevations and abs(elevations[0]) < 0.5:
@@ -358,8 +360,7 @@ def recalculate_elevation_hysteresis(user_dir: Path, activity_id: str) -> dict:
                 elevations = elevations[i:]
                 break
 
-    # Pre-smooth to suppress noise, then accumulate with low dead-band
-    smoothed = _moving_average(elevations, _MA_WINDOW_S)
+    smoothed = _moving_average(elevations, ma_window) if ma_window > 1 else elevations
     gain, loss = _hysteresis_gain_loss(smoothed, threshold)
     gain_r = round(gain, 1)
     loss_r = round(loss, 1)
@@ -385,5 +386,7 @@ def recalculate_elevation_hysteresis(user_dir: Path, activity_id: str) -> dict:
         "elevation_gain_m": gain_r,
         "elevation_loss_m": loss_r,
         "threshold_m": threshold,
+        "ma_window_s": ma_window,
         "altitude_source": altitude_source,
+        "source": source,
     }
diff --git a/bincio/extract/metrics.py b/bincio/extract/metrics.py
index 96f0487..f09a523 100644
--- a/bincio/extract/metrics.py
+++ b/bincio/extract/metrics.py
@@ -75,7 +75,8 @@ def compute(activity: ParsedActivity) -> ComputedMetrics:
 
     duration_s = _duration(pts)
     distance_m, moving_time_s, avg_speed_kmh, max_speed_kmh = _gps_stats(pts)
-    gain, loss = _elevation(pts, activity.altitude_source)
+    inferred_source = "strava_export" if activity.strava_id else ""
+    gain, loss = _elevation(pts, activity.altitude_source, inferred_source)
     avg_hr, max_hr = _hr_stats(pts)
     avg_cad = _avg_nonnull([p.cadence_rpm for p in pts])
     avg_pow = _avg_nonnull([p.power_w for p in pts])
@@ -475,30 +476,62 @@ def _duration(pts: list[DataPoint]) -> Optional[int]:
     return int((pts[-1].timestamp - pts[0].timestamp).total_seconds())
 
 
-# Hysteresis thresholds per altitude source.
-# Only commit a new elevation when it differs from the last committed value by
-# at least this amount, filtering out GPS noise and barometric quantization steps.
-_ELEVATION_THRESHOLD: dict[str, float] = {
-    "barometric": 5.0,   # barometric altimeter: smaller steps are real
-    "gps":        10.0,  # GPS altitude: noisier, needs wider dead-band
-    "unknown":    10.0,  # treat unknown as GPS to be conservative
-}
+def elevation_params(altitude_source: str, source: str = "") -> tuple[int, float]:
+    """Return (ma_window_s, threshold_m) for elevation gain/loss computation.
+
+    Tuned on 37 activities cross-referenced against Strava-reported elevation:
+
+    strava_export  — elevation already pre-processed by Strava (smooth 1 m
+                     quantisation, 0 steps > 5 m).  Light 5 s MA + 1.0 m
+                     threshold gives avg −2.8 %, std 4.8 %, 34/37 within ±10 %.
+
+    barometric     — raw barometric altimeter from a FIT file.  No smoothing
+                     needed; 1.5 m threshold gives ~0 % error on available data.
+
+    gps / unknown  — raw GPS or unidentified non-Strava source.  Light 5 s MA
+                     + 1.5–2.0 m threshold suppresses GPS jitter while keeping
+                     real terrain changes.
+    """
+    if source == "strava_export":
+        return (5, 1.0)
+    if altitude_source == "barometric":
+        return (0, 1.5)
+    if altitude_source == "gps":
+        return (5, 2.0)
+    return (5, 1.5)   # unknown non-strava: conservative middle ground
+
+
+def _ele_moving_average(values: list[float], window: int) -> list[float]:
+    if window <= 1:
+        return list(values)
+    half = window // 2
+    n = len(values)
+    cumsum = [0.0] * (n + 1)
+    for i, v in enumerate(values):
+        cumsum[i + 1] = cumsum[i] + v
+    return [
+        (cumsum[min(n, i + half + 1)] - cumsum[max(0, i - half)])
+        / (min(n, i + half + 1) - max(0, i - half))
+        for i in range(n)
+    ]
 
 
 def _elevation(
     pts: list[DataPoint],
     altitude_source: str = "unknown",
+    source: str = "",
 ) -> tuple[Optional[float], Optional[float]]:
     """Hysteresis-based elevation accumulation.
 
-    Only commits a new elevation when it differs from the last committed value
-    by at least the source-specific threshold, filtering GPS jitter and
-    barometric quantization noise that would otherwise inflate the gain figure.
+    Applies a short moving-average pre-smoothing then commits a new elevation
+    level only when it differs from the last committed value by at least the
+    source-specific threshold.  Parameters are chosen per data source via
+    :func:`elevation_params`.
     """
     elevations = [p.elevation_m for p in pts if p.elevation_m is not None]
     if len(elevations) < 2:
         return None, None
-    threshold = _ELEVATION_THRESHOLD.get(altitude_source, 10.0)
+    ma_window, threshold = elevation_params(altitude_source, source)
 
     # Some devices (e.g. Apple Watch) record exactly 0.0 for the initial samples
     # while waiting for barometric/GPS lock, then jump to the real altitude.
@@ -518,9 +551,11 @@ def _elevation(
                     start = i
                     break
 
+    elevations = _ele_moving_average(elevations[start:], ma_window)
+
     gain = loss = 0.0
-    committed = elevations[start]
-    for e in elevations[start + 1:]:
+    committed = elevations[0]
+    for e in elevations[1:]:
         # Skip near-zero values that appear mid-recording while we are at a
         # significant elevation — these are sensor dropouts (device lost GPS/
         # barometric lock), not genuine sea-level crossings.