perf: spatial 10 m downsampling for timeseries

Extract _haversine_m from the inline block in _gps_speed_kmh, add
_spatial_downsample (keep one sample per 10 m traveled, GPS haversine
primary / speed×Δt fallback, indoor activities unchanged), and wire it
into build_timeseries() after the 1 s dedup loop.

Add --downsample-timeseries migration flag to bincio render that applies
the same downsampling to existing stored timeseries files without
re-extracting from original FIT/GPX files.
This commit is contained in:
Davide Scaini
2026-05-19 20:11:00 +02:00
parent 835968e8fe
commit 84eff1f3b0
2 changed files with 163 additions and 4 deletions
+65 -4
View File
@@ -8,6 +8,68 @@ from typing import Optional
from bincio.extract.models import DataPoint
def _haversine_m(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
"""Great-circle distance in metres between two GPS points."""
dlat = radians(lat2 - lat1)
dlon = radians(lon2 - lon1)
a = sin(dlat / 2) ** 2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon / 2) ** 2
return 2 * 6_371_000.0 * atan2(sqrt(a), sqrt(1 - a))
# Default spacing, in metres, between the samples kept by _spatial_downsample.
_SPATIAL_RESOLUTION_M = 10.0
def _spatial_downsample(
    sampled: list[DataPoint],
    resolution_m: float = _SPATIAL_RESOLUTION_M,
) -> list[DataPoint]:
    """Keep one sample per `resolution_m` of cumulative distance traveled.

    Distance source priority, evaluated per pair of consecutive points:
      1. GPS haversine (lat/lon present on both points)
      2. speed_kmh × Δt (fallback when GPS is absent or gapped); a missing
         speed falls back to the most recent known speed.

    `sampled` is returned unchanged when:
      * it has fewer than two points,
      * no point carries GPS or speed data, or
      * the whole track covers less than `resolution_m` (e.g. a stationary
        trainer reporting speed 0), because thinning it would otherwise
        reduce the recording to just its first and last sample.

    Otherwise the first and last points are always retained.
    """
    if len(sampled) < 2:
        return sampled

    has_gps = any(p.lat is not None and p.lon is not None for p in sampled)
    has_speed = any(p.speed_kmh is not None for p in sampled)
    if not has_gps and not has_speed:
        return sampled

    result: list[DataPoint] = [sampled[0]]
    cum_dist = 0.0
    last_kept = 0.0
    # Seed the carried-forward speed from the first sample so a speed gap
    # right at the start does not count as standing still.
    first_speed = sampled[0].speed_kmh
    prev_speed = first_speed if first_speed is not None else 0.0

    for prev, cur in zip(sampled, sampled[1:]):
        if (prev.lat is not None and prev.lon is not None
                and cur.lat is not None and cur.lon is not None):
            dist_m = _haversine_m(prev.lat, prev.lon, cur.lat, cur.lon)
        else:
            dt = (cur.timestamp - prev.timestamp).total_seconds()
            spd = cur.speed_kmh if cur.speed_kmh is not None else prev_speed
            # max(dt, 0) guards against out-of-order timestamps.
            dist_m = (spd / 3.6) * max(dt, 0)
        # Track the latest known speed on every step, including GPS-measured
        # ones, so a later GPS gap falls back to a current value.
        if cur.speed_kmh is not None:
            prev_speed = cur.speed_kmh

        cum_dist += dist_m
        if cum_dist - last_kept >= resolution_m:
            result.append(cur)
            last_kept = cum_dist

    if cum_dist < resolution_m:
        # Effectively stationary: leave the recording untouched.
        return sampled

    if result[-1] is not sampled[-1]:
        result.append(sampled[-1])
    return result
def _gps_speed_kmh(
lat_vals: list[Optional[float]],
lon_vals: list[Optional[float]],
@@ -24,10 +86,7 @@ def _gps_speed_kmh(
dt = ts_vals[i] - ts_vals[i - 1]
if la0 is None or lo0 is None or la1 is None or lo1 is None or dt <= 0:
continue
dlat = radians(la1 - la0)
dlon = radians(lo1 - lo0)
a = sin(dlat / 2) ** 2 + cos(radians(la0)) * cos(radians(la1)) * sin(dlon / 2) ** 2
d_km = 2 * 6371.0 * atan2(sqrt(a), sqrt(1 - a))
d_km = _haversine_m(la0, lo0, la1, lo1) / 1000.0
raw[i] = d_km / dt * 3600.0
# 5-point centred moving average (skip None anchors)
@@ -69,6 +128,8 @@ def build_timeseries(
sampled.append(p)
last_t = t
sampled = _spatial_downsample(sampled)
ts_vals = [int((p.timestamp - started_at).total_seconds()) for p in sampled]
lat_vals = [round(p.lat, 7) if p.lat is not None else None for p in sampled] if include_gps else None
lon_vals = [round(p.lon, 7) if p.lon is not None else None for p in sampled] if include_gps else None
+98
View File
@@ -517,6 +517,96 @@ def _backfill_speed(data: Path, handle: str | None = None) -> None:
console.print(f" [cyan]{user_dir.name}[/cyan]: {updated} timeseries updated with GPS speed")
def _downsample_timeseries(data: Path, handle: str | None = None) -> None:
    """Apply 10 m spatial downsampling to all stored timeseries files in activities/.

    Reads the parallel JSON arrays, computes which indices to keep (GPS
    haversine distance where both points have coordinates, otherwise
    speed × Δt), slices every array that runs parallel to "t", and writes
    the file back. Run bincio render --no-build afterward so _merge_edits
    regenerates _merged/ from the smaller source files.

    A file is left untouched when it:
      * cannot be read or parsed, or has channels whose length differs from
        "t" (a warning is printed),
      * has fewer than two samples, or no GPS and no speed data,
      * covers less than the spatial resolution in total (stationary), or
      * would not lose any sample.

    Args:
        data: Root of the data store containing the per-user directories.
        handle: When given, only ``data / handle`` is processed; otherwise
            every directory returned by ``_user_dirs(data)``.
    """
    import json

    from bincio.extract.timeseries import _haversine_m, _SPATIAL_RESOLUTION_M

    # Known per-sample channels; used to validate array lengths up front.
    _CHANNELS = ("t", "lat", "lon", "elevation_m", "speed_kmh",
                 "hr_bpm", "cadence_rpm", "power_w", "temperature_c")

    targets = [data / handle] if handle else _user_dirs(data)
    for user_dir in targets:
        acts_dir = user_dir / "activities"
        if not acts_dir.exists():
            continue
        updated = skipped = 0
        for ts_path in sorted(acts_dir.glob("*.timeseries.json")):
            try:
                ts = json.loads(ts_path.read_text(encoding="utf-8"))
            except (OSError, ValueError) as exc:
                # ValueError covers JSONDecodeError and UnicodeDecodeError.
                console.print(
                    f"  [yellow]{ts_path.name}: unreadable, left untouched "
                    f"({type(exc).__name__})[/yellow]"
                )
                continue
            if not isinstance(ts, dict):
                console.print(
                    f"  [yellow]{ts_path.name}: unexpected JSON layout, "
                    f"left untouched[/yellow]"
                )
                continue

            t_vals = ts.get("t") or []
            n = len(t_vals)
            if n < 2:
                skipped += 1
                continue

            # A known channel that is populated but not the same length as
            # "t" cannot be sliced safely; skip rather than crash or corrupt.
            malformed = any(
                isinstance(ts.get(key), list) and ts[key] and len(ts[key]) != n
                for key in _CHANNELS
            )
            if malformed:
                console.print(
                    f"  [yellow]{ts_path.name}: channel lengths differ, "
                    f"left untouched[/yellow]"
                )
                continue

            # Absent channels are padded with None so the walk can index freely.
            lat_vals = ts.get("lat") or [None] * n
            lon_vals = ts.get("lon") or [None] * n
            spd_vals = ts.get("speed_kmh") or [None] * n

            has_gps = any(v is not None for v in lat_vals)
            has_speed = any(v is not None for v in spd_vals)
            if not has_gps and not has_speed:
                skipped += 1
                continue

            kept: list[int] = [0]
            cum_dist = last_kept = 0.0
            # Seed the carried-forward speed from the first sample.
            prev_speed = spd_vals[0] if spd_vals[0] is not None else 0.0

            for i in range(1, n):
                la0, lo0 = lat_vals[i - 1], lon_vals[i - 1]
                la1, lo1 = lat_vals[i], lon_vals[i]
                speed = spd_vals[i]
                if (la0 is not None and lo0 is not None
                        and la1 is not None and lo1 is not None):
                    dist_m = _haversine_m(la0, lo0, la1, lo1)
                else:
                    dt = t_vals[i] - t_vals[i - 1]
                    spd = speed if speed is not None else prev_speed
                    # max(dt, 0) guards against out-of-order timestamps.
                    dist_m = (spd / 3.6) * max(dt, 0)
                if speed is not None:
                    prev_speed = speed

                cum_dist += dist_m
                if cum_dist - last_kept >= _SPATIAL_RESOLUTION_M:
                    kept.append(i)
                    last_kept = cum_dist

            if cum_dist < _SPATIAL_RESOLUTION_M:
                # Effectively stationary: rewriting would irreversibly reduce
                # the recording to its first and last sample.
                skipped += 1
                continue

            if kept[-1] != n - 1:
                kept.append(n - 1)

            if len(kept) >= n:
                skipped += 1
                continue  # already sparse (very short / indoor / rest-stop heavy)

            # Slice every array parallel to "t", not just the known channels,
            # so no per-sample array is left misaligned with the others.
            for key, values in list(ts.items()):
                if isinstance(values, list) and len(values) == n:
                    ts[key] = [values[i] for i in kept]

            ts_path.write_text(
                json.dumps(ts, indent=2, ensure_ascii=False), encoding="utf-8"
            )
            updated += 1

        console.print(
            f"  [cyan]{user_dir.name}[/cyan]: "
            f"{updated} downsampled, {skipped} skipped (indoor / short / already sparse)"
        )
@click.command()
@click.option("--config", "config_path", default=None,
help="Path to extract_config.yaml (reads output.dir from it).")
@@ -549,6 +639,9 @@ def _backfill_speed(data: Path, handle: str | None = None) -> None:
@click.option("--backfill-speed", "backfill_speed", is_flag=True,
help="Compute GPS-derived speed for timeseries where the device didn't record "
"per-second speed (run once to enable speed map coloring on older activities).")
@click.option("--downsample-timeseries", "downsample_timeseries", is_flag=True,
help="Apply 10 m spatial downsampling to all stored timeseries files "
"(run once after deploying the downsampling code).")
def render(
config_path: Optional[str],
data_dir: Optional[str],
@@ -563,6 +656,7 @@ def render(
recompute_vam: bool,
backfill_vam_summary: bool,
backfill_speed: bool,
downsample_timeseries: bool,
) -> None:
"""Build (or serve) the BincioActivity static site from a BAS data store."""
@@ -592,6 +686,10 @@ def render(
console.print("Backfilling GPS-derived speed into timeseries…")
_backfill_speed(data, handle=handle)
if downsample_timeseries:
console.print("Applying spatial downsampling to timeseries…")
_downsample_timeseries(data, handle=handle)
_merge_edits(data, handle=handle)
_rebuild_athlete_json(data, handle=handle)
_bake_tracks(data, handle=handle)