perf: year-shard index.json to cut initial load from MBs to ~1 year

_merged/index.json (written by both merge_one and merge_all) is now a shard
manifest; activities are split into index-{year}.json files. The feed loads
only the most-recent year on first paint (~200 activities instead of all of
them). Older years are fetched lazily when the user clicks "Load older
activities".

Also strips best_efforts / best_climb_m / source from shard files —
these fields are aggregation inputs only, never read by the feed UI.
This commit is contained in:
Davide Scaini
2026-04-19 22:21:10 +02:00
parent bb253cc2c1
commit cada2bcb03
5 changed files with 230 additions and 33 deletions
+53 -8
View File
@@ -155,8 +155,7 @@ def merge_one(data_dir: Path, activity_id: str) -> None:
activities.sort(key=lambda a: a.get("started_at", ""), reverse=True)
activities.sort(key=lambda a: 0 if a.get("custom", {}).get("highlight") else 1)
index["activities"] = activities
(merged_dir / "index.json").write_text(json.dumps(index, indent=2, ensure_ascii=False))
_write_year_shards(merged_dir, activities, index)
def merge_all(data_dir: Path) -> int:
@@ -267,11 +266,57 @@ def merge_all(data_dir: Path) -> int:
activities.sort(key=lambda a: a.get("started_at", ""), reverse=True)
activities.sort(key=lambda a: 0 if a.get("custom", {}).get("highlight") else 1)
index["activities"] = activities
(merged_dir / "index.json").write_text(
json.dumps(index, indent=2, ensure_ascii=False)
)
elif (merged_dir / "index.json").exists():
(merged_dir / "index.json").unlink()
_write_year_shards(merged_dir, activities, index)
else:
# Remove any stale year shard files if the source index disappeared
for f in merged_dir.glob("index-*.json"):
f.unlink()
if (merged_dir / "index.json").exists():
(merged_dir / "index.json").unlink()
return len(sidecars)
# Activity keys that _write_year_shards omits from every entry it writes to
# the per-year shard files. Per the original note, these fields are only
# needed for athlete.json aggregation at extract time — they add bulk to
# every summary entry but are never read by the feed UI.
_FEED_STRIP = {"best_efforts", "best_climb_m", "source"}
def _write_year_shards(merged_dir: Path, activities: list[dict], index_meta: dict) -> None:
    """Split activities by year and write index-{year}.json shards.

    Replaces merged_dir/index.json with a shard manifest so the feed can
    load only the most-recent year on first paint and fetch older years lazily.

    Args:
        merged_dir: Directory that receives ``index.json`` and the
            ``index-*.json`` shard files. Existing ``index-*.json`` files in
            it are deleted first.
        activities: Activity summary dicts. Their relative order is kept
            within each shard.
        index_meta: Index document; every key except ``activities`` and
            ``shards`` is copied into the manifest and into each shard.

    An activity is bucketed by the first four characters of ``started_at``.
    When that prefix is not exactly four ASCII digits (missing, malformed or
    non-string value) the activity goes to the ``unknown`` shard, which is
    listed last in the manifest with ``year`` 0.
    """
    from collections import defaultdict

    # Remove stale year shard files from previous runs
    for stale in merged_dir.glob("index-*.json"):
        stale.unlink()

    by_year: dict[str, list[dict]] = defaultdict(list)
    for activity in activities:
        started_at = activity.get("started_at")
        prefix = started_at[:4] if isinstance(started_at, str) else ""
        # Only a clean 4-digit prefix may become part of a file name; anything
        # else would yield odd paths and duplicate year-0 manifest entries.
        is_year = len(prefix) == 4 and prefix.isascii() and prefix.isdigit()
        year = prefix if is_year else "unknown"
        # Strip aggregation-only fields to keep shard files small
        slim = {k: v for k, v in activity.items() if k not in _FEED_STRIP}
        by_year[year].append(slim)

    # Newest year first. A plain reverse string sort would put "unknown"
    # ahead of every numeric year, so rank it explicitly last.
    years = sorted(by_year, key=lambda y: (y != "unknown", y), reverse=True)

    # Metadata shared by the manifest and every shard document.
    meta = {k: v for k, v in index_meta.items() if k not in ("activities", "shards")}

    shards = []
    for year in years:
        shard_doc = {**meta, "shards": [], "activities": by_year[year]}
        fname = f"index-{year}.json"
        # ensure_ascii=False emits raw non-ASCII characters, so pin UTF-8
        # instead of relying on the platform's locale encoding.
        (merged_dir / fname).write_text(
            json.dumps(shard_doc, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )
        shards.append({
            "url": fname,
            "year": int(year) if year != "unknown" else 0,
            "count": len(by_year[year]),
        })

    root_doc = {**meta, "shards": shards, "activities": []}
    # Written last so the manifest only ever lists shards already on disk.
    (merged_dir / "index.json").write_text(
        json.dumps(root_doc, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )