de602ff5d9
New pref download_disabled_default (stored in user_prefs + mirrored to _user_settings.json for the render pipeline). When true, apply_sidecar marks all activities as download_disabled unless the sidecar explicitly sets download_disabled: false (per-activity opt-in from the edit drawer). Settings page gets an "Activity defaults" card with the toggle.
507 lines
20 KiB
Python
507 lines
20 KiB
Python
"""Apply sidecar .md edits to BAS JSON files.
|
||
|
||
Produces data_dir/_merged/ — a mirror of data_dir where:
|
||
- Files without sidecars are symlinked to the originals (cheap, preserves extracted data)
|
||
- Files with sidecars are written as merged copies
|
||
- index.json is rewritten as a manifest of per-year shards (index-{year}.json) with sidecar overrides applied and highlighted activities sorted first
|
||
|
||
This keeps data_dir/activities/*.json pristine (re-running extract never clobbers
|
||
user edits, and removing a sidecar always reverts fully on the next render).
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import json
|
||
import re
|
||
import shutil
|
||
import threading
|
||
from pathlib import Path
|
||
|
||
import yaml
|
||
|
||
# Per-user-directory lock (see _merge_locks / _merge_lock further down) so
# concurrent upload requests and the dev file-watcher cannot run merge_all
# simultaneously on the same directory.
|
||
|
||
|
||
def _fix_surrogates(obj: object) -> object:
|
||
"""Recursively replace surrogate pairs in strings with proper Unicode code points.
|
||
|
||
Surrogate pairs (U+D800–U+DFFF) are valid in Python str but not in UTF-8.
|
||
They typically arise when emoji from UTF-16-encoded sources (Strava, some FIT
|
||
devices) are decoded incorrectly. encode/decode via utf-16 with surrogatepass
|
||
reconstructs the intended characters.
|
||
"""
|
||
if isinstance(obj, str):
|
||
try:
|
||
obj.encode("utf-8")
|
||
return obj
|
||
except UnicodeEncodeError:
|
||
return obj.encode("utf-16", "surrogatepass").decode("utf-16")
|
||
if isinstance(obj, dict):
|
||
return {k: _fix_surrogates(v) for k, v in obj.items()}
|
||
if isinstance(obj, list):
|
||
return [_fix_surrogates(v) for v in obj]
|
||
return obj
|
||
|
||
|
||
def _dumps(obj: object) -> str:
    """Serialize *obj* to JSON text with a 2-space indent.

    The object is passed through ``_fix_surrogates`` first, and non-ASCII
    characters are written literally (``ensure_ascii=False``).
    """
    cleaned = _fix_surrogates(obj)
    return json.dumps(cleaned, ensure_ascii=False, indent=2)
|
||
_merge_locks: dict[str, threading.Lock] = {}
|
||
_merge_locks_mu = threading.Lock()
|
||
|
||
|
||
def _merge_lock(data_dir: Path) -> threading.Lock:
|
||
key = str(data_dir.resolve())
|
||
with _merge_locks_mu:
|
||
if key not in _merge_locks:
|
||
_merge_locks[key] = threading.Lock()
|
||
return _merge_locks[key]
|
||
|
||
|
||
def parse_sidecar(path: Path) -> tuple[dict, str]:
    """Return (frontmatter_dict, markdown_body) from a sidecar .md file.

    A file that starts with ``---`` and contains a second ``---`` fence line
    is split into YAML front matter and a body. Anything else is treated as
    body only, with an empty front matter dict. The body is stripped of
    leading and trailing whitespace.

    Front matter that parses to something other than a mapping (empty block,
    list, bare scalar) is treated as empty, so callers can rely on getting a
    dict.
    """
    text = path.read_text(encoding="utf-8")
    if text.startswith("---"):
        # maxsplit=2 -> ["", <front matter>, <body>] when both fences exist.
        parts = re.split(r"^---[ \t]*$", text, maxsplit=2, flags=re.MULTILINE)
        if len(parts) >= 3:
            fm = yaml.safe_load(parts[1])
            if not isinstance(fm, dict):
                # None, a list or a scalar would break the later `fm.get(...)`
                # and `fm[...]` lookups; ignore such front matter.
                fm = {}
            return fm, parts[2].strip()
    return {}, text.strip()
|
||
|
||
|
||
def apply_sidecar(detail: dict, fm: dict, body: str, *, download_disabled_default: bool = False) -> dict:
    """Apply sidecar overrides to a detail JSON dict. Returns a modified copy.

    *detail* itself is not mutated: the top-level dict and its ``custom``
    sub-dict are copied before anything is written.

    Front matter keys honoured:
      - ``title``, ``sport``: stringified and copied over.
      - ``sub_sport``: stringified, or cleared to None when falsy.
      - ``gear``: stringified; a falsy value keeps the existing gear.
      - ``description``: used only when *body* is empty; a non-empty *body*
        always becomes the description.
      - ``highlight``: stored as a bool in ``custom.highlight``.
      - ``private``: truthy sets privacy to "unlisted"; falsy keeps the
        original privacy (default "public").
      - ``hide_stats``: stored as a list of strings in ``custom.hide_stats``.
      - ``download_disabled``: True sets the flag. When the key is absent,
        the flag is set if *download_disabled_default* is true. An explicit
        False leaves the detail untouched (per-activity opt-in).

    When no sub_sport is set after the overrides, the title is passed to
    ``_infer_indoor_title`` and a truthy result sets sub_sport to "indoor".
    """
    d = dict(detail)
    # `custom` may be missing or an explicit JSON null; either way start from
    # a fresh dict so the caller's data is never mutated.
    d["custom"] = dict(d.get("custom") or {})

    if "title" in fm:
        d["title"] = str(fm["title"])
    if "sport" in fm:
        d["sport"] = str(fm["sport"])
    if "sub_sport" in fm:
        d["sub_sport"] = str(fm["sub_sport"]) if fm["sub_sport"] else None
    # Infer indoor from title when sub_sport is still absent after sidecar
    if not d.get("sub_sport"):
        from bincio.extract.writer import _infer_indoor_title

        if _infer_indoor_title(d.get("title") or ""):
            d["sub_sport"] = "indoor"
    if "gear" in fm:
        d["gear"] = str(fm["gear"]) if fm["gear"] else d.get("gear")
    if body:
        d["description"] = body
    elif "description" in fm:
        d["description"] = str(fm["description"])
    if "highlight" in fm:
        d["custom"]["highlight"] = bool(fm["highlight"])
    if "private" in fm:
        d["privacy"] = "unlisted" if fm["private"] else detail.get("privacy", "public")
    if "hide_stats" in fm:
        d["custom"]["hide_stats"] = [str(s) for s in (fm["hide_stats"] or [])]

    dd = fm.get("download_disabled")  # True, False, or None (absent)
    if dd is True or (dd is None and download_disabled_default):
        d["download_disabled"] = True
    # dd is False → explicit per-activity opt-in, leave unset

    return d
|
||
|
||
|
||
def _apply_sidecar_summary(summary: dict, fm: dict) -> dict:
|
||
"""Apply sidecar overrides to an index summary entry."""
|
||
from bincio.extract.writer import _infer_indoor_title
|
||
s = dict(summary)
|
||
s.setdefault("custom", {})
|
||
s["custom"] = dict(s["custom"])
|
||
|
||
if "title" in fm:
|
||
s["title"] = str(fm["title"])
|
||
if "sport" in fm:
|
||
s["sport"] = str(fm["sport"])
|
||
if "sub_sport" in fm:
|
||
s["sub_sport"] = str(fm["sub_sport"]) if fm["sub_sport"] else None
|
||
if "highlight" in fm:
|
||
s["custom"]["highlight"] = bool(fm["highlight"])
|
||
if "private" in fm:
|
||
s["privacy"] = "unlisted" if fm["private"] else summary.get("privacy", "public")
|
||
# Infer indoor from title when sub_sport is still absent
|
||
if not s.get("sub_sport") and _infer_indoor_title(s.get("title") or ""):
|
||
s["sub_sport"] = "indoor"
|
||
|
||
return s
|
||
|
||
|
||
def merge_one(data_dir: Path, activity_id: str) -> None:
    """Re-merge a single activity after its sidecar was added, edited or removed.

    Refreshes data_dir/_merged/activities/{id}.json and rewrites
    _merged/index.json. Intended for interactive edits: only one activity
    file is touched instead of rebuilding the whole _merged/activities/
    directory as merge_all() does.

    Prefer merge_all() for bulk operations (first run, Strava sync, etc.).
    The work runs while holding the per-directory merge lock.
    """
    lock = _merge_lock(data_dir)
    with lock:
        _merge_one_locked(data_dir, activity_id)
|
||
|
||
|
||
def _load_download_disabled_default(data_dir: Path) -> bool:
    """Read the ``download_disabled_default`` pref from _user_settings.json.

    A missing, unreadable or malformed settings file counts as False.
    """
    settings_path = data_dir / "_user_settings.json"
    try:
        settings = json.loads(settings_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: missing/unreadable file; ValueError: bad JSON or bad UTF-8.
        return False
    if not isinstance(settings, dict):
        return False
    return bool(settings.get("download_disabled_default", False))


def _merge_one_locked(data_dir: Path, activity_id: str) -> None:
    """Implementation of merge_one(); called while the merge lock is held.

    Steps:
      1. Return early if activities/{id}.json does not exist.
      2. Re-point _merged/activities/{id}.timeseries.json at the extract
         output (timeseries files are never merged).
      3. Replace _merged/activities/{id}.json with either a merged copy
         (sidecar present, images uploaded, or the title implies an indoor
         sub_sport) or a symlink to the original.
      4. If data_dir/index.json exists, rebuild the merged index shards with
         every sidecar in edits/ applied.

    The ``download_disabled_default`` user setting is honoured when a merged
    copy is written, the same way merge_all() does.
    """
    edits_dir = data_dir / "edits"
    acts_dir = data_dir / "activities"
    merged_dir = data_dir / "_merged"
    merged_acts = merged_dir / "activities"
    merged_acts.mkdir(parents=True, exist_ok=True)

    src = acts_dir / f"{activity_id}.json"
    if not src.exists():
        return

    dest = merged_acts / f"{activity_id}.json"

    # Determine if a sidecar or image list applies to this activity.
    # (Neither path can exist when edits/ itself is missing.)
    sidecar_path = edits_dir / f"{activity_id}.md"
    has_sidecar = sidecar_path.exists()
    images_dir = edits_dir / "images" / activity_id
    image_files: list[str] = []
    if images_dir.exists():
        image_files = sorted(
            p.name for p in images_dir.iterdir()
            if p.is_file() and not p.name.startswith(".")
        )

    needs_merge = has_sidecar or bool(image_files)
    detail: dict | None = None
    # Also need a real file (not symlink) when title inference would change sub_sport
    if not needs_merge:
        from bincio.extract.writer import _infer_indoor_title

        detail = json.loads(src.read_text(encoding="utf-8"))
        if not detail.get("sub_sport") and _infer_indoor_title(detail.get("title") or ""):
            needs_merge = True

    # Symlink the timeseries file (never merged — always points to the extract output)
    ts_src = acts_dir / f"{activity_id}.timeseries.json"
    ts_dest = merged_acts / f"{activity_id}.timeseries.json"
    if ts_dest.exists() or ts_dest.is_symlink():
        ts_dest.unlink()
    if ts_src.exists():
        ts_dest.symlink_to(ts_src.resolve())

    # Remove the old dest (symlink or file) before writing the new one
    if dest.exists() or dest.is_symlink():
        dest.unlink()

    if needs_merge:
        if not detail:
            detail = json.loads(src.read_text(encoding="utf-8"))
        # With no sidecar, apply_sidecar still performs title inference.
        fm, body = parse_sidecar(sidecar_path) if has_sidecar else ({}, "")
        detail = apply_sidecar(
            detail,
            fm,
            body,
            download_disabled_default=_load_download_disabled_default(data_dir),
        )
        if image_files:
            detail["custom"] = dict(detail.get("custom") or {})
            detail["custom"]["images"] = image_files
        # _dumps keeps non-ASCII characters literal, so pin the encoding
        # instead of relying on the locale default.
        dest.write_text(_dumps(detail), encoding="utf-8")
    else:
        dest.symlink_to(src.resolve())

    # Rewrite index — load the full sidecar map so all summaries stay consistent
    index_path = data_dir / "index.json"
    if not index_path.exists():
        return

    all_sidecars: dict[str, tuple[dict, str]] = {}
    if edits_dir.exists():
        for md_path in edits_dir.glob("*.md"):
            all_sidecars[md_path.stem] = parse_sidecar(md_path)

    index = json.loads(index_path.read_text(encoding="utf-8"))
    activities = []
    for s in index.get("activities", []):
        fm, _ = all_sidecars.get(s.get("id", ""), ({}, ""))
        activities.append(_apply_sidecar_summary(s, fm))

    # Two stable sorts: newest first, then highlighted entries float to the top.
    activities.sort(key=lambda a: a.get("started_at", ""), reverse=True)
    activities.sort(key=lambda a: 0 if a.get("custom", {}).get("highlight") else 1)

    _write_year_shards(merged_dir, activities, index)
|
||
|
||
|
||
def merge_all(data_dir: Path) -> int:
    """Rebuild data_dir/_merged/ with every sidecar override applied.

    Runs while holding the per-directory merge lock and returns the number
    of sidecars reported by the locked implementation.
    """
    lock = _merge_lock(data_dir)
    with lock:
        applied = _merge_all_locked(data_dir)
        return applied
|
||
|
||
|
||
def _merge_all_locked(data_dir: Path) -> int:
    """Implementation of merge_all(); called while the merge lock is held.

    Rebuilds data_dir/_merged/ from scratch:
      - _merged/activities/ is wiped, then every file in activities/ is
        either written as a merged copy (sidecar present, images uploaded,
        or title implies an indoor sub_sport) or symlinked to the original.
      - edits/images/<id>/ directories are symlinked under
        _merged/activities/images/.
      - athlete.json is overlaid with the editable keys of
        edits/athlete.yaml, or symlinked when there are no edits.
      - index.json is rewritten as year shards with sidecar overrides
        applied; stale index files are removed when the source index is
        gone.

    Returns the number of sidecar .md files found in edits/.
    """
    edits_dir = data_dir / "edits"
    acts_dir = data_dir / "activities"
    merged_dir = data_dir / "_merged"
    merged_acts = merged_dir / "activities"

    # User-level default for download_disabled; any problem reading the
    # settings file falls back to "not disabled".
    _settings_path = data_dir / "_user_settings.json"
    try:
        _user_settings = (
            json.loads(_settings_path.read_text(encoding="utf-8"))
            if _settings_path.exists()
            else {}
        )
    except (OSError, json.JSONDecodeError):
        _user_settings = {}
    if not isinstance(_user_settings, dict):
        # Valid JSON that is not an object (e.g. a list) has no prefs to read.
        _user_settings = {}
    _dl_default: bool = bool(_user_settings.get("download_disabled_default", False))

    # Collect sidecars upfront
    sidecars: dict[str, tuple[dict, str]] = {}
    if edits_dir.exists():
        for md_path in sorted(edits_dir.glob("*.md")):
            sidecars[md_path.stem] = parse_sidecar(md_path)

    # Collect image lists — activities with uploaded images get custom.images even
    # if they have no sidecar text yet
    image_lists: dict[str, list[str]] = {}
    images_root = edits_dir / "images" if edits_dir.exists() else None
    if images_root and images_root.exists():
        for img_dir in sorted(images_root.iterdir()):
            if img_dir.is_dir():
                files = sorted(
                    p.name for p in img_dir.iterdir()
                    if p.is_file() and not p.name.startswith(".")
                )
                if files:
                    image_lists[img_dir.name] = files

    to_merge = set(sidecars) | set(image_lists)

    # Also include activities whose title implies indoor (no sidecar required)
    _index_path = data_dir / "index.json"
    _cached_index: dict | None = None
    if _index_path.exists():
        from bincio.extract.writer import _infer_indoor_title

        _cached_index = json.loads(_index_path.read_text(encoding="utf-8"))
        for _s in _cached_index.get("activities", []):
            _aid = _s.get("id", "")
            if _aid and not _s.get("sub_sport") and _infer_indoor_title(_s.get("title") or ""):
                to_merge.add(_aid)

    # Wipe and recreate _merged/activities/
    shutil.rmtree(merged_acts, ignore_errors=True)
    merged_acts.mkdir(parents=True, exist_ok=True)

    # Mirror activities/ — symlink unmodified, write merged copies for overridden
    # NOTE(review): symlinked activities never pass through apply_sidecar, so
    # download_disabled_default is not applied to them here — confirm whether
    # the consumer of _user_settings.json covers that case.
    if acts_dir.exists():
        for src in sorted(acts_dir.iterdir()):
            if not src.is_file():
                continue
            dest = merged_acts / src.name
            activity_id = src.stem
            if src.suffix == ".json" and activity_id in to_merge:
                detail = json.loads(src.read_text(encoding="utf-8"))
                # No sidecar → empty overrides; title inference still runs.
                fm, body = sidecars.get(activity_id, ({}, ""))
                detail = apply_sidecar(detail, fm, body, download_disabled_default=_dl_default)
                if activity_id in image_lists:
                    detail["custom"] = dict(detail.get("custom") or {})
                    detail["custom"]["images"] = image_lists[activity_id]
                # _dumps keeps non-ASCII literal; pin the encoding rather
                # than depend on the locale default.
                dest.write_text(_dumps(detail), encoding="utf-8")
            elif not dest.exists() and not dest.is_symlink():
                dest.symlink_to(src.resolve())

    # Mirror edits/images/ → _merged/activities/images/ so the site can serve them
    if images_root and images_root.exists():
        merged_images = merged_acts / "images"
        merged_images.mkdir(exist_ok=True)
        for img_dir in images_root.iterdir():
            if img_dir.is_dir():
                dest_img = merged_images / img_dir.name
                if not dest_img.exists():
                    dest_img.symlink_to(img_dir.resolve())

    # Produce merged athlete.json — base from extract overlaid with edits/athlete.yaml
    athlete_src = data_dir / "athlete.json"
    athlete_dest = merged_dir / "athlete.json"
    if athlete_dest.exists() or athlete_dest.is_symlink():
        athlete_dest.unlink()
    if athlete_src.exists():
        athlete_edits_path = data_dir / "edits" / "athlete.yaml"
        edits: object = {}
        if athlete_edits_path.exists():
            try:
                edits = yaml.safe_load(athlete_edits_path.read_text(encoding="utf-8")) or {}
            except Exception:
                # Deliberately best-effort: a broken athlete.yaml must not
                # block the merge; fall back to the unedited athlete.json.
                edits = {}
        if not isinstance(edits, dict):
            # A YAML list or scalar has no keys to overlay.
            edits = {}
        _ATHLETE_EDITABLE = {"max_hr", "ftp_w", "hr_zones", "power_zones", "seasons", "gear"}
        if edits:
            athlete_data = json.loads(athlete_src.read_text(encoding="utf-8"))
            athlete_data.update({k: v for k, v in edits.items() if k in _ATHLETE_EDITABLE})
            athlete_dest.write_text(_dumps(athlete_data), encoding="utf-8")
        else:
            athlete_dest.symlink_to(athlete_src.resolve())

    # Write merged index (sidecar overrides applied, highlight sorted, year-sharded)
    index_path = data_dir / "index.json"
    if index_path.exists():
        index = _cached_index or json.loads(index_path.read_text(encoding="utf-8"))
        activities = []
        for s in index.get("activities", []):
            fm, _ = sidecars.get(s.get("id", ""), ({}, ""))
            activities.append(_apply_sidecar_summary(s, fm))

        # "unlisted" (and legacy "private") activities are kept in the index so
        # the owner can reach them by direct URL; the feed UI filters them out
        # for non-owners client-side.
        # Sort: newest first, then bring highlighted activities to the top.
        activities.sort(key=lambda a: a.get("started_at", ""), reverse=True)
        activities.sort(key=lambda a: 0 if a.get("custom", {}).get("highlight") else 1)

        _write_year_shards(merged_dir, activities, index)
    else:
        # Remove any stale year shard files if the source index disappeared
        for f in merged_dir.glob("index-*.json"):
            f.unlink()
        if (merged_dir / "index.json").exists():
            (merged_dir / "index.json").unlink()

    return len(sidecars)
|
||
|
||
|
||
# Fields only needed for athlete.json aggregation at extract time — they add
# bulk to every summary entry but are never read by the feed UI.
_FEED_STRIP = {"best_efforts", "best_climb_m", "source"}


def _write_year_shards(merged_dir: Path, activities: list[dict], index_meta: dict) -> None:
    """Split activities by year and write index-{year}.json shards.

    Replaces merged_dir/index.json with a shard manifest so the feed can
    load only the most-recent year on first paint and fetch older years lazily.

    Args:
        merged_dir: Existing directory that receives the shard files.
        activities: Summary entries, already in display order; that order is
            preserved inside each shard.
        index_meta: Source index document. Every key except ``activities``
            and ``shards`` is copied into each shard and into the manifest.

    Entries are grouped by the first four characters of ``started_at``;
    entries without one land in ``index-unknown.json``. Manifest shards are
    listed newest year first, with non-numeric buckets last (year 0).
    """
    from collections import defaultdict

    # Remove stale year shard files from previous runs
    for stale in merged_dir.glob("index-*.json"):
        stale.unlink(missing_ok=True)

    by_year: dict[str, list[dict]] = defaultdict(list)
    for a in activities:
        year = (a.get("started_at") or "")[:4] or "unknown"
        # Strip aggregation-only fields to keep shard files small
        by_year[year].append({k: v for k, v in a.items() if k not in _FEED_STRIP})

    # Shared header for every document written below.
    meta = {k: v for k, v in index_meta.items() if k not in ("activities", "shards")}

    # Numeric years newest first; buckets like "unknown" go last so the first
    # manifest entry is the most recent real year.
    years = sorted(by_year, key=lambda y: (y.isdigit(), y), reverse=True)

    shards = []
    for year in years:
        fname = f"index-{year}.json"
        shard_doc = {**meta, "shards": [], "activities": by_year[year]}
        # _dumps keeps non-ASCII literal; pin the encoding rather than depend
        # on the locale default.
        (merged_dir / fname).write_text(_dumps(shard_doc), encoding="utf-8")
        shards.append({
            "url": fname,
            "year": int(year) if year.isdigit() else 0,
            "count": len(by_year[year]),
        })

    root_doc = {**meta, "shards": shards, "activities": []}
    (merged_dir / "index.json").write_text(_dumps(root_doc), encoding="utf-8")
|
||
|
||
|
||
FEED_PAGE_SIZE = 50

# Extra fields stripped from the combined feed — preview_coords is the biggest
# contributor (~24% of shard size) but the feed cards need it for thumbnails,
# so we keep it. mmp is never displayed in feed cards.
_COMBINED_FEED_STRIP = _FEED_STRIP | {"mmp"}


def _tag_feed_activity(activity: dict, handle: str, url_prefix: str) -> dict:
    """Return a copy of *activity* tagged with its owner and rooted URLs.

    ``detail_url`` / ``track_url`` values that are relative (not starting
    with "http" or "/") are prefixed with *url_prefix* so they resolve from
    the directory that holds the combined feed.
    """
    tagged = {**activity, "handle": handle}
    for key in ("detail_url", "track_url"):
        url = tagged.get(key, "")
        if url and not url.startswith(("http", "/")):
            tagged[key] = url_prefix + url
    return tagged


def write_combined_feed(data_dir: Path) -> int:
    """Build data_dir/feed.json and per-month data_dir/feed-YYYY-MM.json shards.

    feed.json is a BAS shard index (same format as per-user index.json).
    Each feed-YYYY-MM.json contains all activities for that month across all users,
    sorted newest-first. Returns the number of activities written.

    Every sub-directory of *data_dir* that contains an ``activities`` folder
    is treated as one user; its name becomes the ``handle`` on each entry.
    The user's ``_merged/index.json`` is preferred over the raw
    ``index.json``. Entries whose ``started_at`` does not start with
    ``YYYY-MM`` cannot be bucketed and are skipped (and not counted).
    """
    user_dirs = sorted(
        p for p in data_dir.iterdir()
        if p.is_dir() and (p / "activities").exists()
    )

    all_activities: list[dict] = []
    for user_dir in user_dirs:
        handle = user_dir.name
        merged = user_dir / "_merged"
        has_merged = merged.exists()
        index_path = (merged if has_merged else user_dir) / "index.json"
        if not index_path.exists():
            continue

        # Relative URLs in the index are relative to the index's own
        # directory; the combined feed lives one or two levels above it.
        url_prefix = f"{handle}/_merged/" if has_merged else f"{handle}/"

        index = json.loads(index_path.read_text(encoding="utf-8"))
        shards = index.get("shards", [])

        if shards:
            # Only per-year shards are followed (index-unknown.json is not).
            for shard in shards:
                if not re.match(r"index-\d{4}\.json$", shard.get("url", "")):
                    continue
                shard_path = index_path.parent / shard["url"]
                if not shard_path.exists():
                    continue
                shard_data = json.loads(shard_path.read_text(encoding="utf-8"))
                all_activities.extend(
                    _tag_feed_activity(a, handle, url_prefix)
                    for a in shard_data.get("activities", [])
                )
        else:
            # Unsharded index: entries are inline and need the same URL
            # rewriting as sharded ones to be reachable from the feed.
            all_activities.extend(
                _tag_feed_activity(a, handle, url_prefix)
                for a in index.get("activities", [])
            )

    all_activities.sort(key=lambda a: a.get("started_at", ""), reverse=True)

    # Remove stale feed files (sequential pages and old year shards)
    for f in data_dir.glob("feed*.json"):
        f.unlink(missing_ok=True)

    if not all_activities:
        return 0

    # Group by YYYY-MM (month), preserving newest-first order within each bucket
    by_month: dict[str, list[dict]] = {}
    for a in all_activities:
        ym = (a.get("started_at") or "")[:7]  # "YYYY-MM"
        if len(ym) == 7 and ym[4] == "-":
            by_month.setdefault(ym, []).append(a)

    months_desc = sorted(by_month, reverse=True)
    # Count what actually lands in a shard so the total matches the files.
    written = sum(len(acts) for acts in by_month.values())

    # Write per-month shard files (~150-200 acts each → ~25 KB gzip)
    for ym, acts in by_month.items():
        slim = [{k: v for k, v in a.items() if k not in _COMBINED_FEED_STRIP} for a in acts]
        doc: dict = {"bas_version": "1.0", "activities": slim}
        # _dumps keeps non-ASCII literal; pin the encoding rather than depend
        # on the locale default.
        (data_dir / f"feed-{ym}.json").write_text(_dumps(doc), encoding="utf-8")

    # Write feed.json as a BAS shard index (same pattern as per-user index.json)
    index_doc: dict = {
        "bas_version": "1.0",
        "total_activities": written,
        "shards": [{"url": f"feed-{ym}.json"} for ym in months_desc],
        "activities": [],
    }
    (data_dir / "feed.json").write_text(_dumps(index_doc), encoding="utf-8")

    return written
|