diff --git a/bincio/render/merge.py b/bincio/render/merge.py index d21a495..4bdefda 100644 --- a/bincio/render/merge.py +++ b/bincio/render/merge.py @@ -21,6 +21,31 @@ import yaml # Per-user-directory lock so concurrent upload requests and the dev file-watcher # cannot run merge_all simultaneously on the same directory. + + +def _fix_surrogates(obj: object) -> object: + """Recursively replace surrogate pairs in strings with proper Unicode code points. + + Surrogate pairs (U+D800–U+DFFF) are valid in Python str but not in UTF-8. + They typically arise when emoji from UTF-16-encoded sources (Strava, some FIT + devices) are decoded incorrectly. encode/decode via utf-16 with surrogatepass + reconstructs the intended characters. + """ + if isinstance(obj, str): + try: + obj.encode("utf-8") + return obj + except UnicodeEncodeError: + return obj.encode("utf-16", "surrogatepass").decode("utf-16") + if isinstance(obj, dict): + return {k: _fix_surrogates(v) for k, v in obj.items()} + if isinstance(obj, list): + return [_fix_surrogates(v) for v in obj] + return obj + + +def _dumps(obj: object) -> str: + return json.dumps(_fix_surrogates(obj), indent=2, ensure_ascii=False) _merge_locks: dict[str, threading.Lock] = {} _merge_locks_mu = threading.Lock() @@ -152,7 +177,7 @@ def _merge_one_locked(data_dir: Path, activity_id: str) -> None: if image_files: detail["custom"] = dict(detail.get("custom") or {}) detail["custom"]["images"] = image_files - dest.write_text(json.dumps(detail, indent=2, ensure_ascii=False)) + dest.write_text(_dumps(detail)) else: dest.symlink_to(src.resolve()) @@ -237,7 +262,7 @@ def _merge_all_locked(data_dir: Path) -> int: if activity_id in image_lists: detail["custom"] = dict(detail.get("custom") or {}) detail["custom"]["images"] = image_lists[activity_id] - dest.write_text(json.dumps(detail, indent=2, ensure_ascii=False)) + dest.write_text(_dumps(detail)) else: if not dest.exists() and not dest.is_symlink(): dest.symlink_to(src.resolve()) @@ -271,7 +296,7 @@ def _merge_all_locked(data_dir: Path) -> int: if edits: athlete_data = json.loads(athlete_src.read_text(encoding="utf-8")) athlete_data.update({k: v for k, v in edits.items() if k in _ATHLETE_EDITABLE}) - athlete_dest.write_text(json.dumps(athlete_data, indent=2, ensure_ascii=False)) + athlete_dest.write_text(_dumps(athlete_data)) else: athlete_dest.symlink_to(athlete_src.resolve()) @@ -338,7 +363,7 @@ def _write_year_shards(merged_dir: Path, activities: list[dict], index_meta: dic "activities": by_year[year], } fname = f"index-{year}.json" - (merged_dir / fname).write_text(json.dumps(shard_doc, indent=2, ensure_ascii=False)) + (merged_dir / fname).write_text(_dumps(shard_doc)) shards.append({"url": fname, "year": int(year) if year.isdigit() else 0, "count": len(by_year[year])}) @@ -347,7 +372,7 @@ def _write_year_shards(merged_dir: Path, activities: list[dict], index_meta: dic "shards": shards, "activities": [], } - (merged_dir / "index.json").write_text(json.dumps(root_doc, indent=2, ensure_ascii=False)) + (merged_dir / "index.json").write_text(_dumps(root_doc)) FEED_PAGE_SIZE = 50 @@ -423,6 +448,6 @@ def write_combined_feed(data_dir: Path) -> int: "total_activities": len(all_activities), "activities": slim, } - (data_dir / fname).write_text(json.dumps(doc, indent=2, ensure_ascii=False)) + (data_dir / fname).write_text(_dumps(doc)) return len(all_activities)