Fix UnicodeEncodeError: sanitize surrogate pairs before JSON writes in merge.py

This commit is contained in:
Davide Scaini
2026-05-13 16:16:58 +02:00
parent c837464a28
commit ad9e428b1e
+31 -6
View File
@@ -21,6 +21,31 @@ import yaml
# Per-user-directory lock so concurrent upload requests and the dev file-watcher # Per-user-directory lock so concurrent upload requests and the dev file-watcher
# cannot run merge_all simultaneously on the same directory. # cannot run merge_all simultaneously on the same directory.
def _fix_surrogates(obj: object) -> object:
"""Recursively replace surrogate pairs in strings with proper Unicode code points.
Surrogate pairs (U+D800U+DFFF) are valid in Python str but not in UTF-8.
They typically arise when emoji from UTF-16-encoded sources (Strava, some FIT
devices) are decoded incorrectly. encode/decode via utf-16 with surrogatepass
reconstructs the intended characters.
"""
if isinstance(obj, str):
try:
obj.encode("utf-8")
return obj
except UnicodeEncodeError:
return obj.encode("utf-16", "surrogatepass").decode("utf-16")
if isinstance(obj, dict):
return {k: _fix_surrogates(v) for k, v in obj.items()}
if isinstance(obj, list):
return [_fix_surrogates(v) for v in obj]
return obj
def _dumps(obj: object) -> str:
    """Serialize *obj* to 2-space-indented JSON text, keeping non-ASCII as-is.

    The object is passed through ``_fix_surrogates`` first, and the result is
    what gets handed to ``json.dumps``.
    """
    sanitized = _fix_surrogates(obj)
    return json.dumps(sanitized, ensure_ascii=False, indent=2)
_merge_locks: dict[str, threading.Lock] = {} _merge_locks: dict[str, threading.Lock] = {}
_merge_locks_mu = threading.Lock() _merge_locks_mu = threading.Lock()
@@ -152,7 +177,7 @@ def _merge_one_locked(data_dir: Path, activity_id: str) -> None:
if image_files: if image_files:
detail["custom"] = dict(detail.get("custom") or {}) detail["custom"] = dict(detail.get("custom") or {})
detail["custom"]["images"] = image_files detail["custom"]["images"] = image_files
dest.write_text(json.dumps(detail, indent=2, ensure_ascii=False)) dest.write_text(_dumps(detail))
else: else:
dest.symlink_to(src.resolve()) dest.symlink_to(src.resolve())
@@ -237,7 +262,7 @@ def _merge_all_locked(data_dir: Path) -> int:
if activity_id in image_lists: if activity_id in image_lists:
detail["custom"] = dict(detail.get("custom") or {}) detail["custom"] = dict(detail.get("custom") or {})
detail["custom"]["images"] = image_lists[activity_id] detail["custom"]["images"] = image_lists[activity_id]
dest.write_text(json.dumps(detail, indent=2, ensure_ascii=False)) dest.write_text(_dumps(detail))
else: else:
if not dest.exists() and not dest.is_symlink(): if not dest.exists() and not dest.is_symlink():
dest.symlink_to(src.resolve()) dest.symlink_to(src.resolve())
@@ -271,7 +296,7 @@ def _merge_all_locked(data_dir: Path) -> int:
if edits: if edits:
athlete_data = json.loads(athlete_src.read_text(encoding="utf-8")) athlete_data = json.loads(athlete_src.read_text(encoding="utf-8"))
athlete_data.update({k: v for k, v in edits.items() if k in _ATHLETE_EDITABLE}) athlete_data.update({k: v for k, v in edits.items() if k in _ATHLETE_EDITABLE})
athlete_dest.write_text(json.dumps(athlete_data, indent=2, ensure_ascii=False)) athlete_dest.write_text(_dumps(athlete_data))
else: else:
athlete_dest.symlink_to(athlete_src.resolve()) athlete_dest.symlink_to(athlete_src.resolve())
@@ -338,7 +363,7 @@ def _write_year_shards(merged_dir: Path, activities: list[dict], index_meta: dic
"activities": by_year[year], "activities": by_year[year],
} }
fname = f"index-{year}.json" fname = f"index-{year}.json"
(merged_dir / fname).write_text(json.dumps(shard_doc, indent=2, ensure_ascii=False)) (merged_dir / fname).write_text(_dumps(shard_doc))
shards.append({"url": fname, "year": int(year) if year.isdigit() else 0, shards.append({"url": fname, "year": int(year) if year.isdigit() else 0,
"count": len(by_year[year])}) "count": len(by_year[year])})
@@ -347,7 +372,7 @@ def _write_year_shards(merged_dir: Path, activities: list[dict], index_meta: dic
"shards": shards, "shards": shards,
"activities": [], "activities": [],
} }
(merged_dir / "index.json").write_text(json.dumps(root_doc, indent=2, ensure_ascii=False)) (merged_dir / "index.json").write_text(_dumps(root_doc))
FEED_PAGE_SIZE = 50 FEED_PAGE_SIZE = 50
@@ -423,6 +448,6 @@ def write_combined_feed(data_dir: Path) -> int:
"total_activities": len(all_activities), "total_activities": len(all_activities),
"activities": slim, "activities": slim,
} }
(data_dir / fname).write_text(json.dumps(doc, indent=2, ensure_ascii=False)) (data_dir / fname).write_text(_dumps(doc))
return len(all_activities) return len(all_activities)