Fix UnicodeEncodeError: sanitize surrogate pairs before JSON writes in merge.py
This commit is contained in:
+31
-6
@@ -21,6 +21,31 @@ import yaml
|
||||
|
||||
# Per-user-directory lock so concurrent upload requests and the dev file-watcher
|
||||
# cannot run merge_all simultaneously on the same directory.
|
||||
|
||||
|
||||
def _fix_surrogates(obj: object) -> object:
|
||||
"""Recursively replace surrogate pairs in strings with proper Unicode code points.
|
||||
|
||||
Surrogate pairs (U+D800–U+DFFF) are valid in Python str but not in UTF-8.
|
||||
They typically arise when emoji from UTF-16-encoded sources (Strava, some FIT
|
||||
devices) are decoded incorrectly. encode/decode via utf-16 with surrogatepass
|
||||
reconstructs the intended characters.
|
||||
"""
|
||||
if isinstance(obj, str):
|
||||
try:
|
||||
obj.encode("utf-8")
|
||||
return obj
|
||||
except UnicodeEncodeError:
|
||||
return obj.encode("utf-16", "surrogatepass").decode("utf-16")
|
||||
if isinstance(obj, dict):
|
||||
return {k: _fix_surrogates(v) for k, v in obj.items()}
|
||||
if isinstance(obj, list):
|
||||
return [_fix_surrogates(v) for v in obj]
|
||||
return obj
|
||||
|
||||
|
||||
def _dumps(obj: object) -> str:
    """Serialize *obj* to JSON text after running it through ``_fix_surrogates``.

    The output uses a two-space indent and keeps non-ASCII characters as
    literal characters instead of ``\\u`` escapes.
    """
    sanitized = _fix_surrogates(obj)
    return json.dumps(sanitized, ensure_ascii=False, indent=2)
|
||||
# Registry of per-user-directory locks: concurrent upload requests and the dev
# file-watcher must not run merge_all simultaneously on the same directory.
# NOTE(review): the str key is presumably the data directory path — confirm
# against the code that populates this dict.
_merge_locks: dict[str, threading.Lock] = {}
|
||||
# NOTE(review): presumably guards lookups/insertions on the _merge_locks
# registry itself — confirm against the code that acquires it.
_merge_locks_mu = threading.Lock()
|
||||
|
||||
@@ -152,7 +177,7 @@ def _merge_one_locked(data_dir: Path, activity_id: str) -> None:
|
||||
if image_files:
|
||||
detail["custom"] = dict(detail.get("custom") or {})
|
||||
detail["custom"]["images"] = image_files
|
||||
dest.write_text(json.dumps(detail, indent=2, ensure_ascii=False))
|
||||
dest.write_text(_dumps(detail))
|
||||
else:
|
||||
dest.symlink_to(src.resolve())
|
||||
|
||||
@@ -237,7 +262,7 @@ def _merge_all_locked(data_dir: Path) -> int:
|
||||
if activity_id in image_lists:
|
||||
detail["custom"] = dict(detail.get("custom") or {})
|
||||
detail["custom"]["images"] = image_lists[activity_id]
|
||||
dest.write_text(json.dumps(detail, indent=2, ensure_ascii=False))
|
||||
dest.write_text(_dumps(detail))
|
||||
else:
|
||||
if not dest.exists() and not dest.is_symlink():
|
||||
dest.symlink_to(src.resolve())
|
||||
@@ -271,7 +296,7 @@ def _merge_all_locked(data_dir: Path) -> int:
|
||||
if edits:
|
||||
athlete_data = json.loads(athlete_src.read_text(encoding="utf-8"))
|
||||
athlete_data.update({k: v for k, v in edits.items() if k in _ATHLETE_EDITABLE})
|
||||
athlete_dest.write_text(json.dumps(athlete_data, indent=2, ensure_ascii=False))
|
||||
athlete_dest.write_text(_dumps(athlete_data))
|
||||
else:
|
||||
athlete_dest.symlink_to(athlete_src.resolve())
|
||||
|
||||
@@ -338,7 +363,7 @@ def _write_year_shards(merged_dir: Path, activities: list[dict], index_meta: dic
|
||||
"activities": by_year[year],
|
||||
}
|
||||
fname = f"index-{year}.json"
|
||||
(merged_dir / fname).write_text(json.dumps(shard_doc, indent=2, ensure_ascii=False))
|
||||
(merged_dir / fname).write_text(_dumps(shard_doc))
|
||||
shards.append({"url": fname, "year": int(year) if year.isdigit() else 0,
|
||||
"count": len(by_year[year])})
|
||||
|
||||
@@ -347,7 +372,7 @@ def _write_year_shards(merged_dir: Path, activities: list[dict], index_meta: dic
|
||||
"shards": shards,
|
||||
"activities": [],
|
||||
}
|
||||
(merged_dir / "index.json").write_text(json.dumps(root_doc, indent=2, ensure_ascii=False))
|
||||
(merged_dir / "index.json").write_text(_dumps(root_doc))
|
||||
|
||||
|
||||
FEED_PAGE_SIZE = 50
|
||||
@@ -423,6 +448,6 @@ def write_combined_feed(data_dir: Path) -> int:
|
||||
"total_activities": len(all_activities),
|
||||
"activities": slim,
|
||||
}
|
||||
(data_dir / fname).write_text(json.dumps(doc, indent=2, ensure_ascii=False))
|
||||
(data_dir / fname).write_text(_dumps(doc))
|
||||
|
||||
return len(all_activities)
|
||||
|
||||
Reference in New Issue
Block a user