Fix UnicodeEncodeError: sanitize surrogate pairs before JSON writes in merge.py
This commit is contained in:
+31
-6
@@ -21,6 +21,31 @@ import yaml
|
|||||||
|
|
||||||
# Per-user-directory lock so concurrent upload requests and the dev file-watcher
|
# Per-user-directory lock so concurrent upload requests and the dev file-watcher
|
||||||
# cannot run merge_all simultaneously on the same directory.
|
# cannot run merge_all simultaneously on the same directory.
|
||||||
|
|
||||||
|
|
||||||
|
def _fix_surrogates(obj: object) -> object:
|
||||||
|
"""Recursively replace surrogate pairs in strings with proper Unicode code points.
|
||||||
|
|
||||||
|
Surrogate pairs (U+D800–U+DFFF) are valid in Python str but not in UTF-8.
|
||||||
|
They typically arise when emoji from UTF-16-encoded sources (Strava, some FIT
|
||||||
|
devices) are decoded incorrectly. encode/decode via utf-16 with surrogatepass
|
||||||
|
reconstructs the intended characters.
|
||||||
|
"""
|
||||||
|
if isinstance(obj, str):
|
||||||
|
try:
|
||||||
|
obj.encode("utf-8")
|
||||||
|
return obj
|
||||||
|
except UnicodeEncodeError:
|
||||||
|
return obj.encode("utf-16", "surrogatepass").decode("utf-16")
|
||||||
|
if isinstance(obj, dict):
|
||||||
|
return {k: _fix_surrogates(v) for k, v in obj.items()}
|
||||||
|
if isinstance(obj, list):
|
||||||
|
return [_fix_surrogates(v) for v in obj]
|
||||||
|
return obj
|
||||||
|
|
||||||
|
|
||||||
|
def _dumps(obj: object) -> str:
    """Return *obj* as 2-space-indented JSON text with non-ASCII kept literal.

    The object is first passed through ``_fix_surrogates`` so the resulting
    text does not carry the surrogate code points that helper repairs.
    """
    sanitized = _fix_surrogates(obj)
    return json.dumps(sanitized, ensure_ascii=False, indent=2)
|
||||||
_merge_locks: dict[str, threading.Lock] = {}
|
_merge_locks: dict[str, threading.Lock] = {}
|
||||||
_merge_locks_mu = threading.Lock()
|
_merge_locks_mu = threading.Lock()
|
||||||
|
|
||||||
@@ -152,7 +177,7 @@ def _merge_one_locked(data_dir: Path, activity_id: str) -> None:
|
|||||||
if image_files:
|
if image_files:
|
||||||
detail["custom"] = dict(detail.get("custom") or {})
|
detail["custom"] = dict(detail.get("custom") or {})
|
||||||
detail["custom"]["images"] = image_files
|
detail["custom"]["images"] = image_files
|
||||||
dest.write_text(json.dumps(detail, indent=2, ensure_ascii=False))
|
dest.write_text(_dumps(detail))
|
||||||
else:
|
else:
|
||||||
dest.symlink_to(src.resolve())
|
dest.symlink_to(src.resolve())
|
||||||
|
|
||||||
@@ -237,7 +262,7 @@ def _merge_all_locked(data_dir: Path) -> int:
|
|||||||
if activity_id in image_lists:
|
if activity_id in image_lists:
|
||||||
detail["custom"] = dict(detail.get("custom") or {})
|
detail["custom"] = dict(detail.get("custom") or {})
|
||||||
detail["custom"]["images"] = image_lists[activity_id]
|
detail["custom"]["images"] = image_lists[activity_id]
|
||||||
dest.write_text(json.dumps(detail, indent=2, ensure_ascii=False))
|
dest.write_text(_dumps(detail))
|
||||||
else:
|
else:
|
||||||
if not dest.exists() and not dest.is_symlink():
|
if not dest.exists() and not dest.is_symlink():
|
||||||
dest.symlink_to(src.resolve())
|
dest.symlink_to(src.resolve())
|
||||||
@@ -271,7 +296,7 @@ def _merge_all_locked(data_dir: Path) -> int:
|
|||||||
if edits:
|
if edits:
|
||||||
athlete_data = json.loads(athlete_src.read_text(encoding="utf-8"))
|
athlete_data = json.loads(athlete_src.read_text(encoding="utf-8"))
|
||||||
athlete_data.update({k: v for k, v in edits.items() if k in _ATHLETE_EDITABLE})
|
athlete_data.update({k: v for k, v in edits.items() if k in _ATHLETE_EDITABLE})
|
||||||
athlete_dest.write_text(json.dumps(athlete_data, indent=2, ensure_ascii=False))
|
athlete_dest.write_text(_dumps(athlete_data))
|
||||||
else:
|
else:
|
||||||
athlete_dest.symlink_to(athlete_src.resolve())
|
athlete_dest.symlink_to(athlete_src.resolve())
|
||||||
|
|
||||||
@@ -338,7 +363,7 @@ def _write_year_shards(merged_dir: Path, activities: list[dict], index_meta: dic
|
|||||||
"activities": by_year[year],
|
"activities": by_year[year],
|
||||||
}
|
}
|
||||||
fname = f"index-{year}.json"
|
fname = f"index-{year}.json"
|
||||||
(merged_dir / fname).write_text(json.dumps(shard_doc, indent=2, ensure_ascii=False))
|
(merged_dir / fname).write_text(_dumps(shard_doc))
|
||||||
shards.append({"url": fname, "year": int(year) if year.isdigit() else 0,
|
shards.append({"url": fname, "year": int(year) if year.isdigit() else 0,
|
||||||
"count": len(by_year[year])})
|
"count": len(by_year[year])})
|
||||||
|
|
||||||
@@ -347,7 +372,7 @@ def _write_year_shards(merged_dir: Path, activities: list[dict], index_meta: dic
|
|||||||
"shards": shards,
|
"shards": shards,
|
||||||
"activities": [],
|
"activities": [],
|
||||||
}
|
}
|
||||||
(merged_dir / "index.json").write_text(json.dumps(root_doc, indent=2, ensure_ascii=False))
|
(merged_dir / "index.json").write_text(_dumps(root_doc))
|
||||||
|
|
||||||
|
|
||||||
FEED_PAGE_SIZE = 50
|
FEED_PAGE_SIZE = 50
|
||||||
@@ -423,6 +448,6 @@ def write_combined_feed(data_dir: Path) -> int:
|
|||||||
"total_activities": len(all_activities),
|
"total_activities": len(all_activities),
|
||||||
"activities": slim,
|
"activities": slim,
|
||||||
}
|
}
|
||||||
(data_dir / fname).write_text(json.dumps(doc, indent=2, ensure_ascii=False))
|
(data_dir / fname).write_text(_dumps(doc))
|
||||||
|
|
||||||
return len(all_activities)
|
return len(all_activities)
|
||||||
|
|||||||
Reference in New Issue
Block a user