From e36c9ce21807438b0ad3aa8773185375d997ade9 Mon Sep 17 00:00:00 2001 From: xtaodada Date: Mon, 21 Oct 2024 17:36:45 +0800 Subject: [PATCH] fix: bsky richtext encode --- models/models/bsky_richtext.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/models/models/bsky_richtext.py b/models/models/bsky_richtext.py index 4b02d88..80a6bd0 100644 --- a/models/models/bsky_richtext.py +++ b/models/models/bsky_richtext.py @@ -63,7 +63,7 @@ class Parser(HTMLParser): e = ParserModel( features=[entity(**extra)], - offset=len(self.text), + offset=len(self.text.encode("utf-8")), length=0, ) self.tag_entities[tag].append(e) @@ -73,7 +73,7 @@ class Parser(HTMLParser): for entities in self.tag_entities.values(): for entity in entities: - entity.length += len(data) + entity.length += len(data.encode("utf-8")) self.text += data @@ -214,10 +214,12 @@ class HTML: # no need to sort, but still add entities starting from the end for entity, offset in reversed(entities_offsets): text = ( - text[:offset] + text.encode("utf-8")[:offset].decode("utf-8") + entity - + html.escape(text[offset:last_offset]) - + text[last_offset:] + + html.escape( + text.encode("utf-8")[offset:last_offset].decode("utf-8") + ) + + text.encode("utf-8")[last_offset:].decode("utf-8") ) last_offset = offset