fix: bsky richtext encode (compute entity offsets and lengths in UTF-8 bytes)

This commit is contained in:
xtaodada 2024-10-21 17:36:45 +08:00
parent 2883d13846
commit e36c9ce218
Signed by: xtaodada
GPG Key ID: 4CBB3F4FA8C85659

View File

@@ -63,7 +63,7 @@ class Parser(HTMLParser):
e = ParserModel( e = ParserModel(
features=[entity(**extra)], features=[entity(**extra)],
offset=len(self.text), offset=len(self.text.encode("utf-8")),
length=0, length=0,
) )
self.tag_entities[tag].append(e) self.tag_entities[tag].append(e)
@@ -73,7 +73,7 @@ class Parser(HTMLParser):
for entities in self.tag_entities.values(): for entities in self.tag_entities.values():
for entity in entities: for entity in entities:
entity.length += len(data) entity.length += len(data.encode("utf-8"))
self.text += data self.text += data
@@ -214,10 +214,12 @@ class HTML:
# no need to sort, but still add entities starting from the end # no need to sort, but still add entities starting from the end
for entity, offset in reversed(entities_offsets): for entity, offset in reversed(entities_offsets):
text = ( text = (
text[:offset] text.encode("utf-8")[:offset].decode("utf-8")
+ entity + entity
+ html.escape(text[offset:last_offset]) + html.escape(
+ text[last_offset:] text.encode("utf-8")[offset:last_offset].decode("utf-8")
)
+ text.encode("utf-8")[last_offset:].decode("utf-8")
) )
last_offset = offset last_offset = offset