From 2883d138464afc521849d2c80b61861927e038dc Mon Sep 17 00:00:00 2001
From: xtaodada
Date: Mon, 21 Oct 2024 17:07:24 +0800
Subject: [PATCH] feat: bsky support parse richtext

---
 models/models/bsky.py          |   4 +-
 models/models/bsky_richtext.py | 227 +++++++++++++++++++++++++++++++++
 2 files changed, 230 insertions(+), 1 deletion(-)
 create mode 100644 models/models/bsky_richtext.py

diff --git a/models/models/bsky.py b/models/models/bsky.py
index 19369ed..0553f03 100644
--- a/models/models/bsky.py
+++ b/models/models/bsky.py
@@ -13,6 +13,8 @@ from atproto_client.models.app.bsky.embed.record import (
     ViewRecord as BskyViewRecordRecord,
 )
 
+from .bsky_richtext import bsky_html_parser
+
 if TYPE_CHECKING:
     from atproto_client.models.app.bsky.feed.defs import (
         FeedViewPost,
@@ -141,7 +143,7 @@ class HumanPost(BaseModel, frozen=False):
             if isinstance(post, BskyViewRecordRecord)
             else post.embed
         )
-        content = record.text
+        content = bsky_html_parser.unparse(record.text, record.facets) if record.facets else record.text
         created_at = record.created_at
         # images
         images = []
diff --git a/models/models/bsky_richtext.py b/models/models/bsky_richtext.py
new file mode 100644
index 0000000..4b02d88
--- /dev/null
+++ b/models/models/bsky_richtext.py
@@ -0,0 +1,227 @@
+import html
+import re
+from httpx import AsyncClient
+from html.parser import HTMLParser
+from typing import Optional
+from atproto import models
+
+from pyrogram.parser import utils
+
+from init import logs
+
+
+class ParserModel(models.AppBskyRichtextFacet.Main):
+    index: Optional[str] = None
+    offset: int
+    length: int
+
+    def get_origin(self) -> "models.AppBskyRichtextFacet.Main":
+        index = models.AppBskyRichtextFacet.ByteSlice(
+            byte_start=self.offset, byte_end=self.offset + self.length
+        )
+        return models.AppBskyRichtextFacet.Main(features=self.features, index=index)
+
+    @staticmethod
+    def from_origin(origin: "models.AppBskyRichtextFacet.Main") -> "ParserModel":
+        return ParserModel(
+            features=origin.features,
+            offset=origin.index.byte_start,
+            length=origin.index.byte_end - origin.index.byte_start,
+        )
+
+
+class Parser(HTMLParser):
+    MENTION_RE = re.compile(r"bsky\.app/profile/([^/]+)")
+
+    def __init__(self):
+        super().__init__()
+
+        self.text = ""
+        self.facts: list[ParserModel] = []
+        self.tag_entities = {}
+
+    def handle_starttag(self, tag, attrs):
+        attrs = dict(attrs)
+        extra = {}
+
+        if tag == "a":
+            url = attrs.get("href", "")
+
+            mention = Parser.MENTION_RE.match(url)
+
+            if mention:
+                entity = models.AppBskyRichtextFacet.Mention
+                extra["did"] = mention.group(1)
+            else:
+                entity = models.AppBskyRichtextFacet.Link
+                extra["uri"] = url
+        else:
+            return
+
+        if tag not in self.tag_entities:
+            self.tag_entities[tag] = []
+
+        e = ParserModel(
+            features=[entity(**extra)],
+            offset=len(self.text),
+            length=0,
+        )
+        self.tag_entities[tag].append(e)
+
+    def handle_data(self, data):
+        data = html.unescape(data)
+
+        for entities in self.tag_entities.values():
+            for entity in entities:
+                entity.length += len(data)
+
+        self.text += data
+
+    def handle_endtag(self, tag):
+        try:
+            self.facts.append(self.tag_entities[tag].pop())
+        except (KeyError, IndexError):
+            line, offset = self.getpos()
+            offset += 1
+
+            logs.debug("Unmatched closing tag </%s> at line %s:%s", tag, line, offset)
+        else:
+            if not self.tag_entities[tag]:
+                self.tag_entities.pop(tag)
+
+    def error(self, message):
+        pass
+
+
+class HTML:
+    def __init__(self):
+        self.client = AsyncClient()
+
+    async def resolve_peer(self, handle: str) -> Optional[str]:
+        try:
+            req = await self.client.get(
+                "https://bsky.social/xrpc/com.atproto.identity.resolveHandle",
+                params={"handle": handle},
+                timeout=10,
+            )
+            req.raise_for_status()
+            return req.json()["did"]
+        except Exception:
+            return None
+
+    async def parse(self, text: str) -> dict:
+        # Strip whitespaces from the beginning and the end, but preserve closing tags
+        text = re.sub(r"^\s*(<[\w<>=\s\"]*>)\s*", r"\1", text)
+        text = re.sub(r"\s*(</[\w</>]*>)\s*$", r"\1", text)
+
+        parser = Parser()
+        parser.feed(utils.add_surrogates(text))
+        parser.close()
+
+        if parser.tag_entities:
+            unclosed_tags = []
+
+            for tag, entities in parser.tag_entities.items():
+                unclosed_tags.append(f"<{tag}> (x{len(entities)})")
+
+            logs.info("Unclosed tags: %s", ", ".join(unclosed_tags))
+
+        entities = []
+
+        for fact in parser.facts:
+            entity = fact.features[0]
+            if isinstance(entity, models.AppBskyRichtextFacet.Mention):
+                if not entity.did.startswith("did:plc:"):
+                    did = await self.resolve_peer(entity.did)
+                    if did:
+                        entity = models.AppBskyRichtextFacet.Mention(did=did)
+                    else:
+                        continue
+
+            fact.features[0] = entity
+            entities.append(fact)
+
+        # Remove zero-length entities
+        entities = list(filter(lambda x: x.length > 0, entities))
+        entities = sorted(entities, key=lambda e: e.offset) or None
+        # get origin facts
+        facets = [fact.get_origin() for fact in entities] if entities else None
+
+        return {
+            "message": utils.remove_surrogates(parser.text),
+            "facets": facets,
+        }
+
+    @staticmethod
+    def unparse(text: str, facets: list["models.AppBskyRichtextFacet.Main"]) -> str:
+        entities = [ParserModel.from_origin(fact) for fact in facets]
+
+        def parse_one(entity: ParserModel):
+            """
+            Parses a single entity and returns (start_tag, start), (end_tag, end)
+            """
+            fact = entity.features[0]
+            start = entity.offset
+            end = start + entity.length
+
+            if isinstance(fact, models.AppBskyRichtextFacet.Link):
+                url = fact.uri
+                start_tag = f'<a href="{url}">'
+                end_tag = "</a>"
+            elif isinstance(fact, models.AppBskyRichtextFacet.Mention):
+                did = fact.did
+                url = "https://bsky.app/profile/" + did
+                start_tag = f'<a href="{url}">'
+                end_tag = "</a>"
+            else:
+                return
+
+            return (start_tag, start), (end_tag, end)
+
+        def recursive(entity_i: int) -> int:
+            """
+            Takes the index of the entity to start parsing from, returns the number of parsed entities inside it.
+            Uses entities_offsets as a stack, pushing (start_tag, start) first, then parsing nested entities,
+            and finally pushing (end_tag, end) to the stack.
+            No need to sort at the end.
+            """
+            this = parse_one(entities[entity_i])
+            if this is None:
+                return 1
+            (start_tag, start), (end_tag, end) = this
+            entities_offsets.append((start_tag, start))
+            internal_i = entity_i + 1
+            # while the next entity is inside the current one, keep parsing
+            while internal_i < len(entities) and entities[internal_i].offset < end:
+                internal_i += recursive(internal_i)
+            entities_offsets.append((end_tag, end))
+            return internal_i - entity_i
+
+        text = utils.add_surrogates(text)
+
+        entities_offsets = []
+
+        # probably useless because entities are already sorted by telegram
+        entities.sort(key=lambda e: (e.offset, -e.length))
+
+        # main loop for first-level entities
+        i = 0
+        while i < len(entities):
+            i += recursive(i)
+
+        if entities_offsets:
+            last_offset = entities_offsets[-1][1]
+            # no need to sort, but still add entities starting from the end
+            for entity, offset in reversed(entities_offsets):
+                text = (
+                    text[:offset]
+                    + entity
+                    + html.escape(text[offset:last_offset])
+                    + text[last_offset:]
+                )
+                last_offset = offset
+
+        return utils.remove_surrogates(text)
+
+
+bsky_html_parser = HTML()