feat: bsky support parse richtext

2024-11-22 07:07:52 +00:00 · 2024-10-21 17:07:24 +08:00 · 2024-10-21 17:07:24 +08:00 · 2883d13846
commit 2883d13846
parent b842cedae1
2 changed files with 230 additions and 1 deletions
--- a/models/models/bsky.py
+++ b/models/models/bsky.py
@ -13,6 +13,8 @@ from atproto_client.models.app.bsky.embed.record import (
    ViewRecord as BskyViewRecordRecord,
 )
 from .bsky_richtext import bsky_html_parser
 if TYPE_CHECKING:
    from atproto_client.models.app.bsky.feed.defs import (
        FeedViewPost,
@ -141,7 +143,7 @@ class HumanPost(BaseModel, frozen=False):
            if isinstance(post, BskyViewRecordRecord)
            else post.embed
        )
-        content = record.text
+        content = bsky_html_parser.unparse(record.text, record.facets) if record.facets else record.text
        created_at = record.created_at
        # images
        images = []
--- a/models/models/bsky_richtext.py
+++ b/models/models/bsky_richtext.py
@ -0,0 +1,227 @@
 import html
 import re
 from httpx import AsyncClient
 from html.parser import HTMLParser
 from typing import Optional
 from atproto import models
 from pyrogram.parser import utils
 from init import logs
 class ParserModel(models.AppBskyRichtextFacet.Main):
    """Facet variant that is addressed by ``offset`` and ``length``.

    Instances are converted to and from plain ``AppBskyRichtextFacet.Main``
    objects with :meth:`get_origin` and :meth:`from_origin`.
    """

    # The inherited ``index`` field is redeclared as an optional string that
    # defaults to None; positions are carried by ``offset``/``length`` instead.
    index: Optional[str] = None
    offset: int
    length: int

    def get_origin(self) -> "models.AppBskyRichtextFacet.Main":
        """Return a plain facet whose slice runs from ``offset`` to ``offset + length``."""
        slice_end = self.offset + self.length
        return models.AppBskyRichtextFacet.Main(
            features=self.features,
            index=models.AppBskyRichtextFacet.ByteSlice(
                byte_start=self.offset, byte_end=slice_end
            ),
        )

    @staticmethod
    def from_origin(origin: "models.AppBskyRichtextFacet.Main") -> "ParserModel":
        """Build a ``ParserModel`` from a plain facet's features and slice bounds."""
        span = origin.index
        return ParserModel(
            features=origin.features,
            offset=span.byte_start,
            length=span.byte_end - span.byte_start,
        )
 class Parser(HTMLParser):
    """HTML parser that collects plain text plus one facet per ``<a>`` tag.

    After feeding, ``text`` holds the concatenated character data, ``facts``
    holds a ``ParserModel`` for every ``<a>`` that was closed, and
    ``tag_entities`` still contains any anchors that were never closed.
    Offsets and lengths are measured in characters of ``text``.
    """

    # A profile link marks a mention. ``match`` anchors at the start of the
    # href, so the scheme must be part of the pattern: without it an
    # ``https://bsky.app/profile/...`` URL would never be recognised. The
    # captured identifier stops at a path, query or fragment delimiter.
    MENTION_RE = re.compile(r"(?:https?://)?bsky\.app/profile/([^/?#]+)")

    def __init__(self):
        super().__init__()
        self.text = ""
        self.facts: list[ParserModel] = []
        # Maps a tag name to the stack of entities opened by that tag and not yet closed.
        self.tag_entities = {}

    def handle_starttag(self, tag, attrs):
        """Open a Mention or Link entity for ``<a>``; every other tag is ignored."""
        if tag != "a":
            return
        # HTMLParser reports a valueless attribute (``<a href>``) as None.
        url = dict(attrs).get("href") or ""
        mention = Parser.MENTION_RE.match(url)
        if mention:
            feature = models.AppBskyRichtextFacet.Mention(did=mention.group(1))
        else:
            feature = models.AppBskyRichtextFacet.Link(uri=url)
        if tag not in self.tag_entities:
            self.tag_entities[tag] = []
        self.tag_entities[tag].append(
            ParserModel(features=[feature], offset=len(self.text), length=0)
        )

    def handle_data(self, data):
        """Append character data and extend every currently open entity by its length."""
        # NOTE(review): HTMLParser already converts character references when
        # ``convert_charrefs`` is true (its default), so this is a second
        # unescape pass - confirm the double decoding is intended.
        data = html.unescape(data)
        for entities in self.tag_entities.values():
            for entity in entities:
                entity.length += len(data)
        self.text += data

    def handle_endtag(self, tag):
        """Close the most recently opened entity for ``tag`` and record it in ``facts``."""
        try:
            self.facts.append(self.tag_entities[tag].pop())
        except (KeyError, IndexError):
            # No open entity for this tag: report the position and carry on.
            line, offset = self.getpos()
            offset += 1
            logs.debug("Unmatched closing tag </%s> at line %s:%s", tag, line, offset)
        else:
            # Drop the stack once it is empty so only unclosed tags remain listed.
            if not self.tag_entities[tag]:
                self.tag_entities.pop(tag)

    def error(self, message):
        """No-op override: errors reported through this hook are ignored."""
        pass
 class HTML:
    """Convert between simple HTML (``<a>`` tags) and Bluesky rich-text facets.

    Facet positions exchanged with Bluesky are UTF-8 byte offsets, so both
    directions translate between byte offsets and string positions explicitly.
    """

    def __init__(self):
        self.client = AsyncClient()

    async def resolve_peer(self, handle: str) -> Optional[str]:
        """Resolve ``handle`` to a DID via ``com.atproto.identity.resolveHandle``.

        Returns the ``did`` value of the JSON response, or None when the
        request or the response handling fails for any reason.
        """
        try:
            req = await self.client.get(
                "https://bsky.social/xrpc/com.atproto.identity.resolveHandle",
                params={"handle": handle},
                timeout=10,
            )
            req.raise_for_status()
            return req.json()["did"]
        except Exception as e:
            # Deliberately best-effort, but leave a trace instead of failing silently.
            logs.debug("Failed to resolve handle %s: %s", handle, e)
            return None

    async def parse(self, text: str) -> dict:
        """Parse HTML into plain text and a list of facets.

        Returns a dict with ``"message"`` (the text without markup) and
        ``"facets"`` (facets sorted by start position, or None when there are
        none). Mentions whose handle cannot be resolved to a DID are dropped.
        """
        # Strip whitespaces from the beginning and the end, but preserve closing tags
        text = re.sub(r"^\s*(<[\w<>=\s\"]*>)\s*", r"\1", text)
        text = re.sub(r"\s*(</[\w</>]*>)\s*$", r"\1", text)
        parser = Parser()
        parser.feed(utils.add_surrogates(text))
        parser.close()
        if parser.tag_entities:
            unclosed_tags = [
                f"<{tag}> (x{len(entities)})"
                for tag, entities in parser.tag_entities.items()
            ]
            logs.info("Unclosed tags: %s", ", ".join(unclosed_tags))
        entities = []
        for fact in parser.facts:
            # Zero-length entities are removed; skip them before any network lookup.
            if fact.length <= 0:
                continue
            entity = fact.features[0]
            if isinstance(
                entity, models.AppBskyRichtextFacet.Mention
            ) and not entity.did.startswith("did:"):
                # The link carried a handle rather than a DID (of any method).
                did = await self.resolve_peer(entity.did)
                if not did:
                    continue
                fact.features[0] = models.AppBskyRichtextFacet.Mention(did=did)
            entities.append(fact)
        entities.sort(key=lambda e: e.offset)

        def byte_offset(position: int) -> int:
            # The parser counts positions in ``parser.text`` (which went through
            # add_surrogates); facets need UTF-8 byte offsets into the final message.
            prefix = utils.remove_surrogates(parser.text[:position])
            return len(prefix.encode("utf-8"))

        facets = [
            models.AppBskyRichtextFacet.Main(
                features=fact.features,
                index=models.AppBskyRichtextFacet.ByteSlice(
                    byte_start=byte_offset(fact.offset),
                    byte_end=byte_offset(fact.offset + fact.length),
                ),
            )
            for fact in entities
        ] or None
        return {
            "message": utils.remove_surrogates(parser.text),
            "facets": facets,
        }

    @staticmethod
    def unparse(text: str, facets: list["models.AppBskyRichtextFacet.Main"]) -> str:
        """Render ``text`` with its Link and Mention facets as ``<a>`` tags.

        Only the first feature of each facet is considered; facets whose first
        feature is neither a Link nor a Mention (or that have no features) are
        skipped. Facet bounds are applied as UTF-8 byte offsets into ``text``.
        """
        entities = [ParserModel.from_origin(fact) for fact in facets]

        def parse_one(entity: ParserModel):
            """
            Parses a single entity and returns (start_tag, start), (end_tag, end)
            """
            if not entity.features:
                return None
            fact = entity.features[0]
            if isinstance(fact, models.AppBskyRichtextFacet.Link):
                url = fact.uri
            elif isinstance(fact, models.AppBskyRichtextFacet.Mention):
                url = "https://bsky.app/profile/" + fact.did
            else:
                return None
            start = entity.offset
            end = start + entity.length
            # The URL comes from post data: escape it so it cannot break out of
            # the attribute value.
            start_tag = f'<a href="{html.escape(url)}">'
            return (start_tag, start), ("</a>", end)

        def recursive(entity_i: int) -> int:
            """
            Takes the index of the entity to start parsing from, returns the number of parsed entities inside it.
            Uses entities_offsets as a stack, pushing (start_tag, start) first, then parsing nested entities,
            and finally pushing (end_tag, end) to the stack.
            No need to sort at the end.
            """
            this = parse_one(entities[entity_i])
            if this is None:
                return 1
            (start_tag, start), (end_tag, end) = this
            entities_offsets.append((start_tag, start))
            internal_i = entity_i + 1
            # while the next entity is inside the current one, keep parsing
            while internal_i < len(entities) and entities[internal_i].offset < end:
                internal_i += recursive(internal_i)
            entities_offsets.append((end_tag, end))
            return internal_i - entity_i

        # Work on the UTF-8 encoding so facet byte offsets index it directly.
        raw = text.encode("utf-8")
        entities_offsets = []
        # Outer entities first: by start, then longest first.
        entities.sort(key=lambda e: (e.offset, -e.length))
        # main loop for first-level entities
        i = 0
        while i < len(entities):
            i += recursive(i)
        if entities_offsets:
            last_offset = entities_offsets[-1][1]
            # Insert tags from the end so earlier offsets stay valid.
            # NOTE(review): only text between tags is escaped here; text before
            # the first tag and after the last tag is passed through unchanged.
            for tag, offset in reversed(entities_offsets):
                # ``replace`` keeps a malformed facet (one that splits a
                # multi-byte character) from raising.
                inner = raw[offset:last_offset].decode("utf-8", errors="replace")
                raw = (
                    raw[:offset]
                    + tag.encode("utf-8")
                    + html.escape(inner).encode("utf-8")
                    + raw[last_offset:]
                )
                last_offset = offset
        return raw.decode("utf-8", errors="replace")
 # Module-level HTML instance; bsky.py imports it under this name to call unparse().
 bsky_html_parser = HTML()