mirror of
https://github.com/Xtao-Labs/iShotaBot.git
synced 2024-11-22 07:07:52 +00:00
feat: bsky support parse richtext
This commit is contained in:
parent
b842cedae1
commit
2883d13846
@ -13,6 +13,8 @@ from atproto_client.models.app.bsky.embed.record import (
|
|||||||
ViewRecord as BskyViewRecordRecord,
|
ViewRecord as BskyViewRecordRecord,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
from .bsky_richtext import bsky_html_parser
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from atproto_client.models.app.bsky.feed.defs import (
|
from atproto_client.models.app.bsky.feed.defs import (
|
||||||
FeedViewPost,
|
FeedViewPost,
|
||||||
@ -141,7 +143,7 @@ class HumanPost(BaseModel, frozen=False):
|
|||||||
if isinstance(post, BskyViewRecordRecord)
|
if isinstance(post, BskyViewRecordRecord)
|
||||||
else post.embed
|
else post.embed
|
||||||
)
|
)
|
||||||
content = record.text
|
content = bsky_html_parser.unparse(record.text, record.facets) if record.facets else record.text
|
||||||
created_at = record.created_at
|
created_at = record.created_at
|
||||||
# images
|
# images
|
||||||
images = []
|
images = []
|
||||||
|
227
models/models/bsky_richtext.py
Normal file
227
models/models/bsky_richtext.py
Normal file
@ -0,0 +1,227 @@
|
|||||||
|
import html
|
||||||
|
import re
|
||||||
|
from httpx import AsyncClient
|
||||||
|
from html.parser import HTMLParser
|
||||||
|
from typing import Optional
|
||||||
|
from atproto import models
|
||||||
|
|
||||||
|
from pyrogram.parser import utils
|
||||||
|
|
||||||
|
from init import logs
|
||||||
|
|
||||||
|
|
||||||
|
class ParserModel(models.AppBskyRichtextFacet.Main):
|
||||||
|
index: Optional[str] = None
|
||||||
|
offset: int
|
||||||
|
length: int
|
||||||
|
|
||||||
|
def get_origin(self) -> "models.AppBskyRichtextFacet.Main":
|
||||||
|
index = models.AppBskyRichtextFacet.ByteSlice(
|
||||||
|
byte_start=self.offset, byte_end=self.offset + self.length
|
||||||
|
)
|
||||||
|
return models.AppBskyRichtextFacet.Main(features=self.features, index=index)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def from_origin(origin: "models.AppBskyRichtextFacet.Main") -> "ParserModel":
|
||||||
|
return ParserModel(
|
||||||
|
features=origin.features,
|
||||||
|
offset=origin.index.byte_start,
|
||||||
|
length=origin.index.byte_end - origin.index.byte_start,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class Parser(HTMLParser):
|
||||||
|
MENTION_RE = re.compile(r"bsky\.app/profile/([^/]+)")
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
super().__init__()
|
||||||
|
|
||||||
|
self.text = ""
|
||||||
|
self.facts: list[ParserModel] = []
|
||||||
|
self.tag_entities = {}
|
||||||
|
|
||||||
|
def handle_starttag(self, tag, attrs):
|
||||||
|
attrs = dict(attrs)
|
||||||
|
extra = {}
|
||||||
|
|
||||||
|
if tag == "a":
|
||||||
|
url = attrs.get("href", "")
|
||||||
|
|
||||||
|
mention = Parser.MENTION_RE.match(url)
|
||||||
|
|
||||||
|
if mention:
|
||||||
|
entity = models.AppBskyRichtextFacet.Mention
|
||||||
|
extra["did"] = mention.group(1)
|
||||||
|
else:
|
||||||
|
entity = models.AppBskyRichtextFacet.Link
|
||||||
|
extra["uri"] = url
|
||||||
|
else:
|
||||||
|
return
|
||||||
|
|
||||||
|
if tag not in self.tag_entities:
|
||||||
|
self.tag_entities[tag] = []
|
||||||
|
|
||||||
|
e = ParserModel(
|
||||||
|
features=[entity(**extra)],
|
||||||
|
offset=len(self.text),
|
||||||
|
length=0,
|
||||||
|
)
|
||||||
|
self.tag_entities[tag].append(e)
|
||||||
|
|
||||||
|
def handle_data(self, data):
|
||||||
|
data = html.unescape(data)
|
||||||
|
|
||||||
|
for entities in self.tag_entities.values():
|
||||||
|
for entity in entities:
|
||||||
|
entity.length += len(data)
|
||||||
|
|
||||||
|
self.text += data
|
||||||
|
|
||||||
|
def handle_endtag(self, tag):
|
||||||
|
try:
|
||||||
|
self.facts.append(self.tag_entities[tag].pop())
|
||||||
|
except (KeyError, IndexError):
|
||||||
|
line, offset = self.getpos()
|
||||||
|
offset += 1
|
||||||
|
|
||||||
|
logs.debug("Unmatched closing tag </%s> at line %s:%s", tag, line, offset)
|
||||||
|
else:
|
||||||
|
if not self.tag_entities[tag]:
|
||||||
|
self.tag_entities.pop(tag)
|
||||||
|
|
||||||
|
def error(self, message):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class HTML:
|
||||||
|
def __init__(self):
|
||||||
|
self.client = AsyncClient()
|
||||||
|
|
||||||
|
async def resolve_peer(self, handle: str) -> Optional[str]:
|
||||||
|
try:
|
||||||
|
req = await self.client.get(
|
||||||
|
"https://bsky.social/xrpc/com.atproto.identity.resolveHandle",
|
||||||
|
params={"handle": handle},
|
||||||
|
timeout=10,
|
||||||
|
)
|
||||||
|
req.raise_for_status()
|
||||||
|
return req.json()["did"]
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
|
||||||
|
async def parse(self, text: str) -> dict:
|
||||||
|
# Strip whitespaces from the beginning and the end, but preserve closing tags
|
||||||
|
text = re.sub(r"^\s*(<[\w<>=\s\"]*>)\s*", r"\1", text)
|
||||||
|
text = re.sub(r"\s*(</[\w</>]*>)\s*$", r"\1", text)
|
||||||
|
|
||||||
|
parser = Parser()
|
||||||
|
parser.feed(utils.add_surrogates(text))
|
||||||
|
parser.close()
|
||||||
|
|
||||||
|
if parser.tag_entities:
|
||||||
|
unclosed_tags = []
|
||||||
|
|
||||||
|
for tag, entities in parser.tag_entities.items():
|
||||||
|
unclosed_tags.append(f"<{tag}> (x{len(entities)})")
|
||||||
|
|
||||||
|
logs.info("Unclosed tags: %s", ", ".join(unclosed_tags))
|
||||||
|
|
||||||
|
entities = []
|
||||||
|
|
||||||
|
for fact in parser.facts:
|
||||||
|
entity = fact.features[0]
|
||||||
|
if isinstance(entity, models.AppBskyRichtextFacet.Mention):
|
||||||
|
if not entity.did.startswith("did:plc:"):
|
||||||
|
did = await self.resolve_peer(entity.did)
|
||||||
|
if did:
|
||||||
|
entity = models.AppBskyRichtextFacet.Mention(did=did)
|
||||||
|
else:
|
||||||
|
continue
|
||||||
|
|
||||||
|
fact.features[0] = entity
|
||||||
|
entities.append(fact)
|
||||||
|
|
||||||
|
# Remove zero-length entities
|
||||||
|
entities = list(filter(lambda x: x.length > 0, entities))
|
||||||
|
entities = sorted(entities, key=lambda e: e.offset) or None
|
||||||
|
# get origin facts
|
||||||
|
facets = [fact.get_origin() for fact in entities] if entities else None
|
||||||
|
|
||||||
|
return {
|
||||||
|
"message": utils.remove_surrogates(parser.text),
|
||||||
|
"facets": facets,
|
||||||
|
}
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def unparse(text: str, facets: list["models.AppBskyRichtextFacet.Main"]) -> str:
|
||||||
|
entities = [ParserModel.from_origin(fact) for fact in facets]
|
||||||
|
|
||||||
|
def parse_one(entity: ParserModel):
|
||||||
|
"""
|
||||||
|
Parses a single entity and returns (start_tag, start), (end_tag, end)
|
||||||
|
"""
|
||||||
|
fact = entity.features[0]
|
||||||
|
start = entity.offset
|
||||||
|
end = start + entity.length
|
||||||
|
|
||||||
|
if isinstance(fact, models.AppBskyRichtextFacet.Link):
|
||||||
|
url = fact.uri
|
||||||
|
start_tag = f'<a href="{url}">'
|
||||||
|
end_tag = "</a>"
|
||||||
|
elif isinstance(fact, models.AppBskyRichtextFacet.Mention):
|
||||||
|
did = fact.did
|
||||||
|
url = "https://bsky.app/profile/" + did
|
||||||
|
start_tag = f'<a href="{url}">'
|
||||||
|
end_tag = "</a>"
|
||||||
|
else:
|
||||||
|
return
|
||||||
|
|
||||||
|
return (start_tag, start), (end_tag, end)
|
||||||
|
|
||||||
|
def recursive(entity_i: int) -> int:
|
||||||
|
"""
|
||||||
|
Takes the index of the entity to start parsing from, returns the number of parsed entities inside it.
|
||||||
|
Uses entities_offsets as a stack, pushing (start_tag, start) first, then parsing nested entities,
|
||||||
|
and finally pushing (end_tag, end) to the stack.
|
||||||
|
No need to sort at the end.
|
||||||
|
"""
|
||||||
|
this = parse_one(entities[entity_i])
|
||||||
|
if this is None:
|
||||||
|
return 1
|
||||||
|
(start_tag, start), (end_tag, end) = this
|
||||||
|
entities_offsets.append((start_tag, start))
|
||||||
|
internal_i = entity_i + 1
|
||||||
|
# while the next entity is inside the current one, keep parsing
|
||||||
|
while internal_i < len(entities) and entities[internal_i].offset < end:
|
||||||
|
internal_i += recursive(internal_i)
|
||||||
|
entities_offsets.append((end_tag, end))
|
||||||
|
return internal_i - entity_i
|
||||||
|
|
||||||
|
text = utils.add_surrogates(text)
|
||||||
|
|
||||||
|
entities_offsets = []
|
||||||
|
|
||||||
|
# probably useless because entities are already sorted by telegram
|
||||||
|
entities.sort(key=lambda e: (e.offset, -e.length))
|
||||||
|
|
||||||
|
# main loop for first-level entities
|
||||||
|
i = 0
|
||||||
|
while i < len(entities):
|
||||||
|
i += recursive(i)
|
||||||
|
|
||||||
|
if entities_offsets:
|
||||||
|
last_offset = entities_offsets[-1][1]
|
||||||
|
# no need to sort, but still add entities starting from the end
|
||||||
|
for entity, offset in reversed(entities_offsets):
|
||||||
|
text = (
|
||||||
|
text[:offset]
|
||||||
|
+ entity
|
||||||
|
+ html.escape(text[offset:last_offset])
|
||||||
|
+ text[last_offset:]
|
||||||
|
)
|
||||||
|
last_offset = offset
|
||||||
|
|
||||||
|
return utils.remove_surrogates(text)
|
||||||
|
|
||||||
|
|
||||||
|
bsky_html_parser = HTML()
|
Loading…
Reference in New Issue
Block a user