2018-01-23 14:17:48 +00:00
|
|
|
# Pyrogram - Telegram MTProto API Client Library for Python
|
2019-01-01 11:36:16 +00:00
|
|
|
# Copyright (C) 2017-2019 Dan Tès <https://github.com/delivrance>
|
2018-01-23 14:17:48 +00:00
|
|
|
#
|
|
|
|
# This file is part of Pyrogram.
|
|
|
|
#
|
|
|
|
# Pyrogram is free software: you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU Lesser General Public License as published
|
|
|
|
# by the Free Software Foundation, either version 3 of the License, or
|
|
|
|
# (at your option) any later version.
|
|
|
|
#
|
|
|
|
# Pyrogram is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# GNU Lesser General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU Lesser General Public License
|
|
|
|
# along with Pyrogram. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
2019-06-24 08:07:28 +00:00
|
|
|
import html
|
2018-01-21 23:26:43 +00:00
|
|
|
import re
|
2018-12-31 16:13:50 +00:00
|
|
|
from collections import OrderedDict
|
2019-06-24 08:07:28 +00:00
|
|
|
from html.parser import HTMLParser
|
2019-06-24 12:33:17 +00:00
|
|
|
from struct import unpack
|
2018-01-21 23:26:43 +00:00
|
|
|
|
2019-03-26 12:32:30 +00:00
|
|
|
import pyrogram
|
2019-06-24 08:07:28 +00:00
|
|
|
from pyrogram.api import types
|
2019-03-26 12:32:30 +00:00
|
|
|
from pyrogram.errors import PeerIdInvalid
|
2018-01-21 23:26:43 +00:00
|
|
|
|
|
|
|
|
2019-06-24 08:07:28 +00:00
|
|
|
class Parser(HTMLParser):
    """Turn a small HTML subset into plain text plus message entities.

    Feed markup with ``feed()`` and finish with ``close()``. Afterwards
    ``text`` holds the markup-free message and ``entities`` holds one entity
    object for every supported tag that was opened and properly closed, in
    the order the closing tags were encountered.

    Tags that are not supported are ignored, both when they open and when
    they close; their inner text is still added to ``text``.
    """

    MENTION_RE = re.compile(r"tg://user\?id=(\d+)")

    # Tags that map one-to-one onto an entity class of ``types``. Only the
    # class *names* are stored; they are looked up when a tag is handled.
    _SIMPLE_TAGS = {
        "b": "MessageEntityBold",
        "strong": "MessageEntityBold",
        "i": "MessageEntityItalic",
        "em": "MessageEntityItalic",
        "u": "MessageEntityUnderline",
        "s": "MessageEntityStrike",
        "del": "MessageEntityStrike",
        "strike": "MessageEntityStrike",
        "blockquote": "MessageEntityBlockquote",
        "code": "MessageEntityCode",
    }

    # Every tag that opens an entity; "pre" and "a" need extra arguments and
    # are handled separately in handle_starttag().
    _SUPPORTED_TAGS = frozenset(_SIMPLE_TAGS) | {"pre", "a"}

    def __init__(self, client: "pyrogram.BaseClient"):
        """Create a parser.

        Parameters:
            client: Object whose ``resolve_peer`` method is used to resolve
                user ids found in ``tg://user?id=...`` links.
        """
        # Ask HTMLParser explicitly to decode character references, so that
        # handle_data() receives ready-to-use text and must not unescape it
        # a second time.
        super().__init__(convert_charrefs=True)

        self.client = client

        self.text = ""  # Plain text collected so far
        self.entities = []  # Entities whose closing tag has been seen
        self.temp_entities = []  # Entities that are still open (a stack)
        self.tags = []  # Names of the open supported tags (a stack)

    def handle_starttag(self, tag, attrs):
        """Open a new entity for a supported tag; ignore any other tag."""
        attrs = dict(attrs)
        extra = {}

        if tag in Parser._SIMPLE_TAGS:
            entity = getattr(types, Parser._SIMPLE_TAGS[tag])
        elif tag == "pre":
            entity = types.MessageEntityPre
            extra["language"] = ""
        elif tag == "a":
            url = attrs.get("href", "")
            mention = Parser.MENTION_RE.match(url)

            if mention:
                user_id = int(mention.group(1))

                try:
                    user = self.client.resolve_peer(user_id)
                except PeerIdInvalid:
                    # The peer could not be resolved: keep the bare user id.
                    entity = types.MessageEntityMentionName
                    extra["user_id"] = user_id
                else:
                    entity = types.InputMessageEntityMentionName
                    extra["user_id"] = user
            else:
                entity = types.MessageEntityTextUrl
                extra["url"] = url
        else:
            return

        self.tags.append(tag)
        # The entity starts at the current end of the text; its length grows
        # in handle_data() for as long as the tag stays open.
        self.temp_entities.append(entity(offset=len(self.text), length=0, **extra))

    def handle_data(self, data):
        """Append a chunk of text and extend every entity that is still open."""
        # ``data`` is already unescaped by HTMLParser (convert_charrefs=True).
        # Unescaping it again would corrupt escaped literals: "&amp;lt;" must
        # end up as the text "&lt;", not as "<".
        for entity in self.temp_entities:
            entity.length += len(data)

        self.text += data

    def handle_endtag(self, tag):
        """Close the most recently opened entity.

        Raises:
            ValueError: If ``tag`` is a supported tag but is not the one that
                was opened last.
        """
        # Unsupported tags never open an entity (see handle_starttag), so
        # their closing tags must not touch the stack of open tags either.
        if tag not in Parser._SUPPORTED_TAGS:
            return

        try:
            start_tag = self.tags.pop()
        except IndexError:
            # Closing tag without any open tag: nothing to close.
            return

        if start_tag != tag:
            line, offset = self.getpos()
            offset += 1  # Report a 1-based column

            raise ValueError("Expected end tag </{}>, but found </{}> at {}:{}".format(start_tag, tag, line, offset))

        self.entities.append(self.temp_entities.pop())

    def error(self, message):
        """Override the base class error hook with a no-op."""
        pass
|
|
|
|
|
|
|
|
|
|
|
|
class HTML:
    """Entry point that converts an HTML-formatted string into message data."""

    # Matches any single code point above the Basic Multilingual Plane
    # (U+10000 and up): https://en.wikipedia.org/wiki/Plane_(Unicode)#Overview
    SMP_RE = re.compile(r"[\U00010000-\U0010FFFF]")

    def __init__(self, client: "pyrogram.BaseClient" = None):
        """Remember the client that is handed over to every ``Parser``."""
        self.client = client

    @staticmethod
    def add_surrogates(text):
        """Return ``text`` with each code point matched by ``SMP_RE`` replaced
        by its two UTF-16 surrogate characters."""

        def to_surrogate_pair(match):
            # Encoding one such code point as UTF-16 (little endian) yields
            # exactly two 16-bit units: the high and the low surrogate.
            high, low = unpack("<HH", match.group().encode("utf-16le"))
            return chr(high) + chr(low)

        return HTML.SMP_RE.sub(to_surrogate_pair, text)

    @staticmethod
    def remove_surrogates(text):
        """Return ``text`` with surrogate pairs merged back into single code
        points (the inverse of ``add_surrogates``)."""
        # "surrogatepass" lets the surrogates through the encoder; the decoder
        # then reads each pair as one code point.
        encoded = text.encode("utf-16", "surrogatepass")
        return encoded.decode("utf-16")

    def parse(self, text: str):
        """Parse ``text`` and return its plain message and entities.

        Parameters:
            text: The HTML-formatted string. A falsy value is treated as an
                empty string; any other value is passed through ``str()`` and
                stripped of surrounding whitespace.

        Returns:
            An ``OrderedDict`` with the keys ``"message"`` (the text without
            markup) and ``"entities"`` (the entities collected by ``Parser``),
            in that order.
        """
        source = str(text or "").strip()

        # Parse on the surrogate-expanded string, so that the lengths measured
        # by the parser count two units for each expanded code point.
        parser = Parser(self.client)
        parser.feed(HTML.add_surrogates(source))
        parser.close()

        # TODO: OrderedDict to be removed in Python 3.6
        result = OrderedDict()
        result["message"] = HTML.remove_surrogates(parser.text)
        result["entities"] = parser.entities

        return result
|