MTPyroger/pyrogram/client/style/html.py

# Pyrogram - Telegram MTProto API Client Library for Python
# Copyright (C) 2017-2019 Dan Tès <https://github.com/delivrance>
#
# This file is part of Pyrogram.
#
# Pyrogram is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Pyrogram is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with Pyrogram.  If not, see <http://www.gnu.org/licenses/>.

import html
import re
from collections import OrderedDict
from html.parser import HTMLParser
from struct import unpack

import pyrogram
from pyrogram.api import types
from pyrogram.errors import PeerIdInvalid


class Parser(HTMLParser):
    """Incremental HTML parser that converts a small set of tags into message entities.

    Text content is accumulated in ``text``. Every supported opening tag creates
    an entity whose ``offset`` is the length of the text collected so far and
    whose ``length`` grows while the tag stays open. When the matching closing
    tag is seen the entity is moved from ``temp_entities`` to ``entities``.
    Unsupported tags are dropped (both their opening and closing forms), while
    the text they enclose is kept.
    """

    MENTION_RE = re.compile(r"tg://user\?id=(\d+)")

    # Names of the tags handle_starttag() turns into entities. handle_endtag()
    # uses this to skip closing tags whose opening tag was never pushed.
    SUPPORTED_TAGS = frozenset((
        "b", "strong",
        "i", "em",
        "u",
        "s", "del", "strike",
        "blockquote",
        "code",
        "pre",
        "a",
    ))

    def __init__(self, client: "pyrogram.BaseClient"):
        """Create a parser.

        Parameters:
            client: Object whose ``resolve_peer`` method is used to resolve the
                user ID found in ``tg://user?id=...`` links.
        """
        super().__init__()

        self.client = client

        # Plain text collected so far (markup removed).
        self.text = ""
        # Entities whose closing tag has already been seen.
        self.entities = []
        # Entities that are still open; parallel to ``tags``.
        self.temp_entities = []
        # Stack of the supported tags that are currently open.
        self.tags = []

    def handle_starttag(self, tag, attrs):
        """Open a new entity for a supported tag; ignore every other tag.

        Parameters:
            tag: Tag name as reported by HTMLParser.
            attrs: List of ``(name, value)`` attribute pairs.
        """
        attrs = dict(attrs)
        extra = {}

        if tag in ("b", "strong"):
            entity = types.MessageEntityBold
        elif tag in ("i", "em"):
            entity = types.MessageEntityItalic
        elif tag == "u":
            entity = types.MessageEntityUnderline
        elif tag in ("s", "del", "strike"):
            entity = types.MessageEntityStrike
        elif tag == "blockquote":
            entity = types.MessageEntityBlockquote
        elif tag == "code":
            entity = types.MessageEntityCode
        elif tag == "pre":
            entity = types.MessageEntityPre
            extra["language"] = ""
        elif tag == "a":
            url = attrs.get("href", "")

            mention = Parser.MENTION_RE.match(url)

            if mention:
                user_id = int(mention.group(1))

                try:
                    user = self.client.resolve_peer(user_id)
                except PeerIdInvalid:
                    # The peer cannot be resolved: fall back to the raw user ID.
                    entity = types.MessageEntityMentionName
                    extra["user_id"] = user_id
                else:
                    entity = types.InputMessageEntityMentionName
                    extra["user_id"] = user
            else:
                entity = types.MessageEntityTextUrl
                extra["url"] = url
        else:
            # Unsupported tag: nothing is pushed, so handle_endtag() must not
            # pop anything for it either.
            return

        self.tags.append(tag)
        self.temp_entities.append(entity(offset=len(self.text), length=0, **extra))

    def handle_data(self, data):
        """Append a run of text and extend every entity that is still open.

        The base class is initialised with its default ``convert_charrefs=True``,
        so ``data`` arrives with character references already decoded. It must
        not be unescaped a second time, otherwise input such as ``&amp;lt;``
        would end up as ``<`` instead of the literal text ``&lt;``.
        """
        for entity in self.temp_entities:
            entity.length += len(data)

        self.text += data

    def handle_endtag(self, tag):
        """Close the innermost open entity when its closing tag is found.

        Closing tags of unsupported tags are ignored, mirroring how their
        opening tags are ignored. A supported closing tag with no open tag on
        the stack is ignored as well.

        Raises:
            ValueError: If ``tag`` is supported but does not match the most
                recently opened supported tag.
        """
        if tag not in self.SUPPORTED_TAGS:
            # Its opening tag was never pushed; popping here would wrongly
            # consume (or clash with) an enclosing supported tag.
            return

        try:
            start_tag = self.tags.pop()
        except IndexError:
            return

        if start_tag != tag:
            line, offset = self.getpos()
            # getpos() columns are zero-based; report a one-based column.
            offset += 1

            raise ValueError("Expected end tag </{}>, but found </{}> at {}:{}".format(start_tag, tag, line, offset))

        self.entities.append(self.temp_entities.pop())

    def error(self, message):
        """Deliberately do nothing when an error is reported through this hook."""
        pass


class HTML:
    """HTML style front-end: feeds text to ``Parser`` and returns message text plus entities."""

    # Matches any single code point outside the Basic Multilingual Plane
    # (U+10000 to U+10FFFF): https://en.wikipedia.org/wiki/Plane_(Unicode)#Overview
    SMP_RE = re.compile(r"[\U00010000-\U0010FFFF]")

    def __init__(self, client: "pyrogram.BaseClient" = None):
        """Store the client that is handed to every ``Parser`` created by ``parse``."""
        self.client = client

    @staticmethod
    def add_surrogates(text):
        """Return ``text`` with each code point matched by ``SMP_RE`` expanded to two surrogates."""

        def to_surrogate_pair(match):
            # Encode the code point as UTF-16 LE and read the two 16-bit units back.
            units = unpack("<HH", match.group().encode("utf-16le"))
            return "".join(map(chr, units))

        return HTML.SMP_RE.sub(to_surrogate_pair, text)

    @staticmethod
    def remove_surrogates(text):
        """Return ``text`` with surrogate pairs merged back into single code points."""
        # "surrogatepass" allows the lone surrogate characters to be encoded;
        # decoding the resulting UTF-16 bytes joins each pair.
        encoded = text.encode("utf-16", "surrogatepass")
        return encoded.decode("utf-16")

    def parse(self, text: str):
        """Parse HTML-styled ``text``.

        Parameters:
            text: The styled text. Falsy values are treated as an empty string;
                surrounding whitespace is stripped.

        Returns:
            An ``OrderedDict`` with the keys ``"message"`` (the text without
            markup) and ``"entities"`` (the entities collected by the parser),
            in that order.
        """
        cleaned = str(text or "").strip()
        text = HTML.add_surrogates(cleaned)

        parser = Parser(self.client)
        parser.feed(text)
        # close() makes the parser process any data it is still buffering.
        parser.close()

        # TODO: OrderedDict to be removed in Python 3.6
        result = OrderedDict()
        result["message"] = HTML.remove_surrogates(parser.text)
        result["entities"] = parser.entities

        return result
Move formatting classes inside the Client sub-package 2018-01-23 14:17:48 +00:00			`# Pyrogram - Telegram MTProto API Client Library for Python`
Update copyright year 2019-01-01 11:36:16 +00:00			`# Copyright (C) 2017-2019 Dan Tès <https://github.com/delivrance>`
Move formatting classes inside the Client sub-package 2018-01-23 14:17:48 +00:00			`#`
			`# This file is part of Pyrogram.`
			`#`
			`# Pyrogram is free software: you can redistribute it and/or modify`
			`# it under the terms of the GNU Lesser General Public License as published`
			`# by the Free Software Foundation, either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# Pyrogram is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU Lesser General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU Lesser General Public License`
			`# along with Pyrogram. If not, see <http://www.gnu.org/licenses/>.`

Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`import html`
Add HTML style parse mode 2018-01-21 23:26:43 +00:00			`import re`
Fix style parsers randomly returning "unsorted" dicts. This is due to Python <3.6 having "unsorted" dicts. Dicts are inherently unsorted, but starting from Python 3.6 they keep the order in which the keys are inserted (useful for unpacking) 2018-12-31 16:13:50 +00:00			`from collections import OrderedDict`
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`from html.parser import HTMLParser`
Delete style utils.py and move its content inside html.py The HTML parser is now the only one that makes use of those util methods 2019-06-24 12:33:17 +00:00			`from struct import unpack`
Add HTML style parse mode 2018-01-21 23:26:43 +00:00
Fix broken mentions for both HTML and Markdown 2019-03-26 12:32:30 +00:00			`import pyrogram`
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`from pyrogram.api import types`
Fix broken mentions for both HTML and Markdown 2019-03-26 12:32:30 +00:00			`from pyrogram.errors import PeerIdInvalid`
Add HTML style parse mode 2018-01-21 23:26:43 +00:00

Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`class Parser(HTMLParser):`
Revamp HTML style parser 2018-01-23 13:43:12 +00:00			`MENTION_RE = re.compile(r"tg://user\?id=(\d+)")`
Add HTML style parse mode 2018-01-21 23:26:43 +00:00
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`def __init__(self, client: "pyrogram.BaseClient"):`
			`super().__init__()`

Fix broken mentions for both HTML and Markdown 2019-03-26 12:32:30 +00:00			`self.client = client`
Add HTML style parse mode 2018-01-21 23:26:43 +00:00
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`self.text = ""`
			`self.entities = []`
			`self.temp_entities = []`
			`self.tags = []`

			`def handle_starttag(self, tag, attrs):`
			`attrs = dict(attrs)`
			`extra = {}`

			`if tag in ["b", "strong"]:`
			`entity = types.MessageEntityBold`
			`elif tag in ["i", "em"]:`
			`entity = types.MessageEntityItalic`
			`elif tag == "u":`
			`entity = types.MessageEntityUnderline`
			`elif tag in ["s", "del", "strike"]:`
			`entity = types.MessageEntityStrike`
			`elif tag == "blockquote":`
			`entity = types.MessageEntityBlockquote`
			`elif tag == "code":`
			`entity = types.MessageEntityCode`
			`elif tag == "pre":`
			`entity = types.MessageEntityPre`
			`extra["language"] = ""`
			`elif tag == "a":`
			`url = attrs.get("href", "")`

			`mention = Parser.MENTION_RE.match(url)`

			`if mention:`
			`user_id = int(mention.group(1))`

			`try:`
			`user = self.client.resolve_peer(user_id)`
			`except PeerIdInvalid:`
			`entity = types.MessageEntityMentionName`
			`extra["user_id"] = user_id`
			`else:`
			`entity = types.InputMessageEntityMentionName`
			`extra["user_id"] = user`
			`else:`
			`entity = types.MessageEntityTextUrl`
			`extra["url"] = url`
			`else:`
			`return`
Add HTML style parse mode 2018-01-21 23:26:43 +00:00
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`self.tags.append(tag)`
			`self.temp_entities.append(entity(offset=len(self.text), length=0, **extra))`
Revamp HTML style parser 2018-01-23 13:43:12 +00:00
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`def handle_data(self, data):`
			`data = html.unescape(data)`
Revamp HTML style parser 2018-01-23 13:43:12 +00:00
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`for entity in self.temp_entities:`
			`entity.length += len(data)`
Fix broken mentions for both HTML and Markdown 2019-03-26 12:32:30 +00:00
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`self.text += data`
Add HTML style parse mode 2018-01-21 23:26:43 +00:00
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`def handle_endtag(self, tag):`
Fix HTML parsing breaking with no tags 2019-06-24 11:36:27 +00:00			`try:`
			`start_tag = self.tags.pop()`
			`except IndexError:`
			`return`
Add HTML style parse mode 2018-01-21 23:26:43 +00:00
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`if start_tag != tag:`
			`line, offset = self.getpos()`
			`offset += 1`
Add HTML style parse mode 2018-01-21 23:26:43 +00:00
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`raise ValueError("Expected end tag </{}>, but found </{}> at {}:{}".format(start_tag, tag, line, offset))`
Add html unparse 2018-05-11 11:37:49 +00:00
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`self.entities.append(self.temp_entities.pop())`

			`def error(self, message):`
			`pass`


			`class HTML:`
Delete style utils.py and move its content inside html.py The HTML parser is now the only one that makes use of those util methods 2019-06-24 12:33:17 +00:00			`# SMP = Supplementary Multilingual Plane: https://en.wikipedia.org/wiki/Plane_(Unicode)#Overview`
			`SMP_RE = re.compile(r"[\U00010000-\U0010FFFF]")`

Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`def __init__(self, client: "pyrogram.BaseClient" = None):`
			`self.client = client`

Delete style utils.py and move its content inside html.py The HTML parser is now the only one that makes use of those util methods 2019-06-24 12:33:17 +00:00			`@staticmethod`
			`def add_surrogates(text):`
			`# Replace each SMP code point with a surrogate pair`
			`return HTML.SMP_RE.sub(`
			`lambda match: # Split SMP in two surrogates`
			`"".join(chr(i) for i in unpack("<HH", match.group().encode("utf-16le"))),`
			`text`
			`)`

			`@staticmethod`
			`def remove_surrogates(text):`
			`# Replace each surrogate pair with a SMP code point`
			`return text.encode("utf-16", "surrogatepass").decode("utf-16")`

Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`def parse(self, text: str):`
Delete style utils.py and move its content inside html.py The HTML parser is now the only one that makes use of those util methods 2019-06-24 12:33:17 +00:00			`text = HTML.add_surrogates(str(text or "").strip())`
Add html unparse 2018-05-11 11:37:49 +00:00
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`parser = Parser(self.client)`
			`parser.feed(text)`
Actually fix the HTML Parser feeding by calling .close() when done 2019-06-24 12:17:46 +00:00			`parser.close()`
Add html unparse 2018-05-11 11:37:49 +00:00
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`# TODO: OrderedDict to be removed in Python 3.6`
			`return OrderedDict([`
Delete style utils.py and move its content inside html.py The HTML parser is now the only one that makes use of those util methods 2019-06-24 12:33:17 +00:00			`("message", HTML.remove_surrogates(parser.text)),`
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`("entities", parser.entities)`
			`])`