MTPyroger/pyrogram/client/style/markdown.py

# Pyrogram - Telegram MTProto API Client Library for Python
# Copyright (C) 2017-2019 Dan Tès <https://github.com/delivrance>
#
# This file is part of Pyrogram.
#
# Pyrogram is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Pyrogram is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with Pyrogram.  If not, see <http://www.gnu.org/licenses/>.

import re
from collections import OrderedDict

from pyrogram.api.types import (
    MessageEntityBold as Bold,
    MessageEntityItalic as Italic,
    MessageEntityCode as Code,
    MessageEntityTextUrl as Url,
    MessageEntityPre as Pre,
    MessageEntityMentionName as MentionInvalid,
    InputMessageEntityMentionName as Mention
)
from . import utils


class Markdown:
    """Convert between Markdown-style markup and message entities.

    Supported markup: ``**bold**``, ``__italic__``, `` `code` ``,
    `````pre````` and inline links ``[text](url)``. A link whose URL
    matches ``tg://user?id=<digits>`` is turned into a user mention.
    Nested markup is not interpreted: the regex matches spans left to right
    and the body of a span is kept verbatim.
    """

    BOLD_DELIMITER = "**"
    ITALIC_DELIMITER = "__"
    CODE_DELIMITER = "`"
    PRE_DELIMITER = "```"

    # Two alternatives:
    #   1. a delimited span: group 1 is the delimiter, group 2 the body, and
    #      the back-reference requires the identical closing delimiter;
    #   2. an inline link "[text](url)": group 3 is the text, group 4 the URL.
    # PRE_DELIMITER is listed before CODE_DELIMITER so that "```" is tried
    # before a single backtick.
    MARKDOWN_RE = re.compile(
        r"({d})([\w\W]*?)\1|\[([^[]+?)\]\(([^(]+?)\)".format(
            d="|".join(
                re.escape(delimiter) for delimiter in (
                    PRE_DELIMITER,
                    CODE_DELIMITER,
                    ITALIC_DELIMITER,
                    BOLD_DELIMITER,
                )
            )
        )
    )
    MENTION_RE = re.compile(r"tg://user\?id=(\d+)")

    def __init__(self, peers_by_id: dict):
        """Store the lookup table used to resolve mentioned user ids.

        :param peers_by_id: mapping queried with the integer user id found
            in a ``tg://user?id=...`` link; the value is passed on as the
            mention's input user.
        """
        self.peers_by_id = peers_by_id

    def _scan(self, message):
        """Strip the markup from *message* and record the spans it marked.

        :param message: text to scan (already surrogate-expanded by the
            caller, so that lengths and offsets are consistent).
        :return: ``(text, spans)`` where ``text`` is *message* without the
            markup characters and ``spans`` is a list of
            ``(style, start, length, url)`` tuples. ``start``/``length``
            index into ``text``. For delimited spans ``style`` is the
            delimiter and ``url`` is ``None``; for links ``style`` is
            ``None`` and ``url`` is the link target.
        """
        delimiters = (
            self.BOLD_DELIMITER,
            self.ITALIC_DELIMITER,
            self.CODE_DELIMITER,
            self.PRE_DELIMITER,
        )
        pieces = []
        spans = []
        consumed = 0  # index in `message` up to which text was copied
        removed = 0   # number of markup characters dropped so far

        for match in self.MARKDOWN_RE.finditer(message):
            style, body, text, url = match.groups()
            start = match.start() - removed

            if url:
                # "[text](url)" loses "[", "](", ")" and the URL itself.
                visible = text
                style = None
                removed += len(url) + 4
            elif style in delimiters:
                # Opening and closing delimiter are both dropped.
                visible = body
                removed += len(style) * 2
            else:
                # Unknown delimiter: leave the text untouched.
                continue

            # Replace exactly this match. A global str.replace() would also
            # rewrite identical text elsewhere (e.g. inside an earlier code
            # span) and invalidate the offsets computed so far.
            pieces.append(message[consumed:match.start()])
            pieces.append(visible)
            consumed = match.end()
            spans.append((style, start, len(visible), url))

        pieces.append(message[consumed:])

        return "".join(pieces), spans

    def parse(self, message: str):
        """Parse marked-up text into plain text plus entities.

        :param message: the text to parse; it is cast to ``str`` and
            stripped of leading/trailing whitespace first.
        :return: an ``OrderedDict`` with the keys ``"message"`` (text
            without markup) and ``"entities"`` (list of entity objects in
            order of appearance).
        """
        # NOTE(review): add_surrogates/remove_surrogates presumably make
        # offsets count UTF-16 code units - confirm in utils.
        message = utils.add_surrogates(str(message)).strip()
        text, spans = self._scan(message)
        entities = []

        for style, start, length, url in spans:
            if url is not None:
                mention = self.MENTION_RE.match(url)

                if mention:
                    user_id = int(mention.group(1))
                    input_user = self.peers_by_id.get(user_id, None)

                    # Fall back to the id-only entity when the peer is
                    # not present in the lookup table.
                    entity = (
                        Mention(start, length, input_user)
                        if input_user
                        else MentionInvalid(start, length, user_id)
                    )
                else:
                    entity = Url(start, length, url)
            elif style == self.BOLD_DELIMITER:
                entity = Bold(start, length)
            elif style == self.ITALIC_DELIMITER:
                entity = Italic(start, length)
            elif style == self.CODE_DELIMITER:
                entity = Code(start, length)
            else:
                # Only PRE_DELIMITER is left; the language is left empty.
                entity = Pre(start, length, "")

            entities.append(entity)

        # Plain dicts only keep insertion order from Python 3.6 onwards;
        # OrderedDict can be dropped once older versions are unsupported.
        return OrderedDict([
            ("message", utils.remove_surrogates(text)),
            ("entities", entities)
        ])

    def unparse(self, message: str, entities: list):
        """Re-insert markup into *message* according to *entities*.

        Entities are processed in the given order with a running offset,
        so the result is only meaningful when they are sorted by offset and
        do not overlap. Entity types other than bold, italic, code, pre,
        text_link and text_mention are ignored.

        :param message: the plain text.
        :param entities: objects exposing ``type``, ``offset``, ``length``
            and, depending on the type, ``url`` or ``user.id``. ``None`` is
            treated as "no entities".
        :return: the text with the markup inserted.
        """
        message = utils.add_surrogates(message).strip()
        delimiters = {
            "bold": self.BOLD_DELIMITER,
            "italic": self.ITALIC_DELIMITER,
            "code": self.CODE_DELIMITER,
            "pre": self.PRE_DELIMITER,
        }
        offset = 0  # markup characters inserted so far

        for entity in entities or ():
            kind = entity.type

            if kind in delimiters:
                opening = closing = delimiters[kind]
            elif kind == "text_link":
                opening = "["
                closing = "]({})".format(entity.url)
            elif kind == "text_mention":
                opening = "["
                closing = "](tg://user?id={})".format(entity.user.id)
            else:
                continue

            start = entity.offset + offset
            end = start + entity.length

            # Wrap the entity's own slice in place.
            message = "".join((
                message[:start],
                opening,
                message[start:end],
                closing,
                message[end:],
            ))
            offset += len(opening) + len(closing)

        return utils.remove_surrogates(message)
Add extensions package 2017-12-05 11:42:57 +00:00			`# Pyrogram - Telegram MTProto API Client Library for Python`
Update copyright year 2019-01-01 11:36:16 +00:00			`# Copyright (C) 2017-2019 Dan Tès <https://github.com/delivrance>`
Add extensions package 2017-12-05 11:42:57 +00:00			`#`
			`# This file is part of Pyrogram.`
			`#`
			`# Pyrogram is free software: you can redistribute it and/or modify`
			`# it under the terms of the GNU Lesser General Public License as published`
			`# by the Free Software Foundation, either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# Pyrogram is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU Lesser General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU Lesser General Public License`
			`# along with Pyrogram. If not, see <http://www.gnu.org/licenses/>.`

			`import re`
Fix style parsers randomly returning "unsorted" dicts. This is due to Python <3.6 having "unsorted" dicts. Dicts are inherently unsorted, but starting from Python 3.6 they keep the order in which the keys are inserted (useful for unpacking) 2018-12-31 16:13:50 +00:00			`from collections import OrderedDict`
Add extensions package 2017-12-05 11:42:57 +00:00
			`from pyrogram.api.types import (`
			`MessageEntityBold as Bold,`
			`MessageEntityItalic as Italic,`
			`MessageEntityCode as Code,`
			`MessageEntityTextUrl as Url,`
			`MessageEntityPre as Pre,`
Revamp markdown parser 2018-02-15 10:24:56 +00:00			`MessageEntityMentionName as MentionInvalid,`
Add support for user mentions 2017-12-13 09:44:24 +00:00			`InputMessageEntityMentionName as Mention`
Add extensions package 2017-12-05 11:42:57 +00:00			`)`
Move formatting classes inside the Client sub-package 2018-01-23 14:17:48 +00:00			`from . import utils`
Add extensions package 2017-12-05 11:42:57 +00:00

			`class Markdown:`
Revamp markdown parser 2018-02-15 10:24:56 +00:00			`BOLD_DELIMITER = "**"`
			`ITALIC_DELIMITER = "__"`
			CODE_DELIMITER = "`"
			PRE_DELIMITER = "```"
Add extensions package 2017-12-05 11:42:57 +00:00
Allow entities to span in multiple lines 2018-05-12 08:51:24 +00:00			`MARKDOWN_RE = re.compile(r"({d})([\w\W]*?)\1\|\[([^[]+?)\]\(([^(]+?)\)".format(`
Add extensions package 2017-12-05 11:42:57 +00:00			`d="\|".join(`
			`["".join(i) for i in [`
			`["\{}".format(j) for j in i]`
Revamp markdown parser 2018-02-15 10:24:56 +00:00			`for i in [`
			`PRE_DELIMITER,`
			`CODE_DELIMITER,`
			`ITALIC_DELIMITER,`
			`BOLD_DELIMITER`
			`]`
Add extensions package 2017-12-05 11:42:57 +00:00			`]]`
			`)`
Revamp markdown parser 2018-02-15 10:24:56 +00:00			`))`
			`MENTION_RE = re.compile(r"tg://user\?id=(\d+)")`
Add extensions package 2017-12-05 11:42:57 +00:00
Revamp markdown parser 2018-02-15 10:24:56 +00:00			`def __init__(self, peers_by_id: dict):`
Add support for user mentions 2017-12-13 09:44:24 +00:00			`self.peers_by_id = peers_by_id`

Revamp markdown parser 2018-02-15 10:24:56 +00:00			`def parse(self, message: str):`
Automatically cast message and caption arguments to str 2019-01-03 19:53:48 +00:00			`message = utils.add_surrogates(str(message)).strip()`
Allow entities to span in multiple lines 2018-05-12 08:51:24 +00:00			`entities = []`
Add extensions package 2017-12-05 11:42:57 +00:00			`offset = 0`

Revamp markdown parser 2018-02-15 10:24:56 +00:00			`for match in self.MARKDOWN_RE.finditer(message):`
Add extensions package 2017-12-05 11:42:57 +00:00			`start = match.start() - offset`
Allow entities to span in multiple lines 2018-05-12 08:51:24 +00:00			`style, body, text, url = match.groups()`
Revamp markdown parser 2018-02-15 10:24:56 +00:00
Allow entities to span in multiple lines 2018-05-12 08:51:24 +00:00			`if url:`
Revamp markdown parser 2018-02-15 10:24:56 +00:00			`mention = self.MENTION_RE.match(url)`

			`if mention:`
			`user_id = int(mention.group(1))`
			`input_user = self.peers_by_id.get(user_id, None)`

			`entity = (`
			`Mention(start, len(text), input_user)`
			`if input_user`
			`else MentionInvalid(start, len(text), user_id)`
			`)`
			`else:`
			`entity = Url(start, len(text), url)`

			`body = text`
			`offset += len(url) + 4`
			`else:`
			`if style == self.BOLD_DELIMITER:`
			`entity = Bold(start, len(body))`
			`elif style == self.ITALIC_DELIMITER:`
			`entity = Italic(start, len(body))`
			`elif style == self.CODE_DELIMITER:`
			`entity = Code(start, len(body))`
Allow entities to span in multiple lines 2018-05-12 08:51:24 +00:00			`elif style == self.PRE_DELIMITER:`
			`entity = Pre(start, len(body), "")`
Revamp markdown parser 2018-02-15 10:24:56 +00:00			`else:`
Add extensions package 2017-12-05 11:42:57 +00:00			`continue`

Revamp markdown parser 2018-02-15 10:24:56 +00:00			`offset += len(style) * 2`
Add extensions package 2017-12-05 11:42:57 +00:00
			`entities.append(entity)`
Revamp markdown parser 2018-02-15 10:24:56 +00:00			`message = message.replace(match.group(), body)`
Add extensions package 2017-12-05 11:42:57 +00:00
Fix style parsers randomly returning "unsorted" dicts. This is due to Python <3.6 having "unsorted" dicts. Dicts are inherently unsorted, but starting from Python 3.6 they keep the order in which the keys are inserted (useful for unpacking) 2018-12-31 16:13:50 +00:00			`# TODO: OrderedDict to be removed in Python3.6`
			`return OrderedDict([`
			`("message", utils.remove_surrogates(message)),`
			`("entities", entities)`
			`])`
Add markdown unparse method 2018-05-10 12:46:14 +00:00
			`def unparse(self, message: str, entities: list):`
Fix unparse not taking surrogates into account 2018-05-10 13:25:01 +00:00			`message = utils.add_surrogates(message).strip()`
Add markdown unparse method 2018-05-10 12:46:14 +00:00			`offset = 0`

			`for entity in entities:`
			`start = entity.offset + offset`
			`type = entity.type`
			`url = entity.url`
			`user = entity.user`
			`sub = message[start: start + entity.length]`

			`if type == "bold":`
			`style = self.BOLD_DELIMITER`
			`elif type == "italic":`
			`style = self.ITALIC_DELIMITER`
			`elif type == "code":`
			`style = self.CODE_DELIMITER`
			`elif type == "pre":`
			`style = self.PRE_DELIMITER`
			`elif type == "text_link":`
			`offset += 4 + len(url)`
Cleaner markdown 2018-05-10 13:07:03 +00:00			`message = message[:start] + message[start:].replace(`
			`sub, "[{}]({})".format(sub, url), 1)`
Add markdown unparse method 2018-05-10 12:46:14 +00:00			`continue`
			`elif type == "text_mention":`
			`offset += 17 + len(str(user.id))`
Cleaner markdown 2018-05-10 13:07:03 +00:00			`message = message[:start] + message[start:].replace(`
			`sub, "[{}](tg://user?id={})".format(sub, user.id), 1)`
Add markdown unparse method 2018-05-10 12:46:14 +00:00			`continue`
			`else:`
			`continue`

			`offset += len(style) * 2`
Cleaner markdown 2018-05-10 13:07:03 +00:00			`message = message[:start] + message[start:].replace(`
			`sub, "{0}{1}{0}".format(style, sub), 1)`
Add markdown unparse method 2018-05-10 12:46:14 +00:00
Fix unparse not taking surrogates into account 2018-05-10 13:25:01 +00:00			`return utils.remove_surrogates(message)`