# MTPyroger/pyrogram/client/style/markdown.py

# Pyrogram - Telegram MTProto API Client Library for Python
# Copyright (C) 2017-2019 Dan Tès <https://github.com/delivrance>
#
# This file is part of Pyrogram.
#
# Pyrogram is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Pyrogram is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with Pyrogram.  If not, see <http://www.gnu.org/licenses/>.

import html
import re

import pyrogram
from . import utils
from .html import HTML

# Delimiters of the Markdown-like markup handled by the Markdown class below.
# Each one is used symmetrically: the same string opens and closes a style.
BOLD_DELIM = "**"  # parsed to <b>, emitted for "bold" entities
ITALIC_DELIM = "__"  # parsed to <i>, emitted for "italic" entities
UNDERLINE_DELIM = "--"  # parsed to <u>, emitted for "underline" entities
STRIKE_DELIM = "~~"  # parsed to <s>, emitted for "strike" entities
CODE_DELIM = "`"  # parsed to <code>, emitted for "code" entities
PRE_DELIM = "```"  # parsed to <pre>, emitted for "pre" entities


class Markdown:
    """Convert Markdown-like markup to HTML and message entities back to markup.

    ``parse`` rewrites the delimiters and inline links found in a string into
    HTML tags and hands the result to the wrapped ``HTML`` parser.
    ``unparse`` does the reverse: given a plain text and its entities it
    re-inserts the delimiters / link syntax around the styled spans.
    """

    # One alternation of all delimiters, every character backslash-escaped.
    # Order matters: PRE_DELIM ("```") has to be tried before CODE_DELIM ("`"),
    # otherwise a triple backtick would be consumed as three single ones.
    MARKDOWN_RE = re.compile(r"({d})".format(
        d="|".join(
            "".join(r"\{}".format(char) for char in delim)
            for delim in [
                PRE_DELIM,
                CODE_DELIM,
                STRIKE_DELIM,
                UNDERLINE_DELIM,
                ITALIC_DELIM,
                BOLD_DELIM
            ]
        )))

    # Inline link: [body](url)
    URL_RE = re.compile(r"\[([^[]+)]\(([^(]+)\)")

    OPENING_TAG = "<{}>"
    CLOSING_TAG = "</{}>"
    URL_MARKUP = '<a href="{}">{}</a>'
    FIXED_WIDTH_DELIMS = [CODE_DELIM, PRE_DELIM]

    # Delimiter -> HTML tag name used by parse().
    _DELIM_TAGS = {
        BOLD_DELIM: "b",
        ITALIC_DELIM: "i",
        UNDERLINE_DELIM: "u",
        STRIKE_DELIM: "s",
        CODE_DELIM: "code",
        PRE_DELIM: "pre",
    }

    # Entity type -> delimiter used by unparse().
    # TODO: Blockquote for MD ("blockquote" entities are currently skipped).
    _ENTITY_DELIMS = {
        "bold": BOLD_DELIM,
        "italic": ITALIC_DELIM,
        "underline": UNDERLINE_DELIM,
        "strike": STRIKE_DELIM,
        "code": CODE_DELIM,
        "pre": PRE_DELIM,
    }

    def __init__(self, client: "pyrogram.BaseClient"):
        """Create the parser; ``client`` is forwarded to the wrapped HTML parser."""
        self.html = HTML(client)

    async def parse(self, text: str):
        """Translate Markdown-like ``text`` to HTML and parse it with ``self.html``.

        The input is HTML-escaped first. Every delimiter occurrence then
        toggles its style: the first occurrence becomes an opening tag, the
        next one the matching closing tag. While a fixed-width style
        (code / pre) is open, all other delimiters are left as literal text.
        Finally ``[body](url)`` links become ``<a>`` elements.

        Returns:
            Whatever ``self.html.parse`` returns for the generated HTML.
        """
        text = html.escape(text)

        open_delims = set()
        pieces = []
        cursor = 0  # end of the last delimiter that was replaced

        for match in Markdown.MARKDOWN_RE.finditer(text):
            delim = match.group(1)
            tag = Markdown._DELIM_TAGS.get(delim)

            if tag is None:
                continue

            # Inside `code` / ```pre``` only fixed-width delimiters are honoured.
            if delim not in Markdown.FIXED_WIDTH_DELIMS and any(
                    fixed in open_delims for fixed in Markdown.FIXED_WIDTH_DELIMS):
                continue

            if delim in open_delims:
                open_delims.remove(delim)
                markup = Markdown.CLOSING_TAG.format(tag)
            else:
                open_delims.add(delim)
                markup = Markdown.OPENING_TAG.format(tag)

            pieces.append(text[cursor:match.start()])
            pieces.append(markup)
            cursor = match.end()

        pieces.append(text[cursor:])
        text = "".join(pieces)

        # group(1) is the link body, group(2) the URL.
        text = Markdown.URL_RE.sub(
            lambda link: Markdown.URL_MARKUP.format(link.group(2), link.group(1)),
            text
        )

        return await self.html.parse(text)

    @staticmethod
    def unparse(text: str, entities: list):
        """Re-create the Markdown-like markup of ``text`` from its ``entities``.

        Each entity is read through its ``offset``, ``length`` and ``type``
        attributes; ``text_link`` entities additionally need ``url`` and
        ``text_mention`` entities need ``user.id``. Entities of any other
        unknown type are ignored.

        All markers are positioned using the offsets of the *original* text
        and spliced in a single pass, so one entity's markers never shift or
        hijack the span of another (e.g. the same word styled twice).

        Returns:
            The text with delimiters / link syntax inserted.
        """
        # NOTE(review): presumably expands characters so that entity offsets
        # index the string directly - confirm against utils.add_surrogates.
        text = utils.add_surrogates(text)

        # (position, rank, tie-break, entity order, marker) - sorted below.
        insertions = []

        for index, entity in enumerate(entities):
            start = entity.offset
            end = start + entity.length
            kind = entity.type

            if kind == "text_link":
                opening, closing = "[", "]({})".format(entity.url)
            elif kind == "text_mention":
                opening, closing = "[", "](tg://user?id={})".format(entity.user.id)
            else:
                style = Markdown._ENTITY_DELIMS.get(kind)

                if style is None:
                    continue

                opening = closing = style

            if end <= start:
                # Empty span: keep the pair together as one insertion.
                insertions.append((start, 1, -start, index, opening + closing))
                continue

            # At a shared position closings (rank 0) go before openings
            # (rank 1); outer spans open first and close last so that the
            # generated markup stays properly nested.
            insertions.append((start, 1, -end, index, opening))
            insertions.append((end, 0, -start, -index, closing))

        insertions.sort(key=lambda item: item[:4])

        pieces = []
        cursor = 0

        for position, _, _, _, marker in insertions:
            pieces.append(text[cursor:position])
            pieces.append(marker)
            cursor = position

        pieces.append(text[cursor:])

        return utils.remove_surrogates("".join(pieces))
Add extensions package 2017-12-05 11:42:57 +00:00			`# Pyrogram - Telegram MTProto API Client Library for Python`
Update copyright year 2019-01-01 11:36:16 +00:00			`# Copyright (C) 2017-2019 Dan Tès <https://github.com/delivrance>`
Add extensions package 2017-12-05 11:42:57 +00:00			`#`
			`# This file is part of Pyrogram.`
			`#`
			`# Pyrogram is free software: you can redistribute it and/or modify`
			`# it under the terms of the GNU Lesser General Public License as published`
			`# by the Free Software Foundation, either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# Pyrogram is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU Lesser General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU Lesser General Public License`
			`# along with Pyrogram. If not, see <http://www.gnu.org/licenses/>.`

Automatically escape URL bodies when using markdown 2019-06-24 08:54:58 +00:00			`import html`
Add extensions package 2017-12-05 11:42:57 +00:00			`import re`

Fix broken mentions for both HTML and Markdown 2019-03-26 12:32:30 +00:00			`import pyrogram`
Move formatting classes inside the Client sub-package 2018-01-23 14:17:48 +00:00			`from . import utils`
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`from .html import HTML`
Add extensions package 2017-12-05 11:42:57 +00:00
Implement HTML.unparse and Markdown.unparse 2019-06-25 08:24:19 +00:00			`BOLD_DELIM = "**"`
			`ITALIC_DELIM = "__"`
			`UNDERLINE_DELIM = "--"`
			`STRIKE_DELIM = "~~"`
			CODE_DELIM = "`"
			PRE_DELIM = "```"
Add extensions package 2017-12-05 11:42:57 +00:00

			`class Markdown:`
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`MARKDOWN_RE = re.compile(r"({d})".format(`
Add extensions package 2017-12-05 11:42:57 +00:00			`d="\|".join(`
			`["".join(i) for i in [`
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`[r"\{}".format(j) for j in i]`
Revamp markdown parser 2018-02-15 10:24:56 +00:00			`for i in [`
Ignore any other style when inside a fixed-width style 2019-06-24 11:35:58 +00:00			`PRE_DELIM,`
			`CODE_DELIM,`
			`STRIKE_DELIM,`
			`UNDERLINE_DELIM,`
			`ITALIC_DELIM,`
			`BOLD_DELIM`
Revamp markdown parser 2018-02-15 10:24:56 +00:00			`]`
Add extensions package 2017-12-05 11:42:57 +00:00			`]]`
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`)))`
Add support for user mentions 2017-12-13 09:44:24 +00:00
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`URL_RE = re.compile(r"\[([^[]+)]\(([^(]+)\)")`
Add extensions package 2017-12-05 11:42:57 +00:00
Ignore any other style when inside a fixed-width style 2019-06-24 11:35:58 +00:00			`OPENING_TAG = "<{}>"`
			`CLOSING_TAG = "</{}>"`
			`URL_MARKUP = '<a href="{}">{}</a>'`
			`FIXED_WIDTH_DELIMS = [CODE_DELIM, PRE_DELIM]`
Revamp markdown parser 2018-02-15 10:24:56 +00:00
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`def __init__(self, client: "pyrogram.BaseClient"):`
			`self.html = HTML(client)`
Revamp markdown parser 2018-02-15 10:24:56 +00:00
Merge branch 'develop' into asyncio # Conflicts: # pyrogram/client/style/html.py # pyrogram/client/style/markdown.py 2019-06-25 09:48:43 +00:00			`async def parse(self, text: str):`
Ignore any other style when inside a fixed-width style 2019-06-24 11:35:58 +00:00			`text = html.escape(text)`
Fix broken mentions for both HTML and Markdown 2019-03-26 12:32:30 +00:00
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`offset = 0`
Ignore any other style when inside a fixed-width style 2019-06-24 11:35:58 +00:00			`delims = set()`
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00
			`for i, match in enumerate(re.finditer(Markdown.MARKDOWN_RE, text)):`
			`start, stop = match.span()`
Ignore any other style when inside a fixed-width style 2019-06-24 11:35:58 +00:00			`delim = match.group(1)`
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00
Implement HTML.unparse and Markdown.unparse 2019-06-25 08:24:19 +00:00			`if delim == BOLD_DELIM:`
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`tag = "b"`
Implement HTML.unparse and Markdown.unparse 2019-06-25 08:24:19 +00:00			`elif delim == ITALIC_DELIM:`
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`tag = "i"`
Implement HTML.unparse and Markdown.unparse 2019-06-25 08:24:19 +00:00			`elif delim == UNDERLINE_DELIM:`
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`tag = "u"`
Implement HTML.unparse and Markdown.unparse 2019-06-25 08:24:19 +00:00			`elif delim == STRIKE_DELIM:`
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`tag = "s"`
Implement HTML.unparse and Markdown.unparse 2019-06-25 08:24:19 +00:00			`elif delim == CODE_DELIM:`
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`tag = "code"`
Implement HTML.unparse and Markdown.unparse 2019-06-25 08:24:19 +00:00			`elif delim == PRE_DELIM:`
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`tag = "pre"`
			`else:`
			`continue`
Revamp markdown parser 2018-02-15 10:24:56 +00:00
Ignore any other style when inside a fixed-width style 2019-06-24 11:35:58 +00:00			`if delim not in Markdown.FIXED_WIDTH_DELIMS and any(x in delims for x in Markdown.FIXED_WIDTH_DELIMS):`
			`continue`
Revamp markdown parser 2018-02-15 10:24:56 +00:00
Ignore any other style when inside a fixed-width style 2019-06-24 11:35:58 +00:00			`if delim not in delims:`
			`delims.add(delim)`
			`tag = Markdown.OPENING_TAG.format(tag)`
Revamp markdown parser 2018-02-15 10:24:56 +00:00			`else:`
Ignore any other style when inside a fixed-width style 2019-06-24 11:35:58 +00:00			`delims.remove(delim)`
			`tag = Markdown.CLOSING_TAG.format(tag)`
Fix broken mentions for both HTML and Markdown 2019-03-26 12:32:30 +00:00
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`text = text[:start + offset] + tag + text[stop + offset:]`
Revamp markdown parser 2018-02-15 10:24:56 +00:00
Ignore any other style when inside a fixed-width style 2019-06-24 11:35:58 +00:00			`offset += len(tag) - len(delim)`
Revamp markdown parser 2018-02-15 10:24:56 +00:00
Add markdown unparse method 2018-05-10 12:46:14 +00:00			`offset = 0`

Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`for match in re.finditer(Markdown.URL_RE, text):`
			`start, stop = match.span()`
			`full = match.group(0)`
Automatically escape URL bodies when using markdown 2019-06-24 08:54:58 +00:00
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`body, url = match.groups()`
Ignore any other style when inside a fixed-width style 2019-06-24 11:35:58 +00:00			`replace = Markdown.URL_MARKUP.format(url, body)`
Add markdown unparse method 2018-05-10 12:46:14 +00:00
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`text = text[:start + offset] + replace + text[stop + offset:]`
Automatically escape URL bodies when using markdown 2019-06-24 08:54:58 +00:00
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`offset += len(replace) - len(full)`
Add markdown unparse method 2018-05-10 12:46:14 +00:00
Merge branch 'develop' into asyncio # Conflicts: # pyrogram/client/style/html.py # pyrogram/client/style/markdown.py 2019-06-25 09:48:43 +00:00			`return await self.html.parse(text)`
Implement HTML.unparse and Markdown.unparse 2019-06-25 08:24:19 +00:00
			`@staticmethod`
			`def unparse(text: str, entities: list):`
			`text = utils.add_surrogates(text)`
			`copy = text`

Add markdown unparse method 2018-05-10 12:46:14 +00:00			`for entity in entities:`
Implement HTML.unparse and Markdown.unparse 2019-06-25 08:24:19 +00:00			`start = entity.offset`
			`end = start + entity.length`

Add markdown unparse method 2018-05-10 12:46:14 +00:00			`type = entity.type`
Implement HTML.unparse and Markdown.unparse 2019-06-25 08:24:19 +00:00
Add markdown unparse method 2018-05-10 12:46:14 +00:00			`url = entity.url`
			`user = entity.user`
Implement HTML.unparse and Markdown.unparse 2019-06-25 08:24:19 +00:00
			`sub = copy[start:end]`
Add markdown unparse method 2018-05-10 12:46:14 +00:00
			`if type == "bold":`
Implement HTML.unparse and Markdown.unparse 2019-06-25 08:24:19 +00:00			`style = BOLD_DELIM`
Add markdown unparse method 2018-05-10 12:46:14 +00:00			`elif type == "italic":`
Implement HTML.unparse and Markdown.unparse 2019-06-25 08:24:19 +00:00			`style = ITALIC_DELIM`
Add support for underline and strikethrough text via Markdown New delimiters: - ~~strikethrough~~ - --underline-- 2019-06-23 20:43:11 +00:00			`elif type == "underline":`
Implement HTML.unparse and Markdown.unparse 2019-06-25 08:24:19 +00:00			`style = UNDERLINE_DELIM`
Add support for underline and strikethrough text via Markdown New delimiters: - ~~strikethrough~~ - --underline-- 2019-06-23 20:43:11 +00:00			`elif type == "strike":`
Implement HTML.unparse and Markdown.unparse 2019-06-25 08:24:19 +00:00			`style = STRIKE_DELIM`
Add markdown unparse method 2018-05-10 12:46:14 +00:00			`elif type == "code":`
Implement HTML.unparse and Markdown.unparse 2019-06-25 08:24:19 +00:00			`style = CODE_DELIM`
Add markdown unparse method 2018-05-10 12:46:14 +00:00			`elif type == "pre":`
Implement HTML.unparse and Markdown.unparse 2019-06-25 08:24:19 +00:00			`style = PRE_DELIM`
			`# TODO: Blockquote for MD`
			`# elif type == "blockquote":`
			`# style = ...`
Add markdown unparse method 2018-05-10 12:46:14 +00:00			`elif type == "text_link":`
Implement HTML.unparse and Markdown.unparse 2019-06-25 08:24:19 +00:00			`text = text[:start] + text[start:].replace(sub, '[{1}]({0})'.format(url, sub), 1)`
Add markdown unparse method 2018-05-10 12:46:14 +00:00			`continue`
			`elif type == "text_mention":`
Implement HTML.unparse and Markdown.unparse 2019-06-25 08:24:19 +00:00			`text = text[:start] + text[start:].replace(`
			`sub, '[{1}](tg://user?id={0})'.format(user.id, sub), 1)`
Add markdown unparse method 2018-05-10 12:46:14 +00:00			`continue`
			`else:`
			`continue`

Implement HTML.unparse and Markdown.unparse 2019-06-25 08:24:19 +00:00			`text = text[:start] + text[start:].replace(sub, "{0}{1}{0}".format(style, sub), 1)`
Add markdown unparse method 2018-05-10 12:46:14 +00:00
Implement HTML.unparse and Markdown.unparse 2019-06-25 08:24:19 +00:00			`return utils.remove_surrogates(text)`