MTPyroger/pyrogram/parser/markdown.py

#  Pyrogram - Telegram MTProto API Client Library for Python
#  Copyright (C) 2017-2021 Dan <https://github.com/delivrance>
#
#  This file is part of Pyrogram.
#
#  Pyrogram is free software: you can redistribute it and/or modify
#  it under the terms of the GNU Lesser General Public License as published
#  by the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  Pyrogram is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU Lesser General Public License for more details.
#
#  You should have received a copy of the GNU Lesser General Public License
#  along with Pyrogram.  If not, see <http://www.gnu.org/licenses/>.

import html
import re
from typing import Optional

import pyrogram
from . import utils
from .html import HTML

# Markdown delimiters, one per supported text style.
BOLD_DELIM = "**"
ITALIC_DELIM = "__"
UNDERLINE_DELIM = "--"
STRIKE_DELIM = "~~"
CODE_DELIM = "`"
PRE_DELIM = "```"

# Group 1 captures a style delimiter; groups 2 and 3 capture the text and the
# URL of an inline "[text](url)" link. Every delimiter character is
# backslash-escaped, and PRE_DELIM is listed ahead of CODE_DELIM so that three
# backticks are tried before a single one.
MARKDOWN_RE = re.compile(
    r"({d})|\[(.+?)\]\((.+?)\)".format(
        d="|".join(
            "".join("\\" + char for char in delim)
            for delim in (
                PRE_DELIM,
                CODE_DELIM,
                STRIKE_DELIM,
                UNDERLINE_DELIM,
                ITALIC_DELIM,
                BOLD_DELIM,
            )
        )
    )
)

# HTML templates that delimiters and links are rewritten into.
OPENING_TAG = "<{}>"
CLOSING_TAG = "</{}>"
URL_MARKUP = '<a href="{}">{}</a>'

# Delimiters of the fixed-width styles (inline code and pre-formatted block).
FIXED_WIDTH_DELIMS = [CODE_DELIM, PRE_DELIM]


class Markdown:
    """Convert between Markdown-style text and text-plus-entities.

    ``parse`` rewrites the Markdown delimiters and inline links into HTML
    markup and delegates to the wrapped ``HTML`` parser; ``unparse`` re-inserts
    Markdown delimiters into a text according to a list of entities.
    """

    def __init__(self, client: Optional["pyrogram.Client"]):
        # parse() delegates to this HTML parser after rewriting the text.
        self.html = HTML(client)

    async def parse(self, text: str, strict: bool = False):
        """Rewrite Markdown syntax as HTML and parse it with the HTML parser.

        Parameters:
            text: Text that may contain Markdown delimiters and
                ``[text](url)`` links.
            strict: When True the text is passed through ``html.escape``
                before the delimiters are rewritten; when False it is used
                as given.

        Returns:
            The result of ``self.html.parse`` on the generated HTML.
        """
        if strict:
            text = html.escape(text)

        tag_names = {
            BOLD_DELIM: "b",
            ITALIC_DELIM: "i",
            UNDERLINE_DELIM: "u",
            STRIKE_DELIM: "s",
            CODE_DELIM: "code",
            PRE_DELIM: "pre",
        }

        # Delimiters that have been opened and not closed yet.
        open_delims = set()
        is_fixed_width = False

        # The output is assembled from slices of the very string the regex is
        # scanning, so every match span stays valid. Rewriting the text in
        # place while still using offsets of the unmodified text could hit the
        # wrong occurrence once a delimiter had been skipped or a link body
        # containing delimiter characters had been inserted.
        pieces = []
        cursor = 0

        for match in MARKDOWN_RE.finditer(text):
            start, end = match.span()
            delim, text_url, url = match.groups()

            # NOTE(review): CODE_DELIM and PRE_DELIM share this single flag,
            # so a lone backtick inside a ``` block also toggles it - confirm
            # whether that is intended.
            if delim in FIXED_WIDTH_DELIMS:
                is_fixed_width = not is_fixed_width

            # Inside a fixed-width span everything but its delimiters is
            # left as literal text (links included, since delim is None).
            if is_fixed_width and delim not in FIXED_WIDTH_DELIMS:
                continue

            if text_url:
                markup = URL_MARKUP.format(url, text_url)
            else:
                name = tag_names.get(delim)

                if name is None:
                    continue

                # First occurrence opens the style, the next one closes it.
                if delim in open_delims:
                    open_delims.remove(delim)
                    markup = CLOSING_TAG.format(name)
                else:
                    open_delims.add(delim)
                    markup = OPENING_TAG.format(name)

            pieces.append(text[cursor:start])
            pieces.append(markup)
            cursor = end

        pieces.append(text[cursor:])

        return await self.html.parse("".join(pieces))

    @staticmethod
    def unparse(text: str, entities: list):
        """Insert Markdown delimiters into ``text`` as described by ``entities``.

        Parameters:
            text: The plain text the entities refer to.
            entities: Objects exposing ``type``, ``offset`` and ``length``
                (plus ``url`` for "text_link" and ``user.id`` for
                "text_mention"). Entities of any other unsupported type are
                ignored.

        Returns:
            The text with delimiters and links inserted, after being passed
            through ``utils.remove_surrogates``.
        """
        # Offsets are applied to the string returned by add_surrogates
        # (presumably so UTF-16 based entity offsets line up - TODO confirm).
        text = utils.add_surrogates(text)

        delims = {
            "bold": BOLD_DELIM,
            "italic": ITALIC_DELIM,
            "underline": UNDERLINE_DELIM,
            "strikethrough": STRIKE_DELIM,
            "code": CODE_DELIM,
            "pre": PRE_DELIM,
            "blockquote": PRE_DELIM,
        }

        markers = []

        for index, entity in enumerate(entities):
            entity_type = entity.type
            start = entity.offset
            end = start + entity.length

            if entity_type in delims:
                start_tag = end_tag = delims[entity_type]
            elif entity_type == "text_link":
                start_tag = "["
                end_tag = f"]({entity.url})"
            elif entity_type == "text_mention":
                start_tag = "["
                end_tag = f"](tg://user?id={entity.user.id})"
            else:
                continue

            # Sort keys give a deterministic, properly nested order for tags
            # sharing an offset: closing tags (0) come before opening tags (1);
            # among closings the innermost entity (latest start) comes first;
            # among openings the outermost entity (latest end) comes first.
            # The entity index breaks remaining ties symmetrically.
            markers.append((start, 1, -end, index, start_tag))
            markers.append((end, 0, -start, -index, end_tag))

        markers.sort(key=lambda marker: marker[:4])

        # Walk the markers left to right, copying the text between them.
        pieces = []
        cursor = 0

        for offset, _, _, _, tag in markers:
            pieces.append(text[cursor:offset])
            pieces.append(tag)
            cursor = offset

        pieces.append(text[cursor:])

        return utils.remove_surrogates("".join(pieces))
Update Copyright 2020-03-21 14:43:32 +00:00			`# Pyrogram - Telegram MTProto API Client Library for Python`
Update copyright notice Year 2021 2021-01-01 21:58:48 +00:00			`# Copyright (C) 2017-2021 Dan <https://github.com/delivrance>`
Add extensions package 2017-12-05 11:42:57 +00:00			`#`
Update Copyright 2020-03-21 14:43:32 +00:00			`# This file is part of Pyrogram.`
Add extensions package 2017-12-05 11:42:57 +00:00			`#`
Update Copyright 2020-03-21 14:43:32 +00:00			`# Pyrogram is free software: you can redistribute it and/or modify`
			`# it under the terms of the GNU Lesser General Public License as published`
			`# by the Free Software Foundation, either version 3 of the License, or`
			`# (at your option) any later version.`
Add extensions package 2017-12-05 11:42:57 +00:00			`#`
Update Copyright 2020-03-21 14:43:32 +00:00			`# Pyrogram is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU Lesser General Public License for more details.`
Add extensions package 2017-12-05 11:42:57 +00:00			`#`
Update Copyright 2020-03-21 14:43:32 +00:00			`# You should have received a copy of the GNU Lesser General Public License`
			`# along with Pyrogram. If not, see <http://www.gnu.org/licenses/>.`
Add extensions package 2017-12-05 11:42:57 +00:00
Automatically escape URL bodies when using markdown 2019-06-24 08:54:58 +00:00			`import html`
Add extensions package 2017-12-05 11:42:57 +00:00			`import re`
Improve typing hints (#537) * Change type1 or type2 to Union[type1, type2] * Address @KunoiSayami suggestions * Change Union[type1, None] to Optional[type1] * Update PR with latest commit changes * Address Dan suggestions 2020-12-20 16:05:17 +00:00			`from typing import Optional`
Add extensions package 2017-12-05 11:42:57 +00:00
Fix broken mentions for both HTML and Markdown 2019-03-26 12:32:30 +00:00			`import pyrogram`
Implement HTML.unparse and Markdown.unparse 2019-06-25 08:24:19 +00:00			`from . import utils`
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`from .html import HTML`
Add extensions package 2017-12-05 11:42:57 +00:00
Implement HTML.unparse and Markdown.unparse 2019-06-25 08:24:19 +00:00			`BOLD_DELIM = "**"`
			`ITALIC_DELIM = "__"`
			`UNDERLINE_DELIM = "--"`
			`STRIKE_DELIM = "~~"`
			CODE_DELIM = "`"
			PRE_DELIM = "```"
Add extensions package 2017-12-05 11:42:57 +00:00
Implement strict and loose markdown parsing This is enabled by default: - strict: only markdown syntax is parsed - loose: both markdown and html syntax are parsed 2019-06-26 14:05:09 +00:00			`MARKDOWN_RE = re.compile(r"({d})\|\[(.+?)\]\((.+?)\)".format(`
			`d="\|".join(`
			`["".join(i) for i in [`
Deep rewrite: preparing for v1.0 - Pyrogram core is now fully asynchronous - Ditched Python 3.5, welcome 3.6 as minimum version. - Moved all types to pyrogram.types - Turned the Filters class into a module (filters) - Moved all filters to pyrogram.filters - Moved all handlers to pyrogram.handlers - Moved all emoji to pyrogram.emoji - Renamed pyrogram.api to pyrogram.raw - Clock is now synced with server's time - Telegram schema updated to Layer 117 - Greatly improved the TL compiler (proper type-constructor hierarchy) - Added "do not edit" warning in generated files - Crypto parts are executed in a thread pool to avoid blocking the event loop - idle() is now a separate function (it doesn't deal with Client instances) - Async storage, async filters and async progress callback (optional, can be sync too) - Added getpass back, for hidden password inputs 2020-08-22 06:05:05 +00:00			`[rf"\{j}" for j in i]`
Implement strict and loose markdown parsing This is enabled by default: - strict: only markdown syntax is parsed - loose: both markdown and html syntax are parsed 2019-06-26 14:05:09 +00:00			`for i in [`
			`PRE_DELIM,`
			`CODE_DELIM,`
			`STRIKE_DELIM,`
			`UNDERLINE_DELIM,`
			`ITALIC_DELIM,`
			`BOLD_DELIM`
			`]`
			`]]`
			`)))`

			`OPENING_TAG = "<{}>"`
			`CLOSING_TAG = "</{}>"`
			`URL_MARKUP = '<a href="{}">{}</a>'`
			`FIXED_WIDTH_DELIMS = [CODE_DELIM, PRE_DELIM]`

Add extensions package 2017-12-05 11:42:57 +00:00
Implement HTML.unparse and Markdown.unparse 2019-06-25 08:24:19 +00:00			`class Markdown:`
Improve typing hints (#537) * Change type1 or type2 to Union[type1, type2] * Address @KunoiSayami suggestions * Change Union[type1, None] to Optional[type1] * Update PR with latest commit changes * Address Dan suggestions 2020-12-20 16:05:17 +00:00			`def __init__(self, client: Optional["pyrogram.Client"]):`
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`self.html = HTML(client)`
Revamp markdown parser 2018-02-15 10:24:56 +00:00
Change logging hierarchy for loading plugins (#451) Loading plugins shouldn't be considered a warning 2020-08-21 05:28:27 +00:00			`async def parse(self, text: str, strict: bool = False):`
Implement strict and loose markdown parsing This is enabled by default: - strict: only markdown syntax is parsed - loose: both markdown and html syntax are parsed 2019-06-26 14:05:09 +00:00			`if strict:`
			`text = html.escape(text)`
Ignore any other style when inside a fixed-width style 2019-06-24 11:35:58 +00:00
			`delims = set()`
Implement strict and loose markdown parsing This is enabled by default: - strict: only markdown syntax is parsed - loose: both markdown and html syntax are parsed 2019-06-26 14:05:09 +00:00			`is_fixed_width = False`

			`for i, match in enumerate(re.finditer(MARKDOWN_RE, text)):`
			`start, _ = match.span()`
			`delim, text_url, url = match.groups()`
			`full = match.group(0)`

			`if delim in FIXED_WIDTH_DELIMS:`
			`is_fixed_width = not is_fixed_width`
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00
Implement strict and loose markdown parsing This is enabled by default: - strict: only markdown syntax is parsed - loose: both markdown and html syntax are parsed 2019-06-26 14:05:09 +00:00			`if is_fixed_width and delim not in FIXED_WIDTH_DELIMS:`
			`continue`

			`if text_url:`
			`text = utils.replace_once(text, full, URL_MARKUP.format(url, text_url), start)`
			`continue`
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00
Implement HTML.unparse and Markdown.unparse 2019-06-25 08:24:19 +00:00			`if delim == BOLD_DELIM:`
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`tag = "b"`
Implement HTML.unparse and Markdown.unparse 2019-06-25 08:24:19 +00:00			`elif delim == ITALIC_DELIM:`
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`tag = "i"`
Implement HTML.unparse and Markdown.unparse 2019-06-25 08:24:19 +00:00			`elif delim == UNDERLINE_DELIM:`
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`tag = "u"`
Implement HTML.unparse and Markdown.unparse 2019-06-25 08:24:19 +00:00			`elif delim == STRIKE_DELIM:`
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`tag = "s"`
Implement HTML.unparse and Markdown.unparse 2019-06-25 08:24:19 +00:00			`elif delim == CODE_DELIM:`
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`tag = "code"`
Implement HTML.unparse and Markdown.unparse 2019-06-25 08:24:19 +00:00			`elif delim == PRE_DELIM:`
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`tag = "pre"`
			`else:`
			`continue`
Revamp markdown parser 2018-02-15 10:24:56 +00:00
Ignore any other style when inside a fixed-width style 2019-06-24 11:35:58 +00:00			`if delim not in delims:`
			`delims.add(delim)`
Implement strict and loose markdown parsing This is enabled by default: - strict: only markdown syntax is parsed - loose: both markdown and html syntax are parsed 2019-06-26 14:05:09 +00:00			`tag = OPENING_TAG.format(tag)`
Revamp HTML and Markdown parsers to allow multiple nested entities 2019-06-24 08:07:28 +00:00			`else:`
Ignore any other style when inside a fixed-width style 2019-06-24 11:35:58 +00:00			`delims.remove(delim)`
Implement strict and loose markdown parsing This is enabled by default: - strict: only markdown syntax is parsed - loose: both markdown and html syntax are parsed 2019-06-26 14:05:09 +00:00			`tag = CLOSING_TAG.format(tag)`
Automatically escape URL bodies when using markdown 2019-06-24 08:54:58 +00:00
Implement strict and loose markdown parsing This is enabled by default: - strict: only markdown syntax is parsed - loose: both markdown and html syntax are parsed 2019-06-26 14:05:09 +00:00			`text = utils.replace_once(text, delim, tag, start)`
Add markdown unparse method 2018-05-10 12:46:14 +00:00
Change logging hierarchy for loading plugins (#451) Loading plugins shouldn't be considered a warning 2020-08-21 05:28:27 +00:00			`return await self.html.parse(text)`
Implement HTML.unparse and Markdown.unparse 2019-06-25 08:24:19 +00:00
			`@staticmethod`
			`def unparse(text: str, entities: list):`
			`text = utils.add_surrogates(text)`
Add better support for nested entities (both for HTML and Markdown) (#297) * Added better support for nested entities, both for HTML and Markdown * Tiny style fix * Make use of pre-defined constants 2019-08-07 11:48:21 +00:00
			`entities_offsets = []`
Implement HTML.unparse and Markdown.unparse 2019-06-25 08:24:19 +00:00
			`for entity in entities:`
Add better support for nested entities (both for HTML and Markdown) (#297) * Added better support for nested entities, both for HTML and Markdown * Tiny style fix * Make use of pre-defined constants 2019-08-07 11:48:21 +00:00			`entity_type = entity.type`
Implement HTML.unparse and Markdown.unparse 2019-06-25 08:24:19 +00:00			`start = entity.offset`
			`end = start + entity.length`

Add better support for nested entities (both for HTML and Markdown) (#297) * Added better support for nested entities, both for HTML and Markdown * Tiny style fix * Make use of pre-defined constants 2019-08-07 11:48:21 +00:00			`if entity_type == "bold":`
			`start_tag = end_tag = BOLD_DELIM`
			`elif entity_type == "italic":`
			`start_tag = end_tag = ITALIC_DELIM`
			`elif entity_type == "underline":`
			`start_tag = end_tag = UNDERLINE_DELIM`
Fix for strikethrough unparsing in markdown (#627) 2021-03-14 11:43:12 +00:00			`elif entity_type == "strikethrough":`
Add better support for nested entities (both for HTML and Markdown) (#297) * Added better support for nested entities, both for HTML and Markdown * Tiny style fix * Make use of pre-defined constants 2019-08-07 11:48:21 +00:00			`start_tag = end_tag = STRIKE_DELIM`
			`elif entity_type == "code":`
			`start_tag = end_tag = CODE_DELIM`
			`elif entity_type in ("pre", "blockquote"):`
			`start_tag = end_tag = PRE_DELIM`
			`elif entity_type == "text_link":`
			`url = entity.url`
			`start_tag = "["`
Deep rewrite: preparing for v1.0 - Pyrogram core is now fully asynchronous - Ditched Python 3.5, welcome 3.6 as minimum version. - Moved all types to pyrogram.types - Turned the Filters class into a module (filters) - Moved all filters to pyrogram.filters - Moved all handlers to pyrogram.handlers - Moved all emoji to pyrogram.emoji - Renamed pyrogram.api to pyrogram.raw - Clock is now synced with server's time - Telegram schema updated to Layer 117 - Greatly improved the TL compiler (proper type-constructor hierarchy) - Added "do not edit" warning in generated files - Crypto parts are executed in a thread pool to avoid blocking the event loop - idle() is now a separate function (it doesn't deal with Client instances) - Async storage, async filters and async progress callback (optional, can be sync too) - Added getpass back, for hidden password inputs 2020-08-22 06:05:05 +00:00			`end_tag = f"]({url})"`
Add better support for nested entities (both for HTML and Markdown) (#297) * Added better support for nested entities, both for HTML and Markdown * Tiny style fix * Make use of pre-defined constants 2019-08-07 11:48:21 +00:00			`elif entity_type == "text_mention":`
			`user = entity.user`
			`start_tag = "["`
Deep rewrite: preparing for v1.0 - Pyrogram core is now fully asynchronous - Ditched Python 3.5, welcome 3.6 as minimum version. - Moved all types to pyrogram.types - Turned the Filters class into a module (filters) - Moved all filters to pyrogram.filters - Moved all handlers to pyrogram.handlers - Moved all emoji to pyrogram.emoji - Renamed pyrogram.api to pyrogram.raw - Clock is now synced with server's time - Telegram schema updated to Layer 117 - Greatly improved the TL compiler (proper type-constructor hierarchy) - Added "do not edit" warning in generated files - Crypto parts are executed in a thread pool to avoid blocking the event loop - idle() is now a separate function (it doesn't deal with Client instances) - Async storage, async filters and async progress callback (optional, can be sync too) - Added getpass back, for hidden password inputs 2020-08-22 06:05:05 +00:00			`end_tag = f"](tg://user?id={user.id})"`
Implement HTML.unparse and Markdown.unparse 2019-06-25 08:24:19 +00:00			`else:`
			`continue`

Add better support for nested entities (both for HTML and Markdown) (#297) * Added better support for nested entities, both for HTML and Markdown * Tiny style fix * Make use of pre-defined constants 2019-08-07 11:48:21 +00:00			`entities_offsets.append((start_tag, start,))`
			`entities_offsets.append((end_tag, end,))`

			`# sorting by offset (desc)`
			`entities_offsets.sort(key=lambda x: -x[1])`

			`for entity, offset in entities_offsets:`
			`text = text[:offset] + entity + text[offset:]`
Implement HTML.unparse and Markdown.unparse 2019-06-25 08:24:19 +00:00
			`return utils.remove_surrogates(text)`