MTPyroger/pyrogram/parser/markdown.py

151 lines
4.6 KiB
Python
Raw Permalink Normal View History

2020-03-21 14:43:32 +00:00
# Pyrogram - Telegram MTProto API Client Library for Python
2021-01-01 21:58:48 +00:00
# Copyright (C) 2017-2021 Dan <https://github.com/delivrance>
2017-12-05 11:42:57 +00:00
#
2020-03-21 14:43:32 +00:00
# This file is part of Pyrogram.
2017-12-05 11:42:57 +00:00
#
2020-03-21 14:43:32 +00:00
# Pyrogram is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
2017-12-05 11:42:57 +00:00
#
2020-03-21 14:43:32 +00:00
# Pyrogram is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
2017-12-05 11:42:57 +00:00
#
2020-03-21 14:43:32 +00:00
# You should have received a copy of the GNU Lesser General Public License
# along with Pyrogram. If not, see <http://www.gnu.org/licenses/>.
2017-12-05 11:42:57 +00:00
import html
2017-12-05 11:42:57 +00:00
import re
from typing import Optional
2017-12-05 11:42:57 +00:00
import pyrogram
from . import utils
from .html import HTML
2017-12-05 11:42:57 +00:00
BOLD_DELIM = "**"
ITALIC_DELIM = "__"
UNDERLINE_DELIM = "--"
STRIKE_DELIM = "~~"
CODE_DELIM = "`"
PRE_DELIM = "```"
2017-12-05 11:42:57 +00:00
MARKDOWN_RE = re.compile(r"({d})|\[(.+?)\]\((.+?)\)".format(
d="|".join(
["".join(i) for i in [
[rf"\{j}" for j in i]
for i in [
PRE_DELIM,
CODE_DELIM,
STRIKE_DELIM,
UNDERLINE_DELIM,
ITALIC_DELIM,
BOLD_DELIM
]
]]
)))
OPENING_TAG = "<{}>"
CLOSING_TAG = "</{}>"
URL_MARKUP = '<a href="{}">{}</a>'
FIXED_WIDTH_DELIMS = [CODE_DELIM, PRE_DELIM]
2017-12-05 11:42:57 +00:00
class Markdown:
def __init__(self, client: Optional["pyrogram.Client"]):
self.html = HTML(client)
2018-02-15 10:24:56 +00:00
async def parse(self, text: str, strict: bool = False):
if strict:
text = html.escape(text)
delims = set()
is_fixed_width = False
for i, match in enumerate(re.finditer(MARKDOWN_RE, text)):
start, _ = match.span()
delim, text_url, url = match.groups()
full = match.group(0)
if delim in FIXED_WIDTH_DELIMS:
is_fixed_width = not is_fixed_width
if is_fixed_width and delim not in FIXED_WIDTH_DELIMS:
continue
if text_url:
text = utils.replace_once(text, full, URL_MARKUP.format(url, text_url), start)
continue
if delim == BOLD_DELIM:
tag = "b"
elif delim == ITALIC_DELIM:
tag = "i"
elif delim == UNDERLINE_DELIM:
tag = "u"
elif delim == STRIKE_DELIM:
tag = "s"
elif delim == CODE_DELIM:
tag = "code"
elif delim == PRE_DELIM:
tag = "pre"
else:
continue
2018-02-15 10:24:56 +00:00
if delim not in delims:
delims.add(delim)
tag = OPENING_TAG.format(tag)
else:
delims.remove(delim)
tag = CLOSING_TAG.format(tag)
text = utils.replace_once(text, delim, tag, start)
2018-05-10 12:46:14 +00:00
return await self.html.parse(text)
@staticmethod
def unparse(text: str, entities: list):
text = utils.add_surrogates(text)
entities_offsets = []
for entity in entities:
entity_type = entity.type
start = entity.offset
end = start + entity.length
if entity_type == "bold":
start_tag = end_tag = BOLD_DELIM
elif entity_type == "italic":
start_tag = end_tag = ITALIC_DELIM
elif entity_type == "underline":
start_tag = end_tag = UNDERLINE_DELIM
elif entity_type == "strikethrough":
start_tag = end_tag = STRIKE_DELIM
elif entity_type == "code":
start_tag = end_tag = CODE_DELIM
elif entity_type in ("pre", "blockquote"):
start_tag = end_tag = PRE_DELIM
elif entity_type == "text_link":
url = entity.url
start_tag = "["
end_tag = f"]({url})"
elif entity_type == "text_mention":
user = entity.user
start_tag = "["
end_tag = f"](tg://user?id={user.id})"
else:
continue
entities_offsets.append((start_tag, start,))
entities_offsets.append((end_tag, end,))
# sorting by offset (desc)
entities_offsets.sort(key=lambda x: -x[1])
for entity, offset in entities_offsets:
text = text[:offset] + entity + text[offset:]
return utils.remove_surrogates(text)