# Reconstructed from a whitespace-collapsed git patch:
#   commit:  72ac89f666d0093f523998eaa2a5ff9a41a26ff7
#   author:  Maximilian Hils
#   date:    Thu, 24 Aug 2017 18:57:32 +0200
#   subject: add strutils.escape_special_areas
#   files:   mitmproxy/utils/strutils.py, test/mitmproxy/utils/test_strutils.py
#
# Only the code the patch adds is reproduced. The diff's context lines
# (always_bytes, the tail of hexdump, test_hexdump) are fragments of
# definitions the patch does not show in full and are therefore left out.

# ---------------------------------------------------------------------------
# mitmproxy/utils/strutils.py
# ---------------------------------------------------------------------------

# Import block as it stands after the patch (plus List for return annotations).
# codecs / AnyStr / Optional / cast are not used by the code below; they are
# kept because the rest of strutils.py, which the patch does not show, had
# them before this change.
import io
import re
import codecs
from typing import AnyStr, Optional, cast, Iterable, Tuple, List


def _move_to_private_code_plane(matchobj):
    """Regex callback: shift the matched character up by 0xE000 code points."""
    return chr(ord(matchobj.group(0)) + 0xE000)


def _restore_from_private_code_plane(matchobj):
    """Regex callback: undo _move_to_private_code_plane for one character."""
    return chr(ord(matchobj.group(0)) - 0xE000)


# Regex prefix for a right delimiter: "not preceded by a backslash".
# NOTE(review): the extracted patch text is cut off right after `r"(?`; this
# value is reconstructed from the tests, where NO_ESCAPE + "'" must not treat
# the quote in \' as the end of a string. Confirm against the original commit.
NO_ESCAPE = r"(?<!\\)"


# NOTE(review): the `def` line and the opening of the docstring were lost in
# extraction too; the signature follows the function body and its callers.
def split_special_areas(
    data: str,
    area_delimiter: Iterable[Tuple[str, str]],
) -> List[str]:
    r"""
    Split a string of code into a [code, special area, code, special area, ..., code] list.

    A special area is a span matching "<left>.*?<right>" for one of the
    (left delimiter regex, right delimiter regex) pairs in area_delimiter.
    Special areas sit at the odd indices of the result; even indices hold the
    text in between, which may be empty. "." does not match a newline, so the
    text between the two delimiters never contains a line break.

    For example,

    >>> split_special_areas(
    ...     "test /* don't modify me */ foo",
    ...     [(r"/\*", r"\*/")])  # (left delimiter regex, right delimiter regex)
    ['test ', "/* don't modify me */", ' foo']

    "".join(split_special_areas(x, ...)) == x holds as long as the delimiter
    regexes contain no capturing groups of their own (re.split would add the
    content of those groups to the result as extra items).
    """
    patterns = "|".join(
        r"{lchar}.*?{rchar}".format(lchar=a, rchar=b)
        for (a, b) in area_delimiter
    )
    if not patterns:
        # Without delimiters the pattern would be "()", which matches the
        # empty string everywhere; there are simply no special areas.
        return [data]
    # flags must be passed by keyword: the third positional parameter of
    # re.split is maxsplit, so a positional re.MULTILINE would cap the number
    # of splits at 8 and never enable multiline matching.
    return re.split("({})".format(patterns), data, flags=re.MULTILINE)


def escape_special_areas(
    data: str,
    area_delimiter: Iterable[Tuple[str, str]],
    control_characters,
) -> str:
    r"""
    Escape all control characters present in special areas with symbols from
    the Unicode private use area (U+E000 + ord(char)).
    This is useful so that one can then use regex replacements on the resulting
    string without interfering with special areas.

    control_characters is a string (or any iterable) of single characters that
    are matched literally; every one of them must satisfy ord(x) < 256.

    Raises:
        ValueError: if a control character has ord(x) >= 256. It would be moved
            outside U+E000..U+E0FF, which unescape_special_areas never restores.

    Example:

    >>> x = "if (true) { console.log('{}'); }"
    >>> x = escape_special_areas(x, [("'", "'")], "{")
    >>> x
    "if (true) { console.log('\ue07b}'); }"
    >>> x = re.sub(r"\s*{\s*", " {\n    ", x)
    >>> x = unescape_special_areas(x)
    >>> print(x)
    if (true) {
        console.log('{}'); }
    """
    chars = "".join(control_characters)
    too_large = sorted({c for c in chars if ord(c) >= 256})
    if too_large:
        raise ValueError(
            "control characters must have ord(x) < 256, got {!r}".format(too_large)
        )
    if not chars:
        # "[]" is not a valid character class; nothing to escape anyway.
        return data
    # re.escape keeps "^", "]", "-" and "\" literal inside the character class.
    rex = re.compile("[{}]".format(re.escape(chars)))
    buf = io.StringIO()
    for i, part in enumerate(split_special_areas(data, area_delimiter)):
        if i % 2:  # odd indices are the special areas
            part = rex.sub(_move_to_private_code_plane, part)
        buf.write(part)
    return buf.getvalue()


def unescape_special_areas(data: str) -> str:
    """
    Invert escape_special_areas.

    x == unescape_special_areas(escape_special_areas(x, ...)) holds as long as x
    itself contains no characters in U+E000..U+E0FF: every character in that
    range is shifted down by 0xE000, whether or not escape_special_areas
    produced it.
    """
    return re.sub(r"[\ue000-\ue0ff]", _restore_from_private_code_plane, data)


# ---------------------------------------------------------------------------
# test/mitmproxy/utils/test_strutils.py
# In the patch these tests reach the functions through the `strutils` module;
# here they sit next to the definitions and call them directly.
# ---------------------------------------------------------------------------

ESCAPE_QUOTES = [
    ("'", NO_ESCAPE + "'"),
    ('"', NO_ESCAPE + '"'),
]


def test_split_special_areas():
    assert split_special_areas("foo", ESCAPE_QUOTES) == ["foo"]
    assert split_special_areas("foo 'bar' baz", ESCAPE_QUOTES) == ["foo ", "'bar'", " baz"]
    assert split_special_areas(
        """foo 'b\\'a"r' baz""",
        ESCAPE_QUOTES
    ) == ["foo ", "'b\\'a\"r'", " baz"]


def test_escape_special_areas():
    assert escape_special_areas('foo "bar" baz', ESCAPE_QUOTES, "*") == 'foo "bar" baz'
    esc = escape_special_areas('foo "b*r" b*z', ESCAPE_QUOTES, "*")
    assert esc == 'foo "b\ue02ar" b*z'
    assert unescape_special_areas(esc) == 'foo "b*r" b*z'