add strutils.replace_surrogates

This commit is contained in:
Maximilian Hils 2017-01-06 00:58:21 +01:00
parent 042261266f
commit c21ee90deb
2 changed files with 16 additions and 0 deletions

View File

@ -25,6 +25,17 @@ def always_str(str_or_bytes: Optional[AnyStr], *decode_args) -> Optional[str]:
raise TypeError("Expected str or bytes, but got {}.".format(type(str_or_bytes).__name__))
def replace_surrogates(text: str, errors='replace') -> str:
    r"""Re-decode surrogate-escaped characters in ``text`` with another error handler.

    The text is first encoded to UTF-8 with the "surrogateescape" handler and
    the resulting bytes are then decoded as UTF-8 using ``errors``. With the
    default "replace" handler, an escaped surrogate such as "\udc80" comes
    back as the replacement character "\ufffd".

    Args:
        text: The string to convert.
        errors: Name of the error handler passed to the decode step.

    Returns:
        The re-decoded string.

    Background on surrogateescape: https://www.python.org/dev/peps/pep-0383/.
    """
    # Step 1: encode with surrogateescape to obtain the UTF-8 byte form.
    raw_bytes = text.encode('utf-8', 'surrogateescape')
    # Step 2: decode again, letting the caller-selected handler deal with
    # whatever bytes are not valid UTF-8.
    return raw_bytes.decode('utf-8', errors)
# Translate control characters to "safe" characters. This implementation initially
# replaced them with the matching control pictures (http://unicode.org/charts/PDF/U2400.pdf),
# but that turned out to render badly with monospace fonts, so we are back to using ".".

View File

@ -19,6 +19,11 @@ def test_always_str():
assert strutils.always_str(None) is None
def test_replace_surrogates():
    """Check replace_surrogates on plain text and on an escaped surrogate."""
    # Text without surrogates must pass through unchanged.
    assert strutils.replace_surrogates("foo") == "foo"
    # The expected value is written as an escape sequence (U+FFFD, the
    # replacement character) so the assertion does not depend on how the
    # source file itself is encoded or rendered.
    assert strutils.replace_surrogates("bar \udc80 baz") == "bar \ufffd baz"
def test_escape_control_characters():
assert strutils.escape_control_characters(u"one") == u"one"
assert strutils.escape_control_characters(u"\00ne") == u".ne"