mitmproxy/netlib/http/message.py

301 lines
8.9 KiB
Python
Raw Normal View History

2016-07-01 21:10:48 +00:00
import re
2015-09-25 22:39:04 +00:00
import warnings
2016-10-17 03:56:46 +00:00
from typing import Optional
2015-09-25 22:39:04 +00:00
from mitmproxy.utils import strutils
from netlib import encoding
from netlib import basetypes
from netlib.http import headers
2015-09-25 22:39:04 +00:00
2016-05-28 20:17:02 +00:00
# While headers _should_ be ASCII, it's not uncommon for certain headers to be utf-8 encoded.
def _native(x):
return x.decode("utf-8", "surrogateescape")
2016-05-28 20:17:02 +00:00
def _always_bytes(x):
    """
    Counterpart of ``_native``: coerce ``x`` to bytes.

    Delegates to ``strutils.always_bytes`` using UTF-8 and the
    ``surrogateescape`` error handler (the same codec settings ``_native``
    decodes with).
    """
    # NOTE(review): exact handling of non-str/non-bytes input is decided by
    # strutils.always_bytes, which is not visible from this module.
    as_bytes = strutils.always_bytes(x, "utf-8", "surrogateescape")
    return as_bytes
2015-09-25 22:39:04 +00:00
class MessageData(basetypes.Serializable):
    """
    Plain attribute container holding the state of an HTTP message.

    All instance attributes are treated as state. The ``headers`` attribute is
    special-cased: it is (de)serialized through ``headers.Headers.get_state``
    and ``headers.Headers.from_state``; every other attribute is stored as-is.
    """

    def __eq__(self, other):
        """Two instances are equal when all of their attributes are equal."""
        if isinstance(other, MessageData):
            return self.__dict__ == other.__dict__
        return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def set_state(self, state):
        """
        Overwrite attributes of this object from ``state``.

        Args:
            state: mapping of attribute name to serialized value. The value
                stored under ``"headers"`` is converted back into a
                ``headers.Headers`` object before being assigned.
        """
        for k, v in state.items():
            if k == "headers":
                v = headers.Headers.from_state(v)
            setattr(self, k, v)

    def get_state(self):
        """
        Return a serializable snapshot of this object.

        Returns:
            A shallow copy of the instance attributes in which ``headers`` has
            been replaced by its own serialized state.
        """
        # Copy first so that replacing "headers" does not touch the live object.
        state = vars(self).copy()
        state["headers"] = state["headers"].get_state()
        return state

    @classmethod
    def from_state(cls, state):
        """
        Build a new instance from a state mapping produced by ``get_state``.

        The entries of ``state`` are passed to the constructor as keyword
        arguments, with ``headers`` restored to a ``headers.Headers`` object.
        The caller's mapping is left unmodified.
        """
        # Work on a shallow copy: replacing state["headers"] in place would
        # leave a Headers object in the caller's dict and break reuse of it.
        state = dict(state)
        state["headers"] = headers.Headers.from_state(state["headers"])
        return cls(**state)
class Message(basetypes.Serializable):
    """
    Common functionality of HTTP messages.

    Every property of this class is a view on ``self.data``; the class itself
    defines no ``__init__``, so ``self.data`` has to be provided by the
    concrete subclass (presumably a :py:class:`MessageData` instance - confirm
    against the subclasses).

    The body is available on three levels:

    - :py:attr:`raw_content`: the bytes as stored, still content-encoded.
    - :py:attr:`content`: bytes with the Content-Encoding removed.
    - :py:attr:`text`: ``str`` decoded using the Content-Type charset.
    """

    def __eq__(self, other):
        """Messages are equal when their underlying data objects are equal."""
        if isinstance(other, Message):
            return self.data == other.data
        return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def get_state(self):
        """Return the serializable state of the underlying data object."""
        return self.data.get_state()

    def set_state(self, state):
        """Load ``state`` into the underlying data object."""
        self.data.set_state(state)

    @classmethod
    def from_state(cls, state):
        """
        Build a new message from a state mapping produced by ``get_state``.

        The entries of ``state`` are passed to the constructor as keyword
        arguments, with ``headers`` restored to a ``headers.Headers`` object.
        The caller's mapping is left unmodified.
        """
        # Work on a shallow copy: replacing state["headers"] in place would
        # leave a Headers object in the caller's dict and break reuse of it.
        state = dict(state)
        state["headers"] = headers.Headers.from_state(state["headers"])
        return cls(**state)

    @property
    def headers(self):
        """
        Message headers object

        Returns:
            netlib.http.Headers
        """
        return self.data.headers

    @headers.setter
    def headers(self, h):
        self.data.headers = h

    @property
    def raw_content(self) -> Optional[bytes]:
        """
        The raw (encoded) HTTP message body, or None if there is no body.

        See also: :py:attr:`content`, :py:class:`text`
        """
        return self.data.content

    @raw_content.setter
    def raw_content(self, content):
        # Stored verbatim: no encoding is applied and no header is updated.
        self.data.content = content

    def get_content(self, strict: bool=True) -> Optional[bytes]:
        """
        The HTTP message body decoded with the content-encoding header (e.g. gzip)

        Args:
            strict: if True, an undecodable body raises; if False, the raw
                (still encoded) body is returned instead.

        Returns:
            The decoded body, or None if the message has no body.

        Raises:
            ValueError, when the content-encoding is invalid and strict is True.

        See also: :py:class:`raw_content`, :py:attr:`text`
        """
        raw = self.raw_content
        if raw is None:
            return None
        ce = self.headers.get("content-encoding")
        if not ce:
            return raw
        try:
            content = encoding.decode(raw, ce)
            # encoding.decode is also used for charsets (see get_text), so a
            # peer sending e.g. "content-encoding: utf8" would make it return
            # str. Content must stay bytes, so treat that as invalid.
            if isinstance(content, str):
                raise ValueError("Invalid Content-Encoding: {}".format(ce))
            return content
        except ValueError:
            if strict:
                raise
            return raw

    def set_content(self, value):
        """
        Set the message body from decoded bytes.

        The value is encoded according to the current content-encoding header
        and the content-length header is updated. If the content-encoding
        header is invalid it is removed and the value is stored unencoded.
        Passing None removes the body and leaves all headers untouched.

        Raises:
            TypeError, when value is neither bytes nor None.
        """
        if value is None:
            self.raw_content = None
            return
        if not isinstance(value, bytes):
            raise TypeError(
                "Message content must be bytes, not {}. "
                "Please use .text if you want to assign a str."
                .format(type(value).__name__)
            )
        ce = self.headers.get("content-encoding")
        try:
            encoded = encoding.encode(value, ce or "identity")
            # Same guard as in get_content: a content-encoding must map
            # bytes to bytes.
            if isinstance(encoded, str):
                raise ValueError("Invalid Content-Encoding: {}".format(ce))
            self.raw_content = encoded
        except ValueError:
            # So we have an invalid content-encoding?
            # Let's remove it! (pop() instead of del: the header may be absent.)
            self.headers.pop("content-encoding", None)
            self.raw_content = value
        self.headers["content-length"] = str(len(self.raw_content))

    content = property(get_content, set_content)

    @property
    def http_version(self):
        """
        Version string, e.g. "HTTP/1.1"
        """
        # Stored as bytes on the data object, exposed as a native str.
        return _native(self.data.http_version)

    @http_version.setter
    def http_version(self, http_version):
        self.data.http_version = _always_bytes(http_version)

    @property
    def timestamp_start(self):
        """
        First byte timestamp
        """
        return self.data.timestamp_start

    @timestamp_start.setter
    def timestamp_start(self, timestamp_start):
        self.data.timestamp_start = timestamp_start

    @property
    def timestamp_end(self):
        """
        Last byte timestamp
        """
        return self.data.timestamp_end

    @timestamp_end.setter
    def timestamp_end(self, timestamp_end):
        self.data.timestamp_end = timestamp_end

    def _get_content_type_charset(self) -> Optional[str]:
        """
        Return the ``charset`` parameter of the content-type header, or None
        when the header is missing, unparseable or carries no charset.
        """
        ct = headers.parse_content_type(self.headers.get("content-type", ""))
        if ct:
            # ct[2] holds the content-type parameters.
            return ct[2].get("charset")
        return None

    def _guess_encoding(self) -> str:
        """
        Pick the text encoding for the body: the declared charset if there is
        one, UTF-8 for JSON content types, latin-1 otherwise.
        """
        enc = self._get_content_type_charset()
        if enc:
            return enc

        if "json" in self.headers.get("content-type", ""):
            return "utf8"
        # We may also want to check for HTML meta tags here at some point.
        return "latin-1"

    def get_text(self, strict: bool=True) -> Optional[str]:
        """
        The HTTP message body decoded with both content-encoding header (e.g. gzip)
        and content-type header charset.

        Args:
            strict: if True, decoding errors raise; if False, the body is
                decoded as UTF-8 with ``surrogateescape`` as a last resort.

        Returns:
            The body as str, or None if the message has no body.

        Raises:
            ValueError, when either content-encoding or charset is invalid and strict is True.

        See also: :py:attr:`content`, :py:class:`raw_content`
        """
        if self.raw_content is None:
            return None
        enc = self._guess_encoding()

        content = self.get_content(strict)
        try:
            return encoding.decode(content, enc)
        except ValueError:
            if strict:
                raise
            return content.decode("utf8", "surrogateescape")

    def set_text(self, text):
        """
        Set the message body from a str.

        The text is encoded with the charset guessed from the content-type
        header. If that fails, UTF-8 is used instead and the content-type
        header is rewritten to declare ``charset=utf-8``. Passing None removes
        the body.
        """
        if text is None:
            self.content = None
            return
        enc = self._guess_encoding()

        try:
            self.content = encoding.encode(text, enc)
        except ValueError:
            # Fall back to UTF-8 and update the content-type header.
            ct = headers.parse_content_type(self.headers.get("content-type", "")) or ("text", "plain", {})
            ct[2]["charset"] = "utf-8"
            self.headers["content-type"] = headers.assemble_content_type(*ct)
            enc = "utf8"
            self.content = text.encode(enc, "surrogateescape")

    text = property(get_text, set_text)

    def decode(self, strict=True):
        """
        Decodes body based on the current Content-Encoding header, then
        removes the header and updates Content-Length to the decoded size.
        If there is no Content-Encoding header, no action is taken.

        Raises:
            ValueError, when the content-encoding is invalid and strict is True.
        """
        if "content-encoding" not in self.headers:
            return
        decoded = self.get_content(strict)
        self.headers.pop("content-encoding", None)
        # Assign through .content (not .raw_content) so that content-length is
        # recomputed; with the encoding header gone the body is stored as-is.
        self.content = decoded

    def encode(self, e):
        """
        Encodes body with the encoding e, where e is "gzip", "deflate", "identity", or "br".
        Any existing content-encodings are overwritten,
        the content is not decoded beforehand.

        Raises:
            ValueError, when the specified content-encoding is invalid.
        """
        self.headers["content-encoding"] = e
        self.content = self.raw_content
        # set_content() drops the header again when it cannot apply the
        # encoding, which is how an invalid value is detected here.
        if "content-encoding" not in self.headers:
            raise ValueError("Invalid content encoding {}".format(repr(e)))

    def replace(self, pattern, repl, flags=0, count=0):
        """
        Replaces a regular expression pattern with repl in both the headers
        and the body of the message. Encoded body will be decoded
        before replacement, and re-encoded afterwards.

        str arguments are converted to bytes with
        ``strutils.escaped_str_to_bytes`` first. ``count`` is applied to the
        body and to the headers separately.

        Returns:
            The number of replacements made.
        """
        if isinstance(pattern, str):
            pattern = strutils.escaped_str_to_bytes(pattern)
        if isinstance(repl, str):
            repl = strutils.escaped_str_to_bytes(repl)
        replacements = 0
        # Decode the body only once; every access to .content decodes again.
        body = self.content
        if body:
            self.content, replacements = re.subn(
                pattern, repl, body, flags=flags, count=count
            )
        replacements += self.headers.replace(pattern, repl, flags=flags, count=count)
        return replacements

    # Legacy

    @property
    def body(self):  # pragma: no cover
        """Deprecated alias of :py:attr:`content`."""
        warnings.warn(".body is deprecated, use .content instead.", DeprecationWarning)
        return self.content

    @body.setter
    def body(self, body):  # pragma: no cover
        warnings.warn(".body is deprecated, use .content instead.", DeprecationWarning)
        self.content = body
2016-10-17 04:29:45 +00:00
class decoded:
    """
    Deprecated: You can now directly use :py:attr:`content`.
    :py:attr:`raw_content` has the encoded content.

    What remains is a context manager that does nothing except emit a
    ``DeprecationWarning`` when it is constructed.
    """

    def __init__(self, message):  # pragma no cover
        # ``message`` is accepted but never used.
        notice = (
            "decoded() is deprecated, you can now directly use .content instead. "
            ".raw_content has the encoded content."
        )
        warnings.warn(notice, DeprecationWarning)

    def __enter__(self):  # pragma no cover
        # Nothing to set up; the ``with ... as`` target receives None.
        return None

    def __exit__(self, type, value, tb):  # pragma no cover
        # Nothing to tear down; returning None lets exceptions propagate.
        return None