Firm up cookie parsing and formatting API

Make a tough call: we won't support old-style comma-separated Set-Cookie
headers. Real-world testing has shown that the latest RFC (6265) is
often violated in ways that make the parsing problem indeterminate.
Since such violations are much more common than the old-style, deprecated
Set-Cookie variant, we focus on the most useful case.
This commit is contained in:
Aldo Cortesi 2015-04-14 10:02:10 +12:00
parent 1a79ef8b6c
commit de9e741125
2 changed files with 196 additions and 33 deletions

View File

@ -1,13 +1,27 @@
""" """
A flexible module for cookie parsing and manipulation. A flexible module for cookie parsing and manipulation.
We try to be as permissive as possible. Parsing accepts formats from RFC6265 and This module differs from usual standards-compliant cookie modules in a number of
RFC2109. Serialization follows RFC6265 strictly. ways. We try to be as permissive as possible, and to retain even mal-formed
information. Duplicate cookies are preserved in parsing, and can be set in
formatting. We do attempt to escape and quote values where needed, but will not
reject data that violate the specs.
Parsing accepts the formats in RFC6265 and partially RFC2109 and RFC2965. We do
not parse the comma-separated variant of Set-Cookie that allows multiple cookies
to be set in a single header. Technically this should be feasible, but it turns
out that violations of RFC6265 that make the parsing problem indeterminate are
much more common than genuine occurrences of the multi-cookie variants.
Serialization follows RFC6265.
http://tools.ietf.org/html/rfc6265 http://tools.ietf.org/html/rfc6265
http://tools.ietf.org/html/rfc2109 http://tools.ietf.org/html/rfc2109
http://tools.ietf.org/html/rfc2965
""" """
# TODO
# - Disallow LHS-only Cookie values
import re import re
import odict import odict
@ -59,7 +73,7 @@ def _read_quoted_string(s, start):
return "".join(ret), i+1 return "".join(ret), i+1
def _read_value(s, start, special): def _read_value(s, start, delims):
""" """
Reads a value - the RHS of a token/value pair in a cookie. Reads a value - the RHS of a token/value pair in a cookie.
@ -70,37 +84,41 @@ def _read_value(s, start, special):
return "", start return "", start
elif s[start] == '"': elif s[start] == '"':
return _read_quoted_string(s, start) return _read_quoted_string(s, start)
elif special:
return _read_until(s, start, ";")
else: else:
return _read_until(s, start, ";,") return _read_until(s, start, delims)
def _read_pairs(s, specials=()): def _read_pairs(s, off=0, term=None, specials=()):
""" """
Read pairs of lhs=rhs values. Read pairs of lhs=rhs values.
specials: A lower-cased list of keys that may contain commas. off: start offset
term: if True, treat a comma as a terminator for the pairs lists
specials: a lower-cased list of keys that may contain commas if term is
True
""" """
off = 0
vals = [] vals = []
while 1: while 1:
lhs, off = _read_token(s, off) lhs, off = _read_token(s, off)
lhs = lhs.lstrip() lhs = lhs.lstrip()
rhs = None if lhs:
if off < len(s): rhs = None
if s[off] == "=": if off < len(s):
rhs, off = _read_value(s, off+1, lhs.lower() in specials) if s[off] == "=":
vals.append([lhs, rhs]) if term and lhs.lower() not in specials:
delims = ";,"
else:
delims = ";"
rhs, off = _read_value(s, off+1, delims)
vals.append([lhs, rhs])
off += 1 off += 1
if not off < len(s): if not off < len(s):
break break
if term and s[off-1] == ",":
break
return vals, off return vals, off
ESCAPE = re.compile(r"([\"\\])")
def _has_special(s): def _has_special(s):
for i in s: for i in s:
if i in '",;\\': if i in '",;\\':
@ -111,6 +129,9 @@ def _has_special(s):
return False return False
ESCAPE = re.compile(r"([\"\\])")
def _format_pairs(lst, specials=()): def _format_pairs(lst, specials=()):
""" """
specials: A lower-cased list of keys that will not be quoted. specials: A lower-cased list of keys that will not be quoted.
@ -127,25 +148,58 @@ def _format_pairs(lst, specials=()):
return "; ".join(vals) return "; ".join(vals)
def parse_cookies(s): def _format_set_cookie_pairs(lst):
return _format_pairs(
lst,
specials = ("expires", "path")
)
def _parse_set_cookie_pairs(s):
""" """
Parses a Cookie header value. For Set-Cookie, we support multiple cookies as described in RFC2109.
Returns an ODict object. This function therefore returns a list of lists.
""" """
pairs, off = _read_pairs(s) pairs, off = _read_pairs(
s,
specials = ("expires", "path")
)
return pairs
def parse_set_cookie_header(str):
"""
Parse a Set-Cookie header value
Returns a (name, value, attrs) tuple, or None, where attrs is an
ODictCaseless set of attributes. No attempt is made to parse attribute
values - they are treated purely as strings.
"""
pairs = _parse_set_cookie_pairs(str)
if pairs:
return pairs[0][0], pairs[0][1], odict.ODictCaseless(pairs[1:])
def format_set_cookie_header(name, value, attrs):
"""
Formats a Set-Cookie header value.
"""
pairs = [[name, value]]
pairs.extend(attrs.lst)
return _format_set_cookie_pairs(pairs)
def parse_cookie_header(str):
"""
Parse a Cookie header value.
Returns a (possibly empty) ODict object.
"""
pairs, off = _read_pairs(str)
return odict.ODict(pairs) return odict.ODict(pairs)
def unparse_cookies(od): def format_cookie_header(od):
""" """
Formats a Cookie header value. Formats a Cookie header value.
""" """
return _format_pairs(od.lst) return _format_pairs(od.lst)
def parse_set_cookies(s):
start = 0
def unparse_set_cookies(s):
pass

View File

@ -1,6 +1,8 @@
from netlib import http_cookies, odict import pprint
import nose.tools import nose.tools
from netlib import http_cookies, odict
def test_read_token(): def test_read_token():
tokens = [ tokens = [
@ -65,6 +67,10 @@ def test_read_pairs():
def test_pairs_roundtrips(): def test_pairs_roundtrips():
pairs = [ pairs = [
[
"",
[]
],
[ [
"one=uno", "one=uno",
[["one", "uno"]] [["one", "uno"]]
@ -110,5 +116,108 @@ def test_pairs_roundtrips():
nose.tools.eq_(ret, lst) nose.tools.eq_(ret, lst)
def test_parse_set_cookie(): def test_cookie_roundtrips():
pass pairs = [
[
"one=uno",
[["one", "uno"]]
],
[
"one=uno; two=due",
[["one", "uno"], ["two", "due"]]
],
]
for s, lst in pairs:
ret = http_cookies.parse_cookie_header(s)
nose.tools.eq_(ret.lst, lst)
s2 = http_cookies.format_cookie_header(ret)
ret = http_cookies.parse_cookie_header(s2)
nose.tools.eq_(ret.lst, lst)
# TODO
# I've seen the following pathological cookie in the wild:
#
# cid=09,0,0,0,0; expires=Wed, 10-Jun-2015 21:54:53 GMT; path=/
#
# It's not compliant under any RFC - the latest RFC prohibits commas in cookie
# values completely, earlier RFCs require them to be within a quoted string.
#
# If we ditch support for earlier RFCs, we can handle this correctly. This
# leaves us with the question: what's more common, multiple-value Set-Cookie
# headers, or Set-Cookie headers that violate the standards?
def test_parse_set_cookie_pairs():
pairs = [
[
"one=uno",
[
["one", "uno"]
]
],
[
"one=uno; foo",
[
["one", "uno"],
["foo", None]
]
],
[
"mun=1.390.f60; "
"expires=sun, 11-oct-2015 12:38:31 gmt; path=/; "
"domain=b.aol.com",
[
["mun", "1.390.f60"],
["expires", "sun, 11-oct-2015 12:38:31 gmt"],
["path", "/"],
["domain", "b.aol.com"]
]
],
[
r'rpb=190%3d1%2616726%3d1%2634832%3d1%2634874%3d1; '
'domain=.rubiconproject.com; '
'expires=mon, 11-may-2015 21:54:57 gmt; '
'path=/',
[
['rpb', r'190%3d1%2616726%3d1%2634832%3d1%2634874%3d1'],
['domain', '.rubiconproject.com'],
['expires', 'mon, 11-may-2015 21:54:57 gmt'],
['path', '/']
]
],
]
for s, lst in pairs:
ret = http_cookies._parse_set_cookie_pairs(s)
nose.tools.eq_(ret, lst)
s2 = http_cookies._format_set_cookie_pairs(ret)
ret2 = http_cookies._parse_set_cookie_pairs(s2)
nose.tools.eq_(ret2, lst)
def test_parse_set_cookie_header():
vals = [
[
"", None
],
[
"one=uno",
("one", "uno", [])
],
[
"one=uno; foo=bar",
("one", "uno", [["foo", "bar"]])
]
]
for s, expected in vals:
ret = http_cookies.parse_set_cookie_header(s)
if expected:
assert ret[0] == expected[0]
assert ret[1] == expected[1]
nose.tools.eq_(ret[2].lst, expected[2])
s2 = http_cookies.format_set_cookie_header(*ret)
ret2 = http_cookies.parse_set_cookie_header(s2)
assert ret2[0] == expected[0]
assert ret2[1] == expected[1]
nose.tools.eq_(ret2[2].lst, expected[2])
else:
assert ret is None