You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

450 lines
14 KiB

#!/usr/bin/env python
#
# Copyright 2009 Facebook
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""HTTP utility code shared by clients and servers."""
12 years ago
from __future__ import absolute_import, division, print_function, with_statement
13 years ago
12 years ago
import calendar
import collections
12 years ago
import datetime
12 years ago
import email.utils
12 years ago
import numbers
import time
13 years ago
from tornado.escape import native_str, parse_qs_bytes, utf8
from tornado.log import gen_log
12 years ago
from tornado.util import ObjectDict
try:
from httplib import responses # py2
except ImportError:
from http.client import responses # py3
# responses is unused in this file, but we re-export it to other files.
# Reference it so pyflakes doesn't complain.
responses
12 years ago
try:
from urllib import urlencode # py2
except ImportError:
from urllib.parse import urlencode # py3
13 years ago
12 years ago
class _NormalizedHeaderCache(dict):
"""Dynamic cached mapping of header names to Http-Header-Case.
Implemented as a dict subclass so that cache hits are as fast as a
normal dict lookup, without the overhead of a python function
call.
>>> normalized_headers = _NormalizedHeaderCache(10)
>>> normalized_headers["coNtent-TYPE"]
'Content-Type'
"""
def __init__(self, size):
super(_NormalizedHeaderCache, self).__init__()
self.size = size
self.queue = collections.deque()
def __missing__(self, key):
normalized = "-".join([w.capitalize() for w in key.split("-")])
self[key] = normalized
self.queue.append(key)
if len(self.queue) > self.size:
# Limit the size of the cache. LRU would be better, but this
# simpler approach should be fine. In Python 2.7+ we could
# use OrderedDict (or in 3.2+, @functools.lru_cache).
old_key = self.queue.popleft()
del self[old_key]
return normalized
_normalized_headers = _NormalizedHeaderCache(1000)
class HTTPHeaders(dict):
"""A dictionary that maintains ``Http-Header-Case`` for all keys.
Supports multiple values per key via a pair of new methods,
`add()` and `get_list()`. The regular dictionary interface
returns a single value per key, with multiple values joined by a
comma.
>>> h = HTTPHeaders({"content-type": "text/html"})
12 years ago
>>> list(h.keys())
['Content-Type']
>>> h["Content-Type"]
'text/html'
>>> h.add("Set-Cookie", "A=B")
>>> h.add("Set-Cookie", "C=D")
>>> h["set-cookie"]
'A=B,C=D'
>>> h.get_list("set-cookie")
['A=B', 'C=D']
>>> for (k,v) in sorted(h.get_all()):
12 years ago
... print('%s: %s' % (k,v))
...
Content-Type: text/html
Set-Cookie: A=B
Set-Cookie: C=D
"""
def __init__(self, *args, **kwargs):
# Don't pass args or kwargs to dict.__init__, as it will bypass
# our __setitem__
dict.__init__(self)
self._as_list = {}
self._last_key = None
13 years ago
if (len(args) == 1 and len(kwargs) == 0 and
12 years ago
isinstance(args[0], HTTPHeaders)):
13 years ago
# Copy constructor
for k, v in args[0].get_all():
self.add(k, v)
else:
# Dict-style initialization
self.update(*args, **kwargs)
# new public methods
def add(self, name, value):
"""Adds a new value for the given key."""
12 years ago
norm_name = _normalized_headers[name]
self._last_key = norm_name
if norm_name in self:
# bypass our override of __setitem__ since it modifies _as_list
12 years ago
dict.__setitem__(self, norm_name,
native_str(self[norm_name]) + ',' +
native_str(value))
self._as_list[norm_name].append(value)
else:
self[norm_name] = value
def get_list(self, name):
"""Returns all values for the given header as a list."""
12 years ago
norm_name = _normalized_headers[name]
return self._as_list.get(norm_name, [])
def get_all(self):
"""Returns an iterable of all (name, value) pairs.
If a header has multiple values, multiple pairs will be
returned with the same name.
"""
12 years ago
for name, values in self._as_list.items():
for value in values:
yield (name, value)
def parse_line(self, line):
"""Updates the dictionary with a single header line.
>>> h = HTTPHeaders()
>>> h.parse_line("Content-Type: text/html")
>>> h.get('content-type')
'text/html'
"""
if line[0].isspace():
# continuation of a multi-line header
new_part = ' ' + line.lstrip()
self._as_list[self._last_key][-1] += new_part
dict.__setitem__(self, self._last_key,
self[self._last_key] + new_part)
else:
name, value = line.split(":", 1)
self.add(name, value.strip())
@classmethod
def parse(cls, headers):
"""Returns a dictionary from HTTP header text.
>>> h = HTTPHeaders.parse("Content-Type: text/html\\r\\nContent-Length: 42\\r\\n")
12 years ago
>>> sorted(h.items())
[('Content-Length', '42'), ('Content-Type', 'text/html')]
"""
h = cls()
for line in headers.splitlines():
if line:
h.parse_line(line)
return h
# dict implementation overrides
def __setitem__(self, name, value):
12 years ago
norm_name = _normalized_headers[name]
dict.__setitem__(self, norm_name, value)
self._as_list[norm_name] = [value]
def __getitem__(self, name):
12 years ago
return dict.__getitem__(self, _normalized_headers[name])
def __delitem__(self, name):
12 years ago
norm_name = _normalized_headers[name]
dict.__delitem__(self, norm_name)
del self._as_list[norm_name]
def __contains__(self, name):
12 years ago
norm_name = _normalized_headers[name]
return dict.__contains__(self, norm_name)
def get(self, name, default=None):
12 years ago
return dict.get(self, _normalized_headers[name], default)
def update(self, *args, **kwargs):
# dict.update bypasses our __setitem__
12 years ago
for k, v in dict(*args, **kwargs).items():
self[k] = v
13 years ago
def copy(self):
# default implementation returns dict(self), not the subclass
return HTTPHeaders(self)
def url_concat(url, args):
"""Concatenate url and argument dictionary regardless of whether
url has existing query parameters.
>>> url_concat("http://example.com/foo?a=b", dict(c="d"))
'http://example.com/foo?a=b&c=d'
"""
13 years ago
if not args:
return url
if url[-1] not in ('?', '&'):
url += '&' if ('?' in url) else '?'
12 years ago
return url + urlencode(args)
class HTTPFile(ObjectDict):
"""Represents a file uploaded via a form.
For backwards compatibility, its instance attributes are also
accessible as dictionary keys.
* ``filename``
* ``body``
* ``content_type``
"""
pass
12 years ago
def _parse_request_range(range_header):
"""Parses a Range header.
Returns either ``None`` or tuple ``(start, end)``.
Note that while the HTTP headers use inclusive byte positions,
this method returns indexes suitable for use in slices.
>>> start, end = _parse_request_range("bytes=1-2")
>>> start, end
(1, 3)
>>> [0, 1, 2, 3, 4][start:end]
[1, 2]
>>> _parse_request_range("bytes=6-")
(6, None)
>>> _parse_request_range("bytes=-6")
(-6, None)
>>> _parse_request_range("bytes=-0")
(None, 0)
>>> _parse_request_range("bytes=")
(None, None)
>>> _parse_request_range("foo=42")
>>> _parse_request_range("bytes=1-2,6-10")
Note: only supports one range (ex, ``bytes=1-2,6-10`` is not allowed).
See [0] for the details of the range header.
[0]: http://greenbytes.de/tech/webdav/draft-ietf-httpbis-p5-range-latest.html#byte.ranges
"""
unit, _, value = range_header.partition("=")
unit, value = unit.strip(), value.strip()
if unit != "bytes":
return None
start_b, _, end_b = value.partition("-")
try:
start = _int_or_none(start_b)
end = _int_or_none(end_b)
except ValueError:
return None
if end is not None:
if start is None:
if end != 0:
start = -end
end = None
else:
end += 1
return (start, end)
def _get_content_range(start, end, total):
"""Returns a suitable Content-Range header:
>>> print(_get_content_range(None, 1, 4))
bytes 0-0/4
>>> print(_get_content_range(1, 3, 4))
bytes 1-2/4
>>> print(_get_content_range(None, None, 4))
bytes 0-3/4
"""
start = start or 0
end = (end or total) - 1
return "bytes %s-%s/%s" % (start, end, total)
def _int_or_none(val):
val = val.strip()
if val == "":
return None
return int(val)
13 years ago
def parse_body_arguments(content_type, body, arguments, files):
"""Parses a form request body.
Supports ``application/x-www-form-urlencoded`` and
``multipart/form-data``. The ``content_type`` parameter should be
a string and ``body`` should be a byte string. The ``arguments``
and ``files`` parameters are dictionaries that will be updated
with the parsed contents.
"""
13 years ago
if content_type.startswith("application/x-www-form-urlencoded"):
12 years ago
try:
uri_arguments = parse_qs_bytes(native_str(body), keep_blank_values=True)
except Exception as e:
gen_log.warning('Invalid x-www-form-urlencoded body: %s', e)
uri_arguments = {}
12 years ago
for name, values in uri_arguments.items():
13 years ago
if values:
arguments.setdefault(name, []).extend(values)
elif content_type.startswith("multipart/form-data"):
fields = content_type.split(";")
for field in fields:
k, sep, v = field.strip().partition("=")
if k == "boundary" and v:
parse_multipart_form_data(utf8(v), body, arguments, files)
break
else:
gen_log.warning("Invalid multipart/form-data")
13 years ago
def parse_multipart_form_data(boundary, data, arguments, files):
"""Parses a ``multipart/form-data`` body.
The ``boundary`` and ``data`` parameters are both byte strings.
The dictionaries given in the arguments and files parameters
will be updated with the contents of the body.
"""
# The standard allows for the boundary to be quoted in the header,
# although it's rare (it happens at least for google app engine
# xmpp). I think we're also supposed to handle backslash-escapes
# here but I'll save that until we see a client that uses them
# in the wild.
12 years ago
if boundary.startswith(b'"') and boundary.endswith(b'"'):
boundary = boundary[1:-1]
12 years ago
final_boundary_index = data.rfind(b"--" + boundary + b"--")
13 years ago
if final_boundary_index == -1:
gen_log.warning("Invalid multipart/form-data: no final boundary")
13 years ago
return
12 years ago
parts = data[:final_boundary_index].split(b"--" + boundary + b"\r\n")
for part in parts:
13 years ago
if not part:
continue
12 years ago
eoh = part.find(b"\r\n\r\n")
if eoh == -1:
gen_log.warning("multipart/form-data missing headers")
continue
headers = HTTPHeaders.parse(part[:eoh].decode("utf-8"))
disp_header = headers.get("Content-Disposition", "")
disposition, disp_params = _parse_header(disp_header)
12 years ago
if disposition != "form-data" or not part.endswith(b"\r\n"):
gen_log.warning("Invalid multipart/form-data")
continue
value = part[eoh + 4:-2]
if not disp_params.get("name"):
gen_log.warning("multipart/form-data value missing name")
continue
name = disp_params["name"]
if disp_params.get("filename"):
ctype = headers.get("Content-Type", "application/unknown")
files.setdefault(name, []).append(HTTPFile(
filename=disp_params["filename"], body=value,
content_type=ctype))
else:
arguments.setdefault(name, []).append(value)
12 years ago
def format_timestamp(ts):
"""Formats a timestamp in the format used by HTTP.
The argument may be a numeric timestamp as returned by `time.time`,
a time tuple as returned by `time.gmtime`, or a `datetime.datetime`
12 years ago
object.
>>> format_timestamp(1359312200)
'Sun, 27 Jan 2013 18:43:20 GMT'
"""
12 years ago
if isinstance(ts, numbers.Real):
12 years ago
pass
12 years ago
elif isinstance(ts, (tuple, time.struct_time)):
ts = calendar.timegm(ts)
12 years ago
elif isinstance(ts, datetime.datetime):
12 years ago
ts = calendar.timegm(ts.utctimetuple())
12 years ago
else:
raise TypeError("unknown timestamp type: %r" % ts)
12 years ago
return email.utils.formatdate(ts, usegmt=True)
12 years ago
# _parseparam and _parse_header are copied and modified from python2.7's cgi.py
# The original 2.7 version of this code did not correctly support some
# combinations of semicolons and double quotes.
def _parseparam(s):
while s[:1] == ';':
s = s[1:]
end = s.find(';')
while end > 0 and (s.count('"', 0, end) - s.count('\\"', 0, end)) % 2:
end = s.find(';', end + 1)
if end < 0:
end = len(s)
f = s[:end]
yield f.strip()
s = s[end:]
13 years ago
def _parse_header(line):
"""Parse a Content-type like header.
Return the main content-type and a dictionary of options.
"""
parts = _parseparam(';' + line)
12 years ago
key = next(parts)
pdict = {}
for p in parts:
i = p.find('=')
if i >= 0:
name = p[:i].strip().lower()
13 years ago
value = p[i + 1:].strip()
if len(value) >= 2 and value[0] == value[-1] == '"':
value = value[1:-1]
value = value.replace('\\\\', '\\').replace('\\"', '"')
pdict[name] = value
return key, pdict
def doctests():
import doctest
return doctest.DocTestSuite()