@@ -4,14 +4,14 @@ The httplib2 algorithms ported for use with requests.
 import re
 import calendar
 import time
 import datetime
+from email.utils import parsedate_tz

-from lib.requests.structures import CaseInsensitiveDict
+from requests.structures import CaseInsensitiveDict

 from .cache import DictCache
-from .compat import parsedate_tz
 from .serialize import Serializer


 URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?")
@@ -21,7 +21,7 @@ def parse_uri(uri):
         (scheme, authority, path, query, fragment) = parse_uri(uri)
     """
     groups = URI.match(uri).groups()
-    return groups[1], groups[3], groups[4], groups[6], groups[8]
+    return (groups[1], groups[3], groups[4], groups[6], groups[8])


 class CacheController(object):
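Note: for reference, a quick sketch of what parse_uri returns, based on the URI
regex and the return statement above (the URL is only an illustrative example):

    parse_uri("http://example.com/some/path?x=1#frag")
    # -> ('http', 'example.com', '/some/path', 'x=1', 'frag')
    #     scheme   authority     path          query  fragment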
@@ -32,26 +32,29 @@ class CacheController(object):
         self.cache_etags = cache_etags
         self.serializer = serializer or Serializer()

-    def _urlnorm(self, uri):
+    @classmethod
+    def _urlnorm(cls, uri):
         """Normalize the URL to create a safe key for the cache"""
         (scheme, authority, path, query, fragment) = parse_uri(uri)
         if not scheme or not authority:
             raise Exception("Only absolute URIs are allowed. uri = %s" % uri)
-        authority = authority.lower()
+
+        scheme = scheme.lower()
+        authority = authority.lower()

         if not path:
             path = "/"

         # Could do syntax based normalization of the URI before
         # computing the digest. See Section 6.2.2 of Std 66.
         request_uri = query and "?".join([path, query]) or path
-        scheme = scheme.lower()
         defrag_uri = scheme + "://" + authority + request_uri

         return defrag_uri

-    def cache_url(self, uri):
-        return self._urlnorm(uri)
+    @classmethod
+    def cache_url(cls, uri):
+        return cls._urlnorm(uri)
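Note: a small sketch of the cache key this normalization produces; the URL is
made up, and with the @classmethod change the call works on the class itself:

    CacheController.cache_url("HTTP://Example.COM/Some/Path?x=1#frag")
    # -> "http://example.com/Some/Path?x=1"
    # scheme and host are lowercased, the fragment is dropped, path case is kept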
     def parse_cache_control(self, headers):
         """
@@ -68,13 +71,20 @@ class CacheController(object):
         parts = headers[cc_header].split(',')
         parts_with_args = [
             tuple([x.strip().lower() for x in part.split("=", 1)])
-            for part in parts if -1 != part.find("=")]
-        parts_wo_args = [(name.strip().lower(), 1)
-                         for name in parts if -1 == name.find("=")]
+            for part in parts if -1 != part.find("=")
+        ]
+        parts_wo_args = [
+            (name.strip().lower(), 1)
+            for name in parts if -1 == name.find("=")
+        ]
         retval = dict(parts_with_args + parts_wo_args)
         return retval
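Note: a sketch of the dictionary parse_cache_control builds from a typical
header, following the parsing above (the header value is illustrative):

    headers = {'cache-control': 'max-age=3600, no-store'}
    # self.parse_cache_control(headers) -> {'max-age': '3600', 'no-store': 1}
    # directives with a value keep it as a string; bare directives map to 1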
     def cached_request(self, request):
         """
         Return a cached response if it exists in the cache, otherwise
         return False.
         """
         cache_url = self.cache_url(request.url)
         cc = self.parse_cache_control(request.headers)
@@ -95,7 +105,24 @@ class CacheController(object):
         if not resp:
             return False

+        # If we have a cached 301, return it immediately. We don't
+        # need to test our response for other headers b/c it is
+        # intrinsically "cacheable" as it is Permanent.
+        # See:
+        # https://tools.ietf.org/html/rfc7231#section-6.4.2
+        #
+        # Client can try to refresh the value by repeating the request
+        # with cache busting headers as usual (ie no-cache).
+        if resp.status == 301:
+            return resp
+
         headers = CaseInsensitiveDict(resp.headers)
+        if not headers or 'date' not in headers:
+            # Without a date or etag, the cached response can never be used
+            # and should be deleted.
+            if 'etag' not in headers:
+                self.cache.delete(cache_url)
+            return False

         now = time.time()
         date = calendar.timegm(
@@ -104,15 +131,19 @@ class CacheController(object):
         current_age = max(0, now - date)

         # TODO: There is an assumption that the result will be a
         # urllib3 response object. This may not be best since we
         # could probably avoid instantiating or constructing the
         # response until we know we need it.
         resp_cc = self.parse_cache_control(headers)

         # determine freshness
         freshness_lifetime = 0

         # Check the max-age pragma in the cache control header
         if 'max-age' in resp_cc and resp_cc['max-age'].isdigit():
             freshness_lifetime = int(resp_cc['max-age'])

         # If there isn't a max-age, check for an expires header
         elif 'expires' in headers:
             expires = parsedate_tz(headers['expires'])
             if expires is not None:
@@ -163,32 +194,24 @@ class CacheController(object):

         return new_headers

-    def cache_response(self, request, response):
+    def cache_response(self, request, response, body=None):
         """
         Algorithm for caching requests.

         This assumes a requests Response object.
         """
         # From httplib2: Don't cache 206's since we aren't going to
         # handle byte range requests
-        if response.status not in [200, 203]:
+        if response.status not in [200, 203, 300, 301]:
             return

         # Cache Session Params
         cache_auto = getattr(request, 'cache_auto', False)
         cache_urls = getattr(request, 'cache_urls', [])
         cache_max_age = getattr(request, 'cache_max_age', None)

         response_headers = CaseInsensitiveDict(response.headers)

         # Check if we are wanting to cache responses from specific urls only
         cache_url = self.cache_url(request.url)
         if len(cache_urls) > 0 and not any(s in cache_url for s in cache_urls):
             return

         cc_req = self.parse_cache_control(request.headers)
         cc = self.parse_cache_control(response_headers)

         cache_url = self.cache_url(request.url)

         # Delete it from the cache if we happen to have it stored there
         no_store = cc.get('no-store') or cc_req.get('no-store')
         if no_store and self.cache.get(cache_url):
@@ -196,21 +219,18 @@ class CacheController(object):

         # If we've been given an etag, then keep the response
         if self.cache_etags and 'etag' in response_headers:
             self.cache.set(cache_url, self.serializer.dumps(request, response))

         # If we want to cache sites not setup with cache headers then add the proper headers and keep the response
         elif cache_auto and not cc and response_headers:
             headers = {'Cache-Control': 'public,max-age=%d' % int(cache_max_age or 900)}
             response.headers.update(headers)

-            if 'expires' not in response_headers:
+            if getattr(response_headers, 'expires', None) is None:
                 expires = datetime.datetime.utcnow() + datetime.timedelta(days=1)
                 expires = expires.strftime("%a, %d %b %Y %H:%M:%S GMT")
                 headers = {'Expires': expires}
                 response.headers.update(headers)

-            self.cache.set(cache_url, self.serializer.dumps(request, response))
+            self.cache.set(
+                cache_url,
+                self.serializer.dumps(request, response, body=body),
+            )

+        # Add to the cache any 301s. We do this before looking at
+        # the Date headers.
+        elif response.status == 301:
+            self.cache.set(
+                cache_url,
+                self.serializer.dumps(request, response)
+            )

         # Add to the cache if the response headers demand it. If there
         # is no date header then we can't do anything about expiring
@@ -219,10 +239,10 @@ class CacheController(object):
             # cache when there is a max-age > 0
             if cc and cc.get('max-age'):
                 if int(cc['max-age']) > 0:
                     if isinstance(cache_max_age, int):
                         cc['max-age'] = int(cache_max_age)
                         response.headers['cache-control'] = ''.join(['%s=%s' % (key, value) for (key, value) in cc.items()])
-                    self.cache.set(cache_url, self.serializer.dumps(request, response))
+                    self.cache.set(
+                        cache_url,
+                        self.serializer.dumps(request, response, body=body),
+                    )

             # If the request can expire, it means we should cache it
             # in the meantime.
@@ -230,7 +250,7 @@ class CacheController(object):
                 if response_headers['expires']:
                     self.cache.set(
                         cache_url,
-                        self.serializer.dumps(request, response),
+                        self.serializer.dumps(request, response, body=body),
                     )

     def update_cached_response(self, request, response):
@@ -242,14 +262,30 @@ class CacheController(object):
         """
         cache_url = self.cache_url(request.url)

-        cached_response = self.serializer.loads(request, self.cache.get(cache_url))
+        cached_response = self.serializer.loads(
+            request,
+            self.cache.get(cache_url)
+        )

         if not cached_response:
             # we didn't have a cached response
             return response

-        # did so lets update our headers
-        cached_response.headers.update(response.headers)
+        # Let's update our headers with the headers from the new request:
+        # http://tools.ietf.org/html/draft-ietf-httpbis-p4-conditional-26#section-4.1
+        #
+        # The server isn't supposed to send headers that would make
+        # the cached body invalid. But... just in case, we'll be sure
+        # to strip out ones we know that might be problematic due to
+        # typical assumptions.
+        excluded_headers = [
+            "content-length",
+        ]
+
+        cached_response.headers.update(
+            dict((k, v) for k, v in response.headers.items()
+                 if k.lower() not in excluded_headers)
+        )

         # we want a 200 b/c we have content via the cache
         cached_response.status = 200
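Note: a minimal sketch of how these methods are meant to fit together, assuming
the package is importable as cachecontrol (the relative imports above suggest
that; a vendored copy may live under a different path). The stub request and
response below are hypothetical stand-ins for real requests/urllib3 objects and
only exercise the cache-miss paths:

    from cachecontrol.cache import DictCache
    from cachecontrol.controller import CacheController

    class StubRequest(object):
        # Just enough of a request for the controller: a URL and a headers mapping.
        url = "http://example.com/feed"
        headers = {}

    class StubResponse(object):
        # Just enough of a urllib3-style response: a status code and headers.
        status = 200
        headers = {}

    controller = CacheController(DictCache())
    request, response = StubRequest(), StubResponse()

    controller.cached_request(request)                      # -> False, nothing cached yet
    controller.cache_response(request, response, body=b"")  # no caching headers, so nothing is stored
    controller.update_cached_response(request, response)    # no cached entry, so we get `response` back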