From 12f8a4b9567a6dc64443e1c7eba53a641a0a1616 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Diego=20Elio=20Petten=C3=B2?= Date: Mon, 13 Apr 2020 22:55:26 +0100 Subject: [PATCH 01/19] Fix up typo. --- cachecontrol/controller.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cachecontrol/controller.py b/cachecontrol/controller.py index c5c4a508..a57a585f 100644 --- a/cachecontrol/controller.py +++ b/cachecontrol/controller.py @@ -312,7 +312,7 @@ def cache_response(self, request, response, body=None, status_codes=None): # Add to the cache any 301s. We do this before looking that # the Date headers. elif response.status == 301: - logger.debug("Caching permanant redirect") + logger.debug("Caching permanent redirect") self.cache.set(cache_url, self.serializer.dumps(request, response)) # Add to the cache if the response headers demand it. If there From 56cd94b143ee1b9081205ad55acf9f7b56250173 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Diego=20Elio=20Petten=C3=B2?= Date: Mon, 13 Apr 2020 22:57:03 +0100 Subject: [PATCH 02/19] Include HTTP 308 as a permanent redirect. The new status code was introduced with RFC 7538 in April 2015. This makes it so that "308 Permanent Redirect" status codes are treated the same as "301 Moved Permanently" statuses. 
--- cachecontrol/adapter.py | 4 ++-- cachecontrol/controller.py | 19 +++++++++++-------- tests/test_cache_control.py | 12 ++++++------ 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/cachecontrol/adapter.py b/cachecontrol/adapter.py index de50006a..b2e4f012 100644 --- a/cachecontrol/adapter.py +++ b/cachecontrol/adapter.py @@ -4,7 +4,7 @@ from requests.adapters import HTTPAdapter -from .controller import CacheController +from .controller import CacheController, PERMANENT_REDIRECT_STATUSES from .cache import DictCache from .filewrapper import CallbackFileWrapper @@ -93,7 +93,7 @@ def build_response( response = cached_response # We always cache the 301 responses - elif response.status == 301: + elif int(response.status) in PERMANENT_REDIRECT_STATUSES: self.controller.cache_response(request, response) else: # Wrap the response file with a wrapper that will cache the diff --git a/cachecontrol/controller.py b/cachecontrol/controller.py index a57a585f..155f3cf1 100644 --- a/cachecontrol/controller.py +++ b/cachecontrol/controller.py @@ -17,6 +17,8 @@ URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?") +PERMANENT_REDIRECT_STATUSES = (301, 308) + def parse_uri(uri): """Parses a URI using the regex given in Appendix B of RFC 3986. @@ -37,7 +39,7 @@ def __init__( self.cache = DictCache() if cache is None else cache self.cache_etags = cache_etags self.serializer = serializer or Serializer() - self.cacheable_status_codes = status_codes or (200, 203, 300, 301) + self.cacheable_status_codes = status_codes or (200, 203, 300, 301, 308) @classmethod def _urlnorm(cls, uri): @@ -147,17 +149,18 @@ def cached_request(self, request): logger.warning("Cache entry deserialization failed, entry ignored") return False - # If we have a cached 301, return it immediately. We don't - # need to test our response for other headers b/c it is + # If we have a cached permanent redirect, return it immediately. 
We + # don't need to test our response for other headers b/c it is # intrinsically "cacheable" as it is Permanent. + # # See: # https://tools.ietf.org/html/rfc7231#section-6.4.2 # # Client can try to refresh the value by repeating the request # with cache busting headers as usual (ie no-cache). - if resp.status == 301: + if int(resp.status) in PERMANENT_REDIRECT_STATUSES: msg = ( - 'Returning cached "301 Moved Permanently" response ' + 'Returning cached permanent redirect response ' "(ignoring date and etag information)" ) logger.debug(msg) @@ -309,9 +312,9 @@ def cache_response(self, request, response, body=None, status_codes=None): cache_url, self.serializer.dumps(request, response, body=body) ) - # Add to the cache any 301s. We do this before looking that - # the Date headers. - elif response.status == 301: + # Add to the cache any permanent redirects. We do this before looking + # that the Date headers. + elif int(response.status) in PERMANENT_REDIRECT_STATUSES: logger.debug("Caching permanent redirect") self.cache.set(cache_url, self.serializer.dumps(request, response)) diff --git a/tests/test_cache_control.py b/tests/test_cache_control.py index 7ede3713..adbc64d2 100644 --- a/tests/test_cache_control.py +++ b/tests/test_cache_control.py @@ -152,7 +152,7 @@ def req(self, headers): return self.c.cached_request(mock_request) def test_cache_request_no_headers(self): - cached_resp = Mock(headers={"ETag": "jfd9094r808", "Content-Length": 100}) + cached_resp = Mock(headers={"ETag": "jfd9094r808", "Content-Length": 100}, status=200) self.c.cache = DictCache({self.url: cached_resp}) resp = self.req({}) assert not resp @@ -179,7 +179,7 @@ def test_cache_request_not_in_cache(self): def test_cache_request_fresh_max_age(self): now = time.strftime(TIME_FMT, time.gmtime()) - resp = Mock(headers={"cache-control": "max-age=3600", "date": now}) + resp = Mock(headers={"cache-control": "max-age=3600", "date": now}, status=200) cache = DictCache({self.url: resp}) self.c.cache = 
cache @@ -189,7 +189,7 @@ def test_cache_request_fresh_max_age(self): def test_cache_request_unfresh_max_age(self): earlier = time.time() - 3700 # epoch - 1h01m40s now = time.strftime(TIME_FMT, time.gmtime(earlier)) - resp = Mock(headers={"cache-control": "max-age=3600", "date": now}) + resp = Mock(headers={"cache-control": "max-age=3600", "date": now}, status=200) self.c.cache = DictCache({self.url: resp}) r = self.req({}) assert not r @@ -198,7 +198,7 @@ def test_cache_request_fresh_expires(self): later = time.time() + 86400 # GMT + 1 day expires = time.strftime(TIME_FMT, time.gmtime(later)) now = time.strftime(TIME_FMT, time.gmtime()) - resp = Mock(headers={"expires": expires, "date": now}) + resp = Mock(headers={"expires": expires, "date": now}, status=200) cache = DictCache({self.url: resp}) self.c.cache = cache r = self.req({}) @@ -208,7 +208,7 @@ def test_cache_request_unfresh_expires(self): sooner = time.time() - 86400 # GMT - 1 day expires = time.strftime(TIME_FMT, time.gmtime(sooner)) now = time.strftime(TIME_FMT, time.gmtime()) - resp = Mock(headers={"expires": expires, "date": now}) + resp = Mock(headers={"expires": expires, "date": now}, status=200) cache = DictCache({self.url: resp}) self.c.cache = cache r = self.req({}) @@ -217,7 +217,7 @@ def test_cache_request_unfresh_expires(self): def test_cached_request_with_bad_max_age_headers_not_returned(self): now = time.strftime(TIME_FMT, time.gmtime()) # Not a valid header; this would be from a misconfigured server - resp = Mock(headers={"cache-control": "max-age=xxx", "date": now}) + resp = Mock(headers={"cache-control": "max-age=xxx", "date": now}, status=200) self.c.cache = DictCache({self.url: resp}) From db0dcc212ba5fef8bec6f11b926138cac6cc51a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Diego=20Elio=20Petten=C3=B2?= Date: Mon, 13 Apr 2020 23:15:38 +0100 Subject: [PATCH 03/19] Add some more debug logging for FileCache and for the pass-through path. 
This makes it easier to figure out _why_ something fails to look up altogether. --- cachecontrol/caches/file_cache.py | 11 +++++++++-- cachecontrol/controller.py | 9 +++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/cachecontrol/caches/file_cache.py b/cachecontrol/caches/file_cache.py index 607b9452..9773581f 100644 --- a/cachecontrol/caches/file_cache.py +++ b/cachecontrol/caches/file_cache.py @@ -1,4 +1,5 @@ import hashlib +import logging import os from textwrap import dedent @@ -12,6 +13,9 @@ FileNotFoundError = (IOError, OSError) +logger = logging.getLogger(__name__) + + def _secure_open_write(filename, fmode): # We only want to write to this file, so open it in write only mode flags = os.O_WRONLY @@ -107,6 +111,7 @@ def _fn(self, name): def get(self, key): name = self._fn(key) + logger.debug("Looking up '%s' in '%s'", key, name) try: with open(name, "rb") as fh: return fh.read() @@ -116,12 +121,14 @@ def get(self, key): def set(self, key, value): name = self._fn(key) + logger.debug("Caching '%s' in '%s'", key, name) # Make sure the directory exists + parentdir = os.path.dirname(name) try: - os.makedirs(os.path.dirname(name), self.dirmode) + os.makedirs(parentdir, self.dirmode) except (IOError, OSError): - pass + logging.debug("Error trying to create directory '%s'", parentdir, exc_info=True) with self.lock_class(name) as lock: # Write our actual file diff --git a/cachecontrol/controller.py b/cachecontrol/controller.py index 155f3cf1..c8970bcd 100644 --- a/cachecontrol/controller.py +++ b/cachecontrol/controller.py @@ -280,7 +280,7 @@ def cache_response(self, request, response, body=None, status_codes=None): cc = self.parse_cache_control(response_headers) cache_url = self.cache_url(request.url) - logger.debug('Updating cache with response from "%s"', cache_url) + logger.debug('Updating cache %r with response from "%s"', self.cache, cache_url) # Delete it from the cache if we happen to have it stored there no_store = False @@ -321,7 
+321,10 @@ def cache_response(self, request, response, body=None, status_codes=None): # Add to the cache if the response headers demand it. If there # is no date header then we can't do anything about expiring # the cache. - elif "date" in response_headers: + elif "date" not in response_headers: + logger.debug("No date header, expiration cannot be set.") + return + else: # cache when there is a max-age > 0 if "max-age" in cc and cc["max-age"] > 0: logger.debug("Caching b/c date exists and max-age > 0") @@ -337,6 +340,8 @@ def cache_response(self, request, response, body=None, status_codes=None): self.cache.set( cache_url, self.serializer.dumps(request, response, body=body) ) + else: + logger.debug("No combination of headers to cache.") def update_cached_response(self, request, response): """On a 304 we will get a new set of headers that we want to From a8241b946cc9bec4711a50825fe24206b352c027 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Diego=20Elio=20Petten=C3=B2?= Date: Mon, 13 Apr 2020 23:37:58 +0100 Subject: [PATCH 04/19] Make `Serializer.dumps()` require a `body` parameter. When caching permanent redirects, if `body` is left to `None`, there's an infinite recursion that will lead to the caching to silently fail and not cache anything at all. So instead, make `body` a required parameter, which can be empty (`''`) for cached redirects. 
--- cachecontrol/controller.py | 13 +++++++------ cachecontrol/serialize.py | 20 ++++---------------- tests/test_cache_control.py | 4 ++-- tests/test_serialization.py | 6 +++--- 4 files changed, 16 insertions(+), 27 deletions(-) diff --git a/cachecontrol/controller.py b/cachecontrol/controller.py index c8970bcd..d28f2a2a 100644 --- a/cachecontrol/controller.py +++ b/cachecontrol/controller.py @@ -280,7 +280,7 @@ def cache_response(self, request, response, body=None, status_codes=None): cc = self.parse_cache_control(response_headers) cache_url = self.cache_url(request.url) - logger.debug('Updating cache %r with response from "%s"', self.cache, cache_url) + logger.debug('Updating cache with response from "%s"', cache_url) # Delete it from the cache if we happen to have it stored there no_store = False @@ -309,14 +309,14 @@ def cache_response(self, request, response, body=None, status_codes=None): if self.cache_etags and "etag" in response_headers: logger.debug("Caching due to etag") self.cache.set( - cache_url, self.serializer.dumps(request, response, body=body) + cache_url, self.serializer.dumps(request, response, body) ) # Add to the cache any permanent redirects. We do this before looking # that the Date headers. elif int(response.status) in PERMANENT_REDIRECT_STATUSES: logger.debug("Caching permanent redirect") - self.cache.set(cache_url, self.serializer.dumps(request, response)) + self.cache.set(cache_url, self.serializer.dumps(request, response, b'')) # Add to the cache if the response headers demand it. 
If there # is no date header then we can't do anything about expiring @@ -329,7 +329,7 @@ def cache_response(self, request, response, body=None, status_codes=None): if "max-age" in cc and cc["max-age"] > 0: logger.debug("Caching b/c date exists and max-age > 0") self.cache.set( - cache_url, self.serializer.dumps(request, response, body=body) + cache_url, self.serializer.dumps(request, response, body) ) # If the request can expire, it means we should cache it @@ -338,7 +338,7 @@ def cache_response(self, request, response, body=None, status_codes=None): if response_headers["expires"]: logger.debug("Caching b/c of expires header") self.cache.set( - cache_url, self.serializer.dumps(request, response, body=body) + cache_url, self.serializer.dumps(request, response, body) ) else: logger.debug("No combination of headers to cache.") @@ -379,6 +379,7 @@ def update_cached_response(self, request, response): cached_response.status = 200 # update our cache - self.cache.set(cache_url, self.serializer.dumps(request, cached_response)) + body = cached_response.read(decode_content=False) + self.cache.set(cache_url, self.serializer.dumps(request, cached_response, body)) return cached_response diff --git a/cachecontrol/serialize.py b/cachecontrol/serialize.py index 572cf0e6..6ab23314 100644 --- a/cachecontrol/serialize.py +++ b/cachecontrol/serialize.py @@ -17,26 +17,14 @@ def _b64_decode_str(s): return _b64_decode_bytes(s).decode("utf8") +_default_body_read = object() + + class Serializer(object): - def dumps(self, request, response, body=None): + def dumps(self, request, response, body): response_headers = CaseInsensitiveDict(response.headers) - if body is None: - body = response.read(decode_content=False) - - # NOTE: 99% sure this is dead code. I'm only leaving it - # here b/c I don't have a test yet to prove - # it. Basically, before using - # `cachecontrol.filewrapper.CallbackFileWrapper`, - # this made an effort to reset the file handle. 
The - # `CallbackFileWrapper` short circuits this code by - # setting the body as the content is consumed, the - # result being a `body` argument is *always* passed - # into cache_response, and in turn, - # `Serializer.dump`. - response._fp = io.BytesIO(body) - # NOTE: This is all a bit weird, but it's really important that on # Python 2.x these objects are unicode and not str, even when # they contain only ascii. The problem here is that msgpack diff --git a/tests/test_cache_control.py b/tests/test_cache_control.py index adbc64d2..2e307c69 100644 --- a/tests/test_cache_control.py +++ b/tests/test_cache_control.py @@ -67,7 +67,7 @@ def test_no_cache_with_wrong_sized_body(self, cc): # When the body is the wrong size, then we don't want to cache it # because it is obviously broken. resp = self.resp({"cache-control": "max-age=3600", "Content-Length": "5"}) - cc.cache_response(self.req(), resp, body=b"0" * 10) + cc.cache_response(self.req(), resp, b"0" * 10) assert not cc.cache.set.called @@ -82,7 +82,7 @@ def test_cache_response_cache_max_age(self, cc): resp = self.resp({"cache-control": "max-age=3600", "date": now}) req = self.req() cc.cache_response(req, resp) - cc.serializer.dumps.assert_called_with(req, resp, body=None) + cc.serializer.dumps.assert_called_with(req, resp, None) cc.cache.set.assert_called_with(self.url, ANY) def test_cache_response_cache_max_age_with_invalid_value_not_cached(self, cc): diff --git a/tests/test_serialization.py b/tests/test_serialization.py index 8d8ade4c..7c342de9 100644 --- a/tests/test_serialization.py +++ b/tests/test_serialization.py @@ -89,7 +89,7 @@ def test_read_latest_version_streamable(self, url): original_resp = requests.get(url, stream=True) req = original_resp.request - resp = self.serializer.loads(req, self.serializer.dumps(req, original_resp.raw)) + resp = self.serializer.loads(req, self.serializer.dumps(req, original_resp.raw, original_resp.content)) assert resp.read() @@ -99,7 +99,7 @@ def 
test_read_latest_version(self, url): req = original_resp.request resp = self.serializer.loads( - req, self.serializer.dumps(req, original_resp.raw, body=data) + req, self.serializer.dumps(req, original_resp.raw, data) ) assert resp.read() == data @@ -114,5 +114,5 @@ def test_no_vary_header(self, url): original_resp.raw.headers["vary"] = "Foo" assert self.serializer.loads( - req, self.serializer.dumps(req, original_resp.raw, body=data) + req, self.serializer.dumps(req, original_resp.raw, data) ) From 39d7bbd34e0f43ac2b6bf82a776f7ca8a8fc4d0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Diego=20Elio=20Petten=C3=B2?= Date: Fri, 17 Apr 2020 14:22:42 +0100 Subject: [PATCH 05/19] Remove test_stream that appears to be a duplicate of test_chunked_response. --- tests/test_stream.py | 23 ----------------------- 1 file changed, 23 deletions(-) delete mode 100644 tests/test_stream.py diff --git a/tests/test_stream.py b/tests/test_stream.py deleted file mode 100644 index 964c5874..00000000 --- a/tests/test_stream.py +++ /dev/null @@ -1,23 +0,0 @@ -""" -Test for supporting streamed responses (Transfer-Encoding: chunked) -""" -import requests - -from cachecontrol import CacheControl - - -class TestStream(object): - - def setup(self): - self.sess = CacheControl(requests.Session()) - - def test_stream_is_cached(self, url): - resp_1 = self.sess.get(url + "stream") - content_1 = resp_1.content - - resp_2 = self.sess.get(url + "stream") - content_2 = resp_1.content - - assert not resp_1.from_cache - assert resp_2.from_cache - assert content_1 == content_2 From ccd14ec9a43a7160a8c254e0b41de7259bee702c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Diego=20Elio=20Petten=C3=B2?= Date: Fri, 17 Apr 2020 14:23:48 +0100 Subject: [PATCH 06/19] Add an explicit encoding to test_chunked_response. This is to workaround an isort bug that appears fixed in master, where the Transfer-Encoding: chunked line is interpreted as an encoding for the file. 
--- tests/test_chunked_response.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_chunked_response.py b/tests/test_chunked_response.py index 763a0e60..715ae65a 100644 --- a/tests/test_chunked_response.py +++ b/tests/test_chunked_response.py @@ -1,3 +1,4 @@ +# encoding: utf-8 """ Test for supporting streamed responses (Transfer-Encoding: chunked) """ From 0621c6ab01653b8b35c61cde466dcbe0c029150c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Diego=20Elio=20Petten=C3=B2?= Date: Fri, 17 Apr 2020 14:27:32 +0100 Subject: [PATCH 07/19] Use [isort](https://github.com/timothycrosley/isort) throughout the source. --- cachecontrol/__init__.py | 2 +- cachecontrol/_cmd.py | 3 +-- cachecontrol/adapter.py | 4 ++-- cachecontrol/caches/redis_cache.py | 1 + cachecontrol/controller.py | 3 +-- cachecontrol/heuristics.py | 4 +--- dev_requirements.txt | 17 +++++++++-------- docs/conf.py | 3 ++- examples/benchmark.py | 9 +++++---- pyproject.toml | 3 +++ tests/conftest.py | 6 ++---- tests/test_adapter.py | 2 +- tests/test_cache_control.py | 4 ++-- tests/test_etag.py | 4 +--- tests/test_expires_heuristics.py | 11 ++++------- tests/test_max_age.py | 3 ++- tests/test_regressions.py | 4 ++-- tests/test_serialization.py | 1 - tests/test_storage_filecache.py | 6 +++--- tests/test_storage_redis.py | 1 + tests/test_vary.py | 4 ++-- 21 files changed, 46 insertions(+), 49 deletions(-) create mode 100644 pyproject.toml diff --git a/cachecontrol/__init__.py b/cachecontrol/__init__.py index a1bbbbe3..de960ed2 100644 --- a/cachecontrol/__init__.py +++ b/cachecontrol/__init__.py @@ -6,6 +6,6 @@ __email__ = "eric@ionrock.org" __version__ = "0.12.6" -from .wrapper import CacheControl from .adapter import CacheControlAdapter from .controller import CacheController +from .wrapper import CacheControl diff --git a/cachecontrol/_cmd.py b/cachecontrol/_cmd.py index ee8d60d1..42d40fa5 100644 --- a/cachecontrol/_cmd.py +++ b/cachecontrol/_cmd.py @@ -1,4 +1,5 @@ import logging +from argparse import 
ArgumentParser import requests @@ -6,8 +7,6 @@ from cachecontrol.cache import DictCache from cachecontrol.controller import logger -from argparse import ArgumentParser - def setup_logging(): logger.setLevel(logging.DEBUG) diff --git a/cachecontrol/adapter.py b/cachecontrol/adapter.py index b2e4f012..df81d1f2 100644 --- a/cachecontrol/adapter.py +++ b/cachecontrol/adapter.py @@ -1,11 +1,11 @@ -import types import functools +import types import zlib from requests.adapters import HTTPAdapter -from .controller import CacheController, PERMANENT_REDIRECT_STATUSES from .cache import DictCache +from .controller import PERMANENT_REDIRECT_STATUSES, CacheController from .filewrapper import CallbackFileWrapper diff --git a/cachecontrol/caches/redis_cache.py b/cachecontrol/caches/redis_cache.py index 16da0aed..49155be7 100644 --- a/cachecontrol/caches/redis_cache.py +++ b/cachecontrol/caches/redis_cache.py @@ -1,6 +1,7 @@ from __future__ import division from datetime import datetime + from cachecontrol.cache import BaseCache diff --git a/cachecontrol/controller.py b/cachecontrol/controller.py index d28f2a2a..29aae459 100644 --- a/cachecontrol/controller.py +++ b/cachecontrol/controller.py @@ -1,9 +1,9 @@ """ The httplib2 algorithms ported for use with requests. 
""" +import calendar import logging import re -import calendar import time from email.utils import parsedate_tz @@ -12,7 +12,6 @@ from .cache import DictCache from .serialize import Serializer - logger = logging.getLogger(__name__) URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?") diff --git a/cachecontrol/heuristics.py b/cachecontrol/heuristics.py index 6c0e9790..47dfabd7 100644 --- a/cachecontrol/heuristics.py +++ b/cachecontrol/heuristics.py @@ -1,9 +1,7 @@ import calendar import time - -from email.utils import formatdate, parsedate, parsedate_tz - from datetime import datetime, timedelta +from email.utils import formatdate, parsedate, parsedate_tz TIME_FMT = "%a, %d %b %Y %H:%M:%S GMT" diff --git a/dev_requirements.txt b/dev_requirements.txt index d026999c..93e99a16 100644 --- a/dev_requirements.txt +++ b/dev_requirements.txt @@ -1,14 +1,15 @@ -e . -tox -pytest-cov -pytest -mock +black +bumpversion cherrypy -sphinx -redis +isort lockfile -bumpversion +mock +pytest +pytest-cov +redis +sphinx +tox twine -black wheel diff --git a/docs/conf.py b/docs/conf.py index 29f37ea1..8c3c39a8 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -11,7 +11,8 @@ # All configuration values have a default; values that are commented out # serve to show the default. -import sys, os +import os +import sys # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. 
If the directory is relative to the diff --git a/examples/benchmark.py b/examples/benchmark.py index 3a51a5fe..730f76c2 100644 --- a/examples/benchmark.py +++ b/examples/benchmark.py @@ -1,10 +1,11 @@ -import sys -import requests import argparse - -from multiprocessing import Process +import sys from datetime import datetime +from multiprocessing import Process from wsgiref.simple_server import make_server + +import requests + from cachecontrol import CacheControl HOST = "localhost" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..56a038e5 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,3 @@ +[tool.isort] +line_length = 88 +known_first_party = ['cachecontrol'] diff --git a/tests/conftest.py b/tests/conftest.py index 28b80cc5..3e298f28 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,11 +1,9 @@ -from pprint import pformat - import os import socket - -import pytest +from pprint import pformat import cherrypy +import pytest class SimpleApp(object): diff --git a/tests/test_adapter.py b/tests/test_adapter.py index 7394f26c..acf51d1d 100644 --- a/tests/test_adapter.py +++ b/tests/test_adapter.py @@ -1,7 +1,7 @@ import mock import pytest - from requests import Session + from cachecontrol.adapter import CacheControlAdapter from cachecontrol.cache import DictCache from cachecontrol.wrapper import CacheControl diff --git a/tests/test_cache_control.py b/tests/test_cache_control.py index 2e307c69..065480d0 100644 --- a/tests/test_cache_control.py +++ b/tests/test_cache_control.py @@ -1,14 +1,14 @@ """ Unit tests that verify our caching methods work correctly. 
""" +import time + import pytest from mock import ANY, Mock -import time from cachecontrol import CacheController from cachecontrol.cache import DictCache - TIME_FMT = "%a, %d %b %Y %H:%M:%S GMT" diff --git a/tests/test_etag.py b/tests/test_etag.py index 2b8e0274..52a8115b 100644 --- a/tests/test_etag.py +++ b/tests/test_etag.py @@ -1,8 +1,6 @@ import pytest - -from mock import Mock, patch - import requests +from mock import Mock, patch from cachecontrol import CacheControl from cachecontrol.cache import DictCache diff --git a/tests/test_expires_heuristics.py b/tests/test_expires_heuristics.py index 2adc95ba..eee9a99c 100644 --- a/tests/test_expires_heuristics.py +++ b/tests/test_expires_heuristics.py @@ -1,19 +1,16 @@ import calendar import time - -from email.utils import formatdate, parsedate from datetime import datetime +from email.utils import formatdate, parsedate +from pprint import pprint from mock import Mock from requests import Session, get from requests.structures import CaseInsensitiveDict from cachecontrol import CacheControl -from cachecontrol.heuristics import LastModified, ExpiresAfter, OneDayCache -from cachecontrol.heuristics import TIME_FMT -from cachecontrol.heuristics import BaseHeuristic - -from pprint import pprint +from cachecontrol.heuristics import (TIME_FMT, BaseHeuristic, ExpiresAfter, + LastModified, OneDayCache) class TestHeuristicWithoutWarning(object): diff --git a/tests/test_max_age.py b/tests/test_max_age.py index 09e361e6..c3d0d4e4 100644 --- a/tests/test_max_age.py +++ b/tests/test_max_age.py @@ -1,7 +1,8 @@ from __future__ import print_function -import pytest +import pytest from requests import Session + from cachecontrol.adapter import CacheControlAdapter from cachecontrol.cache import DictCache diff --git a/tests/test_regressions.py b/tests/test_regressions.py index daa4bde6..17345e89 100644 --- a/tests/test_regressions.py +++ b/tests/test_regressions.py @@ -1,11 +1,11 @@ import sys -import pytest +import pytest +from 
requests import Session from cachecontrol import CacheControl from cachecontrol.caches import FileCache from cachecontrol.filewrapper import CallbackFileWrapper -from requests import Session class Test39(object): diff --git a/tests/test_serialization.py b/tests/test_serialization.py index 7c342de9..616f3b5a 100644 --- a/tests/test_serialization.py +++ b/tests/test_serialization.py @@ -1,6 +1,5 @@ import msgpack import requests - from mock import Mock from cachecontrol.compat import pickle diff --git a/tests/test_storage_filecache.py b/tests/test_storage_filecache.py index e9c2a940..e3f9114c 100644 --- a/tests/test_storage_filecache.py +++ b/tests/test_storage_filecache.py @@ -3,16 +3,16 @@ """ import os import string - from random import randint, sample import pytest import requests -from cachecontrol import CacheControl -from cachecontrol.caches import FileCache from lockfile import LockFile from lockfile.mkdirlockfile import MkdirLockFile +from cachecontrol import CacheControl +from cachecontrol.caches import FileCache + def randomdata(): """Plain random http data generator:""" diff --git a/tests/test_storage_redis.py b/tests/test_storage_redis.py index d7b3afc1..f189a597 100644 --- a/tests/test_storage_redis.py +++ b/tests/test_storage_redis.py @@ -1,6 +1,7 @@ from datetime import datetime from mock import Mock + from cachecontrol.caches import RedisCache diff --git a/tests/test_vary.py b/tests/test_vary.py index bcffea25..90a9ae9b 100644 --- a/tests/test_vary.py +++ b/tests/test_vary.py @@ -1,3 +1,5 @@ +from pprint import pprint + import pytest import requests @@ -5,8 +7,6 @@ from cachecontrol.cache import DictCache from cachecontrol.compat import urljoin -from pprint import pprint - class TestVary(object): From f2e8526f6e826d4b44765245401c969edf66275f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Diego=20Elio=20Petten=C3=B2?= Date: Fri, 17 Apr 2020 14:29:14 +0100 Subject: [PATCH 08/19] Refresh [black](https://github.com/psf/black) formatting. 
--- cachecontrol/cache.py | 2 -- cachecontrol/caches/file_cache.py | 5 +++-- cachecontrol/caches/redis_cache.py | 1 - cachecontrol/controller.py | 8 +++----- cachecontrol/heuristics.py | 14 ++++++++++++-- cachecontrol/serialize.py | 1 - examples/benchmark.py | 4 ++-- pyproject.toml | 3 +++ tests/conftest.py | 1 - tests/test_adapter.py | 1 - tests/test_cache_control.py | 5 +++-- tests/test_chunked_response.py | 1 - tests/test_etag.py | 1 - tests/test_expires_heuristics.py | 20 +++++++------------- tests/test_max_age.py | 2 -- tests/test_redirects.py | 2 -- tests/test_regressions.py | 1 - tests/test_serialization.py | 5 +++-- tests/test_storage_filecache.py | 1 - tests/test_storage_redis.py | 1 - tests/test_vary.py | 1 - 21 files changed, 36 insertions(+), 44 deletions(-) diff --git a/cachecontrol/cache.py b/cachecontrol/cache.py index 94e07732..8d4cf192 100644 --- a/cachecontrol/cache.py +++ b/cachecontrol/cache.py @@ -6,7 +6,6 @@ class BaseCache(object): - def get(self, key): raise NotImplementedError() @@ -21,7 +20,6 @@ def close(self): class DictCache(BaseCache): - def __init__(self, init_dict=None): self.lock = Lock() self.data = init_dict or {} diff --git a/cachecontrol/caches/file_cache.py b/cachecontrol/caches/file_cache.py index 9773581f..593a0c5e 100644 --- a/cachecontrol/caches/file_cache.py +++ b/cachecontrol/caches/file_cache.py @@ -58,7 +58,6 @@ def _secure_open_write(filename, fmode): class FileCache(BaseCache): - def __init__( self, directory, @@ -128,7 +127,9 @@ def set(self, key, value): try: os.makedirs(parentdir, self.dirmode) except (IOError, OSError): - logging.debug("Error trying to create directory '%s'", parentdir, exc_info=True) + logging.debug( + "Error trying to create directory '%s'", parentdir, exc_info=True + ) with self.lock_class(name) as lock: # Write our actual file diff --git a/cachecontrol/caches/redis_cache.py b/cachecontrol/caches/redis_cache.py index 49155be7..2a18fbd9 100644 --- a/cachecontrol/caches/redis_cache.py +++ 
b/cachecontrol/caches/redis_cache.py @@ -6,7 +6,6 @@ class RedisCache(BaseCache): - def __init__(self, conn): self.conn = conn diff --git a/cachecontrol/controller.py b/cachecontrol/controller.py index 29aae459..a3388430 100644 --- a/cachecontrol/controller.py +++ b/cachecontrol/controller.py @@ -159,7 +159,7 @@ def cached_request(self, request): # with cache busting headers as usual (ie no-cache). if int(resp.status) in PERMANENT_REDIRECT_STATUSES: msg = ( - 'Returning cached permanent redirect response ' + "Returning cached permanent redirect response " "(ignoring date and etag information)" ) logger.debug(msg) @@ -307,15 +307,13 @@ def cache_response(self, request, response, body=None, status_codes=None): # If we've been given an etag, then keep the response if self.cache_etags and "etag" in response_headers: logger.debug("Caching due to etag") - self.cache.set( - cache_url, self.serializer.dumps(request, response, body) - ) + self.cache.set(cache_url, self.serializer.dumps(request, response, body)) # Add to the cache any permanent redirects. We do this before looking # that the Date headers. elif int(response.status) in PERMANENT_REDIRECT_STATUSES: logger.debug("Caching permanent redirect") - self.cache.set(cache_url, self.serializer.dumps(request, response, b'')) + self.cache.set(cache_url, self.serializer.dumps(request, response, b"")) # Add to the cache if the response headers demand it. 
If there # is no date header then we can't do anything about expiring diff --git a/cachecontrol/heuristics.py b/cachecontrol/heuristics.py index 47dfabd7..58267ba1 100644 --- a/cachecontrol/heuristics.py +++ b/cachecontrol/heuristics.py @@ -16,7 +16,6 @@ def datetime_to_header(dt): class BaseHeuristic(object): - def warning(self, response): """ Return a valid 1xx warning header value describing the cache @@ -95,8 +94,19 @@ class LastModified(BaseHeuristic): http://lxr.mozilla.org/mozilla-release/source/netwerk/protocol/http/nsHttpResponseHead.cpp#397 Unlike mozilla we limit this to 24-hr. """ + cacheable_by_default_statuses = { - 200, 203, 204, 206, 300, 301, 404, 405, 410, 414, 501 + 200, + 203, + 204, + 206, + 300, + 301, + 404, + 405, + 410, + 414, + 501, } def update_headers(self, resp): diff --git a/cachecontrol/serialize.py b/cachecontrol/serialize.py index 6ab23314..513c5b6d 100644 --- a/cachecontrol/serialize.py +++ b/cachecontrol/serialize.py @@ -21,7 +21,6 @@ def _b64_decode_str(s): class Serializer(object): - def dumps(self, request, response, body): response_headers = CaseInsensitiveDict(response.headers) diff --git a/examples/benchmark.py b/examples/benchmark.py index 730f76c2..720c3ee4 100644 --- a/examples/benchmark.py +++ b/examples/benchmark.py @@ -14,12 +14,12 @@ class Server(object): - def __call__(self, env, sr): body = "Hello World!" status = "200 OK" headers = [ - ("Cache-Control", "max-age=%i" % (60 * 10)), ("Content-Type", "text/plain") + ("Cache-Control", "max-age=%i" % (60 * 10)), + ("Content-Type", "text/plain"), ] sr(status, headers) return body diff --git a/pyproject.toml b/pyproject.toml index 56a038e5..f35b9a04 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,6 @@ [tool.isort] line_length = 88 known_first_party = ['cachecontrol'] +# Set multi-line output to "Vertical Hanging indent" to avoid fighting with black. 
+multi_line_output = 3 +include_trailing_comma = true diff --git a/tests/conftest.py b/tests/conftest.py index 3e298f28..67842238 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,7 +7,6 @@ class SimpleApp(object): - def __init__(self): self.etag_count = 0 self.update_etag_string() diff --git a/tests/test_adapter.py b/tests/test_adapter.py index acf51d1d..2d305040 100644 --- a/tests/test_adapter.py +++ b/tests/test_adapter.py @@ -31,7 +31,6 @@ def sess(url, request): class TestSessionActions(object): - def test_get_caches(self, url, sess): r2 = sess.get(url) assert r2.from_cache is True diff --git a/tests/test_cache_control.py b/tests/test_cache_control.py index 065480d0..7e9bc7f1 100644 --- a/tests/test_cache_control.py +++ b/tests/test_cache_control.py @@ -13,7 +13,6 @@ class NullSerializer(object): - def dumps(self, request, response): return response @@ -152,7 +151,9 @@ def req(self, headers): return self.c.cached_request(mock_request) def test_cache_request_no_headers(self): - cached_resp = Mock(headers={"ETag": "jfd9094r808", "Content-Length": 100}, status=200) + cached_resp = Mock( + headers={"ETag": "jfd9094r808", "Content-Length": 100}, status=200 + ) self.c.cache = DictCache({self.url: cached_resp}) resp = self.req({}) assert not resp diff --git a/tests/test_chunked_response.py b/tests/test_chunked_response.py index 715ae65a..a2d06413 100644 --- a/tests/test_chunked_response.py +++ b/tests/test_chunked_response.py @@ -20,7 +20,6 @@ def sess(): class TestChunkedResponses(object): - def test_cache_chunked_response(self, url, sess): """ Verify that an otherwise cacheable response is cached when the diff --git a/tests/test_etag.py b/tests/test_etag.py index 52a8115b..cfc71d1a 100644 --- a/tests/test_etag.py +++ b/tests/test_etag.py @@ -8,7 +8,6 @@ class NullSerializer(object): - def dumps(self, request, response, body=None): return response diff --git a/tests/test_expires_heuristics.py b/tests/test_expires_heuristics.py index eee9a99c..1cc21153 
100644 --- a/tests/test_expires_heuristics.py +++ b/tests/test_expires_heuristics.py @@ -9,14 +9,17 @@ from requests.structures import CaseInsensitiveDict from cachecontrol import CacheControl -from cachecontrol.heuristics import (TIME_FMT, BaseHeuristic, ExpiresAfter, - LastModified, OneDayCache) +from cachecontrol.heuristics import ( + TIME_FMT, + BaseHeuristic, + ExpiresAfter, + LastModified, + OneDayCache, +) class TestHeuristicWithoutWarning(object): - def setup(self): - class NoopHeuristic(BaseHeuristic): warning = Mock() @@ -34,11 +37,8 @@ def test_no_header_change_means_no_warning_header(self, url): class TestHeuristicWith3xxResponse(object): - def setup(self): - class DummyHeuristic(BaseHeuristic): - def update_headers(self, resp): return {"x-dummy-header": "foobar"} @@ -56,7 +56,6 @@ def test_heuristic_applies_to_304(self, url): class TestUseExpiresHeuristic(object): - def test_expires_heuristic_arg(self): sess = Session() cached_sess = CacheControl(sess, heuristic=Mock()) @@ -64,7 +63,6 @@ def test_expires_heuristic_arg(self): class TestOneDayCache(object): - def setup(self): self.sess = Session() self.cached_sess = CacheControl(self.sess, heuristic=OneDayCache()) @@ -84,7 +82,6 @@ def test_cache_for_one_day(self, url): class TestExpiresAfter(object): - def setup(self): self.sess = Session() self.cache_sess = CacheControl(self.sess, heuristic=ExpiresAfter(days=1)) @@ -105,7 +102,6 @@ def test_expires_after_one_day(self, url): class TestLastModified(object): - def setup(self): self.sess = Session() self.cached_sess = CacheControl(self.sess, heuristic=LastModified()) @@ -125,7 +121,6 @@ def test_last_modified(self, url): class DummyResponse: - def __init__(self, status, headers): self.status = status self.headers = CaseInsensitiveDict(headers) @@ -136,7 +131,6 @@ def datetime_to_header(dt): class TestModifiedUnitTests(object): - def last_modified(self, period): return time.strftime(TIME_FMT, time.gmtime(self.time_now - period)) diff --git 
a/tests/test_max_age.py b/tests/test_max_age.py index c3d0d4e4..3b5da827 100644 --- a/tests/test_max_age.py +++ b/tests/test_max_age.py @@ -8,7 +8,6 @@ class NullSerializer(object): - def dumps(self, request, response, body=None): return response @@ -19,7 +18,6 @@ def loads(self, request, data): class TestMaxAge(object): - @pytest.fixture() def sess(self, url): self.url = url diff --git a/tests/test_redirects.py b/tests/test_redirects.py index a182390c..f5334f19 100644 --- a/tests/test_redirects.py +++ b/tests/test_redirects.py @@ -7,7 +7,6 @@ class TestPermanentRedirects(object): - def setup(self): self.sess = CacheControl(requests.Session()) @@ -29,7 +28,6 @@ def test_bust_cache_on_redirect(self, url): class TestMultipleChoicesRedirects(object): - def setup(self): self.sess = CacheControl(requests.Session()) diff --git a/tests/test_regressions.py b/tests/test_regressions.py index 17345e89..32e10ed5 100644 --- a/tests/test_regressions.py +++ b/tests/test_regressions.py @@ -9,7 +9,6 @@ class Test39(object): - @pytest.mark.skipif( sys.version.startswith("2"), reason="Only run this for python 3.x" ) diff --git a/tests/test_serialization.py b/tests/test_serialization.py index 616f3b5a..1af83441 100644 --- a/tests/test_serialization.py +++ b/tests/test_serialization.py @@ -7,7 +7,6 @@ class TestSerializer(object): - def setup(self): self.serializer = Serializer() self.response_data = { @@ -88,7 +87,9 @@ def test_read_latest_version_streamable(self, url): original_resp = requests.get(url, stream=True) req = original_resp.request - resp = self.serializer.loads(req, self.serializer.dumps(req, original_resp.raw, original_resp.content)) + resp = self.serializer.loads( + req, self.serializer.dumps(req, original_resp.raw, original_resp.content) + ) assert resp.read() diff --git a/tests/test_storage_filecache.py b/tests/test_storage_filecache.py index e3f9114c..5cc58802 100644 --- a/tests/test_storage_filecache.py +++ b/tests/test_storage_filecache.py @@ -22,7 +22,6 @@ def 
randomdata(): class TestStorageFileCache(object): - @pytest.fixture() def sess(self, url, tmpdir): self.url = url diff --git a/tests/test_storage_redis.py b/tests/test_storage_redis.py index f189a597..0843d84e 100644 --- a/tests/test_storage_redis.py +++ b/tests/test_storage_redis.py @@ -6,7 +6,6 @@ class TestRedisCache(object): - def setup(self): self.conn = Mock() self.cache = RedisCache(self.conn) diff --git a/tests/test_vary.py b/tests/test_vary.py index 90a9ae9b..2831cad9 100644 --- a/tests/test_vary.py +++ b/tests/test_vary.py @@ -9,7 +9,6 @@ class TestVary(object): - @pytest.fixture() def sess(self, url): self.url = urljoin(url, "/vary_accept") From bbaa67fe16591af827a8a7b1b24b55565c4bba73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Diego=20Elio=20Petten=C3=B2?= Date: Fri, 17 Apr 2020 14:38:14 +0100 Subject: [PATCH 09/19] Set up [pre-commit](https://pre-commit.com/). This includes isort, black and some basic hygiene on text files. --- .bumpversion.cfg | 1 - .gitignore | 2 +- .pre-commit-config.yaml | 17 +++++++++++++++++ MANIFEST.in | 2 +- dev_requirements.txt | 1 + pyproject.toml | 1 + 6 files changed, 21 insertions(+), 3 deletions(-) create mode 100644 .pre-commit-config.yaml diff --git a/.bumpversion.cfg b/.bumpversion.cfg index b9373bc9..9624f3a1 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -3,4 +3,3 @@ current_version = 0.12.6 files = setup.py cachecontrol/__init__.py docs/conf.py commit = True tag = True - diff --git a/.gitignore b/.gitignore index 21b2f65e..42e144ef 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,4 @@ include .Python docs/_build build/ -.tox \ No newline at end of file +.tox diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..acc64e9d --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,17 @@ +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v2.3.0 + hooks: + - id: check-yaml + - id: end-of-file-fixer + - id: trailing-whitespace +- repo: 
https://github.com/timothycrosley/isort + rev: 4.3.21 + hooks: + - id: isort + additional_dependencies: + - toml +- repo: https://github.com/python/black + rev: 19.10b0 + hooks: + - id: black diff --git a/MANIFEST.in b/MANIFEST.in index 1e0fc794..42eb4101 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1 +1 @@ -include LICENSE.txt \ No newline at end of file +include LICENSE.txt diff --git a/dev_requirements.txt b/dev_requirements.txt index 93e99a16..2859d607 100644 --- a/dev_requirements.txt +++ b/dev_requirements.txt @@ -6,6 +6,7 @@ cherrypy isort lockfile mock +pre-commit pytest pytest-cov redis diff --git a/pyproject.toml b/pyproject.toml index f35b9a04..12ffa088 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,7 @@ [tool.isort] line_length = 88 known_first_party = ['cachecontrol'] +known_third_party = ['mock', 'lockfile', 'requests', 'pytest', 'msgpack', 'cherrypy'] # Set multi-line output to "Vertical Hanging indent" to avoid fighting with black. multi_line_output = 3 include_trailing_comma = true From a1003d157fe62461fb4275b43be7083d11b0fe18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Diego=20Elio=20Petten=C3=B2?= Date: Fri, 17 Apr 2020 14:51:12 +0100 Subject: [PATCH 10/19] Add *~ to gitignore. This is a fairly common unix extension for backup files used at least by Emacsen and vim. --- .gitignore | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index 42e144ef..84d595ac 100644 --- a/.gitignore +++ b/.gitignore @@ -1,13 +1,14 @@ -.DS_Store +*.egg-info/* *.pyc *.pyo -*.egg-info/* -dist +*~ +.DS_Store +.Python +.tox bin +build/ +dist +docs/_build +include lib lib64 -include -.Python -docs/_build -build/ -.tox From 7150e5bc8e6e05b6490f884d7d9bb98ac7fc98ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Diego=20Elio=20Petten=C3=B2?= Date: Fri, 17 Apr 2020 18:36:14 +0100 Subject: [PATCH 11/19] Use range() rather than xrange(). 
This would be a bit less efficient in Python 2.7, but it would work on Python 3.x. --- examples/benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/benchmark.py b/examples/benchmark.py index 720c3ee4..b36e67a7 100644 --- a/examples/benchmark.py +++ b/examples/benchmark.py @@ -35,7 +35,7 @@ def run_benchmark(sess): proc.start() start = datetime.now() - for i in xrange(0, 1000): + for i in range(0, 1000): sess.get(URL) sys.stdout.write(".") end = datetime.now() From 30eb2f4667b1d0593e318e48de0bfefd00f2837b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Diego=20Elio=20Petten=C3=B2?= Date: Fri, 17 Apr 2020 19:16:29 +0100 Subject: [PATCH 12/19] Use six instead of manually maintaining compatibility with PY2. This makes it easier to use constants for status codes as well. --- cachecontrol/compat.py | 18 ------------------ cachecontrol/serialize.py | 5 ++++- setup.py | 2 +- tests/test_etag.py | 2 +- tests/test_serialization.py | 2 +- tests/test_vary.py | 2 +- 6 files changed, 8 insertions(+), 23 deletions(-) diff --git a/cachecontrol/compat.py b/cachecontrol/compat.py index 143c8ab0..6355a173 100644 --- a/cachecontrol/compat.py +++ b/cachecontrol/compat.py @@ -1,15 +1,3 @@ -try: - from urllib.parse import urljoin -except ImportError: - from urlparse import urljoin - - -try: - import cPickle as pickle -except ImportError: - import pickle - - # Handle the case where the requests module has been patched to not have # urllib3 bundled as part of its source. 
try: @@ -21,9 +9,3 @@ from requests.packages.urllib3.util import is_fp_closed except ImportError: from urllib3.util import is_fp_closed - -# Replicate some six behaviour -try: - text_type = unicode -except NameError: - text_type = str diff --git a/cachecontrol/serialize.py b/cachecontrol/serialize.py index 513c5b6d..35526f4f 100644 --- a/cachecontrol/serialize.py +++ b/cachecontrol/serialize.py @@ -6,7 +6,10 @@ import msgpack from requests.structures import CaseInsensitiveDict -from .compat import HTTPResponse, pickle, text_type +from six import text_type +from six.moves import cPickle as pickle + +from .compat import HTTPResponse def _b64_decode_bytes(b): diff --git a/setup.py b/setup.py index 2f3b51c2..680f68dd 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ include_package_data=True, description="httplib2 caching for requests", long_description=long_description, - install_requires=["requests", "msgpack>=0.5.2"], + install_requires=["requests", "msgpack>=0.5.2", "six"], extras_require={"filecache": ["lockfile>=0.9"], "redis": ["redis>=2.10.5"]}, entry_points={"console_scripts": ["doesitcache = cachecontrol._cmd:main"]}, python_requires=">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*", diff --git a/tests/test_etag.py b/tests/test_etag.py index cfc71d1a..193a99c3 100644 --- a/tests/test_etag.py +++ b/tests/test_etag.py @@ -4,7 +4,7 @@ from cachecontrol import CacheControl from cachecontrol.cache import DictCache -from cachecontrol.compat import urljoin +from six.moves.urllib.parse import urljoin class NullSerializer(object): diff --git a/tests/test_serialization.py b/tests/test_serialization.py index 1af83441..f1160f69 100644 --- a/tests/test_serialization.py +++ b/tests/test_serialization.py @@ -2,8 +2,8 @@ import requests from mock import Mock -from cachecontrol.compat import pickle from cachecontrol.serialize import Serializer +from six.moves import cPickle as pickle class TestSerializer(object): diff --git a/tests/test_vary.py b/tests/test_vary.py index 
2831cad9..169222b9 100644 --- a/tests/test_vary.py +++ b/tests/test_vary.py @@ -5,7 +5,7 @@ from cachecontrol import CacheControl from cachecontrol.cache import DictCache -from cachecontrol.compat import urljoin +from six.moves.urllib.parse import urljoin class TestVary(object): From 09feac3f3e9daa364dc88dca7593aaba283699f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Diego=20Elio=20Petten=C3=B2?= Date: Fri, 17 Apr 2020 20:12:08 +0100 Subject: [PATCH 13/19] Add Python 3.7~3.9-dev to Travis. --- .travis.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index ee00d21e..83eb75dc 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,7 +6,9 @@ python: - 3.4 - 3.5 - 3.6 - # - 3.7 + - 3.7 + - 3.8 + - 3.9-dev install: pip install tox-travis From 852ef7fef844e4f430eddf02fe9ccf53da617565 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Diego=20Elio=20Petten=C3=B2?= Date: Sun, 19 Apr 2020 15:18:09 +0100 Subject: [PATCH 14/19] Make the project compliant with the REUSE guidelines. See https://reuse.software/ for details. 
--- .bumpversion.cfg | 4 + .gitignore | 4 + .pre-commit-config.yaml | 4 + .travis.yml | 4 + CONTRIBUTORS.rst | 5 + LICENSES/Apache-2.0.txt | 208 +++++++++++++++++++++++++++++ MANIFEST.in | 1 - Makefile | 4 + README.rst | 5 + cachecontrol/__init__.py | 4 + cachecontrol/_cmd.py | 4 + cachecontrol/adapter.py | 4 + cachecontrol/cache.py | 4 + cachecontrol/caches/__init__.py | 4 + cachecontrol/caches/file_cache.py | 4 + cachecontrol/caches/redis_cache.py | 4 + cachecontrol/compat.py | 4 + cachecontrol/controller.py | 4 + cachecontrol/filewrapper.py | 4 + cachecontrol/heuristics.py | 4 + cachecontrol/serialize.py | 4 + cachecontrol/wrapper.py | 4 + dev_requirements.txt | 4 + docs/Makefile | 4 + docs/conf.py | 4 + docs/custom_heuristics.rst | 5 + docs/etags.rst | 5 + docs/index.rst | 5 + docs/release_notes.rst | 5 + docs/storage.rst | 5 + docs/tips.rst | 5 + docs/usage.rst | 5 + examples/benchmark.py | 4 + pyproject.toml | 4 + setup.cfg | 8 ++ setup.py | 4 + tests/conftest.py | 4 + tests/test_adapter.py | 4 + tests/test_cache_control.py | 4 + tests/test_chunked_response.py | 4 + tests/test_etag.py | 4 + tests/test_expires_heuristics.py | 4 + tests/test_max_age.py | 4 + tests/test_redirects.py | 4 + tests/test_regressions.py | 4 + tests/test_serialization.py | 4 + tests/test_server_http_version.py | 4 + tests/test_storage_filecache.py | 4 + tests/test_storage_redis.py | 4 + tests/test_vary.py | 4 + tox.ini | 4 + 51 files changed, 417 insertions(+), 1 deletion(-) create mode 100644 LICENSES/Apache-2.0.txt delete mode 100644 MANIFEST.in diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 9624f3a1..aa028df5 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + [bumpversion] current_version = 0.12.6 files = setup.py cachecontrol/__init__.py docs/conf.py diff --git a/.gitignore b/.gitignore index 84d595ac..92826fa8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ +# 
SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + *.egg-info/* *.pyc *.pyo diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index acc64e9d..c69b49d4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v2.3.0 diff --git a/.travis.yml b/.travis.yml index 83eb75dc..2baade11 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + language: python sudo: false diff --git a/CONTRIBUTORS.rst b/CONTRIBUTORS.rst index 4fb8c2d5..8bfb7e0e 100644 --- a/CONTRIBUTORS.rst +++ b/CONTRIBUTORS.rst @@ -1,3 +1,8 @@ +.. + SPDX-FileCopyrightText: SPDX-FileCopyrightText: 2015 Eric Larson + + SPDX-License-Identifier: Apache-2.0 + ============== Contributors ============== diff --git a/LICENSES/Apache-2.0.txt b/LICENSES/Apache-2.0.txt new file mode 100644 index 00000000..527a83a2 --- /dev/null +++ b/LICENSES/Apache-2.0.txt @@ -0,0 +1,208 @@ +Apache License + +Version 2.0, January 2004 + +http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, +AND DISTRIBUTION + + 1. Definitions. + + + +"License" shall mean the terms and conditions for use, reproduction, and distribution +as defined by Sections 1 through 9 of this document. + + + +"Licensor" shall mean the copyright owner or entity authorized by the copyright +owner that is granting the License. + + + +"Legal Entity" shall mean the union of the acting entity and all other entities +that control, are controlled by, or are under common control with that entity. 
+For the purposes of this definition, "control" means (i) the power, direct +or indirect, to cause the direction or management of such entity, whether +by contract or otherwise, or (ii) ownership of fifty percent (50%) or more +of the outstanding shares, or (iii) beneficial ownership of such entity. + + + +"You" (or "Your") shall mean an individual or Legal Entity exercising permissions +granted by this License. + + + +"Source" form shall mean the preferred form for making modifications, including +but not limited to software source code, documentation source, and configuration +files. + + + +"Object" form shall mean any form resulting from mechanical transformation +or translation of a Source form, including but not limited to compiled object +code, generated documentation, and conversions to other media types. + + + +"Work" shall mean the work of authorship, whether in Source or Object form, +made available under the License, as indicated by a copyright notice that +is included in or attached to the work (an example is provided in the Appendix +below). + + + +"Derivative Works" shall mean any work, whether in Source or Object form, +that is based on (or derived from) the Work and for which the editorial revisions, +annotations, elaborations, or other modifications represent, as a whole, an +original work of authorship. For the purposes of this License, Derivative +Works shall not include works that remain separable from, or merely link (or +bind by name) to the interfaces of, the Work and Derivative Works thereof. + + + +"Contribution" shall mean any work of authorship, including the original version +of the Work and any modifications or additions to that Work or Derivative +Works thereof, that is intentionally submitted to Licensor for inclusion in +the Work by the copyright owner or by an individual or Legal Entity authorized +to submit on behalf of the copyright owner. 
For the purposes of this definition, +"submitted" means any form of electronic, verbal, or written communication +sent to the Licensor or its representatives, including but not limited to +communication on electronic mailing lists, source code control systems, and +issue tracking systems that are managed by, or on behalf of, the Licensor +for the purpose of discussing and improving the Work, but excluding communication +that is conspicuously marked or otherwise designated in writing by the copyright +owner as "Not a Contribution." + + + +"Contributor" shall mean Licensor and any individual or Legal Entity on behalf +of whom a Contribution has been received by Licensor and subsequently incorporated +within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of this +License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, +no-charge, royalty-free, irrevocable copyright license to reproduce, prepare +Derivative Works of, publicly display, publicly perform, sublicense, and distribute +the Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of this License, +each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, +no-charge, royalty-free, irrevocable (except as stated in this section) patent +license to make, have made, use, offer to sell, sell, import, and otherwise +transfer the Work, where such license applies only to those patent claims +licensable by such Contributor that are necessarily infringed by their Contribution(s) +alone or by combination of their Contribution(s) with the Work to which such +Contribution(s) was submitted. 
If You institute patent litigation against +any entity (including a cross-claim or counterclaim in a lawsuit) alleging +that the Work or a Contribution incorporated within the Work constitutes direct +or contributory patent infringement, then any patent licenses granted to You +under this License for that Work shall terminate as of the date such litigation +is filed. + +4. Redistribution. You may reproduce and distribute copies of the Work or +Derivative Works thereof in any medium, with or without modifications, and +in Source or Object form, provided that You meet the following conditions: + +(a) You must give any other recipients of the Work or Derivative Works a copy +of this License; and + +(b) You must cause any modified files to carry prominent notices stating that +You changed the files; and + +(c) You must retain, in the Source form of any Derivative Works that You distribute, +all copyright, patent, trademark, and attribution notices from the Source +form of the Work, excluding those notices that do not pertain to any part +of the Derivative Works; and + +(d) If the Work includes a "NOTICE" text file as part of its distribution, +then any Derivative Works that You distribute must include a readable copy +of the attribution notices contained within such NOTICE file, excluding those +notices that do not pertain to any part of the Derivative Works, in at least +one of the following places: within a NOTICE text file distributed as part +of the Derivative Works; within the Source form or documentation, if provided +along with the Derivative Works; or, within a display generated by the Derivative +Works, if and wherever such third-party notices normally appear. The contents +of the NOTICE file are for informational purposes only and do not modify the +License. 
You may add Your own attribution notices within Derivative Works +that You distribute, alongside or as an addendum to the NOTICE text from the +Work, provided that such additional attribution notices cannot be construed +as modifying the License. + +You may add Your own copyright statement to Your modifications and may provide +additional or different license terms and conditions for use, reproduction, +or distribution of Your modifications, or for any such Derivative Works as +a whole, provided Your use, reproduction, and distribution of the Work otherwise +complies with the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, any +Contribution intentionally submitted for inclusion in the Work by You to the +Licensor shall be under the terms and conditions of this License, without +any additional terms or conditions. Notwithstanding the above, nothing herein +shall supersede or modify the terms of any separate license agreement you +may have executed with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade names, +trademarks, service marks, or product names of the Licensor, except as required +for reasonable and customary use in describing the origin of the Work and +reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or agreed to +in writing, Licensor provides the Work (and each Contributor provides its +Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied, including, without limitation, any warranties +or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR +A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness +of using or redistributing the Work and assume any risks associated with Your +exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, whether +in tort (including negligence), contract, or otherwise, unless required by +applicable law (such as deliberate and grossly negligent acts) or agreed to +in writing, shall any Contributor be liable to You for damages, including +any direct, indirect, special, incidental, or consequential damages of any +character arising as a result of this License or out of the use or inability +to use the Work (including but not limited to damages for loss of goodwill, +work stoppage, computer failure or malfunction, or any and all other commercial +damages or losses), even if such Contributor has been advised of the possibility +of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing the Work +or Derivative Works thereof, You may choose to offer, and charge a fee for, +acceptance of support, warranty, indemnity, or other liability obligations +and/or rights consistent with this License. However, in accepting such obligations, +You may act only on Your own behalf and on Your sole responsibility, not on +behalf of any other Contributor, and only if You agree to indemnify, defend, +and hold each Contributor harmless for any liability incurred by, or claims +asserted against, such Contributor by reason of your accepting any such warranty +or additional liability. END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + +To apply the Apache License to your work, attach the following boilerplate +notice, with the fields enclosed by brackets "[]" replaced with your own identifying +information. (Don't include the brackets!) The text should be enclosed in +the appropriate comment syntax for the file format. We also recommend that +a file or class name and description of purpose be included on the same "printed +page" as the copyright notice for easier identification within third-party +archives. 
+ +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); + +you may not use this file except in compliance with the License. + +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software + +distributed under the License is distributed on an "AS IS" BASIS, + +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and + +limitations under the License. diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 42eb4101..00000000 --- a/MANIFEST.in +++ /dev/null @@ -1 +0,0 @@ -include LICENSE.txt diff --git a/Makefile b/Makefile index 5ec850a1..f20fb1e7 100644 --- a/Makefile +++ b/Makefile @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + VENV=.venv VENV_CMD=python3 -m venv ACTIVATE = $(VENV)/bin/activate diff --git a/README.rst b/README.rst index 8457c5d1..04aee1dd 100644 --- a/README.rst +++ b/README.rst @@ -1,3 +1,8 @@ +.. + SPDX-FileCopyrightText: SPDX-FileCopyrightText: 2015 Eric Larson + + SPDX-License-Identifier: Apache-2.0 + ============== CacheControl ============== diff --git a/cachecontrol/__init__.py b/cachecontrol/__init__.py index de960ed2..23ab25ed 100644 --- a/cachecontrol/__init__.py +++ b/cachecontrol/__init__.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + """CacheControl import Interface. Make it easy to import from cachecontrol without long namespaces. 
diff --git a/cachecontrol/_cmd.py b/cachecontrol/_cmd.py index 42d40fa5..bf04b5db 100644 --- a/cachecontrol/_cmd.py +++ b/cachecontrol/_cmd.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + import logging from argparse import ArgumentParser diff --git a/cachecontrol/adapter.py b/cachecontrol/adapter.py index df81d1f2..ea1c302e 100644 --- a/cachecontrol/adapter.py +++ b/cachecontrol/adapter.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + import functools import types import zlib diff --git a/cachecontrol/cache.py b/cachecontrol/cache.py index 8d4cf192..55786457 100644 --- a/cachecontrol/cache.py +++ b/cachecontrol/cache.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + """ The cache object API for implementing caches. The default is a thread safe in-memory dictionary. diff --git a/cachecontrol/caches/__init__.py b/cachecontrol/caches/__init__.py index 0e1658fa..44becd68 100644 --- a/cachecontrol/caches/__init__.py +++ b/cachecontrol/caches/__init__.py @@ -1,2 +1,6 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + from .file_cache import FileCache # noqa from .redis_cache import RedisCache # noqa diff --git a/cachecontrol/caches/file_cache.py b/cachecontrol/caches/file_cache.py index 593a0c5e..de4e79bd 100644 --- a/cachecontrol/caches/file_cache.py +++ b/cachecontrol/caches/file_cache.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + import hashlib import logging import os diff --git a/cachecontrol/caches/redis_cache.py b/cachecontrol/caches/redis_cache.py index 2a18fbd9..f0b146e0 100644 --- a/cachecontrol/caches/redis_cache.py +++ b/cachecontrol/caches/redis_cache.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + from __future__ import division 
from datetime import datetime diff --git a/cachecontrol/compat.py b/cachecontrol/compat.py index 6355a173..d602c4aa 100644 --- a/cachecontrol/compat.py +++ b/cachecontrol/compat.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + # Handle the case where the requests module has been patched to not have # urllib3 bundled as part of its source. try: diff --git a/cachecontrol/controller.py b/cachecontrol/controller.py index a3388430..8a2fee50 100644 --- a/cachecontrol/controller.py +++ b/cachecontrol/controller.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + """ The httplib2 algorithms ported for use with requests. """ diff --git a/cachecontrol/filewrapper.py b/cachecontrol/filewrapper.py index 30ed4c5a..dd91334c 100644 --- a/cachecontrol/filewrapper.py +++ b/cachecontrol/filewrapper.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + from io import BytesIO diff --git a/cachecontrol/heuristics.py b/cachecontrol/heuristics.py index 58267ba1..27ef7dae 100644 --- a/cachecontrol/heuristics.py +++ b/cachecontrol/heuristics.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + import calendar import time from datetime import datetime, timedelta diff --git a/cachecontrol/serialize.py b/cachecontrol/serialize.py index 35526f4f..0d40ca5a 100644 --- a/cachecontrol/serialize.py +++ b/cachecontrol/serialize.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + import base64 import io import json diff --git a/cachecontrol/wrapper.py b/cachecontrol/wrapper.py index d8e6fc6a..b6ee7f20 100644 --- a/cachecontrol/wrapper.py +++ b/cachecontrol/wrapper.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + from .adapter import CacheControlAdapter from .cache import 
DictCache diff --git a/dev_requirements.txt b/dev_requirements.txt index 2859d607..e1896819 100644 --- a/dev_requirements.txt +++ b/dev_requirements.txt @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + -e . black diff --git a/docs/Makefile b/docs/Makefile index d40f0063..c02956c8 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + # Makefile for Sphinx documentation # diff --git a/docs/conf.py b/docs/conf.py index 8c3c39a8..b4447a89 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,5 +1,9 @@ # -*- coding: utf-8 -*- + +# SPDX-FileCopyrightText: 2015 Eric Larson # +# SPDX-License-Identifier: Apache-2.0 + # CacheControl documentation build configuration file, created by # sphinx-quickstart on Mon Nov 4 15:01:23 2013. # diff --git a/docs/custom_heuristics.rst b/docs/custom_heuristics.rst index 7a3e970e..2f1871fe 100644 --- a/docs/custom_heuristics.rst +++ b/docs/custom_heuristics.rst @@ -1,3 +1,8 @@ +.. + SPDX-FileCopyrightText: SPDX-FileCopyrightText: 2015 Eric Larson + + SPDX-License-Identifier: Apache-2.0 + =========================== Custom Caching Strategies =========================== diff --git a/docs/etags.rst b/docs/etags.rst index 39efed4b..77561e5e 100644 --- a/docs/etags.rst +++ b/docs/etags.rst @@ -1,3 +1,8 @@ +.. + SPDX-FileCopyrightText: SPDX-FileCopyrightText: 2015 Eric Larson + + SPDX-License-Identifier: Apache-2.0 + ============== ETag Support ============== diff --git a/docs/index.rst b/docs/index.rst index b8e01f7e..b76ba53e 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,3 +1,8 @@ +.. + SPDX-FileCopyrightText: SPDX-FileCopyrightText: 2015 Eric Larson + + SPDX-License-Identifier: Apache-2.0 + .. CacheControl documentation master file, created by sphinx-quickstart on Mon Nov 4 15:01:23 2013. 
You can adapt this file completely to your liking, but it should at least diff --git a/docs/release_notes.rst b/docs/release_notes.rst index b0fd2f25..78b02b04 100644 --- a/docs/release_notes.rst +++ b/docs/release_notes.rst @@ -1,3 +1,8 @@ +.. + SPDX-FileCopyrightText: SPDX-FileCopyrightText: 2015 Eric Larson + + SPDX-License-Identifier: Apache-2.0 + =============== Release Notes =============== diff --git a/docs/storage.rst b/docs/storage.rst index fc4317a7..36672086 100644 --- a/docs/storage.rst +++ b/docs/storage.rst @@ -1,3 +1,8 @@ +.. + SPDX-FileCopyrightText: SPDX-FileCopyrightText: 2015 Eric Larson + + SPDX-License-Identifier: Apache-2.0 + ==================== Storing Cache Data ==================== diff --git a/docs/tips.rst b/docs/tips.rst index 6ebaf82b..af00bd9c 100644 --- a/docs/tips.rst +++ b/docs/tips.rst @@ -1,3 +1,8 @@ +.. + SPDX-FileCopyrightText: SPDX-FileCopyrightText: 2015 Eric Larson + + SPDX-License-Identifier: Apache-2.0 + ========================= Tips and Best Practices ========================= diff --git a/docs/usage.rst b/docs/usage.rst index f87dd08d..f908d38e 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -1,3 +1,8 @@ +.. 
+ SPDX-FileCopyrightText: SPDX-FileCopyrightText: 2015 Eric Larson + + SPDX-License-Identifier: Apache-2.0 + ==================== Using CacheControl ==================== diff --git a/examples/benchmark.py b/examples/benchmark.py index b36e67a7..2eac44b7 100644 --- a/examples/benchmark.py +++ b/examples/benchmark.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + import argparse import sys from datetime import datetime diff --git a/pyproject.toml b/pyproject.toml index 12ffa088..1d48a866 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + [tool.isort] line_length = 88 known_first_party = ['cachecontrol'] diff --git a/setup.cfg b/setup.cfg index 516f5f20..7842d0b8 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,3 +1,11 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + +[metadata] +license_files = + LICENSES/* + [tool:pytest] norecursedirs = bin lib include build diff --git a/setup.py b/setup.py index 680f68dd..b571cc3f 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + import setuptools long_description = open("README.rst").read() diff --git a/tests/conftest.py b/tests/conftest.py index 67842238..2681ad43 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + import os import socket from pprint import pformat diff --git a/tests/test_adapter.py b/tests/test_adapter.py index 2d305040..0aa3bfdc 100644 --- a/tests/test_adapter.py +++ b/tests/test_adapter.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + import mock import pytest from requests import Session diff --git a/tests/test_cache_control.py b/tests/test_cache_control.py index 
7e9bc7f1..0b7c0f8f 100644 --- a/tests/test_cache_control.py +++ b/tests/test_cache_control.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + """ Unit tests that verify our caching methods work correctly. """ diff --git a/tests/test_chunked_response.py b/tests/test_chunked_response.py index a2d06413..8e031262 100644 --- a/tests/test_chunked_response.py +++ b/tests/test_chunked_response.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + # encoding: utf-8 """ Test for supporting streamed responses (Transfer-Encoding: chunked) diff --git a/tests/test_etag.py b/tests/test_etag.py index 193a99c3..2b627763 100644 --- a/tests/test_etag.py +++ b/tests/test_etag.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + import pytest import requests from mock import Mock, patch diff --git a/tests/test_expires_heuristics.py b/tests/test_expires_heuristics.py index 1cc21153..913704e7 100644 --- a/tests/test_expires_heuristics.py +++ b/tests/test_expires_heuristics.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + import calendar import time from datetime import datetime diff --git a/tests/test_max_age.py b/tests/test_max_age.py index 3b5da827..a04776cd 100644 --- a/tests/test_max_age.py +++ b/tests/test_max_age.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + from __future__ import print_function import pytest diff --git a/tests/test_redirects.py b/tests/test_redirects.py index f5334f19..40db5f6e 100644 --- a/tests/test_redirects.py +++ b/tests/test_redirects.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + """ Test for supporting redirect caches as needed. 
""" diff --git a/tests/test_regressions.py b/tests/test_regressions.py index 32e10ed5..eccd2797 100644 --- a/tests/test_regressions.py +++ b/tests/test_regressions.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + import sys import pytest diff --git a/tests/test_serialization.py b/tests/test_serialization.py index f1160f69..598ae289 100644 --- a/tests/test_serialization.py +++ b/tests/test_serialization.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + import msgpack import requests from mock import Mock diff --git a/tests/test_server_http_version.py b/tests/test_server_http_version.py index 52df5470..74f58e87 100644 --- a/tests/test_server_http_version.py +++ b/tests/test_server_http_version.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + import requests diff --git a/tests/test_storage_filecache.py b/tests/test_storage_filecache.py index 5cc58802..38f178b0 100644 --- a/tests/test_storage_filecache.py +++ b/tests/test_storage_filecache.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + """ Unit tests that verify FileCache storage works correctly. 
""" diff --git a/tests/test_storage_redis.py b/tests/test_storage_redis.py index 0843d84e..3edfb8ac 100644 --- a/tests/test_storage_redis.py +++ b/tests/test_storage_redis.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + from datetime import datetime from mock import Mock diff --git a/tests/test_vary.py b/tests/test_vary.py index 169222b9..a9d6fc96 100644 --- a/tests/test_vary.py +++ b/tests/test_vary.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2015 Eric Larson +# +# SPDX-License-Identifier: Apache-2.0 + from pprint import pprint import pytest diff --git a/tox.ini b/tox.ini index f7c52e38..f359544d 100644 --- a/tox.ini +++ b/tox.ini @@ -1,3 +1,7 @@ +; SPDX-FileCopyrightText: 2015 Eric Larson +; +; SPDX-License-Identifier: Apache-2.0 + [tox] envlist = py27, py34, py35, py36, py37 From 22eb0a818568e73aefbf21ea61aea262ae6ac095 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Diego=20Elio=20Petten=C3=B2?= Date: Mon, 20 Apr 2020 15:59:43 +0100 Subject: [PATCH 15/19] Make the BaseCache class an abstract class. This is just a matter of cleanup, I can't think of any good reason for this _not_ to be marked abstract. --- cachecontrol/cache.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/cachecontrol/cache.py b/cachecontrol/cache.py index 55786457..9500f5fb 100644 --- a/cachecontrol/cache.py +++ b/cachecontrol/cache.py @@ -6,18 +6,26 @@ The cache object API for implementing caches. The default is a thread safe in-memory dictionary. 
""" + +from abc import ABCMeta, abstractmethod from threading import Lock +from six import add_metaclass + +@add_metaclass(ABCMeta) class BaseCache(object): + @abstractmethod def get(self, key): - raise NotImplementedError() + pass + @abstractmethod def set(self, key, value): - raise NotImplementedError() + pass + @abstractmethod def delete(self, key): - raise NotImplementedError() + pass def close(self): pass From 802a8f8872995e32f52b6d09de1ac7cd8ecd8441 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Diego=20Elio=20Petten=C3=B2?= Date: Mon, 20 Apr 2020 16:13:23 +0100 Subject: [PATCH 16/19] Suppress errors for `os.makedirs()`, again. This is a bit more nuanced in Python 3, where only EEXIST errors are suppressed, to match the `delete` codepath. --- cachecontrol/caches/file_cache.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cachecontrol/caches/file_cache.py b/cachecontrol/caches/file_cache.py index de4e79bd..523b07a5 100644 --- a/cachecontrol/caches/file_cache.py +++ b/cachecontrol/caches/file_cache.py @@ -15,7 +15,7 @@ except NameError: # py2.X FileNotFoundError = (IOError, OSError) - + FileExistsError = (IOError, OSError) logger = logging.getLogger(__name__) @@ -130,10 +130,8 @@ def set(self, key, value): parentdir = os.path.dirname(name) try: os.makedirs(parentdir, self.dirmode) - except (IOError, OSError): - logging.debug( - "Error trying to create directory '%s'", parentdir, exc_info=True - ) + except FileExistsError: + pass with self.lock_class(name) as lock: # Write our actual file From 3e730f2fb2e78200d58785b6ccb3d4531f527f2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Diego=20Elio=20Petten=C3=B2?= Date: Tue, 14 Apr 2020 19:17:18 +0100 Subject: [PATCH 17/19] Remove the custom heuristics implementation. These heuristics disagree with RFC7234, and while there's no reason to stick to their implementation, they seem to be more ill-suited than the default 10%-since-last-modification. 
--- cachecontrol/_cmd.py | 4 +- cachecontrol/adapter.py | 8 -- cachecontrol/heuristics.py | 147 ---------------------- cachecontrol/wrapper.py | 2 - docs/custom_heuristics.rst | 170 ------------------------- docs/index.rst | 2 - tests/test_expires_heuristics.py | 207 ------------------------------- 7 files changed, 1 insertion(+), 539 deletions(-) delete mode 100644 cachecontrol/heuristics.py delete mode 100644 docs/custom_heuristics.rst delete mode 100644 tests/test_expires_heuristics.py diff --git a/cachecontrol/_cmd.py b/cachecontrol/_cmd.py index bf04b5db..4ba24ab7 100644 --- a/cachecontrol/_cmd.py +++ b/cachecontrol/_cmd.py @@ -19,9 +19,7 @@ def setup_logging(): def get_session(): - adapter = CacheControlAdapter( - DictCache(), cache_etags=True, serializer=None, heuristic=None - ) + adapter = CacheControlAdapter(DictCache(), cache_etags=True, serializer=None) sess = requests.Session() sess.mount("http://", adapter) sess.mount("https://", adapter) diff --git a/cachecontrol/adapter.py b/cachecontrol/adapter.py index ea1c302e..6e688f6e 100644 --- a/cachecontrol/adapter.py +++ b/cachecontrol/adapter.py @@ -22,14 +22,12 @@ def __init__( cache_etags=True, controller_class=None, serializer=None, - heuristic=None, cacheable_methods=None, *args, **kw ): super(CacheControlAdapter, self).__init__(*args, **kw) self.cache = DictCache() if cache is None else cache - self.heuristic = heuristic self.cacheable_methods = cacheable_methods or ("GET",) controller_factory = controller_class or CacheController @@ -69,12 +67,6 @@ def build_response( """ cacheable = cacheable_methods or self.cacheable_methods if not from_cache and request.method in cacheable: - # Check for any heuristics that might update headers - # before trying to cache. - if self.heuristic: - response = self.heuristic.apply(response) - - # apply any expiration heuristics if response.status == 304: # We must have sent an ETag request. 
This could mean # that we've been expired already or that we simply diff --git a/cachecontrol/heuristics.py b/cachecontrol/heuristics.py deleted file mode 100644 index 27ef7dae..00000000 --- a/cachecontrol/heuristics.py +++ /dev/null @@ -1,147 +0,0 @@ -# SPDX-FileCopyrightText: 2015 Eric Larson -# -# SPDX-License-Identifier: Apache-2.0 - -import calendar -import time -from datetime import datetime, timedelta -from email.utils import formatdate, parsedate, parsedate_tz - -TIME_FMT = "%a, %d %b %Y %H:%M:%S GMT" - - -def expire_after(delta, date=None): - date = date or datetime.utcnow() - return date + delta - - -def datetime_to_header(dt): - return formatdate(calendar.timegm(dt.timetuple())) - - -class BaseHeuristic(object): - def warning(self, response): - """ - Return a valid 1xx warning header value describing the cache - adjustments. - - The response is provided too allow warnings like 113 - http://tools.ietf.org/html/rfc7234#section-5.5.4 where we need - to explicitly say response is over 24 hours old. - """ - return '110 - "Response is Stale"' - - def update_headers(self, response): - """Update the response headers with any new headers. - - NOTE: This SHOULD always include some Warning header to - signify that the response was cached by the client, not - by way of the provided headers. - """ - return {} - - def apply(self, response): - updated_headers = self.update_headers(response) - - if updated_headers: - response.headers.update(updated_headers) - warning_header_value = self.warning(response) - if warning_header_value is not None: - response.headers.update({"Warning": warning_header_value}) - - return response - - -class OneDayCache(BaseHeuristic): - """ - Cache the response by providing an expires 1 day in the - future. 
- """ - - def update_headers(self, response): - headers = {} - - if "expires" not in response.headers: - date = parsedate(response.headers["date"]) - expires = expire_after(timedelta(days=1), date=datetime(*date[:6])) - headers["expires"] = datetime_to_header(expires) - headers["cache-control"] = "public" - return headers - - -class ExpiresAfter(BaseHeuristic): - """ - Cache **all** requests for a defined time period. - """ - - def __init__(self, **kw): - self.delta = timedelta(**kw) - - def update_headers(self, response): - expires = expire_after(self.delta) - return {"expires": datetime_to_header(expires), "cache-control": "public"} - - def warning(self, response): - tmpl = "110 - Automatically cached for %s. Response might be stale" - return tmpl % self.delta - - -class LastModified(BaseHeuristic): - """ - If there is no Expires header already, fall back on Last-Modified - using the heuristic from - http://tools.ietf.org/html/rfc7234#section-4.2.2 - to calculate a reasonable value. - - Firefox also does something like this per - https://developer.mozilla.org/en-US/docs/Web/HTTP/Caching_FAQ - http://lxr.mozilla.org/mozilla-release/source/netwerk/protocol/http/nsHttpResponseHead.cpp#397 - Unlike mozilla we limit this to 24-hr. 
- """ - - cacheable_by_default_statuses = { - 200, - 203, - 204, - 206, - 300, - 301, - 404, - 405, - 410, - 414, - 501, - } - - def update_headers(self, resp): - headers = resp.headers - - if "expires" in headers: - return {} - - if "cache-control" in headers and headers["cache-control"] != "public": - return {} - - if resp.status not in self.cacheable_by_default_statuses: - return {} - - if "date" not in headers or "last-modified" not in headers: - return {} - - date = calendar.timegm(parsedate_tz(headers["date"])) - last_modified = parsedate(headers["last-modified"]) - if date is None or last_modified is None: - return {} - - now = time.time() - current_age = max(0, now - date) - delta = date - calendar.timegm(last_modified) - freshness_lifetime = max(0, min(delta / 10, 24 * 3600)) - if freshness_lifetime <= current_age: - return {} - - expires = date + freshness_lifetime - return {"expires": time.strftime(TIME_FMT, time.gmtime(expires))} - - def warning(self, resp): - return None diff --git a/cachecontrol/wrapper.py b/cachecontrol/wrapper.py index b6ee7f20..0fd604e0 100644 --- a/cachecontrol/wrapper.py +++ b/cachecontrol/wrapper.py @@ -11,7 +11,6 @@ def CacheControl( cache=None, cache_etags=True, serializer=None, - heuristic=None, controller_class=None, adapter_class=None, cacheable_methods=None, @@ -23,7 +22,6 @@ def CacheControl( cache, cache_etags=cache_etags, serializer=serializer, - heuristic=heuristic, controller_class=controller_class, cacheable_methods=cacheable_methods, ) diff --git a/docs/custom_heuristics.rst b/docs/custom_heuristics.rst deleted file mode 100644 index 2f1871fe..00000000 --- a/docs/custom_heuristics.rst +++ /dev/null @@ -1,170 +0,0 @@ -.. 
- SPDX-FileCopyrightText: SPDX-FileCopyrightText: 2015 Eric Larson - - SPDX-License-Identifier: Apache-2.0 - -=========================== - Custom Caching Strategies -=========================== - -There are times when a server provides responses that are logically -cacheable, but they lack the headers necessary to cause CacheControl -to cache the response. `The HTTP Caching Spec -`_ does allow for caching systems -to cache requests that lack caching headers. In these situations, the -caching system can use heuristics to determine an appropriate amount -of time to cache a response. - -By default, in CacheControl the decision to cache must be explicit by -default via the caching headers. When there is a need to cache -responses that wouldn't normally be cached, a user can provide a -heuristic to adjust the response in order to make it cacheable. - -For example when running a test suite against a service, caching all -responses might be helpful speeding things up while still making real -calls to the API. - - -Caching Heuristics -================== - -A cache heuristic allows specifying a caching strategy by adjusting -response headers before the response is considered for caching. - -For example, if we wanted to implement a caching strategy where every -request should be cached for a week, we can implement the strategy in -a `cachecontrol.heuristics.Heuristic`. :: - - import calendar - from cachecontrol.heuristics import BaseHeuristic - from datetime import datetime, timedelta - from email.utils import parsedate, formatdate - - - class OneWeekHeuristic(BaseHeuristic): - - def update_headers(self, response): - date = parsedate(response.headers['date']) - expires = datetime(*date[:6]) + timedelta(weeks=1) - return { - 'expires' : formatdate(calendar.timegm(expires.timetuple())), - 'cache-control' : 'public', - } - - def warning(self, response): - msg = 'Automatically cached! Response is Stale.' 
- return '110 - "%s"' % msg - - -When a response is received and we are testing for whether it is -cacheable, the heuristic is applied before checking its headers. We -also set a `warning header -`_ to communicate why -the response might be stale. The original response is passed into the -warning header in order to use its values. For example, if the -response has been expired for more than 24 hours a `Warning 113 -`_ should be used. - -In order to use this heuristic, we pass it to our `CacheControl` -constructor. :: - - - from requests import Session - from cachecontrol import CacheControl - - - sess = CacheControl(Session(), heuristic=OneWeekHeuristic()) - sess.get('http://google.com') - r = sess.get('http://google.com') - assert r.from_cache - -The google homepage specifically uses a negative expires header and -private cache control header to avoid caches. We've managed to work -around that aspect and cache the response using our heuristic. - - -Best Practices -============== - -Cache heuristics are still a new feature, which means that the support -is somewhat rudimentary. There likely to be best practices and common -heuristics that can meet the needs of many use cases. For example, in -the above heuristic it is important to change both the `expires` and -`cache-control` headers in order to make the response cacheable. - -If you do find a helpful best practice or create a helpful heuristic, -please consider sending a pull request or opening a issue. - - -Expires After -------------- - -CacheControl bundles an `ExpiresAfter` heuristic that is aimed at -making it relatively easy to automatically cache responses for a -period of time. Here is an example - -.. 
code-block:: python - - import requests - from cachecontrol import CacheControlAdapter - from cachecontrol.heuristics import ExpiresAfter - - adapter = CacheControlAdapter(heuristic=ExpiresAfter(days=1)) - - sess = requests.Session() - sess.mount('http://', adapter) - -The arguments are the same as the `datetime.timedelta` -object. `ExpiresAfter` will override or add the `Expires` header and -override or set the `Cache-Control` header to `public`. - - -Last Modified -------------- - -CacheControl bundles an `LastModified` heuristic that emulates -the behavior of Firefox, following RFC7234. Roughly stated, -this sets the expiration on a page to 10% of the difference -between the request timestamp and the last modified timestamp. -This is capped at 24-hr. - -.. code-block:: python - - import requests - from cachecontrol import CacheControlAdapter - from cachecontrol.heuristics import LastModified - - adapter = CacheControlAdapter(heuristic=LastModified()) - - sess = requests.Session() - sess.mount('http://', adapter) - - -Site Specific Heuristics ------------------------- - -If you have a specific domain that you want to apply a specific -heuristic to, use a separate adapter. :: - - import requests - from cachecontrol import CacheControlAdapter - from mypkg import MyHeuristic - - - sess = requests.Session() - sess.mount( - 'http://my.specific-domain.com', - CacheControlAdapter(heuristic=MyHeuristic()) - ) - -In this way you can limit your heuristic to a specific site. - - -Warning! -======== - -Caching is hard and while HTTP does a reasonable job defining rules -for freshness, overriding those rules should be done with -caution. Many have been frustrated by over aggressive caches, so -please carefully consider your use case before utilizing a more -aggressive heuristic. 
diff --git a/docs/index.rst b/docs/index.rst index b76ba53e..4db91599 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -86,8 +86,6 @@ Contents: usage storage - etags - custom_heuristics tips release_notes diff --git a/tests/test_expires_heuristics.py b/tests/test_expires_heuristics.py deleted file mode 100644 index 913704e7..00000000 --- a/tests/test_expires_heuristics.py +++ /dev/null @@ -1,207 +0,0 @@ -# SPDX-FileCopyrightText: 2015 Eric Larson -# -# SPDX-License-Identifier: Apache-2.0 - -import calendar -import time -from datetime import datetime -from email.utils import formatdate, parsedate -from pprint import pprint - -from mock import Mock -from requests import Session, get -from requests.structures import CaseInsensitiveDict - -from cachecontrol import CacheControl -from cachecontrol.heuristics import ( - TIME_FMT, - BaseHeuristic, - ExpiresAfter, - LastModified, - OneDayCache, -) - - -class TestHeuristicWithoutWarning(object): - def setup(self): - class NoopHeuristic(BaseHeuristic): - warning = Mock() - - def update_headers(self, resp): - return {} - - self.heuristic = NoopHeuristic() - self.sess = CacheControl(Session(), heuristic=self.heuristic) - - def test_no_header_change_means_no_warning_header(self, url): - the_url = url + "optional_cacheable_request" - resp = self.sess.get(the_url) - - assert not self.heuristic.warning.called - - -class TestHeuristicWith3xxResponse(object): - def setup(self): - class DummyHeuristic(BaseHeuristic): - def update_headers(self, resp): - return {"x-dummy-header": "foobar"} - - self.sess = CacheControl(Session(), heuristic=DummyHeuristic()) - - def test_heuristic_applies_to_301(self, url): - the_url = url + "permanent_redirect" - resp = self.sess.get(the_url) - assert "x-dummy-header" in resp.headers - - def test_heuristic_applies_to_304(self, url): - the_url = url + "conditional_get" - resp = self.sess.get(the_url) - assert "x-dummy-header" in resp.headers - - -class TestUseExpiresHeuristic(object): - def 
test_expires_heuristic_arg(self): - sess = Session() - cached_sess = CacheControl(sess, heuristic=Mock()) - assert cached_sess - - -class TestOneDayCache(object): - def setup(self): - self.sess = Session() - self.cached_sess = CacheControl(self.sess, heuristic=OneDayCache()) - - def test_cache_for_one_day(self, url): - the_url = url + "optional_cacheable_request" - r = self.sess.get(the_url) - - assert "expires" in r.headers - assert "warning" in r.headers - - pprint(dict(r.headers)) - - r = self.sess.get(the_url) - pprint(dict(r.headers)) - assert r.from_cache - - -class TestExpiresAfter(object): - def setup(self): - self.sess = Session() - self.cache_sess = CacheControl(self.sess, heuristic=ExpiresAfter(days=1)) - - def test_expires_after_one_day(self, url): - the_url = url + "no_cache" - resp = get(the_url) - assert resp.headers["cache-control"] == "no-cache" - - r = self.sess.get(the_url) - - assert "expires" in r.headers - assert "warning" in r.headers - assert r.headers["cache-control"] == "public" - - r = self.sess.get(the_url) - assert r.from_cache - - -class TestLastModified(object): - def setup(self): - self.sess = Session() - self.cached_sess = CacheControl(self.sess, heuristic=LastModified()) - - def test_last_modified(self, url): - the_url = url + "optional_cacheable_request" - r = self.sess.get(the_url) - - assert "expires" in r.headers - assert "warning" not in r.headers - - pprint(dict(r.headers)) - - r = self.sess.get(the_url) - pprint(dict(r.headers)) - assert r.from_cache - - -class DummyResponse: - def __init__(self, status, headers): - self.status = status - self.headers = CaseInsensitiveDict(headers) - - -def datetime_to_header(dt): - return formatdate(calendar.timegm(dt.timetuple())) - - -class TestModifiedUnitTests(object): - def last_modified(self, period): - return time.strftime(TIME_FMT, time.gmtime(self.time_now - period)) - - def setup(self): - self.heuristic = LastModified() - self.time_now = time.time() - day_in_seconds = 86400 - 
self.year_ago = self.last_modified(day_in_seconds * 365) - self.week_ago = self.last_modified(day_in_seconds * 7) - self.day_ago = self.last_modified(day_in_seconds) - self.now = self.last_modified(0) - - # NOTE: We pass in a negative to get a positive... Probably - # should refactor. - self.day_ahead = self.last_modified(-day_in_seconds) - - def test_no_expiry_is_inferred_when_no_last_modified_is_present(self): - assert self.heuristic.update_headers(DummyResponse(200, {})) == {} - - def test_expires_is_not_replaced_when_present(self): - resp = DummyResponse(200, {"Expires": self.day_ahead}) - assert self.heuristic.update_headers(resp) == {} - - def test_last_modified_is_used(self): - resp = DummyResponse(200, {"Date": self.now, "Last-Modified": self.week_ago}) - modified = self.heuristic.update_headers(resp) - assert ["expires"] == list(modified.keys()) - assert datetime(*parsedate(modified["expires"])[:6]) > datetime.now() - - def test_last_modified_is_not_used_when_cache_control_present(self): - resp = DummyResponse( - 200, - { - "Date": self.now, - "Last-Modified": self.week_ago, - "Cache-Control": "private", - }, - ) - assert self.heuristic.update_headers(resp) == {} - - def test_last_modified_is_not_used_when_status_is_unknown(self): - resp = DummyResponse(299, {"Date": self.now, "Last-Modified": self.week_ago}) - assert self.heuristic.update_headers(resp) == {} - - def test_last_modified_is_used_when_cache_control_public(self): - resp = DummyResponse( - 200, - { - "Date": self.now, - "Last-Modified": self.week_ago, - "Cache-Control": "public", - }, - ) - modified = self.heuristic.update_headers(resp) - assert ["expires"] == list(modified.keys()) - assert datetime(*parsedate(modified["expires"])[:6]) > datetime.now() - - def test_warning_not_added_when_response_more_recent_than_24_hours(self): - resp = DummyResponse(200, {"Date": self.now, "Last-Modified": self.week_ago}) - assert self.heuristic.warning(resp) is None - - def 
test_warning_is_not_added_when_heuristic_was_not_used(self): - resp = DummyResponse(200, {"Date": self.now, "Expires": self.day_ahead}) - assert self.heuristic.warning(resp) is None - - def test_expiry_is_no_more_that_twenty_four_hours(self): - resp = DummyResponse(200, {"Date": self.now, "Last-Modified": self.year_ago}) - modified = self.heuristic.update_headers(resp) - assert ["expires"] == list(modified.keys()) - assert self.day_ahead == modified["expires"] From 045c1fe1c70d10f849cebcd4b455014c0c6e25ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Diego=20Elio=20Petten=C3=B2?= Date: Fri, 17 Apr 2020 19:59:45 +0100 Subject: [PATCH 18/19] Remove the `cache_etags` parameter throughout. The way this is set up in the module appears to go contrary to RFC7234 by expecting it to be used for validation purposes: https://httpwg.org/specs/rfc7234.html#validation.sent --- cachecontrol/_cmd.py | 2 +- cachecontrol/adapter.py | 3 +- cachecontrol/wrapper.py | 2 - docs/etags.rst | 142 ------------------------------------- tests/test_etag.py | 151 ---------------------------------------- 5 files changed, 2 insertions(+), 298 deletions(-) delete mode 100644 docs/etags.rst delete mode 100644 tests/test_etag.py diff --git a/cachecontrol/_cmd.py b/cachecontrol/_cmd.py index 4ba24ab7..007c4a74 100644 --- a/cachecontrol/_cmd.py +++ b/cachecontrol/_cmd.py @@ -19,7 +19,7 @@ def setup_logging(): def get_session(): - adapter = CacheControlAdapter(DictCache(), cache_etags=True, serializer=None) + adapter = CacheControlAdapter(DictCache(), serializer=None) sess = requests.Session() sess.mount("http://", adapter) sess.mount("https://", adapter) diff --git a/cachecontrol/adapter.py b/cachecontrol/adapter.py index 6e688f6e..a461428c 100644 --- a/cachecontrol/adapter.py +++ b/cachecontrol/adapter.py @@ -19,7 +19,6 @@ class CacheControlAdapter(HTTPAdapter): def __init__( self, cache=None, - cache_etags=True, controller_class=None, serializer=None, cacheable_methods=None, @@ -32,7 +31,7 @@ def 
__init__( controller_factory = controller_class or CacheController self.controller = controller_factory( - self.cache, cache_etags=cache_etags, serializer=serializer + self.cache, serializer=serializer ) def send(self, request, cacheable_methods=None, **kw): diff --git a/cachecontrol/wrapper.py b/cachecontrol/wrapper.py index 0fd604e0..5fee43a7 100644 --- a/cachecontrol/wrapper.py +++ b/cachecontrol/wrapper.py @@ -9,7 +9,6 @@ def CacheControl( sess, cache=None, - cache_etags=True, serializer=None, controller_class=None, adapter_class=None, @@ -20,7 +19,6 @@ def CacheControl( adapter_class = adapter_class or CacheControlAdapter adapter = adapter_class( cache, - cache_etags=cache_etags, serializer=serializer, controller_class=controller_class, cacheable_methods=cacheable_methods, diff --git a/docs/etags.rst b/docs/etags.rst deleted file mode 100644 index 77561e5e..00000000 --- a/docs/etags.rst +++ /dev/null @@ -1,142 +0,0 @@ -.. - SPDX-FileCopyrightText: SPDX-FileCopyrightText: 2015 Eric Larson - - SPDX-License-Identifier: Apache-2.0 - -============== - ETag Support -============== - -CacheControl's support of ETags is slightly different than -httplib2. In httplib2, an ETag is considered when using a cached -response when the cache is considered stale. When a cached response is -expired and it has an ETag header, it returns a response with the -appropriate `If-None-Match` header. We'll call this behavior a **Time -Priority** cache as the ETag support only takes effect when the time has -expired. - -In CacheControl the default behavior when an ETag is sent by the -server is to cache the response. We'll refer to this pattern as a -**Equal Priority** cache as the decision to cache is either time base or -due to the presense of an ETag. - -The spec is not explicit what takes priority when caching with both -ETags and time based headers. Therefore, CacheControl supports the -different mechanisms via configuration where possible. 
- - -Turning Off Equal Priority Caching -================================== - -The danger in Equal Priority Caching is that a server that returns -ETag headers for every request may fill up your cache. You can disable -Equal Priority Caching and utilize a Time Priority algorithm like -httplib2. :: - - import requests - from cachecontrol import CacheControl - - sess = CacheControl(requests.Session(), cache_etags=False) - -This will only utilize ETags when they exist within the context of -time based caching headers. If a response has time base caching -headers that are valid along with an ETag, we will still attempt to -handle a 304 Not Modified even though the cached value as -expired. Here is a simple example. :: - - # Server response - GET /foo.html - Date: Tue, 26 Nov 2013 00:50:49 GMT - Cache-Control: max-age=3000 - ETag: JAsUYM8K - -On a subsequent request, if the cache has expired, the next request -will still include the `If-None-Match` header. The cached response -will remain in the cache awaiting the response. :: - - # Client request - GET /foo.html - If-None-Match: JAsUYM8K - -If the server returns a `304 Not Modified`, it will use the stale -cached value, updating the headers from the most recent request. :: - - # Server response - GET /foo.html - Date: Tue, 26 Nov 2013 01:30:19 GMT - Cache-Control: max-age=3000 - ETag: JAsUYM8K - -If the server returns a `200 OK`, the cache will be updated -accordingly. - - -Equal Priority Caching Benefits -=============================== - -The benefits of equal priority caching is that you have two orthogonal -means of introducing a cache. The time based cache provides an -effective way to reduce the load on requests that can be eventually -consistent. Static resource are a great example of when time based -caching is effective. - -The ETag based cache is effective for working with documents that are -larger and/or need to be correct immediately after changes. 
For -example, if you exported some data from a large database, the file -could be 10 GBs. Being able to send an ETag with this sort of request -an know the version you have locally is valid saves a ton of bandwidth -and time. - -Likewise, if you have a resource that you want to update, you can be -confident there will not be a `lost update`_ because you have local -version that is stale. - - -Endpoint Specific Caching -========================= - -It should be pointed out that there are times when an endpoint is -specifically tailored for different caching techniques. If you have a -RESTful service, there might be endpoints that are specifically meant -to be cached via time based caching techniques where as other -endpoints should focus on using ETags. In this situation it is -recommended that you use the `CacheControlAdapter` directly. :: - - import requests - from cachecontrol import CacheControlAdapter - from cachecontrol.caches import RedisCache - - # using django for an idea on where you might get a - # username/password. - from django.conf import settings - - # a function to return a redis connection all the instances of the - # app may use. this allows updates to the API (ie PUT) to invalidate - # the cache for other users. - from myapp.db import redis_connection - - - # create our session - client = sess.Session(auth=(settings.user, settings.password)) - - # we have a gettext like endpoint. this doesn't get updated very - # often so a time based cache is a helpful way to reduce many small - # requests. - client.mount('http://myapi.foo.com/gettext/', - CacheControlAdapter(cache_etags=False)) - - - # here we have user profile endpoint that lets us update information - # about users. we need this to be consistent immediately after a user - # updates some information because another node might handle the - # request. It uses the global redis cache to coordinate the cache and - # uses the equal priority caching to be sure etags are used by default. 
- redis_cache = RedisCache(redis_connection()) - client.mount('http://myapi.foo.com/user_profiles/', - CacheControlAdapter(cache=redis_cache)) - -Hopefully this more indepth example reveals how to configure a -`requests.Session` to better utilize ETag based caching vs. Time -Priority Caching. - -.. _lost update: http://www.w3.org/1999/04/Editing/ diff --git a/tests/test_etag.py b/tests/test_etag.py deleted file mode 100644 index 2b627763..00000000 --- a/tests/test_etag.py +++ /dev/null @@ -1,151 +0,0 @@ -# SPDX-FileCopyrightText: 2015 Eric Larson -# -# SPDX-License-Identifier: Apache-2.0 - -import pytest -import requests -from mock import Mock, patch - -from cachecontrol import CacheControl -from cachecontrol.cache import DictCache -from six.moves.urllib.parse import urljoin - - -class NullSerializer(object): - def dumps(self, request, response, body=None): - return response - - def loads(self, request, data): - if data and getattr(data, "chunked", False): - data.chunked = False - return data - - -class TestETag(object): - """Test our equal priority caching with ETags - - Equal Priority Caching is a term I've defined to describe when - ETags are cached orthgonally from Time Based Caching. - """ - - @pytest.fixture() - def sess(self, url): - self.etag_url = urljoin(url, "/etag") - self.update_etag_url = urljoin(url, "/update_etag") - self.cache = DictCache() - sess = CacheControl( - requests.Session(), cache=self.cache, serializer=NullSerializer() - ) - yield sess - - # closing session object - sess.close() - - def test_etags_get_example(self, sess, server): - """RFC 2616 14.26 - - The If-None-Match request-header field is used with a method to make - it conditional. A client that has one or more entities previously - obtained from the resource can verify that none of those entities - is current by including a list of their associated entity tags in - the If-None-Match header field. 
The purpose of this feature is to - allow efficient updates of cached information with a minimum amount - of transaction overhead - - If any of the entity tags match the entity tag of the entity that - would have been returned in the response to a similar GET request - (without the If-None-Match header) on that resource, [...] then - the server MUST NOT perform the requested method, [...]. Instead, if - the request method was GET or HEAD, the server SHOULD respond with - a 304 (Not Modified) response, including the cache-related header - fields (particularly ETag) of one of the entities that matched. - - (Paraphrased) A server may provide an ETag header on a response. On - subsequent queries, the client may reference the value of this Etag - header in an If-None-Match header; on receiving such a header, the - server can check whether the entity at that URL has changed from the - clients last version, and if not, it can return a 304 to indicate - the client can use it's current representation. - """ - r = sess.get(self.etag_url) - - # make sure we cached it - assert self.cache.get(self.etag_url) == r.raw - - # make the same request - resp = sess.get(self.etag_url) - assert resp.raw == r.raw - assert resp.from_cache - - # tell the server to change the etags of the response - sess.get(self.update_etag_url) - - resp = sess.get(self.etag_url) - assert resp != r - assert not resp.from_cache - - # Make sure we updated our cache with the new etag'd response. - assert self.cache.get(self.etag_url) == resp.raw - - -class TestDisabledETags(object): - """Test our use of ETags when the response is stale and the - response has an ETag. 
- """ - - @pytest.fixture() - def sess(self, server, url): - self.etag_url = urljoin(url, "/etag") - self.update_etag_url = urljoin(url, "/update_etag") - self.cache = DictCache() - sess = CacheControl( - requests.Session(), - cache=self.cache, - cache_etags=False, - serializer=NullSerializer(), - ) - return sess - - def test_expired_etags_if_none_match_response(self, sess): - """Make sure an expired response that contains an ETag uses - the If-None-Match header. - """ - # get our response - r = sess.get(self.etag_url) - - # expire our request by changing the date. Our test endpoint - # doesn't provide time base caching headers, so we add them - # here in order to expire the request. - r.headers["Date"] = "Tue, 26 Nov 2012 00:50:49 GMT" - self.cache.set(self.etag_url, r.raw) - - r = sess.get(self.etag_url) - assert r.from_cache - assert "if-none-match" in r.request.headers - assert r.status_code == 200 - - -class TestReleaseConnection(object): - """ - On 304s we still make a request using our connection pool, yet - we do not call the parent adapter, which releases the connection - back to the pool. This test ensures that when the parent `get` - method is not called we consume the response (which should be - empty according to the HTTP spec) and release the connection. 
- """ - - def test_not_modified_releases_connection(self, server, url): - sess = CacheControl(requests.Session()) - etag_url = urljoin(url, "/etag") - sess.get(etag_url) - - resp = Mock(status=304, headers={}) - - # This is how the urllib3 response is created in - # requests.adapters - response_mod = "requests.adapters.HTTPResponse.from_httplib" - - with patch(response_mod, Mock(return_value=resp)): - sess.get(etag_url) - assert resp.read.called - assert resp.release_conn.called From 9ca7d21104a01ac07a02bdfaa94a13c73644077f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Diego=20Elio=20Petten=C3=B2?= Date: Fri, 17 Apr 2020 21:23:24 +0100 Subject: [PATCH 19/19] Expand and split the decision methods into their own module. This includes a "policy" module that includes functions to answer policy decisions on whether to use the cache, whether to cache a certain response, and whether to consider a response fresh. Note that this relies on a full-fledged Cache-Control parser that uses the abnf module, which is Python 3 only. 
--- cachecontrol/adapter.py | 47 ++--- cachecontrol/controller.py | 319 ++++++++-------------------- cachecontrol/headers_parser.py | 81 +++++++ cachecontrol/policy.py | 372 +++++++++++++++++++++++++++++++++ pyproject.toml | 2 +- setup.py | 2 +- tests/test_cache_control.py | 81 ++++--- tests/test_headers_parser.py | 35 ++++ 8 files changed, 647 insertions(+), 292 deletions(-) create mode 100644 cachecontrol/headers_parser.py create mode 100644 cachecontrol/policy.py create mode 100644 tests/test_headers_parser.py diff --git a/cachecontrol/adapter.py b/cachecontrol/adapter.py index a461428c..4a481336 100644 --- a/cachecontrol/adapter.py +++ b/cachecontrol/adapter.py @@ -3,19 +3,21 @@ # SPDX-License-Identifier: Apache-2.0 import functools +import logging import types import zlib from requests.adapters import HTTPAdapter from .cache import DictCache -from .controller import PERMANENT_REDIRECT_STATUSES, CacheController +from .controller import CacheController from .filewrapper import CallbackFileWrapper +from .policy import use_cache_for_request +logger = logging.getLogger(__name__) -class CacheControlAdapter(HTTPAdapter): - invalidating_methods = {"PUT", "DELETE"} +class CacheControlAdapter(HTTPAdapter): def __init__( self, cache=None, @@ -27,11 +29,11 @@ def __init__( ): super(CacheControlAdapter, self).__init__(*args, **kw) self.cache = DictCache() if cache is None else cache - self.cacheable_methods = cacheable_methods or ("GET",) + self.cacheable_methods = cacheable_methods controller_factory = controller_class or CacheController self.controller = controller_factory( - self.cache, serializer=serializer + self.cache, serializer=serializer, cacheable_methods=cacheable_methods, ) def send(self, request, cacheable_methods=None, **kw): @@ -39,17 +41,16 @@ def send(self, request, cacheable_methods=None, **kw): Send a request. Use the request information to see if it exists in the cache and cache the response if we need to and can. 
""" - cacheable = cacheable_methods or self.cacheable_methods - if request.method in cacheable: - try: - cached_response = self.controller.cached_request(request) - except zlib.error: - cached_response = None - if cached_response: - return self.build_response(request, cached_response, from_cache=True) - - # check for etags and add headers if appropriate - request.headers.update(self.controller.conditional_headers(request)) + try: + cached_response = self.controller.cached_request( + request, cacheable_methods=cacheable_methods + ) + except zlib.error: + cached_response = None + if cached_response: + return self.build_response(request, cached_response, from_cache=True) + + self.controller.add_conditional_headers(request) resp = super(CacheControlAdapter, self).send(request, **kw) @@ -64,9 +65,12 @@ def build_response( This will end up calling send and returning a potentially cached response """ - cacheable = cacheable_methods or self.cacheable_methods - if not from_cache and request.method in cacheable: + if not from_cache and use_cache_for_request( + request, cacheable_methods=cacheable_methods + ): if response.status == 304: + logger.debug("Received a 'Not Modified' response.") + # We must have sent an ETag request. This could mean # that we've been expired already or that we simply # have an etag. In either case, we want to try and @@ -87,9 +91,6 @@ def build_response( response = cached_response - # We always cache the 301 responses - elif int(response.status) in PERMANENT_REDIRECT_STATUSES: - self.controller.cache_response(request, response) else: # Wrap the response file with a wrapper that will cache the # response when the stream has been consumed. @@ -114,9 +115,7 @@ def _update_chunk_length(self): resp = super(CacheControlAdapter, self).build_response(request, response) # See if we should invalidate the cache. 
- if request.method in self.invalidating_methods and resp.ok: - cache_url = self.controller.cache_url(request.url) - self.cache.delete(cache_url) + self.controller.maybe_invalidate_cache(request, response) # Give the request a from_cache attr to let people use it resp.from_cache = from_cache diff --git a/cachecontrol/controller.py b/cachecontrol/controller.py index 8a2fee50..6034c2ae 100644 --- a/cachecontrol/controller.py +++ b/cachecontrol/controller.py @@ -9,19 +9,22 @@ import logging import re import time -from email.utils import parsedate_tz from requests.structures import CaseInsensitiveDict from .cache import DictCache +from .policy import ( + can_cache_response, + is_invalidating_cache, + is_response_fresh, + use_cache_for_request, +) from .serialize import Serializer logger = logging.getLogger(__name__) URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?") -PERMANENT_REDIRECT_STATUSES = (301, 308) - def parse_uri(uri): """Parses a URI using the regex given in Appendix B of RFC 3986. 
@@ -37,12 +40,12 @@ class CacheController(object): """ def __init__( - self, cache=None, cache_etags=True, serializer=None, status_codes=None + self, cache=None, serializer=None, status_codes=None, cacheable_methods=None, ): self.cache = DictCache() if cache is None else cache - self.cache_etags = cache_etags self.serializer = serializer or Serializer() - self.cacheable_status_codes = status_codes or (200, 203, 300, 301, 308) + self.cacheable_status_codes = status_codes + self.cacheable_methods = cacheable_methods @classmethod def _urlnorm(cls, uri): @@ -68,77 +71,16 @@ def _urlnorm(cls, uri): def cache_url(cls, uri): return cls._urlnorm(uri) - def parse_cache_control(self, headers): - known_directives = { - # https://tools.ietf.org/html/rfc7234#section-5.2 - "max-age": (int, True), - "max-stale": (int, False), - "min-fresh": (int, True), - "no-cache": (None, False), - "no-store": (None, False), - "no-transform": (None, False), - "only-if-cached": (None, False), - "must-revalidate": (None, False), - "public": (None, False), - "private": (None, False), - "proxy-revalidate": (None, False), - "s-maxage": (int, True), - } - - cc_headers = headers.get("cache-control", headers.get("Cache-Control", "")) - - retval = {} - - for cc_directive in cc_headers.split(","): - if not cc_directive.strip(): - continue - - parts = cc_directive.split("=", 1) - directive = parts[0].strip() - - try: - typ, required = known_directives[directive] - except KeyError: - logger.debug("Ignoring unknown cache-control directive: %s", directive) - continue - - if not typ or not required: - retval[directive] = None - if typ: - try: - retval[directive] = typ(parts[1].strip()) - except IndexError: - if required: - logger.debug( - "Missing value for cache-control " "directive: %s", - directive, - ) - except ValueError: - logger.debug( - "Invalid value for cache-control directive " "%s, must be %s", - directive, - typ.__name__, - ) - - return retval - - def cached_request(self, request): + def 
cached_request(self, request, cacheable_methods=None): """ Return a cached response if it exists in the cache, otherwise return False. """ - cache_url = self.cache_url(request.url) - logger.debug('Looking up "%s" in the cache', cache_url) - cc = self.parse_cache_control(request.headers) - - # Bail out if the request insists on fresh data - if "no-cache" in cc: - logger.debug('Request header has "no-cache", cache bypassed') + if not use_cache_for_request(request, cacheable_methods=cacheable_methods): return False - if "max-age" in cc and cc["max-age"] == 0: - logger.debug('Request header has "max_age" as 0, cache bypassed') - return False + cache_url = self.cache_url(request.url) + logger.debug('Looking up "%s" in the cache', cache_url) # Request allows serving from the cache, let's see if we find something cache_data = self.cache.get(cache_url) @@ -152,103 +94,32 @@ def cached_request(self, request): logger.warning("Cache entry deserialization failed, entry ignored") return False - # If we have a cached permanent redirect, return it immediately. We - # don't need to test our response for other headers b/c it is - # intrinsically "cacheable" as it is Permanent. - # - # See: - # https://tools.ietf.org/html/rfc7231#section-6.4.2 - # - # Client can try to refresh the value by repeating the request - # with cache busting headers as usual (ie no-cache). - if int(resp.status) in PERMANENT_REDIRECT_STATUSES: - msg = ( - "Returning cached permanent redirect response " - "(ignoring date and etag information)" - ) - logger.debug(msg) - return resp - - headers = CaseInsensitiveDict(resp.headers) - if not headers or "date" not in headers: - if "etag" not in headers: - # Without date or etag, the cached response can never be used - # and should be deleted. 
- logger.debug("Purging cached response: no date or etag") - self.cache.delete(cache_url) - logger.debug("Ignoring cached response: no date") + try: + if is_response_fresh(request, resp): + return resp + except Exception: return False - now = time.time() - date = calendar.timegm(parsedate_tz(headers["date"])) - current_age = max(0, now - date) - logger.debug("Current age based on date: %i", current_age) - - # TODO: There is an assumption that the result will be a - # urllib3 response object. This may not be best since we - # could probably avoid instantiating or constructing the - # response until we know we need it. - resp_cc = self.parse_cache_control(headers) - - # determine freshness - freshness_lifetime = 0 - - # Check the max-age pragma in the cache control header - if "max-age" in resp_cc: - freshness_lifetime = resp_cc["max-age"] - logger.debug("Freshness lifetime from max-age: %i", freshness_lifetime) - - # If there isn't a max-age, check for an expires header - elif "expires" in headers: - expires = parsedate_tz(headers["expires"]) - if expires is not None: - expire_time = calendar.timegm(expires) - date - freshness_lifetime = max(0, expire_time) - logger.debug("Freshness lifetime from expires: %i", freshness_lifetime) - - # Determine if we are setting freshness limit in the - # request. Note, this overrides what was in the response. - if "max-age" in cc: - freshness_lifetime = cc["max-age"] - logger.debug( - "Freshness lifetime from request max-age: %i", freshness_lifetime - ) - - if "min-fresh" in cc: - min_fresh = cc["min-fresh"] - # adjust our current age by our min fresh - current_age += min_fresh - logger.debug("Adjusted current age from min-fresh: %i", current_age) - - # Return entry if it is fresh enough - if freshness_lifetime > current_age: - logger.debug('The response is "fresh", returning cached response') - logger.debug("%i > %i", freshness_lifetime, current_age) - return resp - - # we're not fresh. 
If we don't have an Etag, clear it out - if "etag" not in headers: - logger.debug('The cached response is "stale" with no etag, purging') - self.cache.delete(cache_url) - # return the original handler return False - def conditional_headers(self, request): + def add_conditional_headers(self, request): cache_url = self.cache_url(request.url) + logger.debug("Applying conditional headers to request for %s", cache_url) resp = self.serializer.loads(request, self.cache.get(cache_url)) - new_headers = {} if resp: - headers = CaseInsensitiveDict(resp.headers) - - if "etag" in headers: - new_headers["If-None-Match"] = headers["ETag"] + cached_headers = CaseInsensitiveDict(resp.headers) + etag = cached_headers.get("etag", None) + last_modified = cached_headers.get("last-modified", None) - if "last-modified" in headers: - new_headers["If-Modified-Since"] = headers["Last-Modified"] + if etag is not None: + logger.debug("Adding If-None-Match: %s", etag) + request.headers["if-none-match"] = etag - return new_headers + if last_modified is not None: + logger.debug("Adding If-Modified-Since: %s", last_modified) + request.headers["if-modified-since"] = last_modified def cache_response(self, request, response, body=None, status_codes=None): """ @@ -259,91 +130,39 @@ def cache_response(self, request, response, body=None, status_codes=None): # From httplib2: Don't cache 206's since we aren't going to # handle byte range requests cacheable_status_codes = status_codes or self.cacheable_status_codes - if response.status not in cacheable_status_codes: - logger.debug( - "Status code %s not in %s", response.status, cacheable_status_codes - ) - return - - response_headers = CaseInsensitiveDict(response.headers) # If we've been given a body, our response has a Content-Length, that - # Content-Length is valid then we can check to see if the body we've - # been given matches the expected size, and if it doesn't we'll just - # skip trying to cache it. 
- if ( - body is not None - and "content-length" in response_headers - and response_headers["content-length"].isdigit() - and int(response_headers["content-length"]) != len(body) - ): - return - - cc_req = self.parse_cache_control(request.headers) - cc = self.parse_cache_control(response_headers) - - cache_url = self.cache_url(request.url) - logger.debug('Updating cache with response from "%s"', cache_url) - - # Delete it from the cache if we happen to have it stored there - no_store = False - if "no-store" in cc: - no_store = True - logger.debug('Response header has "no-store"') - if "no-store" in cc_req: - no_store = True - logger.debug('Request header has "no-store"') - if no_store and self.cache.get(cache_url): - logger.debug('Purging existing cache entry to honor "no-store"') - self.cache.delete(cache_url) - if no_store: - return + # Content-Length is valid then we can check to see if the body we've been given + # matches the expected size, and if it doesn't we'll just skip trying to cache + # it. + if body is not None: + response_headers = CaseInsensitiveDict(response.headers) + content_length = response_headers.get("content-length", None) + try: + if int(content_length) != len(body): + logger.debug("Not caching response with invalid Content-Length") + return + except (ValueError, TypeError): + pass + + try: + if not use_cache_for_request(request): + logger.warning( + "Trying to cache the response to a request skipping cache." + ) + return - # https://tools.ietf.org/html/rfc7234#section-4.1: - # A Vary header field-value of "*" always fails to match. - # Storing such a response leads to a deserialization warning - # during cache lookup and is not allowed to ever be served, - # so storing it can be avoided. - if "*" in response_headers.get("vary", ""): - logger.debug('Response header has "Vary: *"') + can_cache = can_cache_response(response) + except Exception: + logger.debug( + "Exception occurred while verifying whether response can be cached, not caching." 
+ ) return - # If we've been given an etag, then keep the response - if self.cache_etags and "etag" in response_headers: - logger.debug("Caching due to etag") + if can_cache: + cache_url = self.cache_url(request.url) self.cache.set(cache_url, self.serializer.dumps(request, response, body)) - # Add to the cache any permanent redirects. We do this before looking - # that the Date headers. - elif int(response.status) in PERMANENT_REDIRECT_STATUSES: - logger.debug("Caching permanent redirect") - self.cache.set(cache_url, self.serializer.dumps(request, response, b"")) - - # Add to the cache if the response headers demand it. If there - # is no date header then we can't do anything about expiring - # the cache. - elif "date" not in response_headers: - logger.debug("No date header, expiration cannot be set.") - return - else: - # cache when there is a max-age > 0 - if "max-age" in cc and cc["max-age"] > 0: - logger.debug("Caching b/c date exists and max-age > 0") - self.cache.set( - cache_url, self.serializer.dumps(request, response, body) - ) - - # If the request can expire, it means we should cache it - # in the meantime. - elif "expires" in response_headers: - if response_headers["expires"]: - logger.debug("Caching b/c of expires header") - self.cache.set( - cache_url, self.serializer.dumps(request, response, body) - ) - else: - logger.debug("No combination of headers to cache.") - def update_cached_response(self, request, response): """On a 304 we will get a new set of headers that we want to update our cached value with, assuming we have one. @@ -351,6 +170,19 @@ def update_cached_response(self, request, response): This should only ever be called when we've sent an ETag and gotten a 304 as the response. """ + + # Special case: we can cache a 304 code, but only because we're trying + # to cache the new response. 
+ try: + if not can_cache_response(response, cacheable_status_codes={200, 304}): + logger.debug("Not updating cached response.") + return response + except Exception: + logger.debug( + "Exception occurred while verifying whether cached response can be updated, not updating." + ) + return response + cache_url = self.cache_url(request.url) cached_response = self.serializer.loads(request, self.cache.get(cache_url)) @@ -384,3 +216,20 @@ def update_cached_response(self, request, response): self.cache.set(cache_url, self.serializer.dumps(request, cached_response, body)) return cached_response + + def maybe_invalidate_cache(self, request, response): + try: + invalidate_cache = is_invalidating_cache(request, response) + except Exception: + logger.debug( + "Exception occurred while verifying whether cache should be invalidated. Invalidating for safety." + ) + invalidate_cache = True + + if invalidate_cache: + # TODO: https://httpwg.org/specs/rfc7234.html#invalidation says that the + # cache MUST invalidate Location and Content-Location URLs if present, _if_ + # they are in the same host. 
+ cache_url = self.cache_url(request.url) + logger.debug("Invalidating cache for %s", cache_url) + self.cache.delete(cache_url) diff --git a/cachecontrol/headers_parser.py b/cachecontrol/headers_parser.py new file mode 100644 index 00000000..79d08976 --- /dev/null +++ b/cachecontrol/headers_parser.py @@ -0,0 +1,81 @@ +# SPDX-FileCopyrightText: © 2019 The cachecontrol Authors +# SPDX-License-Identifier: Apache-2.0 + +import functools + +from abnf.grammars import rfc7234 +from abnf.parser import NodeVisitor, Rule +from requests.structures import CaseInsensitiveDict + + +@functools.lru_cache(maxsize=1) +def _get_cache_control_rule(): + return rfc7234.Rule("Cache-Control") + + +@functools.lru_cache(maxsize=1) +def _get_pragma_rule() -> Rule: + return rfc7234.Rule("Pragma") + + +class DirectivesVisitor(NodeVisitor): + def __init__(self): + super().__init__() + self.directives = [] + + def visit(self, node): + super().visit(node) + + def visit_cache_control(self, node): + for child_node in node.children: + self.visit(child_node) + + def visit_cache_directive(self, node): + self.directives.append(node.value) + + def visit_pragma(self, node): + for child_node in node.children: + self.visit(child_node) + + def visit_pragma_directive(self, node): + self.directives.append(node.value) + + +def _tokenize_directives(header_value, rule): + # We allow a bit of leeway from the RFC, allowing spaces before and after the header + # value, by stripping the input string. + header_value = header_value.strip() + + if not header_value: + return {} + + header_node = rule.parse_all(header_value) + header_visitor = DirectivesVisitor() + header_visitor.visit(header_node) + + directives_dict = CaseInsensitiveDict() + for directive in header_visitor.directives: + if "=" in directive: + directive, argument = directive.split("=", 1) + # RFC7234 requires recognizing quoted-string forms even where they are not + # recommended. Thankfully the parser will reject invalid half-quoted + # strings. 
+            if argument[0] == '"':
+                argument = argument[1:-1]
+        else:
+            argument = None
+
+        if directive in directives_dict:
+            pass  # Duplicate directive: first occurrence wins (this module defines no logger).
+        else:
+            directives_dict[directive] = argument
+
+    return directives_dict
+
+
+def tokenize_cache_control(header_value):
+    return _tokenize_directives(header_value, _get_cache_control_rule())
+
+
+def tokenize_pragma(header_value):
+    return _tokenize_directives(header_value, _get_pragma_rule())
diff --git a/cachecontrol/policy.py b/cachecontrol/policy.py
new file mode 100644
index 00000000..e21e2f16
--- /dev/null
+++ b/cachecontrol/policy.py
@@ -0,0 +1,372 @@
+# SPDX-FileCopyrightText: © 2019 The cachecontrol Authors
+# SPDX-License-Identifier: Apache-2.0
+"""Implement policy decision on caching requests and responses.
+
+Providing the right decision on whether to cache or not cache a response, given a
+request, is a complex matter that is governed by RFC 7234:
+https://httpwg.org/specs/rfc7234.html
+
+This module focuses on providing the answer to the following questions:
+ - Can this request be answered from the cache?
+ - Is this cached response still valid?
+ - Can this new response be cached?
+"""
+
+import datetime
+import http
+import logging
+from email.utils import parsedate_to_datetime
+
+import pytz
+from requests.structures import CaseInsensitiveDict
+from six.moves import http_client
+
+from .headers_parser import tokenize_cache_control, tokenize_pragma
+
+logger = logging.getLogger(__name__)
+
+
+# While RFC7234 allows caching methods other than GET, for now focus on caching the safe
+# requests. There's a few other differences for methods such as HEAD, that can
+# invalidate a cache, and can in some cases generate a new cache entry (e.g. permanent
+# redirects), but not cache the whole content.
+_CACHEABLE_METHODS = {
+    "GET",
+}
+
+# We want to explicitly allow the safe methods, rather than disallow the invalidating
+# one, as the RFC is clear that "A cache MUST invalidate ... when it receives a
+# non-error response to a request with a method whose safety is unknown."
+_SAFE_METHODS = {
+    "GET",
+    "HEAD",
+}
+
+_CACHEABLE_STATUS_CODES = {
+    http_client.OK,
+    http_client.NON_AUTHORITATIVE_INFORMATION,
+    http_client.MULTIPLE_CHOICES,
+    http_client.MOVED_PERMANENTLY,
+    308,  # PERMANENT_REDIRECT
+}
+
+_PERMANENT_REDIRECT_CODES = {
+    http_client.MOVED_PERMANENTLY,
+    308,  # PERMANENT_REDIRECT
+}
+
+
+def use_cache_for_request(request, cacheable_methods=None):
+    """Decide whether the provided request can be answered from cache.
+
+    Args:
+        request: The HTTPRequest object that is yet to be sent to the server.
+        cacheable_methods: The list of methods to consider cacheable.
+
+    Returns:
+        False if the request is explicitly asking not to answer from a cached response,
+        True otherwise.
+    """
+
+    if cacheable_methods is None:
+        cacheable_methods = _CACHEABLE_METHODS
+
+    if request.method not in cacheable_methods:
+        logger.debug("Ignoring cache: method %r is not cacheable", request.method)
+        return False
+
+    request_headers = CaseInsensitiveDict(request.headers)
+    req_cache_control_header = request_headers.get("cache-control", "")
+    req_cache_control = tokenize_cache_control(req_cache_control_header)
+
+    if "no-store" in req_cache_control:
+        logger.debug(
+            "Ignoring cache: request Cache-Control includes 'no-store' directive: %r",
+            req_cache_control_header,
+        )
+        return False
+
+    if "authorization" in request_headers:
+        logger.debug("Ignoring cache: request includes 'Authorization' header")
+        return False
+
+    return True
+
+
+def _response_expiration_datetime(
+    response, request_datetime=None, shared_cache=False, max_age_override=None
+):
+    """Calculate the expiration datetime for a given response.
+
+    Args:
+        response: The HTTPResponse object to calculate the expiration of.
+        request_datetime: Optional datetime object to assume the request was sent at.
+            Only used if the response does not have a Date header.
+        shared_cache: Whether to consider the cache a shared cache per RFC7234.
+        max_age_override: If provided, this value in seconds will be used as the max age
+            for the response.
+
+    Returns:
+        A datetime.datetime object representing the moment the request is considered
+        expired.
+    """
+    response_headers = CaseInsensitiveDict(response.headers)
+    resp_cache_control_header = response_headers.get("cache-control", "")
+    resp_cache_control = tokenize_cache_control(resp_cache_control_header)
+
+    if "date" in response_headers:
+        response_datetime = parsedate_to_datetime(response_headers["date"])
+    elif request_datetime:
+        logger.debug("Missing Date header from response, assuming %s", request_datetime)
+        response_datetime = request_datetime
+    else:
+        logger.debug("Missing response timestamp, no expiration assumed")
+        return None
+
+    # https://httpwg.org/specs/rfc7234.html#header.age
+    if "age" in response_headers:
+        response_datetime += datetime.timedelta(seconds=int(response_headers["age"]))
+
+    if max_age_override:
+        max_age = max_age_override
+    else:
+        max_age = resp_cache_control.get("max-age", None)
+        # https://httpwg.org/specs/rfc7234.html#cache-response-directive.s-maxage
+        if shared_cache:
+            max_age = resp_cache_control.get("s-maxage", max_age)
+
+    # If any max_age directive or override is present, those control the expiration.
+ if max_age is not None: + expiration = response_datetime + datetime.timedelta(seconds=int(max_age)) + logger.debug("Expiration time: %s (max-age / s-maxage directives)", expiration) + elif "expires" in response_headers: + expiration = parsedate_to_datetime(response_headers["expires"]) + logger.debug("Expiration time: %s (Expires header)", expiration) + elif "last-modified" in response_headers: + # https://httpwg.org/specs/rfc7234.html#heuristic.freshness + # + # There's no expiration defined as part of the response, we need to + # heuristically define an expiration for the request. + last_modified_datetime = parsedate_to_datetime( + response_headers["last-modified"] + ) + modification_delta = response_datetime - last_modified_datetime + expiration = response_datetime + modification_delta * 0.1 # 10% + logger.debug("Expiration time: %s (heuristic)", expiration) + else: + logger.debug("Unable to identify a valid expiration time") + return None + + return expiration + + +def is_response_fresh(request, cached_response, shared_cache=False): + """Decide whether the cached response is still fresh enough for the request. + + Note that this depends on the request: a cached response might still not be expired, + but not fresh enough for the provided request. + + Args: + request: The HTTPRequest object that is yet to be sent to the server. + cached_response: The HTTPResponse object stored in the cache to evaluate. + shared_cache: Whether to consider the cache a shared cache per RFC7234. + + Returns: + True if the cached response is still fresh enough for the request, False + otherwise. 
+ """ + response_headers = CaseInsensitiveDict(cached_response.headers) + resp_cache_control_header = response_headers.get("cache-control", "") + resp_cache_control = tokenize_cache_control(resp_cache_control_header) + + request_headers = CaseInsensitiveDict(request.headers) + req_cache_control_header = request_headers.get("cache-control", "") + req_cache_control = tokenize_cache_control(req_cache_control_header) + + if "no-cache" in req_cache_control: + logger.debug( + "Cached response is not fresh: request Cache-Control includes 'no-cache' directive: %r", + req_cache_control_header, + ) + return False + + # https://httpwg.org/specs/rfc7234.html#header.pragma + # + # The Pragma header is only specified for requests, not responses, and is ignored + # if Cache-Control is provided. + if "cache-control" not in request_headers: + pragma = tokenize_pragma(request_headers.get("pragma", "")) + if "no-cache" in pragma: + logger.debug( + "Cached response is not fresh: request includes a 'Pragma: no-cache' header" + ) + return False + + if "no-cache" in resp_cache_control: + logger.debug( + "Cached response is not fresh: response Cache-Control includes 'no-cache' directive: %r", + resp_cache_control_header, + ) + return False + + if "must-revalidate" in resp_cache_control: + logger.debug( + "Cached response is not fresh: response Cache-Control includes 'must-revalidate' directive: %r", + resp_cache_control_header, + ) + return False + + if req_cache_control.get("max-age", None) == "0": + logger.debug( + "Cache response is not fresh: request Cache-Control includes 'max-age=0' directive: %r", + req_cache_control_header, + ) + return False + + # If the cached response is a permanent redirect, consider it always fresh (minus + # the Cache-Control directives above), since it does not require an explicit + # expiration. 
+    if int(cached_response.status) in _PERMANENT_REDIRECT_CODES:
+        logger.debug("Cached response is fresh: permanent redirect")
+        return True
+
+    if "max-age" in req_cache_control:
+        max_age_override = int(req_cache_control["max-age"])
+    else:
+        max_age_override = None
+
+    expiration = _response_expiration_datetime(
+        cached_response, max_age_override=max_age_override, shared_cache=shared_cache
+    )
+    if not expiration:
+        logger.debug(
+            "Cached response is not fresh: unable to identify a valid expiration time."
+        )
+        return False
+
+    # https://httpwg.org/specs/rfc7234.html#cache-request-directive.max-stale
+    #
+    # If the request is allowing stale response, extend the expiration by how much it
+    # was required.  A bare max-stale (no argument) tokenizes to None; treat it as 0.
+    expiration += datetime.timedelta(seconds=int(req_cache_control.get("max-stale") or 0))
+
+    # https://httpwg.org/specs/rfc7234.html#cache-request-directive.min-fresh
+    #
+    # If the request is asking for a response that is valid for longer, include that
+    # in the freshness horizon.
+    freshness_horizon = pytz.UTC.localize(datetime.datetime.utcnow())
+    freshness_horizon += datetime.timedelta(seconds=int(req_cache_control.get("min-fresh") or 0))
+
+    if freshness_horizon > expiration:
+        logger.debug("Cached response is not fresh: expiration time already passed.")
+        return False
+
+    logger.debug("Cached response is fresh.")
+    return True
+
+
+def can_cache_response(response, cacheable_status_codes=None, shared_cache=False):
+    """Decide whether the provided response can be stored in cache.
+
+    Args:
+        response: The *new* HTTPResponse object that was returned by the server.
+        cacheable_status_codes: A container of integer status codes that are considered
+            cacheable.
+        shared_cache: Whether to consider the cache a shared cache per RFC7234.
+
+    Returns:
+        True if the received response is cacheable. False otherwise.
+    """
+
+    if cacheable_status_codes is None:
+        cacheable_status_codes = _CACHEABLE_STATUS_CODES
+
+    # Don't cache errors, temporary statuses, or non-OK return codes.
+ if int(response.status) not in cacheable_status_codes: + logger.debug("Not caching: status code %r is not cacheable", response.status) + return False + + response_headers = CaseInsensitiveDict(response.headers) + resp_cache_control_header = response_headers.get("cache-control", "") + resp_cache_control = tokenize_cache_control(resp_cache_control_header) + + if "no-store" in resp_cache_control or "private" in resp_cache_control: + logger.debug( + "Not caching: response Cache-Control includes 'no-store' or 'private' directives: %r", + resp_cache_control_header, + ) + return False + + # https://httpwg.org/specs/rfc7234.html#caching.negotiated.responses + # + # A Vary header field-value of "*" always fails to match. Storing such a response + # leads to a deserialization warning during cache lookup and is not allowed to + # ever be served, so storing it can be avoided. + if "*" in response_headers.get("vary", ""): + logger.debug("Not caching: response contains 'Vary: *'") + return False + + if int(response.status) in _PERMANENT_REDIRECT_CODES: + logger.debug("Caching: permanent redirect") + return True + + now = pytz.UTC.localize(datetime.datetime.utcnow()) + if "date" in response_headers: + response_datetime = parsedate_to_datetime(response_headers["date"]) + # https://httpwg.org/specs/rfc7231.html#header.date + # + # Date is supposed to always be in GMT, but it's not always returned + # correctly. If no timezone data was provided, assume GMT. 
+        if not response_datetime.tzinfo:
+            response_datetime = pytz.UTC.localize(response_datetime)
+    else:
+        logger.debug("Missing Date header from response, assuming current.")
+        response_datetime = now
+
+    expiration = _response_expiration_datetime(
+        response, request_datetime=now, shared_cache=shared_cache
+    )
+
+    if not expiration:
+        logger.debug("Not caching: unable to identify a valid expiration time")
+        return False
+
+    if now > expiration:
+        logger.debug("Not caching: expiration time already passed.")
+        return False
+
+    logger.debug("Caching: no reason not to")
+    return True
+
+
+def is_invalidating_cache(request, new_response):
+    """Decide whether a given request/response pair invalidates the cached values.
+
+    Args:
+        request: The HTTPRequest sent to the server (does not need to have been sent
+            through the cache.)
+        response: The HTTPResponse received from the server (not the one in cache.)
+
+    Returns:
+        True if the received response should invalidate the cached response, otherwise
+        False.
+    """
+
+    response_status = int(new_response.status)
+    # https://httpwg.org/specs/rfc7234.html#invalidation
+    #
+    # Non-error response is defined as a 2xx or 3xx status, everything else can be
+    # considered an error and ignore it.
+ if not 200 <= response_status <= 399: + logger.debug( + "Not invalidating: response contains an error status: %r", + new_response.status, + ) + return False + + if request.method not in _SAFE_METHODS: + logger.debug("Invalidating: request method not known safe: %r", request.method) + return True + + logger.debug("Not invalidating request.") + return False diff --git a/pyproject.toml b/pyproject.toml index 1d48a866..e00bc7bb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ [tool.isort] line_length = 88 known_first_party = ['cachecontrol'] -known_third_party = ['mock', 'lockfile', 'requests', 'pytest', 'msgpack', 'cherrypy'] +known_third_party = ['mock', 'lockfile', 'requests', 'pytest', 'msgpack', 'cherrypy', 'pytz', 'abnf'] # Set multi-line output to "Vertical Hanging indent" to avoid fighting with black. multi_line_output = 3 include_trailing_comma = true diff --git a/setup.py b/setup.py index b571cc3f..ab0aeb77 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ include_package_data=True, description="httplib2 caching for requests", long_description=long_description, - install_requires=["requests", "msgpack>=0.5.2", "six"], + install_requires=["requests", "msgpack>=0.5.2", "six", "abnf"], extras_require={"filecache": ["lockfile>=0.9"], "redis": ["redis>=2.10.5"]}, entry_points={"console_scripts": ["doesitcache = cachecontrol._cmd:main"]}, python_requires=">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*", diff --git a/tests/test_cache_control.py b/tests/test_cache_control.py index 0b7c0f8f..f7f25d2b 100644 --- a/tests/test_cache_control.py +++ b/tests/test_cache_control.py @@ -9,10 +9,17 @@ import pytest from mock import ANY, Mock +from requests.models import PreparedRequest from cachecontrol import CacheController from cachecontrol.cache import DictCache +try: + from requests.packages.urllib3.response import HTTPResponse +except ImportError: + from urllib3.response import HTTPResponse + + TIME_FMT = "%a, %d %b %Y %H:%M:%S GMT" @@ -28,14 +35,12 @@ 
class TestCacheControllerResponse(object): url = "http://url.com/" def req(self, headers=None): - headers = headers or {} - return Mock(full_url=self.url, url=self.url, headers=headers) # < 1.x support + new_req = PreparedRequest() + new_req.prepare(method="GET", url=self.url, headers=headers) + return new_req - def resp(self, headers=None): - headers = headers or {} - return Mock( - status=200, headers=headers, request=self.req(), read=lambda **k: b"testing" - ) + def resp(self, headers=None, status=200): + return HTTPResponse(body="testing", headers=headers, status=status) @pytest.fixture() def cc(self): @@ -50,7 +55,7 @@ def test_no_cache_non_20x_response(self, cc): no_cache_codes = [201, 400, 500] for code in no_cache_codes: resp.status = code - cc.cache_response(Mock(), resp) + cc.cache_response(self.req(), resp) assert not cc.cache.set.called # this should work b/c the resp is 20x @@ -59,13 +64,6 @@ def test_no_cache_non_20x_response(self, cc): assert cc.serializer.dumps.called assert cc.cache.set.called - def test_no_cache_with_no_date(self, cc): - # No date header which makes our max-age pointless - resp = self.resp({"cache-control": "max-age=3600"}) - cc.cache_response(self.req(), resp) - - assert not cc.cache.set.called - def test_no_cache_with_wrong_sized_body(self, cc): # When the body is the wrong size, then we don't want to cache it # because it is obviously broken. 
@@ -97,17 +95,19 @@ def test_cache_response_cache_max_age_with_invalid_value_not_cached(self, cc): assert not cc.cache.set.called def test_cache_response_no_store(self): - resp = Mock() - cache = DictCache({self.url: resp}) + cached_resp = self.resp({"ETag": "cached-resp"}) + cache = DictCache({self.url: cached_resp}) cc = CacheController(cache) cache_url = cc.cache_url(self.url) - resp = self.resp({"cache-control": "no-store"}) + resp = self.resp({"cache-control": "no-store", "ETag": "no-store-resp"}) assert cc.cache.get(cache_url) cc.cache_response(self.req(), resp) - assert not cc.cache.get(cache_url) + + lookup_response = cc.cache.get(cache_url) + assert lookup_response.headers["ETag"] == "cached-resp" def test_cache_response_no_store_with_etag(self, cc): resp = self.resp({"cache-control": "no-store", "ETag": "jfd9094r808"}) @@ -124,11 +124,20 @@ def test_no_cache_with_vary_star(self, cc): assert not cc.cache.set.called def test_update_cached_response_with_valid_headers(self): - cached_resp = Mock(headers={"ETag": "jfd9094r808", "Content-Length": 100}) + cached_resp = self.resp( + headers={"ETag": "jfd9094r808", "Content-Length": "100"} + ) # Set our content length to 200. That would be a mistake in # the server, but we'll handle it gracefully... for now. 
- resp = Mock(headers={"ETag": "28371947465", "Content-Length": 200}) + resp = self.resp( + headers={ + "ETag": "28371947465", + "Content-Length": "200", + "Cache-Control": "max-age=86400", + }, + status=304, + ) cache = DictCache({self.url: cached_resp}) cc = CacheController(cache) @@ -141,7 +150,7 @@ def test_update_cached_response_with_valid_headers(self): result = cc.update_cached_response(Mock(), resp) assert result.headers["ETag"] == resp.headers["ETag"] - assert result.headers["Content-Length"] == 100 + assert result.headers["Content-Length"] == "100" class TestCacheControlRequest(object): @@ -150,13 +159,17 @@ class TestCacheControlRequest(object): def setup(self): self.c = CacheController(DictCache(), serializer=NullSerializer()) + def resp(self, headers, status=200): + return HTTPResponse(headers=headers, status=status) + def req(self, headers): - mock_request = Mock(url=self.url, headers=headers) - return self.c.cached_request(mock_request) + new_req = PreparedRequest() + new_req.prepare(method="GET", url=self.url, headers=headers) + return self.c.cached_request(new_req) def test_cache_request_no_headers(self): - cached_resp = Mock( - headers={"ETag": "jfd9094r808", "Content-Length": 100}, status=200 + cached_resp = self.resp( + headers={"ETag": "jfd9094r808", "Content-Length": "100"}, status=200 ) self.c.cache = DictCache({self.url: cached_resp}) resp = self.req({}) @@ -184,7 +197,9 @@ def test_cache_request_not_in_cache(self): def test_cache_request_fresh_max_age(self): now = time.strftime(TIME_FMT, time.gmtime()) - resp = Mock(headers={"cache-control": "max-age=3600", "date": now}, status=200) + resp = self.resp( + headers={"cache-control": "max-age=3600", "date": now}, status=200 + ) cache = DictCache({self.url: resp}) self.c.cache = cache @@ -194,7 +209,9 @@ def test_cache_request_fresh_max_age(self): def test_cache_request_unfresh_max_age(self): earlier = time.time() - 3700 # epoch - 1h01m40s now = time.strftime(TIME_FMT, time.gmtime(earlier)) - 
resp = Mock(headers={"cache-control": "max-age=3600", "date": now}, status=200) + resp = self.resp( + headers={"cache-control": "max-age=3600", "date": now}, status=200 + ) self.c.cache = DictCache({self.url: resp}) r = self.req({}) assert not r @@ -203,7 +220,7 @@ def test_cache_request_fresh_expires(self): later = time.time() + 86400 # GMT + 1 day expires = time.strftime(TIME_FMT, time.gmtime(later)) now = time.strftime(TIME_FMT, time.gmtime()) - resp = Mock(headers={"expires": expires, "date": now}, status=200) + resp = self.resp(headers={"expires": expires, "date": now}, status=200) cache = DictCache({self.url: resp}) self.c.cache = cache r = self.req({}) @@ -213,7 +230,7 @@ def test_cache_request_unfresh_expires(self): sooner = time.time() - 86400 # GMT - 1 day expires = time.strftime(TIME_FMT, time.gmtime(sooner)) now = time.strftime(TIME_FMT, time.gmtime()) - resp = Mock(headers={"expires": expires, "date": now}, status=200) + resp = self.resp(headers={"expires": expires, "date": now}, status=200) cache = DictCache({self.url: resp}) self.c.cache = cache r = self.req({}) @@ -222,7 +239,9 @@ def test_cache_request_unfresh_expires(self): def test_cached_request_with_bad_max_age_headers_not_returned(self): now = time.strftime(TIME_FMT, time.gmtime()) # Not a valid header; this would be from a misconfigured server - resp = Mock(headers={"cache-control": "max-age=xxx", "date": now}, status=200) + resp = self.resp( + headers={"cache-control": "max-age=xxx", "date": now}, status=200 + ) self.c.cache = DictCache({self.url: resp}) diff --git a/tests/test_headers_parser.py b/tests/test_headers_parser.py new file mode 100644 index 00000000..d857ba10 --- /dev/null +++ b/tests/test_headers_parser.py @@ -0,0 +1,35 @@ +# SPDX-FileCopyrightText: © 2019 The cachecontrol Authors +# SPDX-License-Identifier: Apache-2.0 +import unittest + +from cachecontrol.headers_parser import tokenize_cache_control, tokenize_pragma + +class TestTokenizer(unittest.TestCase): + + def 
test_single_pragma(self): + self.assertEqual({"token": None}, + tokenize_pragma("token")) + + def test_single_cachecontrol(self): + self.assertEqual({"token": None}, + tokenize_cache_control("token")) + + def test_multiple_tokens(self): + self.assertEqual({"token1": None, "token2": None}, + tokenize_cache_control("token1,token2")) + + def test_single_token_with_value(self): + self.assertEqual({"token1": "value1"}, + tokenize_cache_control("token1=value1")) + + def test_single_token_with_value_quoted(self): + self.assertEqual({"token1": "value1"}, + tokenize_cache_control('token1="value1"')) + + def test_single_token_with_value_quoted_with_comma(self): + self.assertEqual({"token1": "value1,value2"}, + tokenize_cache_control('token1="value1,value2"')) + + def test_two_tokens_with_value_quoted_with_comma(self): + self.assertEqual({"token1": "value1,value2", "token2": None}, + tokenize_cache_control('token1="value1,value2",token2'))