Provide an MVP implementation of a session middleware (#193)

Gallaecio merged 98 commits into scrapy-plugins:main from
Conversation
Codecov ReportAll modified and coverable lines are covered by tests ✅
Additional details and impacted files@@ Coverage Diff @@
## main #193 +/- ##
==========================================
- Coverage 98.45% 97.56% -0.90%
==========================================
Files 13 14 +1
Lines 1102 1476 +374
Branches 0 309 +309
==========================================
+ Hits 1085 1440 +355
+ Misses 17 15 -2
- Partials 0 21 +21
|
|
I have created a project based on https://github.com/zytedata/zyte-spider-templates-project, and added the following spider to it:

from logging import getLogger
from scrapy import Request
from scrapy.exceptions import NotSupported
from scrapy.http.response import Response
from tenacity import stop_after_attempt
from tenacity.stop import stop_base
from zyte_api import RequestError, RetryFactory
logger = getLogger(__name__)
class custom_throttling_stop(stop_base):
    """tenacity stop condition that halts retries for 429 responses whose
    error title is "Session has expired".

    NOTE(review): returning True tells tenacity to stop retrying —
    presumably so that such responses fall through to Scrapy, which can
    retry them with a different session; confirm with the middleware docs.
    """

    def __call__(self, retry_state: "RetryCallState") -> bool:
        outcome = retry_state.outcome
        assert outcome, "Unexpected empty outcome"
        error = outcome.exception()
        assert error, "Unexpected empty exception"
        # Guard clauses instead of one compound boolean expression.
        if not isinstance(error, RequestError):
            return False
        if error.status != 429:
            return False
        return error.parsed.data["title"] == "Session has expired"
class CustomRetryFactory(RetryFactory):
    """zyte-api retry factory tuned for session-based crawling."""

    # Do not retry 520 (temporary download error) beyond the first attempt;
    # let Scrapy deal with them (i.e. retry them with a different session).
    temporary_download_error_stop = stop_after_attempt(1)

    # Handle temporary bug: stop retrying 429s reported as expired sessions
    # (see custom_throttling_stop).
    throttling_stop = custom_throttling_stop()


# Built policy, meant to be assigned to the ZYTE_API_RETRY_POLICY setting.
SESSION_RETRY_POLICY = CustomRetryFactory().build()
class _SessionChecker:
    """Considers a session valid while the page displays the expected ZIP code."""

    @classmethod
    def from_crawler(cls, crawler):
        # Standard Scrapy factory hook.
        return cls(crawler)

    def __init__(self, crawler):
        # The expected postal code is taken from the setLocation action of
        # the configured session initialization parameters.
        session_params = crawler.settings["ZYTE_API_SESSION_PARAMS"]
        first_action = session_params["actions"][0]
        self.zip_code = first_action["address"]["postalCode"]

    def check_session(self, request: Request, response: Response) -> bool:
        """Return True if *response* shows the ZIP code this checker expects."""
        try:
            found = response.css(".delivery-text + a > span > span::text").get()
        except NotSupported:  # Empty response.
            logger.debug(f"Empty response {response}.")
            return False
        if not found:
            logger.debug(f"No ZIP code found in {response}.")
            return False
        if found != self.zip_code:
            logger.debug(
                f"Found unexpected ZIP code {found!r} in {response} (expected "
                f"{self.zip_code!r})."
            )
            return False
        logger.debug(f"Found expected ZIP code {found!r} in {response}.")
        return True
from zyte_spider_templates import EcommerceSpider
class SessionEcommerceSpider(EcommerceSpider):
    """E-commerce spider template extended with session-management settings."""

    name = "session_ecommerce"

    @classmethod
    def update_settings(cls, settings):
        super().update_settings(settings)
        settings["ZYTE_API_AUTOMAP_PARAMS"] = {"browserHtml": True}
        # DEBUG: log full, untruncated Zyte API request parameters.
        settings["ZYTE_API_LOG_REQUESTS"] = True
        settings["ZYTE_API_LOG_REQUESTS_TRUNCATE"] = 0
        # Settings needed for the session stuff.
        settings["ZYTE_API_SESSION_CHECKER"] = _SessionChecker
        settings["ZYTE_API_SESSION_PARAMS"] = {
            "browserHtml": True,
            "actions": [{"action": "setLocation", "address": {"postalCode": "94124"}}],
        }
        settings["COOKIES_ENABLED"] = False  # Sessions handle cookies.
        settings["ZYTE_API_RETRY_POLICY"] = SESSION_RETRY_POLICY  # Don’t retry bans.
        settings["ZYTE_API_PROVIDER_PARAMS"] = {"browserHtml": True}  # Cannot validate extraction-only responses

And executed it as follows:

scrapy crawl session_ecommerce -a "url=https://ecommerce.example/product-list" -a crawl_strategy=pagination_only

It seems to work well enough. Although retries can be exceeded, i.e. a higher |
|
@Gallaecio Do you have a real job or stats? How many times did it get invalid response? What reasons responses were invalidated upon? |
For a short crawl I performed just now: i.e. sessions were created 11 times because the default is creating 8, and 3 got 521. Once a valid session was created, all (7) usages succeeded. Of course, it is a rather small sample. In any case, I will now try to build some of the ideas by @VMRuiz, of a better API for location, into this PR (including per-domain web-poet-like configurations). |
|
0638df7 is based on the ideas shared by @VMRuiz elsewhere. It enables a location-specific approach with poet-based overrides:

from logging import getLogger
from typing import Any
from pydantic import BaseModel, Field
from pydantic.types import Json
from scrapy import Request, Spider
from scrapy.crawler import Crawler
from scrapy.exceptions import NotSupported
from scrapy.http.response import Response
from scrapy_spider_metadata import Args
from scrapy_zyte_api import SessionConfig, session_config
from tenacity import stop_after_attempt
from tenacity.stop import stop_base
from zyte_api import RequestError, RetryFactory
from zyte_spider_templates import EcommerceSpider
from zyte_spider_templates.spiders.base import ARG_SETTING_PRIORITY
from zyte_spider_templates.spiders.ecommerce import EcommerceSpiderParams
logger = getLogger(__name__)
class custom_throttling_stop(stop_base):
    """tenacity stop condition matching 429 errors whose reported title is
    "Session has expired" — such responses stop being retried here."""

    def __call__(self, retry_state: "RetryCallState") -> bool:
        assert retry_state.outcome, "Unexpected empty outcome"
        exc = retry_state.outcome.exception()
        assert exc, "Unexpected empty exception"
        # Bail out early unless this is a 429 Zyte API request error.
        if not isinstance(exc, RequestError) or exc.status != 429:
            return False
        return exc.parsed.data["title"] == "Session has expired"
class CustomRetryFactory(RetryFactory):
    """zyte-api retry factory tuned for session-based crawling."""

    # Do not retry 520 (temporary download error) beyond the first attempt;
    # let Scrapy deal with them (i.e. retry them with a different session).
    temporary_download_error_stop = stop_after_attempt(1)

    # Handle temporary bug: stop retrying 429s reported as expired sessions
    # (see custom_throttling_stop).
    throttling_stop = custom_throttling_stop()


# Built policy, meant to be assigned to the ZYTE_API_RETRY_POLICY setting.
SESSION_RETRY_POLICY = CustomRetryFactory().build()
@session_config("ecommerce.example")
class EcommerceExampleLocationSessionConfig(SessionConfig):
    """Session config for ecommerce.example that validates sessions against
    the postal code of the configured session location."""

    def check(self, response: Response, request: Request) -> bool:
        """Return True if *response* displays the expected postal code."""
        try:
            found = response.css(".delivery-text + a > span > span::text").get()
        except NotSupported:  # Empty response.
            logger.debug(f"Empty response {response}.")
            return False
        if not found:
            logger.debug(f"No ZIP code found in {response}.")
            return False
        expected = self.location(request)["postalCode"]
        if found != expected:
            logger.debug(
                f"Found unexpected ZIP code {found!r} in {response} (expected "
                f"{expected!r})."
            )
            return False
        logger.debug(f"Found expected ZIP code {found!r} in {response}.")
        return True
class LocationParam(BaseModel):
    # JSON-encoded location object passed as a spider argument; an empty
    # dict (the default) means no location was requested. Presumably
    # contains keys like "postalCode" — confirm against the session config.
    location: Json[Any] = Field(default_factory=dict)


class LocationSpiderParams(LocationParam, EcommerceSpiderParams):
    """E-commerce spider parameters extended with an optional location."""
    pass
class LocationEcommerceSpider(EcommerceSpider, Args[LocationSpiderParams]):
    """E-commerce spider that enables session-based location handling when a
    ``location`` spider argument is provided."""

    name = "location_ecommerce"

    @classmethod
    def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> Spider:
        spider = super(LocationEcommerceSpider, cls).from_crawler(crawler, *args, **kwargs)
        if spider.args.location:
            # NOTE(review): ARG_SETTING_PRIORITY presumably makes these
            # argument-derived values win over project settings — confirm.
            spider.settings.set(
                "ZYTE_API_SESSION_ENABLED",
                True,
                priority=ARG_SETTING_PRIORITY,
            )
            spider.settings.set(
                "ZYTE_API_SESSION_LOCATION",
                spider.args.location,
                priority=ARG_SETTING_PRIORITY,
            )
        return spider

    @classmethod
    def update_settings(cls, settings):
        super().update_settings(settings)
        settings["ZYTE_API_AUTOMAP_PARAMS"] = {"browserHtml": True}
        # DEBUG: log full, untruncated Zyte API request parameters.
        settings["ZYTE_API_LOG_REQUESTS"] = True
        settings["ZYTE_API_LOG_REQUESTS_TRUNCATE"] = 0
        # Settings needed for the session stuff.
        settings["ZYTE_API_SESSION_ENABLED"] = True
        settings["COOKIES_ENABLED"] = False  # Sessions handle cookies.
        settings["ZYTE_API_RETRY_POLICY"] = SESSION_RETRY_POLICY  # Don’t retry bans.
        settings["ZYTE_API_PROVIDER_PARAMS"] = {"browserHtml": True}  # Cannot validate extraction-only responses

But it also supports a non-location-specific approach, as well as a non-poet-like definition of session initialization parameters and a check function:

from logging import getLogger
from scrapy import Request
from scrapy.exceptions import NotSupported
from scrapy.http.response import Response
from tenacity import stop_after_attempt
from tenacity.stop import stop_base
from zyte_api import RequestError, RetryFactory
logger = getLogger(__name__)
class custom_throttling_stop(stop_base):
    """tenacity stop condition for throttling retries.

    Returns True (i.e. stop retrying) when the outcome is a 429 RequestError
    whose parsed error title is "Session has expired".
    """

    def __call__(self, retry_state: "RetryCallState") -> bool:
        assert retry_state.outcome, "Unexpected empty outcome"
        exc = retry_state.outcome.exception()
        assert exc, "Unexpected empty exception"
        return (
            isinstance(exc, RequestError)
            and exc.status == 429
            and exc.parsed.data["title"] == "Session has expired"
        )
class CustomRetryFactory(RetryFactory):
    """zyte-api retry factory tuned for session-based crawling."""

    # Do not retry 520 (temporary download error) beyond the first attempt;
    # let Scrapy deal with them (i.e. retry them with a different session).
    temporary_download_error_stop = stop_after_attempt(1)

    # Handle temporary bug: stop retrying 429s reported as expired sessions
    # (see custom_throttling_stop).
    throttling_stop = custom_throttling_stop()


# Built policy, meant to be assigned to the ZYTE_API_RETRY_POLICY setting.
SESSION_RETRY_POLICY = CustomRetryFactory().build()
class _SessionChecker:
    """Considers a session valid while the page displays the expected ZIP code."""

    @classmethod
    def from_crawler(cls, crawler):
        # Standard Scrapy factory hook.
        return cls(crawler)

    def __init__(self, crawler):
        # The expected ZIP code is taken from the setLocation action of the
        # configured session initialization parameters.
        params = crawler.settings["ZYTE_API_SESSION_PARAMS"]
        self.zip_code = params["actions"][0]["address"]["postalCode"]

    def check(self, response: Response, request: Request) -> bool:
        """Return True if *response* shows the expected ZIP code."""
        try:
            zip_code = response.css(".delivery-text + a > span > span::text").get()
        except NotSupported:  # Empty response.
            logger.debug(f"Empty response {response}.")
            return False
        if not zip_code:
            logger.debug(f"No ZIP code found in {response}.")
            return False
        if zip_code == self.zip_code:
            logger.debug(f"Found expected ZIP code {zip_code!r} in {response}.")
            return True
        logger.debug(
            f"Found unexpected ZIP code {zip_code!r} in {response} (expected "
            f"{self.zip_code!r})."
        )
        return False
from zyte_spider_templates import EcommerceSpider
class SessionEcommerceSpider(EcommerceSpider):
    """E-commerce spider template extended with session-management settings,
    using a non-poet-like session checker and session params."""

    name = "session_ecommerce"

    @classmethod
    def update_settings(cls, settings):
        super().update_settings(settings)
        settings["ZYTE_API_AUTOMAP_PARAMS"] = {"browserHtml": True}
        # DEBUG: log full, untruncated Zyte API request parameters.
        settings["ZYTE_API_LOG_REQUESTS"] = True
        settings["ZYTE_API_LOG_REQUESTS_TRUNCATE"] = 0
        # Settings needed for the session stuff.
        settings["ZYTE_API_SESSION_ENABLED"] = True
        settings["ZYTE_API_SESSION_CHECKER"] = _SessionChecker
        settings["ZYTE_API_SESSION_PARAMS"] = {
            "browserHtml": True,
            "actions": [{"action": "setLocation", "address": {"postalCode": "94124"}}],
        }
        settings["COOKIES_ENABLED"] = False  # Sessions handle cookies.
        settings["ZYTE_API_RETRY_POLICY"] = SESSION_RETRY_POLICY  # Don’t retry bans.
        settings["ZYTE_API_PROVIDER_PARAMS"] = {"browserHtml": True}  # Cannot validate extraction-only responses

Also, multiple session pools are now supported, and by default each domain has its own pool. The stats now show check passes and failures separately, depending on whether they happened during session initialization or during regular response checks, and are split per session pool.

Now I need to figure out why the number of passes for session initializations is higher than the total number of passes. It feels like more sessions are being initialized than necessary. I also want to implement a basic default check for |
…by the session downloader middleware
… cookie handling docs
To do:
- setLocation action: handle the case where setLocation is not available for a given session pool (e.g. via @session_config).