
Commit 71567e6

Switch auto field stats to an item pipeline
1 parent 106b099 commit 71567e6

13 files changed: +943 −634 lines

docs/reference/settings.rst

Lines changed: 33 additions & 8 deletions
@@ -13,13 +13,11 @@ ZYTE_API_AUTO_FIELD_STATS

 Default: ``False``

-Enables stats that indicate which requested fields :ref:`obtained through
-scrapy-poet integration <scrapy-poet>` come directly from
-:ref:`zapi-extract`.
+Enables stats that indicate which fields from yielded items come from
+:ref:`zapi-extract` when using :ref:`scrapy-poet integration <scrapy-poet>`.

-If for any request no page object class is used to override
-:ref:`zapi-extract` fields for a given item type, the following stat is
-set:
+If for any combination of item type and URL there is no registered page object
+class, the following stat is set:

 .. code-block:: python

@@ -28,8 +26,9 @@ set:
 .. note:: A literal ``(all fields)`` string is used as value, not a list with
    all fields.

-If for any request a custom page object class is used to override some
-:ref:`zapi-extract` fields, the following stat is set:
+When a page object class is registered for a given combination of item type and
+URL, and that page object class overrides some fields, the following stat is
+set:

 .. code-block:: python

@@ -40,6 +39,32 @@ If for any request a custom page object class is used to override some
 .. note:: :func:`zyte_common_items.fields.is_auto_field` is used to determine
    whether a field has been overridden or not.

+Item URLs are read from the ``url`` field by default. Use
+:setting:`ZYTE_API_AUTO_FIELD_URL_FIELDS` to configure a different field for
+any given item type.
+
+.. setting:: ZYTE_API_AUTO_FIELD_URL_FIELDS
+
+ZYTE_API_AUTO_FIELD_URL_FIELDS
+==============================
+
+Default: ``{}``
+
+Dictionary where keys are item types or their import paths, and values are
+strings with the names of the fields in those item types that indicate the
+source URL of the item.
+
+For example:
+
+.. code-block:: python
+    :caption: settings.py
+
+    ZYTE_API_AUTO_FIELD_URL_FIELDS = {
+        "my_project.items.CustomItem": "custom_url_field",
+    }
+
+If a URL field is not specified for an item, ``url`` is used by default.
+
 .. setting:: ZYTE_API_AUTOMAP_PARAMS

 ZYTE_API_AUTOMAP_PARAMS

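With ZYTE_API_AUTO_FIELD_STATS enabled, the resulting stat entries could look like the following sketch. The import paths and field names are illustrative placeholders, not part of this commit; only the key prefix and the "(all fields)" value come from the documented behaviour:

    # No page object class registered for the item type and URL:
    # "scrapy-zyte-api/auto_fields/<item class import path>": "(all fields)"

    # A registered page object class overrides some fields; the value lists
    # the remaining auto (non-overridden) fields, space-separated:
    # "scrapy-zyte-api/auto_fields/<page object class import path>": "brand name price"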
docs/setup.rst

Lines changed: 13 additions & 0 deletions
@@ -120,6 +120,8 @@ spider to use Zyte API for all requests, set the following setting as well:

     ZYTE_API_TRANSPARENT_MODE = True

+.. _scrapy-poet-manual-setup:
+
 For :ref:`scrapy-poet integration <scrapy-poet>`, add the following provider to
 the ``SCRAPY_POET_PROVIDERS`` setting:

@@ -150,6 +152,17 @@ middleware to the :setting:`DOWNLOADER_MIDDLEWARES
         "scrapy_zyte_api.ScrapyZyteAPISessionDownloaderMiddleware": 667,
     }

+For :setting:`ZYTE_API_AUTO_FIELD_STATS` support, first :ref:`enable
+scrapy-poet integration <scrapy-poet-manual-setup>`, and then add the following
+item pipeline to the :setting:`ITEM_PIPELINES <scrapy:ITEM_PIPELINES>` setting:
+
+.. code-block:: python
+    :caption: settings.py
+
+    ITEM_PIPELINES = {
+        "scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline": 0,
+    }
+

 .. _reactor-change:

scrapy_zyte_api/_poet_item_pipelines.py

Lines changed: 90 additions & 0 deletions

@@ -0,0 +1,90 @@
+from logging import getLogger
+from typing import Any
+
+from itemadapter import ItemAdapter
+from scrapy import Spider
+from scrapy.crawler import Crawler
+from scrapy.exceptions import NotConfigured
+from scrapy.utils.misc import load_object
+from scrapy_poet.downloadermiddlewares import InjectionMiddleware
+from web_poet.fields import get_fields_dict
+from web_poet.utils import get_fq_class_name
+from zyte_common_items.fields import is_auto_field
+
+logger = getLogger(__name__)
+
+
+class ScrapyZyteAPIPoetItemPipeline:
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(crawler)
+
+    def __init__(self, crawler: Crawler):
+        if not crawler.settings.getbool("ZYTE_API_AUTO_FIELD_STATS", False):
+            raise NotConfigured
+
+        raw_url_fields = crawler.settings.getdict("ZYTE_API_AUTO_FIELD_URL_FIELDS", {})
+        self._url_fields = {load_object(k): v for k, v in raw_url_fields.items()}
+        self._seen = set()
+        self._crawler = crawler
+        self._stats = crawler.stats
+        self._cls_without_url = set()
+
+    def open_spider(self, spider):
+        for component in self._crawler.engine.downloader.middleware.middlewares:
+            if isinstance(component, InjectionMiddleware):
+                self._registry = component.injector.registry
+                return
+        raise RuntimeError(
+            "Could not find "
+            "scrapy_poet.downloadermiddlewares.InjectionMiddleware among "
+            "downloader middlewares. scrapy-poet may be misconfigured."
+        )
+
+    def process_item(self, item: Any, spider: Spider):
+        cls = item.__class__
+
+        url_field = self._url_fields.get(cls, "url")
+        adapter = ItemAdapter(item)
+        url = adapter.get(url_field, None)
+        if not url:
+            if cls not in self._cls_without_url:
+                self._cls_without_url.add(cls)
+                logger.warning(
+                    f"An item of type {cls} was missing a non-empty URL in "
+                    f"its {url_field!r} field. An item URL is necessary to "
+                    f"determine the page object that was used to generate "
+                    f"that item, and hence print the auto field stats that "
+                    f"you requested by enabling the ZYTE_API_AUTO_FIELD_STATS "
+                    f"setting. If {url_field!r} is the wrong URL field for "
+                    f"that item type, use the ZYTE_API_AUTO_FIELD_URL_FIELDS "
+                    f"setting to set a different field."
+                )
+            return
+
+        page_cls = self._registry.page_cls_for_item(url, cls)
+        cls = page_cls or cls
+
+        if cls in self._seen:
+            return
+        self._seen.add(cls)
+
+        if not page_cls:
+            field_list = "(all fields)"
+        else:
+            cls = page_cls
+            auto_fields = set()
+            missing_fields = False
+            for field_name in get_fields_dict(cls):
+                if is_auto_field(cls, field_name):  # type: ignore[arg-type]
+                    auto_fields.add(field_name)
+                else:
+                    missing_fields = True
+            if missing_fields:
+                field_list = " ".join(sorted(auto_fields))
+            else:
+                field_list = "(all fields)"
+
+        cls_fqn = get_fq_class_name(cls)
+        self._stats.set_value(f"scrapy-zyte-api/auto_fields/{cls_fqn}", field_list)
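
The URL-field lookup that process_item() performs can be exercised on its own. A minimal, self-contained sketch (not part of the commit; CustomItem is a local stand-in instead of a class resolved with load_object() from an import path):

    from dataclasses import dataclass

    from itemadapter import ItemAdapter


    @dataclass
    class CustomItem:
        custom_url_field: str


    # Equivalent of ZYTE_API_AUTO_FIELD_URL_FIELDS after load_object() resolution.
    url_fields = {CustomItem: "custom_url_field"}

    item = CustomItem(custom_url_field="https://example.com/product/1")
    url_field = url_fields.get(item.__class__, "url")  # -> "custom_url_field"
    url = ItemAdapter(item).get(url_field, None)  # -> "https://example.com/product/1"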

scrapy_zyte_api/addon.py

Lines changed: 2 additions & 0 deletions
@@ -111,9 +111,11 @@ def update_settings(self, settings: BaseSettings) -> None:
         except ImportError:
             pass
         else:
+            from scrapy_zyte_api.poet import ScrapyZyteAPIPoetItemPipeline
             from scrapy_zyte_api.providers import ZyteApiProvider

             _setdefault(settings, "DOWNLOADER_MIDDLEWARES", InjectionMiddleware, 543)
+            _setdefault(settings, "ITEM_PIPELINES", ScrapyZyteAPIPoetItemPipeline, 0)
             _setdefault(settings, "SCRAPY_POET_PROVIDERS", ZyteApiProvider, 1100)

         if settings.getbool("ZYTE_API_SESSION_ENABLED", False):
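
With the add-on change above, projects that enable the scrapy-zyte-api add-on get the item pipeline registered automatically. A minimal sketch, assuming the add-on import path and the 500 priority used in the tests of this commit:

    # settings.py (sketch)
    ADDONS = {
        "scrapy_zyte_api.Addon": 500,
    }
    ZYTE_API_AUTO_FIELD_STATS = True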

scrapy_zyte_api/poet.py

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+from ._poet_item_pipelines import ScrapyZyteAPIPoetItemPipeline

scrapy_zyte_api/providers.py

Lines changed: 1 addition & 34 deletions
@@ -1,4 +1,4 @@
-from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Type, cast
+from typing import Any, Callable, Dict, List, Optional, Sequence, Set

 from andi.typeutils import is_typing_annotated, strip_annotated
 from scrapy import Request
@@ -13,8 +13,6 @@
     HttpResponseHeaders,
 )
 from web_poet.annotated import AnnotatedInstance
-from web_poet.fields import get_fields_dict
-from web_poet.utils import get_fq_class_name
 from zyte_common_items import (
     Article,
     ArticleList,
@@ -32,7 +30,6 @@
     ProductList,
     ProductNavigation,
 )
-from zyte_common_items.fields import is_auto_field

 from scrapy_zyte_api import Actions, ExtractFrom, Geolocation, Screenshot
 from scrapy_zyte_api._annotations import _ActionResult
@@ -84,38 +81,9 @@ class ZyteApiProvider(PageObjectInputProvider):
         Screenshot,
     }

-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self._should_track_auto_fields = None
-        self._tracked_auto_fields = set()
-
     def is_provided(self, type_: Callable) -> bool:
         return super().is_provided(strip_annotated(type_))

-    def _track_auto_fields(self, crawler: Crawler, request: Request, cls: Type):
-        if cls not in _ITEM_KEYWORDS:
-            return
-        if self._should_track_auto_fields is None:
-            self._should_track_auto_fields = crawler.settings.getbool(
-                "ZYTE_API_AUTO_FIELD_STATS", False
-            )
-        if self._should_track_auto_fields is False:
-            return
-        cls = self.injector.registry.page_cls_for_item(request.url, cls) or cls
-        if cls in self._tracked_auto_fields:
-            return
-        self._tracked_auto_fields.add(cls)
-        if cls in _ITEM_KEYWORDS:
-            field_list = "(all fields)"
-        else:
-            auto_fields = set()
-            for field_name in get_fields_dict(cls):
-                if is_auto_field(cls, field_name):  # type: ignore[arg-type]
-                    auto_fields.add(field_name)
-            field_list = " ".join(sorted(auto_fields))
-        cls_fqn = get_fq_class_name(cls)
-        crawler.stats.set_value(f"scrapy-zyte-api/auto_fields/{cls_fqn}", field_list)
-
     async def __call__(  # noqa: C901
         self, to_provide: Set[Callable], request: Request, crawler: Crawler
     ) -> Sequence[Any]:
@@ -125,7 +93,6 @@ async def __call__(  # noqa: C901
         http_response = None
         screenshot_requested = Screenshot in to_provide
         for cls in list(to_provide):
-            self._track_auto_fields(crawler, request, cast(type, cls))
             item = self.injector.weak_cache.get(request, {}).get(cls)
             if item:
                 results.append(item)

setup.cfg

Lines changed: 2 additions & 0 deletions
@@ -4,5 +4,7 @@ max-line-length = 88
 max-complexity = 18
 select = B,C,E,F,W,T4
 per-file-ignores =
+    tests/test_auto_field_stats.py: E402
     tests/test_providers.py: E402
     scrapy_zyte_api/__init__.py: F401
+    scrapy_zyte_api/poet.py: F401

setup.py

Lines changed: 2 additions & 1 deletion
@@ -33,7 +33,8 @@ def get_version():
             "andi>=0.6.0",
             "scrapy-poet>=0.22.3",
             "web-poet>=0.17.0",
-            "zyte-common-items>=0.20.0",
+            # "zyte-common-items>=0.20.0",
+            "zyte-common-items @ git+https://github.com/Gallaecio/zyte-common-items.git@auto-fields",
         ]
     },
     classifiers=[

tests/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -42,6 +42,9 @@
     SETTINGS["SCRAPY_POET_PROVIDERS"] = {
         "scrapy_zyte_api.providers.ZyteApiProvider": 1100
     }
+    SETTINGS["ITEM_PIPELINES"] = {
+        "scrapy_zyte_api.poet.ScrapyZyteAPIPoetItemPipeline": 0
+    }
 SETTINGS_ADDON: SETTINGS_T = {
     "ADDONS": {
         Addon: 500,
@@ -108,6 +111,7 @@ def serialize_settings(settings):
             del result[setting]
     for setting in (
         "DOWNLOADER_MIDDLEWARES",
+        "ITEM_PIPELINES",
         "SCRAPY_POET_PROVIDERS",
         "SPIDER_MIDDLEWARES",
     ):

tests/test_addon.py

Lines changed: 7 additions & 0 deletions
@@ -24,8 +24,10 @@
     POET = False
     InjectionMiddleware = None
     ZyteApiProvider: Optional[Type] = None
+    ScrapyZyteAPIPoetItemPipeline: Optional[Type] = None
 else:
     POET = True
+    from scrapy_zyte_api.poet import ScrapyZyteAPIPoetItemPipeline
     from scrapy_zyte_api.providers import ZyteApiProvider

 _crawler = get_crawler()
@@ -120,6 +122,7 @@ def _test_setting_changes(initial_settings, expected_settings):
     # Test separately settings that copy_to_dict messes up.
     for setting in (
         "DOWNLOADER_MIDDLEWARES",
+        "ITEM_PIPELINES",
         "SCRAPY_POET_PROVIDERS",
         "SPIDER_MIDDLEWARES",
     ):
@@ -145,6 +148,7 @@ def _test_setting_changes(initial_settings, expected_settings):
             "http": "scrapy_zyte_api.handler.ScrapyZyteAPIHTTPDownloadHandler",
             "https": "scrapy_zyte_api.handler.ScrapyZyteAPIHTTPSDownloadHandler",
         },
+        "ITEM_PIPELINES": {},
         "REQUEST_FINGERPRINTER_CLASS": "scrapy_zyte_api.ScrapyZyteAPIRequestFingerprinter",
         "SPIDER_MIDDLEWARES": {
             ScrapyZyteAPISpiderMiddleware: 100,
@@ -230,6 +234,9 @@ def test_no_poet_setting_changes(initial_settings, expected_settings):
             ScrapyZyteAPISessionDownloaderMiddleware: 667,
             InjectionMiddleware: 543,
         },
+        "ITEM_PIPELINES": {
+            ScrapyZyteAPIPoetItemPipeline: 0,
+        },
         "SCRAPY_POET_PROVIDERS": {
             ZyteApiProvider: 1100,
         },

tests/test_api_requests.py

Lines changed: 2 additions & 1 deletion
@@ -650,6 +650,7 @@ async def test_default_params_immutability(setting_key, meta_key, setting, meta)
 async def _test_automap(
     settings, request_kwargs, meta, expected, warnings, caplog, cookie_jar=None
 ):
+    caplog.clear()
     request = Request(url="https://example.com", **request_kwargs)
     request.meta["zyte_api_automap"] = meta
     settings = {**settings, "ZYTE_API_TRANSPARENT_MODE": True}
@@ -694,7 +695,7 @@ async def _test_automap(
         for warning in warnings:
             assert warning in caplog.text
     else:
-        assert not caplog.records
+        assert not caplog.records, caplog.records[0].args


 @pytest.mark.parametrize(
