forked from scrapy-plugins/scrapy-playwright
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcontexts.py
107 lines (101 loc) · 3.51 KB
/
contexts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
from pathlib import Path
from scrapy import Spider, Request
class MultipleContextsSpider(Spider):
    """Demonstrate issuing requests through several Playwright browser contexts.

    ``start_requests`` yields requests that target the contexts declared in
    ``PLAYWRIGHT_CONTEXTS``, a context described inline through
    ``playwright_context_kwargs``, a request that names no context, and
    twenty requests that each name their own context. ``parse`` reports the
    cookies held by the context that served a response and then closes that
    context.
    """

    name = "contexts"
    custom_settings = {
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "DOWNLOAD_HANDLERS": {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        # NOTE(review): presumably caps how many contexts may be open at once;
        # confirm against the scrapy-playwright settings documentation.
        "PLAYWRIGHT_MAX_CONTEXTS": 6,
        "PLAYWRIGHT_CONTEXTS": {
            # "first" and "second" differ only in the value of the cookie
            # they are seeded with.
            "first": {
                "storage_state": {
                    "cookies": [
                        {
                            "url": "https://example.org",
                            "name": "context",
                            "value": "first",
                        },
                    ],
                },
            },
            "second": {
                "storage_state": {
                    "cookies": [
                        {
                            "url": "https://example.org",
                            "name": "context",
                            "value": "second",
                        },
                    ],
                },
            },
            # Context backed by a directory under the user's home, with
            # JavaScript disabled.
            "persistent": {
                "user_data_dir": str(Path.home() / "playwright-persistent-context"),
                "java_script_enabled": False,
            },
        },
    }

    def start_requests(self):
        """Yield the example requests, each bound to a browser context.

        Every request sets ``playwright_include_page`` so that ``parse`` can
        reach the page (and through it the context) that produced the
        response, and ``dont_filter=True`` because most of them share the
        same URL.
        """
        # using existing contexts
        for ctx_name in self.custom_settings["PLAYWRIGHT_CONTEXTS"]:
            yield Request(
                url="https://example.org",
                meta={
                    "playwright": True,
                    "playwright_context": ctx_name,
                    "playwright_include_page": True,
                },
                dont_filter=True,
            )
        # create a new context
        yield Request(
            url="https://example.org",
            meta={
                "playwright": True,
                "playwright_context": "third",
                "playwright_context_kwargs": {
                    "storage_state": {
                        "cookies": [
                            {
                                "url": "https://example.org",
                                "name": "context",
                                "value": "third",
                            },
                        ],
                    },
                },
                "playwright_include_page": True,
            },
            dont_filter=True,
        )
        # default context
        yield Request(
            url="https://example.org",
            meta={"playwright": True, "playwright_include_page": True},
            dont_filter=True,
        )
        # each request on a different context
        for i in range(20):
            yield Request(
                url=f"https://example.org?foo={i}",
                meta={
                    "playwright": True,
                    "playwright_context": f"context-{i}",
                    "playwright_include_page": True,
                },
                dont_filter=True,
            )

    async def parse(self, response):
        """Return the URL, context name and cookies for ``response``.

        The browser context that served the response is closed before
        returning, including when reading its storage state fails, so a
        failure here does not leave the context open.

        Returns:
            A dict with the keys ``"url"``, ``"context"`` and ``"cookies"``.

        Raises:
            KeyError: if ``response.meta`` lacks ``"playwright_page"`` or
                ``"playwright_context"``.
        """
        page = response.meta["playwright_page"]
        context_name = response.meta["playwright_context"]
        try:
            storage_state = await page.context.storage_state()
        finally:
            # Always release the context; NOTE(review): presumably this is
            # what frees room under PLAYWRIGHT_MAX_CONTEXTS — confirm.
            await page.context.close()
        return {
            "url": response.url,
            "context": context_name,
            "cookies": storage_state["cookies"],
        }