async_parse.py
"""Collect product photo and video-thumbnail links from an Ozon product page."""
import sys
import re
import json
import asyncio
import random

from playwright.async_api import async_playwright as pl

assert len(sys.argv) > 1, "usage: async_parse.py <article>"
article = int(sys.argv[1])
# JavaScript injected into the page to scroll to the bottom in steps,
# so that lazily loaded content appears.
scroll_down = '''
const scrollStep = 200;      // scroll step size in pixels
const scrollInterval = 100;  // delay between steps in milliseconds
const scrollHeight = document.documentElement.scrollHeight;
let currentPosition = 0;
const interval = setInterval(() => {
    window.scrollBy(0, scrollStep);
    currentPosition += scrollStep;
    if (currentPosition >= scrollHeight) {
        clearInterval(interval);
    }
}, scrollInterval);
'''
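# Note: Playwright can also scroll natively, e.g. `await page.mouse.wheel(0, 200)`
# in a loop. The injected JS above is one way to scroll the whole document
# without blocking the Python side, since evaluate() returns immediately after
# scheduling the setInterval.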
# Deduplicated links, keyed by a unique URL fragment.
links = {}


async def parse_html(page):
    """Find all photo and video-thumbnail links in the raw HTML via regex."""
    body_handle = await page.evaluate_handle("document.body")
    result_handle = await page.evaluate_handle("body => body.innerHTML", body_handle)
    html = str(await result_handle.json_value())
    await result_handle.dispose()
    pic_pattern = r"https://ir\.ozone\.ru/s3/rp-photo-[0-9]/wc[0-9]00/.*?\.jpeg"
    vid_pattern = r"https://ir\.ozone\.ru/s3/video-[0-9]/.*?\.jpg"
    pics = re.findall(pic_pattern, html)
    thumbnails = re.findall(vid_pattern, html)
    for el in pics:
        # Strip the fixed URL prefix and the ".jpeg" suffix to get a dedupe key.
        hashed_pic = el[40:-5]
        links.setdefault(hashed_pic, el)
    for el in thumbnails:
        # Strip the URL prefix and suffix to key on the hash-like middle part.
        hashed_thumb = el[32:-22]
        links.setdefault(hashed_thumb, el)
async def main(article: int):
    pl_ctx = await pl().start()
    browser = await pl_ctx.firefox.launch(headless=False)
    context = await browser.new_context()
    page = await context.new_page()
    url = "https://www.ozon.ru/product/" + str(article)
    print(f"Opening product page for article {article}")
    await page.goto(url, wait_until="load")
    await asyncio.sleep(1.5)
    print("Page loaded; waiting briefly")
    await page.evaluate(scroll_down)
    # Open the photo gallery grid.
    await page.locator('div[style*="grid-template-columns: repeat(9, minmax(56px, 90px));"]').click()
    print("Clicked the gallery grid")
    await asyncio.sleep(1)
    for _ in range(20):
        await parse_html(page=page)
        # press() takes its delay in milliseconds.
        await page.keyboard.press("PageDown", delay=800)
        print(f"Collected {len(links)} links so far")
        await asyncio.sleep(random.uniform(1.5, 3))
    await page.close()
    await browser.close()
    await pl_ctx.stop()
    print("Closed browser")
    with open("result.json", "w") as fp:
        json.dump(links, fp)


if __name__ == "__main__":
    asyncio.run(main(article))
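
Usage (assuming Playwright and its Firefox build are installed, e.g. via
"pip install playwright" followed by "playwright install firefox"; the article
number below is a placeholder, not a real product ID):

    python async_parse.py 123456789

Results are written to result.json as a JSON object mapping each URL-derived
key to the full photo or video-thumbnail URL.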