-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
71 lines (56 loc) · 1.88 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import random
import time
import requests
from bs4 import BeautifulSoup as bS
def get_soup(url: str, timeout_retry_count: int = 0):
    """Download ``url`` and return its parsed DOM.

    The page is requested until a response with HTTP status 200 arrives or
    the attempts are exhausted. Between failed attempts the function sleeps
    for a randomly chosen number of seconds.

    :param url: address of the page to download.
    :param timeout_retry_count: maximum number of attempts. Values below 1
        (including the default of 0) still perform a single attempt, so the
        function never returns without having tried the request at least
        once.
    :return: the parsed document built by ``_extract_dom`` from the first
        response with status 200, or ``None`` if no attempt succeeded.
    """
    # Candidate pauses (in seconds) between attempts; one is picked at random.
    delays = [7, 4, 6, 2, 10, 19]

    # A non-positive count previously produced an empty loop, so no request
    # was ever sent; always try at least once.
    attempts = max(1, timeout_retry_count)

    for attempt in range(attempts):
        response = _get_page_response(url)
        if response.status_code == 200:
            # ``response.text`` is the body already decoded to ``str``.
            return _extract_dom(response.text)
        # Only wait when another attempt follows; sleeping after the last
        # failure would just delay returning ``None``.
        if attempt < attempts - 1:
            time.sleep(random.choice(delays))
    return None
def get_random_ua():
    """Return a random User-Agent string read from ``ua_file.txt``.

    The file is looked up relative to the current working directory and is
    expected to hold one User-Agent per line. Blank lines are ignored and
    surrounding whitespace is stripped.

    :return: one randomly chosen line of the file, or an empty string when
        the file cannot be read or contains no usable lines. Read errors are
        printed rather than raised, so this is a best-effort lookup.
    """
    ua_file = 'ua_file.txt'
    try:
        # The context manager closes the file even if reading fails.
        with open(ua_file) as handle:
            candidates = [line.strip() for line in handle if line.strip()]
    except (OSError, UnicodeError) as e:
        # Missing/unreadable/undecodable file: report it and fall back.
        print(e)
        return ''
    if not candidates:
        # Nothing to choose from; avoid random.choice() on an empty list.
        return ''
    return random.choice(candidates)
def _get_page_response(url: str, timeout: float = 30):
    """Send a browser-like GET request to ``url`` and return the response.

    The request carries a fixed set of browser-style headers plus a
    User-Agent obtained from ``get_random_ua()``.

    :param url: address to request.
    :param timeout: seconds to wait for the server before ``requests``
        raises a timeout error. Pass ``None`` to wait indefinitely, which
        was the behaviour before this parameter existed.
    :return: the response object returned by ``requests.get``. The status
        code is not inspected here; callers must check it themselves.
    """
    headers = {
        'User-Agent': get_random_ua(),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,'
                  '*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        # NOTE(review): advertising 'br' presumably requires a Brotli decoder
        # to be installed for the body to be decoded - confirm in deployment.
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'DNT': '1',
        'Referer': 'https://www.google.com/',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'cross-site',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1'
    }
    # Without a timeout a stalled server would block the caller forever.
    page_response = requests.get(url, headers=headers, timeout=timeout)
    return page_response
def _extract_dom(response):
    """Parse markup into a BeautifulSoup tree using the ``lxml`` parser.

    :param response: the markup to parse. Despite the name, ``get_soup``
        passes the decoded page text here, not a response object.
    :return: the BeautifulSoup object built from ``response``.
    """
    return bS(response, 'lxml')