-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreview_scraper.py
60 lines (52 loc) · 3.23 KB
/
review_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# Il file implementa una classe che consente di fare scraping di recensioni amazon
import re
from collections import namedtuple
import requests
from bs4 import BeautifulSoup
# Immutable record holding one scraped Amazon review.
UserReview = namedtuple(
    'UserReview',
    'product_name review_title comment rating date username profile_url verified_purchase',
)
class AmazonScraper:
    """Scrapes review pages from amazon.com into ``UserReview`` tuples."""

    # Matches review dates such as "March 15, 2021"; month names may be abbreviated.
    # Raw strings fix the invalid escape sequences (\d) of the original pattern
    # without changing what it matches.
    review_date_pattern = re.compile(
        r'(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?'
        r'|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?) \d+, \d{4}')
    # Captures the product-name slug from a product-reviews URL. The hostname
    # dots are now escaped so they no longer match arbitrary characters.
    product_name_pattern = re.compile(r'^https://www\.amazon\.com/(.+)/product-reviews')

    def __init__(self):
        # Create a browser-like session so Amazon serves regular HTML
        # instead of rejecting the default requests User-Agent.
        self.session = requests.Session()
        self.session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv: 87.0) Gecko/20100101 Firefox/87.0'
        self.session.headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        self.session.headers['Accept-Language'] = 'en-US,en;q=0.5'
        self.session.headers['Connection'] = 'keep-alive'
        self.session.headers['Upgrade-Insecure-Requests'] = '1'

    def scrapeReviews(self, url, page_num, filter_by='recent', star="critical"):
        """Scrape one page of reviews for a product.

        Args:
            url: an ``https://www.amazon.com/<slug>/product-reviews/...`` URL.
            page_num: 1-based results-page number to fetch.
            filter_by: sort order, ``'recent'`` or ``'helpful'``.
            star: star filter passed to ``filterByStar`` (e.g. ``'critical'``).

        Returns:
            A list of ``UserReview`` tuples, or ``None`` when the URL is
            invalid or any fetch/parse error occurs.
        """
        try:
            # Guard the base-URL match: the original dereferenced .group()
            # unconditionally and crashed (silently) on URLs with no '/'.
            base_match = re.search(r'^.+(?=/)', url)
            if base_match is None:
                print('url is invalid. Please check the url.')
                return
            # NOTE(review): 'reviewerType' appears twice in this query string
            # (all_reviews and avp_only_reviews). Preserved byte-for-byte
            # because which one Amazon honors is unverified from here.
            review_url = base_match.group() + '?reviewerType=all_reviews&sortBy={0}&reviewerType=avp_only_reviews&filterByStar={1}&pageNumber={2}'.format(filter_by, star, page_num)
            print('Processing {0}...'.format(review_url))
            response = self.session.get(review_url)

            # Run the product-name regex once (the original ran it twice).
            name_match = self.product_name_pattern.search(url)
            if name_match is None:
                print('url is invalid. Please check the url.')
                return
            product_name = name_match.group(1).replace('-', ' ')

            soup = BeautifulSoup(response.content, 'html.parser')
            review_list = soup.find('div', {'id': 'cm_cr-review_list'})
            reviews = []
            for product_review in review_list.find_all('div', {'data-hook': 'review'}):
                review_title = product_review.find('a', {'data-hook': 'review-title'}).text.strip()
                # Verified-purchase badge is optional; presence → True.
                verified_purchase = product_review.find('span', {'data-hook': 'avp-badge'}) is not None
                review_body = product_review.find('span', {'data-hook': 'review-body'}).text.strip()
                rating = product_review.find('i', {'data-hook': 'review-star-rating'}).text
                review_date = self.review_date_pattern.search(
                    product_review.find('span', {'data-hook': 'review-date'}).text).group(0)
                # First <a> in the card is the reviewer's profile link.
                username = product_review.a.span.text
                user_profile = 'https://amazon.com/{0}'.format(product_review.a['href'])
                reviews.append(UserReview(product_name, review_title, review_body, rating,
                                          review_date, username, user_profile, verified_purchase))
            return reviews
        except Exception as e:
            # Boundary catch-all: network failures, CAPTCHA pages, and
            # missing elements all land here; report and return None.
            print(e)
            return None