-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAmazon_Scrapping.py
131 lines (66 loc) · 2.66 KB
/
Amazon_Scrapping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
def get_title(soup):
    """Return the product title scraped from a parsed product page.

    Looks up the span#productTitle element and returns its stripped
    text, or "" when the element is missing.
    """
    try:
        tag = soup.find("span", attrs={"id":'productTitle'})
        return tag.text.strip()
    except AttributeError:
        return ""
def get_price(soup):
    """Return the product price scraped from a parsed product page.

    Tries the regular price element (span#priceblock_ourprice) first,
    then the deal-price element (span#priceblock_dealprice).

    Returns:
        The stripped price string, or "" when neither element is
        present (or the element has no string content).
    """
    try:
        price = soup.find("span", attrs={'id':'priceblock_ourprice'}).string.strip()
    except AttributeError:
        try:
            price = soup.find("span", attrs={'id':'priceblock_dealprice'}).string.strip()
        except AttributeError:
            # Narrowed from a bare `except:` so unrelated exceptions
            # (KeyboardInterrupt, SystemExit, ...) are not swallowed.
            price = ""
    return price
def get_rating(soup):
    """Return the product star rating scraped from a parsed product page.

    Tries the star-icon element first, then falls back to the generic
    a-icon-alt span.

    Returns:
        The stripped rating string (e.g. "4.5 out of 5 stars"), or ""
        when neither element is present.
    """
    try:
        rating = soup.find("i", attrs={'class':'a-icon a-icon-star a-star-4-5'}).string.strip()
    except AttributeError:
        try:
            rating = soup.find("span", attrs={'class':'a-icon-alt'}).string.strip()
        except AttributeError:
            # Narrowed from a bare `except:` so unrelated exceptions
            # are not silently swallowed.
            rating = ""
    return rating
def get_review_count(soup):
    """Return the customer-review count text from a parsed product page.

    Returns the stripped text of span#acrCustomerReviewText
    (e.g. "1,234 ratings"), or "" when the element is missing.
    """
    try:
        return soup.find("span", attrs={'id':'acrCustomerReviewText'}).string.strip()
    except AttributeError:
        return ""
def get_availability(soup):
    """Return the availability text (e.g. "In Stock") from a product page.

    Looks up div#availability, then its inner span; returns the
    stripped span text, or "Not Available" when either is missing.
    """
    try:
        container = soup.find("div", attrs={'id':'availability'})
        return container.find("span").string.strip()
    except AttributeError:
        return "Not Available"
if __name__ == '__main__':
    # NOTE(review): the User-Agent is empty; Amazon commonly rejects such
    # requests, so a real browser UA string should be supplied here.
    HEADERS = {'User-Agent':'', 'Accept-Language': 'en-US, en;q=0.5'}
    URL = "https://www.amazon.com/s?k=playstation+4&ref=nb_sb_noss_2"

    # Fetch the search-results page and collect the product-page links.
    webpage = requests.get(URL, headers=HEADERS)
    soup = BeautifulSoup(webpage.content, "html.parser")
    links = soup.find_all("a", attrs={'class':'a-link-normal s-no-outline'})
    links_list = [link.get('href') for link in links]

    # One list per scraped field; appended to in lock-step per product.
    d = {"title":[], "price":[], "rating":[], "reviews":[],"availability":[]}

    # Visit each product page and scrape its fields.
    for link in links_list:
        new_webpage = requests.get("https://www.amazon.com" + link, headers=HEADERS)
        new_soup = BeautifulSoup(new_webpage.content, "html.parser")
        d['title'].append(get_title(new_soup))
        d['price'].append(get_price(new_soup))
        d['rating'].append(get_rating(new_soup))
        d['reviews'].append(get_review_count(new_soup))
        d['availability'].append(get_availability(new_soup))

    amazon_df = pd.DataFrame.from_dict(d)
    # Drop rows whose title could not be scraped. Assign the result back
    # instead of using inplace=True on a column slice, which is a
    # deprecated chained-assignment pattern in pandas.
    amazon_df['title'] = amazon_df['title'].replace('', np.nan)
    amazon_df = amazon_df.dropna(subset=['title'])
    amazon_df.to_csv("amazon_data.csv", header=True, index=False)
    print(amazon_df)