scraper.py
from urllib.request import urlopen
from bs4 import BeautifulSoup
from newegg import newegg, eggUrls
from amazon import amazon, zonUrls
from flipkart import flipkart, kartUrls


def parse(url):
    '''
    Fetch the page at url and return it as a BeautifulSoup object.
    '''
# open connection to url, and download the page
client = urlopen(url)
page_html = client.read()
client.close()
# parse the page
page_soup = BeautifulSoup(page_html, 'html.parser')
return page_soup
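

# NOTE: added sketch, not part of the original script. Amazon and Flipkart
# often block or serve a CAPTCHA page to the default urllib user agent, so a
# header-aware variant of parse() can be useful; the User-Agent string and the
# 30-second timeout below are illustrative assumptions only.
from urllib.request import Request


def parse_with_headers(url, timeout=30):
    '''
    Like parse(), but sends a browser-like User-Agent header and enforces a timeout.
    '''
    request = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with urlopen(request, timeout=timeout) as client:
        page_html = client.read()
    return BeautifulSoup(page_html, 'html.parser')
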
urls = ['https://www.newegg.com/global/in-en/Laptops-Notebooks/SubCategory/ID-32?name=Laptops-Notebooks&Tid=1297918&Order=PRICE',
# amazon 5k to 20k
'https://www.amazon.in/s?bbn=1375424031&rh=n%3A976392031%2Cn%3A976393031%2Cn%3A1375424031%2Cp_36%3A500000-1999999&s=price-asc-rank',
# amazon 20k to 30k
'https://www.amazon.in/s?bbn=1375424031&rh=n%3A976392031%2Cn%3A976393031%2Cn%3A1375424031%2Cp_36%3A2000000-3000000&s=price-asc-rank',
'https://www.flipkart.com/laptops/pr?sid=6bo%2Fb5g&p%5B%5D=facets.serviceability%5B%5D%3Dtrue&fm=neo%2Fmerchandising&iid=M_3580c4f9-a714-45e8-a54c-64fa60d4b35d_10.f37da198-ab3e-48f0-bc9a-12dfbbbc32c9_DesktopSite&ppt=clp&ppn=laptops-store&sort=price_asc']
# create URLs for the remaining result pages and add them to the list of pages to be parsed
# (note: urls = urls + ... rebinds urls to a new list, so this loop still
#  iterates over the original seed pages only)
for url in urls:
page_soup = parse(url)
if 'newegg' in url:
print('Creating URLs for newegg')
        # keep only the first 25 result pages to avoid going over budget
        urls = urls + eggUrls(page_soup, url)[:25]
if 'amazon' in url:
print('Creating URLs for amazon')
urls = urls + zonUrls(page_soup, url)
if 'flipkart' in url:
print('Creating URLs for flipkart')
        # keep only the first 10 result pages to avoid going over budget
        urls = urls + kartUrls(page_soup, url)[:10]
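
# Optional (added sketch, an assumption about the helper modules): if eggUrls,
# zonUrls or kartUrls can return a page that is already in the list (for
# example the seed page itself), an order-preserving dedupe would avoid
# scraping the same page twice:
# urls = list(dict.fromkeys(urls))
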
# extract each product's information from every page and write it to CSV files
for i, url in enumerate(urls):
page_soup = parse(url)
print(str(i+1) + ' Page')
    # uncomment the next two lines to test the program quickly by scraping only every 10th page
    # if i % 10 != 0:
    #     continue
if 'newegg' in url:
newegg(page_soup)
if 'amazon' in url:
amazon(page_soup)
if 'flipkart' in url:
flipkart(page_soup)
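

# NOTE: added sketch, not part of the original script. A single failed request
# (urllib.error.URLError, which also covers HTTPError) aborts the whole run
# above. A more forgiving version of the extraction loop could skip bad pages
# and pause between requests; the 1-second delay is an illustrative choice.
#
# import time
# from urllib.error import URLError
#
# for i, url in enumerate(urls):
#     try:
#         page_soup = parse(url)
#     except URLError as err:
#         print(f'Skipping page {i + 1}: {err}')
#         continue
#     print(str(i + 1) + ' Page')
#     if 'newegg' in url:
#         newegg(page_soup)
#     if 'amazon' in url:
#         amazon(page_soup)
#     if 'flipkart' in url:
#         flipkart(page_soup)
#     time.sleep(1)  # be polite to the servers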