Skip to content

Commit

Permalink
Merge pull request #95 from Crinibus/dev
Browse files Browse the repository at this point in the history
Add ability to scrape and add products from Expert.dk, MM-Vision.dk, Coolshop.dk and Sharkgaming
  • Loading branch information
Crinibus authored Oct 2, 2020
2 parents 9e6b104 + da780a3 commit b672ef0
Show file tree
Hide file tree
Showing 4 changed files with 187 additions and 23 deletions.
10 changes: 9 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
**The tech scraper can scrape prices on products from Komplett.dk, Proshop.dk, Computersalg.dk, Elgiganten.dk, AvXperten.dk, Av-Cables.dk, Amazon.com, eBay.com and Power.dk**<br/><br/>
**The tech scraper can scrape prices on products from Komplett.dk, Proshop.dk, Computersalg.dk, Elgiganten.dk, AvXperten.dk, Av-Cables.dk, Amazon.com, eBay.com, Power.dk, Expert.dk, MM-Vision.dk, Coolshop.dk and Sharkgaming.dk**<br/><br/>
**The Fakta scraper can scrape this week's discounts.**

<br/>
Expand Down Expand Up @@ -81,6 +81,14 @@ There is some optional arguments you can use when running add_product.py, these

- --power

- --expert

- --mmvision

- --coolshop

- --sharkgaming

When using one or more of the "domain" arguments, only the chosen domains get added to records.json under the product name.

<br/>
Expand Down
10 changes: 9 additions & 1 deletion tech_scraping/README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
**This program can scrape prices on products from Komplett.dk, Proshop.dk, Computersalg.dk, Elgiganten.dk, AvXperten.dk, Av-Cables.dk, Amazon.com, eBay.com and Power.dk**
**This program can scrape prices on products from Komplett.dk, Proshop.dk, Computersalg.dk, Elgiganten.dk, AvXperten.dk, Av-Cables.dk, Amazon.com, eBay.com, Power.dk, Expert.dk, MM-Vision.dk, Coolshop.dk and Sharkgaming.dk**

# Table of contents
- [First setup](#first-setup)
Expand Down Expand Up @@ -69,4 +69,12 @@ There is some optional arguments you can use when running add_product.py, these

- --power

- --expert

- --mmvision

- --coolshop

- --sharkgaming

When using one or more of the "domain" arguments, only the chosen domains get added to records.json under the product name.
133 changes: 113 additions & 20 deletions tech_scraping/add_product.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import requests
from bs4 import BeautifulSoup
import json
from scraping import change_name
from scraping import change_name, change_æøå
import argparse


Expand Down Expand Up @@ -64,6 +64,26 @@ def argparse_setup():
'if this is the only optional flag',
action="store_true")

parser.add_argument('--expert',
help='add only expert-domain under the product-name,'
'if this is the only optional flag',
action="store_true")

parser.add_argument('--mmvision',
help='add only mm-vision-domain under the product-name,'
'if this is the only optional flag',
action="store_true")

parser.add_argument('--coolshop',
help='add only coolshop-domain under the product-name,'
'if this is the only optional flag',
action="store_true")

parser.add_argument('--sharkgaming',
help='add only sharkgaming-domain under the product-name,'
'if this is the only optional flag',
action="store_true")

return parser.parse_args()


Expand Down Expand Up @@ -97,30 +117,23 @@ def get_product_name(link):
return change_name(html_soup.find('h1', class_='product-title').text.lower())
elif URL_domain == 'www.power.dk':
return change_name(html_soup.find('title').text.replace(' - Power.dk', '').lower())
elif URL_domain == 'www.expert.dk':
return change_name(html_soup.find('meta', property='og:title')['content'].lower())
elif URL_domain == 'www.mm-vision.dk':
return change_name(html_soup.find('h1', itemprop='name').text.strip().lower())
elif URL_domain == 'www.coolshop.dk':
return change_name(html_soup.find('div', class_='thing-header').text.strip().lower())
elif URL_domain == 'www.sharkgaming.dk':
return change_name(html_soup.find('div', class_='product-name').text.strip().lower())
else:
return None


def ændre_æøå(navn):
    """Replace the Danish letters æ, ø and å with their ASCII digraphs.

    Used to avoid non-ASCII (unicode-escaped) characters in generated
    names: æ -> ae, ø -> oe, å -> aa.  All other characters pass through
    unchanged, and the converted name is returned.
    """
    # str.translate performs the whole substitution in a single C-level
    # pass instead of rebuilding the string one character at a time with
    # repeated concatenation (which is quadratic in the worst case).
    return navn.translate(str.maketrans({'æ': 'ae', 'ø': 'oe', 'å': 'aa'}))


def check_arguments():
"""Check if any of the optional domain arguments is giving to the script
and returns those that are as one json-object."""
json_object = json.loads('{}')
if args.komplett or args.proshop or args.computersalg or args.elgiganten or args.avxperten or args.avcables or args.amazon or args.ebay or args.power:
if args.komplett or args.proshop or args.computersalg or args.elgiganten or args.avxperten or args.avcables or args.amazon or args.ebay or args.power or args.expert or args.mmvision or args.coolshop or args.sharkgaming:
if args.komplett:
json_object.update({
f"{komplett_domain}": {
Expand Down Expand Up @@ -211,6 +224,46 @@ def check_arguments():
"dates": {}
}
})
if args.expert:
json_object.update({
f"{expert_domain}": {
"info": {
"part_num": "",
"url": ""
},
"dates": {}
}
})
if args.mmvision:
json_object.update({
f"{mmvision_domain}": {
"info": {
"part_num": "",
"url": ""
},
"dates": {}
}
})
if args.coolshop:
json_object.update({
f"{coolshop_domain}": {
"info": {
"part_num": "",
"url": ""
},
"dates": {}
}
})
if args.sharkgaming:
json_object.update({
f"{sharkgaming_domain}": {
"info": {
"part_num": "",
"url": ""
},
"dates": {}
}
})
else:
json_object = {
f"{komplett_domain}": {
Expand Down Expand Up @@ -275,6 +328,34 @@ def check_arguments():
"url": ""
},
"dates": {}
},
f"{expert_domain}": {
"info": {
"part_num": "",
"url": ""
},
"dates": {}
},
f"{mmvision_domain}": {
"info": {
"part_num": "",
"url": ""
},
"dates": {}
},
f"{coolshop_domain}": {
"info": {
"part_num": "",
"url": ""
},
"dates": {}
},
f"{sharkgaming_domain}": {
"info": {
"part_num": "",
"url": ""
},
"dates": {}
}
}
return json_object
Expand Down Expand Up @@ -314,6 +395,14 @@ def find_domain(domain):
return 'eBay'
elif domain == 'www.power.dk':
return 'Power'
elif domain == 'www.expert.dk':
return 'Expert'
elif domain == 'www.mm-vision.dk':
return 'MMVision'
elif domain == 'www.coolshop.dk':
return 'Coolshop'
elif domain == 'www.sharkgaming.dk':
return 'Sharkgaming'


def add_to_scraper(kategori, link, url_domain):
Expand All @@ -334,9 +423,9 @@ def main(kategori, link):
print(f'Sorry, but I can\'t scrape from this domain: {URL_domain}')
return

# Ændre æ, ø og/eller å
kategori = ændre_æøå(kategori)
produkt_navn = ændre_æøå(produkt_navn)
# Change æ, ø and/or å
kategori = change_æøå(kategori)
produkt_navn = change_æøå(produkt_navn)

save_json(kategori, produkt_navn)
add_to_scraper(kategori, link, URL_domain)
Expand All @@ -352,5 +441,9 @@ def main(kategori, link):
amazon_domain = 'www.amazon.com'
ebay_domain = 'www.ebay.com'
power_domain = 'www.power.dk'
expert_domain = 'www.expert.dk'
mmvision_domain = 'www.mm-vision.dk'
coolshop_domain = 'www.coolshop.dk'
sharkgaming_domain = 'www.sharkgaming.dk'
args = argparse_setup()
main(args.category, args.url)
57 changes: 56 additions & 1 deletion tech_scraping/scraping.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def __init__(self, category, URL):
except Exception as err:
logger.error(f'Failed in method "{self.__class__.__name__}.get_info()": {err}', exc_info=True)

self.name = change_name(self.name)
self.name = change_æøå(change_name(self.name))
self.date = str(datetime.today().strftime('%Y-%m-%d'))
self.get_part_num()
self.shorten_url()
Expand Down Expand Up @@ -95,6 +95,14 @@ def get_part_num(self):
self.part_num = self.URL.split('=')[1]
elif self.URL_domain == 'www.power.dk':
self.part_num = self.URL.split('/')[-2].replace('p-', '')
elif self.URL_domain == 'www.expert.dk':
self.part_num = self.URL.split('/')[-2].replace('p-', '')
elif self.URL_domain == 'www.mm-vision.dk':
self.part_num = self.html_soup.find('input', type='radio')['value']
elif self.URL_domain == 'www.coolshop.dk':
self.part_num = self.html_soup.find_all('div', id='attributeSku')[1].text.strip()
elif self.URL_domain == 'www.sharkgaming.dk' or self.URL_domain == 'sharkgaming.dk':
self.part_num = 'Non existing on Sharkgaming'

def check_part_num(self):
"""
Expand Down Expand Up @@ -167,6 +175,14 @@ def shorten_url(self):
self.short_url = self.URL.split('?')[0]
elif self.URL_domain == 'www.power.dk':
self.short_url = f'https://www.power.dk/{self.URL.split("/")[3]}/p-{self.part_num}'
elif self.URL_domain == 'www.expert.dk':
self.short_url = f'https://www.expert.dk/{self.URL.split("/")[3]}/p-{self.part_num}'
elif self.URL_domain == 'www.mm-vision.dk':
self.short_url = self.URL
elif self.URL_domain == 'www.coolshop.dk':
self.short_url = f'https://www.coolshop.dk/produkt/{self.URL.split("/")[-2]}/'
elif self.URL_domain == 'www.sharkgaming.dk' or self.URL_domain == 'sharkgaming.dk':
self.short_url = self.URL

def print_info(self):
"""Print info about the product in the terminal."""
Expand Down Expand Up @@ -203,6 +219,21 @@ def change_name(name):
return name


def change_æøå(name):
    """Replace the Danish letters æ, ø and å with their ASCII digraphs.

    Used to avoid non-ASCII (unicode-escaped) characters in product and
    category names: æ -> ae, ø -> oe, å -> aa.  All other characters pass
    through unchanged, and the converted name is returned.
    """
    # str.translate performs the whole substitution in a single C-level
    # pass instead of rebuilding the string one character at a time with
    # repeated concatenation (which is quadratic in the worst case).
    return name.translate(str.maketrans({'æ': 'ae', 'ø': 'oe', 'å': 'aa'}))


class Komplett(Scraper):
def get_info(self):
self.name = self.html_soup.find('div', class_='product-main-info__info').h1.span.text.lower()
Expand Down Expand Up @@ -270,6 +301,30 @@ def get_info(self):
self.price = self.html_soup.find('meta', property='product:price:amount')['content'].split(',')[0]


class Expert(Scraper):
    def get_info(self):
        """Pull the product name and price out of an Expert.dk page's meta tags."""
        title_tag = self.html_soup.find('meta', property='og:title')
        price_tag = self.html_soup.find('meta', property='product:price:amount')
        self.name = title_tag['content'].lower()
        # Keep only the whole-krone part of the price (drop the decimals).
        self.price = price_tag['content'].split(',')[0]


class MMVision(Scraper):
    def get_info(self):
        """Pull the product name and price out of an MM-Vision.dk page."""
        heading = self.html_soup.find('h1', itemprop='name')
        price_element = self.html_soup.find('h3', class_='product-price text-right')
        self.name = heading.text.strip().lower()
        # Strip the ",-" suffix and the thousands separator from the price text.
        self.price = price_element.text.replace(',-', '').replace('.', '')


class Coolshop(Scraper):
    def get_info(self):
        """Pull the product name and price out of a Coolshop.dk page."""
        header = self.html_soup.find('div', class_='thing-header')
        price_meta = self.html_soup.find('meta', property='product:price:amount')
        self.name = header.text.strip().lower()
        # Keep only the whole-krone part of the price (drop the decimals).
        self.price = price_meta['content'].split('.')[0]


class Sharkgaming(Scraper):
    def get_info(self):
        """Pull the product name and price out of a Sharkgaming.dk page."""
        name_element = self.html_soup.find('div', class_='product-name')
        price_element = self.html_soup.find('span', class_='price')
        self.name = name_element.text.strip().lower()
        # Drop the " kr." currency suffix and the thousands separator.
        self.price = price_element.text.replace(' kr.', '').replace('.', '')


if __name__ == '__main__':
logger = log_setup()
Komplett('ssd', 'https://www.komplett.dk/product/1133452/hardware/lagring/harddiskssd/ssd-m2/corsair-force-series-mp600-1tb-m2-ssd#')
Expand Down

0 comments on commit b672ef0

Please sign in to comment.