
Commit a8eb874
initial commit
0 parents, commit a8eb874

12 files changed: +346, -0 lines

.gitignore (+4)

@@ -0,0 +1,4 @@
__pycache__/
/shopify_spy/resources/
test.py
.scrapy/

parsing.py (+19)

@@ -0,0 +1,19 @@
import os
import hashlib


def sha1(string):
    return hashlib.sha1(string.encode("utf-8")).hexdigest()


def write_test_page(response, dst):
    os.makedirs(dst, exist_ok=True)
    path = os.path.join(dst, sha1(response.request.url) + ".html")
    with open(path, "wb") as f:
        f.write(response.body)


def drop_dups(seq):
    seen = set()
    add = seen.add
    return [x for x in seq if not (x in seen or add(x))]
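The helpers above hash a response URL into a stable filename, dump the raw body for offline testing, and de-duplicate a sequence while preserving order. A minimal usage sketch, assuming parsing.py is importable from the project root; the fake response object and the out/pages directory are illustrative, not part of the commit:

from types import SimpleNamespace

import parsing

# drop_dups keeps the first occurrence of each element and preserves order.
assert parsing.drop_dups([3, 1, 3, 2, 1]) == [3, 1, 2]

# write_test_page only needs an object shaped like a Scrapy response:
# it reads response.request.url and response.body.
fake_response = SimpleNamespace(
    request=SimpleNamespace(url="https://example.myshopify.com/"),
    body=b"<html><body>hello</body></html>",
)
parsing.write_test_page(fake_response, "out/pages")
# -> writes out/pages/<sha1 of the URL>.html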

requirements.txt (+11)

@@ -0,0 +1,11 @@
# Requirements automatically generated by pigar.
# https://github.com/damnever/pigar

# shopify_spy\items.py: 8
# shopify_spy\middlewares.py: 8
# shopify_spy\spiders\google.py: 3
# shopify_spy\spiders\shopify.py: 4
Scrapy == 1.6.0

# shopify_spy\spiders\google.py: 4
beautifulsoup4 == 4.9.3

scrapy.cfg (+11)

@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = shopify_spy.settings

[deploy]
#url = http://localhost:6800/
project = shopify_spy

shopify_spy/__init__.py

Whitespace-only changes (an empty file marking the package).

shopify_spy/items.py (+14)

@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


# class SearchResult(scrapy.Item):
#     # define the fields for your item here like:
#     # name = scrapy.Field()
#     url = scrapy.Field()
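The items module is left as the stock Scrapy template, with a SearchResult item sketched out in comments. If it were enabled, a minimal version might look like the sketch below; this is an assumption about intent, since the spiders in this commit yield plain dicts instead of Item instances:

import scrapy


class SearchResult(scrapy.Item):
    # Single field mirroring the dicts yielded by GoogleSpider, e.g. {"url": ...}.
    url = scrapy.Field()


# A spider could then yield SearchResult(url="https://example.myshopify.com/")
# instead of a bare dict, letting item pipelines validate field names.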

shopify_spy/middlewares.py (+103)

@@ -0,0 +1,103 @@
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


# class ShopifySearcherSpiderMiddleware(object):
#     # Not all methods need to be defined. If a method is not defined,
#     # scrapy acts as if the spider middleware does not modify the
#     # passed objects.
#
#     @classmethod
#     def from_crawler(cls, crawler):
#         # This method is used by Scrapy to create your spiders.
#         s = cls()
#         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
#         return s
#
#     def process_spider_input(self, response, spider):
#         # Called for each response that goes through the spider
#         # middleware and into the spider.
#
#         # Should return None or raise an exception.
#         return None
#
#     def process_spider_output(self, response, result, spider):
#         # Called with the results returned from the Spider, after
#         # it has processed the response.
#
#         # Must return an iterable of Request, dict or Item objects.
#         for i in result:
#             yield i
#
#     def process_spider_exception(self, response, exception, spider):
#         # Called when a spider or process_spider_input() method
#         # (from other spider middleware) raises an exception.
#
#         # Should return either None or an iterable of Response, dict
#         # or Item objects.
#         pass
#
#     def process_start_requests(self, start_requests, spider):
#         # Called with the start requests of the spider, and works
#         # similarly to the process_spider_output() method, except
#         # that it doesn't have a response associated.
#
#         # Must return only requests (not items).
#         for r in start_requests:
#             yield r
#
#     def spider_opened(self, spider):
#         spider.logger.info('Spider opened: %s' % spider.name)
#
#
# class ShopifySearcherDownloaderMiddleware(object):
#     # Not all methods need to be defined. If a method is not defined,
#     # scrapy acts as if the downloader middleware does not modify the
#     # passed objects.
#
#     @classmethod
#     def from_crawler(cls, crawler):
#         # This method is used by Scrapy to create your spiders.
#         s = cls()
#         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
#         return s
#
#     def process_request(self, request, spider):
#         # Called for each request that goes through the downloader
#         # middleware.
#
#         # Must either:
#         # - return None: continue processing this request
#         # - or return a Response object
#         # - or return a Request object
#         # - or raise IgnoreRequest: process_exception() methods of
#         #   installed downloader middleware will be called
#         return None
#
#     def process_response(self, request, response, spider):
#         # Called with the response returned from the downloader.
#
#         # Must either;
#         # - return a Response object
#         # - return a Request object
#         # - or raise IgnoreRequest
#         return response
#
#     def process_exception(self, request, exception, spider):
#         # Called when a download handler or a process_request()
#         # (from other downloader middleware) raises an exception.
#
#         # Must either:
#         # - return None: continue processing this exception
#         # - return a Response object: stops process_exception() chain
#         # - return a Request object: stops process_exception() chain
#         pass
#
#     def spider_opened(self, spider):
#         spider.logger.info('Spider opened: %s' % spider.name)

shopify_spy/pipelines.py (+11)

@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


# class ShopifySearcherPipeline(object):
#     def process_item(self, item, spider):
#         return item

shopify_spy/settings.py (+96)

@@ -0,0 +1,96 @@
# -*- coding: utf-8 -*-
from os import path
from pathlib import PurePath

# Scrapy settings for shopify_spy project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'shopify_spy'

SPIDER_MODULES = ['shopify_spy.spiders']
NEWSPIDER_MODULE = 'shopify_spy.spiders'
RESOURCES_DIR = path.join(path.dirname(path.abspath(__file__)), "resources")
TEST_PAGES_DIR = path.join(RESOURCES_DIR, "test_pages")

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'shopify_spy (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'shopify_spy.middlewares.ShopifyGuesserSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': 543,
# }

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
#     'shopify_spy.pipelines.ShopifyGuesserPipeline': 300,
# }

FEED_URI = PurePath(RESOURCES_DIR).as_uri() + "/%(name)s/%(time)s.json"
FEED_FORMAT = "jsonlines"

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = True

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
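Two non-standard settings are worth noting: RESOURCES_DIR anchors all output next to the package, and FEED_URI turns it into a file:// feed templated by spider name and crawl time. A small sketch of how the URI resolves; the /home/user/shopify_spy path is an illustrative assumption:

from pathlib import PurePath

# Assume the project is checked out at /home/user/shopify_spy (example path only).
resources_dir = "/home/user/shopify_spy/shopify_spy/resources"

feed_uri = PurePath(resources_dir).as_uri() + "/%(name)s/%(time)s.json"
print(feed_uri)
# file:///home/user/shopify_spy/shopify_spy/resources/%(name)s/%(time)s.json
# Scrapy substitutes %(name)s with the spider name and %(time)s with the crawl
# timestamp, so ShopifySpider output lands at
# .../resources/ShopifySpider/<timestamp>.json, one JSON object per line
# (FEED_FORMAT = "jsonlines").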

shopify_spy/spiders/__init__.py (+4)

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

shopify_spy/spiders/google.py (+32)

@@ -0,0 +1,32 @@
# -*- coding: utf-8 -*-
import re
import scrapy
import bs4

RE_MYSHOPIFY = re.compile(r"https?://[\w\d\-]+\.myshopify\.com/?")


class GoogleSpider(scrapy.Spider):
    name = 'GoogleSpider'
    allowed_domains = ['google.com']
    custom_settings = {
        "ROBOTSTXT_OBEY": False,
    }

    def __init__(self, query=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.start_urls = [get_search_url(query)]

    def parse(self, response):
        soup = bs4.BeautifulSoup(response.text, "lxml")
        urls = [x["href"] for x in soup.find_all("a", href=RE_MYSHOPIFY)]
        urls = [RE_MYSHOPIFY.search(x)[0] for x in urls]
        yield from [{"url": x} for x in urls]
        next_ = soup.find("a", attrs={"aria-label": "Next page", "href": True})
        if next_:
            yield scrapy.Request("https://www.google.com" + next_["href"])


def get_search_url(query, site="myshopify.com"):
    terms = "+".join(query.split())
    return "https://www.google.com/search?q={terms}+site:{site}".format(**locals())

shopify_spy/spiders/shopify.py (+41)

@@ -0,0 +1,41 @@
# -*- coding: utf-8 -*-
import re
import json
import scrapy
import urllib.parse

PRODUCT_URL = r"/products/"


class ShopifySpider(scrapy.spiders.SitemapSpider):
    name = 'ShopifySpider'
    # allowed_domains = ["shopify.com"]
    sitemap_rules = [(PRODUCT_URL, "parse_product")]

    def __init__(self, *args, url=None, from_file=None, **kwargs):
        super().__init__(*args, **kwargs)
        if url:
            self.sitemap_urls = [get_sitemap_url(url)]
        elif from_file:
            with open(from_file) as f:
                self.sitemap_urls = [get_sitemap_url(x) for x in f.readlines()]
        else:
            self.sitemap_urls = []

    def sitemap_filter(self, entries):
        for entry in entries:
            if re.search(PRODUCT_URL, entry["loc"]):
                entry["loc"] = entry["loc"] + ".json"
            yield entry

    def parse_product(self, response):
        product = json.loads(response.text)
        product["url"] = response.request.url
        product["store"] = urllib.parse.urlparse(response.request.url).netloc
        yield product


def get_sitemap_url(url):
    url = urllib.parse.urlparse(url)
    url = ["https", url.netloc, "/sitemap.xml"] + [None]*3
    return urllib.parse.urlunparse(url)
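ShopifySpider is a SitemapSpider: it accepts either a single store URL (-a url=...) or a file of URLs (-a from_file=...), rewrites each to its sitemap, and requests the .json variant of every product page. A small sketch of the URL handling; example.myshopify.com is a placeholder domain:

from shopify_spy.spiders.shopify import get_sitemap_url

# Any page from a store is reduced to that store's sitemap:
print(get_sitemap_url("http://example.myshopify.com/collections/all?page=2"))
# https://example.myshopify.com/sitemap.xml

# The spider then follows sitemap entries matching /products/ and appends
# ".json", so a product page such as
#   https://example.myshopify.com/products/candle
# is fetched as
#   https://example.myshopify.com/products/candle.json
# and parse_product() loads the JSON body and tags it with its url and store.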
