
Commit a8eb874
initial commit
0 parents, commit a8eb874

12 files changed: +346, -0 lines

.gitignore (+4)

@@ -0,0 +1,4 @@
__pycache__/
/shopify_spy/resources/
test.py
.scrapy/

parsing.py (+19)

@@ -0,0 +1,19 @@
import os
import hashlib


def sha1(string):
    return hashlib.sha1(string.encode("utf-8")).hexdigest()


def write_test_page(response, dst):
    os.makedirs(dst, exist_ok=True)
    path = os.path.join(dst, sha1(response.request.url) + ".html")
    with open(path, "wb") as f:
        f.write(response.body)


def drop_dups(seq):
    seen = set()
    add = seen.add
    return [x for x in seq if not (x in seen or add(x))]
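The helpers above hash a response URL into a stable filename, dump the raw body for offline testing, and de-duplicate a sequence while preserving order. A minimal usage sketch, assuming parsing.py is importable from the project root; the fake response object and the out/pages directory are illustrative, not part of the commit:

from types import SimpleNamespace

import parsing

# drop_dups keeps the first occurrence of each element and preserves order.
assert parsing.drop_dups([3, 1, 3, 2, 1]) == [3, 1, 2]

# write_test_page only needs an object shaped like a Scrapy response:
# it reads response.request.url and response.body.
fake_response = SimpleNamespace(
    request=SimpleNamespace(url="https://example.myshopify.com/"),
    body=b"<html><body>hello</body></html>",
)
parsing.write_test_page(fake_response, "out/pages")
# -> writes out/pages/<sha1 of the URL>.html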

requirements.txt (+11)

@@ -0,0 +1,11 @@
# Requirements automatically generated by pigar.
# https://github.com/damnever/pigar

# shopify_spy\items.py: 8
# shopify_spy\middlewares.py: 8
# shopify_spy\spiders\google.py: 3
# shopify_spy\spiders\shopify.py: 4
Scrapy == 1.6.0

# shopify_spy\spiders\google.py: 4
beautifulsoup4 == 4.9.3

scrapy.cfg (+11)

@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = shopify_spy.settings

[deploy]
#url = http://localhost:6800/
project = shopify_spy

shopify_spy/__init__.py

Whitespace-only changes (an empty file marking the package).

shopify_spy/items.py (+14)

@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


# class SearchResult(scrapy.Item):
#     # define the fields for your item here like:
#     # name = scrapy.Field()
#     url = scrapy.Field()
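The items module is left as the stock Scrapy template, with a SearchResult item sketched out in comments. If it were enabled, a minimal version might look like the sketch below; this is an assumption about intent, since the spiders in this commit yield plain dicts instead of Item instances:

import scrapy


class SearchResult(scrapy.Item):
    # Single field mirroring the dicts yielded by GoogleSpider, e.g. {"url": ...}.
    url = scrapy.Field()


# A spider could then yield SearchResult(url="https://example.myshopify.com/")
# instead of a bare dict, letting item pipelines validate field names.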

shopify_spy/middlewares.py (+103)

@@ -0,0 +1,103 @@
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


# class ShopifySearcherSpiderMiddleware(object):
#     # Not all methods need to be defined. If a method is not defined,
#     # scrapy acts as if the spider middleware does not modify the
#     # passed objects.
#
#     @classmethod
#     def from_crawler(cls, crawler):
#         # This method is used by Scrapy to create your spiders.
#         s = cls()
#         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
#         return s
#
#     def process_spider_input(self, response, spider):
#         # Called for each response that goes through the spider
#         # middleware and into the spider.
#
#         # Should return None or raise an exception.
#         return None
#
#     def process_spider_output(self, response, result, spider):
#         # Called with the results returned from the Spider, after
#         # it has processed the response.
#
#         # Must return an iterable of Request, dict or Item objects.
#         for i in result:
#             yield i
#
#     def process_spider_exception(self, response, exception, spider):
#         # Called when a spider or process_spider_input() method
#         # (from other spider middleware) raises an exception.
#
#         # Should return either None or an iterable of Response, dict
#         # or Item objects.
#         pass
#
#     def process_start_requests(self, start_requests, spider):
#         # Called with the start requests of the spider, and works
#         # similarly to the process_spider_output() method, except
#         # that it doesn't have a response associated.
#
#         # Must return only requests (not items).
#         for r in start_requests:
#             yield r
#
#     def spider_opened(self, spider):
#         spider.logger.info('Spider opened: %s' % spider.name)
#
#
# class ShopifySearcherDownloaderMiddleware(object):
#     # Not all methods need to be defined. If a method is not defined,
#     # scrapy acts as if the downloader middleware does not modify the
#     # passed objects.
#
#     @classmethod
#     def from_crawler(cls, crawler):
#         # This method is used by Scrapy to create your spiders.
#         s = cls()
#         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
#         return s
#
#     def process_request(self, request, spider):
#         # Called for each request that goes through the downloader
#         # middleware.
#
#         # Must either:
#         # - return None: continue processing this request
#         # - or return a Response object
#         # - or return a Request object
#         # - or raise IgnoreRequest: process_exception() methods of
#         #   installed downloader middleware will be called
#         return None
#
#     def process_response(self, request, response, spider):
#         # Called with the response returned from the downloader.
#
#         # Must either;
#         # - return a Response object
#         # - return a Request object
#         # - or raise IgnoreRequest
#         return response
#
#     def process_exception(self, request, exception, spider):
#         # Called when a download handler or a process_request()
#         # (from other downloader middleware) raises an exception.
#
#         # Must either:
#         # - return None: continue processing this exception
#         # - return a Response object: stops process_exception() chain
#         # - return a Request object: stops process_exception() chain
#         pass
#
#     def spider_opened(self, spider):
#         spider.logger.info('Spider opened: %s' % spider.name)

shopify_spy/pipelines.py (+11)

@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


# class ShopifySearcherPipeline(object):
#     def process_item(self, item, spider):
#         return item

shopify_spy/settings.py (+96)

@@ -0,0 +1,96 @@
# -*- coding: utf-8 -*-
from os import path
from pathlib import PurePath

# Scrapy settings for shopify_spy project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'shopify_spy'

SPIDER_MODULES = ['shopify_spy.spiders']
NEWSPIDER_MODULE = 'shopify_spy.spiders'
RESOURCES_DIR = path.join(path.dirname(path.abspath(__file__)), "resources")
TEST_PAGES_DIR = path.join(RESOURCES_DIR, "test_pages")

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'shopify_spy (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'shopify_spy.middlewares.ShopifyGuesserSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': 543,
# }

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
#     'shopify_spy.pipelines.ShopifyGuesserPipeline': 300,
# }

FEED_URI = PurePath(RESOURCES_DIR).as_uri() + "/%(name)s/%(time)s.json"
FEED_FORMAT = "jsonlines"

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = True

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
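Two non-standard settings are worth noting: RESOURCES_DIR anchors all output next to the package, and FEED_URI turns it into a file:// feed templated by spider name and crawl time. A small sketch of how the URI resolves; the /home/user/shopify_spy path is an illustrative assumption:

from pathlib import PurePath

# Assume the project is checked out at /home/user/shopify_spy (example path only).
resources_dir = "/home/user/shopify_spy/shopify_spy/resources"

feed_uri = PurePath(resources_dir).as_uri() + "/%(name)s/%(time)s.json"
print(feed_uri)
# file:///home/user/shopify_spy/shopify_spy/resources/%(name)s/%(time)s.json
# Scrapy substitutes %(name)s with the spider name and %(time)s with the crawl
# timestamp, so ShopifySpider output lands at
# .../resources/ShopifySpider/<timestamp>.json, one JSON object per line
# (FEED_FORMAT = "jsonlines").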

shopify_spy/spiders/__init__.py (+4)

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

shopify_spy/spiders/google.py (+32)

@@ -0,0 +1,32 @@
# -*- coding: utf-8 -*-
import re
import scrapy
import bs4

RE_MYSHOPIFY = re.compile(r"https?://[\w\d\-]+\.myshopify\.com/?")


class GoogleSpider(scrapy.Spider):
    name = 'GoogleSpider'
    allowed_domains = ['google.com']
    custom_settings = {
        "ROBOTSTXT_OBEY": False,
    }

    def __init__(self, query=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.start_urls = [get_search_url(query)]

    def parse(self, response):
        soup = bs4.BeautifulSoup(response.text, "lxml")
        urls = [x["href"] for x in soup.find_all("a", href=RE_MYSHOPIFY)]
        urls = [RE_MYSHOPIFY.search(x)[0] for x in urls]
        yield from [{"url": x} for x in urls]
        next_ = soup.find("a", attrs={"aria-label": "Next page", "href": True})
        if next_:
            yield scrapy.Request("https://www.google.com" + next_["href"])


def get_search_url(query, site="myshopify.com"):
    terms = "+".join(query.split())
    return "https://www.google.com/search?q={terms}+site:{site}".format(**locals())

shopify_spy/spiders/shopify.py (+41)

@@ -0,0 +1,41 @@
# -*- coding: utf-8 -*-
import re
import json
import scrapy
import urllib.parse

PRODUCT_URL = r"/products/"


class ShopifySpider(scrapy.spiders.SitemapSpider):
    name = 'ShopifySpider'
    # allowed_domains = ["shopify.com"]
    sitemap_rules = [(PRODUCT_URL, "parse_product")]

    def __init__(self, *args, url=None, from_file=None, **kwargs):
        super().__init__(*args, **kwargs)
        if url:
            self.sitemap_urls = [get_sitemap_url(url)]
        elif from_file:
            with open(from_file) as f:
                self.sitemap_urls = [get_sitemap_url(x) for x in f.readlines()]
        else:
            self.sitemap_urls = []

    def sitemap_filter(self, entries):
        for entry in entries:
            if re.search(PRODUCT_URL, entry["loc"]):
                entry["loc"] = entry["loc"] + ".json"
            yield entry

    def parse_product(self, response):
        product = json.loads(response.text)
        product["url"] = response.request.url
        product["store"] = urllib.parse.urlparse(response.request.url).netloc
        yield product


def get_sitemap_url(url):
    url = urllib.parse.urlparse(url)
    url = ["https", url.netloc, "/sitemap.xml"] + [None]*3
    return urllib.parse.urlunparse(url)
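ShopifySpider is a SitemapSpider: it accepts either a single store URL (-a url=...) or a file of URLs (-a from_file=...), rewrites each to its sitemap, and requests the .json variant of every product page. A small sketch of the URL handling; example.myshopify.com is a placeholder domain:

from shopify_spy.spiders.shopify import get_sitemap_url

# Any page from a store is reduced to that store's sitemap:
print(get_sitemap_url("http://example.myshopify.com/collections/all?page=2"))
# https://example.myshopify.com/sitemap.xml

# The spider then follows sitemap entries matching /products/ and appends
# ".json", so a product page such as
#   https://example.myshopify.com/products/candle
# is fetched as
#   https://example.myshopify.com/products/candle.json
# and parse_product() loads the JSON body and tags it with its url and store.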
