15 changes: 14 additions & 1 deletion crawler/crawler/items.py
@@ -4,4 +4,17 @@
class CrawlerItem(scrapy.Item):
    score = scrapy.Field()
    occupation = scrapy.Field()
    code = scrapy.Field()


class ModificatedCrawlerItem(scrapy.Item):
    score = scrapy.Field()
    occupation = scrapy.Field()
    code = scrapy.Field()
    title = scrapy.Field()
    lay_titles = scrapy.Field()
    description = scrapy.Field()
    tasks = scrapy.Field()
    tasks_number = scrapy.Field()
    activities = scrapy.Field()
    activities_number = scrapy.Field()
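
Note for reference: the first three fields repeat CrawlerItem. Since scrapy.Item supports inheritance, the same declaration could also be written as a subclass; this is only an alternative sketch (the class name ModificatedCrawlerItemAlt is hypothetical), not what the PR does.

import scrapy

from crawler.items import CrawlerItem


# Alternative sketch only: the three shared fields are inherited from CrawlerItem
# instead of being repeated.
class ModificatedCrawlerItemAlt(CrawlerItem):
    title = scrapy.Field()
    lay_titles = scrapy.Field()
    description = scrapy.Field()
    tasks = scrapy.Field()
    tasks_number = scrapy.Field()
    activities = scrapy.Field()
    activities_number = scrapy.Field()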
93 changes: 86 additions & 7 deletions crawler/crawler/pipelines.py
@@ -1,13 +1,92 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import mysql
import mysql.connector


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class CrawlerPipeline:
    def process_item(self, item, spider):
        return item


class ModificatedCrawlerPipeline:
    def __init__(self):
        # Connects to a local MySQL server and (re)creates the target table.
        self.con = mysql.connector.connect(
            host='localhost',
            user='root',
            passwd='1111',
            database='onet_code_db'
        )
        self.cur = self.con.cursor()
        self.create_table()

    def create_table(self):
        # Recreate the table from scratch on every run.
        self.cur.execute("""DROP TABLE IF EXISTS stored_data_mod""")

        self.cur.execute(
            """
            CREATE TABLE stored_data_mod(
                id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
                score VARCHAR(10),
                occupation VARCHAR(100),
                code VARCHAR(25),
                title VARCHAR(200),
                lay_titles VARCHAR(200),
                description VARCHAR(200),
                tasks VARCHAR(200),
                tasks_number VARCHAR(3),
                activities VARCHAR(200),
                activities_number VARCHAR(3)
            )
            """
        )

        # Stored procedure that drops the table again if it holds no rows.
        self.cur.execute("""DROP PROCEDURE IF EXISTS remove_empty_table""")

        self.cur.execute(
            """
            CREATE PROCEDURE remove_empty_table()
            BEGIN
                SET @temp = (SELECT COUNT(*) FROM stored_data_mod);
                IF @temp = 0 THEN
                    DROP TABLE stored_data_mod;
                END IF;
            END
            """
        )

    def drop_table_if_empty(self):
        self.cur.execute("""CALL remove_empty_table()""")

    def process_item(self, item, spider):
        # Insert one row per scraped item and commit immediately.
        self.cur.execute(
            """
            INSERT INTO stored_data_mod(
                score,
                occupation,
                code,
                title,
                lay_titles,
                description,
                tasks,
                tasks_number,
                activities,
                activities_number
            )
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            """,
            (
                item['score'],
                item['occupation'],
                item['code'],
                item['title'],
                item['lay_titles'],
                item['description'],
                item['tasks'],
                item['tasks_number'],
                item['activities'],
                item['activities_number']
            )
        )

        self.con.commit()

        return item
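
A quick way to check what the pipeline wrote after a crawl, assuming the same local test setup the pipeline itself uses (host 'localhost', user 'root', password '1111', database 'onet_code_db'). This is a verification sketch, not part of the project.

import mysql.connector

# Connect with the same credentials the pipeline uses (local test assumption).
con = mysql.connector.connect(
    host='localhost', user='root', passwd='1111', database='onet_code_db'
)
cur = con.cursor()
cur.execute("SELECT occupation, code, tasks_number FROM stored_data_mod LIMIT 5")
for occupation, code, tasks_number in cur.fetchall():
    print(occupation, code, tasks_number)
con.close()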
60 changes: 30 additions & 30 deletions crawler/crawler/settings.py
@@ -12,77 +12,77 @@
SPIDER_MODULES = ['crawler.spiders']
NEWSPIDER_MODULE = 'crawler.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'crawler (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'crawler.middlewares.CrawlerSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     'crawler.middlewares.CrawlerDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'crawler.pipelines.CrawlerPipeline': 300,
    'crawler.pipelines.ModificatedCrawlerPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
86 changes: 86 additions & 0 deletions crawler/crawler/spiders/modificated_scrap.py
@@ -0,0 +1,86 @@
import scrapy
import scrapy.exceptions

from ..pipelines import ModificatedCrawlerPipeline
from ..items import ModificatedCrawlerItem


def str_to_void(s):
    # Return an empty string instead of None so string handling never fails.
    if s is None:
        return ''
    return str(s)


def add_url_part(s):
    # Turn a relative href into an absolute O*NET Code Connector URL.
    url_part = 'https://www.onetcodeconnector.org'

    if str_to_void(s) != '':
        return url_part + str_to_void(s)
    return ''


def number_from_string(s):
    # Keep only the digit characters of a string.
    number = ''

    for symbol in s:
        if symbol.isdigit():
            number += symbol

    return number


def search_term_from_user():
    return input("Enter what you want to find: ")


def check_for_table(response):
    # If the results page has no table, drop the (empty) table and stop the crawl.
    if response.xpath('//table').extract_first() is None:
        ModificatedCrawlerPipeline().drop_table_if_empty()
        raise scrapy.exceptions.CloseSpider('No table exists for this query!!!')


class OnetCodeScraperModificated(scrapy.Spider):
    name = 'modificated_scrap'

    start_urls = [
        'https://www.onetcodeconnector.org' + f"/find/result?s={search_term_from_user()}" + "&a=1",
    ]

    def parse(self, response, **kwargs):
        check_for_table(response)

        rows = response.xpath('//tr[position()>1]')

        for row in rows:
            # A fresh item per row, so previously yielded items are not overwritten.
            items = ModificatedCrawlerItem()

            score = str_to_void(row.xpath('.//td[1]/text()').extract_first())
            occupation = row.xpath('.//td[2]/a/text()').extract_first()
            soc_code = row.xpath('.//td[3]/text()').extract_first()

            title = add_url_part(str_to_void(row.xpath('.//td[4]//a/@href').extract_first()))
            lay_titles = add_url_part(str_to_void(row.xpath('.//td[5]//a/@href').extract_first()))
            description = add_url_part(str_to_void(row.xpath('.//td[6]//a/@href').extract_first()))

            tasks = add_url_part(str_to_void(row.xpath('.//td[7]//a/@href').extract_first()))
            tasks_number = number_from_string(str_to_void(row.xpath('.//td[7]/text()').extract_first()))

            activities = add_url_part(str_to_void(row.xpath('.//td[8]//a/@href').extract_first()))
            activities_number = number_from_string(str_to_void(row.xpath('.//td[8]/text()').extract_first()))

            items['score'] = score
            items['occupation'] = occupation
            items['code'] = soc_code
            items['title'] = title
            items['lay_titles'] = lay_titles
            items['description'] = description

            items['tasks'] = tasks
            items['tasks_number'] = tasks_number

            items['activities'] = activities
            items['activities_number'] = activities_number

            yield items
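
The spider can be run with the usual `scrapy crawl modificated_scrap` command. A minimal script alternative is sketched below; it assumes it is executed from the Scrapy project root so get_project_settings() picks up ITEM_PIPELINES, and note that the input() prompt for the search term appears as soon as the spider module is loaded.

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Runs the spider with the project settings, so ModificatedCrawlerPipeline is active.
process = CrawlerProcess(get_project_settings())
process.crawl('modificated_scrap')
process.start()  # blocks until the crawl finishes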