From 4153a57d8ea0bc63378057faa5c6f18a92cd0e78 Mon Sep 17 00:00:00 2001
From: tarpetos
Date: Fri, 4 Nov 2022 15:27:34 +0200
Subject: [PATCH] Adding the ability to enter search queries

---
 crawler/crawler/items.py                     | 15 +++-
 crawler/crawler/pipelines.py                 | 93 ++++++++++++++++++--
 crawler/crawler/settings.py                  | 60 ++++++-------
 crawler/crawler/spiders/modificated_scrap.py | 86 ++++++++++++++++++
 4 files changed, 216 insertions(+), 38 deletions(-)
 create mode 100644 crawler/crawler/spiders/modificated_scrap.py

diff --git a/crawler/crawler/items.py b/crawler/crawler/items.py
index f833183..8b3082b 100644
--- a/crawler/crawler/items.py
+++ b/crawler/crawler/items.py
@@ -4,4 +4,17 @@
 class CrawlerItem(scrapy.Item):
     score = scrapy.Field()
     occupation = scrapy.Field()
-    code = scrapy.Field()
\ No newline at end of file
+    code = scrapy.Field()
+
+
+class ModificatedCrawlerItem(scrapy.Item):
+    score = scrapy.Field()
+    occupation = scrapy.Field()
+    code = scrapy.Field()
+    title = scrapy.Field()
+    lay_titles = scrapy.Field()
+    description = scrapy.Field()
+    tasks = scrapy.Field()
+    tasks_number = scrapy.Field()
+    activities = scrapy.Field()
+    activities_number = scrapy.Field()
diff --git a/crawler/crawler/pipelines.py b/crawler/crawler/pipelines.py
index e3aa1dc..6190306 100644
--- a/crawler/crawler/pipelines.py
+++ b/crawler/crawler/pipelines.py
@@ -1,13 +1,92 @@
-# Define your item pipelines here
-#
-# Don't forget to add your pipeline to the ITEM_PIPELINES setting
-# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+import mysql
+import mysql.connector
 
 
-# useful for handling different item types with a single interface
-from itemadapter import ItemAdapter
+class CrawlerPipeline:
+    def process_item(self, item, spider):
+        return item
 
 
-class CrawlerPipeline:
+class ModificatedCrawlerPipeline:
+    def __init__(self):
+        self.con = mysql.connector.connect(
+            host='localhost',
+            user='root',
+            passwd='1111',
+            database='onet_code_db'
+        )
+        self.cur = self.con.cursor()
+        self.create_table()
+
+    def create_table(self):
+        self.cur.execute("""DROP TABLE IF EXISTS stored_data_mod""")
+
+        self.cur.execute(
+            """
+            CREATE TABLE stored_data_mod(
+                id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
+                score VARCHAR(10),
+                occupation VARCHAR(100),
+                code VARCHAR(25),
+                title VARCHAR(200),
+                lay_titles VARCHAR(200),
+                description VARCHAR(200),
+                tasks VARCHAR(200),
+                tasks_number VARCHAR(3),
+                activities VARCHAR(200),
+                activities_number VARCHAR(3)
+            )
+            """
+        )
+
+        self.cur.execute("""DROP PROCEDURE IF EXISTS remove_empty_table""")
+
+        self.cur.execute(
+            """
+            CREATE PROCEDURE remove_empty_table()
+            BEGIN
+                SET @temp = (SELECT COUNT(*) FROM stored_data_mod);
+                IF @temp = 0 THEN
+                    DROP TABLE stored_data_mod;
+                END IF;
+            END
+            """
+        )
+
+    def drop_table_if_empty(self):
+        self.cur.execute("""CALL remove_empty_table()""")
+
     def process_item(self, item, spider):
+        self.cur.execute(
+            """
+            INSERT INTO stored_data_mod(
+                score,
+                occupation,
+                code,
+                title,
+                lay_titles,
+                description,
+                tasks,
+                tasks_number,
+                activities,
+                activities_number
+            )
+            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
+            """,
+            (
+                item['score'],
+                item['occupation'],
+                item['code'],
+                item['title'],
+                item['lay_titles'],
+                item['description'],
+                item['tasks'],
+                item['tasks_number'],
+                item['activities'],
+                item['activities_number']
+            )
+        )
+
+        self.con.commit()
         return item
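A note on the ModificatedCrawlerPipeline added above: the MySQL credentials are hard-coded in __init__. A minimal sketch of an alternative, not part of this patch, reads them from the project settings through Scrapy's from_crawler hook; the MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD and MYSQL_DATABASE setting names used below are hypothetical and would have to be defined in settings.py.

# Sketch only (not part of the patch): the same MySQL connection, but configured
# through Scrapy settings instead of hard-coded credentials. The MYSQL_* setting
# names are hypothetical and would need to be added to crawler/settings.py.
import mysql.connector


class SettingsConfiguredPipeline:
    def __init__(self, host, user, password, database):
        self.con = mysql.connector.connect(
            host=host, user=user, password=password, database=database
        )
        self.cur = self.con.cursor()

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this hook while building the pipeline and passes the
        # running crawler, whose settings object holds the project settings.
        return cls(
            host=crawler.settings.get('MYSQL_HOST', 'localhost'),
            user=crawler.settings.get('MYSQL_USER', 'root'),
            password=crawler.settings.get('MYSQL_PASSWORD', ''),
            database=crawler.settings.get('MYSQL_DATABASE', 'onet_code_db'),
        )

    def process_item(self, item, spider):
        return item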
diff --git a/crawler/crawler/settings.py b/crawler/crawler/settings.py
index 4d0a402..8ef865d 100644
--- a/crawler/crawler/settings.py
+++ b/crawler/crawler/settings.py
@@ -12,77 +12,77 @@
 SPIDER_MODULES = ['crawler.spiders']
 NEWSPIDER_MODULE = 'crawler.spiders'
-
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'crawler (+http://www.yourdomain.com)'
+# USER_AGENT = 'crawler (+http://www.yourdomain.com)'
 
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
-#CONCURRENT_REQUESTS = 32
+# CONCURRENT_REQUESTS = 32
 
 # Configure a delay for requests for the same website (default: 0)
 # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+# DOWNLOAD_DELAY = 3
 # The download delay setting will honor only one of:
-#CONCURRENT_REQUESTS_PER_DOMAIN = 16
-#CONCURRENT_REQUESTS_PER_IP = 16
+# CONCURRENT_REQUESTS_PER_DOMAIN = 16
+# CONCURRENT_REQUESTS_PER_IP = 16
 
 # Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+# COOKIES_ENABLED = False
 
 # Disable Telnet Console (enabled by default)
-#TELNETCONSOLE_ENABLED = False
+# TELNETCONSOLE_ENABLED = False
 
 # Override the default request headers:
-#DEFAULT_REQUEST_HEADERS = {
+# DEFAULT_REQUEST_HEADERS = {
 #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 #   'Accept-Language': 'en',
-#}
+# }
 
 # Enable or disable spider middlewares
 # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
-#SPIDER_MIDDLEWARES = {
+# SPIDER_MIDDLEWARES = {
 #    'crawler.middlewares.CrawlerSpiderMiddleware': 543,
-#}
+# }
 
 # Enable or disable downloader middlewares
 # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
-#DOWNLOADER_MIDDLEWARES = {
+# DOWNLOADER_MIDDLEWARES = {
 #    'crawler.middlewares.CrawlerDownloaderMiddleware': 543,
-#}
+# }
 
 # Enable or disable extensions
 # See https://docs.scrapy.org/en/latest/topics/extensions.html
-#EXTENSIONS = {
+# EXTENSIONS = {
 #    'scrapy.extensions.telnet.TelnetConsole': None,
-#}
+# }
 
 # Configure item pipelines
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'crawler.pipelines.CrawlerPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    # 'crawler.pipelines.CrawlerPipeline': 300,
+    'crawler.pipelines.ModificatedCrawlerPipeline': 300,
+}
 
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
-#AUTOTHROTTLE_ENABLED = True
+# AUTOTHROTTLE_ENABLED = True
 # The initial download delay
-#AUTOTHROTTLE_START_DELAY = 5
+# AUTOTHROTTLE_START_DELAY = 5
 # The maximum download delay to be set in case of high latencies
-#AUTOTHROTTLE_MAX_DELAY = 60
+# AUTOTHROTTLE_MAX_DELAY = 60
 # The average number of requests Scrapy should be sending in parallel to
 # each remote server
-#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
 # Enable showing throttling stats for every response received:
-#AUTOTHROTTLE_DEBUG = False
+# AUTOTHROTTLE_DEBUG = False
 
 # Enable and configure HTTP caching (disabled by default)
 # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
-#HTTPCACHE_ENABLED = True
-#HTTPCACHE_EXPIRATION_SECS = 0
-#HTTPCACHE_DIR = 'httpcache'
-#HTTPCACHE_IGNORE_HTTP_CODES = []
-#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+# HTTPCACHE_ENABLED = True
+# HTTPCACHE_EXPIRATION_SECS = 0
+# HTTPCACHE_DIR = 'httpcache'
+# HTTPCACHE_IGNORE_HTTP_CODES = []
+# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
diff --git a/crawler/crawler/spiders/modificated_scrap.py b/crawler/crawler/spiders/modificated_scrap.py
new file mode 100644
index 0000000..e819d3c
--- /dev/null
+++ b/crawler/crawler/spiders/modificated_scrap.py
@@ -0,0 +1,86 @@
+import scrapy
+import scrapy.exceptions
+
+from ..pipelines import ModificatedCrawlerPipeline
+from ..items import ModificatedCrawlerItem
+
+
+def str_to_void(s):
+    if s is None:
+        return ''
+    return str(s)
+
+
+def add_url_part(s):
+    url_part = 'https://www.onetcodeconnector.org'
+
+    if str_to_void(s) != '':
+        return url_part + str_to_void(s)
+    return ''
+
+
+def number_from_string(s):
+    number = ''
+
+    for symbol in s:
+        if symbol.isdigit():
+            number += symbol
+
+    return number
+
+
+def search_term_from_user():
+    return input("Enter what you want to find: ")
+
+
+def check_for_table(response):
+    if response.xpath('//table').extract_first() is None:
+        ModificatedCrawlerPipeline().drop_table_if_empty()
+        raise scrapy.exceptions.CloseSpider('No table exists for this query!!!')
+    else:
+        pass
+
+
+class OnetCodeScraperModificated(scrapy.Spider):
+    name = 'modificated_scrap'
+
+    start_urls = [
+        'https://www.onetcodeconnector.org'
+        f"/find/result?s={search_term_from_user()}"
+        "&a=1",
+    ]
+
+    def parse(self, response, **kwargs):
+        check_for_table(response)
+
+        items = ModificatedCrawlerItem()
+
+        rows = response.xpath('//tr[position()>1]')
+
+        for row in rows:
+            score = str_to_void(row.xpath('.//td[1]/text()').extract_first())
+            occupation = row.xpath('.//td[2]/a/text()').extract_first()
+            soc_code = row.xpath('.//td[3]/text()').extract_first()
+
+            title = add_url_part(str_to_void(row.xpath('.//td[4]//a/@href').extract_first()))
+            lay_titles = add_url_part(str_to_void(row.xpath('.//td[5]//a/@href').extract_first()))
+            description = add_url_part(str_to_void(row.xpath('.//td[6]//a/@href').extract_first()))
+
+            tasks = add_url_part(str_to_void(row.xpath('.//td[7]//a/@href').extract_first()))
+            tasks_number = number_from_string(row.xpath('.//td[7]/text()').extract_first())
+
+            activities = add_url_part(str_to_void(row.xpath('.//td[8]//a/@href').extract_first()))
+            activities_number = number_from_string(row.xpath('.//td[8]/text()').extract_first())
+
+            items['score'] = score
+            items['occupation'] = occupation
+            items['code'] = soc_code
+            items['title'] = title
+            items['lay_titles'] = lay_titles
+            items['description'] = description
+
+            items['tasks'] = tasks
+            items['tasks_number'] = tasks_number
+
+            items['activities'] = activities
+            items['activities_number'] = activities_number
+
+            yield items
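For reference, the new spider is started like any other Scrapy spider, e.g. with "scrapy crawl modificated_scrap" from the project root. The snippet below is a minimal programmatic sketch of the same run; it assumes the patch has been applied, that the script runs next to scrapy.cfg, and that the MySQL database expected by ModificatedCrawlerPipeline is reachable. Because search_term_from_user() is evaluated while start_urls is built, the search prompt appears as soon as the spider module is loaded, before any request is scheduled.

# Minimal sketch: run the new spider with the project settings applied, so the
# ModificatedCrawlerPipeline enabled in settings.py is active during the crawl.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    # 'modificated_scrap' is the name attribute of OnetCodeScraperModificated;
    # importing the spider module triggers the input() prompt for the search term.
    process.crawl('modificated_scrap')
    process.start()  # blocks until the crawl (and the pipeline inserts) finish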