15 changes: 14 additions & 1 deletion crawler/crawler/items.py
@@ -4,4 +4,17 @@
class CrawlerItem(scrapy.Item):
    score = scrapy.Field()
    occupation = scrapy.Field()
    code = scrapy.Field()


class ModificatedCrawlerItem(scrapy.Item):
    score = scrapy.Field()
    occupation = scrapy.Field()
    code = scrapy.Field()
    title = scrapy.Field()
    lay_titles = scrapy.Field()
    description = scrapy.Field()
    tasks = scrapy.Field()
    tasks_number = scrapy.Field()
    activities = scrapy.Field()
    activities_number = scrapy.Field()
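
Note for reference: the first three fields repeat CrawlerItem. Since scrapy.Item supports inheritance, the same declaration could also be written as a subclass; this is only an alternative sketch (the class name ModificatedCrawlerItemAlt is hypothetical), not what the PR does.

import scrapy

from crawler.items import CrawlerItem


# Alternative sketch only: the three shared fields are inherited from CrawlerItem
# instead of being repeated.
class ModificatedCrawlerItemAlt(CrawlerItem):
    title = scrapy.Field()
    lay_titles = scrapy.Field()
    description = scrapy.Field()
    tasks = scrapy.Field()
    tasks_number = scrapy.Field()
    activities = scrapy.Field()
    activities_number = scrapy.Field()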
93 changes: 86 additions & 7 deletions crawler/crawler/pipelines.py
@@ -1,13 +1,92 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import mysql
import mysql.connector


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class CrawlerPipeline:
    def process_item(self, item, spider):
        return item


class ModificatedCrawlerPipeline:
    def __init__(self):
        # Connects to a local MySQL server and (re)creates the target table.
        self.con = mysql.connector.connect(
            host='localhost',
            user='root',
            passwd='1111',
            database='onet_code_db'
        )
        self.cur = self.con.cursor()
        self.create_table()

    def create_table(self):
        # Recreate the table from scratch on every run.
        self.cur.execute("""DROP TABLE IF EXISTS stored_data_mod""")

        self.cur.execute(
            """
            CREATE TABLE stored_data_mod(
                id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
                score VARCHAR(10),
                occupation VARCHAR(100),
                code VARCHAR(25),
                title VARCHAR(200),
                lay_titles VARCHAR(200),
                description VARCHAR(200),
                tasks VARCHAR(200),
                tasks_number VARCHAR(3),
                activities VARCHAR(200),
                activities_number VARCHAR(3)
            )
            """
        )

        # Stored procedure that drops the table again if it holds no rows.
        self.cur.execute("""DROP PROCEDURE IF EXISTS remove_empty_table""")

        self.cur.execute(
            """
            CREATE PROCEDURE remove_empty_table()
            BEGIN
                SET @temp = (SELECT COUNT(*) FROM stored_data_mod);
                IF @temp = 0 THEN
                    DROP TABLE stored_data_mod;
                END IF;
            END
            """
        )

    def drop_table_if_empty(self):
        self.cur.execute("""CALL remove_empty_table()""")

    def process_item(self, item, spider):
        # Insert one row per scraped item and commit immediately.
        self.cur.execute(
            """
            INSERT INTO stored_data_mod(
                score,
                occupation,
                code,
                title,
                lay_titles,
                description,
                tasks,
                tasks_number,
                activities,
                activities_number
            )
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            """,
            (
                item['score'],
                item['occupation'],
                item['code'],
                item['title'],
                item['lay_titles'],
                item['description'],
                item['tasks'],
                item['tasks_number'],
                item['activities'],
                item['activities_number']
            )
        )

        self.con.commit()

        return item
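
A quick way to check what the pipeline wrote after a crawl, assuming the same local test setup the pipeline itself uses (host 'localhost', user 'root', password '1111', database 'onet_code_db'). This is a verification sketch, not part of the project.

import mysql.connector

# Connect with the same credentials the pipeline uses (local test assumption).
con = mysql.connector.connect(
    host='localhost', user='root', passwd='1111', database='onet_code_db'
)
cur = con.cursor()
cur.execute("SELECT occupation, code, tasks_number FROM stored_data_mod LIMIT 5")
for occupation, code, tasks_number in cur.fetchall():
    print(occupation, code, tasks_number)
con.close()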
60 changes: 30 additions & 30 deletions crawler/crawler/settings.py
@@ -12,77 +12,77 @@
SPIDER_MODULES = ['crawler.spiders']
NEWSPIDER_MODULE = 'crawler.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'crawler (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'crawler.middlewares.CrawlerSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     'crawler.middlewares.CrawlerDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'crawler.pipelines.CrawlerPipeline': 300,
    'crawler.pipelines.ModificatedCrawlerPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
86 changes: 86 additions & 0 deletions crawler/crawler/spiders/modificated_scrap.py
@@ -0,0 +1,86 @@
import scrapy
import scrapy.exceptions

from ..pipelines import ModificatedCrawlerPipeline
from ..items import ModificatedCrawlerItem


def str_to_void(s):
    # Return an empty string instead of None so string handling never fails.
    if s is None:
        return ''
    return str(s)


def add_url_part(s):
    # Turn a relative href into an absolute O*NET Code Connector URL.
    url_part = 'https://www.onetcodeconnector.org'

    if str_to_void(s) != '':
        return url_part + str_to_void(s)
    return ''


def number_from_string(s):
    # Keep only the digit characters of a string.
    number = ''

    for symbol in s:
        if symbol.isdigit():
            number += symbol

    return number


def search_term_from_user():
    return input("Enter what you want to find: ")


def check_for_table(response):
    # If the results page has no table, drop the (empty) table and stop the crawl.
    if response.xpath('//table').extract_first() is None:
        ModificatedCrawlerPipeline().drop_table_if_empty()
        raise scrapy.exceptions.CloseSpider('No table exists for this query!!!')


class OnetCodeScraperModificated(scrapy.Spider):
    name = 'modificated_scrap'

    start_urls = [
        'https://www.onetcodeconnector.org' + f"/find/result?s={search_term_from_user()}" + "&a=1",
    ]

    def parse(self, response, **kwargs):
        check_for_table(response)

        rows = response.xpath('//tr[position()>1]')

        for row in rows:
            # A fresh item per row, so previously yielded items are not overwritten.
            items = ModificatedCrawlerItem()

            score = str_to_void(row.xpath('.//td[1]/text()').extract_first())
            occupation = row.xpath('.//td[2]/a/text()').extract_first()
            soc_code = row.xpath('.//td[3]/text()').extract_first()

            title = add_url_part(str_to_void(row.xpath('.//td[4]//a/@href').extract_first()))
            lay_titles = add_url_part(str_to_void(row.xpath('.//td[5]//a/@href').extract_first()))
            description = add_url_part(str_to_void(row.xpath('.//td[6]//a/@href').extract_first()))

            tasks = add_url_part(str_to_void(row.xpath('.//td[7]//a/@href').extract_first()))
            tasks_number = number_from_string(str_to_void(row.xpath('.//td[7]/text()').extract_first()))

            activities = add_url_part(str_to_void(row.xpath('.//td[8]//a/@href').extract_first()))
            activities_number = number_from_string(str_to_void(row.xpath('.//td[8]/text()').extract_first()))

            items['score'] = score
            items['occupation'] = occupation
            items['code'] = soc_code
            items['title'] = title
            items['lay_titles'] = lay_titles
            items['description'] = description

            items['tasks'] = tasks
            items['tasks_number'] = tasks_number

            items['activities'] = activities
            items['activities_number'] = activities_number

            yield items
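
The spider can be run with the usual `scrapy crawl modificated_scrap` command. A minimal script alternative is sketched below; it assumes it is executed from the Scrapy project root so get_project_settings() picks up ITEM_PIPELINES, and note that the input() prompt for the search term appears as soon as the spider module is loaded.

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Runs the spider with the project settings, so ModificatedCrawlerPipeline is active.
process = CrawlerProcess(get_project_settings())
process.crawl('modificated_scrap')
process.start()  # blocks until the crawl finishes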