Skip to content

Commit

Permalink
Added some changes to spiders amach, cristim, rscrds
Browse files Browse the repository at this point in the history
  • Loading branch information
andreireporter13 committed Sep 28, 2024
1 parent 3f672de commit db6b154
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 14 deletions.
8 changes: 2 additions & 6 deletions JobsCrawlerProject/JobsCrawlerProject/spiders/amach_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,11 @@ class AmachSpiderSpider(scrapy.Spider):
allowed_domains = ["amach.software", "boards.eu.greenhouse.io"]
start_urls = ["https://boards.eu.greenhouse.io/embed/job_board/?for=amach"]

def start_requests(self):
    """Yield the initial request(s) that seed the crawl.

    The previous implementation requested only ``self.start_urls[0]``,
    silently dropping any additional URLs added to ``start_urls`` later.
    Iterating over the whole list is backward-compatible (there is
    currently a single URL) and matches Scrapy's default behaviour;
    responses are routed to ``parse`` by default.
    """
    for url in self.start_urls:
        yield scrapy.Request(url)

def parse(self, response):
# data here
for job in response.xpath('//div[@class="opening"]'):
city_tag = job.xpath('.//span[@class="location"]/text()').get()

# check city
if 'romania' in city_tag.lower():
if (location := city_tag.split(',')[0].strip()) and location.lower() == 'bucharest':
Expand All @@ -45,4 +41,4 @@ def parse(self, response):
item['remote'] = 'on-site'
item['logo_company'] = 'https://s101-recruiting.cdn.greenhouse.io/external_greenhouse_job_boards/logos/400/114/210/resized/AMACH-5_Color_Logo_1.png?1675863445'
#
yield item
yield item
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class CristimSpiderSpider(CrawlSpider):

rules = (
Rule(LinkExtractor(allow=('/cariere/',), deny=('/apply',)),
callback='parse_job'),
callback='parse_job'),
)

def parse_job(self, response):
Expand All @@ -30,7 +30,7 @@ def parse_job(self, response):
location = location[0]
else:
location = 'all'

# get location
location_finish = get_county(location=location)

Expand Down
11 changes: 5 additions & 6 deletions JobsCrawlerProject/JobsCrawlerProject/spiders/rscrds_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
#
from urllib.parse import urlencode
import re

from time import sleep


Expand All @@ -24,7 +23,7 @@ def make_headers(town: str, industry: str) -> tuple[dict, dict]:
params: Town: str, for search jobs in each town.
Industry: Exact title job.
return: headers - dict
payload - dict
'''
Expand Down Expand Up @@ -56,9 +55,9 @@ class RscrdsSpider(scrapy.Spider):
def start_requests(self):
    """Entry point of the crawl: fetch the careers landing page.

    The response is handed to ``parse_towns``, which extracts the
    county options used for the per-town job searches.
    """
    start_page = 'https://www.digi.ro/cariere'
    yield scrapy.Request(url=start_page, callback=self.parse_towns)

def parse_towns(self, response):

# get all counties
for city_ro in response.xpath('//select[@id="form-assistance-support-careers-county"]//option'):
if (location := city_ro.xpath('.//text()').extract_first()) and location.lower() == 'judet':
Expand Down Expand Up @@ -94,8 +93,8 @@ def parse_job(self, response):
#
location = response.meta.get('location')
location_finish = get_county(location=location)
# send data to Pipelines

# send data to Pipelines
item = JobItem()
item['job_link'] = f"https://www.digi.ro/{links[idx]}"
item['job_title'] = titles[idx]
Expand Down

0 comments on commit db6b154

Please sign in to comment.