Merge branch 'development'
ceroberoz committed Sep 23, 2024
2 parents 3da4762 + c8249e5 commit 50bedd6
Showing 2 changed files with 146 additions and 8 deletions.
README.md (17 changes: 9 additions & 8 deletions)
@@ -8,19 +8,19 @@

## 🆕 Latest Updates

-- Added Glints spider for comprehensive job data from Glints.com
-- Implemented pagination in Glints spider to capture all available job listings
-- Enhanced error handling and logging for improved debugging
-- Optimized data processing pipeline for Glints job information
-- Updated documentation to reflect new data source and features
+- Added TechInAsia spider to collect job data from the Tech in Asia Jobs portal
+- Implemented Algolia API integration for efficient data retrieval from TechInAsia
+- Enhanced data sanitization to ensure CSV-friendly output
+- Improved error handling and logging for the new spider
+- Updated documentation to reflect the addition of TechInAsia as a data source

## 📊 Overview

id-jobs collects job listings from Indonesian job portals and company websites, respecting each site's terms of service.

**View the Data on Google Sheets:** [https://s.id/id-jobs-v2](https://s.id/id-jobs-v2)

-**View the Dasboard on LookerStudio by Google:** [https://s.id/id-jobs-dashboard](https://s.id/id-jobs-dashboard)
+**View the Dashboard on Looker Studio by Google:** [https://s.id/id-jobs-dashboard](https://s.id/id-jobs-dashboard)

## 🎨 Job Age Colors

@@ -35,7 +35,7 @@ id-jobs collects job listings from Indonesian job portals and company websites,

## 🔧 How It Works

-id-jobs automatically collects job data from various websites, cleans the information, and compiles it into a single spreadsheet. We now use Playwright for sites with complex JavaScript rendering.
+id-jobs automatically collects job data from various websites, cleans the information, and compiles it into a single spreadsheet. We use Scrapy for most sites and Playwright for sites with complex JavaScript rendering.

![Scraping Process](how-scraper-works.gif)
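
For the JavaScript-heavy sites, a Playwright-backed spider follows a pattern like this minimal sketch (illustrative only: the spider name, URL, selectors, and the scrapy-playwright plugin wiring are assumptions, not code from this repo):

```python
import scrapy


class JsSiteSpider(scrapy.Spider):
    """Hypothetical spider for a JavaScript-rendered job board."""
    name = "js_site_example"  # illustrative name, not a spider in this repo

    custom_settings = {
        # scrapy-playwright routes matching requests through a real browser
        "DOWNLOAD_HANDLERS": {
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
    }

    def start_requests(self):
        # meta={"playwright": True} renders the page in a browser first,
        # so listings injected by JavaScript appear in the response body
        yield scrapy.Request("https://example.com/jobs", meta={"playwright": True})

    def parse(self, response):
        for card in response.css(".job-card"):  # hypothetical selector
            yield {"job_title": card.css("h3::text").get()}
```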

Expand All @@ -50,7 +50,7 @@ id-jobs simplifies job searching by gathering information from multiple sources
## 📚 Data Sources

We collect data from various job portals and company websites, including:
-Blibli, Dealls, Evermos, Flip, GoTo, Glints (Lite), Jobstreet, Kalibrr, Karir.com, Kredivo, Mekari, SoftwareOne, Tiket, and more.
+Blibli, Dealls, Evermos, Flip, GoTo, Glints (Lite), Jobstreet, Kalibrr, Karir.com, Kredivo, Mekari, SoftwareOne, Tiket, Tech in Asia Jobs, and more.

## 🚀 Features

@@ -63,6 +63,7 @@ Blibli, Dealls, Evermos, Flip, GoTo, Glints (Lite), Jobstreet, Kalibrr, Karir.co
- Job age tracking
- JavaScript-rendered content handling with Playwright
- Efficient pagination across multiple pages
+- Integration with Algolia API for improved data retrieval

## 🏁 Getting Started

freya/spiders/techinasia.py (137 changes: 137 additions & 0 deletions)
@@ -0,0 +1,137 @@
import scrapy
import json
from datetime import datetime
import logging
from typing import Dict, Any, Optional
from urllib.parse import urlencode  # used to build the Algolia query string
from freya.pipelines import calculate_job_age
from freya.utils import calculate_job_apply_end_date

logger = logging.getLogger(__name__)

class TechInAsiaSpider(scrapy.Spider):
    name = 'techinasia'
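    # Algolia "multiple queries" endpoint that backs the techinasia.com job search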
    BASE_URL = 'https://219wx3mpv4-dsn.algolia.net/1/indexes/*/queries'

    custom_settings = {
        'DOWNLOAD_DELAY': 0.2,
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.timestamp = datetime.now().strftime("%d/%m/%Y %H:%M:%S")

    def start_requests(self):
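        # Browser-like headers mirroring the requests the techinasia.com frontend sends to Algolia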
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:130.0) Gecko/20100101 Firefox/130.0',
            'Accept': 'application/json',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br, zstd',
            'content-type': 'application/x-www-form-urlencoded',
            'Origin': 'https://www.techinasia.com',
            'Connection': 'keep-alive',
            'Referer': 'https://www.techinasia.com/',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'cross-site',
            'DNT': '1',
            'Sec-GPC': '1',
            'Pragma': 'no-cache',
            'Cache-Control': 'no-cache',
        }

        params = {
            'x-algolia-agent': 'Algolia for vanilla JavaScript 3.30.0;JS Helper 2.26.1',
            'x-algolia-application-id': '219WX3MPV4',
            'x-algolia-api-key': 'b528008a75dc1c4402bfe0d8db8b3f8e',
        }

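        # Two Algolia queries per request: the first returns 20 Indonesia-filtered
        # job postings for the current page, the second only fetches facet counts.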
        payload = {
            "requests": [
                {
                    "indexName": "job_postings",
                    "params": "query=&hitsPerPage=20&maxValuesPerFacet=1000&page=0&facets=%5B%22*%22%2C%22city.work_country_name%22%2C%22position.name%22%2C%22industries.vertical_name%22%2C%22experience%22%2C%22job_type.name%22%2C%22is_salary_visible%22%2C%22has_equity%22%2C%22currency.currency_code%22%2C%22salary_min%22%2C%22taxonomies.slug%22%5D&tagFilters=&facetFilters=%5B%5B%22city.work_country_name%3AIndonesia%22%5D%5D"
                },
                {
                    "indexName": "job_postings",
                    "params": "query=&hitsPerPage=1&maxValuesPerFacet=1000&page=0&attributesToRetrieve=%5B%5D&attributesToHighlight=%5B%5D&attributesToSnippet=%5B%5D&tagFilters=&analytics=false&clickAnalytics=false&facets=city.work_country_name"
                }
            ]
        }

        yield scrapy.Request(
            f"{self.BASE_URL}?{urlencode(params)}",
            method='POST',
            headers=headers,
            body=json.dumps(payload),
            callback=self.parse,
            dont_filter=True
        )

    def parse(self, response):
        try:
            data = json.loads(response.text)
            hits = data['results'][0]['hits']
            total_pages = data['results'][0]['nbPages']

            for job in hits:
                yield self.parse_job(job)

            # Handle pagination
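            # Re-issue the same multi-query with the page number bumped until nbPages is reached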
            current_page = data['results'][0]['page']
            if current_page < total_pages - 1:
                payload = json.loads(response.request.body)
                params_str = payload['requests'][0]['params']
                new_params_str = params_str.replace(f"page={current_page}", f"page={current_page + 1}")
                payload['requests'][0]['params'] = new_params_str

                yield scrapy.Request(
                    response.url,
                    method='POST',
                    headers=response.request.headers,
                    body=json.dumps(payload),
                    callback=self.parse,
                    dont_filter=True
                )

        except json.JSONDecodeError as e:
            logger.error(f"Error decoding JSON: {e}")
            logger.debug(f"Response content: {response.text}")
        except Exception as e:
            logger.error(f"Unexpected error: {e}")

    def parse_job(self, job: Dict[str, Any]) -> Dict[str, Any]:
        first_seen = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        last_seen = self.sanitize_string(job.get('published_at', 'N/A'))

        def format_salary(min_salary, max_salary, currency):
            min_sal = self.sanitize_string(str(min_salary)) if min_salary is not None else 'N/A'
            max_sal = self.sanitize_string(str(max_salary)) if max_salary is not None else 'N/A'
            curr = self.sanitize_string(currency) if currency else 'N/A'
            return f"{min_sal} - {max_sal} {curr}".replace(',', '')

        return {
            'job_title': self.sanitize_string(job.get('title')),
            'job_location': f"{self.sanitize_string(job.get('city', {}).get('name'))} - {self.sanitize_string(job.get('city', {}).get('work_country_name'))}",
            'job_department': self.sanitize_string(job.get('position', {}).get('name')),
            'job_url': self.sanitize_string(f"https://www.techinasia.com/jobs/{job.get('id')}"),
            'first_seen': self.sanitize_string(first_seen),
            'base_salary': format_salary(job.get('salary_min'), job.get('salary_max'), job.get('currency', {}).get('currency_code')),
            'job_type': self.sanitize_string(job.get('job_type', {}).get('name')),
            'job_level': f"{self.sanitize_string(str(job.get('experience_min', 'N/A')))} - {self.sanitize_string(str(job.get('experience_max', 'N/A')))} years",
            'job_apply_end_date': self.sanitize_string(calculate_job_apply_end_date(last_seen)),
            'last_seen': last_seen,
            'is_active': 'True',
            'company': self.sanitize_string(job.get('company', {}).get('name')),
            'company_url': self.sanitize_string(f"https://www.techinasia.com/companies/{job.get('company', {}).get('entity_slug')}"),
            'job_board': 'Tech in Asia Jobs',
            'job_board_url': 'https://www.techinasia.com/jobs',
            'job_age': self.sanitize_string(str(calculate_job_age(first_seen, last_seen))),
            'work_arrangement': 'Remote' if job.get('is_remote') else 'On-site',
        }

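    # Commas become " -" and newlines are flattened so every field stays CSV-safe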
    @staticmethod
    def sanitize_string(s: Optional[str]) -> str:
        if s is None:
            return 'N/A'
        return s.strip().replace(',', ' -').replace('\n', ' ').replace('\r', '')
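
For local testing, the spider should be runnable with Scrapy's standard CLI, e.g. `scrapy crawl techinasia -o techinasia.csv` from the project root (assuming the repo's usual Scrapy project setup; the output filename is illustrative).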
