From c5357487bd1c4dda9af59e5697dc6301c70e1135 Mon Sep 17 00:00:00 2001
From: Perdana Hadi
Date: Mon, 23 Sep 2024 11:33:04 +0700
Subject: [PATCH 1/2] add TIA

---
 freya/spiders/techinasia.py | 107 ++++++++++++++++++++++++++++++++++++
 1 file changed, 107 insertions(+)
 create mode 100644 freya/spiders/techinasia.py

diff --git a/freya/spiders/techinasia.py b/freya/spiders/techinasia.py
new file mode 100644
index 0000000..e7b905d
--- /dev/null
+++ b/freya/spiders/techinasia.py
@@ -0,0 +1,107 @@
+import scrapy
+import json
+from datetime import datetime
+import logging
+from typing import Dict, Any, Optional
+import random
+from freya.pipelines import calculate_job_age # Import the function
+from freya.utils import calculate_job_apply_end_date
+
+logger = logging.getLogger(__name__)
+
+class BlibliSpiderJson(scrapy.Spider):
+    name = 'blibli'
+    BASE_URL = 'https://careers.blibli.com'
+    API_URL = f'{BASE_URL}/ext/api/job/list?format=COMPLETE&groupBy=true'
+    JOB_URL_TEMPLATE = f'{BASE_URL}/job-detail/{{}}?job={{}}'
+
+    USER_AGENTS = [
+        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
+        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
+        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0',
+        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:90.0) Gecko/20100101 Firefox/90.0',
+    ]
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.timestamp = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
+
+    def start_requests(self):
+        headers = {
+            'accept': '*/*',
+            'accept-language': 'en-US,en;q=0.9',
+            'cache-control': 'no-cache',
+            'dnt': '1',
+            'pragma': 'no-cache',
+            'priority': 'u=1, i',
+            'referer': 'https://careers.blibli.com/department/all-departments?experience=&employmentType=',
+            'sec-fetch-dest': 'empty',
+            'sec-fetch-mode': 'cors',
+            'sec-fetch-site': 'same-origin',
+            'sec-gpc': '1',
+            'user-agent': random.choice(self.USER_AGENTS)
+        }
+        yield scrapy.Request(self.API_URL, headers=headers, callback=self.parse)
+
+    def parse(self, response):
+        try:
+            data = json.loads(response.text)
+            departments = data['responseObject']
+
+            for department in departments:
+                for job in department['jobs']:
+                    yield self.parse_job(job)
+
+        except json.JSONDecodeError as e:
+            logger.error(f"Error decoding JSON: {e}")
+            logger.debug(f"Response content: {response.text}")
+        except Exception as e:
+            logger.error(f"Unexpected error: {e}")
+
+    def parse_job(self, job: Dict[str, Any]) -> Dict[str, Any]:
+        first_seen = datetime.strptime(self.timestamp, "%d/%m/%Y %H:%M:%S").strftime("%Y-%m-%d %H:%M:%S")
+        last_seen = self.format_unix_time(job.get('createdDate'))
+
+        return {
+            'job_title': self.sanitize_string(job.get('title')),
+            'job_location': self.sanitize_string(job.get('location')),
+            'job_department': self.sanitize_string(job.get('departmentName')),
+            'job_url': self.get_job_url(job),
+            'first_seen': first_seen,
+            'base_salary': 'N/A',
+            'job_type': self.get_employment_type(job),
+            'job_level': self.sanitize_string(job.get('experience')),
+            'job_apply_end_date': calculate_job_apply_end_date(last_seen),
+            'last_seen': last_seen,
+            'is_active': 'True',
+            'company': 'Blibli',
+            'company_url': self.BASE_URL,
+            'job_board': 'Blibli Job Portal',
+            'job_board_url': 'https://careers.blibli.com',
+            'job_age': calculate_job_age(first_seen, last_seen), # Ensure this line is present
+            'work_arrangement': '', # TODO: Check if this is the correct work arrangement
+        }
+
+    def get_job_url(self, job: Dict[str, Any]) -> str:
+        job_title = job.get('title', '').lower().replace(' ', '-')
+        job_id = job.get('id', '')
+        return self.JOB_URL_TEMPLATE.format(job_title, job_id)
+
+    @staticmethod
+    def get_employment_type(job: Dict[str, Any]) -> str:
+        employment_type = job.get('employmentType', '')
+        return employment_type.replace("Ph-", "").replace("-", " ").capitalize() if employment_type else 'N/A'
+
+    @staticmethod
+    def sanitize_string(s: Optional[str]) -> str:
+        return s.strip() if s else 'N/A'
+
+    @staticmethod
+    def format_unix_time(unix_time: Optional[int]) -> str:
+        if unix_time is None:
+            return 'N/A'
+        try:
+            return datetime.fromtimestamp(unix_time / 1000).strftime('%Y-%m-%d %H:%M:%S')
+        except (ValueError, TypeError):
+            return 'N/A'
\ No newline at end of file
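PATCH 1/2 lands the file as scaffolding: freya/spiders/techinasia.py is created, but it still carries the project's Blibli spider (BlibliSpiderJson, name 'blibli'); the actual Tech in Asia rewrite follows in PATCH 2/2 below. The endpoint this scaffold scrapes can be sanity-checked outside Scrapy. A minimal sketch, using requests instead of Scrapy purely for illustration (the URL and the responseObject -> departments -> jobs shape come verbatim from parse() above):

    import requests

    # The same public careers endpoint the spider requests (no authentication).
    URL = 'https://careers.blibli.com/ext/api/job/list?format=COMPLETE&groupBy=true'

    data = requests.get(URL, headers={'accept': '*/*'}, timeout=30).json()

    # parse() walks responseObject -> departments -> jobs, as in the patch.
    for department in data['responseObject']:
        for job in department['jobs']:
            print(job.get('title'), '|', job.get('location'))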
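Two implementation details of the rewrite deserve a note. First, the spider no longer scrapes HTML: it POSTs to Tech in Asia's Algolia backend at /1/indexes/*/queries, sending a job_postings query filtered to Indonesia alongside a facet-count query, then re-sends the payload with the page number bumped until nbPages is exhausted. A standalone sketch of that request-and-pagination loop (requests stands in for Scrapy purely for illustration; the endpoint, credentials, index name, and facet filter are copied from the spider, and the search-only API key ships publicly in the site's frontend):

    import json
    from urllib.parse import urlencode

    import requests

    # Query-string credentials exactly as the spider sends them.
    query = urlencode({
        'x-algolia-agent': 'Algolia for vanilla JavaScript 3.30.0;JS Helper 2.26.1',
        'x-algolia-application-id': '219WX3MPV4',
        'x-algolia-api-key': 'b528008a75dc1c4402bfe0d8db8b3f8e',
    })
    URL = f"https://219wx3mpv4-dsn.algolia.net/1/indexes/*/queries?{query}"

    def fetch_page(page: int) -> dict:
        # Mirrors the first request in the spider's payload (facet list
        # trimmed), still filtered to city.work_country_name:Indonesia.
        params = (f"query=&hitsPerPage=20&page={page}"
                  "&facetFilters=%5B%5B%22city.work_country_name%3AIndonesia%22%5D%5D")
        payload = {"requests": [{"indexName": "job_postings", "params": params}]}
        resp = requests.post(URL, data=json.dumps(payload), timeout=30,
                             headers={'content-type': 'application/x-www-form-urlencoded'})
        return resp.json()["results"][0]

    first = fetch_page(0)
    titles = [hit.get("title") for hit in first["hits"]]
    # The spider advances the same way: while page < nbPages - 1, re-send with page + 1.
    for page in range(1, first["nbPages"]):
        titles += [hit.get("title") for hit in fetch_page(page)["hits"]]
    print(len(titles), "Indonesian postings fetched")

The str.replace page bump in parse() works because the only lowercase "page=" substring in the params string belongs to the page parameter itself ("hitsPerPage" differs in case); parsing the query string with urllib.parse and re-encoding it would be the more defensive variant.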
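Second, the reworked sanitize_string is what backs the "CSV-friendly output" bullet in the README diff: commas become " -" and newlines collapse to spaces, so free-text fields cannot split rows or columns in the published sheet. A quick behavioral check (the freya.spiders.techinasia import path is assumed to match this checkout):

    from freya.spiders.techinasia import TechInAsiaSpider

    print(TechInAsiaSpider.sanitize_string(None))                    # N/A
    print(TechInAsiaSpider.sanitize_string(' Jakarta, Indonesia '))  # Jakarta - Indonesia
    print(TechInAsiaSpider.sanitize_string('line one\nline two'))    # line one line two

One behavioral change worth knowing: unlike the Blibli helper it replaces, an empty string now round-trips as '' rather than 'N/A', since only None is special-cased.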