From 2c0a6aab2c08244ab249a152111c34216ff57174 Mon Sep 17 00:00:00 2001
From: Perdana Hadi
Date: Fri, 23 Aug 2024 10:42:22 +0700
Subject: [PATCH] refactor pipeline

---
 .github/workflows/scrape.yml     | 56 +++++++------
 QUICKSTART.md                    |  2 +-
 README.md                        |  2 +-
 pipeline/adjust_column_widths.py | 82 +++++++++++++++++++
 scrape.sh => pipeline/scrape.sh  |  0
 .../upload_to_sheets.py          |  0
 6 files changed, 113 insertions(+), 29 deletions(-)
 create mode 100644 pipeline/adjust_column_widths.py
 rename scrape.sh => pipeline/scrape.sh (100%)
 rename upload_to_sheets.py => pipeline/upload_to_sheets.py (100%)

diff --git a/.github/workflows/scrape.yml b/.github/workflows/scrape.yml
index ddf6e7e..69a7eb5 100644
--- a/.github/workflows/scrape.yml
+++ b/.github/workflows/scrape.yml
@@ -1,15 +1,16 @@
-name: Scrape and Upload to Google Sheets
+name: Adjust Column Widths in Google Sheets
 
 on:
   push:
     branches: [master]
   pull_request:
     branches: [master]
-  schedule:
-    - cron: "0 0 * * *"
+  # Commenting out the schedule for testing purposes
+  # schedule:
+  #   - cron: "0 0 * * *"
 
 jobs:
-  scrape-and-upload:
+  adjust-column-widths:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
@@ -25,31 +26,32 @@ jobs:
           python -m pip install --upgrade pip
           pip install -r requirements.txt pandas google-auth google-auth-oauthlib google-auth-httplib2 google-api-python-client
 
-      - name: Install Playwright browsers
-        run: playwright install --with-deps chromium firefox webkit
-
-      - name: Run scraping
-        env:
-          PYTHONUNBUFFERED: 1
-        run: |
-          chmod +x ./scrape.sh
-          ./scrape.sh
-
-      - name: Upload to Google Sheets
+      # Commenting out non-relevant steps
+      # - name: Install Playwright browsers
+      #   run: playwright install --with-deps chromium firefox webkit
+
+      # - name: Run scraping
+      #   env:
+      #     PYTHONUNBUFFERED: 1
+      #   run: |
+      #     chmod +x ./pipeline/scrape.sh
+      #     ./pipeline/scrape.sh
+
+      # - name: Upload to Google Sheets
+      #   env:
+      #     GCP_JSON: ${{ secrets.GCP_JSON }}
+      #     GOOGLE_SHEETS_ID: ${{ secrets.GOOGLE_SHEETS_ID }}
+      #     PYTHONUNBUFFERED: 1
+      #   run: python pipeline/upload_to_sheets.py
+
+      - name: Adjust Column Widths
         env:
           GCP_JSON: ${{ secrets.GCP_JSON }}
           GOOGLE_SHEETS_ID: ${{ secrets.GOOGLE_SHEETS_ID }}
           PYTHONUNBUFFERED: 1
-        run: python upload_to_sheets.py
-
-      - name: Archive production artifacts
-        uses: actions/upload-artifact@v3
-        with:
-          name: csv-files
-          path: public/*.csv
-          retention-days: 5
+        run: python pipeline/adjust_column_widths.py
 
-      - name: Cleanup
-        if: always()
-        run: |
-          rm -rf output public
+      # - name: Cleanup
+      #   if: always()
+      #   run: |
+      #     rm -rf output
diff --git a/QUICKSTART.md b/QUICKSTART.md
index 8650e9c..d8ed6f6 100644
--- a/QUICKSTART.md
+++ b/QUICKSTART.md
@@ -5,7 +5,7 @@ This guide provides instructions for setting up and running the id-jobs project
 ## Prerequisites
 
 - Git
-- Python 3.15+
+- Python 3.12+
 
 ## Setup
 
diff --git a/README.md b/README.md
index 4247937..2cc076e 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 [![Scrape and Upload to Google Sheets](https://github.com/ceroberoz/id-jobs/actions/workflows/scrape.yml/badge.svg)](https://github.com/ceroberoz/id-jobs/actions/workflows/scrape.yml)
 [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0)
-[![Python 3.15+](https://img.shields.io/badge/python-3.15+-blue.svg)](https://www.python.org/downloads/)
+[![Python 3.12+](https://img.shields.io/badge/python-3.12+-blue.svg)](https://www.python.org/downloads/)
 ![Made with Scrapy](https://img.shields.io/badge/Made%20with-Scrapy-green.svg)
 ![Made with Playwright](https://img.shields.io/badge/Made%20with-Playwright-orange.svg)
 
diff --git a/pipeline/adjust_column_widths.py b/pipeline/adjust_column_widths.py
new file mode 100644
index 0000000..097e5f5
--- /dev/null
+++ b/pipeline/adjust_column_widths.py
@@ -0,0 +1,82 @@
+"""Set fixed pixel widths for the columns of a Google Sheets tab."""
+
+import os
+import json
+from google.oauth2 import service_account
+from googleapiclient.discovery import build
+
+def get_env_var(var_name):
+    """Return the value of a required environment variable.
+
+    Raises:
+        ValueError: If the variable is unset or set to an empty string.
+    """
+    value = os.environ.get(var_name)
+    if not value:
+        raise ValueError(f"{var_name} environment variable is not set or is empty")
+    return value
+
+def setup_credentials():
+    """Build service-account credentials from the JSON key in GCP_JSON.
+
+    Raises:
+        ValueError: If GCP_JSON is unset, empty, or not valid JSON.
+    """
+    gcp_json = get_env_var('GCP_JSON')
+    creds_dict = json.loads(gcp_json)
+    return service_account.Credentials.from_service_account_info(
+        creds_dict, scopes=["https://www.googleapis.com/auth/spreadsheets"])
+
+def adjust_column_widths(spreadsheet_id, sheet_id=0):
+    """Set a fixed pixel width on each leading column of one sheet tab.
+
+    Args:
+        spreadsheet_id: ID of the spreadsheet to modify.
+        sheet_id: Numeric ID of the tab to resize. Defaults to 0, the value
+            that was previously hard-coded.
+    """
+    creds = setup_credentials()
+    service = build("sheets", "v4", credentials=creds)
+
+    # Entry order matters: the Nth entry sizes the Nth column (0-based).
+    column_widths = {
+        'job_title': 684,
+        'job_location': 255,
+        'job_department': 548,
+        'job_url': 662,
+        'first_seen': 130,
+        'base_salary': 304,
+        'job_type': 112,
+        'job_level': 64,
+        'job_apply_end_date': 225,
+        'last_seen': 170,
+        'is_active': 63,
+        'company': 467,
+        'company_url': 646,
+        'job_board': 72,
+        'job_board_url': 213
+    }
+
+    update_requests = [
+        {
+            "updateDimensionProperties": {
+                "range": {
+                    "sheetId": sheet_id,
+                    "dimension": "COLUMNS",
+                    "startIndex": index,
+                    "endIndex": index + 1,  # end index is exclusive
+                },
+                "properties": {"pixelSize": width},
+                "fields": "pixelSize",
+            }
+        }
+        for index, width in enumerate(column_widths.values())
+    ]
+
+    body = {"requests": update_requests}
+    service.spreadsheets().batchUpdate(spreadsheetId=spreadsheet_id, body=body).execute()
+    print("Column widths adjusted successfully.")
+
+if __name__ == "__main__":
+    spreadsheet_id = get_env_var('GOOGLE_SHEETS_ID')
+    adjust_column_widths(spreadsheet_id)
diff --git a/scrape.sh b/pipeline/scrape.sh
similarity index 100%
rename from scrape.sh
rename to pipeline/scrape.sh
diff --git a/upload_to_sheets.py b/pipeline/upload_to_sheets.py
similarity index 100%
rename from upload_to_sheets.py
rename to pipeline/upload_to_sheets.py