diff --git a/.github/workflows/scrape.yml b/.github/workflows/scrape.yml index cafd0e9..a53db16 100644 --- a/.github/workflows/scrape.yml +++ b/.github/workflows/scrape.yml @@ -18,7 +18,7 @@ jobs: uses: actions/setup-python@v4 with: python-version: "3.12" - cache: "pip" # This caches pip dependencies + cache: "pip" - name: Install dependencies run: | @@ -28,15 +28,19 @@ jobs: - name: Install Playwright browsers run: playwright install --with-deps chromium firefox webkit - - name: Run scraping and upload + - name: Run scraping env: - GCP_JSON: ${{ secrets.GCP_JSON }} - GOOGLE_SHEETS_ID: ${{ secrets.GOOGLE_SHEETS_ID }} PYTHONUNBUFFERED: 1 run: | chmod +x ./scrape.sh ./scrape.sh - python upload_to_sheets.py + + - name: Upload to Google Sheets + env: + GCP_JSON: ${{ secrets.GCP_JSON }} + GOOGLE_SHEETS_ID: ${{ secrets.GOOGLE_SHEETS_ID }} + PYTHONUNBUFFERED: 1 + run: python upload_to_sheets.py - name: Archive production artifacts uses: actions/upload-artifact@v3 diff --git a/public/.DS_Store b/public/.DS_Store deleted file mode 100644 index 5008ddf..0000000 Binary files a/public/.DS_Store and /dev/null differ diff --git a/upload_to_sheets.py b/upload_to_sheets.py index 7104588..0222775 100644 --- a/upload_to_sheets.py +++ b/upload_to_sheets.py @@ -1,15 +1,24 @@ import os -import csv import json import sys +import time +import random +from dataclasses import dataclass +from contextlib import contextmanager + +import pandas as pd from google.oauth2 import service_account from googleapiclient.discovery import build from googleapiclient.errors import HttpError -import time -SCOPES = ["https://www.googleapis.com/auth/spreadsheets"] -MAX_RETRIES = 3 -RETRY_DELAY = 5 +@dataclass +class Config: + scopes: tuple = ("https://www.googleapis.com/auth/spreadsheets",) + max_retries: int = 3 + sheet_range: str = 'Sheet1' + sheet_id: int = 0 # Assumes first sheet in the spreadsheet + +config = Config() def get_env_var(var_name): value = os.environ.get(var_name) @@ -23,15 +32,15 @@ def setup_credentials(): try: creds_dict = json.loads(gcp_json) return service_account.Credentials.from_service_account_info( - creds_dict, scopes=SCOPES) + creds_dict, scopes=config.scopes) except json.JSONDecodeError: print("Error: Invalid JSON in GCP_JSON environment variable") sys.exit(1) def read_csv(file_path): try: - with open(file_path, 'r') as file: - return list(csv.reader(file)) + df = pd.read_csv(file_path) + return [df.columns.tolist()] + df.values.tolist() except FileNotFoundError: print(f"Error: CSV file not found at {file_path}") sys.exit(1) @@ -43,52 +52,102 @@ def validate_data(data): # Add more validation as needed return True +@contextmanager +def get_sheets_service(creds): + service = build("sheets", "v4", credentials=creds) + try: + yield service + finally: + service.close() + def upload_to_sheets(service, spreadsheet_id, data): - sheet_range = 'Sheet1' body = {'values': data} - for attempt in range(MAX_RETRIES): + for attempt in range(config.max_retries): try: spreadsheet = service.spreadsheets().get(spreadsheetId=spreadsheet_id).execute() print(f"Successfully accessed spreadsheet: {spreadsheet['properties']['title']}") + # Clear the sheet service.spreadsheets().values().clear( spreadsheetId=spreadsheet_id, - range=sheet_range + range=config.sheet_range ).execute() + # Update values result = service.spreadsheets().values().update( spreadsheetId=spreadsheet_id, - range=sheet_range, + range=config.sheet_range, valueInputOption='RAW', body=body ).execute() print(f"{result.get('updatedCells')} cells updated.") + + # Format header row as bold and freeze it + requests = [ + { + "repeatCell": { + "range": { + "sheetId": config.sheet_id, + "startRowIndex": 0, + "endRowIndex": 1 + }, + "cell": { + "userEnteredFormat": { + "textFormat": { + "bold": True + } + } + }, + "fields": "userEnteredFormat.textFormat.bold" + } + }, + { + "updateSheetProperties": { + "properties": { + "sheetId": config.sheet_id, + "gridProperties": { + "frozenRowCount": 1 + } + }, + "fields": "gridProperties.frozenRowCount" + } + } + ] + + # Execute the formatting requests + service.spreadsheets().batchUpdate( + spreadsheetId=spreadsheet_id, + body={"requests": requests} + ).execute() + + print("Header row formatted as bold and frozen.") return except HttpError as err: if err.resp.status in [403, 404]: print(f"Error {err.resp.status}: {err}") print("Check spreadsheet ID and service account permissions.") sys.exit(1) - elif attempt < MAX_RETRIES - 1: - print(f"Attempt {attempt + 1} failed. Retrying in {RETRY_DELAY} seconds...") - time.sleep(RETRY_DELAY) + elif attempt < config.max_retries - 1: + wait_time = (2 ** attempt) + random.uniform(0, 1) + print(f"Attempt {attempt + 1} failed. Retrying in {wait_time:.2f} seconds...") + time.sleep(wait_time) else: - print(f"Failed after {MAX_RETRIES} attempts: {err}") + print(f"Failed after {config.max_retries} attempts: {err}") sys.exit(1) def main(): creds = setup_credentials() - service = build("sheets", "v4", credentials=creds) spreadsheet_id = get_env_var('GOOGLE_SHEETS_ID') print(f"Attempting to access spreadsheet with ID: {spreadsheet_id}") - csv_content = read_csv('public/merged.csv') + csv_content = read_csv('output/merged.csv') if not validate_data(csv_content): sys.exit(1) - upload_to_sheets(service, spreadsheet_id, csv_content) + with get_sheets_service(creds) as service: + upload_to_sheets(service, spreadsheet_id, csv_content) if __name__ == "__main__": main()