Skip to content

Commit

Permalink
update upload to gsheets validations
Browse files Browse the repository at this point in the history
  • Loading branch information
ceroberoz committed Aug 22, 2024
1 parent 7d87659 commit f1f8618
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 24 deletions.
14 changes: 9 additions & 5 deletions .github/workflows/scrape.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
uses: actions/setup-python@v4
with:
python-version: "3.12"
cache: "pip" # This caches pip dependencies
cache: "pip"

- name: Install dependencies
run: |
Expand All @@ -28,15 +28,19 @@ jobs:
- name: Install Playwright browsers
run: playwright install --with-deps chromium firefox webkit

- name: Run scraping and upload
- name: Run scraping
env:
GCP_JSON: ${{ secrets.GCP_JSON }}
GOOGLE_SHEETS_ID: ${{ secrets.GOOGLE_SHEETS_ID }}
PYTHONUNBUFFERED: 1
run: |
chmod +x ./scrape.sh
./scrape.sh
python upload_to_sheets.py
- name: Upload to Google Sheets
env:
GCP_JSON: ${{ secrets.GCP_JSON }}
GOOGLE_SHEETS_ID: ${{ secrets.GOOGLE_SHEETS_ID }}
PYTHONUNBUFFERED: 1
run: python upload_to_sheets.py

- name: Archive production artifacts
uses: actions/upload-artifact@v3
Expand Down
Binary file removed public/.DS_Store
Binary file not shown.
97 changes: 78 additions & 19 deletions upload_to_sheets.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,24 @@
import os
import csv
import json
import sys
import time
import random
from dataclasses import dataclass
from contextlib import contextmanager

import pandas as pd
from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import time

SCOPES = ["https://www.googleapis.com/auth/spreadsheets"]
MAX_RETRIES = 3
RETRY_DELAY = 5
@dataclass
class Config:
    """Static settings for the Google Sheets upload.

    All fields have defaults, so ``Config()`` yields the standard setup.
    """

    # OAuth scopes requested for the service-account credentials.
    scopes: tuple[str, ...] = ("https://www.googleapis.com/auth/spreadsheets",)
    # Upper bound on upload attempts before giving up.
    max_retries: int = 3
    # A1-notation range that is cleared and then overwritten.
    sheet_range: str = 'Sheet1'
    # Numeric sheetId used by the header-formatting requests.
    sheet_id: int = 0  # Assumes first sheet in the spreadsheet


# Module-level settings instance shared by the functions in this file.
config = Config()

def get_env_var(var_name):
value = os.environ.get(var_name)
Expand All @@ -23,15 +32,15 @@ def setup_credentials():
try:
creds_dict = json.loads(gcp_json)
return service_account.Credentials.from_service_account_info(
creds_dict, scopes=SCOPES)
creds_dict, scopes=config.scopes)
except json.JSONDecodeError:
print("Error: Invalid JSON in GCP_JSON environment variable")
sys.exit(1)

def read_csv(file_path):
try:
with open(file_path, 'r') as file:
return list(csv.reader(file))
df = pd.read_csv(file_path)
return [df.columns.tolist()] + df.values.tolist()
except FileNotFoundError:
print(f"Error: CSV file not found at {file_path}")
sys.exit(1)
Expand All @@ -43,52 +52,102 @@ def validate_data(data):
# Add more validation as needed
return True

@contextmanager
def get_sheets_service(creds):
    """Yield a Sheets v4 API client and close it when the block exits.

    Args:
        creds: Credentials object passed to ``build`` as ``credentials``.

    Yields:
        The service object returned by ``build("sheets", "v4", ...)``.
    """
    sheets_client = build("sheets", "v4", credentials=creds)
    try:
        yield sheets_client
    finally:
        # Runs on both normal exit and exceptions raised inside the
        # ``with`` body, so the client is always closed.
        sheets_client.close()

def _resolve_sheet_id(spreadsheet, sheet_range, default):
    """Return the sheetId of the tab named in *sheet_range*, else *default*.

    Args:
        spreadsheet: Metadata dict returned by ``spreadsheets().get``.
        sheet_range: A1-notation range such as ``Sheet1`` or ``'My Tab'!A1:C``.
        default: Value returned when no tab title matches.
    """
    title = sheet_range.rpartition('!')[0] if '!' in sheet_range else sheet_range
    # A1 notation wraps some titles in single quotes and doubles any
    # embedded quote; undo that before comparing with the metadata titles.
    if len(title) >= 2 and title[0] == title[-1] == "'":
        title = title[1:-1].replace("''", "'")

    for sheet in spreadsheet.get('sheets', []):
        props = sheet.get('properties', {})
        if props.get('title') == title and 'sheetId' in props:
            return props['sheetId']
    return default


def _header_format_requests(sheet_id):
    """Build batchUpdate requests that bold and freeze row 1 of *sheet_id*."""
    return [
        {
            "repeatCell": {
                "range": {
                    "sheetId": sheet_id,
                    "startRowIndex": 0,
                    "endRowIndex": 1
                },
                "cell": {
                    "userEnteredFormat": {
                        "textFormat": {
                            "bold": True
                        }
                    }
                },
                "fields": "userEnteredFormat.textFormat.bold"
            }
        },
        {
            "updateSheetProperties": {
                "properties": {
                    "sheetId": sheet_id,
                    "gridProperties": {
                        "frozenRowCount": 1
                    }
                },
                "fields": "gridProperties.frozenRowCount"
            }
        }
    ]


def upload_to_sheets(service, spreadsheet_id, data):
    """Replace the contents of ``config.sheet_range`` with *data*.

    Each attempt fetches the spreadsheet metadata, clears the range, writes
    *data* with ``valueInputOption='RAW'``, then bolds and freezes the first
    row. On an ``HttpError`` other than 403/404 the whole sequence is retried
    with exponential backoff plus jitter, up to ``config.max_retries`` attempts.

    Args:
        service: Sheets API service object (as produced by ``build``).
        spreadsheet_id: ID of the target spreadsheet.
        data: List of rows; sent as the ``values`` payload.

    Raises:
        SystemExit: On a 403/404 response, or once all attempts have failed.
    """
    body = {'values': data}

    for attempt in range(config.max_retries):
        try:
            spreadsheet = service.spreadsheets().get(spreadsheetId=spreadsheet_id).execute()
            print(f"Successfully accessed spreadsheet: {spreadsheet['properties']['title']}")

            # Clear the sheet
            service.spreadsheets().values().clear(
                spreadsheetId=spreadsheet_id,
                range=config.sheet_range
            ).execute()

            # Update values
            result = service.spreadsheets().values().update(
                spreadsheetId=spreadsheet_id,
                range=config.sheet_range,
                valueInputOption='RAW',
                body=body
            ).execute()
            print(f"{result.get('updatedCells')} cells updated.")

            # Format the tab that was actually written: look its sheetId up
            # in the metadata fetched above, falling back to the configured
            # id when the title cannot be matched.
            sheet_id = _resolve_sheet_id(
                spreadsheet, config.sheet_range, config.sheet_id)

            # Execute the formatting requests
            service.spreadsheets().batchUpdate(
                spreadsheetId=spreadsheet_id,
                body={"requests": _header_format_requests(sheet_id)}
            ).execute()

            print("Header row formatted as bold and frozen.")
            return
        except HttpError as err:
            if err.resp.status in (403, 404):
                # Permission / not-found errors will not resolve on retry.
                print(f"Error {err.resp.status}: {err}")
                print("Check spreadsheet ID and service account permissions.")
                sys.exit(1)
            elif attempt < config.max_retries - 1:
                # Exponential backoff (1s, 2s, 4s, ...) plus up to 1s jitter.
                wait_time = (2 ** attempt) + random.uniform(0, 1)
                print(f"Attempt {attempt + 1} failed. Retrying in {wait_time:.2f} seconds...")
                time.sleep(wait_time)
            else:
                print(f"Failed after {config.max_retries} attempts: {err}")
                sys.exit(1)

def main():
    """Upload ``output/merged.csv`` to the configured Google Sheet.

    Loads credentials and the ``GOOGLE_SHEETS_ID`` value, reads the CSV, and
    exits with status 1 if ``validate_data`` rejects it. Otherwise the rows
    are uploaded through a service that is closed when the upload finishes.
    """
    creds = setup_credentials()
    spreadsheet_id = get_env_var('GOOGLE_SHEETS_ID')

    print(f"Attempting to access spreadsheet with ID: {spreadsheet_id}")

    csv_content = read_csv('output/merged.csv')
    if not validate_data(csv_content):
        sys.exit(1)

    # Build the API client only after the data has been read and validated,
    # and let the context manager close it.
    with get_sheets_service(creds) as service:
        upload_to_sheets(service, spreadsheet_id, csv_content)


if __name__ == "__main__":
    main()

0 comments on commit f1f8618

Please sign in to comment.