# Workflow file for run #85963 of the "Run scrapers" workflow.
# Note: this repository was archived by its owner on May 6, 2024 and is now
# read-only.

# Scrapes per-canton case data on a schedule, commits any resulting changes,
# then rebuilds the merged CSV files and refreshes the dates in the README.
name: Run scrapers
on:
  schedule:
    # Runs every 20 minutes, around the clock: the expression has no hour
    # restriction. GitHub evaluates schedule expressions in UTC.
    # NOTE(review): an earlier comment described a 05:00-21:00 UTC window;
    # limiting runs to such a window would need an hour range in the second
    # cron field - confirm which behavior is intended.
    - cron: '*/20 * * * *'
  # Allows the workflow to be started manually; no inputs are declared.
  workflow_dispatch: null
jobs:
  # One job per canton (matrix); each scrapes, commits and validates its data.
  run_scraper:
    runs-on: ubuntu-20.04
    continue-on-error: false
    timeout-minutes: 10
    strategy:
      # A failing canton must not cancel the other cantons' jobs.
      fail-fast: false
      matrix:
        canton:
          # - AG
          # - AI  # currently no data available
          - AR
          - BE
          - BL
          - BS
          # - FR
          - GE
          - GL
          # - GR
          - JU
          - LU
          - NE
          - NW
          # - OW
          - SG
          - SH
          # - SO
          # - SZ
          - TG
          # - TI
          # - UR  # no more data available
          - VD
          - VS
          - ZG
          - ZH
          - FL
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python 3.7
        uses: actions/setup-python@v4
        with:
          # Quoted so the version is passed as a string, not parsed as a float.
          python-version: '3.7'
      - run: npm ci
      - name: Remove broken apt repos
        run: |
          for apt_file in $(grep -lr microsoft /etc/apt/sources.list.d/); do
            sudo rm "$apt_file"
          done
      - name: Install dependencies
        env:
          SCRAPER_KEY: ${{ matrix.canton }}
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          sudo apt update || true # do not fail if update does not work
          # -y: never wait for a confirmation prompt in the non-interactive runner
          sudo apt-get install -y sqlite3
          sudo apt-get install -y poppler-utils
          # Only the GE scraper gets a browser installed.
          if [ "$SCRAPER_KEY" = "GE" ] ; then
            sudo apt-get install -y chromium-browser
          fi
      - name: Scrape new data
        env:
          SCRAPER_KEY: ${{ matrix.canton }}
          # NOTE(review): `overwrite` is not defined in the matrix above, so
          # this presumably expands to an empty string - confirm that
          # run_scraper.sh treats an empty SCRAPER_OVERWRITE as "off".
          SCRAPER_OVERWRITE: ${{ matrix.overwrite }}
        run: |
          ./scrapers/run_scraper.sh
      - name: Check if there are changes in the repo
        id: changes
        # Sets output `changed` to 1 when `git diff` reports differences
        # (ignoring whitespace), otherwise to 0.
        run: |
          if git diff -w --no-ext-diff --quiet
          then
            echo "changed=0" >> "$GITHUB_OUTPUT"
          else
            echo "changed=1" >> "$GITHUB_OUTPUT"
          fi
      - name: Set commit message
        env:
          SCRAPER_KEY: ${{ matrix.canton }}
        # FL uses a file name without the "Kanton_" part.
        run: |
          if [ "$SCRAPER_KEY" = "FL" ] ; then
            echo "commit_msg=Update COVID19_Fallzahlen_${SCRAPER_KEY}_total.csv from scraper" >> "$GITHUB_ENV"
          else
            echo "commit_msg=Update COVID19_Fallzahlen_Kanton_${SCRAPER_KEY}_total.csv from scraper" >> "$GITHUB_ENV"
          fi
      # Sleep here to prevent two workers from committing at the same time.
      - name: Sleep randomly
        if: steps.changes.outputs.changed == 1 # only sleep, if we will try to commit
        # Random delay of 1-30 seconds.
        run: sleep $(( ($RANDOM % 30) + 1 ))s
      - name: Commit and push to repo
        if: steps.changes.outputs.changed == 1 # only try to commit if there are actually changes
        uses: github-actions-x/commit@v2.9
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          push-branch: master
          name: GitHub Action Scraper
          email: scraper@open.zh.ch
          commit-message: ${{ env.commit_msg }}
          rebase: 'true'
      - name: Validate scraper output
        # A validation failure must not fail the job; it is reported to Slack
        # by the "Notify slack validation error" step instead.
        continue-on-error: true
        env:
          SCRAPER_KEY: ${{ matrix.canton }}
        # Record "failed" first; it is only overwritten with "success" when
        # the validation script exits without error.
        run: |
          echo "validate_status=failed" >> "$GITHUB_ENV"
          ./scrapers/validate_scraper_output.sh
          echo "validate_status=success" >> "$GITHUB_ENV"
      - name: Get current unix timestamp
        # Always runs so the Slack notifications below have a timestamp.
        if: always()
        id: date
        run: echo "ts=$(date +'%s')" >> "$GITHUB_OUTPUT"
      - name: Notify slack validation error
        if: ${{ env.validate_status == 'failed' }}
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
        uses: pullreminders/slack-action@master
        with:
          args: '{\"channel\":\"C013C0UUQ4S\", \"attachments\": [{\"fallback\": \"Job failed.\", \"color\": \"warning\", \"title\": \"CSV validation for ${{ matrix.canton }} failed\", \"title_link\": \"https://github.com/openZH/covid_19/actions/runs/${{ github.run_id }}?check_suite_focus=true\", \"text\": \":x: CSV validation after scraping failed\", \"footer\": \"<https://github.com/openZH/covid_19|openZH/covid_19>\", \"footer_icon\": \"https://github.com/abinoda/slack-action/raw/master/docs/app-icons/github-icon.png\", \"ts\": \"${{steps.date.outputs.ts}}\"}]}'
      - name: Notify slack failure
        if: ${{ failure() || cancelled() }}
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
        uses: pullreminders/slack-action@master
        with:
          args: '{\"channel\":\"C013C0UUQ4S\", \"attachments\": [{\"fallback\": \"Job failed.\", \"color\": \"danger\", \"title\": \"Run Scrapers ${{ matrix.canton }}\", \"title_link\": \"https://github.com/openZH/covid_19/actions/runs/${{ github.run_id }}?check_suite_focus=true\", \"text\": \":x: Scraper failed\", \"footer\": \"<https://github.com/openZH/covid_19|openZH/covid_19>\", \"footer_icon\": \"https://github.com/abinoda/slack-action/raw/master/docs/app-icons/github-icon.png\", \"ts\": \"${{steps.date.outputs.ts}}\"}]}'
  # Rebuilds the merged CSV files once all scraper jobs have finished.
  merge_csvs:
    needs: run_scraper
    # Run regardless of the outcome of the scraper jobs.
    if: always()
    runs-on: ubuntu-20.04
    timeout-minutes: 10
    steps:
      - uses: actions/checkout@v3
      - name: Setup Ruby
        uses: ruby/setup-ruby@v1
        with:
          ruby-version: '2.6'
      # Create the merged v2 file
      - name: Print header line and then merge all files without header line
        run: ruby scripts/merge_canton_csvs.rb > COVID19_Fallzahlen_CH_total_v2.csv
      # Derive the v1 files from the v2 files
      - name: Create v1 files based on new structure
        run: |
          ./scripts/transform_all_new2old.sh
          ./scripts/new2oldcsv.py COVID19_Fallzahlen_CH_total_v2.csv > COVID19_Fallzahlen_CH_total.csv
      - name: Check if there are changes in the repo
        id: changes
        run: |
          if git diff -w --no-ext-diff --quiet
          then
            echo "changed=0" >> "$GITHUB_OUTPUT"
          else
            echo "changed=1" >> "$GITHUB_OUTPUT"
          fi
      # Commit to repo with updated file
      - name: Commit and push to repo
        if: steps.changes.outputs.changed == 1 # only try to commit if there are actually changes
        uses: github-actions-x/commit@v2.9
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          push-branch: master
          name: GitHub Action Scraper
          email: scraper@open.zh.ch
          commit-message: Update COVID19_Fallzahlen_CH_total.csv
          rebase: 'true'
  # Refreshes the per-canton update dates in the README after the merge job.
  update_readme:
    needs: merge_csvs
    # Run regardless of the outcome of the merge job.
    if: always()
    runs-on: ubuntu-20.04
    timeout-minutes: 10
    steps:
      - uses: actions/checkout@v3
      - name: Update README with latest update dates from cantons
        run: ./scripts/update_dates_in_readme.sh
      - name: Check if there are changes in the repo
        id: changes
        run: |
          if git diff -w --no-ext-diff --quiet
          then
            echo "changed=0" >> "$GITHUB_OUTPUT"
          else
            echo "changed=1" >> "$GITHUB_OUTPUT"
          fi
      # Commit to repo with updated file
      - name: Commit and push to repo
        if: steps.changes.outputs.changed == 1 # only try to commit if there are actually changes
        uses: github-actions-x/commit@v2.9
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          push-branch: master
          name: GitHub Action Scraper
          email: scraper@open.zh.ch
          commit-message: Update dates in README
          rebase: 'true'