# Skip to content
#
# Scheduled ETL: Analyze Drudge #246
#
# Scheduled ETL: Analyze Drudge
#
# Scheduled ETL: Analyze Drudge #246

# Workflow header: display name, triggers, and run-level concurrency policy.
# The value is quoted because it contains ": ", which would otherwise be
# read as a nested mapping.
name: "Scheduled ETL: Analyze Drudge"

# Triggers. `on` is left as a plain key, which is what GitHub's workflow
# loader expects (generic YAML 1.1 linters may flag it as a boolean).
on:
  # Once a day at 10:00; GitHub evaluates cron schedules in UTC.
  schedule:
    - cron: '0 10 * * *'
  # Also allow the workflow to be started manually.
  workflow_dispatch:

# All runs share one concurrency group; starting a new run cancels any run
# of the same group that is still in progress.
concurrency:
  group: analyze-drudge
  cancel-in-progress: true
# Two jobs, run in order: `hyperlinks` first, then `entities`.
# Each job produces one CSV in the workspace root, stores it as a workflow
# artifact, and uploads it to the `news-homepages-extracts` item on
# archive.org.
#
# NOTE(review): actions/upload-artifact v3 has been deprecated by GitHub.
# v4 does not allow two jobs to upload to the same artifact name, so bumping
# the version also requires giving each job its own artifact name instead of
# the shared `extracts` — confirm nothing consumes that name before changing.
jobs:
  # Builds drudge-hyperlinks-analysis.csv.
  hyperlinks:
    name: Process hyperlinks
    runs-on: ubuntu-latest
    # Abort the job if it runs longer than an hour.
    timeout-minutes: 60
    # A failure of this job does not mark the workflow run as failed.
    continue-on-error: true
    steps:
      - id: checkout
        name: Checkout
        uses: actions/checkout@v3

      # Repository-local action; its contents are not visible here.
      # Presumably it sets up Python and the pipenv environment — confirm.
      - id: install
        name: Install
        uses: ./.github/actions/install

      # Writes its output into the current directory (`-o ./`).
      - id: analyze
        name: Analyze Drudge hyperlinks
        run: pipenv run newshomepages-analyze drudge-hyperlinks -o ./
        shell: bash

      # Fails the step when the CSV was not produced, instead of silently
      # uploading nothing.
      - id: save
        name: Save artifact
        uses: actions/upload-artifact@v3
        with:
          name: extracts
          path: ./drudge-hyperlinks-analysis.csv
          if-no-files-found: error

      # Credentials come from the IA_ACCESS_KEY / IA_SECRET_KEY secrets.
      - id: upload-to-internet-archive
        name: Upload files to archive.org
        uses: palewire/internet-archive-upload@v1
        with:
          access-key: ${{ secrets.IA_ACCESS_KEY }}
          secret-key: ${{ secrets.IA_SECRET_KEY }}
          identifier: news-homepages-extracts
          files: drudge-hyperlinks-analysis.csv

  # Builds drudge-entities-analysis.csv.
  entities:
    name: Extract entities
    runs-on: ubuntu-latest
    # Starts only after the `hyperlinks` job has finished.
    # NOTE(review): no data is passed between the jobs here; presumably the
    # ordering avoids concurrent uploads to the same archive.org item —
    # confirm.
    needs: hyperlinks
    # Abort the job if it runs longer than an hour.
    timeout-minutes: 60
    # A failure of this job does not mark the workflow run as failed.
    continue-on-error: true
    steps:
      - id: checkout
        name: Checkout
        uses: actions/checkout@v3

      # Repository-local action; its contents are not visible here.
      # Presumably it sets up Python and the pipenv environment — confirm.
      - id: install
        name: Install
        uses: ./.github/actions/install

      # Downloads the en_core_web_lg spaCy package into the pipenv
      # environment before the analysis step runs.
      - id: install-spacy
        name: Install Spacy language dictionary
        run: pipenv run python -m spacy download en_core_web_lg
        shell: bash

      # Writes its output into the current directory (`-o ./`).
      - id: analyze
        name: Analyze Drudge entities
        run: pipenv run newshomepages-analyze drudge-entities -o ./
        shell: bash

      # Fails the step when the CSV was not produced, instead of silently
      # uploading nothing.
      - id: save
        name: Save artifact
        uses: actions/upload-artifact@v3
        with:
          name: extracts
          path: ./drudge-entities-analysis.csv
          if-no-files-found: error

      # Credentials come from the IA_ACCESS_KEY / IA_SECRET_KEY secrets.
      - id: upload-to-internet-archive
        name: Upload files to archive.org
        uses: palewire/internet-archive-upload@v1
        with:
          access-key: ${{ secrets.IA_ACCESS_KEY }}
          secret-key: ${{ secrets.IA_SECRET_KEY }}
          identifier: news-homepages-extracts
          files: drudge-entities-analysis.csv