diff --git a/.github/workflows/daily_crawl.yaml b/.github/workflows/daily_crawl.yaml index 6b1bcf8e1..b4a550eda 100644 --- a/.github/workflows/daily_crawl.yaml +++ b/.github/workflows/daily_crawl.yaml @@ -7,7 +7,7 @@ on: workflow_dispatch: jobs: - schedule-jobs: + daily-crawl: runs-on: ubuntu-latest env: SHUB_APIKEY: ${{ secrets.SHUB_APIKEY }} diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index bf52b77fb..4b4164eab 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -6,7 +6,7 @@ on: workflow_dispatch: jobs: - deploy_to_scrapy_cloud: + deploy: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/monthly_crawl.yaml b/.github/workflows/monthly_crawl.yaml index 23b2c2c0f..30dd3dd68 100644 --- a/.github/workflows/monthly_crawl.yaml +++ b/.github/workflows/monthly_crawl.yaml @@ -7,7 +7,7 @@ on: workflow_dispatch: jobs: - schedule-jobs: + monthly-crawl: runs-on: ubuntu-latest env: SHUB_APIKEY: ${{ secrets.SHUB_APIKEY }} diff --git a/.github/workflows/schedule_spider.yaml b/.github/workflows/schedule_spider.yaml index b57cf5477..1c84be682 100644 --- a/.github/workflows/schedule_spider.yaml +++ b/.github/workflows/schedule_spider.yaml @@ -14,7 +14,7 @@ on: required: false jobs: - schedule: + full-crawl: runs-on: ubuntu-latest env: SHUB_APIKEY: ${{ secrets.SHUB_APIKEY }} diff --git a/.github/workflows/schedule_spider_by_date.yaml b/.github/workflows/schedule_spider_by_date.yaml index 22383a6d8..8e590845f 100644 --- a/.github/workflows/schedule_spider_by_date.yaml +++ b/.github/workflows/schedule_spider_by_date.yaml @@ -8,7 +8,7 @@ on: required: true jobs: - schedule: + full-crawl-by-date: runs-on: ubuntu-latest env: SHUB_APIKEY: ${{ secrets.SHUB_APIKEY }} diff --git a/.github/workflows/schedule_spider_yearly.yaml b/.github/workflows/schedule_spider_yearly.yaml new file mode 100644 index 000000000..7535a801e --- /dev/null +++ b/.github/workflows/schedule_spider_yearly.yaml @@ -0,0 +1,76 
@@ +name: Schedule Spider Crawl Split Per Year + +on: + workflow_dispatch: + inputs: + spider_name: + description: 'Spider to be scheduled' + required: true + start_date: + description: 'Start date (YYYY-MM-DD)' + required: true + end_date: + description: 'End date (YYYY-MM-DD)' + required: false + +jobs: + full-crawl-yearly: + runs-on: ubuntu-latest + env: + SHUB_APIKEY: ${{ secrets.SHUB_APIKEY }} + SCRAPY_CLOUD_PROJECT_ID: ${{ secrets.SCRAPY_CLOUD_PROJECT_ID }} + FILES_STORE: ${{ secrets.FILES_STORE }} + QUERIDODIARIO_DATABASE_URL: ${{ secrets.QUERIDODIARIO_DATABASE_URL }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_ENDPOINT_URL: ${{ secrets.AWS_ENDPOINT_URL }} + AWS_REGION_NAME: ${{ secrets.AWS_REGION_NAME }} + SPIDERMON_DISCORD_FAKE: ${{ secrets.SPIDERMON_DISCORD_FAKE }} + SPIDERMON_DISCORD_WEBHOOK_URL: ${{ secrets.SPIDERMON_DISCORD_WEBHOOK_URL }} + ZYTE_SMARTPROXY_APIKEY: ${{ secrets.ZYTE_SMARTPROXY_APIKEY }} + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v2 + with: + python-version: '3.10' + - name: Prepare environment + run: | + python -m pip install --upgrade pip + pip install click python-decouple scrapinghub SQLAlchemy psycopg2 + - name: Schedule full crawl per year + if: ${{ !github.event.inputs.end_date }} + run: | + cd data_collection/ + spider=${{ github.event.inputs.spider_name }} + start_date=${{ github.event.inputs.start_date }} + int_start_date=$(date -d $start_date +"%Y%m%d") + int_end_date=$(date --date="today" +"%Y%m%d") + while [[ $int_start_date -lt $int_end_date ]]; do + int_date_to=$(date -d"$int_start_date + 1 year" +"%Y%m%d") + if [[ $int_date_to -ge $int_end_date ]]; then + int_date_to="$int_end_date" + fi + date_from=$(date -d"$int_start_date" +"%Y-%m-%d") + date_to=$(date -d"$int_date_to" +"%Y-%m-%d") + python scheduler.py schedule-spider --spider_name="$spider" --start_date="$date_from" --end_date="$date_to" + 
int_start_date="$int_date_to" + done + - name: Schedule partial crawl per year + if: ${{ github.event.inputs.end_date }} + run: | + cd data_collection/ + spider=${{ github.event.inputs.spider_name }} + start_date=${{ github.event.inputs.start_date }} + end_date=${{ github.event.inputs.end_date }} + int_start_date=$(date -d $start_date +"%Y%m%d") + int_end_date=$(date -d $end_date +"%Y%m%d") + while [[ $int_start_date -lt $int_end_date ]]; do + int_date_to=$(date -d"$int_start_date + 1 year" +"%Y%m%d") + if [[ $int_date_to -ge $int_end_date ]]; then + int_date_to="$int_end_date" + fi + date_from=$(date -d"$int_start_date" +"%Y-%m-%d") + date_to=$(date -d"$int_date_to" +"%Y-%m-%d") + python scheduler.py schedule-spider --spider_name="$spider" --start_date="$date_from" --end_date="$date_to" + int_start_date="$int_date_to" + done diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index b0691322d..31f03c04e 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -3,7 +3,7 @@ name: Run tests on: [ push, pull_request ] jobs: - build: + tests: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/update_spider_status.yaml b/.github/workflows/update_spider_status.yaml index 362c78efa..764cb811a 100644 --- a/.github/workflows/update_spider_status.yaml +++ b/.github/workflows/update_spider_status.yaml @@ -15,7 +15,7 @@ on: required: true jobs: - update_status: + update-status: runs-on: ubuntu-latest env: QUERIDODIARIO_DATABASE_URL: ${{ secrets.QUERIDODIARIO_DATABASE_URL }}