Merge pull request #10 from City-Bureau/cleanup
🧹 Clean up CI, deps, readme, and misc items
SimmonsRitchie authored Feb 5, 2024
2 parents 6dd75bc + a5ff484 commit 79b46d7
Showing 9 changed files with 701 additions and 719 deletions.
2 changes: 1 addition & 1 deletion .deploy.sh
@@ -1,5 +1,5 @@
#!/bin/bash
-pipenv run scrapy list | xargs -I {} pipenv run scrapy crawl {} -s LOG_ENABLED=False &
+pipenv run scrapy list | xargs -I {} pipenv run scrapy crawl {} -s LOG_ENABLED=True &

# Output to the screen every 9 minutes to prevent a travis timeout
# https://stackoverflow.com/a/40800348
39 changes: 24 additions & 15 deletions .github/PULL_REQUEST_TEMPLATE.md
@@ -1,19 +1,28 @@
-## Summary
+## What's this PR do?
+<!-- eg. This PR updates the scraper for Cleveland City Council because of changes to how they display their meeting schedule. -->

-**Issue:** #ISSUE_NUMBER
+## Why are we doing this?
+<!-- eg. The website's layout was recently updated, causing our existing scraper to fail. This change ensures our scraper remains functional and continues to provide timely updates on council meetings. -->

-Replace "ISSUE_NUMBER" with the number of your issue so that GitHub will link this pull request with the issue and make review easier.
+## Steps to manually test
+<!-- Text here is not always necessary but it is generally recommended in order to aid a reviewer.
+eg.
+1. Ensure the project is installed:
+```
+pipenv sync --dev
+```
+2. Activate the virtual env and enter the pipenv shell:
+```
+pipenv shell
+```
+3. Run the spider:
+```
+scrapy crawl <spider-name> -O test_output.csv
+```
+4. Monitor the output and ensure no errors are raised.
-## Checklist
+5. Inspect `test_output.csv` to ensure the data looks valid.
+-->

-All checks are run in [GitHub Actions](https://github.com/features/actions). You'll be able to see the results of the checks at the bottom of the pull request page after it's been opened, and you can click on any of the specific checks listed to see the output of each step and debug failures.
-
-- [ ] Tests are implemented
-- [ ] All tests are passing
-- [ ] Style checks run (see [documentation](https://cityscrapers.org/docs/development/) for more details)
-- [ ] Style checks are passing
-- [ ] Code comments from template removed
-
-## Questions
-
-Include any questions you have about what you're working on.
+## Are there any smells or added technical debt to note?
+<!-- eg. The new scraping logic includes a more complex parsing routine, which might be less efficient. Future optimization or a more robust parsing strategy may be needed if the website's layout continues to evolve. -->
18 changes: 9 additions & 9 deletions .github/workflows/ci.yml
@@ -28,14 +28,14 @@ jobs:
- name: Install Pipenv
uses: dschep/install-pipenv-action@v1

-# - name: Cache Python dependencies
-#   uses: actions/cache@v1
-#   with:
-#     path: .venv
-#     key: pip-${{ matrix.python-version }}-${{ hashFiles('**/Pipfile.lock') }}
-#     restore-keys: |
-#       pip-${{ matrix.python-version }}-
-#       pip-
+- name: Cache Python dependencies
+uses: actions/cache@v1
+with:
+path: .venv
+key: pip-${{ matrix.python-version }}-${{ hashFiles('**/Pipfile.lock') }}
+restore-keys: |
+pip-${{ matrix.python-version }}-
+pip-
- name: Install dependencies
run: pipenv sync --dev
@@ -56,7 +56,7 @@ jobs:
- name: Test with pytest
run: |
-pipenv run pytest
+pipenv run pytest || [ $? -eq 5 ]
- name: Validate output with scrapy
if: github.event_name == 'pull_request'
17 changes: 8 additions & 9 deletions .github/workflows/cron.yml
@@ -2,7 +2,6 @@ name: Cron

on:
schedule:
-# Set any time that you'd like scrapers to run (in UTC)
- cron: "27 6 * * *"
workflow_dispatch:

@@ -14,17 +13,10 @@ env:
AUTOTHROTTLE_MAX_DELAY: 30.0
AUTOTHROTTLE_START_DELAY: 1.5
AUTOTHROTTLE_TARGET_CONCURRENCY: 3.0
-# Add secrets for the platform you're using and uncomment here
-# AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-# AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-# S3_BUCKET: ${{ secrets.S3_BUCKET }}
AZURE_ACCOUNT_KEY: ${{ secrets.AZURE_ACCOUNT_KEY }}
AZURE_ACCOUNT_NAME: ${{ secrets.AZURE_ACCOUNT_NAME }}
AZURE_CONTAINER: ${{ secrets.AZURE_CONTAINER }}
-# GOOGLE_APPLICATION_CREDENTIALS = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
-# GCS_BUCKET = os.getenv("GCS_BUCKET")
-# Setup Sentry, add the DSN to secrets and uncomment here
-# SENTRY_DSN: ${{ secrets.SENTRY_DSN }}
+SENTRY_DSN: ${{ secrets.SENTRY_DSN }}

jobs:
crawl:
@@ -63,3 +55,10 @@ jobs:
run: |
export PYTHONPATH=$(pwd):$PYTHONPATH
pipenv run scrapy combinefeeds -s LOG_ENABLED=False
+- name: Prevent workflow deactivation
+uses: gautamkrishnar/keepalive-workflow@v1
+with:
+committer_username: "citybureau-bot"
+committer_email: "documenters@citybureau.org"

13 changes: 6 additions & 7 deletions Pipfile
@@ -5,17 +5,16 @@ name = "pypi"

[packages]
scrapy = "*"
-scrapy-sentry = "*"
-city-scrapers-core = {ref = "main", git = "https://github.com/City-Bureau/city-scrapers-core.git", extras=["azure"]}
-pypiwin32 = {version = "*", sys_platform = "== 'win32'"}
+scrapy-sentry-errors = "*"
+city-scrapers-core = {ref = "main", git = "https://github.com/City-Bureau/city-scrapers-core.git", extras = ["azure"]}
scrapy-wayback-middleware = "*"
python-dateutil = "*"
importlib-resources = "*"
pdfminer-six = "*"

[dev-packages]
freezegun = "*"
pytest = "*"
"flake8" = "*"
isort = "*"
black = "==22.6"
black = "*"

[requires]
python_version = "3.9"