Commit cf792b8

Merge branch 'master' into update-requirements
2 parents a7f6aa9 + b33a8e1 commit cf792b8

File tree: 13 files changed, +208 −88 lines changed
Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
+name: Lint and Test
+
+on:
+  push:
+    branches:
+      - update-requirements
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v1
+    - name: Set up Python 3.7
+      uses: actions/setup-python@v1
+      with:
+        python-version: 3.7
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements.txt
+        pip install -r dev_requirements.txt
+    - name: Lint with flake8
+      run: |
+        # stop the build if there are Python syntax errors or undefined names
+        flake8 . --count --ignore E501,W503,E203 --show-source --statistics
+    - name: Lint with Black
+      run: |
+        black .
+    - name: Test with django
+      run: |
+        python manage.py test

README.md

Lines changed: 4 additions & 10 deletions
@@ -110,20 +110,14 @@ Note: The scrapers live in an independent environment not neccessarily in the sa
 # enter the password when prompted. It can be any password that you wish to use.
 # It is used for login to the admin website.
 ```
-- Start up the webserver so we can create a user for the scraper.
+- Start up the webserver
 ```bash
 python3 manage.py runserver
 ```
-- Visit localhost:8000/admin and follow the UI to add a new user named "scraper", set the password to whatever you would like but make note of it.
-
-- In a new terminal tab, create a token for the scraper user using the following command
-```bash
-python3 manage.py drf_create_token scraper
-```
-Finally, the database is ready to go! We are now ready to run the server:
-
 Navigate in your browser to `http://127.0.0.1:8000/admin`. Log in with the new admin user you just created. Click on Agencys and you should see a list of
-agencies.
+agencies created with the ``fill_agency_objects`` command.
+
+To setup the scraper, read [the scraper README](scrapers/README.rst).

 ## Code formatting
 GovLens enforces code style using [Black](https://github.com/psf/black) and pep8 rules using [Flake8](http://flake8.pycqa.org/en/latest/).
Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+"""Idempotent management command to create the scraper user with a DRF token
+"""
+from django.core.management.base import BaseCommand
+from django.contrib.auth.models import User
+from rest_framework.authtoken.models import Token
+
+SCRAPER_USERNAME = "scraper"
+
+
+class Command(BaseCommand):
+    help = "Get or create a scraper user with a Django REST Framework token"
+
+    def add_arguments(self, parser):
+        pass
+
+    def handle(self, *args, **options):
+        user, created = User.objects.get_or_create(username=SCRAPER_USERNAME)
+        user.save()
+
+        if created:
+            self.stdout.write(f"Created new user with username {SCRAPER_USERNAME}")
+        else:
+            self.stdout.write(f"User {SCRAPER_USERNAME} already exists.")
+
+        token, created = Token.objects.get_or_create(user=user)
+        self.stdout.write(f"The token for the user {SCRAPER_USERNAME} is {token}")

dev_requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -1,2 +1,3 @@
 black
 flake8
+coloredlogs==10.0

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -13,3 +13,4 @@ simplejson==3.16.0
 sqlparse==0.3.0
 urllib3==1.24.2
 apscheduler==3.6.0
+python-dotenv==0.11.0

scrapers/README.rst

Lines changed: 26 additions & 16 deletions
@@ -27,28 +27,38 @@ Directory Structure
 ├── security_scraper.py - scrapes for HTTPS & privacy policy
    └── social_scraper.py - scrapes for phone number, email, address, social media

-Requirements
-============
+Quick Start
+===========
+
+Configuration
+~~~~~~~~~~~~~
+
+There are a few required environmental variables. The easiest way to set them in development is to create a file called `.env` in the root directory of this repository (don't commit this file). The file (named `.env`) should contain the following text::
+
+    GOVLENS_API_TOKEN=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+    GOVLENS_API_ENDPOINT=http://127.0.0.1:8000/api/agencies/
+    GOOGLE_API_TOKEN=XXXXXXXXXXXXXXXXXXXXXXXX
+
+To get the ``GOOGLE_API_TOKEN``, you need to visit the following page: https://developers.google.com/speed/docs/insights/v5/get-started
+
+To get the ``GOVLENS_API_TOKEN``, run ``python3 manage.py create_scraper_user``. Copy the token from the command output and paste it into the ``.env`` file.
+
+Execution
+~~~~~~~~~

-Google Lighthouse API Key
-~~~~~~~~~~~~~~~~~~~~~~~~~
-Get the API key for accessing lighthouse from here: https://developers.google.com/speed/docs/insights/v5/get-started (click on the button get key)
+Once you have created the `.env` file as mentioned above, run the scraper::

-Put that key in GOOGLE_API_KEY environment variable.
+    # run the following from the root directory of the repository
+    python3 -m scrapers.scrape_handler

-Running the Scrapers
-====================
-``scrape_handler.py`` is the entry point for scraping.
-When we run from our local machine, we get the list of agencies and start scraping them.
-But when deployed to AWS, the scraper is invoked by the schedule and ``scrape_handler.scrape_data()`` is the method hooked up to the lambda.
+Design
+======

-Local
-~~~~~
-If running from local, the following command should run the scraper::
+The scraper is intended to be used both locally and on AWS Lambda.

-    python scraper.py
+The ``scrapers`` directory in the root of this repository is the top-level Python package for this project. This means that any absolute imports should begin with ``scrapers.MODULE_NAME_HERE``.

-Make sure to set the environment variable to your local endpoint.
+``scrapers/scrape_handler.py`` is the main Python module invoked. On AWS Lambda, the method ``scrape_handler.scrape_data()`` is imported and called directly.

 AWS Lambda
 ~~~~~~~~~~
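
The scraper modules further down import a `settings` object (`from . import settings`) that is not shown in this excerpt, and `python-dotenv==0.11.0` was added to requirements.txt above. A minimal sketch of what such a module might look like, assuming it lives at `scrapers/settings.py` and simply exposes the `.env` values; the attribute names mirror how the other modules use it, while the file name and defaults are assumptions:

```python
# Hypothetical scrapers/settings.py -- the real module is not visible in this
# view, so everything here is an assumption based on how it is used elsewhere.
import os

from dotenv import load_dotenv  # provided by the python-dotenv requirement

# Load variables from a .env file in the repository root into os.environ.
load_dotenv()

GOVLENS_API_ENDPOINT = os.environ.get(
    "GOVLENS_API_ENDPOINT", "http://127.0.0.1:8000/api/agencies/"
)
GOVLENS_API_TOKEN = os.environ.get("GOVLENS_API_TOKEN", "")
# The .env example above names the key GOOGLE_API_TOKEN, while the code reads
# settings.GOOGLE_API_KEY, so this sketch accepts either spelling.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY") or os.environ.get("GOOGLE_API_TOKEN", "")
```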

scrapers/__init__.py

Whitespace-only changes.

scrapers/agency_api_service.py

Lines changed: 15 additions & 10 deletions
@@ -1,25 +1,30 @@
-import os
+import logging

 import requests

+from . import settings
+
+logger = logging.getLogger(__name__)
+

 class AgencyApiService:
     def __init__(self):
-        # If environment variable is set, we use the corresponding api(usually local). otherwise govlens api
-        if os.environ.get("govlens_api", None) is None:
-            self.base_url = (
-                "http://govlens.us-east-2.elasticbeanstalk.com/api/agencies/"
-            )
-        else:
-            self.base_url = os.environ["govlens_api"]
+        self.base_url = settings.GOVLENS_API_ENDPOINT

     def get_all_agencies(self):
         try:
             all_agency_list = self._get(self.base_url)
             return all_agency_list
         except Exception as ex:
-            print(f"Error while retrieving all the agency information: {str(ex)}")
+            logger.error(ex, "Error while retrieving all the agency information")

     def _get(self, url):
-        response = requests.get(url, headers={"Content-type": "application/json"})
+        response = requests.get(
+            url,
+            headers={
+                "Content-type": "application/json",
+                "Authorization": "Token {}".format(settings.GOVLENS_API_TOKEN),
+            },
+        )
+        response.raise_for_status()
         return response.json()
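
For orientation, a hedged usage sketch of the class above. The class and method names come from the diff, the agency field names ("id", "name", "website") come from process_agency_info.py further down, and the logging setup is an assumption:

```python
# Illustrative only; this snippet does not appear in the commit.
import logging

from scrapers.agency_api_service import AgencyApiService

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# get_all_agencies() returns None when the request fails (the exception is
# logged and swallowed), hence the `or []`.
agencies = AgencyApiService().get_all_agencies() or []
for agency in agencies:
    logger.info(
        "Agency %s (%s): %s", agency["id"], agency["name"], agency.get("website")
    )
```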

scrapers/lighthouse.py

Lines changed: 6 additions & 4 deletions
@@ -1,7 +1,7 @@
-from scrapers.base_api_client import ApiClient
+from .scrapers.base_api_client import ApiClient
+from . import settings


-GOOGLE_API_KEY = "" # os.environ['GOOGLE_API_KEY']
 PAGE_INSIGHTS_ENDPOINT = "https://www.googleapis.com/pagespeedonline/v5/runPagespeed"
 MOBILE_FRIENDLY_ENDPOINT = "https://searchconsole.googleapis.com/v1/urlTestingTools/mobileFriendlyTest:run" # from what i have tested, very hard to automate

@@ -15,7 +15,7 @@


 class PageInsightsClient(ApiClient):
-    def __init__(self, api_uri=PAGE_INSIGHTS_ENDPOINT, api_key=GOOGLE_API_KEY):
+    def __init__(self, api_uri=PAGE_INSIGHTS_ENDPOINT, api_key=settings.GOOGLE_API_KEY):
         ApiClient.__init__(self, api_uri, api_key)

     def get_page_insights(self, url, category):

@@ -24,7 +24,9 @@ def get_page_insights(self, url, category):


 class GoogleMobileFriendlyClient(ApiClient):
-    def __init__(self, api_uri=MOBILE_FRIENDLY_ENDPOINT, api_key=GOOGLE_API_KEY):
+    def __init__(
+        self, api_uri=MOBILE_FRIENDLY_ENDPOINT, api_key=settings.GOOGLE_API_KEY
+    ):
         self.urls = []
         self.results = []
         ApiClient.__init__(self, api_uri, api_key)
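
A short hedged sketch of how the client above might be called. PageInsightsClient and get_page_insights(url, category) are taken from the diff; the example URL, the category value, and the shape of the return value are assumptions (the method body is not shown here):

```python
# Illustrative only; this snippet does not appear in the commit.
from scrapers.lighthouse import PageInsightsClient

# api_key defaults to settings.GOOGLE_API_KEY per the constructor above.
client = PageInsightsClient()

# "accessibility" is one of the report categories the PageSpeed Insights v5
# API accepts; the exact values this project passes are not shown in the diff.
report = client.get_page_insights("https://example.gov", "accessibility")
print(report)
```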

scrapers/process_agency_info.py

Lines changed: 13 additions & 17 deletions
@@ -1,10 +1,12 @@
-import os
 import requests
 import logging
-from scrapers.social_scraper import SocialScraper
-from scrapers.security_scraper import SecurityScraper
-from scrapers.accessibility_scraper import AccessibilityScraper
-from agency_dataaccessor import AgencyDataAccessor
+from .scrapers.social_scraper import SocialScraper
+from .scrapers.security_scraper import SecurityScraper
+from .scrapers.accessibility_scraper import AccessibilityScraper
+from .agency_dataaccessor import AgencyDataAccessor
+from . import settings
+
+logger = logging.getLogger(__name__)


 class AgencyInfo:

@@ -24,15 +26,12 @@ def process_agency_info(self):
             # HTTP Get on agency url
             agency_url = self.agency.get("website", None)
             if agency_url is None or agency_url == "":
-                print(
-                    f"Website url is not available for {self.agency['id']}, name: {self.agency['name']}"
-                )
-                logging.error(
+                logger.error(
                     f"Website url is not available for {self.agency['id']}, name: {self.agency['name']}"
                 )
                 self.agency_dataaccessor.update_agency_info(self.agency)
                 return
-            print(f"Scraping the website {agency_url}")
+            logger.info(f"Scraping the website {agency_url}")
             page = requests.get(agency_url, timeout=30)
             # Initialize scrapers
             socialScraper = SocialScraper(page, agency_url)

@@ -45,7 +44,7 @@ def process_agency_info(self):
             # Figure out the google_api_key and then fix the below buckets
             for bucket in self.buckets:
                 if bucket == "security_and_privacy":
-                    if os.environ.get("GOOGLE_API_KEY", None) is not None:
+                    if settings.GOOGLE_API_KEY:
                         profile_info[
                             bucket
                         ] = securityScraper.get_security_privacy_info()

@@ -56,7 +55,7 @@
                         social_media_info, contact_info
                     )
                 elif bucket == "website_accessibility":
-                    if os.environ.get("GOOGLE_API_KEY", None) is not None:
+                    if settings.GOOGLE_API_KEY:
                         profile_info[
                             bucket
                         ] = accessibilityScraper.get_website_accessibility_info()

@@ -71,9 +70,6 @@
             self.agency_dataaccessor.enrich_agency_info_with_scrape_info(agency_details)
             return agency_details
         except Exception as ex:
-            logging.error(
-                f"An error occurred while processing the agency information: {str(ex)}"
-            )
-            print(
-                f"An error occurred while processing the agency information: {str(ex)}"
+            logger.error(
+                ex, "An error occurred while processing the agency information"
             )
