Commit b33a8e1 (merge commit; parents 2b968f9 and b885aea)

12 files changed: +175 additions, -88 deletions

README.md

Lines changed: 4 additions & 10 deletions
@@ -110,20 +110,14 @@ Note: The scrapers live in an independent environment not necessarily in the sa
 # enter the password when prompted. It can be any password that you wish to use.
 # It is used for login to the admin website.
 ```
-- Start up the webserver so we can create a user for the scraper.
+- Start up the webserver
 ```bash
 python3 manage.py runserver
 ```
-- Visit localhost:8000/admin and follow the UI to add a new user named "scraper", set the password to whatever you would like but make note of it.
-
-- In a new terminal tab, create a token for the scraper user using the following command
-```bash
-python3 manage.py drf_create_token scraper
-```
-Finally, the database is ready to go! We are now ready to run the server:
-
 Navigate in your browser to `http://127.0.0.1:8000/admin`. Log in with the new admin user you just created. Click on Agencys and you should see a list of
-agencies.
+agencies created with the ``fill_agency_objects`` command.
+
+To set up the scraper, read [the scraper README](scrapers/README.rst).
 
 ## Code formatting
 GovLens enforces code style using [Black](https://github.com/psf/black) and pep8 rules using [Flake8](http://flake8.pycqa.org/en/latest/).
Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+"""Idempotent management command to create the scraper user with a DRF token
+"""
+from django.core.management.base import BaseCommand
+from django.contrib.auth.models import User
+from rest_framework.authtoken.models import Token
+
+SCRAPER_USERNAME = "scraper"
+
+
+class Command(BaseCommand):
+    help = "Get or create a scraper user with a Django REST Framework token"
+
+    def add_arguments(self, parser):
+        pass
+
+    def handle(self, *args, **options):
+        user, created = User.objects.get_or_create(username=SCRAPER_USERNAME)
+        user.save()
+
+        if created:
+            self.stdout.write(f"Created new user with username {SCRAPER_USERNAME}")
+        else:
+            self.stdout.write(f"User {SCRAPER_USERNAME} already exists.")
+
+        token, created = Token.objects.get_or_create(user=user)
+        self.stdout.write(f"The token for the user {SCRAPER_USERNAME} is {token}")
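The command above is safe to run repeatedly because both the user and the token go through `get_or_create`. The idempotency can be illustrated outside Django; below is a minimal sketch where a plain dict stands in for the ORM (the helper names are illustrative, not the project's API):

```python
# Sketch of the get-or-create idempotency used by the management command,
# with in-memory dicts standing in for Django's ORM. Illustrative only.
import secrets

users = {}   # username -> user record
tokens = {}  # username -> token string

def get_or_create_user(username):
    """Return (user, created); never creates a duplicate."""
    if username in users:
        return users[username], False
    users[username] = {"username": username}
    return users[username], True

def get_or_create_token(username):
    """Return the existing token, or mint one on the first call."""
    if username not in tokens:
        tokens[username] = secrets.token_hex(20)
    return tokens[username]

user, created = get_or_create_user("scraper")
first_token = get_or_create_token("scraper")
# Running it again changes nothing: same user, same token.
user2, created2 = get_or_create_user("scraper")
```

This is why the README can tell contributors to run the command at any time: a second invocation simply reports that the user exists and prints the same token.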

dev_requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -1,2 +1,3 @@
 black
 flake8
+coloredlogs==10.0

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -13,3 +13,4 @@ simplejson==3.16.0
 sqlparse==0.3.0
 urllib3==1.24.2
 apscheduler==3.6.0
+python-dotenv==0.11.0

scrapers/README.rst

Lines changed: 26 additions & 16 deletions
@@ -27,28 +27,38 @@ Directory Structure
 ├── security_scraper.py - scrapes for HTTPS & privacy policy
    └── social_scraper.py - scrapes for phone number, email, address, social media
 
-Requirements
-============
+Quick Start
+===========
+
+Configuration
+~~~~~~~~~~~~~
+
+There are a few required environment variables. The easiest way to set them in development is to create a file named ``.env`` in the root directory of this repository (don't commit this file) containing the following text::
+
+    GOVLENS_API_TOKEN=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+    GOVLENS_API_ENDPOINT=http://127.0.0.1:8000/api/agencies/
+    GOOGLE_API_TOKEN=XXXXXXXXXXXXXXXXXXXXXXXX
+
+To get the ``GOOGLE_API_TOKEN``, visit https://developers.google.com/speed/docs/insights/v5/get-started
+
+To get the ``GOVLENS_API_TOKEN``, run ``python3 manage.py create_scraper_user``. Copy the token from the command output and paste it into the ``.env`` file.
+
+Execution
+~~~~~~~~~
 
-Google Lighthouse API Key
-~~~~~~~~~~~~~~~~~~~~~~~~~
-Get the API key for accessing lighthouse from here: https://developers.google.com/speed/docs/insights/v5/get-started (click on the button get key)
+Once you have created the ``.env`` file as described above, run the scraper::
 
-Put that key in GOOGLE_API_KEY environment variable.
+    # run the following from the root directory of the repository
+    python3 -m scrapers.scrape_handler
 
-Running the Scrapers
-====================
-``scrape_handler.py`` is the entry point for scraping.
-When we run from our local machine, we get the list of agencies and start scraping them.
-But when deployed to AWS, the scraper is invoked by the schedule and ``scrape_handler.scrape_data()`` is the method hooked up to the lambda.
+Design
+======
 
-Local
-~~~~~
-If running from local, the following command should run the scraper::
+The scraper is intended to be used both locally and on AWS Lambda.
 
-    python scraper.py
+The ``scrapers`` directory in the root of this repository is the top-level Python package for this project. This means that any absolute imports should begin with ``scrapers.MODULE_NAME_HERE``.
 
-Make sure to set the environment variable to your local endpoint.
+``scrapers/scrape_handler.py`` is the main Python module invoked. On AWS Lambda, the method ``scrape_handler.scrape_data()`` is imported and called directly.
 
 AWS Lambda
 ~~~~~~~~~~
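The `.env` file added by this commit uses the simple `KEY=value` format that python-dotenv (newly added to requirements.txt) reads. Purely to illustrate that format, here is a minimal hand-rolled parser; it is not the project's code, which presumably delegates to python-dotenv:

```python
# Minimal illustration of the KEY=value format used by the .env file above.
# The project itself loads this via python-dotenv; this parser exists only
# to show what the file contains.
def parse_dotenv(text):
    env = {}
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue  # skip blank lines and comments
        key, _, value = line.partition("=")
        env[key.strip()] = value.strip()
    return env

example = """\
GOVLENS_API_TOKEN=XXXXXXXX
GOVLENS_API_ENDPOINT=http://127.0.0.1:8000/api/agencies/
GOOGLE_API_TOKEN=XXXX
"""
env = parse_dotenv(example)
```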

scrapers/__init__.py

Whitespace-only changes.

scrapers/agency_api_service.py

Lines changed: 15 additions & 10 deletions
@@ -1,25 +1,30 @@
-import os
+import logging
 
 import requests
 
+from . import settings
+
+logger = logging.getLogger(__name__)
+
 
 class AgencyApiService:
     def __init__(self):
-        # If environment variable is set, we use the corresponding api (usually local); otherwise the govlens api
-        if os.environ.get("govlens_api", None) is None:
-            self.base_url = (
-                "http://govlens.us-east-2.elasticbeanstalk.com/api/agencies/"
-            )
-        else:
-            self.base_url = os.environ["govlens_api"]
+        self.base_url = settings.GOVLENS_API_ENDPOINT
 
     def get_all_agencies(self):
         try:
             all_agency_list = self._get(self.base_url)
             return all_agency_list
         except Exception as ex:
-            print(f"Error while retrieving all the agency information: {str(ex)}")
+            logger.error("Error while retrieving all the agency information: %s", ex)
 
     def _get(self, url):
-        response = requests.get(url, headers={"Content-type": "application/json"})
+        response = requests.get(
+            url,
+            headers={
+                "Content-type": "application/json",
+                "Authorization": "Token {}".format(settings.GOVLENS_API_TOKEN),
+            },
+        )
+        response.raise_for_status()
        return response.json()
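The new `Authorization` header follows DRF's TokenAuthentication scheme, where the key is sent as `Token <key>`. A tiny sketch of the header construction, factored into a helper for clarity (`build_headers` is my name, not the project's):

```python
# Sketch: the request headers built inside AgencyApiService._get.
# build_headers is an illustrative helper, not part of the project.
def build_headers(api_token):
    return {
        "Content-type": "application/json",
        "Authorization": "Token {}".format(api_token),
    }

headers = build_headers("abc123")
```

Note the scheme word is literally `Token` (DRF's default), not `Bearer`; sending the wrong scheme yields a 401, which the new `raise_for_status()` call now surfaces as an exception instead of a silent JSON decode failure.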

scrapers/lighthouse.py

Lines changed: 6 additions & 4 deletions
@@ -1,7 +1,7 @@
-from scrapers.base_api_client import ApiClient
+from .scrapers.base_api_client import ApiClient
+from . import settings
 
 
-GOOGLE_API_KEY = ""  # os.environ['GOOGLE_API_KEY']
 PAGE_INSIGHTS_ENDPOINT = "https://www.googleapis.com/pagespeedonline/v5/runPagespeed"
 MOBILE_FRIENDLY_ENDPOINT = "https://searchconsole.googleapis.com/v1/urlTestingTools/mobileFriendlyTest:run"  # from what i have tested, very hard to automate
 
@@ -15,7 +15,7 @@
 
 
 class PageInsightsClient(ApiClient):
-    def __init__(self, api_uri=PAGE_INSIGHTS_ENDPOINT, api_key=GOOGLE_API_KEY):
+    def __init__(self, api_uri=PAGE_INSIGHTS_ENDPOINT, api_key=settings.GOOGLE_API_KEY):
         ApiClient.__init__(self, api_uri, api_key)
 
     def get_page_insights(self, url, category):
@@ -24,7 +24,9 @@ def get_page_insights(self, url, category):
 
 
 class GoogleMobileFriendlyClient(ApiClient):
-    def __init__(self, api_uri=MOBILE_FRIENDLY_ENDPOINT, api_key=GOOGLE_API_KEY):
+    def __init__(
+        self, api_uri=MOBILE_FRIENDLY_ENDPOINT, api_key=settings.GOOGLE_API_KEY
+    ):
         self.urls = []
         self.results = []
         ApiClient.__init__(self, api_uri, api_key)
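The PageSpeed Insights v5 endpoint referenced above takes the target page, the audit category, and the API key as query parameters. A sketch of assembling such a request URL with stdlib `urlencode` (parameter assembly only, no network call; `build_insights_url` is an illustrative helper, not the project's `ApiClient`):

```python
# Sketch: assembling a PageSpeed Insights v5 request URL, with the API key
# now sourced from settings.GOOGLE_API_KEY. No network call is made here.
from urllib.parse import urlencode

PAGE_INSIGHTS_ENDPOINT = "https://www.googleapis.com/pagespeedonline/v5/runPagespeed"

def build_insights_url(page_url, category, api_key):
    query = urlencode({"url": page_url, "category": category, "key": api_key})
    return f"{PAGE_INSIGHTS_ENDPOINT}?{query}"

url = build_insights_url("https://example.gov", "ACCESSIBILITY", "KEY")
```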

scrapers/process_agency_info.py

Lines changed: 13 additions & 17 deletions
@@ -1,10 +1,12 @@
-import os
 import requests
 import logging
-from scrapers.social_scraper import SocialScraper
-from scrapers.security_scraper import SecurityScraper
-from scrapers.accessibility_scraper import AccessibilityScraper
-from agency_dataaccessor import AgencyDataAccessor
+from .scrapers.social_scraper import SocialScraper
+from .scrapers.security_scraper import SecurityScraper
+from .scrapers.accessibility_scraper import AccessibilityScraper
+from .agency_dataaccessor import AgencyDataAccessor
+from . import settings
+
+logger = logging.getLogger(__name__)
 
 
 class AgencyInfo:
@@ -24,15 +26,12 @@ def process_agency_info(self):
         # HTTP Get on agency url
         agency_url = self.agency.get("website", None)
         if agency_url is None or agency_url == "":
-            print(
-                f"Website url is not available for {self.agency['id']}, name: {self.agency['name']}"
-            )
-            logging.error(
+            logger.error(
                 f"Website url is not available for {self.agency['id']}, name: {self.agency['name']}"
             )
             self.agency_dataaccessor.update_agency_info(self.agency)
             return
-        print(f"Scraping the website {agency_url}")
+        logger.info(f"Scraping the website {agency_url}")
         page = requests.get(agency_url, timeout=30)
         # Initialize scrapers
         socialScraper = SocialScraper(page, agency_url)
@@ -45,7 +44,7 @@ def process_agency_info(self):
         # Figure out the google_api_key and then fix the below buckets
         for bucket in self.buckets:
             if bucket == "security_and_privacy":
-                if os.environ.get("GOOGLE_API_KEY", None) is not None:
+                if settings.GOOGLE_API_KEY:
                     profile_info[
                         bucket
                     ] = securityScraper.get_security_privacy_info()
@@ -56,7 +55,7 @@ def process_agency_info(self):
                         social_media_info, contact_info
                     )
             elif bucket == "website_accessibility":
-                if os.environ.get("GOOGLE_API_KEY", None) is not None:
+                if settings.GOOGLE_API_KEY:
                     profile_info[
                         bucket
                     ] = accessibilityScraper.get_website_accessibility_info()
@@ -71,9 +70,6 @@ def process_agency_info(self):
             self.agency_dataaccessor.enrich_agency_info_with_scrape_info(agency_details)
             return agency_details
         except Exception as ex:
-            logging.error(
-                f"An error occurred while processing the agency information: {str(ex)}"
-            )
-            print(
-                f"An error occurred while processing the agency information: {str(ex)}"
+            logger.error(
+                "An error occurred while processing the agency information: %s", ex
             )
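Worth noting when reading the logging changes in this file: stdlib `logging` treats the first positional argument as the format string, so the message should come first and the exception should be interpolated via a `%s` placeholder (passing the exception object first would log the exception's text and drop the message). A self-contained sketch of that pattern (the logger name `demo` is illustrative):

```python
# Demonstrates the stdlib logging call pattern used after this commit:
# the first argument is the format string; extra args are interpolated
# lazily via %-style placeholders.
import io
import logging

logger = logging.getLogger("demo")
logger.setLevel(logging.ERROR)
stream = io.StringIO()
handler = logging.StreamHandler(stream)
handler.setFormatter(logging.Formatter("%(levelname)s %(message)s"))
logger.addHandler(handler)

try:
    raise ValueError("boom")
except Exception as ex:
    # Correct form: message first, exception interpolated as an argument.
    logger.error("An error occurred while processing the agency information: %s", ex)

output = stream.getvalue()
```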

scrapers/scrape_handler.py

Lines changed: 9 additions & 14 deletions
@@ -1,35 +1,30 @@
-import os
 import logging
 from .process_agency_info import AgencyInfo
 from .agency_api_service import AgencyApiService
 
+from . import settings
+
+settings.setup_logging()
+
+logger = logging.getLogger(__name__)
+
 
 # method invoked by lambda
 def scrape_data(event, context=None):
     agencies = event["agencies"]
     if event.get("agencies", None) is None or len(agencies) <= 0:
-        print("No Agency information was passed to scrape")
+        logger.warning("No Agency information was passed to scrape")
         return
 
     for agency in agencies:
         agency_instance = AgencyInfo(agency)
         agency_instance.process_agency_info()
 
 
-# if running from local, we get the list of agencies and scrape one by one.
 if __name__ == "__main__":
-    # If running from local, set the environment variable to your local
-    logging.basicConfig(
-        filename="Scraper_Errors.log",
-        level=logging.ERROR,
-        format="%(asctime)s %(message)s",
-    )
-    os.environ[
-        "govlens_api"
-    ] = "http://govlens.us-east-2.elasticbeanstalk.com/api/agencies/"
-    os.environ["GOOGLE_API_KEY"] = ""
+
     agency_api_service = AgencyApiService()
     agencies = agency_api_service.get_all_agencies()
     event = {"agencies": agencies}
     scrape_data(event)
-    print("SCRAPED")
+    logger.info("Finished scraping")
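One caveat in the unchanged guard above: `agencies = event["agencies"]` executes before the `event.get("agencies", None) is None` check, so an event with no `agencies` key raises `KeyError` before the guard can fire. A defensive variant of the guard logic (my sketch, not the committed code):

```python
# Sketch of a defensive rewrite of the scrape_data guard. The committed
# code indexes event["agencies"] before the None-check, so a missing key
# raises KeyError first; using .get avoids that. Illustrative only.
def should_scrape(event):
    agencies = event.get("agencies") or []
    return len(agencies) > 0

ok_full = should_scrape({"agencies": [{"id": 1}]})
ok_empty = should_scrape({"agencies": []})
ok_missing = should_scrape({})  # no KeyError
```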
