Skip to content

Commit

Permalink
Streamline tests for speed (#416)
Browse files Browse the repository at this point in the history
* Introduce pytest-vcr to start recording requests

* Switch accessibility command to using the same utilities as screenshot

* Fix it up a bit more

* Add another one

* Another

* Add pytest-xdist

* Switch to pipenv sync for speed
  • Loading branch information
palewire authored Oct 9, 2023
1 parent a7cbb7f commit c4d1278
Show file tree
Hide file tree
Showing 16 changed files with 6,415 additions and 265 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/continuous-deployment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ jobs:

- id: install-python-dependencies
name: Install Python dependencies
run: pipenv install --dev --python=`which python`
run: pipenv sync --dev
shell: bash

- id: cache-playwright
Expand All @@ -74,7 +74,7 @@ jobs:

- id: test
name: Run tests
run: pipenv run xvfb-run pytest tests -sv --cov
run: pipenv run xvfb-run pytest tests -sv --cov -n auto --vcr-record=none
shell: bash
env:
IA_ACCESS_KEY: ${{ secrets.IA_ACCESS_KEY }}
Expand Down
2 changes: 2 additions & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ xlwt = "*"
pytest-env = "*"
pytest-cov = "*"
pytest-optional-tests = "*"
pytest-vcr = "*"
pytest-xdist = "*"

[requires]
python_version = "3.9"
553 changes: 341 additions & 212 deletions Pipfile.lock

Large diffs are not rendered by default.

55 changes: 28 additions & 27 deletions newshomepages/accessibility.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
import subprocess
import typing
"""Save the accessibility tree of the provided site."""
from __future__ import annotations

import json
from pathlib import Path

import click
from playwright.sync_api import sync_playwright
from playwright.sync_api._generated import BrowserContext
from retry import retry
from rich import print

Expand All @@ -12,43 +16,40 @@
@click.command()
@click.argument("handle")
@click.option("-o", "--output-dir", "output_dir", default="./")
@click.option("--timeout", "timeout", default="180")
def cli(handle: str, output_dir: str, timeout: str = "180"):
@click.option("--verbose", "verbose", default=False, is_flag=True)
def cli(handle, output_dir, verbose=False):
    """Save the accessibility tree of the provided site."""
# Get metadata
site = utils.get_site(handle)

# Set the output path
output_path = Path(output_dir) / f"{handle.lower()}.accessibility.json"
output_path = Path(output_dir) / f"{site['handle'].lower()}.accessibility.json"
output_path.parent.mkdir(parents=True, exist_ok=True)

# Do the thing
_get_accessibility(site, output_path, int(timeout))
if verbose:
print(f":newspaper: Fetching a11y tree from {site['url']}")
with sync_playwright() as p:
context = utils._load_persistent_context(p)
_get_accessibility(context, site["url"], site["handle"], output_path)
context.close()


@retry(tries=3, delay=5, backoff=2)
def _get_accessibility(data: typing.Dict, output_path: Path, timeout: int = 180):
def _get_accessibility(
context: BrowserContext, url: str, handle: str, output_path: Path
):
"""Run a command that fetches the accessibility tree for the provided site."""
print(f":newspaper: Fetching a11y tree from {data['url']}")

# Shoot the shot
command_list = [
"shot-scraper",
"accessibility",
data["url"],
"-o",
str(output_path),
"--timeout",
str(timeout * 1000), # Convert from seconds into milliseconds
]

# If there's a custom JavaScript include, toss that in
javascript = utils.get_javascript(data["handle"])
if javascript:
command_list.extend(["--javascript", javascript])

# Run the command
subprocess.run(command_list)
with open(output_path, "w") as fp:
page = utils._load_new_page_disable_javascript(
context=context,
url=url,
handle=handle,
)
snapshot = page.accessibility.snapshot()
page.close()
fp.write(json.dumps(snapshot, indent=4))
fp.write("\n")


if __name__ == "__main__":
Expand Down
23 changes: 19 additions & 4 deletions newshomepages/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def write_json(
@retry(tries=3, delay=15, backoff=2)
def get_url(
url: str, timeout: int = 30, user_agent: str | None = None, verbose: bool = False
):
) -> requests.Response:
"""Get the provided URL.
Args:
Expand Down Expand Up @@ -148,9 +148,24 @@ def get_url(
return r


def get_json_url(url: str):
"""Get JSON data from the provided URL."""
r = get_url(url)
def get_json_url(
    url: str, timeout: int = 30, user_agent: str | None = None, verbose: bool = False
) -> typing.Any:
    """Get JSON data from the provided URL.

    Thin wrapper around ``get_url`` that parses the response body as JSON.

    Args:
        url (str): The URL to request
        timeout (int): How long to wait before timing out
        user_agent (str): The user agent to provide in the request headers. None by default.
        verbose (bool): Whether or not to print a verbose output

    Returns:
        The JSON response as a Python object.
    """
    # Get the response (get_url retries on failure before raising)
    r = get_url(url, timeout=timeout, user_agent=user_agent, verbose=verbose)

    # Parse the body as JSON and return it
    return r.json()


Expand Down
24 changes: 16 additions & 8 deletions newshomepages/wayback.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
@click.command()
@click.argument("handle")
@click.option("-o", "--output-dir", "output_dir", default="./")
def cli(handle: str, output_dir: str):
@click.option("--verbose", "verbose", default=False, is_flag=True)
def cli(handle: str, output_dir: str, verbose: bool = False):
"""Archive a URL in the Wayback Machine."""
# Pull the source’s metadata
site = utils.get_site(handle)
Expand All @@ -25,7 +26,8 @@ def cli(handle: str, output_dir: str):
assert IA_SECRET_KEY

# Ask for a capture
print(f"🏛 Requesting a Wayback Machine capture of {site['url']}")
if verbose:
print(f"🏛 Requesting a Wayback Machine capture of {site['url']}")
capture_data = _post(site["url"])

# If we've got a message, we need to just give up now. They're not going to do it.
Expand All @@ -36,7 +38,8 @@ def cli(handle: str, output_dir: str):
tries = 1
while True:
# Give it a second (as recommended by the archive.org admins)
print("Waiting 6 seconds to request our job's status")
if verbose:
print("Waiting 6 seconds to request our job's status")
time.sleep(6)

# Check in our capture
Expand All @@ -45,26 +48,31 @@ def cli(handle: str, output_dir: str):

# If it's a success, we're done
if status_data["status"] == "success":
print("Success!")
if verbose:
print("Success!")
capture_data.update(status_data)
break
elif status_data["status"] == "pending":
# If it's not done, up our counter and restart the loop
print("The capture is still pending.")
if verbose:
print("The capture is still pending.")
tries += 1
            # Unless we're over our limit, then we quit
if tries >= 11:
print("10 tries have failed. We’re done here.")
if verbose:
print("10 tries have failed. We’re done here.")
break
elif status_data["status"] == "error":
# If there's an error, end it now
print("There's an error. Time to call it quits.")
if verbose:
print("There's an error. Time to call it quits.")
capture_data.update(status_data)
break

# Write it out
slug = site["handle"].lower()
utils.write_json(capture_data, Path(output_dir) / f"{slug}.wayback.json")
output_path = Path(output_dir) / f"{slug}.wayback.json"
utils.write_json(capture_data, output_path, verbose=verbose)


@retry(tries=3, delay=30, backoff=2)
Expand Down
3 changes: 0 additions & 3 deletions pytest.ini

This file was deleted.

Loading

0 comments on commit c4d1278

Please sign in to comment.