Skip to content
This repository has been archived by the owner on Aug 11, 2023. It is now read-only.

Commit

Permalink
Merge pull request #8 from wjh18/sitemap-enhancements
Browse files Browse the repository at this point in the history
Sitemap enhancements
  • Loading branch information
wjh18 authored Oct 28, 2022
2 parents 6fce75c + c718a52 commit c9135cb
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 26 deletions.
12 changes: 8 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,14 +84,18 @@ To remove your API key from your default keystore, run `keyring del system psike

## Sitemap Support

Currently, only URLs to valid XML sitemaps are supported for reports that utilize sitemap format.
Currently, only URLs to valid XML sitemaps are supported for reports that utilize sitemap format. Please see [sitemaps.org](https://sitemaps.org/protocol.html) for specification details.

Your web server or sitemap plugin must also allow robots to crawl your sitemap. If you see any permission errors that would be the first thing to check.

In the future, support for sitemap indices, multiple sitemaps and more advanced sitemap parsing will hopefully be added.
Your web server or sitemap plugin must also allow robots to crawl your sitemap. If you see any permission errors, that would be the first thing to check. Certain security solutions like Cloudflare also block crawlers, so whitelisting the server you're running the package from may also be preferable.

Your sitemap URL should be passed in as the positional argument for `url` when running `psi` from the command line.

### Sitemap Index

Support for sitemap index detection was recently added. This requires no additional action on your part. Simply pass your sitemap index in as the `url` argument via the CLI.

If a sitemap index is detected, the package will recursively gather the URLs listed in each sitemap in your sitemap index and include them in requests. If a standard sitemap file is passed, only that sitemap will be processed.

## Command Line Arguments

If you've installed `pyspeedinsights` with `pip`, the default command to run cli commands is `psi`.
Expand Down
4 changes: 2 additions & 2 deletions src/pyspeedinsights/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from .api.response import process_excel, process_json
from .cli import commands
from .core.excel import ExcelWorkbook
from .core.sitemap import parse_sitemap, request_sitemap
from .core.sitemap import process_sitemap, request_sitemap


def main():
Expand Down Expand Up @@ -42,7 +42,7 @@ def main():
# Create list of request URLs based on sitemap.
sitemap_url = url
sitemap = request_sitemap(sitemap_url)
request_urls = parse_sitemap(sitemap)
request_urls = process_sitemap(sitemap)
request_urls = list(set(request_urls)) # Remove duplicates if they exist.
else:
# For analyzing a single page, only process the requested URL.
Expand Down
83 changes: 63 additions & 20 deletions src/pyspeedinsights/core/sitemap.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,26 @@


def request_sitemap(url):
"""Retrieve the sitemap from the URL provided in cmd args."""
"""Retrieve the sitemap from the given URL"""

url = validate_url(url)

if validate_sitemap_url(url) is not True:
err = "Invalid sitemap provided. Please provide a link to a valid XML sitemap."
# Set a dummy user agent to avoid bot detection by firewalls
# e.g. CloudFlare issues a 403 if it detects the default requests module user-agent
dummy_user_agent = (
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/104.0.5112.79 Safari/537.36"
)
headers = {"user-agent": dummy_user_agent}

if not validate_sitemap_url(url):
err = (
"Invalid sitemap URL provided. Please provide a URL to a valid XML sitemap."
)
raise SystemExit(err)
try:
print(f"Requesting sitemap... ({url})")
resp = requests.get(url)
resp = requests.get(url, headers=headers)
resp.raise_for_status()
except requests.exceptions.HTTPError as errh:
raise SystemExit(errh)
Expand All @@ -34,26 +44,59 @@ def request_sitemap(url):
return sitemap


def parse_sitemap(sitemap):
"""Parse URLs from the XML sitemap and return a list of URLs."""
def validate_sitemap_url(url):
    """Validate that the sitemap URL is valid (.xml format).

    Only the path component of the URL is inspected, so a query string or
    fragment after the file name does not affect the result. The extension
    comparison is case-insensitive, so ``sitemap.XML`` is accepted as well
    as ``sitemap.xml``.

    Returns True when the path ends in an ``.xml`` extension, else False.
    """

    # NOTE(review): urlsplit/splitext are imported above the visible hunk;
    # presumably urllib.parse.urlsplit and os.path.splitext - confirm.
    path = urlsplit(url).path
    ext = splitext(path)[1]
    return ext.lower() == ".xml"


def process_sitemap(sitemap):
    """
    Process an individual sitemap or recursively process multiple sitemaps
    via a sitemap index and return a full list of request URLs.

    The root element of the parsed XML decides what happens:

    * ``sitemapindex``: every sitemap URL listed in the index is requested
      with ``request_sitemap`` and processed recursively; the results are
      concatenated in index order.
    * ``urlset``: the page URLs are parsed directly from the document.

    Raises SystemExit if the root element is neither of the above, instead
    of failing later with an UnboundLocalError on the return value.
    """

    root = ET.fromstring(sitemap)
    # Tags are namespace-qualified ("{namespace}urlset"); keep the local name.
    sitemap_type = root.tag.split("}")[-1]

    if sitemap_type == "sitemapindex":
        request_urls = []
        for sm_url in _parse_sitemap_index(root):
            # Use a separate name so the `sitemap` argument is not rebound.
            child_sitemap = request_sitemap(sm_url)
            request_urls.extend(process_sitemap(child_sitemap))
        return request_urls

    if sitemap_type == "urlset":
        return _parse_sitemap_urls(root)

    err = (
        f"Unsupported sitemap format (root element '{sitemap_type}'). "
        "Please provide a URL to a valid XML sitemap or sitemap index."
    )
    raise SystemExit(err)

def _parse_sitemap_index(root):
    """Return the list of sitemap URLs found in a sitemap index root element."""

    print("Sitemap index found. Parsing sitemap URLs...")
    # Index entries are <sitemap> elements rather than <url> elements.
    sitemap_urls = _parse_urls_from_root(root, type="sitemap")
    return sitemap_urls


def _parse_sitemap_urls(root):
    """Return the list of request URLs found in an XML sitemap root element."""

    print("Parsing URLs from sitemap...")
    # Relies on the helper's default element type ("url").
    request_urls = _parse_urls_from_root(root)
    return request_urls


def _parse_urls_from_root(root, type="url"):
    """Parse URL locs from the root XML element.

    Looks at the direct children of ``root`` named ``type`` in the
    sitemaps.org 0.9 namespace and collects the text of each child's
    ``<loc>`` element, in document order.

    Args:
        root: Parsed XML root element (anything offering ``findall``).
        type: Local name of the entry elements to read: ``"url"`` for a
            sitemap, ``"sitemap"`` for a sitemap index. The name shadows
            the builtin but is kept because callers pass it by keyword.

    Returns:
        A list of URL strings with surrounding whitespace removed. Entries
        with a missing or empty ``<loc>`` are skipped rather than raising
        AttributeError or yielding ``None``.
    """

    namespace = "{http://www.sitemaps.org/schemas/sitemap/0.9}"
    urls = []

    for el in root.findall(f"{namespace}{type}"):
        # findtext gives None when <loc> is absent and "" when it is empty.
        loc_text = el.findtext(f"{namespace}loc")
        if loc_text is None:
            continue
        # Pretty-printed sitemaps may wrap the URL in whitespace/newlines.
        loc_text = loc_text.strip()
        if loc_text:
            urls.append(loc_text)

    return urls

0 comments on commit c9135cb

Please sign in to comment.