Merge pull request #24 from GSA/feature/waf-extraction
WAF extraction
rshewitt authored Dec 19, 2023
2 parents d15f61f + 7986401 commit e9eccbc
Showing 18 changed files with 1,824 additions and 159 deletions.
6 changes: 6 additions & 0 deletions Makefile
@@ -16,6 +16,12 @@ test: up ## Runs poetry tests, ignores ckan load
up: ## Sets up local docker environment
docker compose up -d

down: ## Shuts down local docker instance
	docker compose down

clean: ## Cleans docker images
docker compose down -v --remove-orphans

lint: ## Lints with ruff
ruff .

42 changes: 42 additions & 0 deletions harvester/extract.py
@@ -1,6 +1,8 @@
import logging
import os

import requests
from bs4 import BeautifulSoup
from requests.exceptions import JSONDecodeError, RequestException

logger = logging.getLogger("harvester")
@@ -26,6 +28,46 @@ def download_dcatus_catalog(url)
return Exception(e)


def traverse_waf(url, files=None, file_ext=".xml", folder="/", filters=None):
    """Recursively walk a WAF (web accessible folder) and collect file URLs."""
    # TODO: add exception handling
    # Avoid mutable default arguments so results do not leak between calls.
    files = [] if files is None else files
    filters = [] if filters is None else filters
    parent = os.path.dirname(url.rstrip("/"))

    folders = []
    res = requests.get(url)
    if res.status_code == 200:
        soup = BeautifulSoup(res.content, "html.parser")
        anchors = soup.find_all("a", href=True)

        for anchor in anchors:
            # Subdirectory links: skip the link back to the parent and any filtered paths.
            if (
                anchor["href"].endswith(folder)
                and not parent.endswith(anchor["href"].rstrip("/"))
                and anchor["href"] not in filters
            ):
                folders.append(os.path.join(url, anchor["href"]))

            # Metadata files: collect anything with the expected extension.
            if anchor["href"].endswith(file_ext):
                files.append(os.path.join(url, anchor["href"]))

    for subfolder in folders:
        traverse_waf(subfolder, files=files, file_ext=file_ext, filters=filters)

    return files


def download_waf(files):
    """Fetch each file URL found by traverse_waf; skip anything that does not return 200."""
    output = []
    for file in files:
        data = {}
        data["url"] = file
        res = requests.get(file)
        if res.status_code == 200:
            data["content"] = res.content
            output.append(data)

    return output


def extract(harvest_source) -> list:
"""Extracts all records from a harvest_source"""
logger.info("Hello from harvester.extract()")
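
The two new helpers are designed to be used together: traverse_waf walks a WAF's directory listing and returns the URLs of the metadata files it finds, and download_waf fetches each one. A minimal usage sketch follows (illustrative only, not part of this commit; the harvest_source wiring inside extract() is not shown in this diff, and the filters value just mirrors the pattern used in tests/extract/test_waf.py):

# Sketch: collect and download every .xml record under a WAF root URL.
from harvester.extract import download_waf, traverse_waf

def extract_waf_records(waf_url):
    """Return [{"url": ..., "content": ...}, ...] for each .xml file under the WAF."""
    file_urls = traverse_waf(waf_url, filters=["../"])
    return download_waf(file_urls)

# e.g. records = extract_waf_records("http://localhost")
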
337 changes: 183 additions & 154 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -20,11 +20,11 @@ repository = "https://github.com/GSA/datagov-harvesting-logic"
[tool.poetry.dependencies]
python = ">=3.10"
jsonschema = ">=4"
requests = ">=2"
python-dotenv = ">=1"
deepdiff = ">=6"
pytest = ">=7.3.2"
ckanapi = ">=4.7"
beautifulsoup4 = "^4.12.2"

[tool.poetry.group.dev.dependencies]
pytest = "^7.3.0"
12 changes: 9 additions & 3 deletions tests/extract/conftest.py
@@ -4,7 +4,7 @@
@pytest.fixture
def get_dcatus_job():
"""example dcatus job payload"""
return "http://localhost/dcatus.json"
return "http://localhost/dcatus/dcatus.json"


@pytest.fixture
@@ -16,10 +16,16 @@ def get_bad_url():
@pytest.fixture
def get_bad_json():
"""example bad json with missing enclosing bracket"""
return "http://localhost/unclosed.json"
return "http://localhost/dcatus/unclosed.json"


@pytest.fixture
def get_no_dataset_key_dcatus_json():
"""example dcatus json with no 'dataset' key"""
return "http://localhost/no_dataset_key.json"
return "http://localhost/dcatus/no_dataset_key.json"


@pytest.fixture
def get_waf_url():
"""example waf"""
return "http://localhost"
9 changes: 9 additions & 0 deletions tests/extract/test_waf.py
@@ -0,0 +1,9 @@
from harvester.extract import download_waf, traverse_waf


def test_traverse_waf(get_waf_url):
files = traverse_waf(get_waf_url, filters=["../", "dcatus/"])
assert len(files) == 7

downloaded_files = download_waf(files)
assert len(downloaded_files) == 7
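
The test relies on the get_waf_url fixture ("http://localhost") and on the local docker environment presumably serving the tests/harvest-sources directory over HTTP; the expected count of 7 corresponds to the .xml fixtures reachable from that root once the "../" and "dcatus/" links are filtered out. As a rough standalone approximation (an assumption, not something this commit ships), that directory could be served with the Python standard library:

# Hypothetical stand-in for the compose-managed web server the tests rely on.
# Port 80 matches the http://localhost fixtures but usually requires elevated
# privileges; adjust the port (and the fixture URLs) outside the compose setup.
from functools import partial
from http.server import HTTPServer, SimpleHTTPRequestHandler

handler = partial(SimpleHTTPRequestHandler, directory="tests/harvest-sources")
HTTPServer(("localhost", 80), handler).serve_forever()
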
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
237 changes: 237 additions & 0 deletions tests/harvest-sources/waf/USGSHydroCached.xml
@@ -0,0 +1,237 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE metadata SYSTEM "https://thor-f5.er.usgs.gov/ngtoc/metadata/fgdc-std-001-1998.dtd">
<metadata>
<idinfo>
<citation>
<citeinfo>
<origin>U.S. Geological Survey</origin>
<pubdate>2018</pubdate>
<title>USGS Hydro Cached Base Map Service from The National Map</title>
<geoform>raster digital data, map service</geoform>
<pubinfo>
<pubplace>Rolla, MO and Denver, CO</pubplace>
<publish>USGS - National Geospatial Technical Operations Center (NGTOC)</publish>
</pubinfo>
<onlink>https://viewer.nationalmap.gov/viewer</onlink>
</citeinfo>
</citation>
<descript>
<abstract>​This service is a cached overlay of a cartographic representation of the National Hydrography Dataset (NHD). ​The NHD is a comprehensive set of digital ​geo​spatial data that encodes information about naturally occurring and constructed bodies of surface water​, paths through which water flows​, ​related features​ such as stream gages and dams​​, and additional hydrologic information​. ​It is available nationwide ​in a 1:24,000-scale seamless dataset, referred to as high resolution NHD. The NHD supports many applications, such as making maps, geocoding observations, flow modeling, data maintenance and stewardship. For additional information, go to http://nhd.usgs.gov. Additional datasets are used for small-scale hydrography representation as well​, including medium resolution NHDPlus published by EPA​; USGS Small-Scale hydrography; and bathymetry from ETOPO1 Global Relief, provided by NOAA National Centers for Environmental Information, U.S. Coastal Relief Model. </abstract>
<purpose>This tile cache base map provides a visualization of the free data that is
available for download from The National Map at https://viewer.nationalmap.gov/viewer. </purpose>
<supplinf></supplinf>
</descript>
<timeperd>
<timeinfo>
<sngdate>
<caldate>2018</caldate>
</sngdate>
</timeinfo>
<current>publication date</current>
</timeperd>
<status>
<progress>Complete</progress>
<update>As needed</update>
</status>
<spdom>
<bounding>
<westbc>-179.999</westbc>
<eastbc>-65</eastbc>
<northbc>71.5</northbc>
<southbc>17.625</southbc>
</bounding>
</spdom>
<keywords>
<theme>
<themekt>NGDA Portfolio Themes</themekt>
<themekey>National Geospatial Data Asset</themekey>
<themekey>NGDA</themekey>
<themekey>Water Inland Theme</themekey>
</theme>
<theme>
<themekt>None</themekt>
<themekey>National Hydrography Dataset</themekey>
<themekey>NHD</themekey>
<themekey>Water</themekey>
<themekey>Rivers</themekey>
<themekey>Streams</themekey>
<themekey>Lakes</themekey>
<themekey>The National Map</themekey>
<themekey>USGS</themekey>
</theme>
<theme>
<themekt>ISO 19115 Topic Category</themekt>
<themekey>imageryBaseMapsEarthCover</themekey>
<themekey>boundaries</themekey>
<themekey>elevation</themekey>
<themekey>inlandWaters</themekey>
</theme>
<theme>
<themekt>The National Map Theme Thesaurus</themekt>
<themekey>Hydrography</themekey>
</theme>
<theme>
<themekt>The National Map Type Thesaurus</themekt>
<themekey>Base Map Service</themekey>
</theme>

<place>
<placekt>Geographic Names Information System</placekt>
<placekey>US</placekey>
<placekey>United States</placekey>
</place>
</keywords>

<accconst>None</accconst>
<useconst>None. Acknowledgement of the originating agencies would be appreciated in products
derived from these data. </useconst>
<browse>
<browsen>https://thor-f5.er.usgs.gov/ngtoc/metadata/waf/services/base_maps/browse/basemap-hydro-cached.png</browsen>
<browsed>Browse graphic showing US representation at small scale.</browsed>
<browset>JPG</browset>
</browse>
<datacred>USGS, NHD Stewards - https://nhd.usgs.gov/stewardship.html#.W2tN29Uzqpo</datacred>
<native>ArcGIS 10.2</native>
</idinfo>
<spdoinfo>
<direct>Raster</direct>
</spdoinfo>
<spref>
<horizsys>
<planar>
<mapproj>
<mapprojn>WGS84 Web Mercator (Auxiliary Sphere)</mapprojn>
<mercator>
<stdparll>0</stdparll>
<longcm>0</longcm>
<feast>0</feast>
<fnorth>0</fnorth>
</mercator>
</mapproj>
<planci>
<plance>coordinate pair</plance>
<coordrep>
<absres>1.000000</absres>
<ordres>1.000000</ordres>
</coordrep>
<plandu>meters</plandu>
</planci>
</planar>
</horizsys>
</spref>
<distinfo>
<distrib>
<cntinfo>
<cntorgp>
<cntorg>U.S. Geological Survey</cntorg>
<cntper>Not Provided</cntper>
</cntorgp>
<cntaddr>
<addrtype>mailing and physical</addrtype>
<address>PO Box 25046 Denver Federal Center</address>
<city>Lakewood</city>
<state>CO</state>
<postal>80225</postal>
</cntaddr>
<cntvoice>1-888-ASK-USGS (1-888-275-8747)</cntvoice>
<cntemail>tnm_help@usgs.gov</cntemail>
</cntinfo>
</distrib>
<resdesc>USGS Hydro-NHD Base Map Service</resdesc>
<distliab>Although these data have been processed successfully on a computer system at the
U.S. Geological Survey, no warranty expressed or implied is made regarding the accuracy or
utility of the data on any other system or for general or scientific purposes, nor shall
the act of distribution constitute any such warranty. This disclaimer applies both to
individual use of the data and aggregate use with other data. It is strongly recommended
that these data are directly acquired from a U.S. Geological Survey server, and not
indirectly through other sources which may have changed the data in some way. It is also
strongly recommended that careful attention be paid to the contents of the metadata file
associated with these data. The U.S. Geological Survey shall not be held liable for
improper or incorrect use of the data described and/or contained herein.</distliab>
<stdorder>
<digform>
<digtinfo>
<formname>Hydro Cached Base Map Service (ArcGIS)</formname>
<formvern>10.21</formvern>
<formspec>https://developers.arcgis.com/rest/</formspec>
</digtinfo>
<digtopt>
<onlinopt>
<computer>
<networka>
<networkr>https://basemap.nationalmap.gov/arcgis/rest/services/USGSHydroCached/MapServer</networkr>
</networka>
</computer>
</onlinopt>
</digtopt>
</digform>
<digform>
<digtinfo>
<formname>Hydro Cached Base Map Service (WMS)</formname>
<formvern>1.3.0</formvern>
<formspec>https://www.opengeospatial.org/standards/wms</formspec>
</digtinfo>
<digtopt>
<onlinopt>
<computer>
<networka>
<networkr>https://basemap.nationalmap.gov/arcgis/services/USGSHydroCached/MapServer/WMSServer?request=GetCapabilities&amp;service=WMS</networkr>
</networka>
</computer>
</onlinopt>
</digtopt>
</digform>
<digform>
<digtinfo>
<formname>Hydro Cached Base Map Service (WMTS)</formname>
<formvern>1.0.0</formvern>
<formspec>https://www.opengeospatial.org/standards/wmts</formspec>
</digtinfo>
<digtopt>
<onlinopt>
<computer>
<networka>
<networkr>https://basemap.nationalmap.gov/arcgis/rest/services/USGSHydroCached/MapServer/WMTS/1.0.0/WMTSCapabilities.xml</networkr>
</networka>
</computer>
</onlinopt>
</digtopt>
</digform>
<fees>None</fees>
</stdorder>
</distinfo>
<metainfo>
<metd>20181128</metd>
<metc>
<cntinfo>
<cntorgp>
<cntorg>U.S. Geological Survey, National Geospatial Technical Operations
Center</cntorg>
<cntper>Not Provided</cntper>
</cntorgp>
<cntaddr>
<addrtype>mailing and physical</addrtype>
<address>1400 Independence Road</address>
<city>Rolla</city>
<state>MO</state>
<postal>65401</postal>
</cntaddr>
<cntaddr>
<addrtype>mailing and physical</addrtype>
<address>PO Box 25046 Denver Federal Center</address>
<city>Lakewood</city>
<state>CO</state>
<postal>80225</postal>
</cntaddr>
<cntvoice>1-888-ASK-USGS (1-888-275-8747)</cntvoice>
<cntemail>tnm_help@usgs.gov</cntemail>
<hours>Monday through Friday, 8:00 AM to 4:30 PM CT</hours>
<cntinst>Metadata information can also be obtained through online services using The
National Map Viewer at https://nationalmap.gov, or EarthExplorer at
https://earthexplorer.usgs.gov, or Ask USGS at https://www.usgs.gov/ask.</cntinst>
</cntinfo>
</metc>
<metstdn>FGDC Content Standard for Digital Geospatial Metadata</metstdn>
<metstdv>FGDC-STD-001-1998</metstdv>
</metainfo>
</metadata>
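
Records collected from a WAF are FGDC XML documents like the one above. As a small illustrative sketch (not part of this commit), a field such as the title can be read out of the bytes returned by download_waf using the same BeautifulSoup dependency the extractor already imports:

# Sketch: pull the <title> element out of a downloaded FGDC record.
from bs4 import BeautifulSoup

def fgdc_title(record):
    soup = BeautifulSoup(record, "html.parser")  # same parser extract.py uses
    title = soup.find("title")
    return title.get_text(strip=True) if title else None

# For the USGSHydroCached.xml record above this yields
# "USGS Hydro Cached Base Map Service from The National Map".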

2 comments on commit e9eccbc

@github-actions

Coverage

Coverage Report
File                 Stmts   Miss   Cover   Missing
harvester
  __init__.py           10      0    100%
  compare.py             5      0    100%
  extract.py            48      7     85%
  load.py               85      2     98%
  transform.py           5      0    100%
harvester/utils
  __init__.py            2      0    100%
  json.py                4      0    100%
harvester/validate
  __init__.py            2      0    100%
  dcat_us.py            24      3     88%
TOTAL                  185     12     94%

Tests   Skipped   Failures   Errors   Time
24      0 💤      0 ❌       0 🔥     33.995s ⏱️

@github-actions

Coverage

Coverage Report
File                 Stmts   Miss   Cover   Missing
harvester
  __init__.py           10      0    100%
  compare.py             5      0    100%
  extract.py            48      7     85%
  load.py               85      2     98%
  transform.py           5      0    100%
harvester/utils
  __init__.py            2      0    100%
  json.py                4      0    100%
harvester/validate
  __init__.py            2      0    100%
  dcat_us.py            24      3     88%
TOTAL                  185     12     94%

Tests   Skipped   Failures   Errors   Time
24      0 💤      0 ❌       0 🔥     27.781s ⏱️
