alaska.py
"""
Look for geologic maps from the Alaska Division of Geological & Geophysical Surveys.
"""
import csv
import sys
import urllib.parse
import bs4
import requests
TIMEOUT = 10 # seconds


def add_hostname(hostname: str, url: str) -> str:
    """
    Ensure that the given URL has a scheme and hostname specified.
    """
    parts = urllib.parse.urlparse(url)
    if not parts.scheme:
        parts = parts._replace(scheme="https")
    if not parts.netloc:
        parts = parts._replace(netloc=hostname)
    return parts.geturl()
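
# A quick illustration (not part of the original script) of how add_hostname
# normalizes the relative links found on the DGGS site; the publication id
# "12345" below is only a placeholder:
#
#     >>> add_hostname("dggs.alaska.gov", "/pubs/id/12345")
#     'https://dggs.alaska.gov/pubs/id/12345'
#     >>> add_hostname("dggs.alaska.gov", "https://example.com/data.zip")
#     'https://example.com/data.zip'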


def get_hrefs(soup: bs4.BeautifulSoup) -> list[str]:
    """
    Return the `href` attributes for all the anchor tags in `soup`.
    """
    urls = []
    for anchor in soup.find_all("a"):
        if url := anchor.get("href", ""):
            urls.append(url)
    return urls
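
# A minimal sketch of what get_hrefs returns for a small HTML fragment
# (the markup here is illustrative, not taken from the DGGS pages); anchors
# without an href attribute are skipped:
#
#     >>> fragment = bs4.BeautifulSoup('<a href="/pubs/id/1">x</a><a>no href</a>', "html.parser")
#     >>> get_hrefs(fragment)
#     ['/pubs/id/1']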


def main() -> None:
    """
    Scrape the Alaska DGGS website and print a CSV file to standard out.

    The CSV file can be processed with `macrostrat maps ingest-from-csv`.
    """
    writer = csv.DictWriter(
        sys.stdout,
        [
            "slug",
            "name",
            "website_url",
            "archive_url",
        ],
    )
    writer.writeheader()

    ## Visit https://dggs.alaska.gov/pubs/ -> Geospatial Data Only = Yes.
    search_url = "https://dggs.alaska.gov/pubs/pubs?title=&author=&pubnumber=&keyword=&keywordWildcard=all&quadrangle=&publisher=All&year=&startyear=&endyear=&digitaldata=Yes&reqtype=Search+Pubs"
    resp = requests.get(search_url, timeout=TIMEOUT)
    soup = bs4.BeautifulSoup(resp.text, "html.parser")

    ## Collect links to individual reports.
    report_urls = [
        add_hostname("dggs.alaska.gov", url)
        for url in get_hrefs(soup)
        if url.startswith("/pubs/id/")
    ]

    ## Scrape each report for links to shapefiles.
    for report_url in report_urls:
        resp = requests.get(report_url, timeout=TIMEOUT)
        soup = bs4.BeautifulSoup(resp.text, "html.parser")
        name = soup.title.text
        slug = "alaska_dggs_" + report_url.split("/")[-1]

        ## Attempt to locate and parse the Geospatial & Analytical Data table.
        for row in soup.find_all("tr"):
            cols = row.find_all("td")
            if len(cols) >= 2 and "Shapefile" in cols[1].text:
                archive_urls = [
                    add_hostname("dggs.alaska.gov", url) for url in get_hrefs(cols[0])
                ]
                for archive_url in archive_urls:
                    writer.writerow(
                        {
                            "slug": slug,
                            "name": name,
                            "website_url": report_url,
                            "archive_url": archive_url,
                        }
                    )


if __name__ == "__main__":
    main()
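
# Example usage (an assumed invocation; the output filename is arbitrary and the
# exact macrostrat command line is not specified beyond the subcommand named in
# main()'s docstring):
#
#     python alaska.py > alaska_dggs.csv
#     # ...then feed alaska_dggs.csv to `macrostrat maps ingest-from-csv`.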