Commit

[WIP] Implement xml_file fetcher
lthurston committed Jul 19, 2023
1 parent aa897f6 commit 4ee0b6f
Showing 1 changed file with 69 additions and 0 deletions.
69 changes: 69 additions & 0 deletions metadata_fetcher/fetchers/xml_file_fetcher.py
@@ -0,0 +1,69 @@
import json
from .Fetcher import Fetcher, FetchError
import requests
from xml.etree import ElementTree
import settings
import math

class XmlFileFetcher(Fetcher):
    def __init__(self, params: dict):
        """
        Parameters:
            params: dict containing collection_id and harvest_data
        """
        super(XmlFileFetcher, self).__init__(params)

        self.collection_id = params.get("collection_id")
        self.url = params.get("harvest_data").get("url")
        self.per_page = 100

    def fetch_page(self) -> int:
        """
        Fetches the source XML document and writes its records out in pages.

        Returns:
            int: total number of records found
        """
        page = {"url": self.url}
        print(
            f"[{self.collection_id}]: Fetching {page.get('url')}"
        )
        try:
            response = requests.get(**page)
            response.raise_for_status()
        except requests.exceptions.HTTPError:
            raise FetchError(
                f"[{self.collection_id}]: unable to fetch {page.get('url')}")

        return self.fetch_all_pages(response)

    def fetch_all_pages(self, response) -> int:
        """
        Splits the records in the XML response into pages of per_page
        records and writes each page to the configured destination.

        Parameters:
            response: requests.Response
        Returns:
            int: total number of records found
        """
        xml = ElementTree.fromstring(response.text)
        record_nodes = xml.findall(".//record")
        pages = math.ceil(len(record_nodes) / self.per_page)

        for _ in range(pages):
            skip = self.write_page * self.per_page
            items = record_nodes[skip:(skip + self.per_page)]
            content = "".join([ElementTree.tostring(item, encoding="unicode")
                               for item in items])
            if settings.DATA_DEST == 'local':
                self.fetchtolocal(content)
            else:
                self.fetchtos3(content)
            self.write_page += 1
        return len(record_nodes)

    def json(self) -> str:
        """
        This fetcher is run once, then done
        Returns: str
        """
        return json.dumps({"finished": True})
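
A minimal usage sketch follows, assuming the parent Fetcher class initializes write_page and provides fetchtolocal()/fetchtos3(), and that the module is importable as metadata_fetcher.fetchers.xml_file_fetcher (inferred from the file path above); the collection id and URL are hypothetical:

# Hypothetical driver, not part of this commit.
from metadata_fetcher.fetchers.xml_file_fetcher import XmlFileFetcher

params = {
    "collection_id": 12345,  # hypothetical collection id
    "harvest_data": {"url": "https://example.org/records.xml"},  # hypothetical source
}

fetcher = XmlFileFetcher(params)
record_count = fetcher.fetch_page()  # fetches the XML once, writes pages of 100 records each
print(record_count)                  # total number of <record> nodes found
print(fetcher.json())                # {"finished": true}: no further pages to request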
