Skip to content

Commit

Permalink
Merge pull request #101 from MITLibraries/TIMX-232-springshare-ids
Browse files Browse the repository at this point in the history
Timx 232 springshare ids
  • Loading branch information
ghukill authored Aug 7, 2023
2 parents 25649ec + 86c4ea2 commit 4fcb617
Show file tree
Hide file tree
Showing 5 changed files with 101 additions and 52 deletions.
58 changes: 29 additions & 29 deletions Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 4 additions & 4 deletions tests/test_springshare.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
LIBGUIDES_BLANK_OR_MISSING_OPTIONAL_FIELDS_TIMDEX = timdex.TimdexRecord(
source="LibGuides",
source_link="https://libguides.mit.edu/materials",
timdex_record_id="libguides:materials",
timdex_record_id="libguides:guides-175846",
title="Materials Science & Engineering",
citation="Materials Science & Engineering. libguides. "
"https://libguides.mit.edu/materials",
Expand All @@ -33,7 +33,7 @@
RESEARCHDATABASES_BLANK_OR_MISSING_OPTIONAL_FIELDS_TIMDEX = timdex.TimdexRecord(
source="Research Databases",
source_link="https://libguides.mit.edu/llba",
timdex_record_id="researchdatabases:llba",
timdex_record_id="researchdatabases:az-65257807",
title="Linguistics and Language Behavior Abstracts (LLBA)",
citation="Linguistics and Language Behavior Abstracts (LLBA). researchdatabases. "
"https://libguides.mit.edu/llba",
Expand Down Expand Up @@ -94,7 +94,7 @@ def test_libguide_transform_with_all_fields_transforms_correctly():
assert next(output_records) == timdex.TimdexRecord(
source="LibGuides",
source_link="https://libguides.mit.edu/materials",
timdex_record_id="libguides:materials",
timdex_record_id="libguides:guides-175846",
title="Materials Science & Engineering",
citation="Ye Li. Materials Science & Engineering. MIT Libraries. libguides. "
"https://libguides.mit.edu/materials",
Expand Down Expand Up @@ -154,7 +154,7 @@ def test_research_databases_transform_with_all_fields_transforms_correctly():
assert next(output_records) == timdex.TimdexRecord(
source="Research Databases",
source_link="https://libguides.mit.edu/llba",
timdex_record_id="researchdatabases:llba",
timdex_record_id="researchdatabases:az-65257807",
title="Linguistics and Language Behavior Abstracts (LLBA)",
citation="Linguistics and Language Behavior Abstracts (LLBA). "
"researchdatabases. https://libguides.mit.edu/llba",
Expand Down
4 changes: 2 additions & 2 deletions transmogrifier/sources/ead.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,8 +445,8 @@ def parse_mixed_value(
"""
if skipped_elements is None:
skipped_elements = []
if type(item) == NavigableString and item.strip():
if isinstance(item, NavigableString) and item.strip():
yield str(item.strip())
elif type(item) == Tag and item.name not in skipped_elements:
elif isinstance(item, Tag) and item.name not in skipped_elements:
for child in item.children:
yield from cls.parse_mixed_value(child, skipped_elements)
34 changes: 20 additions & 14 deletions transmogrifier/sources/springshare.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,25 +80,31 @@ def get_links(self, source_record_id: str, xml: Tag) -> Optional[List[timdex.Lin
]

@classmethod
def get_source_record_id(cls, xml: Tag) -> str:
def get_source_link(
cls, source_base_url: str, source_record_id: str, xml: Tag
) -> str:
"""
Get the source record ID from a Springshare OAI DC XML record.
Override for default source_link behavior.
Overrides metaclass get_source_record_id() method.
Springshare resources contain the source link in their dc:identifier fields.
However, this cannot be reliably split and combined with the source base url,
as this either provides poorly formed timdex record ids OR source links that
do not work.
The URL path of the Springshare resource is used as the source record id, which
results in a timdex record id like "libguides:materials" or
"researchdatabases:llba". This is preferred over the OAI-PMH identifier, a
numeric value, which cannot be used to construct an accessible source link.
Example libguides OAI identifier and <dc:identifier>:
- oai:libguides.com:guides/175846, https://libguides.mit.edu/materials
- oai:libguides.com:guides/175847, https://libguides.mit.edu/c.php?g=175847
Libguides example:
"https://libguides.mit.edu/materials" -> "materials"
Example researchdatabases OAI identifier and <dc:identifier>:
- oai:libguides.com:az/65257807, https://libguides.mit.edu/llba
AZ (Research Database) example:
"https://libguides.mit.edu/llba" -> "llba"
It is preferable to split the OAI header identifier and use this as the TIMDEX
record id, but then take the dc:identifier wholesale and use this for the source
link.
Args:
xml: A BeautifulSoup Tag representing a single Springshare OAI DC XML record.
source_base_url: Source base URL.
source_record_id: Record identifier for the source record.
xml: A BeautifulSoup Tag representing a single XML record.
"""

return str(xml.find("dc:identifier").string).split("/")[-1]
return str(xml.find("dc:identifier").string)
49 changes: 46 additions & 3 deletions transmogrifier/sources/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,11 +117,16 @@ def get_required_fields(self, xml: Tag) -> dict:
xml: A BeautifulSoup Tag representing a single OAI-PMH XML record.
"""
source_record_id = self.get_source_record_id(xml)

# run methods to generate required fields
source_link = self.get_source_link(self.source_base_url, source_record_id, xml)
timdex_record_id = self.get_timdex_record_id(self.source, source_record_id, xml)
title = self.get_valid_title(source_record_id, xml)

return {
"source": self.source_name,
"source_link": self.source_base_url + source_record_id,
"timdex_record_id": f"{self.source}:{source_record_id.replace('/', '-')}",
"source_link": source_link,
"timdex_record_id": timdex_record_id,
"title": title,
}

Expand Down Expand Up @@ -180,7 +185,7 @@ def get_valid_title(cls, source_record_id: str, xml: Tag) -> str:
source_record_id,
all_titles,
)
if all_titles and type(all_titles[0]) == str:
if all_titles and isinstance(all_titles[0], str):
title = all_titles[0]
elif all_titles and all_titles[0].string:
title = all_titles[0].string
Expand All @@ -191,3 +196,41 @@ def get_valid_title(cls, source_record_id: str, xml: Tag) -> str:
)
title = "Title not provided"
return title

@classmethod
def get_source_link(
    cls, source_base_url: str, source_record_id: str, xml: Tag
) -> str:
    """
    Build the source link for a record.

    The default implementation joins the source base URL and the source
    record id with no separator. Source subclasses may override this method
    when a different link is needed.

    Args:
        source_base_url: Source base URL.
        source_record_id: Record identifier for the source record.
        xml: A BeautifulSoup Tag representing a single XML record. Unused by
            this default implementation; available to subclass overrides.

    Returns:
        The source base URL immediately followed by the source record id.
    """
    source_link = source_base_url + source_record_id
    return source_link

@classmethod
def get_timdex_record_id(cls, source: str, source_record_id: str, xml: Tag) -> str:
    """
    Build the TIMDEX record id for a record.

    The default implementation replaces every "/" in the source record id
    with "-" and prefixes the result with the source name and a colon.
    Source subclasses may override this method when a different id is needed.

    Args:
        source: Source name.
        source_record_id: Record identifier for the source record.
        xml: A BeautifulSoup Tag representing a single XML record. Unused by
            this default implementation; available to subclass overrides.

    Returns:
        A string of the form "<source>:<source_record_id with '/' as '-'>".
    """
    # Slashes in the source record id are swapped for hyphens before joining.
    normalized_record_id = source_record_id.replace("/", "-")
    return f"{source}:{normalized_record_id}"

0 comments on commit 4fcb617

Please sign in to comment.