Merge pull request #101 from MITLibraries/TIMX-232-springshare-ids

Timx 232 springshare ids
MITLibraries · Aug 7, 2023 · 4fcb617 · 4fcb617
2 parents 25649ec + 86c4ea2
commit 4fcb617
Show file tree

Hide file tree

Showing 5 changed files with 101 additions and 52 deletions.
diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/tests/test_springshare.py b/tests/test_springshare.py
@@ -12,7 +12,7 @@
 LIBGUIDES_BLANK_OR_MISSING_OPTIONAL_FIELDS_TIMDEX = timdex.TimdexRecord(
     source="LibGuides",
     source_link="https://libguides.mit.edu/materials",
-    timdex_record_id="libguides:materials",
+    timdex_record_id="libguides:guides-175846",
     title="Materials Science & Engineering",
     citation="Materials Science & Engineering. libguides. "
     "https://libguides.mit.edu/materials",
@@ -33,7 +33,7 @@
 RESEARCHDATABASES_BLANK_OR_MISSING_OPTIONAL_FIELDS_TIMDEX = timdex.TimdexRecord(
     source="Research Databases",
     source_link="https://libguides.mit.edu/llba",
-    timdex_record_id="researchdatabases:llba",
+    timdex_record_id="researchdatabases:az-65257807",
     title="Linguistics and Language Behavior Abstracts (LLBA)",
     citation="Linguistics and Language Behavior Abstracts (LLBA). researchdatabases. "
     "https://libguides.mit.edu/llba",
@@ -94,7 +94,7 @@ def test_libguide_transform_with_all_fields_transforms_correctly():
     assert next(output_records) == timdex.TimdexRecord(
         source="LibGuides",
         source_link="https://libguides.mit.edu/materials",
-        timdex_record_id="libguides:materials",
+        timdex_record_id="libguides:guides-175846",
         title="Materials Science & Engineering",
         citation="Ye Li. Materials Science & Engineering. MIT Libraries. libguides. "
         "https://libguides.mit.edu/materials",
@@ -154,7 +154,7 @@ def test_research_databases_transform_with_all_fields_transforms_correctly():
     assert next(output_records) == timdex.TimdexRecord(
         source="Research Databases",
         source_link="https://libguides.mit.edu/llba",
-        timdex_record_id="researchdatabases:llba",
+        timdex_record_id="researchdatabases:az-65257807",
         title="Linguistics and Language Behavior Abstracts (LLBA)",
         citation="Linguistics and Language Behavior Abstracts (LLBA). "
         "researchdatabases. https://libguides.mit.edu/llba",

diff --git a/transmogrifier/sources/ead.py b/transmogrifier/sources/ead.py
@@ -445,8 +445,8 @@ def parse_mixed_value(
         """
         if skipped_elements is None:
             skipped_elements = []
-        if type(item) == NavigableString and item.strip():
+        if isinstance(item, NavigableString) and item.strip():
             yield str(item.strip())
-        elif type(item) == Tag and item.name not in skipped_elements:
+        elif isinstance(item, Tag) and item.name not in skipped_elements:
             for child in item.children:
                 yield from cls.parse_mixed_value(child, skipped_elements)
diff --git a/transmogrifier/sources/springshare.py b/transmogrifier/sources/springshare.py
@@ -80,25 +80,31 @@ def get_links(self, source_record_id: str, xml: Tag) -> Optional[List[timdex.Lin
         ]
 
     @classmethod
-    def get_source_record_id(cls, xml: Tag) -> str:
+    def get_source_link(
+        cls, source_base_url: str, source_record_id: str, xml: Tag
+    ) -> str:
         """
-        Get the source record ID from a Springshare OAI DC XML record.
+        Override for default source_link behavior.
 
-        Overrides metaclass get_source_record_id() method.
+        Springshare resources contain the source link in their dc:identifier fields.
+        However, that URL cannot be reliably split into a record id and recombined
+        with the source base URL: doing so yields either poorly formed TIMDEX record
+        ids or source links that do not work.
 
-        The URL path of the Springshare resource is used as the source record id, which
-        results in a timdex record id like "libguides:materials" or
-        "researchdatabases:llba".  This is preferred over the OAI-PMH identifier, a
-        numeric value, which cannot be used to construct an accessible source link.
+        Example libguides OAI identifier and <dc:identifier>:
+            - oai:libguides.com:guides/175846, https://libguides.mit.edu/materials
+            - oai:libguides.com:guides/175847, https://libguides.mit.edu/c.php?g=175847
 
-        Libguides example:
-            "https://libguides.mit.edu/materials" -> "materials"
+        Example researchdatabases OAI identifier and <dc:identifier>:
+            - oai:libguides.com:az/65257807, https://libguides.mit.edu/llba
 
-        AZ (Research Database) example:
-            "https://libguides.mit.edu/llba" -> "llba"
+        It is therefore preferable to split the OAI header identifier and use the
+        result as the TIMDEX record id, while taking the dc:identifier wholesale as
+        the source link.
 
         Args:
-            xml: A BeautifulSoup Tag representing a single Springshare OAI DC XML record.
+            source_base_url: Source base URL.
+            source_record_id: Record identifier for the source record.
+            xml: A BeautifulSoup Tag representing a single XML record.
         """
-
-        return str(xml.find("dc:identifier").string).split("/")[-1]
+        return str(xml.find("dc:identifier").string)
diff --git a/transmogrifier/sources/transformer.py b/transmogrifier/sources/transformer.py
@@ -117,11 +117,16 @@ def get_required_fields(self, xml: Tag) -> dict:
             xml: A BeautifulSoup Tag representing a single OAI-PMH XML record.
         """
         source_record_id = self.get_source_record_id(xml)
+
+        # run methods to generate required fields
+        source_link = self.get_source_link(self.source_base_url, source_record_id, xml)
+        timdex_record_id = self.get_timdex_record_id(self.source, source_record_id, xml)
         title = self.get_valid_title(source_record_id, xml)
+
         return {
             "source": self.source_name,
-            "source_link": self.source_base_url + source_record_id,
-            "timdex_record_id": f"{self.source}:{source_record_id.replace('/', '-')}",
+            "source_link": source_link,
+            "timdex_record_id": timdex_record_id,
             "title": title,
         }
 
@@ -180,7 +185,7 @@ def get_valid_title(cls, source_record_id: str, xml: Tag) -> str:
                 source_record_id,
                 all_titles,
             )
-        if all_titles and type(all_titles[0]) == str:
+        if all_titles and isinstance(all_titles[0], str):
             title = all_titles[0]
         elif all_titles and all_titles[0].string:
             title = all_titles[0].string
@@ -191,3 +196,41 @@ def get_valid_title(cls, source_record_id: str, xml: Tag) -> str:
             )
             title = "Title not provided"
         return title
+
+    @classmethod
+    def get_source_link(
+        cls, source_base_url: str, source_record_id: str, xml: Tag
+    ) -> str:
+        """
+        Class method to set the source link for the item.
+
+        May be overridden by source subclasses if needed.
+
+        Default behavior is to concatenate the source base URL + source record id.
+
+        Args:
+            source_base_url: Source base URL.
+            source_record_id: Record identifier for the source record.
+            xml: A BeautifulSoup Tag representing a single XML record.
+                - not used by default implementation, but could be useful for subclass
+                    overrides
+        """
+        return source_base_url + source_record_id
+
+    @classmethod
+    def get_timdex_record_id(cls, source: str, source_record_id: str, xml: Tag) -> str:
+        """
+        Class method to set the TIMDEX record id.
+
+        May be overridden by source subclasses if needed.
+
+        Default behavior is "<source>:<source_record_id>", replacing "/" with "-".
+
+        Args:
+            source: Short source identifier used as the id prefix, e.g. "libguides".
+            source_record_id: Record identifier for the source record.
+            xml: A BeautifulSoup Tag representing a single XML record.
+                - not used by default implementation, but could be useful for subclass
+                overrides
+        """
+        return f"{source}:{source_record_id.replace('/', '-')}"