Implement flickr.sppl mapper

ucldc · Apr 14, 2023 · ea3b54d · ea3b54d
1 parent 30a89ec
commit ea3b54d
Showing 1 changed file with 147 additions and 0 deletions.
diff --git a/metadata_mapper/mappers/flickr/sppl_mapper.py b/metadata_mapper/mappers/flickr/sppl_mapper.py
@@ -0,0 +1,147 @@
+from .flickr_mapper import FlickrRecord, FlickrVernacular
+import re
+
+
+class SpplRecord(FlickrRecord):
+    def UCLDC_map(self):
+        return {
+            "description": self.map_description,
+            "identifier": self.map_identifier,
+            "type": self.map_type,
+            "rights": self.map_rights,
+            "provenance": self.map_provenance
+        }
+
+    @property
+    def source_description(self):
+        return self.source_metadata.get("description", {}).get("_content")
+
+    @staticmethod
+    def get_mapping_configuration():
+        """
+        Provides configuration for locating and extracting metadata values from
+        the description field using regex. The "key" field gives the regex a
+        name. The capture group in the regex is the value that we extract for
+        that field. The entire match (including the parts outside the capture
+        group), are replaced with an empty string in the description field.
+        The "keep" field indicates whether we want to store the value or not, it
+        will be removed from the description string in all cases.
+
+        The `rights_information` regex is a catch all that will capture
+        everything after "Rights Information:" appears in the description,
+        which is why it is last.
+        """
+        return [
+            {
+                "key": "type",
+                "regex": r"Type:([^\n]+)\n\s*\n"
+            },
+            {
+                "key": "provenance",
+                "regex": r"Source:([^\n]+)\n\s*\n"
+            },
+            {
+                "key": "date",
+                "regex": r"Date:([^\n]+)\n\s*\n"
+            },
+            {
+                "key": "identifier",
+                "regex": r"Identifier:([^\n]+)\n\s*\n"
+            },
+            {
+                "key": "owner",
+                "regex": r"Owner:([^\n]+)\n\s*\n",
+                "keep": False
+            },
+            {
+                "key": "previous_identifier_discard",
+                "regex": r"Previous Identifier: *N/A\n\s*\n",
+                "keep": False
+            },
+            {
+                "key": "previous_identifier",
+                "regex": r"Previous Identifier:([^\n]+)\n\s*\n"
+            },
+
+            {
+                "key": "category",
+                "regex": r"Category:([^\n]+)\n\s*\n"
+            },
+            {
+                "key": "rights_information",
+                "regex": r"Rights Information:([\S\s]+)"
+            }
+        ]
+
+    def split_description(self):
+        description = self.source_description
+        description_parts = {}
+
+        for field_configuration in self.get_mapping_configuration():
+            matches = re.search(field_configuration.get("regex"),
+                                self.source_description, re.MULTILINE)
+            if not matches:
+                continue
+
+            description = description.replace(matches.group(0), "")
+
+            if not field_configuration.get("keep", True):
+                continue
+
+            description_parts.update({field_configuration.get("key"):
+                                      matches.group(1).strip()})
+
+        description_parts.update({"description": description})
+
+        return description_parts
+
+    def map_subject(self):
+        subjects = [tag.get("raw") for tag
+                    in self.source_metadata.get("tags", {}).get("tag", [])]
+
+        category = self.split_description().get("category")
+
+        if category:
+            subjects.append(category.lower())
+
+        return subjects
+
+    def map_type(self):
+        return self.split_description().get("type")
+
+    def map_provenance(self):
+        return self.split_description().get("provenance")
+
+    def map_rights(self):
+        return self.split_description().get("rights_information")
+
+    def map_date(self):
+        return self.split_description().get("date")
+
+    def map_description(self):
+        return self.split_description().get("description")
+
+    def map_identifier(self):
+        """
+        Combine `previous_identifier` and `identifier` values from description
+        metadata. The `previous_identifier` may contain an ARK, which we
+        extract.
+        """
+        previous_identifiers = self.split_description().\
+            get("previous_identifier", "")
+        identifiers = [self.split_description().get("identifier")]
+
+        for previous_identifier in previous_identifiers.split(" / "):
+            if "ark:" in previous_identifier:
+                matches = re.search(r"ark:[\/a-z0-9]+", previous_identifier)
+                if matches:
+                    identifiers.append(matches[0])
+            else:
+                identifiers.append(previous_identifier)
+
+        return identifiers
+
+
+class SpplVernacular(FlickrVernacular):
+    """
+    Flickr vernacular that sets `record_cls` to `SpplRecord`.
+
+    NOTE(review): presumably the base vernacular instantiates `record_cls`
+    for each record it reads -- confirm in `FlickrVernacular`.
+    """
+    record_cls = SpplRecord
+