diff --git a/metadata_mapper/mappers/flickr/sdasm_mapper.py b/metadata_mapper/mappers/flickr/sdasm_mapper.py index e3a0080ab..1271210c5 100644 --- a/metadata_mapper/mappers/flickr/sdasm_mapper.py +++ b/metadata_mapper/mappers/flickr/sdasm_mapper.py @@ -7,7 +7,7 @@ def UCLDC_map(self): split_description = self.split_description() return { - "description": self.map_description(split_description), + "description": [self.map_description(split_description)], "identifier": list(set(filter(None, [ split_description.get("piction_id"), split_description.get("catalog"), @@ -34,6 +34,14 @@ def get_mapping_configuration(): name. The capture group in the regex is the value that we extract for that field. The entire match (including the parts outside the capture group), are replaced with an empty string in the description field. + The "discard" field indicates whether we want to store the value or + not. The "keep_in_description" field indicates whether, after extraction, + the label and value should remain in the description field. + + Note that a couple of values have both "keep_in_description": True and + "discard": True. This is the same as if the field were not defined here + at all. A secondary goal is to document all the extractable fields even if + they are kept in the description, and not put in any other field. There are two formats that SDASM uses. One separates values with " - ". The other separates with linebreaks. 
The fields that use these two @@ -43,47 +51,49 @@ def get_mapping_configuration(): return [ { "key": "piction_id", - "regex": r"(^| )PictionID: ?(\S+)" + "regex": r"(^| )PictionID: ?(\S+)", + "prepend": "PictionID: " }, { "key": "catalog", - "regex": r"(^| )Catalog: ?(\S+)" + "regex": r"(^| )Catalog: ?(\S+)", + "prepend": "Catalog: " }, { "key": "filename", - "regex": r"(^| )Filename: ?(\S+)" + "regex": r"(^| )Filename: ?(\S+)", + "prepend": "Filename: " }, { "key": "date_on_neg", - "regex": r"(^| )Date on Neg: ?(\S+)" + "regex": r"(^| )Date on Neg: ?(\S+)", + "keep_in_description": True }, { "key": "year", - "regex": r"(^| )Year: ?([^\n]+)\n" + "regex": r"(^| )Year: ?([^\n]+)\n", + "keep_in_description": True }, { "key": "date", "regex": r"(^| )Date: ?(\S+)", + "keep_in_description": True }, { "key": "sdasm_catalog", - "regex": r"^SDASM Catalog #: ?([^\n]+)\n" + "regex": r"^SDASM Catalog #: ?([^\n]+)\n", + "prepend": "SDASM Catalog #: " }, { "key": "corp_name", - "regex": r"^Corp. Name: ?([^\n]+)\n" - }, - { - "key": "title", - "regex": r"^Title: ?([^\n]+)\n" + "regex": r"^Corp. Name: ?([^\n]+)\n", + "discard": True, + "keep_in_description": True }, { "key": "catalog_or_negative_number", - "regex": r"^Catalog or Negative #: ?([^\n]+)\n" - }, - { - "key": "media_negative_size", - "regex": r"^Media +\(negative size\): ?([^\n]+)\n" + "regex": r"^Catalog or Negative #: ?([^\n]+)\n", + "prepend": "Catalog or Negative #: " }, { "key": "description", @@ -91,7 +101,9 @@ def get_mapping_configuration(): }, { "key": "repository", - "regex": r"Repository:()? ?([^\n]*)$" + "regex": r"Repository:()? 
?([^\n]*)$", + "discard": True, + "keep_in_description": True } ] @@ -105,10 +117,13 @@ def split_description(self): if not matches: continue - description = description.replace(matches.group(0), "") + if not field_configuration.get("keep_in_description", False): + description = description.replace(matches.group(0), "") + + prepend = field_configuration.get("prepend", "") description_parts.update({field_configuration.get("key"): - matches.groups()[-1].strip()}) + prepend + matches.groups()[-1].strip()}) # Set the description if it wasn't provided as metadata in the # description field @@ -120,10 +135,6 @@ def split_description(self): def map_description(self, split_description): description = split_description.get("description") - # Get rid of the message wrapped in triple dashes at the end. This only - # works if the repository field is already extracted. - description = re.sub(r"---[^-]+---$", "", description) - # Get rid of multiple " -" which serve as separators description = re.sub(r"( +-){2,}", " -", description)