Skip to content

Commit

Permalink
Address mapping and description parsing feedback
Browse files Browse the repository at this point in the history
Issue #460
  • Loading branch information
lthurston committed Jul 26, 2023
1 parent 32bf4c7 commit 8915635
Showing 1 changed file with 35 additions and 24 deletions.
59 changes: 35 additions & 24 deletions metadata_mapper/mappers/flickr/sdasm_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ def UCLDC_map(self):
split_description = self.split_description()

return {
"description": self.map_description(split_description),
"description": [self.map_description(split_description)],
"identifier": list(set(filter(None, [
split_description.get("piction_id"),
split_description.get("catalog"),
Expand All @@ -34,6 +34,14 @@ def get_mapping_configuration():
name. The capture group in the regex is the value that we extract for
that field. The entire match (including the parts outside the capture
group) is replaced with an empty string in the description field.
The "discard" field indicates whether we want to store the value or
not. The "keep_in_description" field indicates whether, after extraction,
the label and value should remain in the description field.
Note that a couple of values have both "keep_in_description": True and
"discard": True. This is the same as if the field were not defined here
at all. A secondary goal is to document all the extractable fields even if
they are kept in the description, and not put in any other field.
There are two formats that SDASM uses. One separates values with " - ".
The other separates with linebreaks. The fields that use these two
Expand All @@ -43,55 +51,59 @@ def get_mapping_configuration():
return [
{
"key": "piction_id",
"regex": r"(^| )PictionID: ?(\S+)"
"regex": r"(^| )PictionID: ?(\S+)",
"prepend": "PictionID: "
},
{
"key": "catalog",
"regex": r"(^| )Catalog: ?(\S+)"
"regex": r"(^| )Catalog: ?(\S+)",
"prepend": "Catalog: "
},
{
"key": "filename",
"regex": r"(^| )Filename: ?(\S+)"
"regex": r"(^| )Filename: ?(\S+)",
"prepend": "Filename: "
},
{
"key": "date_on_neg",
"regex": r"(^| )Date on Neg: ?(\S+)"
"regex": r"(^| )Date on Neg: ?(\S+)",
"keep_in_description": True
},
{
"key": "year",
"regex": r"(^| )Year: ?([^\n]+)\n"
"regex": r"(^| )Year: ?([^\n]+)\n",
"keep_in_description": True
},
{
"key": "date",
"regex": r"(^| )Date: ?(\S+)",
"keep_in_description": True
},
{
"key": "sdasm_catalog",
"regex": r"^SDASM Catalog #: ?([^\n]+)\n"
"regex": r"^SDASM Catalog #: ?([^\n]+)\n",
"prepend": "SDASM Catalog #: "
},
{
"key": "corp_name",
"regex": r"^Corp. Name: ?([^\n]+)\n"
},
{
"key": "title",
"regex": r"^Title: ?([^\n]+)\n"
"regex": r"^Corp. Name: ?([^\n]+)\n",
"discard": True,
"keep_in_description": True
},
{
"key": "catalog_or_negative_number",
"regex": r"^Catalog or Negative #: ?([^\n]+)\n"
},
{
"key": "media_negative_size",
"regex": r"^Media +\(negative size\): ?([^\n]+)\n"
"regex": r"^Catalog or Negative #: ?([^\n]+)\n",
"prepend": "Catalog or Negative #: "
},
{
"key": "description",
"regex": r"Description: ?([^\n]*)\n"
},
{
"key": "repository",
"regex": r"Repository:(</b>)? ?([^\n]*)$"
"regex": r"Repository:(</b>)? ?([^\n]*)$",
"discard": True,
"keep_in_description": True
}
]

Expand All @@ -105,10 +117,13 @@ def split_description(self):
if not matches:
continue

description = description.replace(matches.group(0), "")
if not field_configuration.get("keep_in_description", False):
description = description.replace(matches.group(0), "")

prepend = field_configuration.get("prepend", "")

description_parts.update({field_configuration.get("key"):
matches.groups()[-1].strip()})
prepend + matches.groups()[-1].strip()})

# Set the description if it wasn't provided as metadata in the
# description field
Expand All @@ -120,10 +135,6 @@ def split_description(self):
def map_description(self, split_description):
description = split_description.get("description")

# Get rid of the message wrapped in triple dashes at the end. This only
# works if the repository field is already extracted.
description = re.sub(r"---[^-]+---$", "", description)

# Get rid of multiple " -" which serve as separators
description = re.sub(r"( +-){2,}", " -", description)

Expand Down

0 comments on commit 8915635

Please sign in to comment.