Skip to content

Commit

Permalink
[F] Sdasm
Browse files Browse the repository at this point in the history
  • Loading branch information
lthurston committed Apr 13, 2023
1 parent 4c25582 commit 6bbc0fd
Showing 1 changed file with 108 additions and 46 deletions.
154 changes: 108 additions & 46 deletions metadata_mapper/mappers/flickr/sdasm_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,6 @@


# Flickr record for SDASM; pulls identifiers and dates out of the free-text
# description field using labelled-field regexes.
class SdasmRecord(FlickrRecord):

# Field labels whose values are treated as identifiers; each label is turned
# into a regex with identifier_field_regex().
IDENTIFIER_FIELDS = [
"PictionID",
"Catalog",
"Filename"
]

# Field labels whose values are treated as dates; each label is turned into a
# regex with date_field_regex().
DATE_FIELDS = [
"Date on Neg",
"Date"
]

# Regex fragment placed before a label by field_wrap(): the label must sit at
# the start of the string or directly after a " - " separator.
FIELD_BEFORE_LABEL = r"(^| - )"

# Regex fragment placed after a label by field_wrap(): a colon followed by
# optional whitespace.
FIELD_AFTER_LABEL = r":\s*"

def UCLDC_map(self):
return {
"description": self.map_description,
Expand All @@ -29,48 +13,126 @@ def UCLDC_map(self):
def source_description(self):
    """Return the raw description text from the source metadata.

    Reads ``source_metadata["description"]["_content"]``.

    NOTE(review): other methods read this as an attribute
    (``self.source_description``), so it is presumably decorated with
    ``@property`` on a line not visible here -- confirm.

    Returns:
        The ``_content`` value, or None when the description entry is
        missing, is None, or has no ``_content`` key.
    """
    # `or {}` also covers a "description" key that is present but None,
    # which a .get() default alone does not.
    description = self.source_metadata.get("description") or {}
    return description.get("_content")

def date_field_regex(self, field_name):
    """Build the regex that captures a date value for a labelled field.

    Args:
        field_name: label text of the field, e.g. "Date".

    Returns:
        str: the wrapped label pattern from field_wrap() followed by a
        capture group of digits and slashes.
    """
    label_pattern = self.field_wrap(field_name)
    value_pattern = r"([\d/]+)"
    return "".join((label_pattern, value_pattern))

def identifier_field_regex(self, field_name):
    """Build the regex that captures an identifier value for a labelled field.

    Args:
        field_name: label text of the field, e.g. "PictionID".

    Returns:
        str: the wrapped label pattern from field_wrap() followed by a
        capture group of word characters, dots and hyphens.
    """
    label_pattern = self.field_wrap(field_name)
    value_pattern = r"([-.\w]+)"
    return "".join((label_pattern, value_pattern))

def field_wrap(self, field_name):
    """Surround an escaped field label with the before/after label patterns.

    Args:
        field_name: literal label text; it is regex-escaped before use.

    Returns:
        str: FIELD_BEFORE_LABEL + escaped label + FIELD_AFTER_LABEL.
    """
    escaped_label = re.escape(field_name)
    pieces = (self.FIELD_BEFORE_LABEL, escaped_label, self.FIELD_AFTER_LABEL)
    return "".join(pieces)

# Returns the source description unchanged.
# NOTE(review): a second map_description is defined further down in this
# class body; in Python the later definition replaces this one -- confirm
# which is intended to survive.
def map_description(self):
return self.source_description

def get_matches(self, regexes):
@staticmethod
def get_mapping_configuration():
    """Return the regex configuration used to split the description field.

    Each entry is a dict with two keys:

    * "key": the name under which the extracted value is stored.
    * "regex": the pattern to search for. Its last capture group is the
      value extracted for that field; the entire match (including the
      parts outside the capture group) is removed from the description.

    SDASM uses two formats. One separates values with " - ", the other
    separates them with line breaks. Records don't appear to mix the two
    formats, which is why this works. It might be possible to match both
    formats with one pattern, but that doesn't seem necessary at this time.

    The patterns anchored with ``^`` are meant to match at the start of any
    line, so callers are expected to search with ``re.MULTILINE``.

    Returns:
        list[dict]: a freshly built list of field configurations, in the
        order they should be applied.
    """
    return [
        {
            "key": "piction_id",
            "regex": r"(^| - )PictionID: ?(\S+)"
        },
        {
            "key": "catalog",
            "regex": r"(^| - )Catalog: ?(\S+)"
        },
        {
            "key": "filename",
            "regex": r"(^| - )Filename: ?(\S+)"
        },
        # NOTE(review): "date" captures the "Date on Neg:" label while
        # "date_on_neg" (below) captures "Date:". The key names look
        # swapped, but both are consumed by map_date, so they are kept
        # as-is to preserve its output order -- confirm intent.
        {
            "key": "date",
            "regex": r"(^| - )Date on Neg: ?(\S+)"
        },
        {
            "key": "year",
            "regex": r"(^| - )Year: ?([^\n]+)\n"
        },
        {
            "key": "date_on_neg",
            "regex": r"(^| - )Date: ?(\S+)",
        },
        {
            "key": "sdasm_catalog",
            "regex": r"^SDASM Catalog #: ?([^\n]+)\n"
        },
        {
            "key": "corp_name",
            # The dot is escaped so that only a literal "Corp." matches.
            "regex": r"^Corp\. Name: ?([^\n]+)\n"
        },
        {
            "key": "title",
            "regex": r"^Title: ?([^\n]+)\n"
        },
        {
            "key": "catalog_or_negative_number",
            "regex": r"^Catalog or Negative #: ?([^\n]+)\n"
        },
        {
            "key": "media_negative_size",
            "regex": r"^Media +\(negative size\): ?([^\n]+)\n"
        },
        {
            "key": "description",
            "regex": r"Description: ?([^\n]*)\n"
        },
        {
            "key": "repository",
            "regex": r"Repository:(</b>)? ?([^\n]*)$"
        }
    ]

def split_description(self):
    """Split the source description into named metadata values.

    Applies every pattern from get_mapping_configuration() to the source
    description. For each pattern that matches, the stripped text of its
    last capture group is stored under the configured key, and the whole
    matched text is removed from a working copy of the description.

    Patterns are always searched against the original description, not the
    working copy, so removing one field never hides another.

    Returns:
        dict: extracted values keyed by configuration key. A "description"
        entry is always present: either the value following an explicit
        "Description:" label, or whatever text remains after all matched
        fields have been removed. A missing (None) source description is
        treated as an empty string.
    """
    # Normalise None to "" so re.search does not raise on records that
    # have no description at all.
    source = self.source_description or ""
    description = source
    description_parts = {}

    for field_configuration in self.get_mapping_configuration():
        matches = re.search(field_configuration.get("regex"),
                            source, re.MULTILINE)
        if not matches:
            continue

        # Drop the whole labelled field (label + value) from the remainder
        description = description.replace(matches.group(0), "")

        description_parts.update({field_configuration.get("key"):
                                  matches.groups()[-1].strip()})

    # Set the description if it wasn't provided as metadata in the
    # description field
    if "description" not in description_parts:
        description_parts.update({"description": description})

    return description_parts
def map_date(self):
    """Return the date-like values extracted from the description.

    Looks up the "date", "date_on_neg" and "year" entries produced by
    split_description(), in that order, and drops missing or empty ones.

    Returns:
        list: the non-empty values found; an empty list if there are none.
    """
    # Parse the description once; every split_description() call re-runs
    # all of the configured regexes.
    parts = self.split_description()
    keys = ("date", "date_on_neg", "year")
    return list(filter(None, [parts.get(key) for key in keys]))

def map_identifier(self):
    """Return the identifier values extracted from the description.

    Looks up the "piction_id", "catalog", "filename", "sdasm_catalog" and
    "catalog_or_negative_number" entries produced by split_description(),
    in that order, and drops missing or empty ones.

    Returns:
        list | None: the non-empty identifiers found, or None when the
        record has no source description.
    """
    if not self.source_description:
        return

    # Parse the description once; every split_description() call re-runs
    # all of the configured regexes.
    parts = self.split_description()
    keys = (
        "piction_id",
        "catalog",
        "filename",
        "sdasm_catalog",
        "catalog_or_negative_number",
    )
    return list(filter(None, [parts.get(key) for key in keys]))

def map_description(self):
    """Return the cleaned-up description text.

    Takes the "description" entry from split_description(), removes the
    "Please Tag these images" boilerplate, collapses runs of two or more
    " -" separators into a single " -", and drops a " -" left dangling at
    the very start of the text.

    Returns:
        The cleaned description, or the falsy value unchanged (empty
        string or None) when there is no description to clean.
    """
    description = self.split_description().get("description")
    if not description:
        return description

    description = description.replace("---Please Tag these images so that "
                                      "the information can be permanently "
                                      "stored with the digital file.---",
                                      "")
    # Removing fields can leave several separators in a row
    description = re.sub(r"( +-){2,}", " -", description)
    # \A anchors at the absolute start of the string regardless of flags,
    # so no flags are needed here (and re.sub's 4th positional argument is
    # `count`, not `flags`).
    description = re.sub(r"\A -", "", description)

    # Extracting the title, from "Title:" to " - ", is possible, but would
    # require some close analysis to review the results.
    return description


# Flickr vernacular for SDASM; sets record_cls to SdasmRecord.
# NOTE(review): presumably FlickrVernacular instantiates record_cls for each
# fetched record -- confirm in the base class.
class SdasmVernacular(FlickrVernacular):
record_cls = SdasmRecord

0 comments on commit 6bbc0fd

Please sign in to comment.