class CcaVaultValidator(Validator):
    """Validator with CCA Vault specific field validations.

    Registers custom validations for ``is_shown_by``, ``source`` and
    ``description``, and re-registers ``temporal``, ``date``, ``creator``
    and ``format`` with ``ValidationMode.LAX``.
    """

    def __init__(self, **options):
        """Set up the base validator, then register the field overrides.

        Args:
            **options: passed through unchanged to ``Validator.__init__``.
        """
        super().__init__(**options)
        self.add_validatable_field(
            field="is_shown_by", type=str,
            validations=[
                Validator.required_field,
                CcaVaultValidator.str_match_ignore_url_protocol,
                Validator.type_match
            ])
        self.add_validatable_field(
            field="source", type=str,
            validations=[
                CcaVaultValidator.source_content_match,
            ],
            level=ValidationLogLevel.WARNING
        )
        self.add_validatable_field(
            field="description", type=str,
            validations=[CcaVaultValidator.description_match],
            level=ValidationLogLevel.WARNING,
        )
        # These are all modified to indicate order doesn't matter, with
        # validation_mode=ValidationMode.LAX; otherwise they're the same as
        # the default validator.
        for lax_field in ("temporal", "date", "creator", "format"):
            self.add_validatable_field(
                field=lax_field, type=str,
                validations=[Validator.content_match],
                level=ValidationLogLevel.WARNING,
                validation_mode=ValidationMode.LAX
            )

    @staticmethod
    def str_match_ignore_url_protocol(validation_def: dict,
                                      rikolti_value: Any,
                                      comparison_value: Any
                                      ) -> Union[str, None]:
        """Compare two values, accepting an http -> https upgrade.

        A comparison value that starts with ``http://`` is treated as if it
        started with ``https://``. Only that leading scheme is rewritten:
        values already using ``https://`` and any later occurrence of
        ``http`` inside the URL are left untouched.

        Args:
            validation_def: validation definition (unused here).
            rikolti_value: value produced by the Rikolti mapper.
            comparison_value: value to compare against.

        Returns:
            None when the values match, otherwise "Content mismatch".
        """
        if rikolti_value == comparison_value:
            return None

        insecure, secure = "http://", "https://"
        # Non-string values (None, lists, ...) cannot carry a scheme; they
        # fall straight through to the final comparison instead of raising.
        if (isinstance(comparison_value, str)
                and comparison_value.startswith(insecure)):
            comparison_value = secure + comparison_value[len(insecure):]

        if rikolti_value != comparison_value:
            return "Content mismatch"
        return None

    # this represents a known improvement in rikolti's mapping logic
    @staticmethod
    def source_content_match(validation_def: dict, rikolti_value: Any,
                             comparison_value: Any) -> Any:
        """Validate ``source``, tolerating known values with no counterpart.

        When the comparison value is None and the Rikolti value is one of
        the accepted single-item lists, the field passes. Every other case
        is delegated to ``Validator.content_match``.

        Returns:
            None for the accepted cases, otherwise whatever
            ``Validator.content_match`` returns.
        """
        accepted_values = [
            ['Hamaguchi Study Print Collection'],
            ['Capp Street Project Archive'],
            ['CCA/C Archives']
        ]
        if comparison_value is None and rikolti_value in accepted_values:
            return None
        return Validator.content_match(
            validation_def, rikolti_value, comparison_value)

    @staticmethod
    def description_match(validation_def: dict, rikolti_value: Any,
                          comparison_value: Any) -> Any:
        """Validate ``description``, ignoring trailing newlines and spaces.

        If the configured validation mode already considers the two values
        equal, the field passes. Otherwise trailing newline and space
        characters are removed from the comparison value and the result is
        handed to ``Validator.content_match``.

        Returns:
            None when the validation mode's compare succeeds, otherwise
            whatever ``Validator.content_match`` returns.
        """
        if validation_def["validation_mode"].value.compare(
                rikolti_value, comparison_value):
            return None

        if isinstance(comparison_value, str):
            # A bare string is trimmed as a whole, not split into characters.
            new_comparison_value = comparison_value.rstrip("\n ")
        elif isinstance(comparison_value, (list, tuple)):
            new_comparison_value = [
                v.rstrip("\n ") if isinstance(v, str) else v
                for v in comparison_value
            ]
        else:
            # None and other non-sequence values have nothing to trim; let
            # content_match report on them instead of raising here.
            new_comparison_value = comparison_value

        return Validator.content_match(
            validation_def, rikolti_value, new_comparison_value)


class CcaVaultVernacular(OaiVernacular):
    # Pair the CCA Vault record class with its dedicated validator.
    record_cls = CcaVaultRecord
    validator = CcaVaultValidator
def strip_metadata(self, record_metadata):
    """Return a new dict with whitespace stripped from string values.

    String values are stripped directly. For list values a new list is
    built in which every string item is stripped and other items are kept
    as they are. Values of any other type are carried over unchanged.
    """
    def _clean(entry):
        if isinstance(entry, str):
            return entry.strip()
        if isinstance(entry, list):
            return [
                item.strip() if isinstance(item, str) else item
                for item in entry
            ]
        return entry

    return {name: _clean(entry) for name, entry in record_metadata.items()}