Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CCA Vault Validator ID Workaround #484

Merged
merged 8 commits into from
Oct 17, 2023
103 changes: 99 additions & 4 deletions metadata_mapper/mappers/oai/cca_vault_mapper.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,16 @@
from typing import Union
from typing import Union, Any

from .oai_mapper import OaiRecord, OaiVernacular
from ..mapper import Validator
from ...validator import ValidationLogLevel, ValidationMode


class CcaVaultRecord(OaiRecord):
def UCLDC_map(self):
    """Build the mapping for fields copied directly from the source record.

    Returns:
        A dict with the keys "language" and "source", each holding the
        value stored under the same key in ``self.source_metadata``
        (``None`` when the key is absent).
    """
    metadata = self.source_metadata
    language = metadata.get("language")
    source = metadata.get("source")
    return {"language": language, "source": source}

def map_is_shown_at(self) -> Union[str, None]:
    """Return the "is shown at" value for this record.

    The value is whatever ``identifier_for_image()`` yields, which may be
    ``None``.
    """
    shown_at = self.identifier_for_image()
    return shown_at
Expand All @@ -12,10 +19,8 @@ def map_is_shown_by(self) -> Union[str, None]:
if not self.is_image_type():
return

if not self.source_metadata.get("type", [])[0].lower() != "image":
return

base_url: str = self.identifier_for_image()

return f"{base_url.replace('items', 'thumbs')}?gallery=preview"

def is_image_type(self) -> bool:
Expand All @@ -31,5 +36,95 @@ def identifier_for_image(self) -> Union[str, None]:
return identifier[0] if identifier else None


class CcaVaultValidator(Validator):
    """Validator with CCA Vault-specific rules for a handful of fields.

    The constructor registers custom validations for ``is_shown_by``,
    ``source`` and ``description``, and re-registers ``temporal``, ``date``,
    ``creator`` and ``format`` with ``ValidationMode.LAX``.
    """

    def __init__(self, **options):
        """Register the CCA Vault field validations.

        Args:
            **options: forwarded unchanged to ``Validator.__init__``.
        """
        super().__init__(**options)
        self.add_validatable_field(
            field="is_shown_by", type=str,
            validations=[
                Validator.required_field,
                CcaVaultValidator.str_match_ignore_url_protocol,
                Validator.type_match
            ])
        self.add_validatable_field(
            field="source", type=str,
            validations=[
                CcaVaultValidator.source_content_match,
            ],
            level=ValidationLogLevel.WARNING
        )
        self.add_validatable_field(
            field="description", type=str,
            validations=[CcaVaultValidator.description_match],
            level=ValidationLogLevel.WARNING,
        )
        # These are all modified with validation_mode=ValidationMode.LAX to
        # indicate that order doesn't matter; otherwise they're the same as
        # the default validator.
        for lax_field in ("temporal", "date", "creator", "format"):
            self.add_validatable_field(
                field=lax_field, type=str,
                validations=[Validator.content_match],
                level=ValidationLogLevel.WARNING,
                validation_mode=ValidationMode.LAX
            )

    @staticmethod
    def str_match_ignore_url_protocol(validation_def: dict,
                                      rikolti_value: Any,
                                      comparison_value: Any
                                      ) -> Union[str, None]:
        """Compare two values, treating an http:// comparison URL as https://.

        Args:
            validation_def: validation definition; unused here.
            rikolti_value: the value produced by Rikolti.
            comparison_value: the value to compare against.

        Returns:
            ``None`` when the values match (directly, or after upgrading the
            comparison value's scheme from http to https), otherwise the
            string ``"Content mismatch"``.
        """
        if rikolti_value == comparison_value:
            return None

        # Only rewrite the leading scheme. A blanket replace('http', 'https')
        # would also touch "http" elsewhere in the URL and would turn an
        # existing "https" prefix into "httpss".
        insecure_scheme = "http://"
        if (isinstance(comparison_value, str)
                and comparison_value.startswith(insecure_scheme)):
            comparison_value = (
                "https://" + comparison_value[len(insecure_scheme):])

        if rikolti_value != comparison_value:
            return "Content mismatch"
        return None

    # this represents a known improvement in rikolti's mapping logic
    @staticmethod
    def source_content_match(validation_def: dict, rikolti_value: Any,
                             comparison_value: Any) -> None:
        """Accept known new ``source`` values when the comparison has none.

        Args:
            validation_def: validation definition, passed through to
                ``Validator.content_match``.
            rikolti_value: the value produced by Rikolti.
            comparison_value: the value to compare against.

        Returns:
            ``None`` when ``comparison_value`` is ``None`` and
            ``rikolti_value`` is one of the accepted single-item lists;
            otherwise whatever ``Validator.content_match`` returns.
            NOTE(review): the ``-> None`` annotation is kept from the
            original; confirm content_match's return type before tightening.
        """
        accepted_values = [
            ['Hamaguchi Study Print Collection'],
            ['Capp Street Project Archive'],
            ['CCA/C Archives']
        ]
        if comparison_value is None and rikolti_value in accepted_values:
            return None
        return Validator.content_match(
            validation_def, rikolti_value, comparison_value)

    @staticmethod
    def description_match(validation_def: dict, rikolti_value: Any,
                          comparison_value: Any) -> None:
        """Compare descriptions, tolerating trailing newlines and spaces.

        Args:
            validation_def: validation definition; its
                ``"validation_mode"`` entry supplies the comparison used for
                the first check, and it is passed through to
                ``Validator.content_match``.
            rikolti_value: the value produced by Rikolti.
            comparison_value: the value to compare against.

        Returns:
            ``None`` when the validation mode's comparison already succeeds;
            otherwise whatever ``Validator.content_match`` returns for the
            comparison value with trailing ``"\\n"`` and ``" "`` stripped.
            NOTE(review): the ``-> None`` annotation is kept from the
            original; confirm content_match's return type before tightening.
        """
        if validation_def["validation_mode"].value.compare(
                rikolti_value, comparison_value):
            return None

        # Strip only where it makes sense. A bare string is stripped as a
        # whole rather than split into characters, and None or other
        # non-sequence values go to content_match untouched instead of
        # raising TypeError.
        if isinstance(comparison_value, str):
            new_comparison_value = comparison_value.rstrip("\n ")
        elif isinstance(comparison_value, (list, tuple)):
            new_comparison_value = [
                v.rstrip("\n ") if isinstance(v, str) else v
                for v in comparison_value
            ]
        else:
            new_comparison_value = comparison_value
        return Validator.content_match(
            validation_def, rikolti_value, new_comparison_value)


class CcaVaultVernacular(OaiVernacular):
    """OAI vernacular bound to the CCA Vault record and validator classes."""
    # Record class for this vernacular (presumably instantiated by the base
    # class for each harvested record -- verify against OaiVernacular).
    record_cls = CcaVaultRecord
    # Validator class carrying the CCA Vault-specific field validations.
    validator = CcaVaultValidator
13 changes: 12 additions & 1 deletion metadata_mapper/mappers/oai/oai_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,14 +105,25 @@ def parse(self, api_response):
if sickle_header.deleted:
continue

record = sickle_rec.metadata
record = self.strip_metadata(sickle_rec.metadata)
record['datestamp'] = sickle_header.datestamp
record['id'] = sickle_header.identifier
record['request_url'] = request_url
records.append(record)

return self.get_records(records)

def strip_metadata(self, record_metadata):
    """Return a copy of the metadata with surrounding whitespace removed.

    String values are stripped directly. For list values, each string item
    is stripped and non-string items are kept as they are. Values of any
    other type are copied over unchanged.

    Args:
        record_metadata: mapping of metadata keys to values.

    Returns:
        A new dict with the same keys and the cleaned values.
    """
    def _clean(entry):
        # Strip a single value according to its type.
        if isinstance(entry, str):
            return entry.strip()
        if isinstance(entry, list):
            return [
                item.strip() if isinstance(item, str) else item
                for item in entry
            ]
        return entry

    return {name: _clean(entry) for name, entry in record_metadata.items()}

# lxml parser requires bytes input or XML fragments without declaration,
# so use 'rb' mode
def get_local_api_response(self):
Expand Down
1 change: 1 addition & 0 deletions metadata_mapper/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

# Service endpoints and credentials are read from the environment; each
# setting is False when its variable is unset.
# NOTE(review): COUCH_URL is interpolated into an f-string URL in
# validate_mapping.py, so an unset UCLDC_COUCH_URL yields a "False/..." URL --
# confirm the variable is always set where validation runs.
SOLR_URL = os.environ.get('UCLDC_SOLR_URL', False)
SOLR_API_KEY = os.environ.get('UCLDC_SOLR_API_KEY', False)
COUCH_URL = os.environ.get('UCLDC_COUCH_URL', False)

def local_path(folder, collection_id):
local_path = os.sep.join([
Expand Down
2 changes: 1 addition & 1 deletion metadata_mapper/validate_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ def couch_db_request(collection_id: int, field_name: str) -> list[dict[str, str]

Returns: list[dict]
"""
url = "https://harvest-prd.cdlib.org/" \
url = f"{settings.COUCH_URL}/" \
"couchdb/ucldc/_design/all_provider_docs/" \
"_list/has_field_value/by_provider_name_wdoc" \
f"?key=\"{collection_id}\"&field={field_name}"
Expand Down
Loading