Skip to content

Commit

Permalink
Add script to update sort_collection_data for ETL'd collections
Browse files Browse the repository at this point in the history
  • Loading branch information
amywieliczka committed May 22, 2024
1 parent 8caa442 commit ca1a962
Show file tree
Hide file tree
Showing 2 changed files with 169 additions and 0 deletions.
165 changes: 165 additions & 0 deletions record_indexer/scripts/update_sort_collection_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
import json

import requests

from .. import settings

from utils.registry_client import registry_endpoint

# Registry API listing of the rikolticollection records whose harvest_type is
# "etl". The format=json query parameter is mandatory: registry_endpoint()
# raises KeyError for any url that does not carry it.
etl_collection_url = (
    "https://registry.cdlib.org/api/v1/rikolticollection/"
    + "?harvest_type=etl"
    + "&format=json"
)

class OpensearchClient(object):
    """Thin ``requests``-based wrapper around the two OpenSearch calls used here.

    Every request is sent to the ``endpoint`` and authenticated with the
    ``auth`` object that were handed to the constructor.
    """

    def __init__(self, endpoint, auth):
        """Remember the base URL and the ``requests`` auth for later calls.

        Args:
            endpoint: base URL of the OpenSearch service (no trailing slash;
                paths such as ``/_search`` are appended to it).
            auth: value passed as the ``auth=`` argument of ``requests``.
        """
        self.endpoint = endpoint
        self.auth = auth

    def search(self, **kwargs):
        """Run a search and return the decoded JSON response.

        The keyword arguments are serialized as the JSON request body, so
        callers pass top-level search keys directly (``query=...``,
        ``aggs=...``).
        """
        # NOTE(review): this searches every index behind the endpoint while
        # update_by_query below targets "rikolti-stg" only - confirm that the
        # difference is intended.
        resp = requests.get(
            f"{self.endpoint}/_search",
            headers={"Content-Type": "application/json"},
            auth=self.auth,
            data=json.dumps(kwargs)
        )
        return resp.json()

    def update_by_query(self, **kwargs):
        """POST an ``_update_by_query`` against the ``rikolti-stg`` index.

        The keyword arguments are serialized as the JSON request body.

        Returns:
            The decoded JSON response.

        Raises:
            Exception: if the response reports ``timed_out`` or a non-empty
                ``failures`` list.
            KeyError: if the response carries neither key (for example an
                error payload).
        """
        resp = requests.post(
            f"{self.endpoint}/rikolti-stg/_update_by_query",
            headers={"Content-Type": "application/json"},
            auth=self.auth,
            data=json.dumps(kwargs)
        )
        json_resp = resp.json()

        # The message says TIMEOUT, but this branch also fires for per-document
        # failures; the text is kept unchanged for anyone matching on it.
        if json_resp['timed_out'] or json_resp['failures']:
            raise Exception(f"TIMEOUT ERROR: \n{json.dumps(kwargs)}")

        return json_resp


class FacetError(Exception):
    """The sort_collection_data facet is not a single bucket covering all hits.

    Raised by check_sort_collection_data when the aggregation has a number
    of buckets other than one, or when that bucket's doc_count differs from
    the total hit count.
    """

class FacetValueError(Exception):
    """The single sort_collection_data value is not in the expected shape.

    Raised by check_sort_collection_data when the value does not contain
    "registry.cdlib.org" or has fewer than four "::"-separated parts.
    """

def check_sort_collection_data(response):
    """Verify that a search response shows one rewritable facet value.

    ``response`` is a decoded search response that contains a
    ``sort_collection_data`` terms aggregation and ``hits.total.value``.
    The bucket's count and key are printed before the remaining checks run.

    Returns:
        None when every check passes.

    Raises:
        FacetError: the aggregation does not have exactly one bucket, or
            the bucket's doc_count differs from the total number of hits.
        FacetValueError: the bucket key lacks "registry.cdlib.org" or has
            fewer than four "::"-separated parts.
    """
    facet = response['aggregations']['sort_collection_data']
    total = response['hits']['total']['value']
    buckets = facet['buckets']

    # Exactly one distinct value is expected across the whole collection.
    if len(buckets) != 1:
        raise FacetError(f"Please check facet values: {facet}")

    only_bucket = buckets[0]
    value = only_bucket['key']
    count = only_bucket['doc_count']
    summary = f"{count}/{total} objects with this sort_collection_data value"
    print(summary)
    print(value)

    # Every record matched by the query must carry that one value.
    if count != total:
        raise FacetError(f"Please check count discrepancies: {count}/{total}")

    if "registry.cdlib.org" not in value:
        raise FacetValueError(f"registry.cdlib.org not in \n{value}")

    # N separators delimit N + 1 parts, so "< 3 separators" == "< 4 parts".
    if value.count("::") < 3:
        raise FacetValueError(f"4 or more parts required in \n{value}")

# Interactive, operator-driven rewrite of sort_collection_data for every
# collection returned by the registry's ETL listing. For each collection the
# current value is inspected, a three-part replacement
# "<sortable name>::<display name>::<collection id>" is derived from it, and
# the records are updated in place with an update-by-query.
opensearch = OpensearchClient(settings.ENDPOINT, settings.get_auth())

# (collection, error-or-proposed-value) pairs the operator wants to revisit.
skipped_collections = []
# (collection, error) pairs whose facet errors the operator chose to ignore.
ignored_collections = []
# (collection, error) pairs the operator confirmed are already in the
# three-part form and need no rewrite.
good_collections = []

for collection in registry_endpoint(etl_collection_url):
    collection_id = collection['id']
    print(f"Getting sort_collection_data for collection: {collection_id}")

    response = opensearch.search(
        query={"terms": {"collection_url": [collection_id]}},
        # By default hits.total.value is capped at 10,000; without an exact
        # total, every larger collection would fail the count == total check.
        track_total_hits=True,
        aggs={"sort_collection_data": {
            "terms": {
                "field": "sort_collection_data",
                "size": 10000,
                "order": {"_key": "asc"}
            }
        }}
    )

    try:
        check_sort_collection_data(response)
    except FacetError as e:
        print(e)
        action = input("Please type 'save' to save this error for later, or "
                       "press enter to continue...")
        if action == 'save':
            skipped_collections.append((collection, e))
        else:
            ignored_collections.append((collection, e))
        continue
    except FacetValueError as e:
        print(e)
        action = input("Please type 'save' to save this error for later, or "
                       "press enter to continue if the sort_collection_value "
                       "is already in 3 parts without a registry url")
        if action == 'save':
            skipped_collections.append((collection, e))
        else:
            good_collections.append((collection, e))
        continue

    # check_sort_collection_data passed, so there is exactly one bucket.
    bucket = response['aggregations']['sort_collection_data']['buckets'][0]
    sort_collection_data_value = bucket['key']
    doc_count = bucket['doc_count']

    # Keep the first part, drop the last two, and fold whatever lies between
    # into the display name.
    # NOTE(review): the middle parts are re-joined with a single ':' rather
    # than '::' - presumably deliberate so the result has exactly three
    # '::'-separated parts; confirm.
    sort_collection_data_parts = sort_collection_data_value.split("::")
    sortable_name = sort_collection_data_parts[0]
    display_name = ':'.join(sort_collection_data_parts[1:-2])
    new_value = "::".join([sortable_name, display_name, str(collection_id)])
    print(new_value)

    if len(sort_collection_data_parts) > 4:
        double_check = input("If the new value above looks good, please type "
                             "'yes' to continue...")
        if double_check != "yes":
            skipped_collections.append((collection, new_value))
            # The operator rejected the proposed value: do not write it.
            continue

    # The field is written as a one-element list.
    new_value = [new_value]

    update_response = opensearch.update_by_query(
        query={"terms": {"collection_url": [str(collection_id)]}},
        script={
            "source": "ctx._source.sort_collection_data = params.data",
            "lang": "painless",
            "params": {
                "data": new_value
            }
        }
    )

    # All three numbers must agree. A chained `a != b != c` would only catch
    # the case where both adjacent pairs differ.
    if not (
        update_response['total'] == update_response['updated'] == doc_count
    ):
        raise ValueError(update_response)

    print(update_response)

    input("press enter to continue...")

print('---------------')
print(skipped_collections)
print('---------------')
print(ignored_collections)
print('---------------')
print(good_collections)
print('---------------')
4 changes: 4 additions & 0 deletions utils/registry_client.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
from urllib.parse import urlparse, parse_qs
import requests

def registry_endpoint(url):
if parse_qs(urlparse(url).query).get('format') != ['json']:
raise KeyError("registry_client requires urls with format=json")

page = url
while page:
response = requests.get(url=page)
Expand Down

0 comments on commit ca1a962

Please sign in to comment.