From 00aada6534105a613325646a31b964eb035104f7 Mon Sep 17 00:00:00 2001
From: Barbara Hui
Date: Wed, 22 May 2024 11:20:50 -0700
Subject: [PATCH 1/3] Update `sort_title` field to be type keyword, not text

---
 record_indexer/index_templates/record_index_config.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/record_indexer/index_templates/record_index_config.py b/record_indexer/index_templates/record_index_config.py
index 56864756..44b69540 100644
--- a/record_indexer/index_templates/record_index_config.py
+++ b/record_indexer/index_templates/record_index_config.py
@@ -5,9 +5,8 @@
             "number_of_shards": 1,
             "number_of_replicas": 1,
             "analysis": {
-                "analyzer": {
-                    "keyword_lowercase_trim": {
-                        "tokenizer": "keyword",
+                "normalizer": {
+                    "lowercase_trim": {
                         "filter": ["trim", "lowercase"]
                     }
                 }
@@ -40,7 +39,7 @@
                 "temporal": {"type": "text", "fields": {"raw": {"type": "keyword"}}},
                 "type": {"type": "text", "fields": {"raw": {"type": "keyword"}}},
 
-                "sort_title": {"type": "text", "analyzer": "keyword_lowercase_trim"},
+                "sort_title": {"type": "keyword", "normalizer": "lowercase_trim"},
                 "facet_decade": {"type": "text", "fields": {"raw": {"type": "keyword"}}},
 
                 "description": {"type": "text"},

From fabac8a0d692c6781d1e4a796087d6f0b82c0bb3 Mon Sep 17 00:00:00 2001
From: Barbara Hui
Date: Wed, 22 May 2024 15:29:02 -0700
Subject: [PATCH 2/3] Add new script for reindexing rikolti-stg

---
 record_indexer/scripts/reindex_rikolti_stg.py | 103 ++++++++++++++++++
 1 file changed, 103 insertions(+)
 create mode 100644 record_indexer/scripts/reindex_rikolti_stg.py

diff --git a/record_indexer/scripts/reindex_rikolti_stg.py b/record_indexer/scripts/reindex_rikolti_stg.py
new file mode 100644
index 00000000..15956ee1
--- /dev/null
+++ b/record_indexer/scripts/reindex_rikolti_stg.py
@@ -0,0 +1,103 @@
+from datetime import datetime
+import sys
+
+import json
+import requests
+
+from ..utils import print_opensearch_error
+from .. import settings
+
+def main():
+    '''
+    This script:
+    - reindexes all records in the index currently aliased to `rikolti-stg`
+    - removes the old index from the `rikolti-stg` alias
+    - adds the new index to the `rikolti-stg` alias
+
+    NOTE: see TODO below re needing to use the task API to avoid a 504 timeout error
+
+    Make sure that the rikolti index template is up to date with the settings
+    and mappings you want the new index to have before running this script:
+
+    https://github.com/ucldc/rikolti/blob/main/record_indexer/README.md#create-opensearch-index-template
+    '''
+
+    alias = "rikolti-stg"
+    headers = {"Content-Type": "application/json"}
+    auth = settings.get_auth()
+
+    # get name of index currently aliased to rikolti-stg
+    url = f"{settings.ENDPOINT}/_alias/{alias}"
+    r = requests.get(url, auth=settings.get_auth())
+    r.raise_for_status()
+    aliased_indices = [key for key in r.json().keys()]
+    if len(aliased_indices) != 1:
+        raise ValueError(
+            f"Alias `{alias}` has {len(aliased_indices)} aliased indices. There should be 1.")
+    else:
+        source_index = aliased_indices[0]
+
+    # get new index name
+    version = datetime.today().strftime("%Y%m%d%H%M%S")
+    destination_index = f"rikolti-stg-{version}"
+
+    # reindex
+    url = f"{settings.ENDPOINT}/_reindex"
+    data = {
+        "source":{"index": source_index},
+        "dest":{"index": destination_index}
+    }
+    print(f"Reindexing `{source_index}` into `{destination_index}`")
+    r = requests.post(
+        url,
+        headers=headers,
+        auth=auth,
+        data=json.dumps(data)
+    )
+    # this results in a 504 Gateway timeout, but the reindex operation is still running
+    # TODO: need to use the task API to poll and see when the operation has finished
+    # https://opensearch.org/docs/2.11/api-reference/tasks/
+    if not (200 <= r.status_code <= 299):
+        print_opensearch_error(r, url)
+        r.raise_for_status()
+    print(r.json())
+
+    # remove old index from rikolti-stg alias
+    url = f"{settings.ENDPOINT}/_aliases"
+    data = {
+        "actions": [
+            {"remove": {
+                "indices": source_index,
+                "alias": alias
+            }}
+        ]
+    }
+    r = requests.post(
+        url,
+        headers=headers,
+        data=json.dumps(data),
+        auth=auth
+    )
+    if not (200 <= r.status_code <= 299):
+        print_opensearch_error(r, url)
+        r.raise_for_status()
+    print(r.json())
+    print(f"removed `{source_index}` from alias `{alias}`")
+
+    # add new index to rikolti-stg alias
+    url = f"{settings.ENDPOINT}/_aliases"
+    data = {"actions": [{"add": {"index": index, "alias": alias}}]}
+    r = requests.post(
+        url,
+        headers=headers,
+        data=json.dumps(data),
+        auth=auth
+    )
+    if not (200 <= r.status_code <= 299):
+        print_opensearch_error(r, url)
+        r.raise_for_status()
+    print(f"added index `{index}` to alias `{alias}`")
+
+if __name__ == "__main__":
+    main()
+    sys.exit()
\ No newline at end of file

From f21b429594bae48437afe05a4b61aa9bb17d188f Mon Sep 17 00:00:00 2001
From: Barbara Hui
Date: Wed, 22 May 2024 15:37:53 -0700
Subject: [PATCH 3/3] Fix variable name in script

---
 record_indexer/scripts/reindex_rikolti_stg.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/record_indexer/scripts/reindex_rikolti_stg.py b/record_indexer/scripts/reindex_rikolti_stg.py
index 15956ee1..4f47cc09 100644
--- a/record_indexer/scripts/reindex_rikolti_stg.py
+++ b/record_indexer/scripts/reindex_rikolti_stg.py
@@ -86,7 +86,7 @@ def main():
 
     # add new index to rikolti-stg alias
     url = f"{settings.ENDPOINT}/_aliases"
-    data = {"actions": [{"add": {"index": index, "alias": alias}}]}
+    data = {"actions": [{"add": {"index": destination_index, "alias": alias}}]}
     r = requests.post(
         url,
         headers=headers,
@@ -96,7 +96,7 @@ def main():
     if not (200 <= r.status_code <= 299):
         print_opensearch_error(r, url)
         r.raise_for_status()
-    print(f"added index `{index}` to alias `{alias}`")
+    print(f"added index `{destination_index}` to alias `{alias}`")
 
 if __name__ == "__main__":
     main()