From a0de0a7389e376baaf87ca3537c34572d0218ade Mon Sep 17 00:00:00 2001 From: Kargo Render Date: Wed, 29 Jan 2025 10:17:50 +0000 Subject: [PATCH] render wf: master branch instead of main Kargo Render created this commit by rendering manifests from 05299924ac8e38f2deef61026000c57af77088c8 --- .kargo-render/metadata.yaml | 1 + ...pt-add-to-fermilab-collection-cronjob.yaml | 82 ++ ...milab-collection-ff8f9mmk6t-configmap.yaml | 28 + .../hep-script-amend-pbn-cronjob.yaml | 82 ++ ...script-amend-pbn-f84mgc2959-configmap.yaml | 44 + .../hep-script-change-cnum-cronjob.yaml | 82 ++ ...ript-change-cnum-th6km57527-configmap.yaml | 43 + ...l-cdf-collection-527b6845t2-configmap.yaml | 27 + ...hange-internal-cdf-collection-cronjob.yaml | 82 ++ ...paper-to-article-745f8759cd-configmap.yaml | 63 ++ ...rt-conferencepaper-to-article-cronjob.yaml | 82 ++ ...-harvesting-mess-c4tk6tkt6k-configmap.yaml | 40 + ...cript-fix-aps-harvesting-mess-cronjob.yaml | 82 ++ .../hep-script-fix-arxiv-refs-cronjob.yaml | 82 ++ ...t-fix-arxiv-refs-kfkd72f22b-configmap.yaml | 59 ++ .../hep-script-fix-fermilab-urls-cronjob.yaml | 82 ++ ...ix-fermilab-urls-gk72hkh79g-configmap.yaml | 40 + ...y-jobs-deadlines-5mk6hhkg6b-configmap.yaml | 51 ++ ...ipt-fix-legacy-jobs-deadlines-cronjob.yaml | 82 ++ ...ix-linked-nonexistent-authors-cronjob.yaml | 82 ++ ...existent-authors-t9fdf8h844-configmap.yaml | 210 +++++ ...ac2021-fulltexts-8khdkgdfmt-configmap.yaml | 34 + ...cript-fix-rupac2021-fulltexts-cronjob.yaml | 82 ++ ...ag-export-to-hal-4hmmk2599f-configmap.yaml | 81 ++ ...hep-script-flag-export-to-hal-cronjob.yaml | 82 ++ .../hep-script-force-cds-harvest-cronjob.yaml | 82 ++ ...orce-cds-harvest-d6f829fb6t-configmap.yaml | 35 + .../hep-script-move-jacow-urls-cronjob.yaml | 82 ++ ...-move-jacow-urls-gmgt47mf5m-configmap.yaml | 33 + ...script-move-jetp-letters-urls-cronjob.yaml | 82 ++ ...etp-letters-urls-mdg4tk82f8-configmap.yaml | 32 + ...t-move-jetp-urls-2h5t5tk28d-configmap.yaml | 32 + 
.../hep-script-move-jetp-urls-cronjob.yaml | 82 ++ ...ipt-msnet-add-id-66t2654tct-configmap.yaml | 42 + .../hep-script-msnet-add-id-cronjob.yaml | 82 ++ ...cript-nsr-add-id-6ct6f2h6mh-configmap.yaml | 60 ++ .../hep-script-nsr-add-id-cronjob.yaml | 82 ++ .../hep-script-quant-ph-core-cronjob.yaml | 82 ++ ...pt-quant-ph-core-dbmdcdftmt-configmap.yaml | 26 + ...ve-authors-uuids-2k8bmm85g6-configmap.yaml | 61 ++ ...p-script-remove-authors-uuids-cronjob.yaml | 82 ++ ...t-remove-bai-from-lit-authors-cronjob.yaml | 82 ++ ...from-lit-authors-h6h4cmd942-configmap.yaml | 43 + ...ration-agreement-54b9h8mbb6-configmap.yaml | 63 ++ ...ern-aff-cooperation-agreement-cronjob.yaml | 82 ++ ...pt-remove-classifier-keywords-cronjob.yaml | 82 ++ ...ssifier-keywords-ddfmmfgcd4-configmap.yaml | 37 + ...ion-from-authors-2t8b8h4md9-configmap.yaml | 70 ++ ...curated-relation-from-authors-cronjob.yaml | 82 ++ ...e-pdfa-extension-b8hh7gtm72-configmap.yaml | 29 + ...-script-remove-pdfa-extension-cronjob.yaml | 82 ++ ...-remove-quant-ph-ck777tfgfh-configmap.yaml | 51 ++ .../hep-script-remove-quant-ph-cronjob.yaml | 82 ++ ...remove-wrong-ipac2023-authors-cronjob.yaml | 82 ++ ...ipac2023-authors-m25648bt42-configmap.yaml | 38 + ...order-babar-document-versions-cronjob.yaml | 82 ++ ...ocument-versions-ggd5ch4htk-configmap.yaml | 37 + ...eplace-europhys-lett-with-epl-cronjob.yaml | 82 ++ ...ys-lett-with-epl-hb47t84mh2-configmap.yaml | 30 + ...urophys-lett-with-epl-in-refs-cronjob.yaml | 82 ++ ...with-epl-in-refs-k785hm494h-configmap.yaml | 35 + ...-related-records-b8hcg96c74-configmap.yaml | 105 +++ ...restore-babar-related-records-cronjob.yaml | 82 ++ .../hep-script-set-refereed-cronjob.yaml | 82 ++ ...ipt-set-refereed-thm689762g-configmap.yaml | 782 ++++++++++++++++++ ...nowmass-add-link-7hgkd2h64k-configmap.yaml | 39 + .../hep-script-snowmass-add-link-cronjob.yaml | 82 ++ .../hep-script-test-5c66cgkk94-configmap.yaml | 7 + curation-scripts/hep-script-test-cronjob.yaml | 82 ++ 
...journal-title-for-aps-physcis-cronjob.yaml | 82 ++ ...-for-aps-physcis-t5474b7fgb-configmap.yaml | 58 ++ 71 files changed, 5336 insertions(+) create mode 100644 .kargo-render/metadata.yaml create mode 100644 curation-scripts/hep-script-add-to-fermilab-collection-cronjob.yaml create mode 100644 curation-scripts/hep-script-add-to-fermilab-collection-ff8f9mmk6t-configmap.yaml create mode 100644 curation-scripts/hep-script-amend-pbn-cronjob.yaml create mode 100644 curation-scripts/hep-script-amend-pbn-f84mgc2959-configmap.yaml create mode 100644 curation-scripts/hep-script-change-cnum-cronjob.yaml create mode 100644 curation-scripts/hep-script-change-cnum-th6km57527-configmap.yaml create mode 100644 curation-scripts/hep-script-change-internal-cdf-collection-527b6845t2-configmap.yaml create mode 100644 curation-scripts/hep-script-change-internal-cdf-collection-cronjob.yaml create mode 100644 curation-scripts/hep-script-convert-conferencepaper-to-article-745f8759cd-configmap.yaml create mode 100644 curation-scripts/hep-script-convert-conferencepaper-to-article-cronjob.yaml create mode 100644 curation-scripts/hep-script-fix-aps-harvesting-mess-c4tk6tkt6k-configmap.yaml create mode 100644 curation-scripts/hep-script-fix-aps-harvesting-mess-cronjob.yaml create mode 100644 curation-scripts/hep-script-fix-arxiv-refs-cronjob.yaml create mode 100644 curation-scripts/hep-script-fix-arxiv-refs-kfkd72f22b-configmap.yaml create mode 100644 curation-scripts/hep-script-fix-fermilab-urls-cronjob.yaml create mode 100644 curation-scripts/hep-script-fix-fermilab-urls-gk72hkh79g-configmap.yaml create mode 100644 curation-scripts/hep-script-fix-legacy-jobs-deadlines-5mk6hhkg6b-configmap.yaml create mode 100644 curation-scripts/hep-script-fix-legacy-jobs-deadlines-cronjob.yaml create mode 100644 curation-scripts/hep-script-fix-linked-nonexistent-authors-cronjob.yaml create mode 100644 curation-scripts/hep-script-fix-linked-nonexistent-authors-t9fdf8h844-configmap.yaml create mode 100644 
curation-scripts/hep-script-fix-rupac2021-fulltexts-8khdkgdfmt-configmap.yaml create mode 100644 curation-scripts/hep-script-fix-rupac2021-fulltexts-cronjob.yaml create mode 100644 curation-scripts/hep-script-flag-export-to-hal-4hmmk2599f-configmap.yaml create mode 100644 curation-scripts/hep-script-flag-export-to-hal-cronjob.yaml create mode 100644 curation-scripts/hep-script-force-cds-harvest-cronjob.yaml create mode 100644 curation-scripts/hep-script-force-cds-harvest-d6f829fb6t-configmap.yaml create mode 100644 curation-scripts/hep-script-move-jacow-urls-cronjob.yaml create mode 100644 curation-scripts/hep-script-move-jacow-urls-gmgt47mf5m-configmap.yaml create mode 100644 curation-scripts/hep-script-move-jetp-letters-urls-cronjob.yaml create mode 100644 curation-scripts/hep-script-move-jetp-letters-urls-mdg4tk82f8-configmap.yaml create mode 100644 curation-scripts/hep-script-move-jetp-urls-2h5t5tk28d-configmap.yaml create mode 100644 curation-scripts/hep-script-move-jetp-urls-cronjob.yaml create mode 100644 curation-scripts/hep-script-msnet-add-id-66t2654tct-configmap.yaml create mode 100644 curation-scripts/hep-script-msnet-add-id-cronjob.yaml create mode 100644 curation-scripts/hep-script-nsr-add-id-6ct6f2h6mh-configmap.yaml create mode 100644 curation-scripts/hep-script-nsr-add-id-cronjob.yaml create mode 100644 curation-scripts/hep-script-quant-ph-core-cronjob.yaml create mode 100644 curation-scripts/hep-script-quant-ph-core-dbmdcdftmt-configmap.yaml create mode 100644 curation-scripts/hep-script-remove-authors-uuids-2k8bmm85g6-configmap.yaml create mode 100644 curation-scripts/hep-script-remove-authors-uuids-cronjob.yaml create mode 100644 curation-scripts/hep-script-remove-bai-from-lit-authors-cronjob.yaml create mode 100644 curation-scripts/hep-script-remove-bai-from-lit-authors-h6h4cmd942-configmap.yaml create mode 100644 curation-scripts/hep-script-remove-cern-aff-cooperation-agreement-54b9h8mbb6-configmap.yaml create mode 100644 
curation-scripts/hep-script-remove-cern-aff-cooperation-agreement-cronjob.yaml create mode 100644 curation-scripts/hep-script-remove-classifier-keywords-cronjob.yaml create mode 100644 curation-scripts/hep-script-remove-classifier-keywords-ddfmmfgcd4-configmap.yaml create mode 100644 curation-scripts/hep-script-remove-curated-relation-from-authors-2t8b8h4md9-configmap.yaml create mode 100644 curation-scripts/hep-script-remove-curated-relation-from-authors-cronjob.yaml create mode 100644 curation-scripts/hep-script-remove-pdfa-extension-b8hh7gtm72-configmap.yaml create mode 100644 curation-scripts/hep-script-remove-pdfa-extension-cronjob.yaml create mode 100644 curation-scripts/hep-script-remove-quant-ph-ck777tfgfh-configmap.yaml create mode 100644 curation-scripts/hep-script-remove-quant-ph-cronjob.yaml create mode 100644 curation-scripts/hep-script-remove-wrong-ipac2023-authors-cronjob.yaml create mode 100644 curation-scripts/hep-script-remove-wrong-ipac2023-authors-m25648bt42-configmap.yaml create mode 100644 curation-scripts/hep-script-reorder-babar-document-versions-cronjob.yaml create mode 100644 curation-scripts/hep-script-reorder-babar-document-versions-ggd5ch4htk-configmap.yaml create mode 100644 curation-scripts/hep-script-replace-europhys-lett-with-epl-cronjob.yaml create mode 100644 curation-scripts/hep-script-replace-europhys-lett-with-epl-hb47t84mh2-configmap.yaml create mode 100644 curation-scripts/hep-script-replace-europhys-lett-with-epl-in-refs-cronjob.yaml create mode 100644 curation-scripts/hep-script-replace-europhys-lett-with-epl-in-refs-k785hm494h-configmap.yaml create mode 100644 curation-scripts/hep-script-restore-babar-related-records-b8hcg96c74-configmap.yaml create mode 100644 curation-scripts/hep-script-restore-babar-related-records-cronjob.yaml create mode 100644 curation-scripts/hep-script-set-refereed-cronjob.yaml create mode 100644 curation-scripts/hep-script-set-refereed-thm689762g-configmap.yaml create mode 100644 
curation-scripts/hep-script-snowmass-add-link-7hgkd2h64k-configmap.yaml create mode 100644 curation-scripts/hep-script-snowmass-add-link-cronjob.yaml create mode 100644 curation-scripts/hep-script-test-5c66cgkk94-configmap.yaml create mode 100644 curation-scripts/hep-script-test-cronjob.yaml create mode 100644 curation-scripts/hep-script-update-journal-title-for-aps-physcis-cronjob.yaml create mode 100644 curation-scripts/hep-script-update-journal-title-for-aps-physcis-t5474b7fgb-configmap.yaml diff --git a/.kargo-render/metadata.yaml b/.kargo-render/metadata.yaml new file mode 100644 index 0000000..62fb7b9 --- /dev/null +++ b/.kargo-render/metadata.yaml @@ -0,0 +1 @@ +sourceCommit: 05299924ac8e38f2deef61026000c57af77088c8 diff --git a/curation-scripts/hep-script-add-to-fermilab-collection-cronjob.yaml b/curation-scripts/hep-script-add-to-fermilab-collection-cronjob.yaml new file mode 100644 index 0000000..b9e56a7 --- /dev/null +++ b/curation-scripts/hep-script-add-to-fermilab-collection-cronjob.yaml @@ -0,0 +1,82 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + labels: + app.kubernetes.io/component: job + name: hep-script-add-to-fermilab-collection +spec: + jobTemplate: + metadata: + labels: + app.kubernetes.io/component: job + spec: + completionMode: Indexed + completions: 10 + parallelism: 10 + template: + metadata: + labels: + app.kubernetes.io/component: job + spec: + containers: + - args: + - shell + - /usr/local/src/script.py + command: + - inspirehep + env: + - name: SENTRY_ENVIRONMENT + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POSTGRESQL_HOST + value: inspire-qa-db-cluster-pooler-rw.inspire-qa.svc + - name: POSTGRESQL_PORT + value: "5432" + - name: POSTGRESQL_USER + valueFrom: + secretKeyRef: + key: user + name: postgres-inspire-pguser-hep + - name: POSTGRESQL_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: postgres-inspire-pguser-hep + - name: JOB_COMPLETIONS + value: "10" + envFrom: + - configMapRef: + name: 
hep-defaults + - configMapRef: + name: hep-globals + - configMapRef: + name: hep-feature-flags + - secretRef: + name: hep-creds + image: registry.cern.ch/docker.io/inspirehep/hep + name: hep + volumeMounts: + - mountPath: /usr/local/var/instance/inspirehep_api.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /usr/local/var/instance/inspirehep.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /home/invenio + name: invenio-home + - mountPath: /usr/local/src/script.py + name: hep-script + subPath: script.py + restartPolicy: Never + volumes: + - configMap: + name: hep-cfg + name: hep-cfg + - emptyDir: {} + name: invenio-home + - configMap: + name: hep-script-add-to-fermilab-collection-ff8f9mmk6t + name: hep-script + schedule: '@yearly' + suspend: true diff --git a/curation-scripts/hep-script-add-to-fermilab-collection-ff8f9mmk6t-configmap.yaml b/curation-scripts/hep-script-add-to-fermilab-collection-ff8f9mmk6t-configmap.yaml new file mode 100644 index 0000000..6136e2a --- /dev/null +++ b/curation-scripts/hep-script-add-to-fermilab-collection-ff8f9mmk6t-configmap.yaml @@ -0,0 +1,28 @@ +apiVersion: v1 +data: + script.py: | + from inspirehep.curation.search_check_do import SearchCheckDo + + + class AddToFermilabCollection(SearchCheckDo): + """Add records with Fermilab report numbers to the Fermilab collection.""" + + query = "r FERMILAB* -_collections:Fermilab" + + @staticmethod + def check(record, logger, state): + reports = record.get_value("report_numbers.value", []) + logger.info("Report numbers in record", reports=reports) + if "Fermilab" in record["_collections"]: + return False + return any(report.lower().startswith("fermilab") for report in reports) + + @staticmethod + def do(record, logger, state): + record["_collections"].append("Fermilab") + + + AddToFermilabCollection() +kind: ConfigMap +metadata: + name: hep-script-add-to-fermilab-collection-ff8f9mmk6t diff --git a/curation-scripts/hep-script-amend-pbn-cronjob.yaml 
b/curation-scripts/hep-script-amend-pbn-cronjob.yaml new file mode 100644 index 0000000..4f0dfba --- /dev/null +++ b/curation-scripts/hep-script-amend-pbn-cronjob.yaml @@ -0,0 +1,82 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + labels: + app.kubernetes.io/component: job + name: hep-script-amend-pbn +spec: + jobTemplate: + metadata: + labels: + app.kubernetes.io/component: job + spec: + completionMode: Indexed + completions: 10 + parallelism: 10 + template: + metadata: + labels: + app.kubernetes.io/component: job + spec: + containers: + - args: + - shell + - /usr/local/src/script.py + command: + - inspirehep + env: + - name: SENTRY_ENVIRONMENT + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POSTGRESQL_HOST + value: inspire-qa-db-cluster-pooler-rw.inspire-qa.svc + - name: POSTGRESQL_PORT + value: "5432" + - name: POSTGRESQL_USER + valueFrom: + secretKeyRef: + key: user + name: postgres-inspire-pguser-hep + - name: POSTGRESQL_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: postgres-inspire-pguser-hep + - name: JOB_COMPLETIONS + value: "10" + envFrom: + - configMapRef: + name: hep-defaults + - configMapRef: + name: hep-globals + - configMapRef: + name: hep-feature-flags + - secretRef: + name: hep-creds + image: registry.cern.ch/docker.io/inspirehep/hep + name: hep + volumeMounts: + - mountPath: /usr/local/var/instance/inspirehep_api.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /usr/local/var/instance/inspirehep.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /home/invenio + name: invenio-home + - mountPath: /usr/local/src/script.py + name: hep-script + subPath: script.py + restartPolicy: Never + volumes: + - configMap: + name: hep-cfg + name: hep-cfg + - emptyDir: {} + name: invenio-home + - configMap: + name: hep-script-amend-pbn-f84mgc2959 + name: hep-script + schedule: '@yearly' + suspend: true diff --git a/curation-scripts/hep-script-amend-pbn-f84mgc2959-configmap.yaml 
b/curation-scripts/hep-script-amend-pbn-f84mgc2959-configmap.yaml new file mode 100644 index 0000000..a944f75 --- /dev/null +++ b/curation-scripts/hep-script-amend-pbn-f84mgc2959-configmap.yaml @@ -0,0 +1,44 @@ +apiVersion: v1 +data: + script.py: | + from inspirehep.curation.search_check_do import SearchCheckDo + + tag_info = ("parent_isbn", "9789811947506") + + missing_info = { + "journal_title": "Springer Proc.Math.Stat.", + "journal_volume": "396", + "parent_record": {"$ref": "https://inspirehep.net/api/literature/2628642"}, + } + + + class AmendPBN(SearchCheckDo): + """Add missing info to PBNs with tag""" + + query = "publication_info.%s:%s" % tag_info + + @staticmethod + def check(record, logger, state): + # flag PBN containing tag info + + state["pos_tag"] = [] + for npbn, pbn in enumerate(record.get("publication_info", [])): + tag = pbn.get(tag_info[0], "") + if tag == tag_info[1]: + state["pos_tag"].append(npbn) + if state["pos_tag"]: + return True + return False + + @staticmethod + def do(record, logger, state): + # append missing info + + for npbn in state["pos_tag"]: + record["publication_info"][npbn].update(missing_info) + + + AmendPBN() +kind: ConfigMap +metadata: + name: hep-script-amend-pbn-f84mgc2959 diff --git a/curation-scripts/hep-script-change-cnum-cronjob.yaml b/curation-scripts/hep-script-change-cnum-cronjob.yaml new file mode 100644 index 0000000..5fed4a8 --- /dev/null +++ b/curation-scripts/hep-script-change-cnum-cronjob.yaml @@ -0,0 +1,82 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + labels: + app.kubernetes.io/component: job + name: hep-script-change-cnum +spec: + jobTemplate: + metadata: + labels: + app.kubernetes.io/component: job + spec: + completionMode: Indexed + completions: 10 + parallelism: 10 + template: + metadata: + labels: + app.kubernetes.io/component: job + spec: + containers: + - args: + - shell + - /usr/local/src/script.py + command: + - inspirehep + env: + - name: SENTRY_ENVIRONMENT + valueFrom: + fieldRef: + 
fieldPath: metadata.namespace + - name: POSTGRESQL_HOST + value: inspire-qa-db-cluster-pooler-rw.inspire-qa.svc + - name: POSTGRESQL_PORT + value: "5432" + - name: POSTGRESQL_USER + valueFrom: + secretKeyRef: + key: user + name: postgres-inspire-pguser-hep + - name: POSTGRESQL_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: postgres-inspire-pguser-hep + - name: JOB_COMPLETIONS + value: "10" + envFrom: + - configMapRef: + name: hep-defaults + - configMapRef: + name: hep-globals + - configMapRef: + name: hep-feature-flags + - secretRef: + name: hep-creds + image: registry.cern.ch/docker.io/inspirehep/hep + name: hep + volumeMounts: + - mountPath: /usr/local/var/instance/inspirehep_api.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /usr/local/var/instance/inspirehep.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /home/invenio + name: invenio-home + - mountPath: /usr/local/src/script.py + name: hep-script + subPath: script.py + restartPolicy: Never + volumes: + - configMap: + name: hep-cfg + name: hep-cfg + - emptyDir: {} + name: invenio-home + - configMap: + name: hep-script-change-cnum-th6km57527 + name: hep-script + schedule: '@yearly' + suspend: true diff --git a/curation-scripts/hep-script-change-cnum-th6km57527-configmap.yaml b/curation-scripts/hep-script-change-cnum-th6km57527-configmap.yaml new file mode 100644 index 0000000..73cc7db --- /dev/null +++ b/curation-scripts/hep-script-change-cnum-th6km57527-configmap.yaml @@ -0,0 +1,43 @@ +apiVersion: v1 +data: + script.py: | + from inspirehep.curation.search_check_do import SearchCheckDo + + wrong_cnum = "C20-05-18.1" + new_cnum = "C21-05-31" + new_conf_record = "https://inspirehep.net/api/conferences/1812458" + + + class ChangeCNUM(SearchCheckDo): + """Wrong CNUM assigned - replace by correct information""" + + query = "publication_info.cnum:%s" % wrong_cnum + + @staticmethod + def check(record, logger, state): + # flag PBN with wrong CNUM + + state["pos_cnum"] = [] + for 
npbn, pbn in enumerate(record.get("publication_info", [])): + cnum = pbn.get("cnum", "") + if cnum == wrong_cnum: + state["pos_cnum"].append(npbn) + if state["pos_cnum"]: + return True + return False + + @staticmethod + def do(record, logger, state): + # replace CNUM and conference record + + for npbn in state["pos_cnum"]: + record["publication_info"][npbn]["cnum"] = new_cnum + record["publication_info"][npbn]["conference_record"] = ( + "{'$ref': '%s'}" % new_conf_record + ) + + + ChangeCNUM() +kind: ConfigMap +metadata: + name: hep-script-change-cnum-th6km57527 diff --git a/curation-scripts/hep-script-change-internal-cdf-collection-527b6845t2-configmap.yaml b/curation-scripts/hep-script-change-internal-cdf-collection-527b6845t2-configmap.yaml new file mode 100644 index 0000000..9faf725 --- /dev/null +++ b/curation-scripts/hep-script-change-internal-cdf-collection-527b6845t2-configmap.yaml @@ -0,0 +1,27 @@ +apiVersion: v1 +data: + script.py: | + from inspirehep.curation.search_check_do import SearchCheckDo + + + class ChangeInternalCDFCollection(SearchCheckDo): + """Ensure all CDF Internal Notes are really private.""" + + query = '_collections:"CDF Internal Notes"' + + @staticmethod + def check(record, logger, state): + return ( + len(record["_collections"]) > 1 + and "CDF Internal Notes" in record["_collections"] + ) + + @staticmethod + def do(record, logger, state): + record["_collections"] = ["CDF Internal Notes"] + + + ChangeInternalCDFCollection() +kind: ConfigMap +metadata: + name: hep-script-change-internal-cdf-collection-527b6845t2 diff --git a/curation-scripts/hep-script-change-internal-cdf-collection-cronjob.yaml b/curation-scripts/hep-script-change-internal-cdf-collection-cronjob.yaml new file mode 100644 index 0000000..f4ac6a9 --- /dev/null +++ b/curation-scripts/hep-script-change-internal-cdf-collection-cronjob.yaml @@ -0,0 +1,82 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + labels: + app.kubernetes.io/component: job + name: 
hep-script-change-internal-cdf-collection +spec: + jobTemplate: + metadata: + labels: + app.kubernetes.io/component: job + spec: + completionMode: Indexed + completions: 10 + parallelism: 10 + template: + metadata: + labels: + app.kubernetes.io/component: job + spec: + containers: + - args: + - shell + - /usr/local/src/script.py + command: + - inspirehep + env: + - name: SENTRY_ENVIRONMENT + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POSTGRESQL_HOST + value: inspire-qa-db-cluster-pooler-rw.inspire-qa.svc + - name: POSTGRESQL_PORT + value: "5432" + - name: POSTGRESQL_USER + valueFrom: + secretKeyRef: + key: user + name: postgres-inspire-pguser-hep + - name: POSTGRESQL_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: postgres-inspire-pguser-hep + - name: JOB_COMPLETIONS + value: "10" + envFrom: + - configMapRef: + name: hep-defaults + - configMapRef: + name: hep-globals + - configMapRef: + name: hep-feature-flags + - secretRef: + name: hep-creds + image: registry.cern.ch/docker.io/inspirehep/hep + name: hep + volumeMounts: + - mountPath: /usr/local/var/instance/inspirehep_api.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /usr/local/var/instance/inspirehep.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /home/invenio + name: invenio-home + - mountPath: /usr/local/src/script.py + name: hep-script + subPath: script.py + restartPolicy: Never + volumes: + - configMap: + name: hep-cfg + name: hep-cfg + - emptyDir: {} + name: invenio-home + - configMap: + name: hep-script-change-internal-cdf-collection-527b6845t2 + name: hep-script + schedule: '@yearly' + suspend: true diff --git a/curation-scripts/hep-script-convert-conferencepaper-to-article-745f8759cd-configmap.yaml b/curation-scripts/hep-script-convert-conferencepaper-to-article-745f8759cd-configmap.yaml new file mode 100644 index 0000000..d734396 --- /dev/null +++ b/curation-scripts/hep-script-convert-conferencepaper-to-article-745f8759cd-configmap.yaml @@ 
-0,0 +1,63 @@ +apiVersion: v1 +data: + script.py: | + from inspirehep.curation.search_check_do import SearchCheckDo + + cnums = [ + "C20-09-14.4", + "C19-09-23.10", + "C18-09-17.9", + "C17-09-11.10", + "C16-09-26.2", + "C14-09-22.10", + "C13-09-23.8", + "C12-09-17.11", + "C11-09-26.11", + ] + + + class ConvertConferencePapertoArticle(SearchCheckDo): + """SIF Conferences have no Proceedings, convert to regular articles""" + + query = " or ".join(["publication_info.cnum:%s" % cnum for cnum in cnums]) + + @staticmethod + def check(record, logger, state): + # process records with given CNUM + # dont process other CNUMs + # dont process proceedings + + state["pos_cnum"] = [] + if "proceedings" in record["document_type"]: + return False + for npbn, pbn in enumerate(record.get("publication_info", [])): + cnum = pbn.get("cnum", "") + if cnum in cnums: + state["pos_cnum"].append(npbn) + elif cnum: + return False + if state["pos_cnum"]: + return True + return False + + @staticmethod + def do(record, logger, state): + # remove CNUM and conference record + # remove doc_type conference paper + # add doc_type article + # add refereed + for npbn in state["pos_cnum"]: + record["publication_info"][npbn].pop("cnum", "") + record["publication_info"][npbn].pop("conference_record", "") + + if "conference paper" in record["document_type"]: + record["document_type"].remove("conference paper") + if "article" not in record["document_type"]: + record["document_type"].append("article") + record["refereed"] = True + + + ConvertConferencePapertoArticle() +kind: ConfigMap +metadata: + name: hep-script-convert-conferencepaper-to-article-745f8759cd diff --git a/curation-scripts/hep-script-convert-conferencepaper-to-article-cronjob.yaml b/curation-scripts/hep-script-convert-conferencepaper-to-article-cronjob.yaml new file mode 100644 index 0000000..ed198ea --- /dev/null +++ b/curation-scripts/hep-script-convert-conferencepaper-to-article-cronjob.yaml @@ -0,0 +1,82 @@ +apiVersion: batch/v1 +kind: 
CronJob +metadata: + labels: + app.kubernetes.io/component: job + name: hep-script-convert-conferencepaper-to-article +spec: + jobTemplate: + metadata: + labels: + app.kubernetes.io/component: job + spec: + completionMode: Indexed + completions: 10 + parallelism: 10 + template: + metadata: + labels: + app.kubernetes.io/component: job + spec: + containers: + - args: + - shell + - /usr/local/src/script.py + command: + - inspirehep + env: + - name: SENTRY_ENVIRONMENT + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POSTGRESQL_HOST + value: inspire-qa-db-cluster-pooler-rw.inspire-qa.svc + - name: POSTGRESQL_PORT + value: "5432" + - name: POSTGRESQL_USER + valueFrom: + secretKeyRef: + key: user + name: postgres-inspire-pguser-hep + - name: POSTGRESQL_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: postgres-inspire-pguser-hep + - name: JOB_COMPLETIONS + value: "10" + envFrom: + - configMapRef: + name: hep-defaults + - configMapRef: + name: hep-globals + - configMapRef: + name: hep-feature-flags + - secretRef: + name: hep-creds + image: registry.cern.ch/docker.io/inspirehep/hep + name: hep + volumeMounts: + - mountPath: /usr/local/var/instance/inspirehep_api.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /usr/local/var/instance/inspirehep.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /home/invenio + name: invenio-home + - mountPath: /usr/local/src/script.py + name: hep-script + subPath: script.py + restartPolicy: Never + volumes: + - configMap: + name: hep-cfg + name: hep-cfg + - emptyDir: {} + name: invenio-home + - configMap: + name: hep-script-convert-conferencepaper-to-article-745f8759cd + name: hep-script + schedule: '@yearly' + suspend: true diff --git a/curation-scripts/hep-script-fix-aps-harvesting-mess-c4tk6tkt6k-configmap.yaml b/curation-scripts/hep-script-fix-aps-harvesting-mess-c4tk6tkt6k-configmap.yaml new file mode 100644 index 0000000..03fee9b --- /dev/null +++ 
b/curation-scripts/hep-script-fix-aps-harvesting-mess-c4tk6tkt6k-configmap.yaml @@ -0,0 +1,40 @@ +apiVersion: v1 +data: + script.py: | + from itertools import permutations + + from inspirehep.curation.search_check_do import SearchCheckDo + + + class FixAPSHarvestingMess(SearchCheckDo): + """Fix metadata issues caused by harvesting APS when fulltext API was broken.""" + + query = "doi 10.1103* and du 2023-11-28->2024-01-01" + + @staticmethod + def check(record, logger, state): + state["to_delete"] = [] + + enumerated_pubinfo_pairs = permutations( + enumerate(record.get("publication_info", [])), r=2 + ) + for (i, pubinfo1), (_, pubinfo2) in enumerated_pubinfo_pairs: + if pubinfo1.items() <= pubinfo2.items(): + state["to_delete"].append(i) + + return bool(state["to_delete"]) + + @staticmethod + def do(record, logger, state): + new_pubinfo = [ + p + for (i, p) in enumerate(record["publication_info"]) + if i not in state["to_delete"] + ] + record["publication_info"] = new_pubinfo + + + FixAPSHarvestingMess() +kind: ConfigMap +metadata: + name: hep-script-fix-aps-harvesting-mess-c4tk6tkt6k diff --git a/curation-scripts/hep-script-fix-aps-harvesting-mess-cronjob.yaml b/curation-scripts/hep-script-fix-aps-harvesting-mess-cronjob.yaml new file mode 100644 index 0000000..aaf4115 --- /dev/null +++ b/curation-scripts/hep-script-fix-aps-harvesting-mess-cronjob.yaml @@ -0,0 +1,82 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + labels: + app.kubernetes.io/component: job + name: hep-script-fix-aps-harvesting-mess +spec: + jobTemplate: + metadata: + labels: + app.kubernetes.io/component: job + spec: + completionMode: Indexed + completions: 10 + parallelism: 10 + template: + metadata: + labels: + app.kubernetes.io/component: job + spec: + containers: + - args: + - shell + - /usr/local/src/script.py + command: + - inspirehep + env: + - name: SENTRY_ENVIRONMENT + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POSTGRESQL_HOST + value: 
inspire-qa-db-cluster-pooler-rw.inspire-qa.svc + - name: POSTGRESQL_PORT + value: "5432" + - name: POSTGRESQL_USER + valueFrom: + secretKeyRef: + key: user + name: postgres-inspire-pguser-hep + - name: POSTGRESQL_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: postgres-inspire-pguser-hep + - name: JOB_COMPLETIONS + value: "10" + envFrom: + - configMapRef: + name: hep-defaults + - configMapRef: + name: hep-globals + - configMapRef: + name: hep-feature-flags + - secretRef: + name: hep-creds + image: registry.cern.ch/docker.io/inspirehep/hep + name: hep + volumeMounts: + - mountPath: /usr/local/var/instance/inspirehep_api.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /usr/local/var/instance/inspirehep.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /home/invenio + name: invenio-home + - mountPath: /usr/local/src/script.py + name: hep-script + subPath: script.py + restartPolicy: Never + volumes: + - configMap: + name: hep-cfg + name: hep-cfg + - emptyDir: {} + name: invenio-home + - configMap: + name: hep-script-fix-aps-harvesting-mess-c4tk6tkt6k + name: hep-script + schedule: '@yearly' + suspend: true diff --git a/curation-scripts/hep-script-fix-arxiv-refs-cronjob.yaml b/curation-scripts/hep-script-fix-arxiv-refs-cronjob.yaml new file mode 100644 index 0000000..4840270 --- /dev/null +++ b/curation-scripts/hep-script-fix-arxiv-refs-cronjob.yaml @@ -0,0 +1,82 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + labels: + app.kubernetes.io/component: job + name: hep-script-fix-arxiv-refs +spec: + jobTemplate: + metadata: + labels: + app.kubernetes.io/component: job + spec: + completionMode: Indexed + completions: 10 + parallelism: 10 + template: + metadata: + labels: + app.kubernetes.io/component: job + spec: + containers: + - args: + - shell + - /usr/local/src/script.py + command: + - inspirehep + env: + - name: SENTRY_ENVIRONMENT + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POSTGRESQL_HOST + value: 
inspire-qa-db-cluster-pooler-rw.inspire-qa.svc + - name: POSTGRESQL_PORT + value: "5432" + - name: POSTGRESQL_USER + valueFrom: + secretKeyRef: + key: user + name: postgres-inspire-pguser-hep + - name: POSTGRESQL_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: postgres-inspire-pguser-hep + - name: JOB_COMPLETIONS + value: "10" + envFrom: + - configMapRef: + name: hep-defaults + - configMapRef: + name: hep-globals + - configMapRef: + name: hep-feature-flags + - secretRef: + name: hep-creds + image: registry.cern.ch/docker.io/inspirehep/hep + name: hep + volumeMounts: + - mountPath: /usr/local/var/instance/inspirehep_api.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /usr/local/var/instance/inspirehep.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /home/invenio + name: invenio-home + - mountPath: /usr/local/src/script.py + name: hep-script + subPath: script.py + restartPolicy: Never + volumes: + - configMap: + name: hep-cfg + name: hep-cfg + - emptyDir: {} + name: invenio-home + - configMap: + name: hep-script-fix-arxiv-refs-kfkd72f22b + name: hep-script + schedule: '@yearly' + suspend: true diff --git a/curation-scripts/hep-script-fix-arxiv-refs-kfkd72f22b-configmap.yaml b/curation-scripts/hep-script-fix-arxiv-refs-kfkd72f22b-configmap.yaml new file mode 100644 index 0000000..37f80a5 --- /dev/null +++ b/curation-scripts/hep-script-fix-arxiv-refs-kfkd72f22b-configmap.yaml @@ -0,0 +1,59 @@ +apiVersion: v1 +data: + script.py: | + from itertools import chain + from inspirehep.curation.search_check_do import SearchCheckDo + from inspire_schemas.utils import is_arxiv, normalize_arxiv + + + class FixArxivRefs(SearchCheckDo): + """Identify arXiv references in DOIs and URLs.""" + + query = ( + "references.reference.dois:10.48550* " + "or references.reference.urls.value:'arxiv' " + "or references.reference.urls.value:'arXiv' " + "or references.reference.urls.value:'ARXIV'" + ) + + @staticmethod + def check(record, logger, state): + 
dois = chain.from_iterable(record.get_value("references.reference.dois")) + urls = chain.from_iterable(record.get_value("references.reference.urls.value")) + return any("arxiv" in val.lower() for val in chain(dois, urls)) + + @staticmethod + def do(record, logger, state): + for reference in record["references"]: + reference = reference.get("reference", {}) + + if "arxiv_eprint" in reference: + continue + + new_dois = [] + for doi in reference.get("dois", []): + if is_arxiv(doi): + reference["arxiv_eprint"] = normalize_arxiv(doi) + else: + new_dois.append(doi) + if new_dois: + reference["dois"] = new_dois + else: + reference.pop("dois", None) + + new_urls = [] + for url in reference.get("urls", []): + if is_arxiv(url["value"]): + reference["arxiv_eprint"] = normalize_arxiv(url["value"]) + else: + new_urls.append(url) + if new_urls: + reference["urls"] = new_urls + else: + reference.pop("urls", None) + + + FixArxivRefs() +kind: ConfigMap +metadata: + name: hep-script-fix-arxiv-refs-kfkd72f22b diff --git a/curation-scripts/hep-script-fix-fermilab-urls-cronjob.yaml b/curation-scripts/hep-script-fix-fermilab-urls-cronjob.yaml new file mode 100644 index 0000000..7a847aa --- /dev/null +++ b/curation-scripts/hep-script-fix-fermilab-urls-cronjob.yaml @@ -0,0 +1,82 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + labels: + app.kubernetes.io/component: job + name: hep-script-fix-fermilab-urls +spec: + jobTemplate: + metadata: + labels: + app.kubernetes.io/component: job + spec: + completionMode: Indexed + completions: 10 + parallelism: 10 + template: + metadata: + labels: + app.kubernetes.io/component: job + spec: + containers: + - args: + - shell + - /usr/local/src/script.py + command: + - inspirehep + env: + - name: SENTRY_ENVIRONMENT + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POSTGRESQL_HOST + value: inspire-qa-db-cluster-pooler-rw.inspire-qa.svc + - name: POSTGRESQL_PORT + value: "5432" + - name: POSTGRESQL_USER + valueFrom: + secretKeyRef: + 
key: user + name: postgres-inspire-pguser-hep + - name: POSTGRESQL_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: postgres-inspire-pguser-hep + - name: JOB_COMPLETIONS + value: "10" + envFrom: + - configMapRef: + name: hep-defaults + - configMapRef: + name: hep-globals + - configMapRef: + name: hep-feature-flags + - secretRef: + name: hep-creds + image: registry.cern.ch/docker.io/inspirehep/hep + name: hep + volumeMounts: + - mountPath: /usr/local/var/instance/inspirehep_api.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /usr/local/var/instance/inspirehep.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /home/invenio + name: invenio-home + - mountPath: /usr/local/src/script.py + name: hep-script + subPath: script.py + restartPolicy: Never + volumes: + - configMap: + name: hep-cfg + name: hep-cfg + - emptyDir: {} + name: invenio-home + - configMap: + name: hep-script-fix-fermilab-urls-gk72hkh79g + name: hep-script + schedule: '@yearly' + suspend: true diff --git a/curation-scripts/hep-script-fix-fermilab-urls-gk72hkh79g-configmap.yaml b/curation-scripts/hep-script-fix-fermilab-urls-gk72hkh79g-configmap.yaml new file mode 100644 index 0000000..0115a79 --- /dev/null +++ b/curation-scripts/hep-script-fix-fermilab-urls-gk72hkh79g-configmap.yaml @@ -0,0 +1,40 @@ +apiVersion: v1 +data: + script.py: | + import requests + + from inspirehep.curation.search_check_do import SearchCheckDo + + FIXED_URLS = requests.get( + "https://cernbox.cern.ch/remote.php/dav/public-files/" + "FVgeaG5VAVx8B09/fermilab_fixed_urls.json" + ).json() + + + class FixFermilabURLs(SearchCheckDo): + """Rewrite URLs to Fermilab PDFs that have moved.""" + + query = "urls.value:'ccd.fnal.gov'" + + @staticmethod + def check(record, logger, state): + for url in record.get_value("urls.value", []): + if "ccd.fnal.gov" not in url: + continue + if url in FIXED_URLS: + return True + logger.warning("URL not found in translation map", url=url) + return False + + 
@staticmethod + def do(record, logger, state): + for url in record["urls"]: + value = url["value"] + if "ccd.fnal.gov" in value: + url["value"] = FIXED_URLS.get(value, value) + + + FixFermilabURLs() +kind: ConfigMap +metadata: + name: hep-script-fix-fermilab-urls-gk72hkh79g diff --git a/curation-scripts/hep-script-fix-legacy-jobs-deadlines-5mk6hhkg6b-configmap.yaml b/curation-scripts/hep-script-fix-legacy-jobs-deadlines-5mk6hhkg6b-configmap.yaml new file mode 100644 index 0000000..208a079 --- /dev/null +++ b/curation-scripts/hep-script-fix-legacy-jobs-deadlines-5mk6hhkg6b-configmap.yaml @@ -0,0 +1,51 @@ +apiVersion: v1 +data: + script.py: | + from inspirehep.search.api import JobsSearch + from inspirehep.curation.search_check_do import SearchCheckDo + + + class FixLegacyJobsDeadlines(SearchCheckDo): + """Fix legacy jobs with fake 'deadline_date=3000'""" + + search_class = JobsSearch + query = "deadline_date:3000" + + @staticmethod + def check(record, logger, state): + has_deadline_3000 = record.get("deadline_date") == "3000" + if has_deadline_3000: + return True + return False + + @staticmethod + def do(record, logger, state): + legacy_version = record.get("legacy_version") + if legacy_version: + year, month, day = ( + legacy_version[:4], + legacy_version[4:6], + legacy_version[6:8], + ) + date = f"{year}-{month}-{day}" + record["deadline_date"] = date + record.setdefault("_private_notes", []).append( + { + "value": "Record with no deadline," + " fake 'deadline_date' derived from 'legacy_version'" + } + ) + else: + record["deadline_date"] = record["legacy_creation_date"] + record.setdefault("_private_notes", []).append( + { + "value": "Record with no deadline," + " fake 'deadline_date' derived from 'legacy_creation_date'" + } + ) + + + FixLegacyJobsDeadlines() +kind: ConfigMap +metadata: + name: hep-script-fix-legacy-jobs-deadlines-5mk6hhkg6b diff --git a/curation-scripts/hep-script-fix-legacy-jobs-deadlines-cronjob.yaml 
b/curation-scripts/hep-script-fix-legacy-jobs-deadlines-cronjob.yaml new file mode 100644 index 0000000..482d2cf --- /dev/null +++ b/curation-scripts/hep-script-fix-legacy-jobs-deadlines-cronjob.yaml @@ -0,0 +1,82 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + labels: + app.kubernetes.io/component: job + name: hep-script-fix-legacy-jobs-deadlines +spec: + jobTemplate: + metadata: + labels: + app.kubernetes.io/component: job + spec: + completionMode: Indexed + completions: 10 + parallelism: 10 + template: + metadata: + labels: + app.kubernetes.io/component: job + spec: + containers: + - args: + - shell + - /usr/local/src/script.py + command: + - inspirehep + env: + - name: SENTRY_ENVIRONMENT + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POSTGRESQL_HOST + value: inspire-qa-db-cluster-pooler-rw.inspire-qa.svc + - name: POSTGRESQL_PORT + value: "5432" + - name: POSTGRESQL_USER + valueFrom: + secretKeyRef: + key: user + name: postgres-inspire-pguser-hep + - name: POSTGRESQL_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: postgres-inspire-pguser-hep + - name: JOB_COMPLETIONS + value: "10" + envFrom: + - configMapRef: + name: hep-defaults + - configMapRef: + name: hep-globals + - configMapRef: + name: hep-feature-flags + - secretRef: + name: hep-creds + image: registry.cern.ch/docker.io/inspirehep/hep + name: hep + volumeMounts: + - mountPath: /usr/local/var/instance/inspirehep_api.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /usr/local/var/instance/inspirehep.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /home/invenio + name: invenio-home + - mountPath: /usr/local/src/script.py + name: hep-script + subPath: script.py + restartPolicy: Never + volumes: + - configMap: + name: hep-cfg + name: hep-cfg + - emptyDir: {} + name: invenio-home + - configMap: + name: hep-script-fix-legacy-jobs-deadlines-5mk6hhkg6b + name: hep-script + schedule: '@yearly' + suspend: true diff --git 
a/curation-scripts/hep-script-fix-linked-nonexistent-authors-cronjob.yaml b/curation-scripts/hep-script-fix-linked-nonexistent-authors-cronjob.yaml new file mode 100644 index 0000000..df09711 --- /dev/null +++ b/curation-scripts/hep-script-fix-linked-nonexistent-authors-cronjob.yaml @@ -0,0 +1,82 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + labels: + app.kubernetes.io/component: job + name: hep-script-fix-linked-nonexistent-authors +spec: + jobTemplate: + metadata: + labels: + app.kubernetes.io/component: job + spec: + completionMode: Indexed + completions: 10 + parallelism: 10 + template: + metadata: + labels: + app.kubernetes.io/component: job + spec: + containers: + - args: + - shell + - /usr/local/src/script.py + command: + - inspirehep + env: + - name: SENTRY_ENVIRONMENT + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POSTGRESQL_HOST + value: inspire-qa-db-cluster-pooler-rw.inspire-qa.svc + - name: POSTGRESQL_PORT + value: "5432" + - name: POSTGRESQL_USER + valueFrom: + secretKeyRef: + key: user + name: postgres-inspire-pguser-hep + - name: POSTGRESQL_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: postgres-inspire-pguser-hep + - name: JOB_COMPLETIONS + value: "10" + envFrom: + - configMapRef: + name: hep-defaults + - configMapRef: + name: hep-globals + - configMapRef: + name: hep-feature-flags + - secretRef: + name: hep-creds + image: registry.cern.ch/docker.io/inspirehep/hep + name: hep + volumeMounts: + - mountPath: /usr/local/var/instance/inspirehep_api.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /usr/local/var/instance/inspirehep.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /home/invenio + name: invenio-home + - mountPath: /usr/local/src/script.py + name: hep-script + subPath: script.py + restartPolicy: Never + volumes: + - configMap: + name: hep-cfg + name: hep-cfg + - emptyDir: {} + name: invenio-home + - configMap: + name: hep-script-fix-linked-nonexistent-authors-t9fdf8h844 + 
name: hep-script + schedule: '@yearly' + suspend: true diff --git a/curation-scripts/hep-script-fix-linked-nonexistent-authors-t9fdf8h844-configmap.yaml b/curation-scripts/hep-script-fix-linked-nonexistent-authors-t9fdf8h844-configmap.yaml new file mode 100644 index 0000000..80a1dd9 --- /dev/null +++ b/curation-scripts/hep-script-fix-linked-nonexistent-authors-t9fdf8h844-configmap.yaml @@ -0,0 +1,210 @@ +apiVersion: v1 +data: + script.py: | + from inspirehep.curation.search_check_do import SearchCheckDo + from inspire_dojson.utils import get_recid_from_ref + from inspire_utils.record import get_value + + + AFFECTED_AUTHORS_RECIDS = { + 2170881, + 2165762, + 2171911, + 2167309, + 2166287, + 2166288, + 2166290, + 2169364, + 2168854, + 2168855, + 2166296, + 2167322, + 2166300, + 2166302, + 2166303, + 2174501, + 2167346, + 2168891, + 2165821, + 2168387, + 2172506, + 2023003, + 2169948, + 2169436, + 2169438, + 2168417, + 2169441, + 2169444, + 2169445, + 2169961, + 2169962, + 2169963, + 2169964, + 2169965, + 2169967, + 2169968, + 2169969, + 2169970, + 2169971, + 1116790, + 2174586, + 2174587, + 2174588, + 2174589, + 2174590, + 2174591, + 2164361, + 2165391, + 2163865, + 1933467, + 2172577, + 2164389, + 2172582, + 2164391, + 2172584, + 2172583, + 2164390, + 2165420, + 2164405, + 2163896, + 2169029, + 2164433, + 2164464, + 2173170, + 2164466, + 2164468, + 2164470, + 2164471, + 2164472, + 2164473, + 2164474, + 2165509, + 2165511, + 2171149, + 2171151, + 2165535, + 2165538, + 2173219, + 2165540, + 2165541, + 2165542, + 2172712, + 2172713, + 2173744, + 2173747, + 2165556, + 2173748, + 2165559, + 2165560, + 2173754, + 2165562, + 2165564, + 2165566, + 2165568, + 2165569, + 2165570, + 2173763, + 2163010, + 2173765, + 2165575, + 2165576, + 2173769, + 2173775, + 2173777, + 2173779, + 2173780, + 2168661, + 2174810, + 2173787, + 2174812, + 2173789, + 2173791, + 2173794, + 2169194, + 2165108, + 2165109, + 2165120, + 2166659, + 2164612, + 2164615, + 2173323, + 2165135, + 2163610, + 
2171293, + 2172830, + 2167209, + 2163118, + 2170810, + 2172861, + 2172863, + 2172864, + 2172865, + 2163135, + 2163149, + 2173902, + 2163150, + 2163151, + 2173901, + 2163668, + 2169301, + 2169302, + 2167769, + 2168797, + 2168799, + 2167776, + 2167777, + 2167779, + 2166249, + 2167274, + 2167275, + 2172908, + 2167276, + 2167293, + } + + + class FixLinkedNonexistentAuthors(SearchCheckDo): + """Remove ref from authors linked to nonexisting profiles.""" + + query = { + "query": { + "nested": { + "path": "authors", + "query": { + "terms": {"authors.record.$ref": list(AFFECTED_AUTHORS_RECIDS)} + }, + } + } + } + + def search(self): + self.logger.info("Searching records", query=self.query) + query = ( + self.search_class() + .from_dict(self.query) + .params(_source={}, size=self.size, scroll="60m") + ) + if shard_filter := self._current_shard_filter(): + query = query.filter("script", script=shard_filter) + return query.scan() + + @staticmethod + def check(record, logger, state): + refs = get_value(record, "authors.record", []) + journal_recids_record = {int(get_recid_from_ref(ref)) for ref in refs} + return AFFECTED_AUTHORS_RECIDS.intersection(journal_recids_record) + + @staticmethod + def do(record, logger, state): + for author in record.get("authors", []): + if "record" not in author: + continue + recid = get_recid_from_ref(author["record"]) + if int(recid) in AFFECTED_AUTHORS_RECIDS: + del author["record"] + + + FixLinkedNonexistentAuthors() +kind: ConfigMap +metadata: + name: hep-script-fix-linked-nonexistent-authors-t9fdf8h844 diff --git a/curation-scripts/hep-script-fix-rupac2021-fulltexts-8khdkgdfmt-configmap.yaml b/curation-scripts/hep-script-fix-rupac2021-fulltexts-8khdkgdfmt-configmap.yaml new file mode 100644 index 0000000..976b633 --- /dev/null +++ b/curation-scripts/hep-script-fix-rupac2021-fulltexts-8khdkgdfmt-configmap.yaml @@ -0,0 +1,34 @@ +apiVersion: v1 +data: + script.py: | + from inspirehep.curation.search_check_do import SearchCheckDo + + FULLTEXT_URL 
= "https://jacow.org/rupac2021/papers/{}.pdf" + + + class FixRupac2021Fulltexts(SearchCheckDo): + """Fix fulltexts for RuPAC 2021 (Alushta, Crimea).""" + + query = "publication_info.conference_record.$ref:1954430" + + @staticmethod + def check(record, logger, state): + return "C21-09-27.4" in record.get_value("publication_info.cnum", []) + + @staticmethod + def do(record, logger, state): + artids = record.get_value("publication_info.artid", []) + if len(artids) != 1: + logger.warning("Ambiguous article IDs.", artids=artids) + return + if (num_docs := len(record.get("documents", []))) != 1: + logger.warning("Ambiguous or missing documents.", num_docs=num_docs) + return + artid = artids[0].lower() + record["documents"] = [{"url": FULLTEXT_URL.format(artid)}] + + + FixRupac2021Fulltexts() +kind: ConfigMap +metadata: + name: hep-script-fix-rupac2021-fulltexts-8khdkgdfmt diff --git a/curation-scripts/hep-script-fix-rupac2021-fulltexts-cronjob.yaml b/curation-scripts/hep-script-fix-rupac2021-fulltexts-cronjob.yaml new file mode 100644 index 0000000..a0dde83 --- /dev/null +++ b/curation-scripts/hep-script-fix-rupac2021-fulltexts-cronjob.yaml @@ -0,0 +1,82 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + labels: + app.kubernetes.io/component: job + name: hep-script-fix-rupac2021-fulltexts +spec: + jobTemplate: + metadata: + labels: + app.kubernetes.io/component: job + spec: + completionMode: Indexed + completions: 10 + parallelism: 10 + template: + metadata: + labels: + app.kubernetes.io/component: job + spec: + containers: + - args: + - shell + - /usr/local/src/script.py + command: + - inspirehep + env: + - name: SENTRY_ENVIRONMENT + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POSTGRESQL_HOST + value: inspire-qa-db-cluster-pooler-rw.inspire-qa.svc + - name: POSTGRESQL_PORT + value: "5432" + - name: POSTGRESQL_USER + valueFrom: + secretKeyRef: + key: user + name: postgres-inspire-pguser-hep + - name: POSTGRESQL_PASSWORD + valueFrom: + secretKeyRef: 
+ key: password + name: postgres-inspire-pguser-hep + - name: JOB_COMPLETIONS + value: "10" + envFrom: + - configMapRef: + name: hep-defaults + - configMapRef: + name: hep-globals + - configMapRef: + name: hep-feature-flags + - secretRef: + name: hep-creds + image: registry.cern.ch/docker.io/inspirehep/hep + name: hep + volumeMounts: + - mountPath: /usr/local/var/instance/inspirehep_api.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /usr/local/var/instance/inspirehep.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /home/invenio + name: invenio-home + - mountPath: /usr/local/src/script.py + name: hep-script + subPath: script.py + restartPolicy: Never + volumes: + - configMap: + name: hep-cfg + name: hep-cfg + - emptyDir: {} + name: invenio-home + - configMap: + name: hep-script-fix-rupac2021-fulltexts-8khdkgdfmt + name: hep-script + schedule: '@yearly' + suspend: true diff --git a/curation-scripts/hep-script-flag-export-to-hal-4hmmk2599f-configmap.yaml b/curation-scripts/hep-script-flag-export-to-hal-4hmmk2599f-configmap.yaml new file mode 100644 index 0000000..7a5ae87 --- /dev/null +++ b/curation-scripts/hep-script-flag-export-to-hal-4hmmk2599f-configmap.yaml @@ -0,0 +1,81 @@ +apiVersion: v1 +data: + script.py: | + from itertools import chain + + from inspirehep.curation.search_check_do import SearchCheckDo + + INSTITUTIONS = { + "2020952", + "903118", + "1188219", + "907607", + "1201986", + "904493", + "1347082", + "1776404", + "902828", + "911366", + "1743848", + "902740", + "902786", + "903119", + "902989", + "903421", + "1776405", + "910133", + "902703", + "906885", + "903453", + "907247", + "911249", + "1608212", + "903100", + "907588", + "903099", + "902974", + } + + + class FlagExportToHAL(SearchCheckDo): + """Enable export to HAL for a bunch of unflagged records.""" + + query = ( + 'jy 2016 and (document_type:"conference paper" or document_type:"article") ' + "and not _export_to.HAL:true and _collections:Literature " + f"and 
affid:{';'.join(INSTITUTIONS)} and external_system_identifiers.schema:HAL" + ) + + @staticmethod + def check(record, logger, state): + has_correct_pubyear = 2016 in record.get_value("publication_info.year", []) + has_correct_doctype = ( + "conference paper" in record["document_type"] + or "article" in record["document_type"] + ) + has_correct_external_identifier = "HAL" in record.get_value( + "external_system_identifiers.schema", [] + ) + has_correct_affiliations = any( + aff_ref.split("/")[-1] in INSTITUTIONS + for aff_ref in chain.from_iterable( + record.get_value("authors.affiliations.record.$ref", []) + ) + ) + + return ( + has_correct_pubyear + and has_correct_doctype + and has_correct_external_identifier + and has_correct_affiliations + ) + + @staticmethod + def do(record, logger, state): + record.setdefault("_export_to", {})["HAL"] = True + + + FlagExportToHAL() +kind: ConfigMap +metadata: + name: hep-script-flag-export-to-hal-4hmmk2599f diff --git a/curation-scripts/hep-script-flag-export-to-hal-cronjob.yaml b/curation-scripts/hep-script-flag-export-to-hal-cronjob.yaml new file mode 100644 index 0000000..97400a0 --- /dev/null +++ b/curation-scripts/hep-script-flag-export-to-hal-cronjob.yaml @@ -0,0 +1,82 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + labels: + app.kubernetes.io/component: job + name: hep-script-flag-export-to-hal +spec: + jobTemplate: + metadata: + labels: + app.kubernetes.io/component: job + spec: + completionMode: Indexed + completions: 10 + parallelism: 10 + template: + metadata: + labels: + app.kubernetes.io/component: job + spec: + containers: + - args: + - shell + - /usr/local/src/script.py + command: + - inspirehep + env: + - name: SENTRY_ENVIRONMENT + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POSTGRESQL_HOST + value: inspire-qa-db-cluster-pooler-rw.inspire-qa.svc + - name: POSTGRESQL_PORT + value: "5432" + - name: POSTGRESQL_USER + valueFrom: + secretKeyRef: + key: user + name: 
postgres-inspire-pguser-hep + - name: POSTGRESQL_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: postgres-inspire-pguser-hep + - name: JOB_COMPLETIONS + value: "10" + envFrom: + - configMapRef: + name: hep-defaults + - configMapRef: + name: hep-globals + - configMapRef: + name: hep-feature-flags + - secretRef: + name: hep-creds + image: registry.cern.ch/docker.io/inspirehep/hep + name: hep + volumeMounts: + - mountPath: /usr/local/var/instance/inspirehep_api.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /usr/local/var/instance/inspirehep.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /home/invenio + name: invenio-home + - mountPath: /usr/local/src/script.py + name: hep-script + subPath: script.py + restartPolicy: Never + volumes: + - configMap: + name: hep-cfg + name: hep-cfg + - emptyDir: {} + name: invenio-home + - configMap: + name: hep-script-flag-export-to-hal-4hmmk2599f + name: hep-script + schedule: '@yearly' + suspend: true diff --git a/curation-scripts/hep-script-force-cds-harvest-cronjob.yaml b/curation-scripts/hep-script-force-cds-harvest-cronjob.yaml new file mode 100644 index 0000000..503e971 --- /dev/null +++ b/curation-scripts/hep-script-force-cds-harvest-cronjob.yaml @@ -0,0 +1,82 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + labels: + app.kubernetes.io/component: job + name: hep-script-force-cds-harvest +spec: + jobTemplate: + metadata: + labels: + app.kubernetes.io/component: job + spec: + completionMode: Indexed + completions: 10 + parallelism: 10 + template: + metadata: + labels: + app.kubernetes.io/component: job + spec: + containers: + - args: + - shell + - /usr/local/src/script.py + command: + - inspirehep + env: + - name: SENTRY_ENVIRONMENT + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POSTGRESQL_HOST + value: inspire-qa-db-cluster-pooler-rw.inspire-qa.svc + - name: POSTGRESQL_PORT + value: "5432" + - name: POSTGRESQL_USER + valueFrom: + secretKeyRef: + key: user + name: 
postgres-inspire-pguser-hep + - name: POSTGRESQL_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: postgres-inspire-pguser-hep + - name: JOB_COMPLETIONS + value: "10" + envFrom: + - configMapRef: + name: hep-defaults + - configMapRef: + name: hep-globals + - configMapRef: + name: hep-feature-flags + - secretRef: + name: hep-creds + image: registry.cern.ch/docker.io/inspirehep/hep + name: hep + volumeMounts: + - mountPath: /usr/local/var/instance/inspirehep_api.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /usr/local/var/instance/inspirehep.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /home/invenio + name: invenio-home + - mountPath: /usr/local/src/script.py + name: hep-script + subPath: script.py + restartPolicy: Never + volumes: + - configMap: + name: hep-cfg + name: hep-cfg + - emptyDir: {} + name: invenio-home + - configMap: + name: hep-script-force-cds-harvest-d6f829fb6t + name: hep-script + schedule: '@yearly' + suspend: true diff --git a/curation-scripts/hep-script-force-cds-harvest-d6f829fb6t-configmap.yaml b/curation-scripts/hep-script-force-cds-harvest-d6f829fb6t-configmap.yaml new file mode 100644 index 0000000..15432d7 --- /dev/null +++ b/curation-scripts/hep-script-force-cds-harvest-d6f829fb6t-configmap.yaml @@ -0,0 +1,35 @@ +apiVersion: v1 +data: + script.py: | + from inspirehep.curation.search_check_do import SearchCheckDo + from inspirehep.oai.utils import is_cds_set, is_cern_arxiv_set + + + class ForceCDSHarvest(SearchCheckDo): + """Touch records harvested by CDS to force synchronization after fixing bug.""" + + query = ( + "(_oai.sets:CERN:arXiv or _oai.sets:ForCDS) " + "and du > 2021-06-26 and du < 2023-01-28 " + "and arxiv_eprints.value:* and de >= 2021" + ) + + @staticmethod + def check(record, logger, state): + return ( + is_cds_set(record) + or is_cern_arxiv_set(record) + and "arxiv_eprints" in record + ) + + @staticmethod + def do(record, logger, state): + # don't need to do anything here, just 
update the `update` timestamp as + # a side-effect + ... + + + ForceCDSHarvest() +kind: ConfigMap +metadata: + name: hep-script-force-cds-harvest-d6f829fb6t diff --git a/curation-scripts/hep-script-move-jacow-urls-cronjob.yaml b/curation-scripts/hep-script-move-jacow-urls-cronjob.yaml new file mode 100644 index 0000000..2383033 --- /dev/null +++ b/curation-scripts/hep-script-move-jacow-urls-cronjob.yaml @@ -0,0 +1,82 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + labels: + app.kubernetes.io/component: job + name: hep-script-move-jacow-urls +spec: + jobTemplate: + metadata: + labels: + app.kubernetes.io/component: job + spec: + completionMode: Indexed + completions: 10 + parallelism: 10 + template: + metadata: + labels: + app.kubernetes.io/component: job + spec: + containers: + - args: + - shell + - /usr/local/src/script.py + command: + - inspirehep + env: + - name: SENTRY_ENVIRONMENT + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POSTGRESQL_HOST + value: inspire-qa-db-cluster-pooler-rw.inspire-qa.svc + - name: POSTGRESQL_PORT + value: "5432" + - name: POSTGRESQL_USER + valueFrom: + secretKeyRef: + key: user + name: postgres-inspire-pguser-hep + - name: POSTGRESQL_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: postgres-inspire-pguser-hep + - name: JOB_COMPLETIONS + value: "10" + envFrom: + - configMapRef: + name: hep-defaults + - configMapRef: + name: hep-globals + - configMapRef: + name: hep-feature-flags + - secretRef: + name: hep-creds + image: registry.cern.ch/docker.io/inspirehep/hep + name: hep + volumeMounts: + - mountPath: /usr/local/var/instance/inspirehep_api.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /usr/local/var/instance/inspirehep.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /home/invenio + name: invenio-home + - mountPath: /usr/local/src/script.py + name: hep-script + subPath: script.py + restartPolicy: Never + volumes: + - configMap: + name: hep-cfg + name: hep-cfg + - 
emptyDir: {} + name: invenio-home + - configMap: + name: hep-script-move-jacow-urls-gmgt47mf5m + name: hep-script + schedule: '@yearly' + suspend: true diff --git a/curation-scripts/hep-script-move-jacow-urls-gmgt47mf5m-configmap.yaml b/curation-scripts/hep-script-move-jacow-urls-gmgt47mf5m-configmap.yaml new file mode 100644 index 0000000..48524ee --- /dev/null +++ b/curation-scripts/hep-script-move-jacow-urls-gmgt47mf5m-configmap.yaml @@ -0,0 +1,33 @@ +apiVersion: v1 +data: + script.py: | + from inspirehep.curation.search_check_do import SearchCheckDo + + OLD_PREFIX = "http://accelconf.web.cern.ch/accelconf/" + NEW_PREFIX = "http://accelconf.web.cern.ch/" + + + class MoveJACOWURLs(SearchCheckDo): + """Fix URLs pointing to JACoW website.""" + + query = "urls.value:http://accelconf.web.cern.ch/*" + + @staticmethod + def check(record, logger, state): + return any( + value.lower().startswith(OLD_PREFIX) + for value in record.get_value("urls.value", []) + ) + + @staticmethod + def do(record, logger, state): + urls = record.get("urls", []) + for url in urls: + if url["value"].lower().startswith(OLD_PREFIX): + url["value"] = NEW_PREFIX + url["value"][len(OLD_PREFIX) :] + + + MoveJACOWURLs() +kind: ConfigMap +metadata: + name: hep-script-move-jacow-urls-gmgt47mf5m diff --git a/curation-scripts/hep-script-move-jetp-letters-urls-cronjob.yaml b/curation-scripts/hep-script-move-jetp-letters-urls-cronjob.yaml new file mode 100644 index 0000000..9471739 --- /dev/null +++ b/curation-scripts/hep-script-move-jetp-letters-urls-cronjob.yaml @@ -0,0 +1,82 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + labels: + app.kubernetes.io/component: job + name: hep-script-move-jetp-letters-urls +spec: + jobTemplate: + metadata: + labels: + app.kubernetes.io/component: job + spec: + completionMode: Indexed + completions: 10 + parallelism: 10 + template: + metadata: + labels: + app.kubernetes.io/component: job + spec: + containers: + - args: + - shell + - /usr/local/src/script.py + 
command: + - inspirehep + env: + - name: SENTRY_ENVIRONMENT + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POSTGRESQL_HOST + value: inspire-qa-db-cluster-pooler-rw.inspire-qa.svc + - name: POSTGRESQL_PORT + value: "5432" + - name: POSTGRESQL_USER + valueFrom: + secretKeyRef: + key: user + name: postgres-inspire-pguser-hep + - name: POSTGRESQL_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: postgres-inspire-pguser-hep + - name: JOB_COMPLETIONS + value: "10" + envFrom: + - configMapRef: + name: hep-defaults + - configMapRef: + name: hep-globals + - configMapRef: + name: hep-feature-flags + - secretRef: + name: hep-creds + image: registry.cern.ch/docker.io/inspirehep/hep + name: hep + volumeMounts: + - mountPath: /usr/local/var/instance/inspirehep_api.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /usr/local/var/instance/inspirehep.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /home/invenio + name: invenio-home + - mountPath: /usr/local/src/script.py + name: hep-script + subPath: script.py + restartPolicy: Never + volumes: + - configMap: + name: hep-cfg + name: hep-cfg + - emptyDir: {} + name: invenio-home + - configMap: + name: hep-script-move-jetp-letters-urls-mdg4tk82f8 + name: hep-script + schedule: '@yearly' + suspend: true diff --git a/curation-scripts/hep-script-move-jetp-letters-urls-mdg4tk82f8-configmap.yaml b/curation-scripts/hep-script-move-jetp-letters-urls-mdg4tk82f8-configmap.yaml new file mode 100644 index 0000000..d51c934 --- /dev/null +++ b/curation-scripts/hep-script-move-jetp-letters-urls-mdg4tk82f8-configmap.yaml @@ -0,0 +1,32 @@ +apiVersion: v1 +data: + script.py: | + from inspirehep.curation.search_check_do import SearchCheckDo + + OLD_PREFIX = "http://www.jetpletters.ac.ru" + NEW_PREFIX = "http://jetpletters.ru" + + + class MoveJETPLettersURLs(SearchCheckDo): + """Move URLs pointing to JETP Letters to new domain.""" + + query = f"urls.value:{OLD_PREFIX}*" + + @staticmethod + def 
check(record, logger, state): + return any( + value.startswith(OLD_PREFIX) for value in record.get_value("urls.value", []) + ) + + @staticmethod + def do(record, logger, state): + urls = record.get("urls", []) + for url in urls: + if url["value"].startswith(OLD_PREFIX): + url["value"] = url["value"].replace(OLD_PREFIX, NEW_PREFIX) + + + MoveJETPLettersURLs() +kind: ConfigMap +metadata: + name: hep-script-move-jetp-letters-urls-mdg4tk82f8 diff --git a/curation-scripts/hep-script-move-jetp-urls-2h5t5tk28d-configmap.yaml b/curation-scripts/hep-script-move-jetp-urls-2h5t5tk28d-configmap.yaml new file mode 100644 index 0000000..7a7138a --- /dev/null +++ b/curation-scripts/hep-script-move-jetp-urls-2h5t5tk28d-configmap.yaml @@ -0,0 +1,32 @@ +apiVersion: v1 +data: + script.py: | + from inspirehep.curation.search_check_do import SearchCheckDo + + + class MoveJETPURLs(SearchCheckDo): + """Move URLs pointing to JETP to new domain.""" + + query = "urls.value:http://www.jetp.ac.ru*" + + @staticmethod + def check(record, logger, state): + return any( + value.startswith("http://www.jetp.ac.ru") + for value in record.get_value("urls.value", []) + ) + + @staticmethod + def do(record, logger, state): + urls = record.get("urls", []) + for url in urls: + if url["value"].startswith("http://www.jetp.ac.ru"): + url["value"] = url["value"].replace( + "http://www.jetp.ac.ru", "http://www.jetp.ras.ru" + ) + + + MoveJETPURLs() +kind: ConfigMap +metadata: + name: hep-script-move-jetp-urls-2h5t5tk28d diff --git a/curation-scripts/hep-script-move-jetp-urls-cronjob.yaml b/curation-scripts/hep-script-move-jetp-urls-cronjob.yaml new file mode 100644 index 0000000..f2510be --- /dev/null +++ b/curation-scripts/hep-script-move-jetp-urls-cronjob.yaml @@ -0,0 +1,82 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + labels: + app.kubernetes.io/component: job + name: hep-script-move-jetp-urls +spec: + jobTemplate: + metadata: + labels: + app.kubernetes.io/component: job + spec: + completionMode: 
Indexed + completions: 10 + parallelism: 10 + template: + metadata: + labels: + app.kubernetes.io/component: job + spec: + containers: + - args: + - shell + - /usr/local/src/script.py + command: + - inspirehep + env: + - name: SENTRY_ENVIRONMENT + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POSTGRESQL_HOST + value: inspire-qa-db-cluster-pooler-rw.inspire-qa.svc + - name: POSTGRESQL_PORT + value: "5432" + - name: POSTGRESQL_USER + valueFrom: + secretKeyRef: + key: user + name: postgres-inspire-pguser-hep + - name: POSTGRESQL_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: postgres-inspire-pguser-hep + - name: JOB_COMPLETIONS + value: "10" + envFrom: + - configMapRef: + name: hep-defaults + - configMapRef: + name: hep-globals + - configMapRef: + name: hep-feature-flags + - secretRef: + name: hep-creds + image: registry.cern.ch/docker.io/inspirehep/hep + name: hep + volumeMounts: + - mountPath: /usr/local/var/instance/inspirehep_api.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /usr/local/var/instance/inspirehep.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /home/invenio + name: invenio-home + - mountPath: /usr/local/src/script.py + name: hep-script + subPath: script.py + restartPolicy: Never + volumes: + - configMap: + name: hep-cfg + name: hep-cfg + - emptyDir: {} + name: invenio-home + - configMap: + name: hep-script-move-jetp-urls-2h5t5tk28d + name: hep-script + schedule: '@yearly' + suspend: true diff --git a/curation-scripts/hep-script-msnet-add-id-66t2654tct-configmap.yaml b/curation-scripts/hep-script-msnet-add-id-66t2654tct-configmap.yaml new file mode 100644 index 0000000..afd3edc --- /dev/null +++ b/curation-scripts/hep-script-msnet-add-id-66t2654tct-configmap.yaml @@ -0,0 +1,42 @@ +apiVersion: v1 +data: + script.py: | + import requests + + from inspirehep.curation.search_check_do import SearchCheckDo + + URL = ( + "https://cernbox.cern.ch/remote.php/dav/public-files/" + 
"DgV3O0I8D8haXMZ/msnet_add_id.json" + ) + MSNET_IDS = requests.get(URL).json() + + ELEMENT = "external_system_identifiers" + + + class AddMsnetIds(SearchCheckDo): + """Add MSNET IDs to INSPIRE records.""" + + query = f"tc:p -{ELEMENT}.schema:MSNET" + + @staticmethod + def check(record, logger, state): + return str(record["control_number"]) in MSNET_IDS and not any( + id_["schema"] == "MSNET" + for id_ in record.get("external_system_identifiers", []) + ) + + @staticmethod + def do(record, logger, state): + record.setdefault(ELEMENT, []).append( + { + "value": MSNET_IDS[str(record["control_number"])], + "schema": "MSNET", + } + ) + + + AddMsnetIds() +kind: ConfigMap +metadata: + name: hep-script-msnet-add-id-66t2654tct diff --git a/curation-scripts/hep-script-msnet-add-id-cronjob.yaml b/curation-scripts/hep-script-msnet-add-id-cronjob.yaml new file mode 100644 index 0000000..1e76676 --- /dev/null +++ b/curation-scripts/hep-script-msnet-add-id-cronjob.yaml @@ -0,0 +1,82 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + labels: + app.kubernetes.io/component: job + name: hep-script-msnet-add-id +spec: + jobTemplate: + metadata: + labels: + app.kubernetes.io/component: job + spec: + completionMode: Indexed + completions: 10 + parallelism: 10 + template: + metadata: + labels: + app.kubernetes.io/component: job + spec: + containers: + - args: + - shell + - /usr/local/src/script.py + command: + - inspirehep + env: + - name: SENTRY_ENVIRONMENT + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POSTGRESQL_HOST + value: inspire-qa-db-cluster-pooler-rw.inspire-qa.svc + - name: POSTGRESQL_PORT + value: "5432" + - name: POSTGRESQL_USER + valueFrom: + secretKeyRef: + key: user + name: postgres-inspire-pguser-hep + - name: POSTGRESQL_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: postgres-inspire-pguser-hep + - name: JOB_COMPLETIONS + value: "10" + envFrom: + - configMapRef: + name: hep-defaults + - configMapRef: + name: hep-globals + - 
configMapRef: + name: hep-feature-flags + - secretRef: + name: hep-creds + image: registry.cern.ch/docker.io/inspirehep/hep + name: hep + volumeMounts: + - mountPath: /usr/local/var/instance/inspirehep_api.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /usr/local/var/instance/inspirehep.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /home/invenio + name: invenio-home + - mountPath: /usr/local/src/script.py + name: hep-script + subPath: script.py + restartPolicy: Never + volumes: + - configMap: + name: hep-cfg + name: hep-cfg + - emptyDir: {} + name: invenio-home + - configMap: + name: hep-script-msnet-add-id-66t2654tct + name: hep-script + schedule: '@yearly' + suspend: true diff --git a/curation-scripts/hep-script-nsr-add-id-6ct6f2h6mh-configmap.yaml b/curation-scripts/hep-script-nsr-add-id-6ct6f2h6mh-configmap.yaml new file mode 100644 index 0000000..ef4a277 --- /dev/null +++ b/curation-scripts/hep-script-nsr-add-id-6ct6f2h6mh-configmap.yaml @@ -0,0 +1,60 @@ +apiVersion: v1 +data: + script.py: | + import requests + + from inspirehep.curation.search_check_do import SearchCheckDo + + URL = ( + "https://cernbox.cern.ch/remote.php/dav/public-files/yq254v51yVIdaQf/nsr-dois.json" + ) + + ELEMENT = "external_system_identifiers" + + + def get_unambiguous_ids(url): + """Get a mapping from doi to NSR record ID, ignoring duplicate DOIs.""" + seen = set() + result = {} + for nsr_id, doi in requests.get(url).json().items(): + doi = doi.lower() + if doi in seen: + result.pop(doi, None) + continue # ambiguous DOI: drop it and never re-add it + seen.add(doi) + result[doi] = nsr_id + return result + + + NSR_IDS = get_unambiguous_ids(URL) + + + class AddNSRIds(SearchCheckDo): + """Add NNDC NSR IDs to INSPIRE records.""" + + query = f"doi * -{ELEMENT}.schema:NSR" + + @staticmethod + def check(record, logger, state): + if any(id_["schema"] == "NSR" for id_ in record.get(ELEMENT, [])): + return False + for doi in record.get_value("dois.value", []): + if doi.lower() in NSR_IDS: + state["nsr_id"] =
NSR_IDS[doi.lower()] + return True + return False + + @staticmethod + def do(record, logger, state): + record.setdefault(ELEMENT, []).append( + { + "value": state["nsr_id"], + "schema": "NSR", + } + ) + + + AddNSRIds() +kind: ConfigMap +metadata: + name: hep-script-nsr-add-id-6ct6f2h6mh diff --git a/curation-scripts/hep-script-nsr-add-id-cronjob.yaml b/curation-scripts/hep-script-nsr-add-id-cronjob.yaml new file mode 100644 index 0000000..2746d8d --- /dev/null +++ b/curation-scripts/hep-script-nsr-add-id-cronjob.yaml @@ -0,0 +1,82 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + labels: + app.kubernetes.io/component: job + name: hep-script-nsr-add-id +spec: + jobTemplate: + metadata: + labels: + app.kubernetes.io/component: job + spec: + completionMode: Indexed + completions: 10 + parallelism: 10 + template: + metadata: + labels: + app.kubernetes.io/component: job + spec: + containers: + - args: + - shell + - /usr/local/src/script.py + command: + - inspirehep + env: + - name: SENTRY_ENVIRONMENT + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POSTGRESQL_HOST + value: inspire-qa-db-cluster-pooler-rw.inspire-qa.svc + - name: POSTGRESQL_PORT + value: "5432" + - name: POSTGRESQL_USER + valueFrom: + secretKeyRef: + key: user + name: postgres-inspire-pguser-hep + - name: POSTGRESQL_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: postgres-inspire-pguser-hep + - name: JOB_COMPLETIONS + value: "10" + envFrom: + - configMapRef: + name: hep-defaults + - configMapRef: + name: hep-globals + - configMapRef: + name: hep-feature-flags + - secretRef: + name: hep-creds + image: registry.cern.ch/docker.io/inspirehep/hep + name: hep + volumeMounts: + - mountPath: /usr/local/var/instance/inspirehep_api.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /usr/local/var/instance/inspirehep.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /home/invenio + name: invenio-home + - mountPath: /usr/local/src/script.py + name: hep-script + 
subPath: script.py + restartPolicy: Never + volumes: + - configMap: + name: hep-cfg + name: hep-cfg + - emptyDir: {} + name: invenio-home + - configMap: + name: hep-script-nsr-add-id-6ct6f2h6mh + name: hep-script + schedule: '@yearly' + suspend: true diff --git a/curation-scripts/hep-script-quant-ph-core-cronjob.yaml b/curation-scripts/hep-script-quant-ph-core-cronjob.yaml new file mode 100644 index 0000000..08b5644 --- /dev/null +++ b/curation-scripts/hep-script-quant-ph-core-cronjob.yaml @@ -0,0 +1,82 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + labels: + app.kubernetes.io/component: job + name: hep-script-quant-ph-core +spec: + jobTemplate: + metadata: + labels: + app.kubernetes.io/component: job + spec: + completionMode: Indexed + completions: 10 + parallelism: 10 + template: + metadata: + labels: + app.kubernetes.io/component: job + spec: + containers: + - args: + - shell + - /usr/local/src/script.py + command: + - inspirehep + env: + - name: SENTRY_ENVIRONMENT + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POSTGRESQL_HOST + value: inspire-qa-db-cluster-pooler-rw.inspire-qa.svc + - name: POSTGRESQL_PORT + value: "5432" + - name: POSTGRESQL_USER + valueFrom: + secretKeyRef: + key: user + name: postgres-inspire-pguser-hep + - name: POSTGRESQL_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: postgres-inspire-pguser-hep + - name: JOB_COMPLETIONS + value: "10" + envFrom: + - configMapRef: + name: hep-defaults + - configMapRef: + name: hep-globals + - configMapRef: + name: hep-feature-flags + - secretRef: + name: hep-creds + image: registry.cern.ch/docker.io/inspirehep/hep + name: hep + volumeMounts: + - mountPath: /usr/local/var/instance/inspirehep_api.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /usr/local/var/instance/inspirehep.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /home/invenio + name: invenio-home + - mountPath: /usr/local/src/script.py + name: hep-script + subPath: script.py + 
restartPolicy: Never + volumes: + - configMap: + name: hep-cfg + name: hep-cfg + - emptyDir: {} + name: invenio-home + - configMap: + name: hep-script-quant-ph-core-dbmdcdftmt + name: hep-script + schedule: '@yearly' + suspend: true diff --git a/curation-scripts/hep-script-quant-ph-core-dbmdcdftmt-configmap.yaml b/curation-scripts/hep-script-quant-ph-core-dbmdcdftmt-configmap.yaml new file mode 100644 index 0000000..2bb5779 --- /dev/null +++ b/curation-scripts/hep-script-quant-ph-core-dbmdcdftmt-configmap.yaml @@ -0,0 +1,26 @@ +apiVersion: v1 +data: + script.py: | + from inspirehep.curation.search_check_do import SearchCheckDo + from inspirehep.utils import flatten_list + + + class SetQuantPhCore(SearchCheckDo): + """Set all papers with quant-ph arXiv category as core.""" + + query = "arxiv_eprints.categories:quant-ph" + + @staticmethod + def check(record, logger, state): + categories = flatten_list(record.get_value("arxiv_eprints.categories", [])) + return any(c == "quant-ph" for c in categories) and not record.get("core") + + @staticmethod + def do(record, logger, state): + record["core"] = True + + + SetQuantPhCore() +kind: ConfigMap +metadata: + name: hep-script-quant-ph-core-dbmdcdftmt diff --git a/curation-scripts/hep-script-remove-authors-uuids-2k8bmm85g6-configmap.yaml b/curation-scripts/hep-script-remove-authors-uuids-2k8bmm85g6-configmap.yaml new file mode 100644 index 0000000..c463efb --- /dev/null +++ b/curation-scripts/hep-script-remove-authors-uuids-2k8bmm85g6-configmap.yaml @@ -0,0 +1,61 @@ +apiVersion: v1 +data: + script.py: | + from inspire_utils.record import get_value + from inspirehep.curation.search_check_do import SearchCheckDo + + + class RemoveAuthorsUuids(SearchCheckDo): + """Remove UUIDS for all the authors that doesn't have record.$ref""" + + query = { + "query": { + "bool": { + "must": [ + { + "nested": { + "path": "authors", + "query": { + "bool": { + "must_not": { + "exists": {"field": "authors.record.$ref"} + } + } + }, + } + }, + 
{"match": {"_collections": "Literature"}}, + ] + } + } + } + + def search(self): + self.logger.info("Searching records", query=self.query) + query = ( + self.search_class() + .from_dict(self.query) + .params(_source={}, size=self.size, scroll="60m") + ) + if shard_filter := self._current_shard_filter(): + query = query.filter("script", script=shard_filter) + return query.scan() + + @staticmethod + def check(record, logger, state): + return len(get_value(record, "authors.record.$ref", [])) < len( + get_value(record, "authors", []) + ) + + @staticmethod + def do(record, logger, state): + for author in record["authors"]: + author_ref = get_value(author, "record.$ref") + if not author_ref: + del author["uuid"] + + + RemoveAuthorsUuids() +kind: ConfigMap +metadata: + name: hep-script-remove-authors-uuids-2k8bmm85g6 diff --git a/curation-scripts/hep-script-remove-authors-uuids-cronjob.yaml b/curation-scripts/hep-script-remove-authors-uuids-cronjob.yaml new file mode 100644 index 0000000..d947506 --- /dev/null +++ b/curation-scripts/hep-script-remove-authors-uuids-cronjob.yaml @@ -0,0 +1,82 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + labels: + app.kubernetes.io/component: job + name: hep-script-remove-authors-uuids +spec: + jobTemplate: + metadata: + labels: + app.kubernetes.io/component: job + spec: + completionMode: Indexed + completions: 10 + parallelism: 10 + template: + metadata: + labels: + app.kubernetes.io/component: job + spec: + containers: + - args: + - shell + - /usr/local/src/script.py + command: + - inspirehep + env: + - name: SENTRY_ENVIRONMENT + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POSTGRESQL_HOST + value: inspire-qa-db-cluster-pooler-rw.inspire-qa.svc + - name: POSTGRESQL_PORT + value: "5432" + - name: POSTGRESQL_USER + valueFrom: + secretKeyRef: + key: user + name: postgres-inspire-pguser-hep + - name: POSTGRESQL_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: postgres-inspire-pguser-hep + - name: 
JOB_COMPLETIONS + value: "10" + envFrom: + - configMapRef: + name: hep-defaults + - configMapRef: + name: hep-globals + - configMapRef: + name: hep-feature-flags + - secretRef: + name: hep-creds + image: registry.cern.ch/docker.io/inspirehep/hep + name: hep + volumeMounts: + - mountPath: /usr/local/var/instance/inspirehep_api.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /usr/local/var/instance/inspirehep.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /home/invenio + name: invenio-home + - mountPath: /usr/local/src/script.py + name: hep-script + subPath: script.py + restartPolicy: Never + volumes: + - configMap: + name: hep-cfg + name: hep-cfg + - emptyDir: {} + name: invenio-home + - configMap: + name: hep-script-remove-authors-uuids-2k8bmm85g6 + name: hep-script + schedule: '@yearly' + suspend: true diff --git a/curation-scripts/hep-script-remove-bai-from-lit-authors-cronjob.yaml b/curation-scripts/hep-script-remove-bai-from-lit-authors-cronjob.yaml new file mode 100644 index 0000000..c8a5982 --- /dev/null +++ b/curation-scripts/hep-script-remove-bai-from-lit-authors-cronjob.yaml @@ -0,0 +1,82 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + labels: + app.kubernetes.io/component: job + name: hep-script-remove-bai-from-lit-authors +spec: + jobTemplate: + metadata: + labels: + app.kubernetes.io/component: job + spec: + completionMode: Indexed + completions: 10 + parallelism: 10 + template: + metadata: + labels: + app.kubernetes.io/component: job + spec: + containers: + - args: + - shell + - /usr/local/src/script.py + command: + - inspirehep + env: + - name: SENTRY_ENVIRONMENT + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POSTGRESQL_HOST + value: inspire-qa-db-cluster-pooler-rw.inspire-qa.svc + - name: POSTGRESQL_PORT + value: "5432" + - name: POSTGRESQL_USER + valueFrom: + secretKeyRef: + key: user + name: postgres-inspire-pguser-hep + - name: POSTGRESQL_PASSWORD + valueFrom: + secretKeyRef: + key: password + 
name: postgres-inspire-pguser-hep + - name: JOB_COMPLETIONS + value: "10" + envFrom: + - configMapRef: + name: hep-defaults + - configMapRef: + name: hep-globals + - configMapRef: + name: hep-feature-flags + - secretRef: + name: hep-creds + image: registry.cern.ch/docker.io/inspirehep/hep + name: hep + volumeMounts: + - mountPath: /usr/local/var/instance/inspirehep_api.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /usr/local/var/instance/inspirehep.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /home/invenio + name: invenio-home + - mountPath: /usr/local/src/script.py + name: hep-script + subPath: script.py + restartPolicy: Never + volumes: + - configMap: + name: hep-cfg + name: hep-cfg + - emptyDir: {} + name: invenio-home + - configMap: + name: hep-script-remove-bai-from-lit-authors-h6h4cmd942 + name: hep-script + schedule: '@yearly' + suspend: true diff --git a/curation-scripts/hep-script-remove-bai-from-lit-authors-h6h4cmd942-configmap.yaml b/curation-scripts/hep-script-remove-bai-from-lit-authors-h6h4cmd942-configmap.yaml new file mode 100644 index 0000000..5a0d584 --- /dev/null +++ b/curation-scripts/hep-script-remove-bai-from-lit-authors-h6h4cmd942-configmap.yaml @@ -0,0 +1,43 @@ +apiVersion: v1 +data: + script.py: | + from itertools import chain + from inspire_utils.record import get_value, get_values_for_schema + from inspirehep.curation.search_check_do import SearchCheckDo + + + class RemoveAuthorsBai(SearchCheckDo): + """Remove BAI from literature records""" + + query = 'authors.ids.schema:"INSPIRE BAI"' + + @staticmethod + def check(record, logger, state): + authors_ids = get_value(record, "authors.ids", []) + return next( + chain.from_iterable( + get_values_for_schema(author_ids, "INSPIRE BAI") + for author_ids in authors_ids + ), + False, + ) + + @staticmethod + def do(record, logger, state): + for author in record["authors"]: + author_ids = author.get("ids") + if not author_ids: + continue + new_ids = [ + id_dict for 
id_dict in author_ids if id_dict["schema"] != "INSPIRE BAI" + ] + if new_ids: + author["ids"] = new_ids + else: + del author["ids"] + + + RemoveAuthorsBai() +kind: ConfigMap +metadata: + name: hep-script-remove-bai-from-lit-authors-h6h4cmd942 diff --git a/curation-scripts/hep-script-remove-cern-aff-cooperation-agreement-54b9h8mbb6-configmap.yaml b/curation-scripts/hep-script-remove-cern-aff-cooperation-agreement-54b9h8mbb6-configmap.yaml new file mode 100644 index 0000000..c82bd15 --- /dev/null +++ b/curation-scripts/hep-script-remove-cern-aff-cooperation-agreement-54b9h8mbb6-configmap.yaml @@ -0,0 +1,63 @@ +apiVersion: v1 +data: + script.py: | + from inspire_utils.record import get_value + from inspire_utils.dedupers import dedupe_list + from inspirehep.curation.search_check_do import SearchCheckDo + + RAW_AFFS = { + "Affiliated with an institute covered by a cooperation agreement with CERN", + "Affiliated with an international laboratory covered by a cooperation" + " agreement with CERN", + "Affiliated with an Institute Covered by a Cooperation Agreement with CERN," + " Geneva, Switzerland", + } + + + def has_cooperation_agreement_raw_aff(author): + return RAW_AFFS & set( + get_value({"author": author}, "author.raw_affiliations.value", []) + ) + + + class RemoveCERNAffiliationCooperationAgreement(SearchCheckDo): + """Remove incorrect CERN aff for authors having only a cooperation agreement.""" + + query = ( + 'authors.raw_affiliations.value:"Affiliated with an institute covered by' + ' a cooperation agreement with CERN"' + ) + + @staticmethod + def check(record, logger, state): + return any( + "CERN" in get_value({"author": a}, "author.affiliations.value", []) + and has_cooperation_agreement_raw_aff(a) + for a in record.get("authors", []) + ) + + @staticmethod + def do(record, logger, state): + for author in record["authors"]: + if not has_cooperation_agreement_raw_aff(author): + continue + new_affs = [] + for aff in author["affiliations"]: + if aff["value"] == 
"CERN": + new_affs.append( + { + "value": "Unlisted", + "record": { + "$ref": "https://inspirehep.net/api/institutions/910325" + }, + } + ) + else: + new_affs.append(aff) + author["affiliations"] = dedupe_list(new_affs) + + + RemoveCERNAffiliationCooperationAgreement() +kind: ConfigMap +metadata: + name: hep-script-remove-cern-aff-cooperation-agreement-54b9h8mbb6 diff --git a/curation-scripts/hep-script-remove-cern-aff-cooperation-agreement-cronjob.yaml b/curation-scripts/hep-script-remove-cern-aff-cooperation-agreement-cronjob.yaml new file mode 100644 index 0000000..45c46ef --- /dev/null +++ b/curation-scripts/hep-script-remove-cern-aff-cooperation-agreement-cronjob.yaml @@ -0,0 +1,82 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + labels: + app.kubernetes.io/component: job + name: hep-script-remove-cern-aff-cooperation-agreement +spec: + jobTemplate: + metadata: + labels: + app.kubernetes.io/component: job + spec: + completionMode: Indexed + completions: 10 + parallelism: 10 + template: + metadata: + labels: + app.kubernetes.io/component: job + spec: + containers: + - args: + - shell + - /usr/local/src/script.py + command: + - inspirehep + env: + - name: SENTRY_ENVIRONMENT + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POSTGRESQL_HOST + value: inspire-qa-db-cluster-pooler-rw.inspire-qa.svc + - name: POSTGRESQL_PORT + value: "5432" + - name: POSTGRESQL_USER + valueFrom: + secretKeyRef: + key: user + name: postgres-inspire-pguser-hep + - name: POSTGRESQL_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: postgres-inspire-pguser-hep + - name: JOB_COMPLETIONS + value: "10" + envFrom: + - configMapRef: + name: hep-defaults + - configMapRef: + name: hep-globals + - configMapRef: + name: hep-feature-flags + - secretRef: + name: hep-creds + image: registry.cern.ch/docker.io/inspirehep/hep + name: hep + volumeMounts: + - mountPath: /usr/local/var/instance/inspirehep_api.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: 
/usr/local/var/instance/inspirehep.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /home/invenio + name: invenio-home + - mountPath: /usr/local/src/script.py + name: hep-script + subPath: script.py + restartPolicy: Never + volumes: + - configMap: + name: hep-cfg + name: hep-cfg + - emptyDir: {} + name: invenio-home + - configMap: + name: hep-script-remove-cern-aff-cooperation-agreement-54b9h8mbb6 + name: hep-script + schedule: '@yearly' + suspend: true diff --git a/curation-scripts/hep-script-remove-classifier-keywords-cronjob.yaml b/curation-scripts/hep-script-remove-classifier-keywords-cronjob.yaml new file mode 100644 index 0000000..6998895 --- /dev/null +++ b/curation-scripts/hep-script-remove-classifier-keywords-cronjob.yaml @@ -0,0 +1,82 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + labels: + app.kubernetes.io/component: job + name: hep-script-remove-classifier-keywords +spec: + jobTemplate: + metadata: + labels: + app.kubernetes.io/component: job + spec: + completionMode: Indexed + completions: 10 + parallelism: 10 + template: + metadata: + labels: + app.kubernetes.io/component: job + spec: + containers: + - args: + - shell + - /usr/local/src/script.py + command: + - inspirehep + env: + - name: SENTRY_ENVIRONMENT + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POSTGRESQL_HOST + value: inspire-qa-db-cluster-pooler-rw.inspire-qa.svc + - name: POSTGRESQL_PORT + value: "5432" + - name: POSTGRESQL_USER + valueFrom: + secretKeyRef: + key: user + name: postgres-inspire-pguser-hep + - name: POSTGRESQL_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: postgres-inspire-pguser-hep + - name: JOB_COMPLETIONS + value: "10" + envFrom: + - configMapRef: + name: hep-defaults + - configMapRef: + name: hep-globals + - configMapRef: + name: hep-feature-flags + - secretRef: + name: hep-creds + image: registry.cern.ch/docker.io/inspirehep/hep + name: hep + volumeMounts: + - mountPath: /usr/local/var/instance/inspirehep_api.cfg + 
name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /usr/local/var/instance/inspirehep.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /home/invenio + name: invenio-home + - mountPath: /usr/local/src/script.py + name: hep-script + subPath: script.py + restartPolicy: Never + volumes: + - configMap: + name: hep-cfg + name: hep-cfg + - emptyDir: {} + name: invenio-home + - configMap: + name: hep-script-remove-classifier-keywords-ddfmmfgcd4 + name: hep-script + schedule: '@yearly' + suspend: true diff --git a/curation-scripts/hep-script-remove-classifier-keywords-ddfmmfgcd4-configmap.yaml b/curation-scripts/hep-script-remove-classifier-keywords-ddfmmfgcd4-configmap.yaml new file mode 100644 index 0000000..bb5ae0e --- /dev/null +++ b/curation-scripts/hep-script-remove-classifier-keywords-ddfmmfgcd4-configmap.yaml @@ -0,0 +1,37 @@ +apiVersion: v1 +data: + script.py: | + from inspire_utils.record import get_value + from inspirehep.curation.search_check_do import SearchCheckDo + + + class RemoveClassifierKeywords(SearchCheckDo): + query = "keywords.source:classifier" + + @staticmethod + def check(record, logger, state): + if any( + keyword.get("source", "") == "classifier" + for keyword in get_value(record, "keywords", []) + ): + return True + else: + return False + + @staticmethod + def do(record, logger, state): + new_keywords = [ + keyword + for keyword in record.get("keywords", []) + if keyword.get("source", "") != "classifier" + ] + if new_keywords: + record["keywords"] = new_keywords + else: + del record["keywords"] + + + RemoveClassifierKeywords() +kind: ConfigMap +metadata: + name: hep-script-remove-classifier-keywords-ddfmmfgcd4 diff --git a/curation-scripts/hep-script-remove-curated-relation-from-authors-2t8b8h4md9-configmap.yaml b/curation-scripts/hep-script-remove-curated-relation-from-authors-2t8b8h4md9-configmap.yaml new file mode 100644 index 0000000..2883dfd --- /dev/null +++ 
b/curation-scripts/hep-script-remove-curated-relation-from-authors-2t8b8h4md9-configmap.yaml @@ -0,0 +1,70 @@ +apiVersion: v1 +data: + script.py: | + from inspire_utils.record import get_value + from inspirehep.curation.search_check_do import SearchCheckDo + + + class RemoveAuthorsCuratedRelation(SearchCheckDo): + """Remove curated_relation=True for all the authors that doesn't have record.$ref""" + + query = { + "_source": "control_number", + "query": { + "bool": { + "must": [ + { + "nested": { + "path": "authors", + "query": { + "bool": { + "must_not": { + "exists": {"field": "authors.record.$ref"} + }, + "must": { + "term": { + "authors.curated_relation": {"value": True} + } + }, + } + }, + } + }, + {"match": {"_collections": "Literature"}}, + ] + } + }, + } + + def search(self): + self.logger.info("Searching records", query=self.query) + query = ( + self.search_class() + .from_dict(self.query) + .params(_source={}, size=self.size, scroll="60m") + ) + if shard_filter := self._current_shard_filter(): + query = query.filter("script", script=shard_filter) + return query.scan() + + @staticmethod + def check(record, logger, state): + author_curated_relation_record = ( + (author.get("curated_relation"), author.get("record")) + for author in record.get("authors", []) + ) + return (True, None) in author_curated_relation_record + + @staticmethod + def do(record, logger, state): + for author in record["authors"]: + author_ref = get_value(author, "record.$ref") + curated_relation = get_value(author, "curated_relation") + if not author_ref and curated_relation: + del author["curated_relation"] + + + RemoveAuthorsCuratedRelation() +kind: ConfigMap +metadata: + name: hep-script-remove-curated-relation-from-authors-2t8b8h4md9 diff --git a/curation-scripts/hep-script-remove-curated-relation-from-authors-cronjob.yaml b/curation-scripts/hep-script-remove-curated-relation-from-authors-cronjob.yaml new file mode 100644 index 0000000..14fecd6 --- /dev/null +++ 
b/curation-scripts/hep-script-remove-curated-relation-from-authors-cronjob.yaml @@ -0,0 +1,82 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + labels: + app.kubernetes.io/component: job + name: hep-script-remove-curated-relation-from-authors +spec: + jobTemplate: + metadata: + labels: + app.kubernetes.io/component: job + spec: + completionMode: Indexed + completions: 10 + parallelism: 10 + template: + metadata: + labels: + app.kubernetes.io/component: job + spec: + containers: + - args: + - shell + - /usr/local/src/script.py + command: + - inspirehep + env: + - name: SENTRY_ENVIRONMENT + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POSTGRESQL_HOST + value: inspire-qa-db-cluster-pooler-rw.inspire-qa.svc + - name: POSTGRESQL_PORT + value: "5432" + - name: POSTGRESQL_USER + valueFrom: + secretKeyRef: + key: user + name: postgres-inspire-pguser-hep + - name: POSTGRESQL_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: postgres-inspire-pguser-hep + - name: JOB_COMPLETIONS + value: "10" + envFrom: + - configMapRef: + name: hep-defaults + - configMapRef: + name: hep-globals + - configMapRef: + name: hep-feature-flags + - secretRef: + name: hep-creds + image: registry.cern.ch/docker.io/inspirehep/hep + name: hep + volumeMounts: + - mountPath: /usr/local/var/instance/inspirehep_api.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /usr/local/var/instance/inspirehep.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /home/invenio + name: invenio-home + - mountPath: /usr/local/src/script.py + name: hep-script + subPath: script.py + restartPolicy: Never + volumes: + - configMap: + name: hep-cfg + name: hep-cfg + - emptyDir: {} + name: invenio-home + - configMap: + name: hep-script-remove-curated-relation-from-authors-2t8b8h4md9 + name: hep-script + schedule: '@yearly' + suspend: true diff --git a/curation-scripts/hep-script-remove-pdfa-extension-b8hh7gtm72-configmap.yaml 
b/curation-scripts/hep-script-remove-pdfa-extension-b8hh7gtm72-configmap.yaml new file mode 100644 index 0000000..a15e928 --- /dev/null +++ b/curation-scripts/hep-script-remove-pdfa-extension-b8hh7gtm72-configmap.yaml @@ -0,0 +1,29 @@ +apiVersion: v1 +data: + script.py: | + from inspirehep.curation.search_check_do import SearchCheckDo + + + class RemovePDFAExtension(SearchCheckDo): + """Remove ``;pdfa`` extension from filenames as it messes up CDS display.""" + + query = "documents.filename:*pdfa" + + @staticmethod + def check(record, logger, state): + return any( + f.endswith(";pdfa") for f in record.get_value("documents.filename", []) + ) + + @staticmethod + def do(record, logger, state): + extension = ";pdfa" + for document in record["documents"]: + if (filename := document.get("filename", "")).endswith(extension): + document["filename"] = filename[: -len(extension)] # drop the suffix, keep the name + + + RemovePDFAExtension() +kind: ConfigMap +metadata: + name: hep-script-remove-pdfa-extension-b8hh7gtm72 diff --git a/curation-scripts/hep-script-remove-pdfa-extension-cronjob.yaml b/curation-scripts/hep-script-remove-pdfa-extension-cronjob.yaml new file mode 100644 index 0000000..543029f --- /dev/null +++ b/curation-scripts/hep-script-remove-pdfa-extension-cronjob.yaml @@ -0,0 +1,82 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + labels: + app.kubernetes.io/component: job + name: hep-script-remove-pdfa-extension +spec: + jobTemplate: + metadata: + labels: + app.kubernetes.io/component: job + spec: + completionMode: Indexed + completions: 10 + parallelism: 10 + template: + metadata: + labels: + app.kubernetes.io/component: job + spec: + containers: + - args: + - shell + - /usr/local/src/script.py + command: + - inspirehep + env: + - name: SENTRY_ENVIRONMENT + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POSTGRESQL_HOST + value: inspire-qa-db-cluster-pooler-rw.inspire-qa.svc + - name: POSTGRESQL_PORT + value: "5432" + - name: POSTGRESQL_USER + valueFrom: + secretKeyRef: +
key: user + name: postgres-inspire-pguser-hep + - name: POSTGRESQL_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: postgres-inspire-pguser-hep + - name: JOB_COMPLETIONS + value: "10" + envFrom: + - configMapRef: + name: hep-defaults + - configMapRef: + name: hep-globals + - configMapRef: + name: hep-feature-flags + - secretRef: + name: hep-creds + image: registry.cern.ch/docker.io/inspirehep/hep + name: hep + volumeMounts: + - mountPath: /usr/local/var/instance/inspirehep_api.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /usr/local/var/instance/inspirehep.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /home/invenio + name: invenio-home + - mountPath: /usr/local/src/script.py + name: hep-script + subPath: script.py + restartPolicy: Never + volumes: + - configMap: + name: hep-cfg + name: hep-cfg + - emptyDir: {} + name: invenio-home + - configMap: + name: hep-script-remove-pdfa-extension-b8hh7gtm72 + name: hep-script + schedule: '@yearly' + suspend: true diff --git a/curation-scripts/hep-script-remove-quant-ph-ck777tfgfh-configmap.yaml b/curation-scripts/hep-script-remove-quant-ph-ck777tfgfh-configmap.yaml new file mode 100644 index 0000000..acd7edb --- /dev/null +++ b/curation-scripts/hep-script-remove-quant-ph-ck777tfgfh-configmap.yaml @@ -0,0 +1,51 @@ +apiVersion: v1 +data: + script.py: | + from inspirehep.curation.search_check_do import SearchCheckDo + from inspire_utils.record import get_value + from inspire_utils.date import PartialDate + from itertools import chain + + + DATE_BEFORE_2023 = PartialDate.loads("2023") + + + class RemoveQuantPh(SearchCheckDo): + """Remove core from quant-ph literature records""" + + query = "arxiv_eprints.categories:quant-ph core:true not _desy_bookkeeping.status:final not _desy_bookkeeping.status:printed and de < 2023" # noqa: E501 + + @staticmethod + def check(record, logger, state): + arxiv_category_quant_ph = "quant-ph" in chain.from_iterable( + get_value(record,
"arxiv_eprints.categories", []) + ) + is_core = record.get("core") + desy_bookkeeping_not_final = "final" not in get_value( # dotted path: dict.get would never match + record, "_desy_bookkeeping.status", [] + ) + desy_bookkeeping_status_not_printed = "printed" not in get_value( + record, "_desy_bookkeeping.status", [] + ) + earliest_date_before_2023 = ( + PartialDate.loads(record.earliest_date) < DATE_BEFORE_2023 + ) + return all( + [ + arxiv_category_quant_ph, + is_core, + desy_bookkeeping_not_final, + desy_bookkeeping_status_not_printed, + earliest_date_before_2023, + ] + ) + + @staticmethod + def do(record, logger, state): + del record["core"] + + + RemoveQuantPh() +kind: ConfigMap +metadata: + name: hep-script-remove-quant-ph-ck777tfgfh diff --git a/curation-scripts/hep-script-remove-quant-ph-cronjob.yaml b/curation-scripts/hep-script-remove-quant-ph-cronjob.yaml new file mode 100644 index 0000000..e09fce5 --- /dev/null +++ b/curation-scripts/hep-script-remove-quant-ph-cronjob.yaml @@ -0,0 +1,82 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + labels: + app.kubernetes.io/component: job + name: hep-script-remove-quant-ph +spec: + jobTemplate: + metadata: + labels: + app.kubernetes.io/component: job + spec: + completionMode: Indexed + completions: 10 + parallelism: 10 + template: + metadata: + labels: + app.kubernetes.io/component: job + spec: + containers: + - args: + - shell + - /usr/local/src/script.py + command: + - inspirehep + env: + - name: SENTRY_ENVIRONMENT + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POSTGRESQL_HOST + value: inspire-qa-db-cluster-pooler-rw.inspire-qa.svc + - name: POSTGRESQL_PORT + value: "5432" + - name: POSTGRESQL_USER + valueFrom: + secretKeyRef: + key: user + name: postgres-inspire-pguser-hep + - name: POSTGRESQL_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: postgres-inspire-pguser-hep + - name: JOB_COMPLETIONS + value: "10" + envFrom: + - configMapRef: + name: hep-defaults + - configMapRef: + name: hep-globals + - configMapRef: + name:
hep-feature-flags + - secretRef: + name: hep-creds + image: registry.cern.ch/docker.io/inspirehep/hep + name: hep + volumeMounts: + - mountPath: /usr/local/var/instance/inspirehep_api.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /usr/local/var/instance/inspirehep.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /home/invenio + name: invenio-home + - mountPath: /usr/local/src/script.py + name: hep-script + subPath: script.py + restartPolicy: Never + volumes: + - configMap: + name: hep-cfg + name: hep-cfg + - emptyDir: {} + name: invenio-home + - configMap: + name: hep-script-remove-quant-ph-ck777tfgfh + name: hep-script + schedule: '@yearly' + suspend: true diff --git a/curation-scripts/hep-script-remove-wrong-ipac2023-authors-cronjob.yaml b/curation-scripts/hep-script-remove-wrong-ipac2023-authors-cronjob.yaml new file mode 100644 index 0000000..8693fb3 --- /dev/null +++ b/curation-scripts/hep-script-remove-wrong-ipac2023-authors-cronjob.yaml @@ -0,0 +1,82 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + labels: + app.kubernetes.io/component: job + name: hep-script-remove-wrong-ipac2023-authors +spec: + jobTemplate: + metadata: + labels: + app.kubernetes.io/component: job + spec: + completionMode: Indexed + completions: 10 + parallelism: 10 + template: + metadata: + labels: + app.kubernetes.io/component: job + spec: + containers: + - args: + - shell + - /usr/local/src/script.py + command: + - inspirehep + env: + - name: SENTRY_ENVIRONMENT + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POSTGRESQL_HOST + value: inspire-qa-db-cluster-pooler-rw.inspire-qa.svc + - name: POSTGRESQL_PORT + value: "5432" + - name: POSTGRESQL_USER + valueFrom: + secretKeyRef: + key: user + name: postgres-inspire-pguser-hep + - name: POSTGRESQL_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: postgres-inspire-pguser-hep + - name: JOB_COMPLETIONS + value: "10" + envFrom: + - configMapRef: + name: hep-defaults + - configMapRef: + 
class RemoveWrongIPAC2023Authors(SearchCheckDo):
    """Strip the bogus author list that bad JACoW metadata put on IPAC2023 papers."""

    query = f"publication_info.cnum:{CNUM} and {AUTHOR_QUERY}"

    @staticmethod
    def check(record, logger, state):
        """Tell whether ``record`` is an IPAC2023 paper with exactly the wrong authors.

        The author names must equal ``AUTHOR_LIST`` element for element and
        in the same order; any other author list is left alone.
        """
        if CNUM not in record.get_value("publication_info.cnum", []):
            return False
        return record.get_value("authors.full_name", []) == AUTHOR_LIST

    @staticmethod
    def do(record, logger, state):
        """Delete the whole ``authors`` field from ``record``."""
        del record["authors"]
hep-script-remove-wrong-ipac2023-authors-m25648bt42 diff --git a/curation-scripts/hep-script-reorder-babar-document-versions-cronjob.yaml b/curation-scripts/hep-script-reorder-babar-document-versions-cronjob.yaml new file mode 100644 index 0000000..ace5a2a --- /dev/null +++ b/curation-scripts/hep-script-reorder-babar-document-versions-cronjob.yaml @@ -0,0 +1,82 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + labels: + app.kubernetes.io/component: job + name: hep-script-reorder-babar-document-versions +spec: + jobTemplate: + metadata: + labels: + app.kubernetes.io/component: job + spec: + completionMode: Indexed + completions: 10 + parallelism: 10 + template: + metadata: + labels: + app.kubernetes.io/component: job + spec: + containers: + - args: + - shell + - /usr/local/src/script.py + command: + - inspirehep + env: + - name: SENTRY_ENVIRONMENT + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POSTGRESQL_HOST + value: inspire-qa-db-cluster-pooler-rw.inspire-qa.svc + - name: POSTGRESQL_PORT + value: "5432" + - name: POSTGRESQL_USER + valueFrom: + secretKeyRef: + key: user + name: postgres-inspire-pguser-hep + - name: POSTGRESQL_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: postgres-inspire-pguser-hep + - name: JOB_COMPLETIONS + value: "10" + envFrom: + - configMapRef: + name: hep-defaults + - configMapRef: + name: hep-globals + - configMapRef: + name: hep-feature-flags + - secretRef: + name: hep-creds + image: registry.cern.ch/docker.io/inspirehep/hep + name: hep + volumeMounts: + - mountPath: /usr/local/var/instance/inspirehep_api.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /usr/local/var/instance/inspirehep.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /home/invenio + name: invenio-home + - mountPath: /usr/local/src/script.py + name: hep-script + subPath: script.py + restartPolicy: Never + volumes: + - configMap: + name: hep-cfg + name: hep-cfg + - emptyDir: {} + name: invenio-home + - 
# Leading "[Version N]" / "[Version N.M]" tag of a document description.
# The decimal point is escaped so that only a literal "." is accepted between
# the integer and fractional digits.
_VERSION_TAG = re.compile(r"\[Version (\d+(?:\.\d+)?)\]")


def key_func(document):
    """Return the sort key used to order a record's documents.

    Documents whose description starts with a ``[Version N]`` tag get the key
    ``("", N)`` with ``N`` as a float; all other documents get
    ``(description, 0)``. The empty string sorts first, so versioned
    documents come before the rest, in ascending version order.
    """
    description = document.get("description", "")
    match = _VERSION_TAG.match(description)
    if match:
        return ("", float(match.group(1)))
    # Anything that is not a well-formed version tag is ordered by its text.
    # An unescaped "." used to let strings such as "[Version 1x5]" match and
    # then crash in float().
    return (description, 0)


class ReorderBabarDocumentVersions(SearchCheckDo):
    """Order the documents in the Babar collections according to versions."""

    query = "documents.description:version and _collections:babar analysis documents"

    @staticmethod
    def check(record, logger, state):
        """Compute the sorted document list and report whether it differs.

        The sorted list is stored in ``state["sorted"]`` so that ``do`` can
        reuse it without sorting again.
        """
        documents = record.get("documents", [])
        state["sorted"] = sorted(documents, key=key_func)
        return state["sorted"] != documents

    @staticmethod
    def do(record, logger, state):
        """Replace the record's documents with the order computed by ``check``."""
        record["documents"] = state["sorted"]
completionMode: Indexed + completions: 10 + parallelism: 10 + template: + metadata: + labels: + app.kubernetes.io/component: job + spec: + containers: + - args: + - shell + - /usr/local/src/script.py + command: + - inspirehep + env: + - name: SENTRY_ENVIRONMENT + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POSTGRESQL_HOST + value: inspire-qa-db-cluster-pooler-rw.inspire-qa.svc + - name: POSTGRESQL_PORT + value: "5432" + - name: POSTGRESQL_USER + valueFrom: + secretKeyRef: + key: user + name: postgres-inspire-pguser-hep + - name: POSTGRESQL_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: postgres-inspire-pguser-hep + - name: JOB_COMPLETIONS + value: "10" + envFrom: + - configMapRef: + name: hep-defaults + - configMapRef: + name: hep-globals + - configMapRef: + name: hep-feature-flags + - secretRef: + name: hep-creds + image: registry.cern.ch/docker.io/inspirehep/hep + name: hep + volumeMounts: + - mountPath: /usr/local/var/instance/inspirehep_api.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /usr/local/var/instance/inspirehep.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /home/invenio + name: invenio-home + - mountPath: /usr/local/src/script.py + name: hep-script + subPath: script.py + restartPolicy: Never + volumes: + - configMap: + name: hep-cfg + name: hep-cfg + - emptyDir: {} + name: invenio-home + - configMap: + name: hep-script-replace-europhys-lett-with-epl-hb47t84mh2 + name: hep-script + schedule: '@yearly' + suspend: true diff --git a/curation-scripts/hep-script-replace-europhys-lett-with-epl-hb47t84mh2-configmap.yaml b/curation-scripts/hep-script-replace-europhys-lett-with-epl-hb47t84mh2-configmap.yaml new file mode 100644 index 0000000..93debba --- /dev/null +++ b/curation-scripts/hep-script-replace-europhys-lett-with-epl-hb47t84mh2-configmap.yaml @@ -0,0 +1,30 @@ +apiVersion: v1 +data: + script.py: | + from inspire_utils.record import get_value + from inspirehep.curation.search_check_do 
class ReplaceEurophLetterWithEPL(SearchCheckDo):
    """Replace Europhys.Lett. in pubinfo with EPL"""

    query = "j Europhys.Lett. and not j EPL"

    @staticmethod
    def check(record, logger, state):
        """Return True if any journal title equals "europhys.lett." ignoring case."""
        return any(
            journal.lower() == "europhys.lett."
            for journal in get_value(record, "publication_info.journal_title", [])
        )

    @staticmethod
    def do(record, logger, state):
        """Rename every matching ``journal_title`` in ``publication_info`` to "EPL".

        The comparison is case-insensitive, exactly like ``check``, so every
        record accepted by ``check`` is actually modified here.
        """
        for pubinfo in record["publication_info"]:
            # Entries without a journal title fall back to "" and never match.
            if pubinfo.get("journal_title", "").lower() == "europhys.lett.":
                pubinfo["journal_title"] = "EPL"
class ReplaceEurophLetterWithEPLInRefs(SearchCheckDo):
    """Rename the journal Europhys.Lett. to EPL inside a record's references."""

    query = "references.reference.publication_info.journal_title:europhys.lett."

    @staticmethod
    def check(record, logger, state):
        """Return True if any reference cites "europhys.lett." (case-insensitive)."""
        titles = get_value(
            record, "references.reference.publication_info.journal_title", []
        )
        return "europhys.lett." in (title.lower() for title in titles)

    @staticmethod
    def do(record, logger, state):
        """Set the journal title to "EPL" on every matching reference, in place."""
        for entry in record["references"]:
            title = get_value(entry, "reference.publication_info.journal_title", "")
            # References without a journal title yield "" and are skipped.
            if title.lower() != "europhys.lett.":
                continue
            entry["reference"]["publication_info"]["journal_title"] = "EPL"
class RestoreBabarRelatedRecords(SearchCheckDo):
    """Re-create the ``related_records`` of BaBar records lost during migration."""

    query = " or ".join(f'_collections:"{coll}"' for coll in BABAR_COLLECTIONS)

    @staticmethod
    def check(record, logger, state):
        """Load the legacy relations of ``record`` and tell whether any exist.

        The relations are kept in ``state["legacy_relations"]`` for ``do``.
        A warning is logged when nothing was found.
        """
        relations = get_legacy_relations(record["control_number"])
        state["legacy_relations"] = relations
        if relations:
            return True
        logger.warning("No legacy record found")
        return False

    @staticmethod
    def do(record, logger, state):
        """Add a related record and a URL for each uniquely resolved relation.

        A legacy relation is used only when its report number resolves to
        exactly one record; zero or several matches are logged and skipped.
        Both resulting lists are de-duplicated before being written back.
        """
        related = record.get("related_records", [])
        links = record.get("urls", [])
        for freetext, report_number in state["legacy_relations"]:
            candidates = find_report_numbers(report_number)
            if not candidates:
                logger.warning(
                    "No records found for report number", report_number=report_number
                )
                continue
            match_count = len(candidates)
            if match_count > 1:
                logger.warning(
                    "Multiple records found for report number",
                    report_number=report_number,
                    num_matches=match_count,
                )
                continue
            target = candidates[0]
            related.append(
                {
                    "relation_freetext": freetext,
                    "record": target,
                    "curated_relation": True,
                }
            )
            # The public URL is the API reference with the "/api" part removed.
            links.append(
                {
                    "value": target["$ref"].replace("/api", ""),
                    "description": report_number,
                }
            )
        if related:
            record["related_records"] = dedupe_list(related)
        if links:
            record["urls"] = dedupe_list(links)
name: hep-script-restore-babar-related-records-b8hcg96c74 diff --git a/curation-scripts/hep-script-restore-babar-related-records-cronjob.yaml b/curation-scripts/hep-script-restore-babar-related-records-cronjob.yaml new file mode 100644 index 0000000..59a9db0 --- /dev/null +++ b/curation-scripts/hep-script-restore-babar-related-records-cronjob.yaml @@ -0,0 +1,82 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + labels: + app.kubernetes.io/component: job + name: hep-script-restore-babar-related-records +spec: + jobTemplate: + metadata: + labels: + app.kubernetes.io/component: job + spec: + completionMode: Indexed + completions: 10 + parallelism: 10 + template: + metadata: + labels: + app.kubernetes.io/component: job + spec: + containers: + - args: + - shell + - /usr/local/src/script.py + command: + - inspirehep + env: + - name: SENTRY_ENVIRONMENT + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POSTGRESQL_HOST + value: inspire-qa-db-cluster-pooler-rw.inspire-qa.svc + - name: POSTGRESQL_PORT + value: "5432" + - name: POSTGRESQL_USER + valueFrom: + secretKeyRef: + key: user + name: postgres-inspire-pguser-hep + - name: POSTGRESQL_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: postgres-inspire-pguser-hep + - name: JOB_COMPLETIONS + value: "10" + envFrom: + - configMapRef: + name: hep-defaults + - configMapRef: + name: hep-globals + - configMapRef: + name: hep-feature-flags + - secretRef: + name: hep-creds + image: registry.cern.ch/docker.io/inspirehep/hep + name: hep + volumeMounts: + - mountPath: /usr/local/var/instance/inspirehep_api.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /usr/local/var/instance/inspirehep.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /home/invenio + name: invenio-home + - mountPath: /usr/local/src/script.py + name: hep-script + subPath: script.py + restartPolicy: Never + volumes: + - configMap: + name: hep-cfg + name: hep-cfg + - emptyDir: {} + name: invenio-home + - configMap: 
+ name: hep-script-restore-babar-related-records-b8hcg96c74 + name: hep-script + schedule: '@yearly' + suspend: true diff --git a/curation-scripts/hep-script-set-refereed-cronjob.yaml b/curation-scripts/hep-script-set-refereed-cronjob.yaml new file mode 100644 index 0000000..aa2bf70 --- /dev/null +++ b/curation-scripts/hep-script-set-refereed-cronjob.yaml @@ -0,0 +1,82 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + labels: + app.kubernetes.io/component: job + name: hep-script-set-refereed +spec: + jobTemplate: + metadata: + labels: + app.kubernetes.io/component: job + spec: + completionMode: Indexed + completions: 10 + parallelism: 10 + template: + metadata: + labels: + app.kubernetes.io/component: job + spec: + containers: + - args: + - shell + - /usr/local/src/script.py + command: + - inspirehep + env: + - name: SENTRY_ENVIRONMENT + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POSTGRESQL_HOST + value: inspire-qa-db-cluster-pooler-rw.inspire-qa.svc + - name: POSTGRESQL_PORT + value: "5432" + - name: POSTGRESQL_USER + valueFrom: + secretKeyRef: + key: user + name: postgres-inspire-pguser-hep + - name: POSTGRESQL_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: postgres-inspire-pguser-hep + - name: JOB_COMPLETIONS + value: "10" + envFrom: + - configMapRef: + name: hep-defaults + - configMapRef: + name: hep-globals + - configMapRef: + name: hep-feature-flags + - secretRef: + name: hep-creds + image: registry.cern.ch/docker.io/inspirehep/hep + name: hep + volumeMounts: + - mountPath: /usr/local/var/instance/inspirehep_api.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /usr/local/var/instance/inspirehep.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /home/invenio + name: invenio-home + - mountPath: /usr/local/src/script.py + name: hep-script + subPath: script.py + restartPolicy: Never + volumes: + - configMap: + name: hep-cfg + name: hep-cfg + - emptyDir: {} + name: invenio-home + - configMap: + name: 
hep-script-set-refereed-thm689762g + name: hep-script + schedule: '@yearly' + suspend: true diff --git a/curation-scripts/hep-script-set-refereed-thm689762g-configmap.yaml b/curation-scripts/hep-script-set-refereed-thm689762g-configmap.yaml new file mode 100644 index 0000000..be99e00 --- /dev/null +++ b/curation-scripts/hep-script-set-refereed-thm689762g-configmap.yaml @@ -0,0 +1,782 @@ +apiVersion: v1 +data: + script.py: | + from inspirehep.curation.search_check_do import SearchCheckDo + from inspirehep.curation.utils import set_refereed_and_fix_document_type + from inspire_dojson.utils import get_recid_from_ref + from inspire_utils.record import get_value + + + AFFECTED_JOURNAL_RECORDS = { + 1214577, + 1214658, + 1214562, + 1214339, + 1214397, + 1214363, + 1214774, + 1214338, + 1211677, + 1214856, + 1214594, + 1214535, + 1212465, + 1211579, + 1831498, + 1211939, + 1214248, + 1214534, + 1214599, + 1213793, + 1340773, + 1214706, + 1214406, + 1214846, + 1214444, + 1214701, + 1212397, + 1214848, + 1211841, + 1213880, + 1214608, + 1214390, + 1212993, + 1788505, + 1214386, + 1214394, + 1213914, + 1214764, + 1212437, + 1211687, + 1213232, + 1214853, + 1212932, + 1212424, + 1214416, + 1213524, + 1214692, + 1212853, + 1214729, + 1212022, + 1613954, + 1214222, + 1211879, + 1214611, + 1214062, + 1213516, + 1214530, + 1213860, + 1213909, + 1214275, + 1693113, + 1213028, + 1214355, + 1214687, + 1213925, + 1213618, + 1213138, + 1214353, + 1214382, + 1212686, + 1613997, + 1214229, + 1831236, + 1213766, + 1214854, + 2169193, + 1214389, + 1214524, + 1212828, + 1213853, + 1213505, + 1214731, + 1214425, + 1212319, + 1212223, + 1213370, + 1832964, + 1214449, + 1211926, + 1213351, + 1212714, + 1214519, + 1831883, + 1214387, + 2097389, + 1214463, + 1212837, + 1214429, + 1211679, + 1213775, + 1214697, + 1211814, + 1214166, + 1213607, + 1213662, + 1214085, + 1616534, + 2036629, + 1837011, + 1212924, + 1214626, + 1211617, + 1211901, + 1214374, + 1721862, + 1214736, + 2010043, + 1213055, 
+ 1213452, + 1214829, + 1512965, + 1214670, + 1325193, + 1213834, + 1214038, + 1212301, + 1212911, + 1212850, + 1214019, + 1213814, + 1662990, + 1213811, + 1211683, + 1384337, + 1214260, + 1214525, + 1212812, + 1735662, + 1211904, + 1214663, + 1213768, + 1212312, + 1214564, + 1213772, + 1214723, + 1211740, + 1214767, + 1212674, + 1820531, + 1214088, + 1214835, + 1214380, + 1212708, + 1212499, + 1212467, + 1214785, + 1213659, + 1214724, + 1831888, + 1214568, + 1214794, + 1831848, + 1214336, + 1214396, + 1214532, + 1212742, + 1214533, + 1214605, + 1212332, + 1211852, + 1213207, + 1214704, + 1212067, + 1214140, + 1692014, + 1212318, + 1213027, + 1213778, + 1214677, + 1749978, + 1831886, + 1211928, + 1212464, + 1212337, + 1212658, + 1214789, + 1620406, + 1213737, + 1214538, + 1213006, + 1214206, + 1861062, + 1214514, + 1214537, + 1214679, + 1613979, + 1834798, + 1212899, + 1213503, + 1213520, + 1214744, + 1212236, + 1213906, + 1214617, + 1214711, + 1212381, + 1212573, + 1212435, + 1214259, + 1214321, + 1214770, + 1214816, + 1212695, + 1212902, + 1213212, + 1213456, + 2170070, + 1212272, + 1212602, + 1212669, + 1213654, + 1214249, + 1613961, + 1212546, + 1214512, + 1211785, + 1212344, + 1213902, + 1214210, + 1214421, + 1214574, + 1834797, + 1389229, + 1213755, + 1214809, + 1214864, + 1212908, + 1214369, + 1214656, + 1738617, + 2036631, + 1212401, + 1214204, + 1214263, + 1212886, + 1214304, + 1214881, + 1613962, + 1613980, + 1213092, + 1212868, + 1213640, + 1831887, + 1211781, + 1213323, + 1214800, + 1211633, + 1212507, + 1214063, + 1211854, + 1212589, + 1212841, + 1214409, + 1214427, + 1848511, + 1214782, + 1212191, + 1214531, + 1473010, + 1624117, + 1212194, + 1213227, + 1214003, + 1662991, + 1212744, + 1212915, + 1214225, + 1214381, + 1214553, + 1381587, + 1212941, + 1213091, + 1214435, + 1214559, + 1214722, + 1214784, + 1214823, + 1341225, + 1613943, + 1621545, + 1211794, + 1212720, + 1214468, + 1214699, + 1214751, + 1452947, + 1211655, + 1211724, + 1213610, + 
1213747, + 1213777, + 1665636, + 1211710, + 1212703, + 1212789, + 1213322, + 1214160, + 1613978, + 1211808, + 1211834, + 1212692, + 1212725, + 1212778, + 1213926, + 1214351, + 1214485, + 1214596, + 1214874, + 1333336, + 1212478, + 1212510, + 1214081, + 1214209, + 1214358, + 1214592, + 1214876, + 1245675, + 2056533, + 1211728, + 1213100, + 1213249, + 1214169, + 1610403, + 1738919, + 1211735, + 1211875, + 1212330, + 1212918, + 1214132, + 1214542, + 1214552, + 1214702, + 1736443, + 2070193, + 1212814, + 1213338, + 1213940, + 1214569, + 1211639, + 1212845, + 1212893, + 1213291, + 1213470, + 1214756, + 1305561, + 1482183, + 1621540, + 1211839, + 1213588, + 1213968, + 1214637, + 1214645, + 1636881, + 1729026, + 1211572, + 1212167, + 1212307, + 1212400, + 1213054, + 1213310, + 1214245, + 1214376, + 1214430, + 1214606, + 1214640, + 1211643, + 1213458, + 1213901, + 1214110, + 1214144, + 1214182, + 1214310, + 1214585, + 1214633, + 1214824, + 1622680, + 2155427, + 1212014, + 1212522, + 1212563, + 1212746, + 1212752, + 1213279, + 1213375, + 1213521, + 1212605, + 1212790, + 1212857, + 1213402, + 1213852, + 1214607, + 1621535, + 1831985, + 1211600, + 1212362, + 1212863, + 1213025, + 1213174, + 1213235, + 1213275, + 1214109, + 1214177, + 1214220, + 1214411, + 1214493, + 1214798, + 1621537, + 1698825, + 1701689, + 2016165, + 1211911, + 1212519, + 1212785, + 1212793, + 1213738, + 1213915, + 1214541, + 1214649, + 1214661, + 1621052, + 1735923, + 1211671, + 1212293, + 1212393, + 1212461, + 1212528, + 1836989, + 1212923, + 1212974, + 1214023, + 1214520, + 1212212, + 1212604, + 1212969, + 1834796, + 1213475, + 1213549, + 1213552, + 1214631, + 1214683, + 2010048, + 1212866, + 1214262, + 1341244, + 1212490, + 1214448, + 1478867, + 1826529, + 2064556, + 1211668, + 1211703, + 1214247, + 1213535, + 1214133, + 1213302, + 1211635, + 1213107, + 1212659, + 1212883, + 1613942, + 1212947, + 1213086, + 1214440, + 1214625, + 1211598, + 1213250, + 1212141, + 1341227, + 1214077, + 1214672, + 1214857, 
+ 1729006, + 1418292, + 1212356, + 1213936, + 1212491, + 1243781, + 1214860, + 1622642, + 1643448, + 1341246, + 1214180, + 1214453, + 1385703, + 1717429, + 1214084, + 1341239, + 1613993, + 1717427, + 1212940, + 1213160, + 1386772, + 1213687, + 1722610, + 1211919, + 1613992, + 1213301, + 1213551, + 1653878, + 1589962, + 1211801, + 1682163, + 1212326, + 1211725, + 1212480, + 1421580, + 1214093, + 1212230, + 1211690, + 1212912, + 1213997, + 1212576, + 1311535, + 1617967, + 2637072, + 1214779, + 1613996, + 1214345, + 1214719, + 1613991, + 1213879, + 1214738, + 1214707, + 1214850, + 1613955, + 1214361, + 1214513, + 1214474, + 1613941, + 1213547, + 1214414, + 1214456, + 1214746, + 1212386, + 1214796, + 1214806, + 1212411, + 1214648, + 1214747, + 1212907, + 1213518, + 1214484, + 1214842, + 1213892, + 1214862, + 1214771, + 1212881, + 1213318, + 1613956, + 1214737, + 1214754, + 1214146, + 1214293, + 1212423, + 1214732, + 1214858, + 1212966, + 1214555, + 1211605, + 1212474, + 1214745, + 1212237, + 1214660, + 1214415, + 1214735, + 1214817, + 1214354, + 1213635, + 1214720, + 1623522, + 1211797, + 1214712, + 1214788, + 1213493, + 1615419, + 1214885, + 1213818, + 1212320, + 1214861, + 1214119, + 1213620, + 1213324, + 1214479, + 1211986, + 1615421, + 1214384, + 1214391, + 1213878, + 1213140, + 1211848, + 1613959, + 1212635, + 1668220, + 1214849, + 1213074, + 1214558, + 1213713, + 1212560, + 1212463, + 1214825, + 1615550, + 1212846, + 1613990, + 1214028, + 1214402, + 1212288, + 1213438, + 1214418, + 1211793, + 1713659, + 1214550, + 1423496, + 1212957, + 1213601, + 1212562, + 1212367, + 1212834, + 1613957, + 1214311, + 1212422, + 1212786, + 1212405, + 1212729, + 1212771, + 1212818, + 1214852, + 1613960, + 1212434, + 1213624, + 1213482, + 1212050, + 1212310, + 1214515, + 1512641, + 1613958, + 1212891, + 1212847, + 1213014, + 1213134, + 1213807, + 1212994, + 1214308, + 1613940, + 1214855, + 1214446, + 1212820, + 1214781, + 1214509, + 1415879, + 1365972, + 1214476, + 1214078, + 
class SetRefereed(SearchCheckDo):
    """Mark selected records as `refereed` and adjust their `document_type`."""

    # Raw search body: a nested query on ``publication_info`` that selects the
    # records linked to one of the affected journal records.
    query = {
        "query": {
            "nested": {
                "path": "publication_info",
                "query": {
                    "terms": {
                        "publication_info.journal_record.$ref": list(
                            AFFECTED_JOURNAL_RECORDS
                        )
                    }
                },
            }
        }
    }

    def search(self):
        """Run the raw dictionary query and return a scan over all hits.

        When ``_current_shard_filter()`` returns something truthy, it is
        applied as a script filter before scanning.
        """
        self.logger.info("Searching records", query=self.query)
        search = self.search_class().from_dict(self.query)
        search = search.params(_source={}, size=self.size, scroll="60m")
        shard_filter = self._current_shard_filter()
        if shard_filter:
            search = search.filter("script", script=shard_filter)
        return search.scan()

    @staticmethod
    def check(record, logger, state):
        """Return the affected journal recids that ``record`` links to.

        The result is a set, so it is falsy when the record references none
        of the affected journals.
        """
        journal_refs = get_value(record, "publication_info.journal_record", [])
        return AFFECTED_JOURNAL_RECORDS.intersection(
            int(get_recid_from_ref(ref)) for ref in journal_refs
        )

    @staticmethod
    def do(record, logger, state):
        """Delegate the actual update to ``set_refereed_and_fix_document_type``."""
        set_refereed_and_fix_document_type(record)
class AddSnowmassProceedingsURL(SearchCheckDo):
    """Attach the Snowmass eConf Proceedings website link to matching records."""

    query = f"publication_info.cnum:{ECONF_CNUM} -urls.description:{ECONF_DESCRIPTION}"

    @staticmethod
    def check(record, logger, state):
        """Return True for Snowmass records that do not carry the eConf link yet.

        The URL descriptions already on the record are logged first.
        """
        descriptions = record.get_value("urls.description", [])
        logger.info("URLs in record", urls=descriptions)
        already_linked = ECONF_DESCRIPTION in descriptions
        # The conference numbers are only looked up when the link is missing.
        return not already_linked and ECONF_CNUM in record.get_value(
            "publication_info.cnum", []
        )

    @staticmethod
    def do(record, logger, state):
        """Append the eConf URL entry, creating ``urls`` if the record has none."""
        link = {
            "value": ECONF_URL,
            "description": ECONF_DESCRIPTION,
        }
        record.setdefault("urls", []).append(link)
+ value: "5432" + - name: POSTGRESQL_USER + valueFrom: + secretKeyRef: + key: user + name: postgres-inspire-pguser-hep + - name: POSTGRESQL_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: postgres-inspire-pguser-hep + - name: JOB_COMPLETIONS + value: "10" + envFrom: + - configMapRef: + name: hep-defaults + - configMapRef: + name: hep-globals + - configMapRef: + name: hep-feature-flags + - secretRef: + name: hep-creds + image: registry.cern.ch/docker.io/inspirehep/hep + name: hep + volumeMounts: + - mountPath: /usr/local/var/instance/inspirehep_api.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /usr/local/var/instance/inspirehep.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /home/invenio + name: invenio-home + - mountPath: /usr/local/src/script.py + name: hep-script + subPath: script.py + restartPolicy: Never + volumes: + - configMap: + name: hep-cfg + name: hep-cfg + - emptyDir: {} + name: invenio-home + - configMap: + name: hep-script-snowmass-add-link-7hgkd2h64k + name: hep-script + schedule: '@yearly' + suspend: true diff --git a/curation-scripts/hep-script-test-5c66cgkk94-configmap.yaml b/curation-scripts/hep-script-test-5c66cgkk94-configmap.yaml new file mode 100644 index 0000000..706c896 --- /dev/null +++ b/curation-scripts/hep-script-test-5c66cgkk94-configmap.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +data: + script.py: | + print("Everything works fine!") +kind: ConfigMap +metadata: + name: hep-script-test-5c66cgkk94 diff --git a/curation-scripts/hep-script-test-cronjob.yaml b/curation-scripts/hep-script-test-cronjob.yaml new file mode 100644 index 0000000..dcb7588 --- /dev/null +++ b/curation-scripts/hep-script-test-cronjob.yaml @@ -0,0 +1,82 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + labels: + app.kubernetes.io/component: job + name: hep-script-test +spec: + jobTemplate: + metadata: + labels: + app.kubernetes.io/component: job + spec: + completionMode: Indexed + completions: 10 + parallelism: 10 + template: + 
metadata: + labels: + app.kubernetes.io/component: job + spec: + containers: + - args: + - shell + - /usr/local/src/script.py + command: + - inspirehep + env: + - name: SENTRY_ENVIRONMENT + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POSTGRESQL_HOST + value: inspire-qa-db-cluster-pooler-rw.inspire-qa.svc + - name: POSTGRESQL_PORT + value: "5432" + - name: POSTGRESQL_USER + valueFrom: + secretKeyRef: + key: user + name: postgres-inspire-pguser-hep + - name: POSTGRESQL_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: postgres-inspire-pguser-hep + - name: JOB_COMPLETIONS + value: "10" + envFrom: + - configMapRef: + name: hep-defaults + - configMapRef: + name: hep-globals + - configMapRef: + name: hep-feature-flags + - secretRef: + name: hep-creds + image: registry.cern.ch/docker.io/inspirehep/hep + name: hep + volumeMounts: + - mountPath: /usr/local/var/instance/inspirehep_api.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /usr/local/var/instance/inspirehep.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /home/invenio + name: invenio-home + - mountPath: /usr/local/src/script.py + name: hep-script + subPath: script.py + restartPolicy: Never + volumes: + - configMap: + name: hep-cfg + name: hep-cfg + - emptyDir: {} + name: invenio-home + - configMap: + name: hep-script-test-5c66cgkk94 + name: hep-script + schedule: '@yearly' + suspend: true diff --git a/curation-scripts/hep-script-update-journal-title-for-aps-physcis-cronjob.yaml b/curation-scripts/hep-script-update-journal-title-for-aps-physcis-cronjob.yaml new file mode 100644 index 0000000..debb5be --- /dev/null +++ b/curation-scripts/hep-script-update-journal-title-for-aps-physcis-cronjob.yaml @@ -0,0 +1,82 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + labels: + app.kubernetes.io/component: job + name: hep-script-update-journal-title-for-aps-physcis +spec: + jobTemplate: + metadata: + labels: + app.kubernetes.io/component: job + spec: + 
completionMode: Indexed + completions: 10 + parallelism: 10 + template: + metadata: + labels: + app.kubernetes.io/component: job + spec: + containers: + - args: + - shell + - /usr/local/src/script.py + command: + - inspirehep + env: + - name: SENTRY_ENVIRONMENT + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POSTGRESQL_HOST + value: inspire-qa-db-cluster-pooler-rw.inspire-qa.svc + - name: POSTGRESQL_PORT + value: "5432" + - name: POSTGRESQL_USER + valueFrom: + secretKeyRef: + key: user + name: postgres-inspire-pguser-hep + - name: POSTGRESQL_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: postgres-inspire-pguser-hep + - name: JOB_COMPLETIONS + value: "10" + envFrom: + - configMapRef: + name: hep-defaults + - configMapRef: + name: hep-globals + - configMapRef: + name: hep-feature-flags + - secretRef: + name: hep-creds + image: registry.cern.ch/docker.io/inspirehep/hep + name: hep + volumeMounts: + - mountPath: /usr/local/var/instance/inspirehep_api.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /usr/local/var/instance/inspirehep.cfg + name: hep-cfg + subPath: inspirehep.cfg + - mountPath: /home/invenio + name: invenio-home + - mountPath: /usr/local/src/script.py + name: hep-script + subPath: script.py + restartPolicy: Never + volumes: + - configMap: + name: hep-cfg + name: hep-cfg + - emptyDir: {} + name: invenio-home + - configMap: + name: hep-script-update-journal-title-for-aps-physcis-t5474b7fgb + name: hep-script + schedule: '@yearly' + suspend: true diff --git a/curation-scripts/hep-script-update-journal-title-for-aps-physcis-t5474b7fgb-configmap.yaml b/curation-scripts/hep-script-update-journal-title-for-aps-physcis-t5474b7fgb-configmap.yaml new file mode 100644 index 0000000..88b8d61 --- /dev/null +++ b/curation-scripts/hep-script-update-journal-title-for-aps-physcis-t5474b7fgb-configmap.yaml @@ -0,0 +1,58 @@ +apiVersion: v1 +data: + script.py: | + from inspire_utils.record import get_value + from 
class UpdateJournalTitleForApsPhysics(SearchCheckDo):
    """Rename the journal title `Physics` to `APS Physics`.

    Only records whose acquisition source is `APS` are modified.
    """

    # Raw query body: both match clauses must hold.
    query = {
        "_source": "control_number",
        "query": {
            "bool": {
                "must": [
                    {"match": {"journal_title_variants": "Physics"}},
                    {"match": {"acquisition_source.source": "APS"}},
                ]
            }
        },
    }

    def search(self):
        """Build the search from the raw ``query`` dict and scan it.

        Returns whatever ``scan()`` yields for the configured search class.
        """
        self.logger.info("Searching records", query=self.query)
        es_search = self.search_class().from_dict(self.query)
        es_search = es_search.params(
            _source={}, size=self.size, scroll="60m"
        )
        # NOTE(review): presumably restricts this job to its own slice of
        # the results when several completions run in parallel; confirm
        # against SearchCheckDo._current_shard_filter.
        shard_filter = self._current_shard_filter()
        if shard_filter:
            es_search = es_search.filter("script", script=shard_filter)
        return es_search.scan()

    @staticmethod
    def check(record, logger, state):
        """Return True for APS-sourced records with a `Physics` journal title."""
        titles = [
            entry.get("journal_title")
            for entry in record.get("publication_info", [])
        ]
        if "Physics" not in titles:
            return False
        return get_value(record, "acquisition_source.source") == "APS"

    @staticmethod
    def do(record, logger, state):
        """Rewrite every `Physics` journal title in place to `APS Physics`."""
        for entry in record["publication_info"]:
            if entry.get("journal_title") != "Physics":
                continue
            entry["journal_title"] = "APS Physics"