Merge pull request #2013 from microbiomedata/2011-queries-to-delete-l…

…egacy-orphan-data-objects-and-functional-annotation-agg-records Delete queries for orphan functional_annotation_agg records and id non-conforming Data Objects
microbiomedata · Jul 24, 2024 · 2e5f5da · 2e5f5da
2 parents 05ced17 + ad5c50d
commit 2e5f5da
Show file tree

Hide file tree

Showing 5 changed files with 81 additions and 1 deletion.
diff --git a/...ts/mongodb_queries/data_qc/metabolomics_analysis_acrtivity_has_calibration_data_object.js b/...ts/mongodb_queries/data_qc/metabolomics_analysis_acrtivity_has_calibration_data_object.js
@@ -1,4 +1,4 @@
-db.getCollection('nom_analysis_activity_set').aggregate(
+db.getCollection('metabolomics_analysis_activity_set').aggregate(
 [
 {
 $match: { has_calibration: { $ne: null } }

diff --git a/...db_queries/data_qc/orphan_data_objects.js → ...flow_activities_has_output_data_object.js b/...db_queries/data_qc/orphan_data_objects.js → ...flow_activities_has_output_data_object.js
diff --git a/assets/mongodb_queries/find_id_nonconforming_data_objects.js b/assets/mongodb_queries/find_id_nonconforming_data_objects.js
@@ -0,0 +1,12 @@
+// Return the data_object_set documents whose `id` does NOT match the
+// pattern 'nmdc:dobj-'.
+// NOTE(review): the pattern is unanchored, so an id that merely contains
+// 'nmdc:dobj-' somewhere in the string is treated as conforming — confirm
+// whether '^nmdc:dobj-' was intended.
+db.getCollection('data_object_set').aggregate(
+  [
+    {
+      $match: {
+        id: {
+          $not: { $regex: RegExp('nmdc:dobj-') }
+        }
+      }
+    }
+  ],
+  // Options: 60000 ms time limit for the operation; allowDiskUse enabled.
+  { maxTimeMS: 60000, allowDiskUse: true }
+);
diff --git a/assets/mongodb_queries/find_orphan_legacy_id_functional_annotation_agg.js b/assets/mongodb_queries/find_orphan_legacy_id_functional_annotation_agg.js
@@ -0,0 +1,26 @@
+// Report functional_annotation_agg records whose metagenome_annotation_id
+// has no matching document in metagenome_annotation_activity_set, with a
+// count of such records per metagenome_annotation_id.
+db.getCollection(
+  'functional_annotation_agg'
+).aggregate(
+  [
+    // Join each record to the annotation activities whose `id` equals the
+    // record's metagenome_annotation_id.
+    {
+      $lookup: {
+        from: 'metagenome_annotation_activity_set',
+        localField: 'metagenome_annotation_id',
+        foreignField: 'id',
+        as: 'annotation_activities'
+      }
+    },
+    // Keep only records for which the join produced an empty array.
+    {
+      $match: {
+        annotation_activities: { $size: 0 }
+      }
+    },
+    // Collapse to one row per metagenome_annotation_id and count the records.
+    {
+      $group: {
+        _id: '$metagenome_annotation_id',
+        count: { $sum: 1 }
+      }
+    }
+  ],
+  // Options: 90000 ms time limit for the operation; allowDiskUse enabled.
+  { maxTimeMS: 90000, allowDiskUse: true }
+);
diff --git a/nmdc_schema/create_legacy_orphan_deletion_requests.py b/nmdc_schema/create_legacy_orphan_deletion_requests.py
@@ -0,0 +1,42 @@
+import json
+import csv
+
+
+def gen_delete(delete_list: list, key: str, limit: int, del_coll: str):
+    """
+    Write a delete request body for the Runtime API's `/queries:run` endpoint.
+
+    One delete statement of the form ``{"q": {key: <entry>}, "limit": limit}``
+    is built per entry of ``delete_list`` (each entry is formatted with
+    ``"%s"``). The body ``{"delete": del_coll, "deletes": [...]}`` is then
+    dumped as JSON to the file ``<del_coll>_request_body.json``.
+
+    Raises:
+        ValueError: if ``limit`` is neither 0 nor 1.
+    """
+    if limit != 0 and limit != 1:
+        raise ValueError("limit must be 0 or 1.")
+    statements = [{"q": {key: "%s" % entry}, "limit": limit} for entry in delete_list]
+    body = {"delete": del_coll, "deletes": statements}
+    out_path = del_coll + "_request_body.json"
+    with open(out_path, "w") as out_file:
+        json.dump(body, out_file)
+
+
+# Build the request body that deletes legacy orphan functional_annotation_agg
+# records: the first column of every CSV row is matched against
+# metagenome_annotation_id, and each delete statement uses limit 0.
+del_coll = "functional_annotation_agg"
+with open("to_delete_nmdc.functional_annotation_agg.csv", newline="") as csvfile:
+    agg_records = csv.reader(csvfile, delimiter=",")
+    delete_agg_list = [row[0] for row in agg_records]
+gen_delete(delete_agg_list, "metagenome_annotation_id", 0, del_coll)
+
+# Build the request body that deletes legacy orphan data_object_set records.
+# The id (second CSV column) is matched with $regex instead of equality, as a
+# workaround for newline and double-backslash issues in the ids.
+# NOTE(review): the id is used as an unescaped, unanchored pattern — confirm
+# it cannot match a different data object's id.
+request_body_file = "data_object_set_request_body.json"
+delete_do_list = []
+with open("legacy_orphan.nmdc.data_object_set.csv", newline="") as csvfile:
+    do_records = csv.reader(csvfile, delimiter=",")
+    for row in do_records:
+        # Remove every literal backslash-n character pair from the id.
+        strip_id = row[1].replace("\\n", "")
+        print(strip_id)
+        statement = {"q": {"id": {"$regex": strip_id}}, "limit": 1}
+        delete_do_list.append(statement)
+    request_body_json = {"delete": "data_object_set", "deletes": delete_do_list}
+    with open(request_body_file, "w") as f:
+        json.dump(request_body_json, f)