Skip to content

Commit

Permalink
Merge pull request #2013 from microbiomedata/2011-queries-to-delete-l…
Browse files Browse the repository at this point in the history
…egacy-orphan-data-objects-and-functional-annotation-agg-records

Delete queries for orphan functional_annotation_agg records and ID-nonconforming Data Objects
  • Loading branch information
mbthornton-lbl authored Jul 24, 2024
2 parents 05ced17 + ad5c50d commit 2e5f5da
Show file tree
Hide file tree
Showing 5 changed files with 81 additions and 1 deletion.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
db.getCollection('nom_analysis_activity_set').aggregate(
db.getCollection('metabolomics_analysis_activity_set').aggregate(
[
{
$match: { has_calibration: { $ne: null } }
Expand Down
12 changes: 12 additions & 0 deletions assets/mongodb_queries/find_id_nonconforming_data_objects.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// Find every document in `data_object_set` whose `id` does not begin with the
// `nmdc:dobj-` prefix, i.e. data objects with non-conforming identifiers.
//
// The pattern is anchored with `^` so that an id which merely contains
// `nmdc:dobj-` somewhere after its first character is still reported as
// non-conforming.
// NOTE(review): per MongoDB's `$not` semantics, documents that have no `id`
// field at all are expected to be returned as well - confirm this is wanted.
db.getCollection('data_object_set').aggregate(
  [
    {
      $match: {
        id: {
          $not: { $regex: /^nmdc:dobj-/ }
        }
      }
    }
  ],
  // Give up after 60 seconds; permit temporary disk files for large results.
  { maxTimeMS: 60000, allowDiskUse: true }
);
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
// List the `metagenome_annotation_id` values used by `functional_annotation_agg`
// records that have no matching document in `metagenome_annotation_activity_set`,
// together with how many aggregation records carry each such id.
db.getCollection("functional_annotation_agg").aggregate(
  [
    // Stage 1: attach the annotation activities whose `id` equals this
    // record's `metagenome_annotation_id`.
    {
      "$lookup": {
        "from": "metagenome_annotation_activity_set",
        "localField": "metagenome_annotation_id",
        "foreignField": "id",
        "as": "annotation_activities"
      }
    },
    // Stage 2: keep only the records for which the lookup found nothing.
    { "$match": { "annotation_activities": { "$size": 0 } } },
    // Stage 3: collapse to one row per annotation id with a record count.
    {
      "$group": {
        "_id": "$metagenome_annotation_id",
        "count": { "$sum": 1 }
      }
    }
  ],
  // Give up after 90 seconds; permit temporary disk files for large results.
  { "maxTimeMS": 90000, "allowDiskUse": true }
);
42 changes: 42 additions & 0 deletions nmdc_schema/create_legacy_orphan_deletion_requests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import csv
import json
import re


def gen_delete(delete_list: list, key: str, limit: int, del_coll: str) -> None:
    """Write a delete request body for the Runtime API's `/queries:run` endpoint.

    One delete statement is built per entry of *delete_list*; each statement
    matches documents whose *key* field equals that entry. The resulting
    request body is dumped as JSON to ``<del_coll>_request_body.json`` in the
    current working directory, overwriting any existing file of that name.

    Args:
        delete_list: Values to match against *key*. Each value is converted
            to a string with ``str`` before being placed in the query.
        key: Name of the document field used in each statement's query.
        limit: Copied into every statement's ``"limit"``; must be 0 or 1.
            NOTE(review): presumably follows MongoDB's delete command, where
            0 removes all matches and 1 removes at most one - confirm.
        del_coll: Name of the collection to delete from; also used as the
            prefix of the output file name.

    Raises:
        ValueError: If *limit* is neither 0 nor 1.
    """
    if limit not in (0, 1):
        raise ValueError("limit must be 0 or 1.")
    # str() rather than "%s" % record: the %-operator would treat a tuple
    # record as a list of format arguments instead of a single value.
    delete_statement = [
        {"q": {key: str(record)}, "limit": limit} for record in delete_list
    ]
    request_body_json = {"delete": del_coll, "deletes": delete_statement}
    request_body_file = del_coll + "_request_body.json"
    with open(request_body_file, "w", encoding="utf-8") as f:
        json.dump(request_body_json, f)


# Delete legacy orphan functional_annotation_agg records.
# The first column of each CSV row is used as the `metagenome_annotation_id`
# to delete; limit 0 is passed through to every generated delete statement.
with open("to_delete_nmdc.functional_annotation_agg.csv", newline="") as csvfile:
    agg_records = csv.reader(csvfile, delimiter=",")
    # csv.reader yields an empty list for a blank line; skip those rows
    # instead of failing with IndexError.
    delete_agg_list = [row[0] for row in agg_records if row]
del_coll = "functional_annotation_agg"
gen_delete(delete_agg_list, "metagenome_annotation_id", 0, del_coll)

# Delete legacy orphan data_object_set records.
# The id is taken from the second CSV column. A `$regex` query is used instead
# of an exact match as a workaround for newline / double-backslash issues in
# the ids: the literal two-character sequence `\n` is removed from the CSV
# value and the remainder is matched as a substring of the stored id.
delete_do_list = []
with open("legacy_orphan.nmdc.data_object_set.csv", newline="") as csvfile:
    do_records = csv.reader(csvfile, delimiter=",")
    for row in do_records:
        if not row:
            # Blank line in the CSV; nothing to delete.
            continue
        strip_id = row[1].replace("\\n", "")
        print(strip_id)
        # Escape regex metacharacters so that characters such as "." in an id
        # are matched literally and cannot select a different document.
        delete_do_list.append(
            {"q": {"id": {"$regex": re.escape(strip_id)}}, "limit": 1}
        )
request_body_json = {"delete": "data_object_set", "deletes": delete_do_list}
request_body_file = "data_object_set_request_body.json"
with open(request_body_file, "w", encoding="utf-8") as f:
    json.dump(request_body_json, f)

0 comments on commit 2e5f5da

Please sign in to comment.