
Commit

updated reconcile.py and mappings
Vedanth-Ramji committed Feb 28, 2024
1 parent eeaaf31 commit 765df84
Showing 20 changed files with 121,458 additions and 9,735 deletions.
7 changes: 1 addition & 6 deletions db_harmonisation/README.md
@@ -17,9 +17,4 @@ The approach used here is:

conda env create -f env.yml
conda activate crude_harmonisation
bash crude_db_harmonisation.sh

Output mapping can be found at `resfinder_ncbi_ARO_mapping.tsv`

- 93.1% of ResFinder entries with trivial mapping to ARO using homology models (218 entries missing).
- 80.7% of NCBI entries with trivial mapping to ARO using homology models (1260 entries missing).
bash crude_db_harmonisation.sh
7 changes: 7 additions & 0 deletions db_harmonisation/clean_ncbi.py
@@ -0,0 +1,7 @@
from Bio import SeqIO
from Bio.Seq import Seq

with open('./dbs/ncbi_amr.faa') as original, open('./dbs/ncbi_amr_corrected.faa', 'w') as corrected:
for record in SeqIO.parse('./dbs/ncbi_amr.faa', 'fasta'):
record.seq = Seq(str(record.seq).replace("*", ""))
SeqIO.write(record, corrected, 'fasta')
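
The new clean_ncbi.py strips `*` (stop-codon) characters from the NCBI AMR protein FASTA so the file can be passed to `rgi main -t protein` in the shell script below. A minimal sanity check along these lines (illustrative only, not part of the commit; it assumes the same `./dbs/` paths and a Biopython install) confirms the cleaning removed the `*` characters without dropping any records:

from Bio import SeqIO

# Parse both files; paths mirror those used in clean_ncbi.py.
original = list(SeqIO.parse('./dbs/ncbi_amr.faa', 'fasta'))
corrected = list(SeqIO.parse('./dbs/ncbi_amr_corrected.faa', 'fasta'))

# Same number of records, and no '*' left in any corrected sequence.
assert len(original) == len(corrected), "record count changed during cleaning"
assert all('*' not in str(rec.seq) for rec in corrected), "'*' still present"
print(f"{len(corrected)} NCBI protein records cleaned")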
13 changes: 10 additions & 3 deletions db_harmonisation/crude_db_harmonisation.sh
@@ -34,8 +34,8 @@ wget -c -O dbs/card.tar.bz2 https://card.mcmaster.ca/latest/data
tar -xvf dbs/card.tar.bz2 -C dbs
ls -hl dbs

# Get megares db
wget -c -O dbs/megares.fna https://megares.meglab.org/download/megares_v2.00/megares_full_database_v2.00.fasta
# Get megares db: LINK NOT WORKING
# wget -c -O dbs/megares.fna https://megares.meglab.org/download/megares_v2.00/megares_full_database_v2.00.fasta

# Get argannot db
wget -c -O dbs/argannot.fna https://raw.githubusercontent.com/tseemann/abricate/master/db/argannot/sequences
@@ -47,7 +47,8 @@ rgi load -i dbs/card.json
mkdir -p mapping

rgi main -i dbs/resfinder.fna -o mapping/resfinder_rgi -t contig -a BLAST --clean --include_loose
rgi main -i dbs/ncbi_amr.faa -o mapping/ncbi_rgi -t protein -a BLAST --clean --include_loose
python clean_ncbi.py
rgi main -i dbs/ncbi_amr_corrected.faa -o mapping/ncbi_rgi -t protein -a BLAST --clean --include_loose
rgi main -i dbs/sarg.faa -o mapping/sarg_rgi -t protein -a BLAST --clean --include_loose
rgi main -i dbs/resfinder_fg.faa -o mapping/resfinder_fg_rgi -t protein -a BLAST --clean --include_loose
rgi main -i dbs/deeparg.faa -o mapping/deeparg_rgi -t protein -a BLAST --clean --include_loose
@@ -62,3 +63,9 @@ python reconcile.py -f dbs/resfinder_fg.faa -r mapping/resfinder_fg_rgi.txt -d r
python reconcile.py -f dbs/deeparg.faa -r mapping/deeparg_rgi.txt -d deeparg
python reconcile.py -f dbs/argannot.fna -r mapping/argannot_rgi.txt -d argannot
python reconcile.py -f dbs/megares.fna -r mapping/megares_rgi.txt -d megares

# Clean up
mv {megares,ncbi,resfinder,argannot,sarg,deeparg,resfinder_fg}_ARO_mapping.tsv mapping
rm -rf ResFinderFG
rm -rf deeparg-largerepo
rm ./mapping/*.json
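
Each reconcile.py call now writes a `<database>_ARO_mapping.tsv`, and the new clean-up step moves them into `mapping/`. As a rough illustration (not part of the repository), the per-database tables can be combined into one frame; the column names ('Original ID', 'Gene Name in CARD', 'ARO', 'Database') follow the reconcile.py output shown further down:

import glob
import pandas as pd

# Read every per-database mapping TSV written by reconcile.py and moved to mapping/.
frames = [pd.read_csv(path, sep='\t') for path in sorted(glob.glob('mapping/*_ARO_mapping.tsv'))]
combined = pd.concat(frames, ignore_index=True)

# One row per database entry that RGI mapped to an ARO accession.
print(combined['Database'].value_counts())
combined.to_csv('combined_ARO_mapping.tsv', sep='\t', index=False)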
The following large files were added in this commit (diffs not rendered by default):

15,736 changes: 15,736 additions & 0 deletions db_harmonisation/dbs/megares.fasta
2,225 changes: 2,225 additions & 0 deletions db_harmonisation/mapping/argannot_ARO_mapping.tsv
2,225 changes: 2,225 additions & 0 deletions db_harmonisation/mapping/argannot_rgi.txt
12,268 changes: 12,268 additions & 0 deletions db_harmonisation/mapping/deeparg_ARO_mapping.tsv
12,268 changes: 12,268 additions & 0 deletions db_harmonisation/mapping/deeparg_rgi.txt
7,858 changes: 7,858 additions & 0 deletions db_harmonisation/mapping/megares_ARO_mapping.tsv
7,858 changes: 7,858 additions & 0 deletions db_harmonisation/mapping/megares_rgi.txt
8,041 changes: 8,041 additions & 0 deletions db_harmonisation/mapping/ncbi_ARO_mapping.tsv
8,041 changes: 8,041 additions & 0 deletions db_harmonisation/mapping/ncbi_rgi.txt
6,251 changes: 6,251 additions & 0 deletions db_harmonisation/mapping/resfinder_ARO_mapping.tsv
3,461 changes: 3,461 additions & 0 deletions db_harmonisation/mapping/resfinder_fg_ARO_mapping.tsv
3,461 changes: 3,461 additions & 0 deletions db_harmonisation/mapping/resfinder_fg_rgi.txt
6,251 changes: 6,251 additions & 0 deletions db_harmonisation/mapping/resfinder_rgi.txt
12,745 changes: 12,745 additions & 0 deletions db_harmonisation/mapping/sarg_ARO_mapping.tsv
12,745 changes: 12,745 additions & 0 deletions db_harmonisation/mapping/sarg_rgi.txt
35 changes: 6 additions & 29 deletions db_harmonisation/reconcile.py
@@ -14,19 +14,8 @@ def check_file(path):
return path
else:
raise argparse.ArgumentTypeError(f"{path} can't be read")


def get_aro_for_hits(fasta, rgi_output, database):
"""
Extract the mappings and any missed mappings for other AMR databases
vs NCBI
"""
database_entries = []
for record in SeqIO.parse(str(fasta), 'fasta'):
if "mutation" not in record.id:
database_entries.append(record.id)
database_entries = set(database_entries)


def get_aro_for_hits(rgi_output, database):
rgi_hits = pd.read_csv(rgi_output, sep='\t')

if database == 'resfinder':
@@ -43,23 +32,11 @@ def get_aro_for_hits(fasta, rgi_output, database):
rgi_hits['Original ID'] = rgi_hits['Contig'].apply(lambda x: '_'.join(x.split('_')[:-1]))
elif database == 'megares':
rgi_hits['Original ID'] = rgi_hits['Contig'].apply(lambda x: '_'.join(x.split('_')[:-1]))
# homolog models only for now
rgi_hits = rgi_hits[rgi_hits['Model_type'] == "protein homolog model"]

# tidy up "ORF ID"

mapping = rgi_hits[['Original ID', "Best_Hit_ARO", 'ARO']]
mapping = mapping.astype({'ARO': 'str'})
mapping = mapping.rename(columns={'Best_Hit_ARO': 'Gene Name in CARD'})

# fill in any missing mappings with NAs
missing_hits_from_original = database_entries - set(mapping['Original ID'].unique())
print(f"Warning {len(missing_hits_from_original)} in {database} without "
"trivial mapping to ARO. Listed as 'nan' in output mapping tsv")

# merge with full set of database entries to add appropriate NA values
database_entries = pd.Series(list(database_entries))
database_entries.name = "Original ID"
mapping = pd.merge(database_entries, mapping, how='outer')
mapping['Database'] = database

return mapping
@@ -69,7 +46,7 @@ def get_aro_for_hits(fasta, rgi_output, database):
parser = argparse.ArgumentParser("Recover cure ARO mapping for an AMR "
"database using the fasta and an RGI "
"tsv (homolog models only)")
parser.add_argument("-f", "--fasta", required=True, type=check_file,
parser.add_argument("-f", "--fasta", required=False, type=check_file,
help="Fasta file containing sequences run through "
"RGI")

@@ -82,8 +59,8 @@ def get_aro_for_hits(fasta, rgi_output, database):

args = parser.parse_args()

mapping = get_aro_for_hits(args.fasta, args.rgi, args.database)
mapping = get_aro_for_hits(args.rgi, args.database)

output_file = f"{args.database}_ARO_mapping.tsv"
print(f"Writing mapping to {output_file}")
mapping.to_csv(output_file, sep='\t')
mapping.to_csv(output_file, sep='\t', index=False)
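
With the FASTA-derived bookkeeping removed, get_aro_for_hits builds the mapping purely from the RGI table, `--fasta` becomes optional, and entries RGI never hit no longer appear as NaN rows. If the old coverage statistic is still wanted, a small post-hoc check in the spirit of the removed code (file paths here are assumptions for illustration) can recover it:

import pandas as pd
from Bio import SeqIO

fasta = 'dbs/resfinder.fna'                   # assumed input FASTA
tsv = 'mapping/resfinder_ARO_mapping.tsv'     # assumed location after the mv step in the shell script

# Mirror the removed logic: ignore mutation entries, then compare against the written mapping.
entries = {rec.id for rec in SeqIO.parse(fasta, 'fasta') if 'mutation' not in rec.id}
mapped = set(pd.read_csv(tsv, sep='\t')['Original ID'].dropna().unique())
missing = entries - mapped
print(f"{len(missing)} of {len(entries)} entries without a trivial ARO mapping")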