
Commit

updated reconcile.py and mappings
Vedanth-Ramji committed Feb 28, 2024
1 parent eeaaf31 commit 765df84
Showing 20 changed files with 121,458 additions and 9,735 deletions.
7 changes: 1 addition & 6 deletions db_harmonisation/README.md
@@ -17,9 +17,4 @@ The approach used here is:

conda env create -f env.yml
conda activate crude_harmonisation
bash crude_db_harmonisation.sh

Output mapping can be found at `resfinder_ncbi_ARO_mapping.tsv`

- 93.1% of ResFinder entries with trivial mapping to ARO using homology models (218 entries missing).
- 80.7% of NCBI entries with trivial mapping to ARO using homology models (1260 entries missing).
bash crude_db_harmonisation.sh
7 changes: 7 additions & 0 deletions db_harmonisation/clean_ncbi.py
@@ -0,0 +1,7 @@
from Bio import SeqIO
from Bio.Seq import Seq

with open('./dbs/ncbi_amr.faa') as original, open('./dbs/ncbi_amr_corrected.faa', 'w') as corrected:
for record in SeqIO.parse('./dbs/ncbi_amr.faa', 'fasta'):
record.seq = Seq(str(record.seq).replace("*", ""))
SeqIO.write(record, corrected, 'fasta')
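
The new clean_ncbi.py strips `*` (stop-codon) characters from the NCBI AMR protein FASTA so the file can be passed to `rgi main -t protein` in the shell script below. A minimal sanity check along these lines (illustrative only, not part of the commit; it assumes the same `./dbs/` paths and a Biopython install) confirms the cleaning removed the `*` characters without dropping any records:

from Bio import SeqIO

# Parse both files; paths mirror those used in clean_ncbi.py.
original = list(SeqIO.parse('./dbs/ncbi_amr.faa', 'fasta'))
corrected = list(SeqIO.parse('./dbs/ncbi_amr_corrected.faa', 'fasta'))

# Same number of records, and no '*' left in any corrected sequence.
assert len(original) == len(corrected), "record count changed during cleaning"
assert all('*' not in str(rec.seq) for rec in corrected), "'*' still present"
print(f"{len(corrected)} NCBI protein records cleaned")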
13 changes: 10 additions & 3 deletions db_harmonisation/crude_db_harmonisation.sh
@@ -34,8 +34,8 @@ wget -c -O dbs/card.tar.bz2 https://card.mcmaster.ca/latest/data
tar -xvf dbs/card.tar.bz2 -C dbs
ls -hl dbs

# Get megares db
wget -c -O dbs/megares.fna https://megares.meglab.org/download/megares_v2.00/megares_full_database_v2.00.fasta
# Get megares db: LINK NOT WORKING
# wget -c -O dbs/megares.fna https://megares.meglab.org/download/megares_v2.00/megares_full_database_v2.00.fasta

# Get argannot db
wget -c -O dbs/argannot.fna https://raw.githubusercontent.com/tseemann/abricate/master/db/argannot/sequences
@@ -47,7 +47,8 @@ rgi load -i dbs/card.json
mkdir -p mapping

rgi main -i dbs/resfinder.fna -o mapping/resfinder_rgi -t contig -a BLAST --clean --include_loose
rgi main -i dbs/ncbi_amr.faa -o mapping/ncbi_rgi -t protein -a BLAST --clean --include_loose
python clean_ncbi.py
rgi main -i dbs/ncbi_amr_corrected.faa -o mapping/ncbi_rgi -t protein -a BLAST --clean --include_loose
rgi main -i dbs/sarg.faa -o mapping/sarg_rgi -t protein -a BLAST --clean --include_loose
rgi main -i dbs/resfinder_fg.faa -o mapping/resfinder_fg_rgi -t protein -a BLAST --clean --include_loose
rgi main -i dbs/deeparg.faa -o mapping/deeparg_rgi -t protein -a BLAST --clean --include_loose
@@ -62,3 +63,9 @@ python reconcile.py -f dbs/resfinder_fg.faa -r mapping/resfinder_fg_rgi.txt -d r
python reconcile.py -f dbs/deeparg.faa -r mapping/deeparg_rgi.txt -d deeparg
python reconcile.py -f dbs/argannot.fna -r mapping/argannot_rgi.txt -d argannot
python reconcile.py -f dbs/megares.fna -r mapping/megares_rgi.txt -d megares

# Clean up
mv {megares,ncbi,resfinder,argannot,sarg,deeparg,resfinder_fg}_ARO_mapping.tsv mapping
rm -rf ResFinderFG
rm -rf deeparg-largerepo
rm ./mapping/*.json
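
Each reconcile.py call now writes a `<database>_ARO_mapping.tsv`, and the new clean-up step moves them into `mapping/`. As a rough illustration (not part of the repository), the per-database tables can be combined into one frame; the column names ('Original ID', 'Gene Name in CARD', 'ARO', 'Database') follow the reconcile.py output shown further down:

import glob
import pandas as pd

# Read every per-database mapping TSV written by reconcile.py and moved to mapping/.
frames = [pd.read_csv(path, sep='\t') for path in sorted(glob.glob('mapping/*_ARO_mapping.tsv'))]
combined = pd.concat(frames, ignore_index=True)

# One row per database entry that RGI mapped to an ARO accession.
print(combined['Database'].value_counts())
combined.to_csv('combined_ARO_mapping.tsv', sep='\t', index=False)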
The following large files were added in this commit (diffs not rendered by default):

15,736 changes: 15,736 additions & 0 deletions db_harmonisation/dbs/megares.fasta
2,225 changes: 2,225 additions & 0 deletions db_harmonisation/mapping/argannot_ARO_mapping.tsv
2,225 changes: 2,225 additions & 0 deletions db_harmonisation/mapping/argannot_rgi.txt
12,268 changes: 12,268 additions & 0 deletions db_harmonisation/mapping/deeparg_ARO_mapping.tsv
12,268 changes: 12,268 additions & 0 deletions db_harmonisation/mapping/deeparg_rgi.txt
7,858 changes: 7,858 additions & 0 deletions db_harmonisation/mapping/megares_ARO_mapping.tsv
7,858 changes: 7,858 additions & 0 deletions db_harmonisation/mapping/megares_rgi.txt
8,041 changes: 8,041 additions & 0 deletions db_harmonisation/mapping/ncbi_ARO_mapping.tsv
8,041 changes: 8,041 additions & 0 deletions db_harmonisation/mapping/ncbi_rgi.txt
6,251 changes: 6,251 additions & 0 deletions db_harmonisation/mapping/resfinder_ARO_mapping.tsv
3,461 changes: 3,461 additions & 0 deletions db_harmonisation/mapping/resfinder_fg_ARO_mapping.tsv
3,461 changes: 3,461 additions & 0 deletions db_harmonisation/mapping/resfinder_fg_rgi.txt
6,251 changes: 6,251 additions & 0 deletions db_harmonisation/mapping/resfinder_rgi.txt
12,745 changes: 12,745 additions & 0 deletions db_harmonisation/mapping/sarg_ARO_mapping.tsv
12,745 changes: 12,745 additions & 0 deletions db_harmonisation/mapping/sarg_rgi.txt
35 changes: 6 additions & 29 deletions db_harmonisation/reconcile.py
@@ -14,19 +14,8 @@ def check_file(path):
return path
else:
raise argparse.ArgumentTypeError(f"{path} can't be read")


def get_aro_for_hits(fasta, rgi_output, database):
"""
Extract the mappings and any missed mappings for other AMR databases
vs NCBI
"""
database_entries = []
for record in SeqIO.parse(str(fasta), 'fasta'):
if "mutation" not in record.id:
database_entries.append(record.id)
database_entries = set(database_entries)


def get_aro_for_hits(rgi_output, database):
rgi_hits = pd.read_csv(rgi_output, sep='\t')

if database == 'resfinder':
@@ -43,23 +32,11 @@ def get_aro_for_hits(fasta, rgi_output, database):
rgi_hits['Original ID'] = rgi_hits['Contig'].apply(lambda x: '_'.join(x.split('_')[:-1]))
elif database == 'megares':
rgi_hits['Original ID'] = rgi_hits['Contig'].apply(lambda x: '_'.join(x.split('_')[:-1]))
# homolog models only for now
rgi_hits = rgi_hits[rgi_hits['Model_type'] == "protein homolog model"]

# tidy up "ORF ID"

mapping = rgi_hits[['Original ID', "Best_Hit_ARO", 'ARO']]
mapping = mapping.astype({'ARO': 'str'})
mapping = mapping.rename(columns={'Best_Hit_ARO': 'Gene Name in CARD'})

# fill in any missing mappings with NAs
missing_hits_from_original = database_entries - set(mapping['Original ID'].unique())
print(f"Warning {len(missing_hits_from_original)} in {database} without "
"trivial mapping to ARO. Listed as 'nan' in output mapping tsv")

# merge with full set of database entries to add appropriate NA values
database_entries = pd.Series(list(database_entries))
database_entries.name = "Original ID"
mapping = pd.merge(database_entries, mapping, how='outer')
mapping['Database'] = database

return mapping
@@ -69,7 +46,7 @@ def get_aro_for_hits(fasta, rgi_output, database):
parser = argparse.ArgumentParser("Recover cure ARO mapping for an AMR "
"database using the fasta and an RGI "
"tsv (homolog models only)")
parser.add_argument("-f", "--fasta", required=True, type=check_file,
parser.add_argument("-f", "--fasta", required=False, type=check_file,
help="Fasta file containing sequences run through "
"RGI")

@@ -82,8 +59,8 @@ def get_aro_for_hits(fasta, rgi_output, database):

args = parser.parse_args()

mapping = get_aro_for_hits(args.fasta, args.rgi, args.database)
mapping = get_aro_for_hits(args.rgi, args.database)

output_file = f"{args.database}_ARO_mapping.tsv"
print(f"Writing mapping to {output_file}")
mapping.to_csv(output_file, sep='\t')
mapping.to_csv(output_file, sep='\t', index=False)
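
With the FASTA-derived bookkeeping removed, get_aro_for_hits builds the mapping purely from the RGI table, `--fasta` becomes optional, and entries RGI never hit no longer appear as NaN rows. If the old coverage statistic is still wanted, a small post-hoc check in the spirit of the removed code (file paths here are assumptions for illustration) can recover it:

import pandas as pd
from Bio import SeqIO

fasta = 'dbs/resfinder.fna'                   # assumed input FASTA
tsv = 'mapping/resfinder_ARO_mapping.tsv'     # assumed location after the mv step in the shell script

# Mirror the removed logic: ignore mutation entries, then compare against the written mapping.
entries = {rec.id for rec in SeqIO.parse(fasta, 'fasta') if 'mutation' not in rec.id}
mapped = set(pd.read_csv(tsv, sep='\t')['Original ID'].dropna().unique())
missing = entries - mapped
print(f"{len(missing)} of {len(entries)} entries without a trivial ARO mapping")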