Skip to content

Commit

Permalink
remove accession version
Browse files Browse the repository at this point in the history
  • Loading branch information
shenwei356 committed Feb 12, 2021
1 parent ecb8d21 commit ac10e90
Showing 1 changed file with 21 additions and 31 deletions.
52 changes: 21 additions & 31 deletions README.md
Original file line number Diff line number Diff line change
@@ -4,7 +4,7 @@ Mapping NCBI genbank accession to GTDB accession

## Strategy

# genbank & refseq assembly
# genbank assembly
NCBI assembly_accession (1) <- (1) ftp-url/fna.gz (1) <- (n) NCBI accession

# gtdb
@@ -21,10 +21,7 @@ Mapping NCBI genbank accession to GTDB accession

# download assembly summary files
wget -c https://ftp.ncbi.nlm.nih.gov/genomes/genbank/archaea/assembly_summary.txt -O as-genbank-archaea.txt
wget -c https://ftp.ncbi.nlm.nih.gov/genomes/genbank/bacteria/assembly_summary.txt -O as-genbank-bacteria.txt
wget -c https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/assembly_summary.txt -O as-refseq-archaea.txt
wget -c https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt -O as-refseq-bacteria.txt

wget -c https://ftp.ncbi.nlm.nih.gov/genomes/genbank/bacteria/assembly_summary.txt -O as-genbank-bacteria.txt

# download gtdb metadata
wget -c https://data.ace.uq.edu.au/public/gtdb/data/releases/release95/95.0/ar122_metadata_r95.tar.gz
@@ -45,31 +42,32 @@ Mapping NCBI genbank accession to GTDB accession
tar -zxvf bac120_metadata_r95.tar.gz
cat ar122_metadata_r95.tsv <(sed 1d bac120_metadata_r95.tsv) \
| csvtk cut -t -f ncbi_genbank_assembly_accession,accession \
| csvtk replace -t -f ncbi_genbank_assembly_accession -p "\.\d+$" \
| sed 1d \
> ncbi_ass2gtdb_acc.tsv
# extract NCBI assembly_accession and ftp path
cat as-genbank-archaea.tsv <(sed 1d as-genbank-bacteria.tsv) \
<(sed 1d as-refseq-archaea.tsv) <(sed 1d as-refseq-bacteria.tsv) \
| csvtk cut -t -f assembly_accession,ftp_path \
| csvtk replace -t -f assembly_accession -p "\.\d+$" \
| csvtk uniq -t -f assembly_accession \
> assacc2ftp.tsv
> ncbi_ass2ftp.tsv

# filter these in gtdb
cat assacc2ftp.tsv \
cat ncbi_ass2ftp.tsv \
| sed 1d \
| csvtk grep -H -t -f 1 -P <(cut -f 1 ncbi_ass2gtdb_acc.tsv) \
> assacc2ftp.filter.tsv
> ncbi_ass2ftp.filter.tsv

# counting
wc -l ncbi_ass2gtdb_acc.tsv assacc2ftp.filter.tsv
194600 ncbi_ass2gtdb_acc.tsv
192796 assacc2ftp.filter.tsv
wc -l ncbi_ass2gtdb_acc.tsv ncbi_ass2ftp.filter.tsv
# 194600 ncbi_ass2gtdb_acc.tsv
# 194436 ncbi_ass2ftp.filter.tsv

# find out these not in genbank.
# One of the reassons: version changed, e.g., GCA_901457705.1 has bumped to GCA_901457705.2
# TODO: fix this
# possible reasson: deleted fro genbank, e.g., GCA_007036345
cat ncbi_ass2gtdb_acc.tsv \
| csvtk grep -H -t -f 1 -v -P <(cut -f 1 assacc2ftp.tsv) \
| csvtk grep -H -t -f 1 -v -P <(cut -f 1 ncbi_ass2ftp.tsv) \
| cut -f 1 \
> ncbi_ass2gtdb_acc.tsv.not-in-sa.tsv

@@ -81,11 +79,10 @@ Mapping NCBI genbank accession to GTDB accession
# may be due to my bad internet connection.
#
# time: abount one day with a VPS at San Francisco, CA

time cat assacc2ftp.filter.tsv \
| csvtk replace -t -f ftp_path -p "^ftp" -r "https" \
| sed 1d \
| rush -k -j 10 'wget {2}/{2%}_genomic.fna.gz -o /dev/null -O - \
threads=20
time cat ncbi_ass2ftp.filter.tsv \
| csvtk replace -H -t -f 2 -p "^ftp" -r "https" \
| rush -k -j $threads 'wget {2}/{2%}_genomic.fna.gz -o /dev/null -O - \
| gzip -cd | grep "^>" | cut -f 1 | sed "s/>//" | awk "{print \$1, \"{1}\"}"' \
| sed 's/ /\t/' \
| gzip -c \
@@ -98,22 +95,15 @@ Mapping NCBI genbank accession to GTDB accession
| sed 's/--/\t/' \
| gzip -c \
> ncbi_acc2gtdb_acc.tsv.gz
Note that not all NCBI assembly_accessions can be mapped to GTDB.

# filtering:
zcat ncbi_acc2gtdb_acc.tsv.gz \
| grep -v __ \
| gzip -c \
> ncbi_acc2gtdb_acc.filter.tsv.gz

## Result

# check
zcat ncbi_acc2gtdb_acc.filter.tsv.gz | cut -f 3 | uniq | wc -l
192766
zcat ncbi_acc2gtdb_acc.tsv.gz | cut -f 3 | uniq | wc -l
194436

zcat ncbi_acc2gtdb_acc.filter.tsv.gz | head -n 5
zcat ncbi_acc2gtdb_acc.tsv.gz | head -n 5
LMVM01000001.1 GCA_002287175.1 RS_GCF_002287175.1
LMVM01000002.1 GCA_002287175.1 RS_GCF_002287175.1
LMVM01000003.1 GCA_002287175.1 RS_GCF_002287175.1

0 comments on commit ac10e90

Please sign in to comment.