forked from jrjhealey/bioinfo-tools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_from_NCBI.py
executable file
·73 lines (63 loc) · 2.53 KB
/
get_from_NCBI.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import argparse, subprocess, sys, fnmatch
from ftplib import FTP
#initiate parser
parser = argparse.ArgumentParser(description="This program will download bacterial sequencing data from NCBI's FTP server")
#optional arguments
parser.add_argument("-b", "--bacterium", help="name of bacterium (eg. Klebsiella_pneumoniae)", type=str)
parser.add_argument("-d", "--database", default="refseq", choices = ["refseq", "genbank"], type=str)
parser.add_argument("-t", "--type", default="fasta", help="file type", choices = ["genbank", "fasta", "gff", "feature_table"], type=str)
parser.add_argument("-g", "--genomes", help="Will list all available genomes in specified database", choices=["genbank", "refseq"], type=str)
parser.add_argument("-o", "--outputpath", default=".", help="output directory", type=str)
#initialize FTP server
ftp_site = 'ftp.ncbi.nlm.nih.gov'
ftp = FTP(ftp_site)
#print help if no arguments given
if len(sys.argv) == 1:
parser.print_help()
sys.exit()
args = parser.parse_args()
#set default variables
NCBI_database = "refseq"
file_type = "fasta"
#change default variable based on command arguments (eg. -t genbank)
if args.database == "refseq":
NCBI_database = "refseq"
elif args.database == "genbank":
NCBI_database = "genbank"
if args.type == "fasta":
file_type = "--exclude='*cds_from*' --exclude='*rna_from*' --include='*genomic.fna.gz' --exclude='*'"
elif args.type == "genbank":
file_type = "--include='*genomic.gbff.gz' --exclude='*'"
elif args.type == "gff":
file_type = "--include='*genomic.gff.gz' --exclude='*'"
elif args.type == "feature_table":
file_type = "--include='*feature_table.txt.gz' --exclude='*'"
#Have a way to list all available genomes
if args.genomes:
ftp.login()
ftp.cwd('genomes/%s/bacteria' % NCBI_database)
dirs = ftp.nlst()
for genome in dirs:
print(genome)
sys.exit()
#Check that bacterium is in NCBI genbank/refseq
if args.bacterium:
ftp.login()
ftp.cwd('genomes/%s/bacteria' % NCBI_database)
dirs = ftp.nlst()
pattern = args.bacterium
genome_match = fnmatch.filter(dirs, pattern)
if len(genome_match) == 0:
print("Bacterium %s not found in %s -----> Exiting program" % (args.bacterium, NCBI_database))
sys.exit(1)
else:
print("")
print("Please provide bacterium name '-b'")
print("Exiting program")
print("")
sys.exit(1)
#The meat of the program
try:
subprocess.call("rsync -Lrtv %s rsync://ftp.ncbi.nlm.nih.gov/genomes/%s/bacteria/%s/latest_assembly_versions/*/ %s" % (file_type, NCBI_database, args.bacterium, args.outputpath), shell=True)
except:
raise Exception("Failed to download files")