-
Notifications
You must be signed in to change notification settings - Fork 0
/
getFastaGenomes_python
executable file
·150 lines (115 loc) · 3.92 KB
/
getFastaGenomes_python
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/env python
"""
Description: Downloads a set of fasta genome sequences. Infile is an output file from parseNCBIcsv.py
Usage: getFastaGenomes -i infileName -o outDir -e email
outDir is the output directory to hold all downloaded genomes. It must be in same
directory from which script is run.
email is your email address and is required
Example: getFastaGenomes -i ShortEcoliList.txt -o GenomeFiles -e myName@gmail.com
"""
#********************* MODULES**********************
import sys
import os
import time
# Use the correct module based on the python version.
if sys.version_info[0] == 2:
import httplib as httplib
else: # More modern python
import http.client as httplib
#******************* FUNCTION DEFINITIONS *******************
def get(hostname, uri):
GET="GET"
connection = httplib.HTTPSConnection(hostname)
connection.request(GET, uri)
response = connection.getresponse()
data = response.read()
connection.close()
return (response.status, response.reason, data)
def EFetch(acc, retmode="text", rettype="fasta", db="nuccore", email="ksnp-dev@kissake.net"):
eutilsHost = "eutils.ncbi.nlm.nih.gov"
efetchPrefix = "/entrez/eutils/efetch.fcgi?"
argSeparator = "&"
arguments={
"tool": "kSNP4",
"email": email,
"db": db,
"rettype": rettype,
"retmode": retmode,
"id": acc,
}
URI = efetchPrefix + argSeparator.join([ "=".join(item) for item in arguments.items() ])
time.sleep(0.34) # Sleep for 1/3 of a second here to enforce a rate limit of 3 requests per second.
(status, reason, fetched) = get(eutilsHost,URI)
if status != 200:
sys.stderr.write("HTTP Error %d: %s\n" % (status, reason))
sys.exit(1)
return fetched
def readInfile(infileName):
#variables
genomesList = []
ID = ''
accNums = ''
#open the infile
INFILE = open(infileName, 'r')
#skip the first line
line = INFILE.readline()
for line in INFILE:
line = line.rstrip()
temp = line.split('\t')
ID = temp[0]
accNums = temp[1]
genomesList.append([ID,accNums])
INFILE.close()
return genomesList
def getGenome(ID, accList, outDir, email):
#variables
numRep = 0 #number of replicons in the genome
fileName = ID +'.fna'
OUTFILE = open(fileName, 'wb')
#print(ID)
#print(accList)
#split accList into its components
temp = accList.split(' ')
numRep = len(temp)
startTime = time.time()
for i in range(numRep):
result = EFetch(db="nuccore", acc = temp[i], rettype = "fasta", retmode = "text", email=email)
OUTFILE.write(result)
elapsedTime = time.time() - startTime
sys.stdout.write("Time to download {0} was {1:.2f} seconds.\n".format(fileName, elapsedTime))
OUTFILE.close()
#********************** Main *************************
#variables
infileName = '' #name of the input file
outDir = '' #name of the output directory
email = ''
genomesList = [] #col0= genome name, col1 = accession numbers separated by space
numGenomes = 0
home = '' #the directory from which the program is reun
#get command line arguments
for i in range(1, len(sys.argv)):
if sys.argv[i] == '-i':
infileName = sys.argv[i+1]
if sys.argv[i] == '-o':
outDir = sys.argv[i+1]
if sys.argv[i] == '-e':
email = sys.argv[i+1]
#check that all required arguments have been entered on the command line
if infileName == '':
sys.stdout.write('You must enter the infile name on the command line as -i infileName.\nQuitting the program now.\n')
sys.exit(1)
if outDir == '':
sys.stdout.write('You must enter the output directory name on the command line as -o outputDirectoryName.\nQuitting the program now.\n')
sys.exit(1)
if email == '':
sys.stdout.write('You must enter your email address on the command line as -e myself@someplace.com.\nQuitting the program now.\n')
sys.exit(1)
home = os.getcwd()
#read the infile
genomesList = readInfile(infileName)
numGenomes = len(genomesList)
#move to the outdir
os.chdir(outDir)
#download the fasta records from Entrez
for i in range(numGenomes): #numGenomes
getGenome(genomesList[i][0], genomesList[i][1], outDir, email)