-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstage1b_pull_proteins_NCBI.py
158 lines (119 loc) · 5.15 KB
/
stage1b_pull_proteins_NCBI.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import os
import sys
from Bio import Seq
from Bio import SeqIO
from Bio import SeqRecord
import pandas as pd
import numpy as np
import ms_module as ms
import re
############################
from Bio import Entrez
from Bio import SeqIO
from StringIO import StringIO
import time
from urllib2 import HTTPError # for Python 2
import argparse
# do some arguments parsing to make the script looks civilized ...
parser = argparse.ArgumentParser()
parser.add_argument("-f","--pept_summary_fetch", help="speicfy peptide summary file name (with fetchid column(!), with/without path)",required=True)
# we don't need spectrum file for downloading proteins, it is too redundant for that purpose ...
parser.add_argument("--verbose", help="verbose output", action="store_true")
parser.add_argument("--prefix", help="specify common part of the path for peptide and spectrum files")
parser.add_argument("--email", help="Provide your email for NCBI servers abuse-feedback")
args = parser.parse_args()
# print args
###############################################
if args.verbose:
print "Verbose output is to follow ...\n\n"
###############################################
if args.prefix is not None:
pep_info_with_fetch_fname = os.path.join( args.prefix, args.pept_summary_fetch )
else:
pep_info_with_fetch_fname = args.pept_summary_fetch
# get the common path for later use ...
pep_path = os.path.dirname(pep_info_with_fetch_fname)
#
# don'r forget to provide you email
Entrez.email = args.email if args.email else "your_email@mail_server.com"
#
# peptides_with_fetch.csv
# pep_info_with_fetch_fname
# pep_info_with_fetch
pep_info_with_fetch = pd.read_csv(pep_info_with_fetch_fname)
assert 'fetchid' in pep_info_with_fetch.columns
############################################
# columns that needs to be delivered ... #
############################################
# A gsites, 1 per line
# B pept, 1 per line
# B1 enzyme, G or T, derive from 'Biological sample category', like this: {'TrypsinSample1':'T','GluC_Sample2':'G'}
# C peptide_start, 1 per line accordingly
# D all_uids, REPLACE WITH col:H
# E prot_seq, try to get those from NCBI, not from UniProt ...
# F protein, ??? sequence, name or what???
# G uid_max, UID for major form instead or something like that ...
# H prot_name, parsed out human-readable name from 'Protein name'
# H1 gene_name, parsed out GN=xxx from 'Protein name'
# I uniq_peptide_count, discrad that column ...
# J pept_probability, output number not the string - this would be the criteria
# K gsites_predicted, OK
# L gsites_predicted_number, OK
# M gsite_start, beware of 0 or 1 type of indexing ...
# N,O,P - gsites AAs in separate columns
# M1, NOP combined, gsite sequence basically!
# Q signal, from GeneBank record on the protein, simply Y,N on whether there is a 'Signal' in gb.
# R signal_location, location of the signal from Q
# S tm_span, Y,N just for the fact of having TM span as a protein feature.
#
#
print
print "Posting and fetching genebank records corresponding to the available FetchIDs from the Protein DB ..."
pulled_gb_recs_fname = os.path.join( pep_path, "pulled_proteins.gb" )
batch_size = 60
attempts_limit = 3
# THEN WE'D NEED TO DO POST AND ONLY AFTER EFETCH ...
# there might be some EMPTY fetchids ...
non_empty_fetchids = pep_info_with_fetch['fetchid'][pep_info_with_fetch['fetchid'].notnull()].apply(int)
with_empty_fetchids = pep_info_with_fetch[pep_info_with_fetch['fetchid'].isnull()]
#
print
print "BEWARE! There are %d empty fetchids ..."%with_empty_fetchids.shape[0]
print with_empty_fetchids[['Protein name','Peptide sequence']]
print
#
# Epost will support GIs only for some time ... https://ncbiinsights.ncbi.nlm.nih.gov/2016/07/15/ncbi-is-phasing-out-sequence-gis-heres-what-you-need-to-know/
search_results = Entrez.read( Entrez.epost("protein", id=",".join( non_empty_fetchids.apply(str).unique() )) )
webenv = search_results["WebEnv"]
query_key = search_results["QueryKey"]
# download results in batches using history and coockies ....
count, = non_empty_fetchids.unique().shape
out_handle = open(pulled_gb_recs_fname, "w")
for start in range(0, count, batch_size):
end = min(count, start+batch_size)
print("Going to download record %i to %i" % (start+1, end))
attempt = 0
while attempt < attempts_limit:
attempt += 1
try:
fetch_handle = Entrez.efetch(db="protein", rettype="gb", retmode="text",
retstart=start, retmax=batch_size,
webenv=webenv, query_key=query_key)
break # skip subsequent attempts is succeeded ...
except HTTPError as err:
if 500 <= err.code <= 599:
print("Received error from server %s" % err)
print("Attempt %d of %d"%(attempt,attempts_limit))
# attempt += 1
time.sleep(15)
else:
print "oh Shut! %d"%attempt
raise
data = fetch_handle.read()
fetch_handle.close()
out_handle.write(data)
out_handle.close()
#
print "Fetched genebank records are stored in %s."%pulled_gb_recs_fname
print "Check for BioPython gb consistency before processing ..."
print "THE END"