# runQUERYunpkb.py

import Bio
from Bio import SeqIO
import requests
import pandas as pd
from Bio.Blast.Applications import NcbideltablastCommandline
from Bio.Blast import NCBIXML
#to create temporary file with sequence
import os
import sys
import tempfile
import subprocess
from subprocess import Popen, PIPE
#requests.__version__

# Accumulators filled while iterating over the input sheet; each one later
# becomes a column of the output spreadsheet. Every name is bound to its own
# independent empty list.
querylst, acc, sequence = [], [], []
rbhits = []   # top phmmer hit per query
cluster = []  # value taken from the UniRef response per query
length, subc_location, cell_compn = [], [], []
fragment, db, families = [], [], []
existence, reviewed = [], []

# Command-line arguments: argv[1] is the working/output directory, argv[2] is
# read into `input` (not used by the statements in this section).
output, input = sys.argv[1], sys.argv[2]

# The spreadsheet is read from and later written back to the same path.
in_file = out_file = output + "/excel/hmmsearch.xlsx"
# Scratch file that receives the raw phmmer report of the current query.
file = output + "/phmmer/hello.txt"

df = pd.read_excel(in_file)
print('Querying UniProtKB. Please wait...')

# --- per-row enrichment: phmmer top hit + UniProtKB / UniRef annotations -----
#
# For every row of `df` this section
#   1. runs phmmer with the row's TARGET_SEQUENCE against the local protein
#      database and records the first hit line (or 'NO HITS') in `rbhits`;
#   2. queries UniProtKB for the row's TARGET and appends one value to each of
#      the accumulator lists (acc, sequence, length, fragment, ...);
#   3. queries UniRef with the returned accession and appends to `cluster`.
# Exactly one value is appended to every accumulator list per row, so the
# lists stay aligned with `querylst`.
#
# A commented-out DELTA-BLAST alternative to the phmmer step used to live
# here; it was dead code and has been removed.

_PHMMER_DB = './bin/GCF_000146045.2_R64_protein.faa'
_UNIPROT_URL = 'https://www.uniprot.org/uniprot/'
_UNIREF_URL = 'https://www.uniprot.org/uniref/'
# NOTE(review): these are the URL scheme and column names the script has
# always used; confirm the service still answers them in this form.
_UNIPROT_COLUMNS = ('id,sequence,length,fragment,'
                    'comment(SUBCELLULAR LOCATION),go(cellular component),'
                    'database(Pfam),families,existence,reviewed')
_REQUEST_TIMEOUT = 60  # seconds; prevents a stalled connection from hanging the run


def _first_phmmer_hit(report):
    """Return the first '>>' line of a phmmer report, cleaned up.

    The leading '>' markers and surrounding whitespace (including the line
    break) are removed and double spaces are collapsed, so the value is a
    clean single-line spreadsheet cell. Returns None when the report
    contains no '>>' line.
    """
    for line in report.splitlines():
        if line.startswith('>>'):
            return line.lstrip('>').strip().replace('  ', ' ')
    return None


def _fetch_tab_cells(url, params):
    """GET a tab-separated response and return it flattened into one list.

    Line breaks are treated like tabs, so for a header of N columns the
    first data row occupies indices N .. 2N-1. Returns None when the request
    fails, the server answers with an error status, or the body is empty.
    """
    try:
        response = requests.get(url, params=params, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException:
        return None
    if not response.ok or not response.content:
        return None
    return response.text.replace('\n', '\t').split('\t')


def _cell(cells, position, default='NA'):
    """Return cells[position], or `default` when it is missing or empty."""
    if position < len(cells) and cells[position]:
        return cells[position]
    return default


for index, row in df.iterrows():
    query = row['TARGET']

    # Feed the FASTA record straight to phmmer's stdin. (Routing it through
    # `printf` used the sequence as a format string, which mangles any '%'
    # or backslash in the data.)
    fasta = '>unknown\n' + row['TARGET_SEQUENCE'] + '\n'
    phmmer = subprocess.Popen(
        ['phmmer', '--noali', '-E', '0.005', '-', _PHMMER_DB],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
    )
    out = phmmer.communicate(fasta.encode())[0].decode('ascii', errors='replace')
    if phmmer.returncode != 0:
        print('phmmer exited with status ' + str(phmmer.returncode)
              + ' for ' + str(query), file=sys.stderr)

    # The raw report is still written to the scratch file, as before, but it
    # is parsed from memory instead of being read back from disk.
    with open(file, 'w') as report_handle:
        report_handle.write(out)

    top_hit = _first_phmmer_hit(out)
    if top_hit is None:
        print('no hits for ' + query)
        rbhits.append('NO HITS')
    else:
        rbhits.append(top_hit)

    querylst.append(query)

    table = _fetch_tab_cells(_UNIPROT_URL, {
        'query': query,
        'format': 'tab',
        'columns': _UNIPROT_COLUMNS,
    })
    if table is None:
        # Request failed or returned nothing: flag every annotation column.
        for column in (acc, sequence, length, cluster, fragment,
                       subc_location, cell_compn, db, families,
                       existence, reviewed):
            column.append('ERROR')
        continue

    # Ten columns are requested, so the header fills indices 0-9 and the
    # first result row fills 10-19. Missing or empty cells become 'NA'
    # instead of raising IndexError on a short response.
    accession = _cell(table, 10)
    acc.append(accession)
    sequence.append(_cell(table, 11))
    length.append(_cell(table, 12))
    # An empty fragment cell on a real entry is reported as 'complete'.
    fragment.append(_cell(table, 13, 'complete' if accession != 'NA' else 'NA'))
    subc_location.append(_cell(table, 14))
    cell_compn.append(_cell(table, 15))
    db.append(_cell(table, 16))
    families.append(_cell(table, 17))
    existence.append(_cell(table, 18))
    reviewed.append(_cell(table, 19))

    # UniRef lookup for the accession; index 10 of the flattened response is
    # the cell this script has always reported as the cluster.
    cluster_value = 'NA'
    if accession != 'NA':
        uniref_cells = _fetch_tab_cells(_UNIREF_URL, {
            'query': accession,
            'format': 'tab',
            'sort': 'score',
        })
        if uniref_cells is not None:
            cluster_value = _cell(uniref_cells, 10)
    cluster.append(cluster_value)


# Attach every collected annotation list to the frame as a new column.
# Each list is first turned into a TARGET -> value lookup (paired by position
# with `querylst`) and then mapped over the TARGET column, in the order the
# columns should appear in the spreadsheet.
collected_columns = (
    ('UNIPROT ACC.', acc),
    ('SEQUENCE', sequence),
    ('LENGTH', length),
    ('FRAGMENT', fragment),
    ('SUBC.LOCATION', subc_location),
    ('CELL COMPARTM.', cell_compn),
    ('CLUSTER', cluster),
    ('FAMILY', families),
    ('PFAM DOMAIN', db),
    ('PHMMER TOP HIT', rbhits),
    ('EXISTENCE', existence),
    ('REVIEWED', reviewed),
)
for header, values in collected_columns:
    lookup = dict(zip(querylst, values))
    df[header] = df['TARGET'].map(lookup)

# Write the enriched sheet to out_file.
df.to_excel(out_file)


#prepares to query uniprot programmatically
#BASE = 'https://www.uniprot.org'
#KB_ENDPOINT = '/uniprot/'
#TOOL_ENDPOINT = '/uploadlists/'

#establish outfile directory and filename
#outfile = .

#reads FASTA entries one by one
#extracts ID and seq of each entry
#appends ID,seq to lists
#queries uniprot based on ID
#appends columns to respective lists
#with open(filename, "rU") as handle:
#    for record in SeqIO.parse(handle, "fasta"):
#        mnemonic = str(record.id)
        #print(mnemonic)
        #ID = ID_menmonic.split("_")
        #accession = ID[0]
#        IDs.append(mnemonic)
#        sequences.append(str(record.seq))
        #might have problems with ID: it is more than just the accession
        #BASE = 'https://www.uniprot.org'
        #KB_ENDPOINT = '/uniprot/?'
        #payload = {'query':"'ID:'accession",'format':'tab','columns':'organism'}
        #PAYLOAD = 'query=ID:'
        #FORMAT = '&format=tab'
        #COLUMNS = '&columns=organism,lineage(ALL)'
        #https://www.uniprot.org/uniprot/?query=mnemonic:MRP20_SCHPO&format=tab&columns=organism,lineage(ALL)


# df = pd.DataFrame(IDs, columns=['ID'])
# df['PROT_NAME'] = pd.Series(prot_names)
# df['SEQUENCE'] = pd.Series(sequences)
# df['FRAGMENT'] = pd.Series(fragment)
# df['LENGTH'] = pd.Series(prot_length)
# df['MASS'] = pd.Series(prot_mass)
# df['LOCATION'] = pd.Series(subc_location)
# df['COMPONENT'] = pd.Series(cell_compn)
# df['SIGNALP'] = pd.Series(signalp)
# df['SUPERKINGDOM'] = pd.Series(superkingdom)
# df['KINGDOM'] = pd.Series(kingdom)
# df['SUBKINGDOM'] = pd.Series(subkingdom)
# df['PHYLUM'] = pd.Series(phylum)
# df['SUBPHYLUM'] = pd.Series(subphylum)
# df['SUPERCLASS'] = pd.Series(superclass)
# df['CLASS'] = pd.Series(clas)
# df['SUBCLASS'] = pd.Series(subclass)
# df['INFRACLASS'] = pd.Series(infraclass)
# df['SUPERORDER'] = pd.Series(superorder)
# df['ORDER'] = pd.Series(order)
# df['SUBORDER'] = pd.Series(suborder)
# df['INFRAORDER'] = pd.Series(infraorder)
# df['PARVORDER'] = pd.Series(parvorder)
# df['SUPERFAMILY'] = pd.Series(superfamily)
# df['FAMILY'] = pd.Series(family)
# df['SUBFAMILY'] = pd.Series(subfamily)
# df['TRIBE'] = pd.Series(tribe)
# df['GENUS'] = pd.Series(genus)
# df['SUBGENUS'] = pd.Series(subgenus)
# df['SPECIES GR'] = pd.Series(speciesgroup)
# df['SPECIES SUBGR'] = pd.Series(speciessubgroup)
# df['SPECIES'] = pd.Series(species)
# df['SUBSPECIES'] = pd.Series(subspecies)
# df['VARIETAS'] = pd.Series(varietas)
# df['FORMA'] = pd.Series(forma)
# df['STATUS'] = pd.Series(existence)
# df['REVIEWED'] = pd.Series(reviewed)
#
# outfile = './L23_h_phmmer_uniprotkb.xlsx'
# df.to_excel(outfile)
#
# #with open(outfile, 'w') as f:
# #    for index, row in df.iterrows():
# #        accession = row['ID']
# #        f.write(">%s\n%s\n" % (accession,row['SEQUENCE']))
#
# outfile_fasta = './fasta_files/L23_h_phmmer_uniprotkb_fromxlsx.fasta'
# df = pd.read_excel(outfile)
# df.fillna('?',inplace=True)
#
# number = 0
# with open(outfile_fasta, 'w') as f:
#     for index, row in df.iterrows():
#         species = row['SPECIES']
#         kingdom = (row['KINGDOM'])[0]
#         phylum = (row['PHYLUM'])[0]
#         subphylum = (row['SUBPHYLUM'])[0]
#         clas = (row['CLASS'])[0]
#         order = (row['ORDER'])[0]
#         family = (row['FAMILY'])[0]
#         genus = species[0]
#         specific_n = species.split()[1]
#         number+=1
#         unique_identifier = '[' + str(number) + ']'
#         print(number)
#         lineage = kingdom + phylum + subphylum + clas + order + family
#         f.write(">%s\n%s\n" % (lineage + '|' +  genus + '.' + specific_n + unique_identifier,row['SEQUENCE']))