-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpfam_download.py
139 lines (127 loc) · 6.77 KB
/
pfam_download.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import requests
from bs4 import BeautifulSoup
import os
from os import path
import re
import socket
from time import gmtime, strftime
import sys
import subprocess
#function pings google.com to check if the computer is connected to the internet
def is_connected():
    """Return True if an outbound TCP connection to google.com:80 succeeds.

    Uses a context manager so the probe socket is always closed (the
    original leaked the file descriptor), and a timeout so a stalled
    network cannot hang the caller indefinitely.
    """
    try:
        with socket.create_connection(("www.google.com", 80), timeout=5):
            return True
    except OSError:
        # connection refused / unreachable / timed out -> offline
        return False
#function for creating a list of accession numbers from pfam's entry table
def accession_list():
    """Scrape Pfam's alphabetical browse pages and write the accession
    numbers of every family with >= 500 sequences to
    temp_files/accession_list.txt (one accession per line).

    Exits the process on an HTML parsing error; pages that return a
    non-200 status are silently skipped.
    """
    #pfam entry table is organized alphabetically, plus three extra pages
    browse_list = list('abcdefghijklmnopqrstuvwxyz') + ['numbers', 'new', 'top%20twenty']
    line_list = []
    accession_with_size = {}
    for key in browse_list:
        print('Fetching entries that start with: ' + key)
        browse_url = 'https://pfam.xfam.org/family/browse?browse=' + key
        response = requests.get(browse_url)
        # status_code is an int; the original also compared against '200'
        if response.status_code != 200:
            continue
        try:
            #BeautifulSoup parses the html part of the text from the response
            soup = BeautifulSoup(response.text, 'html.parser')
            #collect every non-empty cell line from the entry table rows
            for row in soup.find_all('tr'):
                for cell in row.text.split('\n'):
                    if cell != '':
                        line_list.append(cell)
        except (AttributeError, ValueError) as err:
            # narrow except instead of the original bare except:
            print('Response Error')
            sys.exit()
    for i, entry in enumerate(line_list):
        #if row has an accession number then proceed
        if re.search('^PF', entry):
            try:
                #size of the family is the third element after the accession
                size = int(line_list[i + 3])
            except (IndexError, ValueError):
                # trailing row or non-numeric cell: skip this candidate
                continue
            #keep families with >= 500 sequences, first occurrence only
            if size >= 500 and entry not in accession_with_size:
                accession_with_size[entry] = size
    #make sure the output directory exists before opening the file
    os.makedirs('temp_files', exist_ok=True)
    #write all accession numbers in a file
    with open('temp_files/accession_list.txt', 'w') as f:
        for acc in accession_with_size:
            f.write(str(acc) + "\n")
    # the original printed 'pickle dumped......' but no pickle is involved
    print('accession list written......')
#downloading all entries in the accession list file
def download_entries(accession_with_size):
    """Download the full FASTA alignment for every accession in
    *accession_with_size* (iterable of Pfam accession strings) into
    pfam_entries/<accession>.fasta, skipping files that already exist.

    Returns early (aborting the remaining downloads) when internet
    connectivity is lost or the server answers with an error page.
    """
    #make sure the destination directory exists before writing into it
    os.makedirs('pfam_entries', exist_ok=True)
    for key in accession_with_size:
        if not is_connected():
            # connection dropped: stop the whole loop
            return
        filename = 'pfam_entries/' + key + '.fasta'
        if path.exists(filename):
            # already downloaded on a previous run
            continue
        #leftover from when downloads went to the 'families' folder
        old_filename = 'families/' + key + '.fasta'
        # BUG FIX: original wrote `if path.exists:` (always truthy) and
        # shelled out to `rm -rf` with a concatenated path
        if path.exists(old_filename):
            #delete old file before downloading the new one
            os.remove(old_filename)
        download_url = ('https://pfam.xfam.org/family/' + key +
                        '/alignment/full/format?format=fasta&alnType=full'
                        '&order=t&case=u&gaps=dashes&download=0')
        try:
            #print current date and time for progress tracking
            print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))
            print('Downloading alignment: ' + key)
            response = requests.get(download_url)
            # status_code is an int; no need for the '200' string compare
            if response.status_code != 200:
                continue
            text = BeautifulSoup(response.text, 'html.parser').get_text()
            # BUG FIX: the original iterated the *characters* of `text`,
            # so this server-error guard could never trigger
            if text.startswith('500 Internal Server Error'):
                return
            with open(filename, 'w') as f:
                f.write(text)
        except requests.RequestException:
            # network hiccup on this entry: move on to the next one
            continue
def main():
    """Build the Pfam accession list file, then read it back into memory.

    NOTE(review): the loaded accession list is not consumed here — the
    download loop that used it was commented out in the original and has
    been removed as dead code; re-enable by calling download_entries()
    with the list below.
    """
    accession_list()
    #load the accession numbers written by accession_list()
    with open('temp_files/accession_list.txt', 'r') as f:
        # each line holds one accession number; strip the trailing newline
        accessions = [line.strip('\n') for line in f]
if __name__ == '__main__':
    main()