-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_microbes.py
53 lines (40 loc) · 1.69 KB
/
get_microbes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import requests
from bs4 import BeautifulSoup
import pandas as pd
# URL prefix that every taxonomy-browser link in this script is appended to.
base = 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/'

# Rank names in the order the crawl descends through them; they double as
# the column headers of the result table.
tax = [
    'phylum',
    'class',
    'order',
    'family',
    'genus',
    'species',
]

# Empty result table with one column per rank.
df = pd.DataFrame(columns=tax)
def get_category(base, link, cat_name, timeout=30):
    """Fetch one page and return its ``<a>`` tags titled ``cat_name``.

    Args:
        base: URL prefix that ``link`` is appended to.
        link: Page path (including any query string) concatenated onto
            ``base`` to form the request URL.
        cat_name: Value an ``<a>`` tag's ``title`` attribute must have for
            the tag to be returned.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block forever on a stalled connection.

    Returns:
        The result of ``find_all``: every ``<a>`` tag on the page whose
        ``title`` equals ``cat_name`` (empty when nothing matches).

    Raises:
        requests.RequestException: On a connection failure, a timeout, or an
            HTTP error status.
    """
    page = requests.get(base + link, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page, which would
    # look like "no matches" and silently drop everything below this link.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')
    return soup.find_all('a', {'title': cat_name})
# Entry page of the crawl.
# NOTE(review): id=2 is presumably the Bacteria root of the taxonomy -- confirm.
link = 'wwwtax.cgi?mode=Tree&id=2&lvl=1&lin=f&keep=1&srchmode=1&unlock'


def _walk_lineages(link, depth=0, lineage=()):
    """Yield one row of rank names per lowest-rank entry found below ``link``.

    Descends depth-first through the ranks listed in ``tax``: the page at
    ``link`` is searched for anchors of rank ``tax[depth]``, and each hit's
    ``href`` is followed for the next rank. Pages are requested in the same
    order as a hand-written loop nested once per rank would request them.

    Args:
        link: Page path, relative to ``base``, to search.
        depth: Index into ``tax`` of the rank to look for on that page.
        lineage: Names of the ranks already matched above this page.

    Yields:
        A list with one name per entry of ``tax``, in ``tax`` order.
    """
    for node in get_category(base, link, tax[depth]):
        names = lineage + (node.get_text(),)
        if depth == len(tax) - 1:
            # Last rank reached: the lineage is complete.
            yield list(names)
        else:
            yield from _walk_lineages(node['href'], depth + 1, names)


# Collect rows in a plain list and build the table once at the end; growing a
# DataFrame one row at a time re-allocates on every insert.
rows = []
for row in _walk_lineages(link):
    print(row)
    rows.append(row)

df = pd.DataFrame(rows, columns=tax)
df.to_csv('microbes.csv')
print(df.head())