olympic_scrapper.py
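
"""Scrape athlete names and their 3-letter nationality codes from the athlete
lists on olympedia.org, plus the mapping from those codes to full country
names.

Third-party dependencies: requests and beautifulsoup4.
"""
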
import threading

import requests
from bs4 import BeautifulSoup


def scrape_nation_dict():
    """Scrape the mapping from 3-letter country codes to full country names."""
    nation_dict = {}
    nation_dict_url = 'http://www.olympedia.org/countries'
    response = requests.get(
        nation_dict_url, headers={'user-agent': 'patchekh'})
    soup = BeautifulSoup(response.text, 'html.parser')
    cells = soup.find_all('td')
    for td in cells:
        # A country cell wraps a single link whose href ends in the
        # 3-letter country code.
        if td.contents and len(td.contents[0].contents) == 1:
            nation_dict[td.contents[0].get('href')[-3:]] = td.contents[0].contents[0]
    # Write one tab-separated "code<TAB>name" pair per line.
    with open("data/NationDict.txt", 'w') as f:
        for nation in nation_dict:
            f.write(nation + '\t' + nation_dict[nation] + '\n')
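

# NationDict.txt ends up holding one "code<TAB>name" line per country, e.g.
# "USA<TAB>United States" (illustrative values; the real pairs come from the
# countries page).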


# Guards the shared athlete dict when scrape_olympics runs on several threads.
dict_lock = threading.Lock()


def scrape_olympics(url_list, athlete_nationality_dict):
    """Scrape all athlete names and their 3-letter nationality codes."""
    for url in url_list:
        # For every list page, parse the HTML and walk its table cells.
        response = requests.get(
            url, headers={'user-agent': 'patchekh'})
        soup = BeautifulSoup(response.text, 'html.parser')
        cells = soup.find_all('td')
        i = 0
        while i < len(cells) - 1:
            # An athlete cell has exactly three children and a linked name;
            # skip any other cell.
            if len(cells[i]) != 3 or not cells[i].contents[1].get('href'):
                i += 1
                continue
            athlete = cells[i].contents[1].contents[0]
            # Claim the athlete under the lock so two threads never insert
            # (and reset) the same entry concurrently.
            with dict_lock:
                if athlete in athlete_nationality_dict:
                    i += 1
                    continue
                athlete_nationality_dict[athlete] = []
            i += 1
            # The next cell lists the athlete's 3-letter nationality code(s).
            if cells[i].contents:
                for elem in cells[i].contents[0].contents:
                    if len(elem) == 3:
                        athlete_nationality_dict[athlete].append(elem)
            i += 1
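
# Design note: only the membership check and the empty-list insert run under
# dict_lock; once a thread has claimed an athlete, no other thread touches
# that entry, so the append loop stays lock-free.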


def main():
    init_list_url = 'http://www.olympedia.org/lists'
    base_url = 'http://www.olympedia.org'
    response = requests.get(
        init_list_url, headers={'user-agent': 'patchekh'})
    soup = BeautifulSoup(response.text, 'html.parser')
    anchors = soup.find_all('a')
    url_list = []
    # Fetch the links to the athlete lists from http://www.olympedia.org/lists.
    for href in anchors:
        link = href.get('href')
        # Skip anchors without an href; keep only the manual athlete lists.
        if link and '/lists' in link and '/manual' in link:
            url_list.append(base_url + link)
    athlete_nationality_dict = {}
    # Split the athlete lists into 4 chunks and scrape them on separate threads.
    quarter = len(url_list) // 4
    chunks = [url_list[:quarter],
              url_list[quarter:quarter * 2],
              url_list[quarter * 2:quarter * 3],
              url_list[quarter * 3:]]
    threads = [threading.Thread(target=scrape_olympics,
                                args=(chunk, athlete_nationality_dict))
               for chunk in chunks]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    # Output the scraped data: one athlete per line, with the name, a tab,
    # then the athlete's space-separated nationality codes.
    with open("data/NameNationality.txt", 'w') as f:
        for athlete in athlete_nationality_dict:
            f.write(athlete + '\t')
            for nation in athlete_nationality_dict[athlete]:
                f.write(nation + ' ')
            f.write('\n')
    # Crawl the correspondence of country acronyms and country names.
    scrape_nation_dict()


if __name__ == '__main__':
    main()
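
# Usage (assumes a data/ directory already exists next to this script, since
# both output files are opened for writing under data/):
#     python olympic_scrapper.py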