-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathscrape_extensions.py
executable file
·60 lines (43 loc) · 1.58 KB
/
scrape_extensions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#! /usr/bin/env python3
'''
Scrapes file extensions for various file types from FileInfo.com.
The file types are defined in the constants module.
'''
import io
import json
from time import sleep
from urllib.request import urlopen
from bs4 import BeautifulSoup
from constants import BASE_URL, FILE_TYPES
EXTENSIONS_DICT = {}
EXTENSIONS_BY_TYPE = {}
def make_soup(url):
    '''Download *url* and return its HTML parsed into a BeautifulSoup
    tree using the lxml parser.'''
    return BeautifulSoup(urlopen(url).read(), 'lxml')
def get_extensions_for(type):
    '''Scrape the FileInfo.com extension table for one file type.

    Records every extension found into the module-level globals:
    EXTENSIONS_BY_TYPE[type] gets the list of extensions, and
    EXTENSIONS_DICT maps each extension to its type and description.
    '''
    soup = make_soup(BASE_URL + FILE_TYPES[type]['url'])
    body = soup.find('tbody')
    # Bind the fresh list once; appending below mutates the same object.
    collected = EXTENSIONS_BY_TYPE[type] = []
    for row in body.find_all('tr'):
        cells = row.find_all('td')
        ext = cells[0].get_text()
        collected.append(ext)
        EXTENSIONS_DICT[ext] = {
            'type': type,
            'description': cells[1].get_text(),
        }
def get_all_extensions():
    '''Scrape extensions for every entry in FILE_TYPES, sleeping one
    second between requests to avoid hammering the server.'''
    for file_type in FILE_TYPES:
        get_extensions_for(file_type)
        sleep(1)
def write_dict_to_json_file(dictionary, filename):
    '''Write *dictionary* to *filename* as pretty-printed JSON.

    The output is UTF-8 encoded with keys sorted alphabetically and
    4-space indentation.  Non-ASCII characters are written verbatim
    (ensure_ascii=False), so readers must decode the file as UTF-8.

    :param dictionary: any JSON-serializable mapping
    :param filename: path of the file to (over)write
    '''
    # Builtin open() handles encoding in Python 3; the legacy io.open
    # alias is unnecessary.  json.dump streams straight to the file,
    # avoiding an intermediate string.
    with open(filename, 'w', encoding='utf8') as file:
        json.dump(dictionary, file,
                  ensure_ascii=False,
                  indent=4,
                  sort_keys=True,
                  separators=(',', ': '))
if __name__ == '__main__':
    # Scrape every file type listed in FILE_TYPES (one HTTP request per
    # type, throttled inside get_all_extensions), then persist both
    # views of the data to JSON files in the working directory.
    get_all_extensions()
    write_dict_to_json_file(EXTENSIONS_DICT, 'extensions.json')
    write_dict_to_json_file(EXTENSIONS_BY_TYPE, 'extensions_by_type.json')