-
Notifications
You must be signed in to change notification settings - Fork 0
/
file_functions.py
139 lines (108 loc) · 4.45 KB
/
file_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import pytz
import requests
from bs4 import BeautifulSoup
from zipfile import ZipFile
from datetime import datetime
import os
from pandas import read_csv, DataFrame
def game_types() -> dict:
    '''
    Scrapes cricsheet.org and returns download URLs for the T20 leagues
    that are listed in this repository's README.

    Returns:
        dict: {league_abbreviation: zip_download_url} for every league in
        the README's "Game types available" table.
    '''
    # Fetch the matches index page and parse it.
    main_url = 'https://cricsheet.org/matches/'
    page = requests.get(main_url)
    soup = BeautifulSoup(page.content, 'html.parser')
    tag_dl = soup.find_all('dl')

    # Build abbreviation -> [download link, human-readable match type].
    # The abbreviation is the zip filename with its '.zip' suffix removed;
    # each <dt> label pairs positionally with the following <dd>.
    data_url = dict()
    match_labels = [dt.text for dt in tag_dl[0].find_all('dt')]
    for label, dd in zip(match_labels, tag_dl[0].find_all('dd')):
        # The second <a> inside each <dd> points at the .zip archive.
        href = dd.find_all('a')[1]['href']
        key = href.split('/')[2][:-4]  # drop the '.zip' suffix
        data_url[key] = ['https://cricsheet.org' + href, label]

    # Read the README's game-type table to find which leagues we support.
    with open('./README.md', 'r') as f:
        readme = f.readlines()
    readme = readme[readme.index('### Game types available <br>\n'):]
    readme = readme[4:]  # skip heading, blank line and table header rows
    table = list()
    for row in readme:
        if row[0] != '|':
            break  # end of the markdown table
        table.append(row)
    table = [row.split('| ') for row in table]
    # Column 2 of each table row holds the league abbreviation.
    codes = [row[2].strip().lower() for row in table]

    # Keep only the README-declared leagues, mapped to their zip URLs.
    return {code: data_url[code][0] for code in codes}
def file_process():
    '''
    Downloads and unpacks every game archive returned by game_types().

    For each game: downloads the zip, extracts its .yaml match files into
    ./<game>_files/ and its README.txt into the working directory, renames
    the match files to <game>NNNN.yaml in the order derived from README.txt,
    appends a timestamped line to ./logs.txt, and deletes the zip.
    '''
    games_dict = game_types()
    for game in games_dict:
        # Download this game's archive to ./<game>.zip.
        response = requests.get(games_dict[game])
        zip_path = f'./{game}.zip'
        with open(zip_path, 'wb') as fh:  # avoid shadowing builtin 'zip'
            fh.write(response.content)

        # Extract YAML match files per game; README.txt goes to the cwd.
        with ZipFile(zip_path, 'r') as zip_obj:
            for member in zip_obj.namelist():
                if member.endswith('.yaml'):
                    zip_obj.extract(member, f'./{game}_files')
                if member.endswith('.txt'):
                    zip_obj.extract(member, './')

        files_list = list()
        if game == 't20s':
            # This match is missing from cricsheet's t20s README; append it
            # so the rename pass below covers every extracted file.
            with open('./README.txt', 'a') as file_obj:
                file_obj.write('\n')
                file_obj.write(
                    '2019-05-05 - international - T20 - female - 1182643 - Kenya vs Namibia')

        # Lines starting with '2' are match records ('2019-05-05 - ...');
        # splitting on '-' puts the match id at index 6 (the date itself
        # consumes indices 0-2).
        with open('./README.txt', 'r') as readme:
            for line in readme:
                if line[0] == '2':
                    fields = [x.strip(' ').strip('\n') for x in line.split('-')]
                    files_list.append(str(fields[6]))

        # Rename extracted files to <game>0001.yaml, <game>0002.yaml, ...
        # iterating files_list newest-first (reverse README order). List the
        # directory once and map file stem -> filename, instead of the
        # original O(n^2) re-listing on every iteration.
        extracted = {name[:-5]: name for name in os.listdir(f'./{game}_files')}
        count = 1
        for i in range(len(files_list) - 1, -1, -1):
            match_id = files_list[i]
            if match_id in extracted:
                dst = f'./{game}_files/{game}{count:04d}.yaml'
                src = f'./{game}_files/{extracted[match_id]}'
                os.rename(src, dst)
                count += 1

        # Append a UTC-timestamped progress line to the log.
        with open('./logs.txt', 'a') as log:
            now = datetime.now(pytz.utc)
            dt_string = now.strftime("%b %d, %Y %H:%M:%S UTC")
            log.write(f'{dt_string} | {len(files_list)} files of {game.upper()} processed\n')
            if game == 'wtc':
                # 'wtc' is the last game processed: close this log section.
                log.write(f'{"-" * 20}\n')

        # Best-effort cleanup of the downloaded archive.
        try:
            os.remove(f'{game}.zip')
        except OSError as e:
            print(f'Error: {game}.zip : {e.strerror}')
def load_df(file_types: dict, game: str) -> dict:
    '''
    Loads (or initializes) one DataFrame per file type for a game.

    Args:
        file_types: mapping of file-type name -> list of column names.
        game: game abbreviation; saved CSVs live in ./<game>_data/.

    Returns:
        dict: file-type name -> DataFrame, read from
        ./<game>_data/<file_type>_df.csv when that file exists, otherwise
        a new empty DataFrame with the configured columns.
    '''
    # List the data directory once; set membership is O(1) per lookup
    # (the original re-ran os.listdir for every file type).
    existing = set(os.listdir(f'{game}_data'))
    dict_data_df = dict()
    for file_type, columns in file_types.items():
        csv_name = f'{file_type}_df.csv'
        if csv_name in existing:
            dict_data_df[file_type] = read_csv(f'./{game}_data/{csv_name}')
        else:
            # No saved data yet: start with an empty, correctly-shaped frame.
            dict_data_df[file_type] = DataFrame(columns=columns)
    return dict_data_df