-
Notifications
You must be signed in to change notification settings - Fork 0
/
all_pressure.py
104 lines (68 loc) · 3.08 KB
/
all_pressure.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# import libraries
import argparse
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
# Column names assigned to every scraped pressure-leaderboard table.
PRESSURE_COLUMNS = ['rank', 'name', 'pressure_rating', 'break_pts_converted%',
                    'break_pts_saved%', 'tie_breaks%', 'deciding_sets%']


def leaderboard_url(time_frame):
    """Return the hard-court pressure leaderboard URL for *time_frame*.

    Args:
        time_frame: Value placed in the ``timeFrame`` query parameter; the
            script uses a year (e.g. ``2018``) or the string ``'Career'``.

    Returns:
        The full leaderboard URL as a string.
    """
    return ('https://www.atptour.com/en/stats/leaderboard?boardType=pressure'
            f'&timeFrame={time_frame}&surface=hard&versusRank=all'
            '&formerNo1=false')


def fetch_leaderboard_html(link, user_agent=None):
    """Load *link* in headless Chrome and return the page source.

    Args:
        link: URL to open.
        user_agent: Optional value for Chrome's ``user-agent`` argument. When
            ``None`` the argument is not passed at all.

    Returns:
        The page source reported by the browser right after ``get`` returns.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless=new')
    if user_agent is not None:
        options.add_argument(f'user-agent={user_agent}')
    browser = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()), options=options)
    try:
        browser.get(link)
        return browser.page_source
    finally:
        # Always shut the browser down, even when loading the page fails,
        # so no headless Chrome process is left behind.
        browser.quit()


def parse_pressure_table(html):
    """Extract the pressure leaderboard table from *html* as a DataFrame.

    Args:
        html: Page source containing an element with id
            ``statsListingTableContent`` that wraps a ``<table>``.

    Returns:
        A DataFrame whose columns are renamed to ``PRESSURE_COLUMNS``.

    Raises:
        RuntimeError: If the container element or its table is not found.
        ValueError: If the parsed table does not have exactly as many
            columns as ``PRESSURE_COLUMNS`` (raised by pandas).
    """
    soup = BeautifulSoup(html, 'html.parser')
    container = soup.find(id="statsListingTableContent")
    table = container.find('table') if container is not None else None
    if table is None:
        raise RuntimeError(
            'pressure leaderboard table not found in the fetched page')

    # Re-append every direct child element of <thead>/<tbody> to its parent.
    # NOTE(review): this keeps the elements in their original order, so it
    # looks like a no-op for read_html; kept from the original script --
    # confirm whether it can be dropped.
    for section_name in ('thead', 'tbody'):
        section = table.find(section_name)
        for child in section.find_all(recursive=False):
            section.append(child)

    # read_html expects a file-like object; passing literal HTML strings is
    # deprecated in recent pandas versions.
    frame = pd.read_html(StringIO(str(table)))[0]
    frame.columns = PRESSURE_COLUMNS
    return frame


def fill_missing_ratings(main_df, career_df):
    """Fill missing ``pressure_rating`` values in *main_df* by player name.

    Args:
        main_df: DataFrame with at least ``name`` and ``pressure_rating``
            columns.
        career_df: DataFrame with ``name`` and ``pressure_rating`` columns
            used as the fallback source.

    Returns:
        A new DataFrame in which rows of *main_df* lacking a
        ``pressure_rating`` take the value of the matching ``name`` in
        *career_df* (if any). ``name`` is the first column of the result, as
        it is after a ``set_index``/``reset_index`` round trip. *main_df* is
        not modified.
    """
    no_stats = main_df.loc[main_df['pressure_rating'].isna(), 'name']
    fallback = career_df[career_df['name'].isin(no_stats)]
    career_rating = fallback.set_index('name')['pressure_rating']
    # Index-aligned fillna needs unique labels on the filler side.
    career_rating = career_rating[~career_rating.index.duplicated(keep='first')]

    filled = main_df.set_index('name')
    # Assign the result back instead of a chained in-place fillna, which does
    # not update the parent frame under pandas Copy-on-Write.
    filled['pressure_rating'] = filled['pressure_rating'].fillna(career_rating)
    return filled.reset_index()


def main():
    """Add ATP pressure stats to the Australian Open player-stats CSV.

    Reads ``--year`` (default 2018), scrapes that year's hard-court pressure
    leaderboard, left-merges it onto ``./data/aus-open-player-stats-<year>.csv``
    by ``name``, fills still-missing ``pressure_rating`` values from the
    career leaderboard, and writes the result back to the same CSV.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--year', default=2018, type=int)
    args = parser.parse_args()
    year = args.year
    # NOTE(review): the output overwrites the input file, so a second run for
    # the same year merges onto columns added by the first run -- confirm
    # this is intended.
    csv_path = f'./data/aus-open-player-stats-{year}.csv'

    # Stats for the requested year. This request sends the year followed by
    # its reversed digits as the user-agent string; the career request below
    # sends no custom user-agent.
    yearly_html = fetch_leaderboard_html(
        leaderboard_url(year), user_agent=f'{year}-{str(year)[::-1]}')
    yearly_df = parse_pressure_table(yearly_html)

    # merge with the original set of stats
    main_df = pd.read_csv(csv_path)
    main_df = main_df.merge(yearly_df, on='name', how='left')

    # Career ratings fill in for players whose yearly rating was unavailable.
    career_html = fetch_leaderboard_html(leaderboard_url('Career'))
    career_df = parse_pressure_table(career_html)

    main_df = fill_missing_ratings(main_df, career_df)
    main_df.to_csv(csv_path, index=False)


if __name__ == '__main__':
    main()