-
Notifications
You must be signed in to change notification settings - Fork 0
/
cov.py
79 lines (65 loc) · 3 KB
/
cov.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import pandas as pd
import numpy as np
from datetime import datetime
# Historical per-player results. Columns read by this script: "Name", "Date",
# "Team", "Order", "Position", "Opponent" and "Scored".
data = pd.read_csv("./data/linestar_data.csv")

# Rows whose position is anything other than "P" are treated as batters.
is_batter = data["Position"] != "P"

# For batters, keep the text before the first comma of "Opponent" as the
# opposing pitcher. Pitcher rows are absent from the right-hand side, so index
# alignment leaves NaN for them in the new column.
# NOTE(review): presumably "Opponent" looks like "<pitcher name>,<...>" — confirm.
data["Opposing Pitcher"] = (
    data.loc[is_batter, "Opponent"].str.split(",", n=1).str[0]
)

# Slate for the current local date; the file name embeds today's date as
# YYYY-MM-DD.
slate = pd.read_csv(f"./data/slate_{datetime.today().strftime('%Y-%m-%d')}.csv")

batters = data[is_batter]

# Total points per (date, team, batting-order slot). "Scored" is selected
# before aggregating so that unrelated (possibly non-numeric) columns are never
# summed.
order_scored = batters.groupby(["Date", "Team", "Order"])["Scored"].sum()

# Unstacking moves the batting-order slot into the columns, so .corr() yields a
# slot-by-slot correlation matrix computed across (date, team) rows.
batters_corr = order_scored.unstack().corr()

pitchers = data[~is_batter]

# Attach to every batter row the score of the pitcher they faced on that date.
# The left frame keeps its column names; overlapping columns coming from the
# pitcher frame get the " Opposing" suffix (e.g. "Scored Opposing").
batters = batters.merge(
    pitchers[["Name", "Date", "Scored"]],
    left_on=["Date", "Opposing Pitcher"],
    right_on=["Date", "Name"],
    how="left",
    suffixes=(None, " Opposing"),
)

# Scalar correlation between a batter's score and the opposing pitcher's score.
pitchers_corr = (
    batters[["Scored", "Scored Opposing"]].corr().loc["Scored", "Scored Opposing"]
)
# Build the player-by-player correlation matrix for the slate.
#
# Rules:
#   * a player is perfectly correlated with themselves (1);
#   * two batters on the same team use the historical correlation between
#     their batting-order slots (batters_corr);
#   * a batter and a pitcher on the same team are uncorrelated (0);
#   * a batter and the opposing pitcher use the scalar pitchers_corr;
#   * every other pair is uncorrelated (0).
corr = pd.DataFrame(columns=slate["Name"], index=slate["Name"], dtype=float)

for row in slate.itertuples():
    corr.loc[row.Name, row.Name] = 1

    # Pitchers only receive entries through the batter branches below; any
    # cell still unset at the end is filled with 0. Zeroing the whole pitcher
    # row here would erase values already written by batters that appear
    # earlier in the slate and leave the matrix asymmetric.
    if row.Position == "P":
        continue

    same_team = slate.loc[slate["Team"] == row.Team, :]
    for teammate in same_team.itertuples():
        if teammate.Name == row.Name:
            # Keep the diagonal at exactly 1.
            continue
        if teammate.Position == "P":
            pair_corr = 0
        else:
            pair_corr = batters_corr.loc[row.Order, teammate.Order]
        # Write both cells so the matrix stays symmetric.
        corr.loc[row.Name, teammate.Name] = pair_corr
        corr.loc[teammate.Name, row.Name] = pair_corr

    # Only link to the opposing pitcher when that pitcher is on the slate;
    # assigning through .loc with an unknown label would silently append a new
    # row/column to the matrix.
    if row.Opp_Pitcher in corr.index:
        corr.loc[row.Name, row.Opp_Pitcher] = pitchers_corr
        corr.loc[row.Opp_Pitcher, row.Name] = pitchers_corr

# Every pair that was not assigned above is uncorrelated.
corr = corr.fillna(0)

# Check that the correlation matrix is symmetric and all its eigenvalues are
# >= 0. These two conditions jointly imply the matrix is positive
# semi-definite. eigvalsh is the symmetric-matrix routine and returns real
# eigenvalues; the small tolerance absorbs floating-point round-off around 0.
eig_tol = 1e-8
corr_values = corr.to_numpy()
is_symmetric = np.array_equal(corr_values, corr_values.T)
if not (is_symmetric and np.all(np.linalg.eigvalsh(corr_values) >= -eig_tol)):
    raise ValueError("Correlation matrix not positive semi-definite")
# Historical standard deviation of "Scored" per player. The column is selected
# before aggregating so std() is never applied to non-numeric columns.
hist_std = data.groupby("Name")["Scored"].std()

# Default standard deviations for players with missing values
default_pitcher_std = 15
default_other_std = 10

# Get historical standard deviation of scored points for players on the
# current slate, in slate order. reindex() yields NaN for players with no
# history at all, whereas .loc with a list of labels raises KeyError for them.
# NOTE(review): this reads slate["Player"] while the rest of the script keys
# the slate by slate["Name"] — confirm both columns exist and agree.
hist_std = hist_std.reindex(slate["Player"])

# Replace missing values (no history, or too few games for a std) with the
# position-specific default. The substitution is positional, i.e. it follows
# the row order of the slate.
is_pitcher = (slate["Position"] == "P").to_numpy()
fallback_std = np.where(is_pitcher, default_pitcher_std, default_other_std)
std_values = np.where(
    hist_std.isna().to_numpy(), fallback_std, hist_std.to_numpy(dtype=float)
)
hist_std = pd.Series(std_values, index=hist_std.index)

# cov = diag(std) @ corr @ diag(std), which is corr scaled element-wise by
# std_i * std_j. Working on raw arrays keeps the values from being realigned
# by label when they are wrapped in a DataFrame below.
cov = corr.to_numpy() * np.outer(std_values, std_values)
cov = pd.DataFrame(cov, columns=slate["Name"], index=slate["Name"])
cov.to_csv("./data/slate_cov.csv")