-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataPrep.py
80 lines (59 loc) · 3.04 KB
/
dataPrep.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
## Import library for data handling
import pandas as pd
## Load the two input tables
## matchups - all of the matchup data, so we know which team played which and who won
##            (rows with missing values are dropped, then the index is renumbered from 0;
##            reset_index() also keeps the old index as an extra 'index' column)
## fullData - the data for each team and their aggregate statistics
matchups = pd.read_csv('matchups.csv').dropna().reset_index()
fullData = pd.read_csv('fullData.csv')
## Define a function to retrieve a team's stats based on the year and team name
def getTeamStats(team, year, data=None):
    """Return the first stats row for ``team`` in ``year`` as a list of values.

    Args:
        team: Team name, compared for exact equality against the 'Team' column.
        year: Season, compared for exact equality against the 'Year' column.
        data: DataFrame with 'Year' and 'Team' columns to search. Defaults to
            the module-level ``fullData`` table when omitted.

    Returns:
        A list holding every column value of the first matching row (in column
        order), or None when no row matches.
    """
    if data is None:
        data = fullData
    matches = data[(data['Year'] == year) & (data['Team'] == team)]
    ## iterrows() is kept (rather than iloc[0]) so the values come back exactly
    ## as before; only the first matching row is ever consumed
    firstMatch = next(matches.iterrows(), None)
    if firstMatch is None:
        return None
    _, row = firstMatch
    return list(row)
## Test of the function: print the stats row for 'iona' in 2019
## (prints None if fullData has no row for that team and year)
print(getTeamStats('iona', 2019))
# newCols = ['RPI', 'Win%', 'CGWin%', 'SOS', 'Last 10', 'Margin', 'OffE', 'AdjO', 'DefE', 'AdjD', 'EffM', 'AdjEM', 'PF', 'PA', 'PFAM', 'TrueS%', 'OpTS%', 'TS%M', 'FG%', 'OpFG%', 'FG%M', '3P%', 'FT%', 'RB%', 'ST/Pos', 'TO/Pos', 'OpTO/Pos', 'TOM', 'BL%', 'PF/Pos']
# statList = ['effective-field-goal-pct', 'ftm-per-100-possessions', 'offensive-rebounding-pct', 'defensive-rebounding-pct', 'assists-per-fgm', 'effective-possession-ratio']
# columnsToUse = ['Seed', 'Year', 'Team'] + statList
# allCols = columnsToUse + newCols
# allCols = allCols + allCols
# regressionData = pd.DataFrame(columns = allCols)
## Generating an empty list for collection purposes - one combined row per matchup
regressionData = []
## Declaring counters outside of the loop to track how many matchups were built or skipped
failureTotal = 0
successTotal = 0
## Using the csv of teams and their stats (matchups and fullData) to produce data that is usable for a regression/other ML algorithm
## each output row holds the winning team's stats followed by the losing team's stats
rowCount = len(matchups)
## For each row in matchups (i is the row's position, which is what iloc expects)
for i, (_, row) in enumerate(matchups.iterrows()):
    ## Because of the way the data is formatted - every third row we are ready to read in a new matchup
    if i % 3 != 0:
        continue
    ## The first team is always the winner, so we grab that first
    year = int(row['Year'])
    winner = row['Team'].strip()
    winnerInfo = getTeamStats(winner, year)
    ## The next row will be the losing team; if the data ends on a winner row
    ## there is no loser to read, so the matchup is counted as a failure instead of crashing
    hasLoserRow = i + 1 < rowCount
    loser = matchups.iloc[i + 1]['Team'].strip() if hasLoserRow else None
    loserInfo = getTeamStats(loser, year) if hasLoserRow else None
    ## getTeamStats returns None when a team/year pair is missing from fullData
    if winnerInfo is None or loserInfo is None:
        failureTotal += 1
        print('failure', i, winner, loser, year)
        continue
    ## Combining the data into one row and append it to our list of matchups
    regressionData.append(winnerInfo + loserInfo)
    successTotal += 1
## Report the failure rate of this process as a percentage - there may be some edge case failures
attemptTotal = failureTotal + successTotal
## Guard against an empty matchups table so the report does not divide by zero
failureRate = failureTotal / attemptTotal if attemptTotal else 0.0
print(round(failureRate * 100, 2), '%')
## Transform list of matchups into a dataframe, write it to a csv
regressionDF = pd.DataFrame.from_records(regressionData)
regressionDF.to_csv('regressionData.csv')
## NOTE FOR THE END TO END PROCESS:
## regressionData gets changed and manually cleaned to turn into regressionUseData