-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwebscraper.py
88 lines (62 loc) · 3.29 KB
/
webscraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
## Importing libraries for webscraping and dataframe handling
import requests
from bs4 import BeautifulSoup
import pandas as pd
## This is the website from where statistics are being pulled
## https://www.teamrankings.com/ncb/stats/
## Defining function to retrieve a year's march madness teams
def getTeamsForYear(year, dataDir='.'):
    """Return the 'Team' column of one year's CSV file as a list.

    Reads ``<dataDir>/<year>.csv`` with pandas and returns the values of its
    'Team' column in file order.

    Args:
        year: Year identifying the file. It is converted with ``str`` to
            build the file name, so ints and strings both work.
        dataDir: Directory holding the per-year CSV files. Defaults to the
            current working directory.

    Returns:
        A list with one entry per row of the 'Team' column.

    Raises:
        FileNotFoundError: If the CSV file does not exist.
        KeyError: If the file has no 'Team' column.
    """
    ## Local import so the function does not depend on a file-level import
    from pathlib import Path

    csvPath = Path(dataDir) / (str(year) + '.csv')
    yearDF = pd.read_csv(csvPath)
    return yearDF['Team'].tolist()
## Stats to collect. Each entry is appended to the stat URL when scraping
## and is also used as the name of a dataframe column.
statList = [
    'effective-field-goal-pct',
    'ftm-per-100-possessions',
    'offensive-rebounding-pct',
    'defensive-rebounding-pct',
    'assists-per-fgm',
    'effective-possession-ratio',
    'RPI',
    'win-pct-all-games',
    'win-pct-close-games',
    'SOS',
    'Last 10',
    'average-scoring-margin',
    'offensive-efficiency',
    'AdjO',
    'defensive-efficiency',
    'AdjD',
    'EffM',
    'AdjEM',
    'points-per-game',
    'opponent-points-per-game',
    'PFAM',
    'true-shooting-percentage',
    'opponent-true-shooting-percentage',
    'TS%M',
    'shooting-pct',
    'opponent-shooting-pct',
    'FG%M',
    'three-point-pct',
    'free-throw-pct',
    'total-rebounding-percentage',
    'steals-perpossession',
    'turnovers-per-possession',
    'opponent-turnovers-per-possession',
    'TOM',
    'block-pct',
    'personal-fouls-per-possession',
]
## Year and team come first, followed by one column per stat
columnsToUse = ['Year', 'Team', *statList]
## Empty dataframe that accumulates the rows of every year
collectorDF = pd.DataFrame(columns=columnsToUse)
## Column layout shared by every per-year dataframe (does not change per year)
columnsToUse = ['Year', 'Team'] + statList
## Looping through each year for which we have data (2008 - 2019)
for year in range(2008, 2020):
    ## Separate name for the year being processed; `year` itself is left
    ## untouched so the loop variable never changes type
    currentYear = year
    ## Grabbing all the march madness teams of this year
    teamsThisYear = getTeamsForYear(currentYear)
    ## The last entry of the list is dropped
    ## NOTE(review): presumably a trailing non-team row in the CSV -- confirm
    teamsThisYear = teamsThisYear[:-1]
    ## Set of the same names for constant-time membership checks below
    teamLookup = set(teamsThisYear)
    ## Initialize empty dataframe with one row per team of this year
    yearDF = pd.DataFrame(columns = columnsToUse)
    ## Status update for the Loop
    print('CURRENT YEAR :', currentYear)
    yearDF['Team'] = teamsThisYear
    yearDF['Year'] = currentYear
    ## Loop through all the stats we want to collect
    for stat in statList:
        ## Status update for loop
        print(stat)
        ## Webscrape the stat page as of April 10th of the current year
        URL = 'https://www.teamrankings.com/ncaa-basketball/stat/' + stat + '?date=' + str(currentYear) + '-04-10'
        ## Timeout so an unresponsive server cannot hang the run forever
        page = requests.get(URL, timeout=30)
        soup = BeautifulSoup(page.content, 'html.parser')
        table_body = soup.find('tbody')
        ## No table on the page: report it and leave this stat's column empty
        if table_body is None:
            print('failure ', stat)
            continue
        ## Parse data
        for row in table_body.find_all('tr'):
            cols = [cell.text.strip() for cell in row.find_all('td')]
            ## Rows without a team cell and a value cell cannot be used
            if len(cols) < 3:
                continue
            ## If we are looking at data from a march madness team for this year, add it to our DF
            if cols[1] in teamLookup:
                yearDF.loc[yearDF.Team == cols[1], stat] = cols[2]
    ## Append the current year's dataframe to our main dataframe to collect information for each year
    ## (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
    collectorDF = pd.concat([collectorDF, yearDF])
## Write dataframe to csv
collectorDF.to_csv('collectorDF.csv')
## NOTE FOR THE END TO END PROCESS:
## collectorDF will have some data manually added, and will eventually become fullData