vgchartzfull.py (forked from GregorUT/vgchartzScrape)
from bs4 import BeautifulSoup
import urllib.request
import pandas as pd
import numpy as np
pages = 19
rec_count = 0
rank = []
gname = []
platform = []
year = []
genre = []
critic_score = []
user_score = []
publisher = []
developer = []
sales_na = []
sales_pal = []
sales_jp = []
sales_ot = []
sales_gl = []
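# One list per output column: index i across all of the lists above
# describes the same game, so every append in the loop below must
# stay in lockstep.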
urlhead = 'http://www.vgchartz.com/gamedb/?page='
urltail = '&console=&region=All&developer=&publisher=&genre=&boxart=Both&ownership=Both'
urltail += '&results=1000&order=Sales&showtotalsales=0&showtotalsales=1&showpublisher=0'
urltail += '&showpublisher=1&showvgchartzscore=0&shownasales=1&showdeveloper=1&showcriticscore=1'
urltail += '&showpalsales=0&showpalsales=1&showreleasedate=1&showuserscore=1&showjapansales=1'
urltail += '&showlastupdate=0&showothersales=1&showgenre=1&sort=GL'
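# NOTE: several show* flags appear twice (e.g. showtotalsales=0 then
# showtotalsales=1). This mirrors the site's checkbox form, which submits a
# hidden 0 plus a 1 when the box is ticked; the server presumably honours
# the last value, so the duplicates are harmless.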
for page in range(1, pages):  # range stops before `pages`, so this visits pages 1-18
    surl = urlhead + str(page) + urltail
    r = urllib.request.urlopen(surl).read()
    soup = BeautifulSoup(r, "html.parser")
    print(f"Page: {page}")
    # The results table has no stable id, so find every <a> tag whose
    # href points at an individual game page; skip anchors with no href.
    game_tags = list(filter(
        lambda x: 'href' in x.attrs
        and x.attrs['href'].startswith('http://www.vgchartz.com/game/'),
        soup.find_all("a")
    ))[10:]  # the first 10 matches are navigation-bar links, not games
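    # An equivalent one-liner (not in the original script) is a CSS
    # attribute selector, which also skips href-less anchors:
    #   game_tags = soup.select('a[href^="http://www.vgchartz.com/game/"]')[10:]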
    for tag in game_tags:
        # game name: collapse any runs of whitespace in the link text
        gname.append(" ".join(tag.string.split()))
        print(f"{rec_count + 1}: fetching data for {gname[-1]}")
        # the <a> sits inside a <td>; step up to the parent <tr> and
        # read the row's cells positionally
        data = tag.parent.parent.find_all("td")
        rank.append(np.int32(data[0].string))
        platform.append(data[3].find('img').attrs['alt'])
        publisher.append(data[4].string)
        developer.append(data[5].string)
        # scores are plain numbers; "N/A" becomes NaN
        critic_score.append(
            float(data[6].string)
            if not data[6].string.startswith("N/A") else np.nan)
        user_score.append(
            float(data[7].string)
            if not data[7].string.startswith("N/A") else np.nan)
        # sales cells look like "3.50m"; [:-1] strips the trailing "m"
        # (figures are in millions of units)
        sales_na.append(
            float(data[9].string[:-1])
            if not data[9].string.startswith("N/A") else np.nan)
        sales_pal.append(
            float(data[10].string[:-1])
            if not data[10].string.startswith("N/A") else np.nan)
        sales_jp.append(
            float(data[11].string[:-1])
            if not data[11].string.startswith("N/A") else np.nan)
        sales_ot.append(
            float(data[12].string[:-1])
            if not data[12].string.startswith("N/A") else np.nan)
        sales_gl.append(
            float(data[8].string[:-1])
            if not data[8].string.startswith("N/A") else np.nan)
        # release date looks like "10th Nov 09"; keep the 2-digit year
        release_year = data[13].string.split()[-1]
        if release_year.startswith('N/A'):
            year.append('N/A')
        else:
            # two-digit year pivot: "80"-"99" -> 19xx, else 20xx
            # (e.g. "94" -> 1994, "09" -> 2009)
            if int(release_year) >= 80:
                year_to_add = np.int32("19" + release_year)
            else:
                year_to_add = np.int32("20" + release_year)
            year.append(year_to_add)
        # visit the individual game page to get the genre
        url_to_game = tag.attrs['href']
        site_raw = urllib.request.urlopen(url_to_game).read()
        sub_soup = BeautifulSoup(site_raw, "html.parser")
        # the info box layout varies between games, so scan its <h2>
        # headings for the one labelled "Genre" and read its next sibling;
        # if no such heading exists, record None instead of crashing
        h2s = sub_soup.find("div", {"id": "gameGenInfoBox"}).find_all('h2')
        game_genre = None
        for h2 in h2s:
            if h2.string == 'Genre':
                game_genre = h2.next_sibling.string
                break
        genre.append(game_genre)
        rec_count += 1
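# NOTE: the loop above issues one extra HTTP request per game with no delay.
# If vgchartz rate-limits you, a small pause inside the inner loop helps,
# e.g. time.sleep(0.5) after each game (requires "import time"; not part of
# the original script).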
columns = {
    'Rank': rank,
    'Name': gname,
    'Platform': platform,
    'Year': year,
    'Genre': genre,
    'Critic_Score': critic_score,
    'User_Score': user_score,
    'Publisher': publisher,
    'Developer': developer,
    'NA_Sales': sales_na,
    'PAL_Sales': sales_pal,
    'JP_Sales': sales_jp,
    'Other_Sales': sales_ot,
    'Global_Sales': sales_gl
}
print(rec_count)
df = pd.DataFrame(columns)
print(df.columns)
df = df[[
    'Rank', 'Name', 'Platform', 'Year', 'Genre',
    'Publisher', 'Developer', 'Critic_Score', 'User_Score',
    'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']]
df.to_csv("vgsales.csv", sep=",", encoding='utf-8', index=False)
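# Usage sketch (assumes Python 3.6+ for f-strings, with beautifulsoup4,
# pandas and numpy installed):
#   pip install beautifulsoup4 pandas numpy
#   python vgchartzfull.py
# The script writes vgsales.csv to the working directory, one row per game,
# with sales figures in millions of units.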