-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsurvivor_scraper.py
294 lines (247 loc) · 11.7 KB
/
survivor_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
import json
import os
import time
from typing import Any, Dict, List, Set

import requests
from bs4 import BeautifulSoup
class SurvivorScraper:
    """Scrape Survivor cast pages from truedorktimes.com.

    Builds two datasets from the season-overview cast pages:
      * ``self.players``    -- player name -> {"seasons": set[int], "image_url": str | None}
      * ``self.placements`` -- player name -> {season_number: placement}
    plus ``self.season_logos`` (season_number -> logo path), and saves them
    as JSON under ``survivor-graph/src/data``.
    """

    # Each cast-overview page on the site covers one of these season ranges.
    # (Previously this list was duplicated in three methods.)
    SEASON_RANGES = [(1, 10), (11, 20), (21, 30), (31, 40), (41, 50)]

    def __init__(self):
        # URL template: filled with (start_season, end_season) of a range.
        self.base_url = "https://www.truedorktimes.com/survivor/cast/season{}-{}.htm"
        # player_name -> {"seasons": set of season numbers, "image_url": str or None}
        # NOTE: was annotated with builtin `any`; corrected to typing.Any.
        self.players: Dict[str, Dict[str, Any]] = {}
        # player_name -> {season_number: placement (1 = winner)}
        self.placements: Dict[str, Dict[int, int]] = {}
        # season_number -> logo path relative to the site root.
        # (Previously assigned twice: a dead `{}` followed by this comprehension.)
        self.season_logos: Dict[int, str] = {
            season: f"/survivor-stats/logos/season_{season}.png"
            for season in range(1, 49)
        }

    def run(self):
        """Fetch pages if not cached locally, parse them, and save both datasets."""
        files_exist = all(
            os.path.exists(f"survivor_seasons_{start}-{end}.html")
            for start, end in self.SEASON_RANGES
        )
        if not files_exist:
            self.save_html_for_analysis()
        # Then process all files to build the complete dataset.
        self.process_saved_files()
        # Finally save both datasets.
        self.save_data()
        self.save_placements()

    def process_saved_files(self):
        """Process all saved HTML files to build the complete dataset."""
        for start, end in self.SEASON_RANGES:
            filename = f"survivor_seasons_{start}-{end}.html"
            try:
                with open(filename, 'r', encoding='utf-8') as f:
                    content = f.read()
                self.parse_page(content, start, end)
            except FileNotFoundError:
                # FIX: the original f-string had no placeholder and printed
                # a literal "(unknown)" instead of the missing filename.
                print(f"Warning: {filename} not found")

    def save_html_for_analysis(self):
        """Download and save each season-range page locally for offline parsing."""
        for start, end in self.SEASON_RANGES:
            url = self.base_url.format(start, end)
            filename = f"survivor_seasons_{start}-{end}.html"
            try:
                page_content = self.fetch_page(url)
                with open(filename, 'w', encoding='utf-8') as f:
                    f.write(page_content)
                # FIX: original f-string printed "(unknown)" instead of the filename.
                print(f"Saved {filename}")
            except requests.RequestException as e:
                print(f"Error fetching seasons {start}-{end}: {e}")
            # Pause between page downloads to be polite to the server.
            time.sleep(2)

    def fetch_page(self, url: str) -> str:
        """Fetch *url* and return its text; raises requests.RequestException on failure."""
        # Add delay to be nice to the server.
        time.sleep(1)
        # FIX: added a timeout so a dead server cannot hang the scraper forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.text

    def clean_name(self, name: str) -> str:
        """Normalize a contestant name: strip placement prefixes, quotes and
        all-caps, then map known aliases to one canonical name."""
        name = name.strip()
        # Remove numerical prefixes (e.g., "1. ", "2. ").
        parts = [p for p in name.split() if not p.rstrip('.').isdigit()]
        cleaned = []
        for part in parts:
            if not part:
                continue
            # Remove surrounding quotes (nicknames).
            if part.startswith("'") and part.endswith("'"):
                part = part[1:-1]
            # Normalize all-caps names (len > 2 so initials like "JT" survive).
            if part.isupper() and len(part) > 2:
                part = part.capitalize()
            cleaned.append(part)
        full_name = ' '.join(cleaned)
        # Players known under multiple names map to one canonical entry.
        name_mappings = {
            "Rob Mariano": "Boston Rob Mariano",
            "Boston Rob": "Boston Rob Mariano",
            "Candice Woodcock": "Candice Woodcock Cody",
            "Candice Cody": "Candice Woodcock Cody",
            "Amber Brkich": "Amber Mariano",
        }
        return name_mappings.get(full_name, full_name)

    def _extract_season_number(self, divider):
        """Pull the season number out of a card divider, trying, in order:
        the logo image's alt text, "S<nn>" in the text, "Survivor <nn>",
        and finally any bare number.  Returns None when nothing matches."""
        season_num = None
        img = divider.find('img')
        if img and img.get('alt'):
            alt_text = img.get('alt')
            if 'S' in alt_text:
                try:
                    season_num = int(''.join(c for c in alt_text.split('S')[1] if c.isdigit()))
                except (IndexError, ValueError):
                    pass
        if not season_num:
            season_text = divider.get_text().strip()
            if 'S' in season_text:
                try:
                    s_part = season_text.split('S')[1]
                    season_num = int(''.join(c for c in s_part if c.isdigit()))
                except (IndexError, ValueError):
                    pass
            if not season_num and 'Survivor ' in season_text:
                try:
                    season_num = int(season_text.split('Survivor ')[1].split()[0])
                except (IndexError, ValueError):
                    pass
            if not season_num:
                # Last resort: any standalone number in the text.
                try:
                    numbers = [int(s) for s in season_text.split() if s.strip(':').isdigit()]
                    if numbers:
                        season_num = numbers[0]
                except ValueError:
                    pass
        return season_num

    def _record_contestant(self, full_name: str, season_num: int,
                           image_url, placement: int):
        """Register one contestant appearance: season membership, image and placement.

        FIX: this logic was duplicated in two branches of parse_page, and only
        one branch refreshed image_url for an already-known player; now both
        paths behave consistently.
        """
        if full_name not in self.players:
            self.players[full_name] = {"seasons": set(), "image_url": image_url}
        self.players[full_name]["seasons"].add(season_num)
        if image_url:  # keep the most recently seen image
            self.players[full_name]["image_url"] = image_url
        if full_name not in self.placements:
            self.placements[full_name] = {}
        self.placements[full_name][season_num] = placement

    def parse_page(self, content: str, start_season: int, end_season: int):
        """Parse one saved cast-overview page, recording every contestant of
        seasons within [start_season, end_season]."""
        soup = BeautifulSoup(content, 'html.parser')
        for card in soup.find_all('div', class_='card'):
            divider = card.find('div', class_='card-divider')
            if not divider:
                continue
            season_num = self._extract_season_number(divider)
            if not season_num or season_num < start_season or season_num > end_season:
                continue
            contestant_entries = card.find_all('li', class_=['final', 'jury', 'generic'])
            # Entries are listed winner-first, so placements count down from
            # the total number of contestants to 1.
            current_placement = len(contestant_entries)
            for entry in contestant_entries:
                link = entry.find('a')
                if not link:
                    continue
                img = link.find('img')
                image_url = None
                if img and img.get('src'):
                    image_url = 'https://www.truedorktimes.com/survivor/cast/' + img.get('src')
                name_spans = link.find_all('span', class_=['firstname', 'lastname'])
                if not name_spans:
                    # Fallback: no name spans, take the link text directly.
                    name = link.get_text().strip()
                    if name:
                        full_name = self.clean_name(name)
                        if full_name:
                            self._record_contestant(full_name, season_num,
                                                    image_url, current_placement)
                            current_placement -= 1
                    continue
                name_parts = []
                for span in name_spans:
                    text = span.get_text().strip()
                    if not text:
                        continue
                    # When a nickname is embedded in the first name
                    # (e.g. "Benjamin 'Coach'"), keep only the nickname.
                    if "'" in text and text.count("'") == 2:
                        name_parts.append(text[text.find("'") + 1:text.rfind("'")])
                    else:
                        name_parts.append(text)
                full_name = self.clean_name(' '.join(name_parts))
                if not full_name:
                    continue
                self._record_contestant(full_name, season_num,
                                        image_url, current_placement)
                current_placement -= 1

    def save_data(self, filename: str = None):
        """Write the merged player/season dataset (plus season logos) as JSON."""
        if filename is None:
            data_dir = "survivor-graph/src/data"
            os.makedirs(data_dir, exist_ok=True)
            filename = os.path.join(data_dir, "survivor_data.json")
        # Merge any duplicate entries that might exist.
        merged_data = {}
        for player, data in self.players.items():
            # Special case: collapse every "Rob Mariano" variant to Boston Rob.
            key = "Boston Rob Mariano" if "Rob Mariano" in player else player
            if key not in merged_data:
                merged_data[key] = {"seasons": set(), "image_url": data["image_url"]}
            merged_data[key]["seasons"].update(data["seasons"])
        # Convert sets to sorted lists for JSON serialization.
        output_data = {
            "players": {
                name: {
                    "seasons": sorted(data["seasons"]),
                    "image_url": data["image_url"],
                }
                for name, data in merged_data.items()
            },
            "season_logos": self.season_logos,
        }
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, indent=2)

    def save_placements(self, filename: str = None):
        """Write the merged placement dataset as JSON."""
        if filename is None:
            data_dir = "survivor-graph/src/data"
            os.makedirs(data_dir, exist_ok=True)
            filename = os.path.join(data_dir, "survivor_placements.json")
        # Merge any duplicate entries that might exist.
        merged_placements = {}
        for player, placements in self.placements.items():
            # Same Boston Rob canonicalization as save_data.
            key = "Boston Rob Mariano" if "Rob Mariano" in player else player
            if key not in merged_placements:
                merged_placements[key] = {}
            merged_placements[key].update(placements)
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(merged_placements, f, indent=2)
if __name__ == "__main__":
    # Build and persist both datasets when invoked as a script.
    SurvivorScraper().run()