get_somafm_charts.py
# -*- coding: utf-8 -*-
"""
Simple scraper to pull the 'Top 30' charts from SomaFM (somafm.com) and reformat them
for analysis.
Author: Aaron Penne
Created: 2018-03-07
Developed with:
Python 3.6
Windows 10
"""
import os
from lxml import html
import requests
from datetime import datetime, date, timedelta
import pandas as pd
from multiprocessing.dummy import Pool
import itertools
# Profiling
t_start = datetime.now()
# Multithreading parameters
threads = 20  # I/O-bound task, so more threads than CPU cores is okay
# threads = 10 # Run time: 0:19:45
# threads = 20 # Run time: 0:09:57
# threads = 100 # Run time: 0:05:10 but heavy CPU usage
# Set output directory, make it if needed
output_dir = os.path.realpath(r'C:\tmp\somafm') # Windows machine
if not os.path.isdir(output_dir):
    os.mkdir(output_dir)
# List of all channel names formatted as in chart urls
channel_names = ['bagel',
                 'beatblender',
                 'bootliquor',
                 'brfm',
                 'christmas',
                 'cliqhop',
                 'covers',
                 'deepspaceone',
                 'defcon',
                 'digitalis',
                 'doomed',
                 'dronezone',
                 'dubstep',
                 'earwaves',
                 'fluid',
                 'folkfwd',
                 'groovesalad',
                 'illstreet',
                 'indiepop',
                 'jollysoul',
                 'lush',
                 'metal',
                 'missioncontrol',
                 'poptron',
                 'secretagent',
                 'seventies',
                 'sf1033',
                 'sonicuniverse',
                 'spacestation',
                 'suburbsofgoa',
                 'thetrip',
                 'thistle',
                 'u80s',
                 'xmasinfrisko',
                 'xmasrocks']
def last_saturday():
    """Return the most recent Saturday strictly before today (charts are published on Saturdays)"""
    today = date.today()
    today_index = (today.weekday() + 1) % 7  # Days since the last Sunday (Sunday=0, Saturday=6)
    week = today - timedelta(today_index + 1)  # One more day back lands on a Saturday
    return week
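# Worked example (hypothetical date): if today were Wednesday 2018-03-07, then
# weekday() == 2, today_index == 3 (days since Sunday), and last_saturday()
# would return 2018-03-03, the prior Saturday.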
def get_chart_weeks(week):
    """Get list of week start dates in two formats, newest first"""
    chart_weeks = {'url': [],
                   'csv': []}
    # Stop at 01 Jan 2000 (a Saturday) since SomaFM started in 2000. Published charts do not go back that far.
    # '>' rather than '!=' so a non-Saturday input cannot step over the date and loop forever.
    while week > date(2000, 1, 1):
        chart_weeks['url'] += [week.strftime('%d%b%y')]  # DDMMMYY (ex. 30Dec17)
        chart_weeks['csv'] += [week.isoformat()]  # YYYY-MM-DD
        week = week - timedelta(days=7)  # Previous week
    return chart_weeks
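# Example of the two formats (assuming an English locale for %b): with
# week == date(2017, 12, 30), the first entries are '30Dec17' in
# chart_weeks['url'] and '2017-12-30' in chart_weeks['csv'].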
def get_channel_charts(args):
    """Get a single week's charts for a particular channel and reshape them into a dataframe (one row per chart record)"""
    # args are handled this way to get around the single-arg limitation of Pool.map
    channel, week_url, week_csv = args
    # Fresh dict of column lists for this channel/week
    data = {'week': [],
            'rank': [],
            'artist': [],
            'media': [],
            'media_type': [],
            'score_type': [],
            'score': [],
            'channel': [],
            'url': []}
    # Defaults in case a record line appears before a 'Top 30' header
    media_type = ''
    score_type = ''
    # Weeks with no published chart raise errors, which are swallowed below (likely end of recorded data)
    try:
        # Parse raw data from website
        page_url = 'http://somafm.com/charts/{}/{}-{}.html'.format(channel, channel, week_url)
        page = requests.get(page_url, timeout=30)  # Timeout keeps a dead connection from hanging a thread
        tree = html.fromstring(page.content)
        text = tree.xpath('//div[@id="content"]/pre/text()')
        text = str(text[0])  # IndexError here (no <pre> block) means no chart for this week
        text_split = text.splitlines()
        text_split = list(filter(None, text_split))
        for line in text_split:
            # If end of charts, stop (the 'adds' section follows the Top 30 lists)
            if 'adds' in line.lower():
                break
            # If chart header, then split according to whitespace to get the media and score types
            if line[0:7] == 'Top 30 ':
                line_split = line.split()
                media_type = line_split[2].lower()
                score_type = line_split[4].lower()
            # If not a known header, check whether this line is a chart record
            else:
                # Look in line for period and parenthesis, indicative of a chart record
                dot = [ix for ix in range(len(line)) if line[ix:ix+2] == '. ']  # This method is used over regex for speed #FIXME check speed
                paren = [ix for ix in range(len(line)) if line[ix] == '(']
                if dot and paren:
                    # Ensure the indexes for the first '. ' and last '(' are used
                    dot = dot[0]
                    paren = paren[-1]
                    # If ' - ' exists then it's an album or a track
                    dash = [ix for ix in range(len(line)) if line[ix:ix+3] == ' - ']
                    if dash:
                        dash = dash[0]
                        data['artist'] += [line[dot+1:dash].strip()]
                        data['media'] += [line[dash+2:paren].strip()]
                    # If ' - ' doesn't exist then it's an artist
                    else:
                        data['artist'] += [line[dot+1:paren].strip()]
                        data['media'] += ['']
                    # Fill in rest of the record
                    data['week'] += [week_csv]
                    data['rank'] += [line[0:dot]]
                    data['media_type'] += [media_type]
                    data['score_type'] += [score_type]
                    data['score'] += [line[paren+1:-1]]
                    data['channel'] += [channel]
                    data['url'] += [page_url]
    # If an error occurs, return whatever was collected so far
    except Exception:
        pass
    # Create dataframe out of dict
    df = pd.DataFrame(data, columns=data.keys())
    df.index.name = 'index'
    return df
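# Parsing sketch on a hypothetical chart line (format assumed from the logic above):
#     ' 7. Boards of Canada - Tomorrow's Harvest (42)'
# dot finds the first '. ', dash the first ' - ', paren the last '(', giving
# rank=' 7', artist='Boards of Canada', media="Tomorrow's Harvest", score='42'.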
def main():
    pass  # Local variables aren't showing up in Variable Explorer in Spyder, skip for now
if __name__ == '__main__':
    week = last_saturday()
    chart_weeks = get_chart_weeks(week)
    # For each channel, grab the weekly charts, aggregate, and create a csv
    for channel in channel_names:
        print(channel)
        with Pool(threads) as p:
            # Fetch this channel's weeks in parallel; Pool.map handles the chunking
            data = p.map(get_channel_charts, zip(itertools.repeat(channel), chart_weeks['url'], chart_weeks['csv']))
        # Concatenate all dataframes from multithreaded output, resetting index to 0:n-1
        df = pd.concat(data, ignore_index=True)
        # Create csv out of dataframe
        df_out = df.astype(str)
        df_out.to_csv(os.path.join(output_dir, 'somafm_charts_' + channel + '.csv'))
    t_end = datetime.now()
    print('Run time: {}'.format(t_end - t_start))
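# The resulting CSVs (one per channel) have columns:
# index, week, rank, artist, media, media_type, score_type, score, channel, url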