-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathspotiflite.py
259 lines (206 loc) · 7.14 KB
/
spotiflite.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
import sqlite3
from contextlib import contextmanager
import re
import os
import time
import random
import json
from bs4 import BeautifulSoup as Soup
import requests
import click
from colorama.ansi import Fore, Back, Style, clear_line, Cursor, clear_screen, set_title
# Artist "about" page; "{}" is filled with the artist id.
URL_TEMPLATE = "https://open.spotify.com/artist/{}/about"
# Captures whatever is assigned to ``Spotify.Entity`` in the fetched page text
# (everything up to the last ";" on that line). The leading "S" and the dot are
# matched literally: a ``\S`` escape there would accept any non-whitespace
# character, and a bare "." would accept any character.
SPOTIFY_ENTITY_PATTERN = r".*?Spotify\.Entity = (.*);.*"
# Bright colour prefixes built from colorama's Style and Fore constants.
GREEN = Style.BRIGHT + Fore.GREEN
RED = Style.BRIGHT + Fore.RED
MAGENTA = Style.BRIGHT + Fore.MAGENTA
# Two-colour banner; ``cli`` echoes it together with the help text when the
# program is run without a subcommand.
SPLASH = (
"\n\n"
f"{MAGENTA}███████╗██████╗ ██████╗ ████████╗██╗███████╗{GREEN}██╗ ██╗████████╗███████╗\n"
f"{MAGENTA}██╔════╝██╔══██╗██╔═══██╗╚══██╔══╝██║██╔════╝{GREEN}██║ ██║╚══██╔══╝██╔════╝\n"
f"{MAGENTA}███████╗██████╔╝██║ ██║ ██║ ██║█████╗ {GREEN}██║ ██║ ██║ █████╗\n"
f"{MAGENTA}╚════██║██╔═══╝ ██║ ██║ ██║ ██║██╔══╝ {GREEN}██║ ██║ ██║ ██╔══╝\n"
f"{MAGENTA}███████║██║ ╚██████╔╝ ██║ ██║██║ {GREEN}███████╗██║ ██║ ███████╗\n"
f"{MAGENTA}╚══════╝╚═╝ ╚═════╝ ╚═╝ ╚═╝╚═╝ {GREEN}╚══════╝╚═╝ ╚═╝ ╚══════╝\n"
f" {MAGENTA}with 🌮 from sloev{GREEN}\n"
)
class singleton:
    """Module-wide holder for the shared SQLite connection.

    The class object itself is used as a namespace and is not instantiated
    anywhere in this file. ``sqlite_connection`` is ``None`` until
    ``connect`` assigns a live connection to it.
    """

    sqlite_connection = None
# Root command group. It has no docstring on purpose: click shows a command's
# docstring as its help text, so adding one would change the --help output.
@click.group(invoke_without_command=True)
@click.pass_context
@click.option(
    "--spotifydb",
    "-db",
    type=str,
    default="spotify.db",
    help="sqlite filename [default: spotify.db]",
    envvar="SPOTIFY_DB",
)
def cli(ctx, spotifydb):
    # Bare invocation: show the banner plus the generated help and stop
    # without opening the database.
    if ctx.invoked_subcommand is None:
        banner_and_help = "\n".join([SPLASH, ctx.get_help(), ""])
        click.echo(banner_and_help)
        return

    # A subcommand will run next: remember the db path for it (``stats`` reads
    # it back from ctx.meta) and open the shared connection.
    ctx.meta["spotifydb"] = spotifydb
    connect(spotifydb)
# Decorator reused by every subcommand below; each decorated function is
# registered on the ``cli`` group with these context settings.
cli_command = cli.command(
    context_settings={"ignore_unknown_options": True, "allow_extra_args": True}
)
def connect(filename):
    """Open the SQLite database at ``filename`` and publish the connection.

    The connection is stored on ``singleton.sqlite_connection``, which is
    where ``commiting_cursor`` looks it up.
    """
    connection = sqlite3.connect(filename)
    singleton.sqlite_connection = connection
@cli_command
def setup():
    """creates tables"""
    # The docstring above is the command's help text in click, so it is kept
    # verbatim.
    confirmed = click.prompt("Are you sure you wanna create table?", type=bool)
    if not confirmed:
        return

    # One row per artist id; the unique index on ``id`` is what lets
    # ``create_job`` rely on INSERT OR IGNORE to skip already-known artists.
    create_table_sql = """
CREATE TABLE spotify_data
(
referer_id varchar(22),
id varchar(22),
data json,
created TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
"""
    create_index_sql = """
create unique index spotify_data_id_index on spotify_data ( id )
"""
    with commiting_cursor() as cursor:
        for statement in (create_table_sql, create_index_sql):
            cursor.execute(statement)
@cli_command
def teardown():
    """deletes tables"""
    # The docstring above is the command's help text in click, so it is kept
    # verbatim. The table is dropped only after an explicit confirmation.
    if click.prompt("Are you sure you wanna drop tables?", type=bool):
        with commiting_cursor() as cursor:
            cursor.execute("drop table spotify_data")
@cli_command
@click.argument("artist_id")
def scrape(artist_id):
    """starts scraping from given artist id"""
    # Register the seed artist as a job before visiting it. ``complete_job``
    # only UPDATEs an existing row, so without a row the seed's scraped data
    # would be silently discarded. ``create_job`` uses INSERT OR IGNORE, so a
    # seed that is already in the table is left untouched. The seed has no
    # referrer, hence ``None``. A seed whose visit fails stays pending and is
    # retried by the loop below like any other job.
    create_job(None, artist_id)
    visit_id(artist_id)

    # Keep draining batches of pending jobs; every visit may queue new ones.
    while True:
        pending_ids = get_jobs()
        info(f"Got {len(pending_ids)} jobs from db to do")
        if not pending_ids:
            info("No more jobs")
            break
        for pending_id in pending_ids:
            visit_id(pending_id)
    info("exiting")
@cli_command
@click.pass_context
def stats(ctx):
    """print out db stats"""
    # The docstring above is the command's help text in click, so it is kept
    # verbatim.
    # Size on disk of the database file whose path ``cli`` stored in ctx.meta.
    db_path = ctx.meta["spotifydb"]
    human_readable_byte_size = byte_size_to_human_readable(os.stat(db_path).st_size)

    # Rows with empty ``data`` are jobs still to do; the rest are completed.
    pending_sql = "select count(*) from spotify_data where length(data) = 0"
    completed_sql = "select count(*) from spotify_data where length(data) != 0"
    with commiting_cursor() as cursor:
        (jobs,) = cursor.execute(pending_sql).fetchone()
        (completed_jobs,) = cursor.execute(completed_sql).fetchone()

    report_lines = [
        f"rows: {jobs + completed_jobs}",
        f"completed: {completed_jobs}",
        f"jobs to do: {jobs}",
        f"DB size: {human_readable_byte_size}",
    ]
    info("\n".join(report_lines))
@contextmanager
def commiting_cursor():
    """Yield a cursor on the shared connection; close it and commit on exit.

    The commit runs whether or not the ``with`` body raised. The cursor is
    closed on exit, so rows must be fetched inside the ``with`` block.

    Yields:
        sqlite3.Cursor: a fresh cursor on ``singleton.sqlite_connection``.
    """
    connection = singleton.sqlite_connection
    cursor = connection.cursor()
    try:
        yield cursor
    finally:
        try:
            # Release the cursor instead of leaving it to garbage collection.
            cursor.close()
        finally:
            connection.commit()
def byte_size_to_human_readable(byte_size):
    """Format a byte count with two decimals and a binary (1024-based) unit.

    Units are tried in order Bytes, KB, MB, GB; the first one in which the
    magnitude is below 1024 is used. Anything larger is reported in TB.
    """
    remaining = byte_size
    for unit in ("Bytes", "KB", "MB", "GB"):
        if abs(remaining) < 1024.0:
            return f"{remaining:.2f} {unit}"
        remaining = remaining / 1024.0
    return f"{remaining:.2f} TB"
def info(message):
    """Echo ``message`` (a str) in bright green, then reset the style."""
    styled = "".join((GREEN, message, Style.RESET_ALL))
    click.echo(styled)
def error(message):
    """Echo ``message`` (a str) in bright red, then reset the style."""
    styled = "".join((RED, message, Style.RESET_ALL))
    click.echo(styled)
def create_job(referrer_id, id):
    """Queue an artist for scraping by inserting a row with empty ``data``.

    Args:
        referrer_id: id of the artist whose page linked to this one.
        id: artist id to queue.

    The statement is INSERT OR IGNORE, so an ``id`` that is already present
    (see the unique index created by ``setup``) is left untouched.
    """
    insert_sql = """
INSERT
or IGNORE
into spotify_data
(
referer_id,
id,
data
)
VALUES
(
?,
?,
''
)
"""
    params = (referrer_id, id)
    with commiting_cursor() as cursor:
        cursor.execute(insert_sql, params)
def complete_job(id, data):
    """Store ``data`` on the row whose ``id`` matches.

    This is an UPDATE only: if no row with that ``id`` exists, nothing is
    written.
    """
    update_sql = """
UPDATE spotify_data
SET data = ?
WHERE id = ?
"""
    with commiting_cursor() as cursor:
        cursor.execute(update_sql, (data, id))
def get_jobs():
    """Return up to 200 ids of rows with empty ``data``, oldest ``created`` first."""
    pending_sql = """
SELECT id
FROM spotify_data
WHERE Length(data) = 0
ORDER BY created ASC
LIMIT 200
"""
    with commiting_cursor() as cursor:
        rows = cursor.execute(pending_sql).fetchall()
    # Each row is a 1-tuple holding the id.
    return [job_id for (job_id,) in rows]
def _extract_entity(text):
    """Return ``(raw_json, artist_name)`` found in ``text``, or ``None``.

    ``None`` is returned when the ``Spotify.Entity`` assignment is missing,
    its payload is not valid JSON, or the payload has no ``"name"`` key.
    """
    match = re.search(SPOTIFY_ENTITY_PATTERN, text)
    if match is None:
        return None
    data = match.group(1)
    try:
        return data, json.loads(data)["name"]
    except (ValueError, KeyError, TypeError):
        return None


def visit_id(id):
    """Fetch one artist page, queue the artists it links to and save its data.

    Args:
        id: artist id to visit (the name shadows the builtin but is kept for
            compatibility with existing callers).

    Nothing is returned and no exception is raised for the failures handled
    here; they are reported through ``error``:

    * Network failure or non-200 status: sleep 2-10 seconds and return. The
      job is not completed, so it stays pending.
    * Page without usable ``Spotify.Entity`` data: linked artists are still
      queued, but the job itself is left pending.
    """
    info(f"visiting {id}")
    try:
        # Without a timeout a stalled connection would hang the scraper forever.
        resp = requests.get(URL_TEMPLATE.format(id), timeout=30)
    except requests.RequestException as exc:
        failure = f"request failed: {exc}"
    else:
        failure = None if resp.status_code == 200 else f"status:{resp.status_code}"
    if failure is not None:
        period = random.randint(2, 10)
        error(f"Spotify gave error on {id}, {failure}, sleeping {period} seconds\n")
        time.sleep(period)
        return

    text = resp.text
    html = Soup(text, "html.parser")
    # href=True skips anchors that have no href attribute; indexing those with
    # link["href"] would raise KeyError.
    hrefs = (link["href"] for link in html.find_all("a", href=True))
    artist_ids = {
        href.split("artist/", 1)[1]
        for href in hrefs
        if "artist/" in href and "playlist" not in href
    }
    artist_ids.discard(id)  # a page may link to itself
    info(f"got {len(artist_ids)} artist ids")
    for artist_id in artist_ids:
        create_job(id, artist_id)

    entity = _extract_entity(text)
    if entity is None:
        error(f"could not extract Spotify.Entity data for {id}, job left pending\n")
    else:
        data, artist_name = entity
        info(f"extracted data for {artist_name}")
        complete_job(id, data)
        info(f"saved data for {artist_name}\n")

    # Short random pause (0.1-2.5 s) after every successful request.
    period = random.randint(1, 25) * 0.1
    time.sleep(period)
# Run the click command group only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    cli()