import re  # Regexes
import sys  # For sys.argv and sys.exit()
from pathlib import Path  # To create the directory to save the subs in

import requests  # For HTTP requests
from bs4 import BeautifulSoup

# Constants
VIDEO_JSON_BASE_URL: str = "https://apis.naver.com/rmcnmv/rmcnmv/vod/play/v2.0/"
USER_AGENT: str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0"
DOWNLOAD_DIR: str = "subs/"

# Start with main() near the bottom of the file


def save_file(data: str, filename: str):
    """Saves text data to a file. Does NOT check if the file or directory exists
    beforehand."""
    # Use a context manager so the file is always closed; subtitles are often
    # non-ASCII, so write them as UTF-8
    with open(filename, "w", encoding="utf-8") as f:
        f.write(data)


def get_vlive_html(url: str) -> str:
    """Sends a GET request to the VLIVE video URL and returns its HTML content."""
if not is_valid_vlive_url(url):
print(f"URL '{url}' is not a valid vlive.tv/video address.")
sys.exit(1)
    # We don't actually need these headers for the request, but they help the scraper
    # look more like a real browser
headers = {
"User-Agent": USER_AGENT,
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
"Referer": "https://www.google.com/",
"DNT": "1",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
}
print("Sending GET request to " + url)
response = requests.get(url, headers=headers)
# Check status code for 200 OK ?
print(f"Received response, status code {response.status_code}\n")
return response.text


def is_valid_vlive_url(url: str) -> bool:
    """Uses a regex to check if the given url is a valid 'vlive.tv/video/' address."""
    # VLIVE videos are only identified by numbers in the url (unlike Youtube IDs,
    # for example)
    vlive_url_regex = r"vlive\.tv/video/[0-9]+"
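    # e.g. "https://www.vlive.tv/video/12345" should match (the ID here is made up)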
    return re.search(vlive_url_regex, url) is not None


def get_video_metadata(html: str) -> tuple[str, str, str, str]:
    """Extracts the video metadata from the HTML.
    This data includes:
    - Upload date
    - Channel
    - Video title
    - Canonical link (we can also get this one from the url)"""
soup = BeautifulSoup(html, features="html.parser")
# Finds a <meta> element that looks like this:
# <meta name="description" content="{date} - [{channel}] - {video title} - You can watch videos on V LIVE.">
meta_description = soup.find("meta", attrs={"name": "description"})
if meta_description is None:
print('Could not find <meta> element with attribute name="description"')
sys.exit(1)
elements = meta_description["content"].split(" - ")
# Hard coded, but we know the expected amount and the order of the elements
upload_date = elements[0]
    # Slice off the first and last characters to strip the surrounding square brackets
    channel = elements[1][1:-1]
video_title = elements[2]
# Finds a <link> element that looks like this:
# <link rel="canonical" href="https://www.vlive.tv/video/12345678"/>
# We need this link to populate a referrer header that we will use later
canonical_tag = soup.find("link", attrs={"rel": "canonical"})
if canonical_tag is None:
print('Could not find <link> element with attribute rel="canonical"')
sys.exit(1)
# Get the actual link
canonical_link = canonical_tag["href"]
return upload_date, channel, video_title, canonical_link


def get_video_id_and_key(html: str) -> tuple[str, str]:
    """Extracts the video ID and key from the script attached to the HTML.
    These are used to send a request to an address in a Naver domain that contains a
    JSON file with all the data for the video, including captions."""
# Define regexes globally / in another file ?
# Captures everything inside the function $(document).ready(function() { ... });
document_ready_regex = r"\$\(document\).*\{([\s\S]*?)(?=\})"
# Captures everything inside the function video.init(...)
vlive_video_init_regex = r"video\.init\(([\s\S]*?)\)"
# Captures anything between quotation marks
args_regex = r'"(.*?)"'
video_id_index = 5
video_key_index = 6
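    # The inline script is assumed to look roughly like this (argument values are
    # hypothetical; only positions 5 and 6 matter to us):
    #   $(document).ready(function() {
    #       ...
    #       vlive.video.init("...", "...", "...", "...", "...", "<VIDEO_ID>", "<VIDEO_KEY>", ...)
    #   });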
    # Get everything inside the function in case we need it later
    # But for the moment, we only need a specific string inside it, so we could actually
    # skip this step
    match = re.search(document_ready_regex, html)
    if match is None:
        print("Could not find the $(document).ready() script in the HTML.")
        sys.exit(1)
    document_function = match.group(1)
    # Search for the vlive.video.init() function
    match = re.search(vlive_video_init_regex, document_function)
    if match is None:
        print("Could not find the video.init() call in the script.")
        sys.exit(1)
    vlive_video_init_args = match.group(1)
    # Get all the arguments inside the function above, ready to go
    # This doesn't get the "[]" argument, however
    args_search = re.findall(args_regex, vlive_video_init_args)
    if len(args_search) <= video_key_index:
        print("Could not find the video ID and key among the script arguments.")
        sys.exit(1)
    video_id = args_search[video_id_index]
    video_key = args_search[video_key_index]
    return video_id, video_key


def get_video_json(canonical_url: str, video_id: str, video_key: str) -> dict:
    """Sends a GET request to a URL constructed from the video ID, which responds with
    a JSON file containing all the data for the video.
    Might be a Video.js-related JSON?"""
# Headers as used by a Mozilla browser
headers = {
"User-Agent": USER_AGENT,
"Accept": "*/*",
"Accept-Language": "en-US,en;q=0.5",
"Referer": canonical_url,
"Origin": "https://www.vlive.tv",
"DNT": "1",
"Connection": "keep-alive",
}
# Parameters used by a Mozilla browser
# See especially key and video ID
params = (
("key", video_key),
        # NOTE:
        # This value is generated by the JavaScript, so there's no way to get it from
        # the HTML, and it is different per video. However, I have tested that the
        # request works without it, so it's not particularly critical
("pid", "rmcPlayer_15950858086901360"),
("sid", "2024"),
("ver", "2.0"),
("devt", "html5_mo"),
("doct", "json"),
("ptc", "https"),
("sptc", "https"),
("cpt", "vtt"),
(
"ctls",
'{"visible":{"fullscreen":true,"logo":false,"playbackRate":false,"scrap":false,"playCount":true,"commentCount":true,"title":true,"writer":true,"expand":true,"subtitles":true,"thumbnails":true,"quality":true,"setting":true,"script":false,"logoDimmed":true,"badge":true,"seekingTime":true,"muted":true,"muteButton":false,"viewerNotice":false,"linkCount":false,"createTime":false,"thumbnail":true},"clicked":{"expand":false,"subtitles":false}}',
),
("pv", "4.18.40"),
("dr", "1920x1080"),
("cpl", "en_US"),
("lc", "en_US"),
("videoId", video_id),
("cc", "CO"), # Your country here
)
# Concatenating the Naver URL with the video ID gives us the URL we need
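    # e.g. a hypothetical video_id "A1B2C3D4" would give:
    #   https://apis.naver.com/rmcnmv/rmcnmv/vod/play/v2.0/A1B2C3D4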
video_json_url = VIDEO_JSON_BASE_URL + video_id
print(f"Requesting JSON to {video_json_url}")
response = requests.get(video_json_url, headers=headers, params=params)
# Check status code for 200 OK ?
print(f"Received response, status code {response.status_code}\n")
# It could also return an error JSON, but we are assuming it is the correct one
return response.json()


def get_subtitle_options(video_json: dict) -> list:
    """Gets a list of available captions from the JSON.
    If it can't find the list, it means the video has no subtitles yet."""
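    # Each entry is a dict; the field names below are the ones this script reads,
    # with hypothetical values:
    #   {"label": "English", "language": "en", "type": "fan",
    #    "fanName": "...", "source": "https://..."}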
    try:
        captions_list: list = video_json["captions"]["list"]
        return captions_list
    except KeyError:
        # We could raise the exception, but we will just exit the program
        print("This video does not have captions available.")
        sys.exit(1)


def download_subs(source_url: str, canonical_link: str, video_title: str) -> str:
    """Sends a request to the URL in the 'source' element of the item in the captions
    list, which contains the subtitles in .VTT format."""
headers = {
"User-Agent": USER_AGENT,
"Accept": "*/*",
"Accept-Language": "en-US,en;q=0.5",
"Referer": canonical_link,
"Origin": "https://www.vlive.tv",
"DNT": "1",
"Connection": "keep-alive",
"TE": "Trailers",
}
print("Sending request to download subs...")
response = requests.get(source_url, headers=headers)
# Check status code for 200 OK ?
print("Received response, status code {}".format(response.status_code))
if response.status_code == 200:
return response.text
else:
print("An error occurred in the request.")


def get_last_url_item(url: str) -> str:
    """Returns everything after the last slash ("/") of a URL."""
last_slash_index = url.rindex("/")
return url[last_slash_index + 1 :]


def save_subs_to_file(source_url: str, canonical_link: str, video_title: str, text: str):
    """Saves the VTT text data to a folder named after the video sequence number, which
    is inside a subs/ folder relative to the current working directory.
    If the directories don't exist yet, the program will create them."""
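    # e.g. (hypothetical names) a source URL ending in "en_US.vtt" for video 12345
    # would be saved as "subs/12345/en_US.vtt"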
filename = get_last_url_item(source_url)
directory = DOWNLOAD_DIR + get_last_url_item(canonical_link)
filepath = directory + "/" + filename
Path(directory).mkdir(parents=True, exist_ok=True)
save_file(text, filepath)
print(f"File {filename} saved to {directory}")


def list_subs(subs: list):
    """Prints all the captions in subs, and some of their data like language or type."""
    for index, s in enumerate(subs, start=1):
        print(f"{index}. {s['label']} ({s['language']}), type: {s['type']} {s['fanName']}")
    print()


def main():
    """Main loop: performs the requests, displays the available subs and offers a
    prompt to download them."""
print("--- VLIVE Subtitle Scraper ---\n")
print("WARNING! Use of this program could be a violation of the VLIVE Terms of Use.")
print("This program is only meant for educational purposes.")
print()
if len(sys.argv) < 2:
print("Usage: main.py vlive-url\n")
url = input("Enter video URL: ")
else:
url = sys.argv[1]
# Get the HTML from the vlive url
html = get_vlive_html(url)
# Extract the metadata from the HTML
video_id, video_key = get_video_id_and_key(html)
upload_date, channel, video_title, canonical_link = get_video_metadata(html)
print(f"Upload date: {upload_date}")
print(f"Channel: {channel}")
print(f"Title: {video_title}")
print(f"Link: {canonical_link}")
print()
# Get the json with the video data using the metadata we just extracted
video_json = get_video_json(canonical_link, video_id, video_key)
# Get the subs (a simple dict access)
subs = get_subtitle_options(video_json)
print("Available subs:")
list_subs(subs)
# Show the user simple prompts to download the subs
while True:
sub_index = input("Choose subtitles to download (type in the number), or type 'exit' to finish: ")
if sub_index == "exit":
break
else:
try:
index = int(sub_index) - 1
if index >= len(subs) or index < 0:
print("The number you entered does not match any subs entry.")
else:
print(f"You selected {subs[index]['label']}.")
selected_sub = subs[index]
source = selected_sub["source"]
# Send a request to the source URL for that particular language
# and get the .VTT data back
vtt_data = download_subs(source, canonical_link, video_title)
save_subs_to_file(source, canonical_link, video_title, vtt_data)
print()
list_subs(subs)
except ValueError:
print("Please enter a valid number.")
print()
print("Program finished.")


if __name__ == "__main__":
main()