-
Notifications
You must be signed in to change notification settings - Fork 0
/
upload_archived.py
237 lines (183 loc) · 8.26 KB
/
upload_archived.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
import logging
import mimetypes
import os
import re
import sys
from argparse import ArgumentParser
from collections import Counter
from mimetypes import guess_type
import dotenv
import psycopg
from src.db import models
from src.db.dbutils import DbUtils
# Make relative paths (data/, .env, the default log location) resolve against
# the script's own directory regardless of where it is invoked from.
dir_path = os.path.dirname(os.path.realpath(__file__))
os.chdir(dir_path)
dotenv.load_dotenv()
# NOTE(review): imported after load_dotenv() — presumably these modules read
# environment variables at import time; confirm before reordering.
from src.enum import S3ObjectType
from src.config import Config
# Known subtitle file extensions (leading dot included).
subtitle_formats = {'.ass', '.srv1', '.srv2', '.srv3', '.srt', '.ttml', '.vtt', '.stl', '.dfxp', '.sami', '.ssa'}
# Image formats missing from the stdlib mimetypes registry; register them so
# guess_type() can classify e.g. .webp thumbnails.
image_formats = ['webp']
for im_f in image_formats:
    mimetypes.add_type(f'image/{im_f}', f'.{im_f}', strict=False)
def get_ext(path: str) -> tuple[str, str]:
    """Split *path* into (path without extension, extension).

    Thin wrapper around os.path.splitext. The extension keeps its leading
    dot (e.g. '.mp4') and is '' when the path has none.
    """
    # Fix: the original annotated the return as ``[str, str]`` — a list
    # literal, which is not a valid type annotation for a 2-tuple.
    filepath, ext = os.path.splitext(path)
    return filepath, ext
def get_s3_type(path: str) -> S3ObjectType:
    """Classify *path* into an :class:`S3ObjectType`.

    Checks, in order: known subtitle extensions, then the guessed mimetype
    (image -> thumbnail, video -> video, ``*info.json`` -> metadata,
    ``*live_chat.json`` -> subtitle, audio -> audio). Anything else —
    including files whose mimetype cannot be guessed — is ``other``.
    """
    filepath, ext = get_ext(path)
    # Compare case-insensitively; subtitle_formats holds lowercase extensions.
    if ext.lower() in subtitle_formats:
        return S3ObjectType.subtitle
    mimetype, _ = guess_type(path, strict=False)
    if mimetype is None:
        # Fix: guess_type returns None for unknown extensions, and the
        # original membership tests raised TypeError on None.
        return S3ObjectType.other
    if 'image/' in mimetype:
        return S3ObjectType.thumbnail
    if 'video/' in mimetype:
        return S3ObjectType.video
    if '/json' in mimetype:
        # yt-dlp sidecars: "<name>.info.json" and "<name>.live_chat.json";
        # splitext leaves "<name>.info" / "<name>.live_chat" as filepath.
        if filepath.endswith('info'):
            return S3ObjectType.metadata
        if filepath.endswith('live_chat'):
            return S3ObjectType.subtitle
    if 'audio/' in mimetype:
        return S3ObjectType.audio
    return S3ObjectType.other
if __name__ == '__main__':
logger = logging.getLogger('debug')
logger.setLevel(logging.DEBUG)
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(logging.Formatter('%(asctime)s:%(levelname)s:[%(module)s] %(message)s'))
logger.addHandler(handler)
logs_dir = os.getenv('LOGS_DIR', dir_path)
handler = logging.FileHandler(filename=os.path.join(logs_dir, 'debug.log'), encoding='utf-8', mode='a')
handler.setFormatter(logging.Formatter('%(asctime)s:%(levelname)s:[%(module)s] %(message)s'))
logger.addHandler(handler)
parser = ArgumentParser()
parser.add_argument('-d', '--dry-run', action='store_true', default=False)
parser.add_argument('-a', '--upload-all', action='store_true', default=False,
help='''If this parameter is given uploads all files,
even if those files could not be linked to a video id.
However those uploads cannot be further tracked back to the database.''')
args = parser.parse_args(sys.argv[1:])
config = Config.load()
data_dir = os.path.join(dir_path, 'data')
video_dir = os.path.join(data_dir, 'videos')
counter = Counter()
video_files = {}
for site in os.listdir(video_dir):
site_path = os.path.join(video_dir, site)
for file in os.listdir(site_path):
s3_type = get_s3_type(file)
counter.update([s3_type])
filename, ext = os.path.splitext(file)
filename = re.sub(r'(\.info|\.live_chat|\.\w{2})$', '', filename)
if filename not in video_files:
video_files[filename] = {}
data = video_files[filename]
strict = False
key = s3_type
match s3_type:
case S3ObjectType.video | S3ObjectType.thumbnail | S3ObjectType.metadata | S3ObjectType.audio:
strict = True
if s3_type == S3ObjectType.other:
print(f'File {file} classified as S3ObjectType.other')
if strict and key in data:
raise ValueError(f'Multiple files of same type for file {file}.\n{data}')
file_path = os.path.join(site_path, file)
if strict:
data[key] = file_path
else:
if key in data:
data[key].append(file_path)
else:
data[key] = [file_path]
print('')
print('File type statistics')
for name, count in counter.most_common():
print(f'{name}: {count}')
sql = '''
SELECT id, downloaded_filename, site, video_id FROM videos WHERE downloaded_filename IS NOT NULL
'''
with psycopg.connect(config.db_conn_string) as conn:
with conn.cursor() as cur:
cur.execute(sql)
filename2id = {os.path.split(f)[1]: (v_id, site, video_id) for v_id, f, site, video_id in cur}
not_found = {}
for k, v in list(video_files.items()):
# If only audio file found use that as the video as well
if S3ObjectType.video not in v and S3ObjectType.audio in v:
v[S3ObjectType.video] = v[S3ObjectType.audio]
if S3ObjectType.video not in v:
not_found[k] = v
if not args.upload_all:
video_files.pop(k)
continue
video_file = os.path.split(v[S3ObjectType.video])[1]
if not (ids := filename2id.get(video_file, None)):
not_found[k] = v
if not args.upload_all:
video_files.pop(k)
continue
v['id'] = ids[0]
v['site'] = ids[1]
v['video_id'] = ids[2]
print(f'{len(video_files)} matches found from database. {len(not_found)} videos could not be linked to a video id.')
if args.dry_run:
exit()
from src.app import PlaylistChecker
with psycopg.connect(config.db_conn_string) as conn:
db = DbUtils(conn)
checker = PlaylistChecker(config=config)
for d in video_files.values():
base_tags = {}
site = d.get('site')
video_db_id = d.get('id')
if video_file := d.get(S3ObjectType.video):
s3_file, s3_filesize = checker.upload_and_delete_file(video_file, base_tags, S3ObjectType.video)
if s3_file and video_db_id:
db.update_filename(s3_file, video_db_id)
db.update_filesize(s3_filesize, video_db_id)
info_file, info_filesize = checker.upload_and_delete_file(d.get(S3ObjectType.metadata), base_tags, S3ObjectType.metadata)
thumbnail_file, thumb_filesize = checker.upload_and_delete_file(d.get(S3ObjectType.thumbnail), base_tags, S3ObjectType.thumbnail)
# If video and audio are the same file the file has already been deleted.
# just use the normal uploaded filename
if video_file and video_file == d.get(S3ObjectType.audio):
audio_file = video_file
else:
audio_file, audio_filesize = checker.upload_and_delete_file(d.get(S3ObjectType.audio), base_tags, S3ObjectType.audio)
subs = []
total_filesize = info_filesize + thumb_filesize + audio_filesize
if subtitle_paths := d.get(S3ObjectType.subtitle):
for sub in subtitle_paths:
sub_path, sub_filesize = checker.upload_and_delete_file(sub, base_tags, S3ObjectType.subtitle)
if sub_path is not None:
subs.append(sub_path)
total_filesize += sub_filesize
if video_db_id:
extra_files = models.VideoExtraFiles(
video_id=video_db_id,
thumbnail=thumbnail_file,
info_json=info_file,
audio_file=audio_file,
subtitles=subs,
total_filesize=total_filesize
)
logger.info(f'Updating extra files with object {extra_files}')
db.update_extra_files(extra_files)
sql = '''
SELECT v.site, v.video_id, v.id FROM videos v
LEFT JOIN extra_video_files evf ON v.id = evf.video_id
WHERE evf.thumbnail IS NULL
'''
with checker.conn.cursor() as cur:
cur.execute(sql)
missing_thumbs = cur.fetchall()
for site, video_id, id_ in missing_thumbs:
thumbs_path = os.path.join(data_dir, str(site))
thumbnail_file = os.path.join(thumbs_path, video_id + '.jpg') # It's all jpeg
if os.path.exists(thumbnail_file):
base_tags = {}
thumbnail, filesize = checker.upload_and_delete_file(thumbnail_file, base_tags, S3ObjectType.thumbnail)
if thumbnail:
db.update_extra_files(models.VideoExtraFiles(
video_id=id_,
thumbnail=thumbnail,
total_filesize=filesize,
))