-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathvtt_convert.py
49 lines (36 loc) · 1.18 KB
/
vtt_convert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import click
from pathlib import Path
import re
@click.command()
@click.argument('vtt_files', nargs=-1)
@click.option('-o', '--output', default='converted.txt')
def cli(vtt_files, output):
    """Convert VTT_FILES to plain text and write the combined result to OUTPUT."""
    # WebVTT files are UTF-8 by definition, so decode explicitly instead of
    # relying on the platform default; undecodable bytes are still dropped.
    texts = [
        convert(Path(v).read_text(encoding='utf-8', errors='ignore'))
        for v in vtt_files
    ]
    # A blank line separates the text coming from each input file.
    Path(output).write_text('\n\n'.join(texts), encoding='utf-8')
    print('Created ' + output)
def convert(full_text: str) -> str:
    """Turn the full text of a VTT file into plain caption text.

    The input is split on blank lines. The first four paragraphs are
    skipped (presumably the file header; verify against the VTT source in
    use). Every remaining paragraph is reduced to its caption text and
    stripped of formatting tags.

    Returns the cleaned paragraphs joined by newlines, with a trailing
    newline appended.
    """
    cue_blocks = full_text.split('\n\n')[4:]
    cleaned = [remove_format(_parse_para(block)) for block in cue_blocks]
    return '\n'.join(cleaned) + '\n'
def _parse_para(para: str) -> str:
ret = '\n'.join(para.strip().split('\n')[2:])
return ret
def remove_format(s: str) -> str:
    """Remove the formatting tags and normalise the capitalisation.

    Every ``<...>`` / ``</...>`` tag is deleted in a single pass, surrounding
    whitespace is stripped, and the result is capitalised (first character
    upper-cased, the rest lower-cased).

    >>> remove_format("<i>Baby, don't get hooked on me</i>")
    "Baby, don't get hooked on me"
    >>> remove_format('<c.white><c.mono_sans> SO NO ONE TOLD YOU</c.mono_sans></c.white>')
    'So no one told you'
    """
    # One regex substitution replaces the former findall + str.replace loop,
    # which rescanned the whole string once per tag.
    without_tags = re.sub(r'</?[^<>]*>', '', s)
    return without_tags.strip().capitalize()
# Run the command-line interface only when executed as a script, not on import.
if __name__ == '__main__':
    cli()