Skip to content

Commit

Permalink
support transcription during batch processing
Browse files Browse the repository at this point in the history
  • Loading branch information
baxtree committed Apr 14, 2023
1 parent 1228a9c commit 2731fdf
Show file tree
Hide file tree
Showing 4 changed files with 125 additions and 43 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
ifdef PYTHON
PYTHON := $(PYTHON)
else
PYTHON := 3.7.7
PYTHON := 3.8.2
endif

ifdef PLATFORM
Expand Down
10 changes: 8 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
[![Documentation Status](https://readthedocs.org/projects/subaligner/badge/?version=latest)](https://subaligner.readthedocs.io/en/latest/?badge=latest)
[![GitHub license](https://img.shields.io/github/license/baxtree/subaligner)](https://github.com/baxtree/subaligner/blob/master/LICENSE)
[![PyPI](https://badge.fury.io/py/subaligner.svg)](https://badge.fury.io/py/subaligner)
[![Docker Build](https://img.shields.io/docker/cloud/build/baxtree/subaligner?label=Docker&style=flat)](https://hub.docker.com/r/baxtree/subaligner/builds)
[![Docker Pulls](https://img.shields.io/docker/pulls/baxtree/subaligner)](https://hub.docker.com/r/baxtree/subaligner)
[![Citation](https://zenodo.org/badge/228440472.svg)](https://doi.org/10.5281/zenodo.5603083)

Expand Down Expand Up @@ -126,7 +125,14 @@ $ subaligner --languages
$ subaligner -m single -v video.mp4 -s subtitle.srt -t src,tgt
$ subaligner -m dual -v video.mp4 -s subtitle.srt -t src,tgt
$ subaligner -m script -v test.mp4 -s subtitle.txt -o subtitle_aligned.srt -t src,tgt
$ subaligner -m transcribe -v video.mp4 -ml src -mr whisper -mf small -o subtitle_aligned.srt -t src,tgt
$ subaligner -m dual -v video.mp4 -tr helsinki-nlp -o subtitle_aligned.srt -t src,tgt
$ subaligner -m dual -v video.mp4 -tr facebook-mbart -tf large -o subtitle_aligned.srt -t src,tgt
$ subaligner -m dual -v video.mp4 -tr whisper -tf small -o subtitle_aligned.srt -t src,eng
```
```
# Transcribe audiovisual files and generate translated subtitles
$ subaligner -m transcribe -v video.mp4 -ml src -mr whisper -mf small -tr helsinki-nlp -o subtitle_aligned.srt -t src,tgt
```
```
# Shift subtitle manually by offset in seconds
Expand Down
8 changes: 7 additions & 1 deletion site/source/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,13 @@ Make sure you have got the virtual environment activated upfront.
(.venv) $ subaligner -m single -v video.mp4 -s subtitle.srt -t src,tgt
(.venv) $ subaligner -m dual -v video.mp4 -s subtitle.srt -t src,tgt
(.venv) $ subaligner -m script -v test.mp4 -s subtitle.txt -o subtitle_aligned.srt -t src,tgt
(.venv) $ subaligner -m transcribe -v video.mp4 -ml src -mr whisper -mf small -o subtitle_aligned.srt -t src,tgt
(.venv) $ subaligner -m dual -v video.mp4 -tr helsinki-nlp -o subtitle_aligned.srt -t src,tgt
(.venv) $ subaligner -m dual -v video.mp4 -tr facebook-mbart -tf large -o subtitle_aligned.srt -t src,tgt
(.venv) $ subaligner -m dual -v video.mp4 -tr whisper -tf small -o subtitle_aligned.srt -t src,eng

**Transcribe audiovisual files and generate translated subtitles**::

(.venv) $ subaligner -m transcribe -v video.mp4 -ml src -mr whisper -mf small -tr helsinki-nlp -o subtitle_aligned.srt -t src,tgt

**Shift subtitle manually by offset in seconds**::

Expand Down
148 changes: 109 additions & 39 deletions subaligner/subaligner_batch/__main__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
#!/usr/bin/env python
"""
usage: subaligner_batch [-h] [-m {single,dual}] [-vd VIDEO_DIRECTORY] [-sd SUBTITLE_DIRECTORY] [-l MAX_LOGLOSS] [-so]
usage: subaligner_batch [-h] [-m {single,dual,script,transcribe}] [-sd SUBTITLE_DIRECTORY] [-vd VIDEO_DIRECTORY] [-l MAX_LOGLOSS] [-so]
[-sil {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}]
[-fos] [-tod TRAINING_OUTPUT_DIRECTORY] [-od OUTPUT_DIRECTORY] [-t TRANSLATE] [-lgs] [-d] [-q] [-ver]
[-fos] [-tod TRAINING_OUTPUT_DIRECTORY] [-od OUTPUT_DIRECTORY] [-of {srt,ytt,ttml,txt,smi,xml,ssa,ass,dfxp,sub,scc,tmp,sami,vtt,stl,sbv}] [-t TRANSLATE]
[-ml {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}]
[-mr {whisper}] [-mf {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large}] [-lgs] [-d] [-q] [-ver]
Batch align multiple subtitle files and audiovisual files
Expand All @@ -11,13 +13,13 @@
optional arguments:
-h, --help show this help message and exit
-vd VIDEO_DIRECTORY, --video_directory VIDEO_DIRECTORY
Path to the video directory
-sd SUBTITLE_DIRECTORY, --subtitle_directory SUBTITLE_DIRECTORY
Path to the subtitle directory
-vd VIDEO_DIRECTORY, --video_directory VIDEO_DIRECTORY
Path to the video directory
-l MAX_LOGLOSS, --max_logloss MAX_LOGLOSS
Max global log loss for alignment
-so, --stretch_on Switch on stretch on subtitles
-so, --stretch_on Switch on stretch on subtitles
-sil {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}, --stretch_in_language {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}
Stretch the subtitle with the supported ISO 639-3 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes].
NB: This will be ignored if neither -so nor --stretch_on is present
Expand All @@ -26,15 +28,23 @@
Path to the output directory containing training results
-od OUTPUT_DIRECTORY, --output_directory OUTPUT_DIRECTORY
Path to the output subtitle directory
-of {srt,ytt,ttml,txt,smi,xml,ssa,ass,dfxp,sub,scc,tmp,sami,vtt,stl,sbv}, --output_format {srt,ytt,ttml,txt,smi,xml,ssa,ass,dfxp,sub,scc,tmp,sami,vtt,stl,sbv}
File format of the output subtitles
-t TRANSLATE, --translate TRANSLATE
Source and target ISO 639-3 language codes separated by a comma (e.g., eng,zho)
-ml {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}, --main_language {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}
Target video's main language as an ISO 639-3 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes]
-mr {whisper}, --transcription_recipe {whisper}
LLM recipe used for transcribing video files
-mf {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large}, --transcription_flavour {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large}
Flavour variation for a specific LLM recipe supporting transcription
-lgs, --languages Print out language codes used for stretch and translation
-d, --debug Print out debugging information
-q, --quiet Switch off logging information
-ver, --version show program's version number and exit
required arguments:
-m {single,dual}, --mode {single,dual}
-m {single,dual,script,transcribe}, --mode {single,dual,script,transcribe}
Alignment mode: single, dual, script or transcribe
"""

Expand All @@ -43,6 +53,7 @@
import traceback
import os
import pkg_resources
import tempfile


def main():
Expand All @@ -65,22 +76,22 @@ def main():
"--mode",
type=str,
default="",
choices=["single", "dual"],
choices=["single", "dual", "script", "transcribe"],
help="Alignment mode: either single or dual",
)
parser.add_argument(
"-vd",
"--video_directory",
"-sd",
"--subtitle_directory",
type=str,
default="",
help="Path to the video directory",
help="Path to the subtitle directory",
)
parser.add_argument(
"-sd",
"--subtitle_directory",
"-vd",
"--video_directory",
type=str,
default="",
help="Path to the subtitle directory",
help="Path to the video directory",
)
parser.add_argument(
"-l",
Expand Down Expand Up @@ -139,6 +150,31 @@ def main():
type=str,
help="Source and target ISO 639-3 language codes separated by a comma (e.g., eng,zho)",
)
parser.add_argument(
"-ml",
"--main_language",
type=str.lower,
choices=Utils.get_stretch_language_codes(),
help="Target video's main language as an ISO 639-3 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes]",
)
from subaligner.llm import TranscriptionRecipe
from subaligner.llm import WhisperFlavour
parser.add_argument(
"-mr",
"--transcription_recipe",
type=str.lower,
default=TranscriptionRecipe.WHISPER.value,
choices=[r.value for r in TranscriptionRecipe],
help="LLM recipe used for transcribing video files"
)
parser.add_argument(
"-mf",
"--transcription_flavour",
type=str.lower,
default=WhisperFlavour.SMALL.value,
choices=[wf.value for wf in WhisperFlavour],
help="Flavour variation for a specific LLM recipe supporting transcription"
)
parser.add_argument("-lgs", "--languages", action="store_true",
help="Print out language codes used for stretch and translation")
parser.add_argument("-d", "--debug", action="store_true",
Expand All @@ -159,36 +195,48 @@ def main():
print("ERROR: --video_directory was not passed in")
parser.print_usage()
sys.exit(21)
if FLAGS.subtitle_directory == "":
if FLAGS.mode != "transcribe" and FLAGS.subtitle_directory == "":
print("ERROR: --subtitle_directory was not passed in")
parser.print_usage()
sys.exit(21)
if FLAGS.output_directory == "":
print("ERROR: --output_directory was not passed in")
parser.print_usage()
sys.exit(21)
if os.path.abspath(FLAGS.subtitle_directory) == os.path.abspath(FLAGS.output_directory):
if FLAGS.mode != "transcribe" and os.path.abspath(FLAGS.subtitle_directory) == os.path.abspath(FLAGS.output_directory):
print("ERROR: The output directory cannot be set to the same as the input subtitle directory")
parser.print_usage()
sys.exit(21)
if FLAGS.translate is not None:
if FLAGS.translate is not None or FLAGS.mode == "transcribe":
if "transformers" not in {pkg.key for pkg in pkg_resources.working_set}:
print('ERROR: Alignment has been configured to perform translation. Please install "subaligner[llm]" and run your command again.')
print('ERROR: Alignment has been configured to use language models. Please install "subaligner[llm]" and run your command again.')
sys.exit(21)
if FLAGS.stretch_on or FLAGS.mode == "script":
if "aeneas" not in {pkg.key for pkg in pkg_resources.working_set}:
print('ERROR: Alignment has been configured to use extra features. Please install "subaligner[stretch]" and run your command again.')
sys.exit(21)
if FLAGS.mode == "transcribe":
if FLAGS.main_language is None:
print("ERROR: --main_language was not passed in but required by mode 'transcribe'")
parser.print_usage()
sys.exit(21)

video_file_paths = [os.path.abspath(os.path.join(path, p)) for path, _, files in
os.walk(FLAGS.video_directory) for p in files if not p.startswith(".")]
subtitle_file_paths = [os.path.abspath(os.path.join(path, p)) for path, _, files in
os.walk(FLAGS.subtitle_directory) for p in files if not p.startswith(".")]
if len(video_file_paths) != len(subtitle_file_paths):
print("ERROR: The numbers of input videos and subtitles do not match")
parser.print_usage()
sys.exit(21)

if FLAGS.mode != "transcribe":
subtitle_file_paths = [os.path.abspath(os.path.join(path, p)) for path, _, files in
os.walk(FLAGS.subtitle_directory) for p in files if not p.startswith(".")]
if len(video_file_paths) != len(subtitle_file_paths):
print("ERROR: The numbers of input videos and subtitles do not match")
parser.print_usage()
sys.exit(21)

output_dir = os.path.abspath(FLAGS.output_directory)
os.makedirs(output_dir, exist_ok=True)
video_file_paths = sorted(video_file_paths, key=lambda x: os.path.splitext(os.path.basename(x))[0])
subtitle_file_paths = sorted(subtitle_file_paths, key=lambda x: os.path.splitext(os.path.basename(x))[0])
if FLAGS.mode != "transcribe":
subtitle_file_paths = sorted(subtitle_file_paths, key=lambda x: os.path.splitext(os.path.basename(x))[0])
exit_segfail = FLAGS.exit_segfail
stretch = FLAGS.stretch_on
stretch_in_lang = FLAGS.stretch_in_language
Expand All @@ -205,15 +253,16 @@ def main():
failures = []
for index in range(len(video_file_paths)):
local_video_path = video_file_paths[index]
local_subtitle_path = subtitle_file_paths[index]
local_subtitle_path = subtitle_file_paths[index] if FLAGS.mode != "transcribe" else "{}.srt".format(tempfile.mkstemp()[1])
try:
voice_probabilities = None
if FLAGS.mode == "single":
aligned_subs, audio_file_path, voice_probabilities, frame_rate = predictor.predict_single_pass(
video_file_path=local_video_path,
subtitle_file_path=local_subtitle_path,
weights_dir=os.path.join(FLAGS.training_output_directory, "models", "training", "weights")
)
else:
elif FLAGS.mode == "dual":
aligned_subs, subs, voice_probabilities, frame_rate = predictor.predict_dual_pass(
video_file_path=local_video_path,
subtitle_file_path=local_subtitle_path,
Expand All @@ -222,29 +271,50 @@ def main():
stretch_in_lang=stretch_in_lang,
exit_segfail=exit_segfail,
)
elif FLAGS.mode == "script":
aligned_subs, _, voice_probabilities, frame_rate = predictor.predict_plain_text(
video_file_path=local_video_path,
subtitle_file_path=local_subtitle_path,
stretch_in_lang=stretch_in_lang,
)
elif FLAGS.mode == "transcribe":
from subaligner.transcriber import Transcriber
transcriber = Transcriber(recipe=FLAGS.transcription_recipe, flavour=FLAGS.transcription_flavour)
subtitle, frame_rate = transcriber.transcribe(local_video_path, stretch_in_lang)
aligned_subs = subtitle.subs

parent_dir = os.path.dirname(local_subtitle_path.replace(os.path.abspath(FLAGS.subtitle_directory), output_dir))
os.makedirs(parent_dir, exist_ok=True)
file_parts = os.path.basename(local_subtitle_path).rsplit(".", 1)
file_parts[1] = FLAGS.output_format if FLAGS.output_format != "" else file_parts[1]
aligned_subtitle_path = os.path.abspath(os.path.join(parent_dir, ".".join(file_parts).replace(".stl", ".srt")))
if FLAGS.mode == "transcribe":
parent_dir = os.path.dirname(video_file_paths[index].replace(os.path.abspath(FLAGS.video_directory), output_dir))
os.makedirs(parent_dir, exist_ok=True)
file_parts = os.path.basename(video_file_paths[index]).rsplit(".", 1)
file_parts[1] = FLAGS.output_format if FLAGS.output_format != "" else "srt"
aligned_subtitle_path = os.path.abspath(os.path.join(parent_dir, ".".join(file_parts).replace(".stl", ".srt")))
else:
parent_dir = os.path.dirname(local_subtitle_path.replace(os.path.abspath(FLAGS.subtitle_directory), output_dir))
os.makedirs(parent_dir, exist_ok=True)
file_parts = os.path.basename(local_subtitle_path).rsplit(".", 1)
file_parts[1] = FLAGS.output_format if FLAGS.output_format != "" else file_parts[1]
aligned_subtitle_path = os.path.abspath(os.path.join(parent_dir, ".".join(file_parts).replace(".stl", ".srt")))

if FLAGS.translate is not None:
from subaligner.translator import Translator
source, target = FLAGS.translate.split(",")
translator = Translator(source, target)
aligned_subs = translator.translate(aligned_subs)
Subtitle.save_subs_as_target_format(aligned_subs, local_subtitle_path, aligned_subtitle_path, frame_rate, "utf-8")
else:
elif FLAGS.mode == "transcribe":
Subtitle.save_subs_as_target_format(aligned_subs, local_subtitle_path, aligned_subtitle_path, frame_rate, "utf-8")
else:
Subtitle.save_subs_as_target_format(aligned_subs, local_subtitle_path, aligned_subtitle_path, frame_rate)

log_loss = predictor.get_log_loss(voice_probabilities, aligned_subs)
if log_loss is None or log_loss > FLAGS.max_logloss:
print(
"ERROR: Alignment failed with a too high loss value: {} for {} and {}".format(log_loss, local_video_path, local_subtitle_path)
)
failures.append((local_video_path, local_subtitle_path))
continue
if voice_probabilities is not None:
log_loss = predictor.get_log_loss(voice_probabilities, aligned_subs)
if log_loss is None or log_loss > FLAGS.max_logloss:
print(
"ERROR: Alignment failed with a too high loss value: {} for {} and {}".format(log_loss, local_video_path, local_subtitle_path)
)
failures.append((local_video_path, local_subtitle_path))
continue

print("Aligned subtitle saved to: {}".format(aligned_subtitle_path))
except UnsupportedFormatException as e:
Expand Down

0 comments on commit 2731fdf

Please sign in to comment.