support transcription during batch processing

baxtree · Apr 14, 2023 · 2731fdf · 2731fdf
1 parent 1228a9c
commit 2731fdf
Show file tree

Hide file tree

Showing 4 changed files with 125 additions and 43 deletions.
diff --git a/Makefile b/Makefile
@@ -1,7 +1,7 @@
 ifdef PYTHON
 PYTHON := $(PYTHON)
 else
-PYTHON := 3.7.7
+PYTHON := 3.8.2
 endif
 
 ifdef PLATFORM

diff --git a/README.md b/README.md
@@ -7,7 +7,6 @@
 [![Documentation Status](https://readthedocs.org/projects/subaligner/badge/?version=latest)](https://subaligner.readthedocs.io/en/latest/?badge=latest)
 [![GitHub license](https://img.shields.io/github/license/baxtree/subaligner)](https://github.com/baxtree/subaligner/blob/master/LICENSE)
 [![PyPI](https://badge.fury.io/py/subaligner.svg)](https://badge.fury.io/py/subaligner)
-[![Docker Build](https://img.shields.io/docker/cloud/build/baxtree/subaligner?label=Docker&style=flat)](https://hub.docker.com/r/baxtree/subaligner/builds)
 [![Docker Pulls](https://img.shields.io/docker/pulls/baxtree/subaligner)](https://hub.docker.com/r/baxtree/subaligner)
 [![Citation](https://zenodo.org/badge/228440472.svg)](https://doi.org/10.5281/zenodo.5603083)
 
@@ -126,7 +125,14 @@ $ subaligner --languages
 $ subaligner -m single -v video.mp4 -s subtitle.srt -t src,tgt
 $ subaligner -m dual -v video.mp4 -s subtitle.srt -t src,tgt
 $ subaligner -m script -v test.mp4 -s subtitle.txt -o subtitle_aligned.srt -t src,tgt
-$ subaligner -m transcribe -v video.mp4 -ml src -mr whisper -mf small -o subtitle_aligned.srt -t src,tgt
+$ subaligner -m dual -v video.mp4 -tr helsinki-nlp -o subtitle_aligned.srt -t src,tgt
+$ subaligner -m dual -v video.mp4 -tr facebook-mbart -tf large -o subtitle_aligned.srt -t src,tgt
+$ subaligner -m dual -v video.mp4 -tr whisper -tf small -o subtitle_aligned.srt -t src,eng
+```
+```
+# Transcribe audiovisual files and generate translated subtitles
+
+$ subaligner -m transcribe -v video.mp4 -ml src -mr whisper -mf small -tr helsinki-nlp -o subtitle_aligned.srt -t src,tgt
 ```
 ```
 # Shift subtitle manually by offset in seconds

diff --git a/site/source/usage.rst b/site/source/usage.rst
@@ -49,7 +49,13 @@ Make sure you have got the virtual environment activated upfront.
     (.venv) $ subaligner -m single -v video.mp4 -s subtitle.srt -t src,tgt
     (.venv) $ subaligner -m dual -v video.mp4 -s subtitle.srt -t src,tgt
     (.venv) $ subaligner -m script -v test.mp4 -s subtitle.txt -o subtitle_aligned.srt -t src,tgt
-    (.venv) $ subaligner -m transcribe -v video.mp4 -ml src -mr whisper -mf small -o subtitle_aligned.srt -t src,tgt
+    (.venv) $ subaligner -m dual -v video.mp4 -tr helsinki-nlp -o subtitle_aligned.srt -t src,tgt
+    (.venv) $ subaligner -m dual -v video.mp4 -tr facebook-mbart -tf large -o subtitle_aligned.srt -t src,tgt
+    (.venv) $ subaligner -m dual -v video.mp4 -tr whisper -tf small -o subtitle_aligned.srt -t src,eng
+
+**Transcribe audiovisual files and generate translated subtitles**::
+
+    (.venv) $ subaligner -m transcribe -v video.mp4 -ml src -mr whisper -mf small -tr helsinki-nlp -o subtitle_aligned.srt -t src,tgt
 
 **Shift subtitle manually by offset in seconds**::
 

diff --git a/subaligner/subaligner_batch/__main__.py b/subaligner/subaligner_batch/__main__.py
@@ -1,8 +1,10 @@
 #!/usr/bin/env python
 """
-usage: subaligner_batch [-h] [-m {single,dual}] [-vd VIDEO_DIRECTORY] [-sd SUBTITLE_DIRECTORY] [-l MAX_LOGLOSS] [-so]
+usage: subaligner_batch [-h] [-m {single,dual,script,transcribe}] [-sd SUBTITLE_DIRECTORY] [-vd VIDEO_DIRECTORY] [-l MAX_LOGLOSS] [-so]
                         [-sil {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}]
-                        [-fos] [-tod TRAINING_OUTPUT_DIRECTORY] [-od OUTPUT_DIRECTORY] [-t TRANSLATE] [-lgs] [-d] [-q] [-ver]
+                        [-fos] [-tod TRAINING_OUTPUT_DIRECTORY] [-od OUTPUT_DIRECTORY] [-of {srt,ytt,ttml,txt,smi,xml,ssa,ass,dfxp,sub,scc,tmp,sami,vtt,stl,sbv}] [-t TRANSLATE]
+                        [-ml {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}]
+                        [-mr {whisper}] [-mf {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large}] [-lgs] [-d] [-q] [-ver]
 
 Batch align multiple subtitle files and audiovisual files
 
@@ -11,13 +13,13 @@
 
 optional arguments:
   -h, --help            show this help message and exit
-  -vd VIDEO_DIRECTORY, --video_directory VIDEO_DIRECTORY
-                        Path to the video directory
   -sd SUBTITLE_DIRECTORY, --subtitle_directory SUBTITLE_DIRECTORY
                         Path to the subtitle directory
+  -vd VIDEO_DIRECTORY, --video_directory VIDEO_DIRECTORY
+                        Path to the video directory
   -l MAX_LOGLOSS, --max_logloss MAX_LOGLOSS
                         Max global log loss for alignment
-  -so, --stretch_on    Switch on stretch on subtitles
+  -so, --stretch_on     Switch on stretch on subtitles
   -sil {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}, --stretch_in_language {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}
                         Stretch the subtitle with the supported ISO 639-3 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes].
                         NB: This will be ignored if neither -so nor --stretch_on is present
@@ -26,15 +28,23 @@
                         Path to the output directory containing training results
   -od OUTPUT_DIRECTORY, --output_directory OUTPUT_DIRECTORY
                         Path to the output subtitle directory
+  -of {srt,ytt,ttml,txt,smi,xml,ssa,ass,dfxp,sub,scc,tmp,sami,vtt,stl,sbv}, --output_format {srt,ytt,ttml,txt,smi,xml,ssa,ass,dfxp,sub,scc,tmp,sami,vtt,stl,sbv}
+                        File format of the output subtitles
   -t TRANSLATE, --translate TRANSLATE
                         Source and target ISO 639-3 language codes separated by a comma (e.g., eng,zho)
+  -ml {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}, --main_language {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}
+                        Target video's main language as an ISO 639-3 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes]
+  -mr {whisper}, --transcription_recipe {whisper}
+                        LLM recipe used for transcribing video files
+  -mf {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large}, --transcription_flavour {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large}
+                        Flavour variation for a specific LLM recipe supporting transcription
   -lgs, --languages     Print out language codes used for stretch and translation
   -d, --debug           Print out debugging information
   -q, --quiet           Switch off logging information
   -ver, --version       show program's version number and exit
 
 required arguments:
-  -m {single,dual}, --mode {single,dual}
+  -m {single,dual,script,transcribe}, --mode {single,dual,script,transcribe}
                         Alignment mode: single, dual, script or transcribe
 """
 
@@ -43,6 +53,7 @@
 import traceback
 import os
 import pkg_resources
+import tempfile
 
 
 def main():
@@ -65,22 +76,22 @@ def main():
         "--mode",
         type=str,
         default="",
-        choices=["single", "dual"],
+        choices=["single", "dual", "script", "transcribe"],
         help="Alignment mode: either single or dual",
     )
     parser.add_argument(
-        "-vd",
-        "--video_directory",
+        "-sd",
+        "--subtitle_directory",
         type=str,
         default="",
-        help="Path to the video directory",
+        help="Path to the subtitle directory",
     )
     parser.add_argument(
-        "-sd",
-        "--subtitle_directory",
+        "-vd",
+        "--video_directory",
         type=str,
         default="",
-        help="Path to the subtitle directory",
+        help="Path to the video directory",
     )
     parser.add_argument(
         "-l",
@@ -139,6 +150,31 @@ def main():
         type=str,
         help="Source and target ISO 639-3 language codes separated by a comma (e.g., eng,zho)",
     )
+    parser.add_argument(
+        "-ml",
+        "--main_language",
+        type=str.lower,
+        choices=Utils.get_stretch_language_codes(),
+        help="Target video's main language as an ISO 639-3 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes]",
+    )
+    from subaligner.llm import TranscriptionRecipe
+    from subaligner.llm import WhisperFlavour
+    parser.add_argument(
+        "-mr",
+        "--transcription_recipe",
+        type=str.lower,
+        default=TranscriptionRecipe.WHISPER.value,
+        choices=[r.value for r in TranscriptionRecipe],
+        help="LLM recipe used for transcribing video files"
+    )
+    parser.add_argument(
+        "-mf",
+        "--transcription_flavour",
+        type=str.lower,
+        default=WhisperFlavour.SMALL.value,
+        choices=[wf.value for wf in WhisperFlavour],
+        help="Flavour variation for a specific LLM recipe supporting transcription"
+    )
     parser.add_argument("-lgs", "--languages", action="store_true",
                         help="Print out language codes used for stretch and translation")
     parser.add_argument("-d", "--debug", action="store_true",
@@ -159,36 +195,48 @@ def main():
         print("ERROR: --video_directory was not passed in")
         parser.print_usage()
         sys.exit(21)
-    if FLAGS.subtitle_directory == "":
+    if FLAGS.mode != "transcribe" and FLAGS.subtitle_directory == "":
         print("ERROR: --subtitle_directory was not passed in")
         parser.print_usage()
         sys.exit(21)
     if FLAGS.output_directory == "":
         print("ERROR: --output_directory was not passed in")
         parser.print_usage()
         sys.exit(21)
-    if os.path.abspath(FLAGS.subtitle_directory) == os.path.abspath(FLAGS.output_directory):
+    if FLAGS.mode != "transcribe" and os.path.abspath(FLAGS.subtitle_directory) == os.path.abspath(FLAGS.output_directory):
         print("ERROR: The output directory cannot be set to the same as the input subtitle directory")
         parser.print_usage()
         sys.exit(21)
-    if FLAGS.translate is not None:
+    if FLAGS.translate is not None or FLAGS.mode == "transcribe":
         if "transformers" not in {pkg.key for pkg in pkg_resources.working_set}:
-            print('ERROR: Alignment has been configured to perform translation. Please install "subaligner[llm]" and run your command again.')
+            print('ERROR: Alignment has been configured to use language models. Please install "subaligner[llm]" and run your command again.')
+            sys.exit(21)
+    if FLAGS.stretch_on or FLAGS.mode == "script":
+        if "aeneas" not in {pkg.key for pkg in pkg_resources.working_set}:
+            print('ERROR: Alignment has been configured to use extra features. Please install "subaligner[stretch]" and run your command again.')
+            sys.exit(21)
+    if FLAGS.mode == "transcribe":
+        if FLAGS.main_language is None:
+            print("ERROR: --main_language was not passed in but required by mode 'transcribe'")
+            parser.print_usage()
             sys.exit(21)
 
     video_file_paths = [os.path.abspath(os.path.join(path, p)) for path, _, files in
                         os.walk(FLAGS.video_directory) for p in files if not p.startswith(".")]
-    subtitle_file_paths = [os.path.abspath(os.path.join(path, p)) for path, _, files in
-                           os.walk(FLAGS.subtitle_directory) for p in files if not p.startswith(".")]
-    if len(video_file_paths) != len(subtitle_file_paths):
-        print("ERROR: The numbers of input videos and subtitles do not match")
-        parser.print_usage()
-        sys.exit(21)
+
+    if FLAGS.mode != "transcribe":
+        subtitle_file_paths = [os.path.abspath(os.path.join(path, p)) for path, _, files in
+                               os.walk(FLAGS.subtitle_directory) for p in files if not p.startswith(".")]
+        if len(video_file_paths) != len(subtitle_file_paths):
+            print("ERROR: The numbers of input videos and subtitles do not match")
+            parser.print_usage()
+            sys.exit(21)
 
     output_dir = os.path.abspath(FLAGS.output_directory)
     os.makedirs(output_dir, exist_ok=True)
     video_file_paths = sorted(video_file_paths, key=lambda x: os.path.splitext(os.path.basename(x))[0])
-    subtitle_file_paths = sorted(subtitle_file_paths, key=lambda x: os.path.splitext(os.path.basename(x))[0])
+    if FLAGS.mode != "transcribe":
+        subtitle_file_paths = sorted(subtitle_file_paths, key=lambda x: os.path.splitext(os.path.basename(x))[0])
     exit_segfail = FLAGS.exit_segfail
     stretch = FLAGS.stretch_on
     stretch_in_lang = FLAGS.stretch_in_language
@@ -205,15 +253,16 @@ def main():
     failures = []
     for index in range(len(video_file_paths)):
         local_video_path = video_file_paths[index]
-        local_subtitle_path = subtitle_file_paths[index]
+        local_subtitle_path = subtitle_file_paths[index] if FLAGS.mode != "transcribe" else "{}.srt".format(tempfile.mkstemp()[1])
         try:
+            voice_probabilities = None
             if FLAGS.mode == "single":
                 aligned_subs, audio_file_path, voice_probabilities, frame_rate = predictor.predict_single_pass(
                     video_file_path=local_video_path,
                     subtitle_file_path=local_subtitle_path,
                     weights_dir=os.path.join(FLAGS.training_output_directory, "models", "training", "weights")
                 )
-            else:
+            elif FLAGS.mode == "dual":
                 aligned_subs, subs, voice_probabilities, frame_rate = predictor.predict_dual_pass(
                     video_file_path=local_video_path,
                     subtitle_file_path=local_subtitle_path,
@@ -222,29 +271,50 @@ def main():
                     stretch_in_lang=stretch_in_lang,
                     exit_segfail=exit_segfail,
                 )
+            elif FLAGS.mode == "script":
+                aligned_subs, _, voice_probabilities, frame_rate = predictor.predict_plain_text(
+                    video_file_path=local_video_path,
+                    subtitle_file_path=local_subtitle_path,
+                    stretch_in_lang=stretch_in_lang,
+                )
+            elif FLAGS.mode == "transcribe":
+                from subaligner.transcriber import Transcriber
+                transcriber = Transcriber(recipe=FLAGS.transcription_recipe, flavour=FLAGS.transcription_flavour)
+                subtitle, frame_rate = transcriber.transcribe(local_video_path, stretch_in_lang)
+                aligned_subs = subtitle.subs
 
-            parent_dir = os.path.dirname(local_subtitle_path.replace(os.path.abspath(FLAGS.subtitle_directory), output_dir))
-            os.makedirs(parent_dir, exist_ok=True)
-            file_parts = os.path.basename(local_subtitle_path).rsplit(".", 1)
-            file_parts[1] = FLAGS.output_format if FLAGS.output_format != "" else file_parts[1]
-            aligned_subtitle_path = os.path.abspath(os.path.join(parent_dir, ".".join(file_parts).replace(".stl", ".srt")))
+            if FLAGS.mode == "transcribe":
+                parent_dir = os.path.dirname(video_file_paths[index].replace(os.path.abspath(FLAGS.video_directory), output_dir))
+                os.makedirs(parent_dir, exist_ok=True)
+                file_parts = os.path.basename(video_file_paths[index]).rsplit(".", 1)
+                file_parts[1] = FLAGS.output_format if FLAGS.output_format != "" else "srt"
+                aligned_subtitle_path = os.path.abspath(os.path.join(parent_dir, ".".join(file_parts).replace(".stl", ".srt")))
+            else:
+                parent_dir = os.path.dirname(local_subtitle_path.replace(os.path.abspath(FLAGS.subtitle_directory), output_dir))
+                os.makedirs(parent_dir, exist_ok=True)
+                file_parts = os.path.basename(local_subtitle_path).rsplit(".", 1)
+                file_parts[1] = FLAGS.output_format if FLAGS.output_format != "" else file_parts[1]
+                aligned_subtitle_path = os.path.abspath(os.path.join(parent_dir, ".".join(file_parts).replace(".stl", ".srt")))
 
             if FLAGS.translate is not None:
                 from subaligner.translator import Translator
                 source, target = FLAGS.translate.split(",")
                 translator = Translator(source, target)
                 aligned_subs = translator.translate(aligned_subs)
                 Subtitle.save_subs_as_target_format(aligned_subs, local_subtitle_path, aligned_subtitle_path, frame_rate, "utf-8")
-            else:
+            elif FLAGS.mode == "transcribe":
                 Subtitle.save_subs_as_target_format(aligned_subs, local_subtitle_path, aligned_subtitle_path, frame_rate, "utf-8")
+            else:
+                Subtitle.save_subs_as_target_format(aligned_subs, local_subtitle_path, aligned_subtitle_path, frame_rate)
 
-            log_loss = predictor.get_log_loss(voice_probabilities, aligned_subs)
-            if log_loss is None or log_loss > FLAGS.max_logloss:
-                print(
-                    "ERROR: Alignment failed with a too high loss value: {} for {} and {}".format(log_loss, local_video_path, local_subtitle_path)
-                )
-                failures.append((local_video_path, local_subtitle_path))
-                continue
+            if voice_probabilities is not None:
+                log_loss = predictor.get_log_loss(voice_probabilities, aligned_subs)
+                if log_loss is None or log_loss > FLAGS.max_logloss:
+                    print(
+                        "ERROR: Alignment failed with a too high loss value: {} for {} and {}".format(log_loss, local_video_path, local_subtitle_path)
+                    )
+                    failures.append((local_video_path, local_subtitle_path))
+                    continue
 
             print("Aligned subtitle saved to: {}".format(aligned_subtitle_path))
         except UnsupportedFormatException as e: