Added convenience function for launching external UI.

Improved shutdown code, including for UI. Updated readme. Other minor bugfixes.
gdiaz384 · Feb 24, 2024 · 474377a · 474377a
1 parent 66d6325
commit 474377a
Show file tree

Hide file tree

Showing 11 changed files with 3,069 additions and 11 deletions.
diff --git a/.gitattributes b/.gitattributes
@@ -0,0 +1,2 @@
+# Auto detect text files and perform LF normalization
+* text=auto
diff --git a/.gitignore b/.gitignore
@@ -101,15 +101,7 @@ ipython_config.py
 #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
 #poetry.lock
 
-# pdm
-#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
-#pdm.lock
-#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
-#   in version control.
-#   https://pdm.fming.dev/#use-with-ide
-.pdm.toml
-
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
 __pypackages__/
 
 # Celery stuff
@@ -153,8 +145,11 @@ dmypy.json
 cython_debug/
 
 # PyCharm
-#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
 #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+scratchpad/
+bin/
+resources/cache/
diff --git a/README.md b/README.md
diff --git a/py3translationServer.py b/py3translationServer.py
diff --git a/py3translationServer.py.bat b/py3translationServer.py.bat
@@ -0,0 +1,51 @@
+@echo off
+pushd "%~dp0"
+:: '::' means comment
+::
+:: Launcher for py3translationServer.py. Edit the variables below, save, and then run this file.
+:: Because of the pushd above, relative paths are resolved relative to the folder that contains this script.
+
+:: The NMT engine used to process the model. Can be fairseq or ctranslate2
+set mode=fairseq
+::set mode=ctranslate2
+
+:: The path to the model. No quotes.
+:: For fairseq, include the full path and model name.
+:: For ctranslate2, include only the path to model.bin but not the model name itself.
+set modelPath=D:\model\big.pretrain.pt
+::set modelPath=ct2_model
+
+:: The folder that contains the model. Derived from mode and modelPath. No need to edit.
+:: For ctranslate2, modelPath is already a folder. For fairseq, modelPath ends with the model file name, so use
+:: its parent folder instead. Otherwise the sentence piece paths below would point inside of the model file.
+set modelDir=%modelPath%
+if /i "%mode%" equ "fairseq" for %%i in ("%modelPath%") do set modelDir=%%~dpi
+:: The parent folder obtained above ends with a backslash \ so remove it to obtain a known state.
+if /i "%modelDir:~-1%" equ "\" set modelDir=%modelDir:~,-1%
+
+:: Use two letter language codes: www.loc.gov/standards/iso639-2/php/code_list.php
+set sourceLanguage=ja
+set targetLanguage=en
+
+:: The path and file name for the source sentence piece model. No quotes.
+:: The default assumes an spm folder next to the model. Change it if the models are stored elsewhere.
+set sourceSentencePieceModel=%modelDir%\spm\spm.%sourceLanguage%.nopretok.model
+
+:: The path and file name for the target sentence piece model. No quotes.
+set targetSentencePieceModel=%modelDir%\spm\spm.%targetLanguage%.nopretok.model
+
+:: Valid values are: cpu, gpu, cuda, directml. gpu is aliased to cuda. directml requires fairseq.
+set device=cpu
+
+:: The path and file name for Python.
+:: To use with a portable version of Python, prepend a custom path to python.exe
+set pythonExe=python.exe
+
+:: Less common options. Append to core logic as needed.
+:: Specify the internet protocol address for the server. 0.0.0.0 means bind to all local addresses.
+:: --address 0.0.0.0
+:: Specify port to listen on. Associated with --address. Default=14366.
+:: --port 14366
+:: Disable performance metrics (time keeping).
+:: --disablePerfMetrics
+:: Preload the model for lower latency inferencing.
+:: --preloadModel
+:: Print more information.
+:: --verbose
+:: Print too much information.
+:: --debug
+
+:: To change internal fairseq or ctranslate2 variables, look at the defaults near the top of the .py file.
+
+
+:: Core logic. Invoke server with the options specified above.
+"%pythonExe%" py3translationServer.py %mode% "%modelPath%" -dev %device% --sourceLanguage %sourceLanguage% --targetLanguage %targetLanguage% -sspm "%sourceSentencePieceModel%" -tspm "%targetSentencePieceModel%"
+:: Undo the pushd from the top of this file.
+popd
diff --git a/resources/ctranslate2.benchmarks.txt b/resources/ctranslate2.benchmarks.txt
@@ -0,0 +1,105 @@
+CTranslate2 CPU - Ryzen 5 5600X - 101 lines from Translator++ (non-cached, model preloaded, beam_size=5)
+intra/inter_threads=Request servicing time (total time, not just raw processing time)
+
+~40% CPU usage
+0/6=36.24s
+0/8=35.48s / 36.75s
+0/12=35.65s / 35.59s
+0/16=35.8s
+0/18=35.77s
+0/24=35.89s / 36.19s
+0/30=36.11s / 36.18s
+0/32=35.9s
+
+~50% CPU usage:
+6/6=33.1s / 33.86s
+6/8=33.25s
+6/12=33.54s
+6/16=33.32s
+6/24=33.58s
+6/30=33.32s
+6/32=33.35s
+
+100% CPU usage:
+12/6=38.43s / 38.87s
+12/8=38.8s / 48.19s / 44.91s / 41.84s / 42.48s
+12/12=38.79s / 38.48s
+12/16=38.51s
+12/18=39.24s / 38.48s
+12/24=38.74s
+12/30=37.67s
+12/32=37.98s
+
+Conclusion1: inter_threads does not matter for CPU load.
+Conclusion2: intra_threads = CPU threads.
+Conclusion3: Best CTranslate2 CPU performance is when intra_threads matches physical CPU core count.
+
+
+A few one-off tests (still run multiple times each).
+old fairseq 1.0.0a0 CPU on Python 3.9 ; single process (Default Sugoi v7)
+94.92s / 95.4s
+
+fully updated fairseq 0.2.2+latest commits CPU on Python 3.10 ; single process (Updated Sugoi)
+71.6s / 69.76s / 69.37s / 69.15s
+
+Ryzen 5 5600X + Nvidia RTX 3060 12 GB
+fully updated fairseq w/CUDA 11 on Python 3.10 ;  single process (Updated Sugoi + CUDA)
+11.54s / 9.44s / 9.46s / 9.47s
+
+
+fully updated CTranslate2 w/CUDA 11 on Python 3.10 ; single process
+intra/inter_threads=Request servicing time (total time, not just raw processing time)
+
+~38% CPU usage
+0/6=6.47s / 5.71s / 5.78s / 5.74s / 5.81s
+0/8=6.44s / 5.77s / 5.72s / 5.79s
+0/12=6.49s / 5.71s / 5.73s / 5.76s
+0/16=6.41s / 5.75s / 5.76s / 5.73s
+0/18=6.51s / 5.74s / 5.81s / 5.87s
+0/24=6.48s / 5.71s / 5.79s / 5.77s
+0/30=6.49s / 5.72s / 5.72s
+0/32=6.47s / 5.70s / 5.76s / 6.42s
+
+~50% CPU usage:
+6/6=6.58s / 5.82s / 5.85s / 5.93s
+6/8=6.62s / 5.87s / 5.95s / 5.93s
+6/12=6.61s / 5.87s / 5.85s / 5.95s
+6/16=6.58s / 5.82s / 5.96s / 5.89s
+6/24=7.08s / 5.94s / 5.92s / 6.03s / 6.04s
+6/30=6.61s / 5.82s / 5.88s / 5.99s
+6/32=6.8s / 6.11s / 6.18s / 6.1s / 6.62s / 5.82s / 5.92s
+6/128=6.59s / 5.84s / 5.92s
+
+100% CPU usage:
+12/6=7.75s / 6.82s / 7.08s
+12/8=7.76s / 6.75s / 6.98s / 6.73s
+12/12=7.46s / 6.61s / 7.06s
+12/16=7.32s / 6.62s / 7.17s
+12/18=7.76s / 6.96s / 6.82s
+12/24=7.84s / 6.58s / 6.8s
+12/30=7.8s / 6.9s / 6.67s
+12/32=7.82s / 6.84s / 6.87s / 6.92s / 6.93s
+
+Conclusion: CPU threads do not matter much when using CUDA unless overloading on CPU threads. In that case, the performance drops.
+
+
+CTranslate2 CPU - AMD FX 8320 - 101 lines from Translator++ (non-cached, model preloaded, beam_size=5)
+intra_threads=Request servicing time (total time, not just raw processing time)
+~55% CPU usage
+0=171s / 168.59s
+
+~55% CPU usage
+4=167.33s / 168.04s
+
+100% CPU usage
+8=129.09s / 130.08s
+
+100% CPU usage
+12=145.04s / 146.38s
+
+Conclusion: Best CTranslate2 CPU performance is when intra_threads matches physical CPU core count (8).
+
+
+fairseq Direct ML AMD FX 8320 + RX 560 4 GB - 101 lines from Translator++ (non-cached, model preloaded, beam_size=5)
+20-50% CPU usage
+119.21s / 119.65s / 118.86s / 118.54s / 118.15s / 118.48s / 118.57s / 117.72s / 117.43s / 117.51s / 119s / 119.28s
diff --git a/resources/fairseqToCTranslate2_converter.bat b/resources/fairseqToCTranslate2_converter.bat
@@ -0,0 +1,201 @@
+@echo off
+setlocal enabledelayedexpansion
+::
+:: This script attempts to convert fairseq models (big.pretrain.pt) to the format required by CTranslate2 (model.bin).
+::
+:: ct2-fairseq-converter.exe must be installed. It can be installed after installing Python 3 by using pip. Syntax:
+:: pip install ctranslate2 scikit-learn
+:: Entering the above command into a command prompt will install ct2-fairseq-converter.exe to the Scripts\ folder of the Python 3 environment.
+::
+:: Usage: 0) fairseqToCTranslate2_converter.bat must be run inside of a folder that has python.exe available and where that resulting Python
+:: environment has access to the PyTorch library. In other words, copy this file to the appropriate location for portable versions of Python,
+:: or install PyTorch globally. Instructions: https://pytorch.org   Select pip + Python.
+:: 1) Update the modelPath variable below to the path of myModel.pt.
+:: 2) Update the sourceLanguage and targetLanguage variables to those the model uses.
+:: Use two letter language codes from: www.loc.gov/standards/iso639-2/php/code_list.php
+:: 3) Place the word lists for that model in the same folder as myModel.pt, a.k.a. the modelPath directory.
+:: For example, for converting a JPN->ENG model, the following files must exist in the modelPath directory:
+:: dict.ja.txt
+:: dict.en.txt
+:: The conversion tool will prepend dict. and append .txt to the source and target languages to find them.
+:: 4) If there is a vocabulary model file for the source language, update sourceLanguageVocabularyModel with the
+:: absolute path to that vocabulary file. This is optional but recommended to ensure the best quality and speed from CTranslate2.
+:: Alternatively, if any of the vocabModelSearchPath locations (relative to the modelPath directory) contain the model and the
+:: Prefix and Postfix are specified, then there will be an attempt to search for the vocabulary models using that information.
+:: It is assumed that the source and destination vocabulary models are in the same folder.
+:: 5) Update pythonExe and converterExe to point to the correct directories. Be sure to include the literal binary name.
+:: Example for portable versions:
+:: set pythonExe=Python310\python.exe
+:: set converterExe=Python310\Scripts\ct2-fairseq-converter.exe
+:: 6) Save any changes to this file.
+:: 7) Double-click on this .bat file to run it. Alternatively, open a command prompt, and then run this batch file.
+
+:: Set defaults.
+:: modelPath is the fairseq model to convert, including the file name.
+:: destinationPath is the folder passed to the converter as its output directory.
+set modelPath=D:\myModel\big.pretrain.pt
+set destinationPath=myModel_ctranslate2
+
+:: Two letter language codes.
+set sourceLanguage=ja
+set targetLanguage=en
+
+:: The Python interpreter and the converter to run with it.
+:: Unless installedAsGlobal is set to true, both are checked further below and must exist as files.
+set pythonExe=python.exe
+set converterExe=ct2-fairseq-converter.exe
+set installedAsGlobal=false
+
+:: The path to the source language vocabulary model. Leave as invalid, or any other path that does not exist,
+:: to search for it using the Prefix, Postfix, and search paths below.
+set sourceLanguageVocabularyModel=invalid
+
+:: If sourceLanguageVocabularyModel does not exist, then search for the vocab file using the Prefix and Postfix specified.
+:: The file name searched for is Prefix + language code + Postfix, for example spm.ja.nopretok.model
+:: Postfix2 names a second, optional, file that is copied along with the model if it exists.
+set vocabularyModelPrefix=spm.
+set vocabularyModelPostfix=.nopretok.model
+set vocabularyModelPostfix2=.nopretok.vocab
+
+:: These are relative to the folder that contains modelPath. Do not use a trailing \
+:: If more than one location contains the model, the lowest numbered location is the one used.
+set vocabModelSearchPath0=.
+set vocabModelSearchPath1=spm
+set vocabModelSearchPath2=spmModels
+set vocabModelSearchPath3=..\spm
+set vocabModelSearchPath4=..\spmModels
+
+
+:: Do not modify the stuff below this line. ::
+
+
+:: If any command line argument was specified, show this file instead of converting. See :usage
+if /i "" neq "%~1" goto usage
+:: Debug code.
+:: %~dp0 returns a backslash \ at the end
+::set currentDir=%cd%
+:: sometimes %cd% will append a \, but sometimes not, so remove and then add it back to obtain a known state.
+::if /i "%currentDir:~-1%" equ "\" set currentDir=%currentDir:~,-1%
+::set currentDir=%currentDir%\
+:: Remove any trailing \ from destinationPath so that \model.bin and \spm can be appended to it later.
+if /i "%destinationPath:~-1%" equ "\" set destinationPath=%destinationPath:~,-1%
+
+:: Make all paths specified in the script, relative to the script instead of relative to current location of wherever command prompt happens to be.
+pushd "%~dp0"
+
+if not exist "%modelPath%" goto usage
+
+:: Check for Python first because it is needed both to install the dependencies and to run the converter.
+if /i "%installedAsGlobal%" neq "true" if not exist "%pythonExe%" (
+echo  Error: Unable to find python.exe.
+goto end)
+
+:: Install required dependencies. This has an extremely high chance of failure because scikit-learn may require compiling.
+:: Use pythonExe, not a hard-coded python.exe, so the packages are installed into the same Python environment that runs the converter.
+:: This must run before the converterExe check below because installing ctranslate2 is what provides ct2-fairseq-converter.exe.
+"%pythonExe%" -m pip install ctranslate2 fairseq scikit-learn
+:: ctranslate2 dependencies: setuptools, numpy, pyyaml. Tested using: ctranslate2==3.24.0 setuptools==65.5.0 numpy==1.26.3 pyyaml==6.0.1
+:: scikit-learn dependencies: threadpoolctl, scipy, joblib. Tested using: scikit-learn==1.4.0 threadpoolctl==3.2.0 scipy==1.12.0 joblib==1.3.2
+
+:: So, ct2-fairseq-converter.exe is not fully standalone. It must be run as python.exe ct2-fairseq-converter.exe because it looks for PyTorch in the environmental path. If python.exe is not specified, it seems to randomly add one under the current user account? Anyways, always prepend python.exe to prevent this strange behavior. This also means that this script has a hard dependency on being run where python.exe is available and also has both PyTorch and fairseq installed.
+if /i "%installedAsGlobal%" neq "true" if not exist "%converterExe%" (
+echo  Error: Unable to find converterExe at: "%converterExe%"
+goto end)
+set converterExe="%pythonExe%" "%converterExe%"
+
+:: dataDir is the folder that contains the model. The dict files are expected there.
+set dataDir=invalid
+call :determineDataDir "%modelPath%"
+if not exist "%dataDir%" goto invalidDataDir
+
+
+:: Use the vocabulary model specified at the top of this file if it exists. Otherwise, search for it.
+set vocabModelValid=False
+if exist "%sourceLanguageVocabularyModel%" set vocabModelValid=True
+:: The copy step below needs the file name without the path. The search subroutine sets it, but it is never
+:: called for a model that was specified manually, so derive the name from the path in that case.
+if exist "%sourceLanguageVocabularyModel%" for %%i in ("%sourceLanguageVocabularyModel%") do set sourceLanguageVocabModelName=%%~nxi
+if not exist "%sourceLanguageVocabularyModel%" call :updateSourceLanguageVocabModel
+
+set config=--model_path "%modelPath%" --data_dir "%dataDir%" --output_dir "%destinationPath%" --force --source_lang %sourceLanguage% --target_lang %targetLanguage%
+
+:: This appears to be incorrect, so comment it out. The correct way to generate a vocab_mapping file is at:
+:: https://github.com/OpenNMT/papers/tree/master/WNMT2018/vmap
+::if /i "%vocabModelValid%" equ "True" set config=%config% --vocab_mapping "%sourceLanguageVocabularyModel%"
+:: The model will need to be re-converted once the correct vmap has been generated.
+
+:: heuristic to see if core logic has already run
+if exist "%destinationPath%\model.bin" goto afterCoreLogic
+
+:: Core logic.
+echo %converterExe% %config%
+%converterExe% %config%
+
+:afterCoreLogic
+
+
+:: Copy sentencepiece models to destinationPath\spm.
+:: First, check if the sentence pieces are available.
+if /i "%vocabModelValid%" neq "True" (echo Warning: Unable to find sentencepiece models.
+goto end)
+
+:: The previous operation to convert might have failed and the output folder was never created. goto end if that was the case.
+if not exist "%destinationPath%" goto end
+
+:: Debug code.
+::echo cd=%cd%
+
+:: The copy command needs the directory already created.
+if not exist "%destinationPath%\spm" mkdir "%destinationPath%\spm"
+
+::if not exist target, then copy from source to target.
+:: Each command is echoed before it runs. Only the first source file is known to exist, so the others are checked with if exist.
+echo if not exist "%destinationPath%\spm\%sourceLanguageVocabModelName%" copy "%sourceLanguageVocabularyModel%" "%destinationPath%\spm\%sourceLanguageVocabModelName%"
+if not exist "%destinationPath%\spm\%sourceLanguageVocabModelName%" copy "%sourceLanguageVocabularyModel%" "%destinationPath%\spm\%sourceLanguageVocabModelName%"
+echo if not exist "%destinationPath%\spm\%sourceLanguageVocabModelName2%" if exist "%sourceLanguageVocabularyModel2%" copy "%sourceLanguageVocabularyModel2%" "%destinationPath%\spm\%sourceLanguageVocabModelName2%"
+if not exist "%destinationPath%\spm\%sourceLanguageVocabModelName2%" if exist "%sourceLanguageVocabularyModel2%" copy "%sourceLanguageVocabularyModel2%" "%destinationPath%\spm\%sourceLanguageVocabModelName2%"
+
+echo if not exist "%destinationPath%\spm\%targetLanguageVocabModelName%" if exist "%targetLanguageVocabularyModel%" copy "%targetLanguageVocabularyModel%" "%destinationPath%\spm\%targetLanguageVocabModelName%"
+if not exist "%destinationPath%\spm\%targetLanguageVocabModelName%" if exist "%targetLanguageVocabularyModel%" copy "%targetLanguageVocabularyModel%" "%destinationPath%\spm\%targetLanguageVocabModelName%"
+echo if not exist "%destinationPath%\spm\%targetLanguageVocabModelName2%" if exist "%targetLanguageVocabularyModel2%" copy "%targetLanguageVocabularyModel2%" "%destinationPath%\spm\%targetLanguageVocabModelName2%"
+if not exist "%destinationPath%\spm\%targetLanguageVocabModelName2%" if exist "%targetLanguageVocabularyModel2%" copy "%targetLanguageVocabularyModel2%" "%destinationPath%\spm\%targetLanguageVocabModelName2%"
+
+goto end
+:: Start functions list
+
+:: Subroutine. Sets dataDir to the drive and folder portion of the path given as the first argument,
+:: with the trailing backslash \ removed.
+:determineDataDir
+set dataDir=%~dp1
+:: Nothing to trim unless the last character is a backslash.
+if /i "%dataDir:~-1%" neq "\" goto :eof
+set dataDir=%dataDir:~0,-1%
+goto :eof
+
+:: Subroutine. Searches the vocabModelSearchPath folders, relative to dataDir, for the source language vocabulary model.
+:: The file name searched for is vocabularyModelPrefix + sourceLanguage + vocabularyModelPostfix.
+:: On a match, the source and target vocabulary model paths are filled in from the matching folder
+:: and vocabModelValid is set to True. Without a match, those variables are left as they were.
+:updateSourceLanguageVocabModel
+:: An empty variable is the same as an undefined one. Give up if any required piece of the file name is missing.
+if not defined sourceLanguage goto :eof
+if not defined vocabularyModelPrefix goto :eof
+if not defined vocabularyModelPostfix goto :eof
+
+:: File names only, without any path. The names ending in 2 use the second postfix.
+set targetLanguageVocabModelName=%vocabularyModelPrefix%%targetLanguage%%vocabularyModelPostfix%
+set targetLanguageVocabModelName2=%vocabularyModelPrefix%%targetLanguage%%vocabularyModelPostfix2%
+set sourceLanguageVocabModelName=%vocabularyModelPrefix%%sourceLanguage%%vocabularyModelPostfix%
+set sourceLanguageVocabModelName2=%vocabularyModelPrefix%%sourceLanguage%%vocabularyModelPostfix2%
+
+
+:: stupid and simple way
+::if exist "%dataDir%\%vocabModelSearchPath0%\%sourceLanguageVocabModelName%" set sourceLanguageVocabularyModel=%vocabModelSearchPath0%\%sourceLanguageVocabModelName%
+
+:: smart and complicated way
+:: Check the search paths from the highest number down to 0. Every match overwrites the previous one,
+:: so the lowest numbered search path that contains the source model is the one that ends up being used.
+for %%n in (4 3 2 1 0) do if exist "%dataDir%\!vocabModelSearchPath%%n!\%sourceLanguageVocabModelName%" (
+set sourceLanguageVocabularyModel=%dataDir%\!vocabModelSearchPath%%n!\%sourceLanguageVocabModelName%
+set sourceLanguageVocabularyModel2=%dataDir%\!vocabModelSearchPath%%n!\%sourceLanguageVocabModelName2%
+set targetLanguageVocabularyModel=%dataDir%\!vocabModelSearchPath%%n!\%targetLanguageVocabModelName%
+set targetLanguageVocabularyModel2=%dataDir%\!vocabModelSearchPath%%n!\%targetLanguageVocabModelName2%
+set vocabModelValid=True
+)
+
+:: Debug code
+::echo set vocabModelValid=%vocabModelValid%
+::echo set sourceLanguageVocabularyModel=%sourceLanguageVocabularyModel%
+::echo set sourceLanguageVocabularyModel2=%sourceLanguageVocabularyModel2%
+::echo set targetLanguageVocabularyModel=%targetLanguageVocabularyModel%
+::echo set targetLanguageVocabularyModel2=%targetLanguageVocabularyModel2%
+::set vocabModelValid=False
+
+goto :eof
+
+
+:: Reached when the folder derived from modelPath does not exist. Prints the reason and then exits.
+:invalidDataDir
+echo  Error: Conversion failed.
+echo  Reason: Unable to determine correct data directory based on: 
+echo  modelPath="%modelPath%"
+goto end
+
+
+:: Opens this script in notepad so the instructions and settings at the top can be read and edited.
+:: The fully qualified path of this script is used because the current directory may have been changed
+:: by pushd by the time this runs, which would break a relative path, and because the name as typed
+:: on the command line may be missing the file extension.
+:: Falls through to :end afterwards.
+:usage
+start notepad "%~f0"
+
+
+:: Common exit point. Restores the previous directory and environment, then waits for a key press
+:: so that any messages stay visible if this script was started by double-clicking on it.
+:end
+popd
+endlocal
+pause
diff --git a/resources/optional.txt b/resources/optional.txt
@@ -0,0 +1,2 @@
+pywin32==306
+streamlit==1.31.1
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		# Auto detect text files and perform LF normalization
		* text=auto