From 64a71237c57868bd6b4235808fa388e9a6485afa Mon Sep 17 00:00:00 2001
From: C0untFloyd <131583554+C0untFloyd@users.noreply.github.com>
Date: Wed, 3 May 2023 16:06:49 +0200
Subject: [PATCH] - BUGFIX: Joining prompt names in subfolders - Release with
 Windows Installer - Installer .bat includes automatic updater - Trying to
 detect voice changes, adding short pause breaks inbetween

---
 README.md                   | 32 +++++++++++----
 installer/installer.py      | 78 +++++++++++++++++++++++++++++++++++++
 installer/windows_start.bat | 65 +++++++++++++++++++++++++++++++
 parseinput.py               |  2 +-
 webui.py                    | 34 +++++++++-------
 5 files changed, 188 insertions(+), 23 deletions(-)
 create mode 100644 installer/installer.py
 create mode 100644 installer/windows_start.bat

diff --git a/README.md b/README.md
index 0cdae368..711ec121 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # Bark GUI
 
-[Changelog](#-changelog) • [Example](#-example-input) • [Installation](#-installation) • [Usage](#-usage) • [FAQ](#-faq)
+[Changelog](#changelog) • [Example](#example-input) • [Installation](#installation) • [Usage](#usage) • [FAQ](#faq)
 
 
 A Gradio Web UI for an extended - easy to use - Bark Version, focused on Windows running locally.
@@ -30,19 +30,21 @@ We know NOW that in the early years of the twentieth century, this world was bei
 
 ### Installation
 
+For Windows you can now use the released 1-click installer. It will download and install everything
+in a handy miniconda environment.
+For other operating systems, or if you would rather do this yourself:
+
 - `git clone https://github.com/C0untFloyd/bark-gui`
 - `pip install .`
 - `pip install gradio`
-
-- (optional for audio playback) `pip install soundfile` 
-- (optional) install Torch with CUDA for much faster generation e.g. `pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117 --force-reinstall` 
-
+- `pip install soundfile` 
+- (optional but necessary for fast generation) install Torch with CUDA `pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117 --force-reinstall` 
 
 
 ### Usage
 
-- Linux `python webui.py (optional arguments)`
-- Windows Use the `StartBark.bat`
+- Windows: either run the `windows_start.bat` from the installer or use the `StartBark.bat`
+- Linux: `python webui.py (and optional commandline arguments)`
 
 #### Commandline Arguments:
 
@@ -96,6 +98,7 @@ As you can see every new line will be split as a potential new line for another
 Afterwards you could change one of the speaker names (name="en_speaker_0) to create dialogues with fixed voices. If the number of words
 exceeds the max for Bark generation, new lines will be created reusing the voice from the sentence before that.
 
+Clicking on the `Generate` Button will automatically detect if this is SSML input or just plain text.
 
 
 
@@ -122,6 +125,13 @@ You probably have outdated Torch/CUDA Drivers. Try re-installing them:
 Use commandline argument `-enablemps` to make Bark use it.
 
 
+**Q:** How much VRAM do I need to have this run on GPU?
+
+Running this on GPU is currently only possible on NVIDIA cards with at least 2 GB of VRAM. Below 8 GB you
+will probably need to use the smaller models, and if you still run into memory problems, you will need
+to use the `-offloadcpu` commandline argument, which tries to offload as much as possible to system RAM.
+
+
 **Q:** Why are there voice changes in the resulting audio files?
 
 Because (from my limited understanding) this is a similar stochastic model as other GPT-Style Models, where each output is based on a previous one.
@@ -149,6 +159,14 @@ I'm doing this basically for myself but I'm glad if you enjoy my experiments too
 
 ### Changelog
 
+**03.05.2023** First Release v0.4.0
+
+- BUGFIX: Joining prompt names in subfolders
+- Release with Windows Installer (lifted & modified from [Oobabooga](https://github.com/oobabooga/one-click-installers))
+- Installer .bat includes automatic updater
+- Trying to detect voice changes, adding short pause breaks in between
+
+
 **02.05.2023**
 
 - Merged all changes from base Suno branch
diff --git a/installer/installer.py b/installer/installer.py
new file mode 100644
index 00000000..6b0d036d
--- /dev/null
+++ b/installer/installer.py
@@ -0,0 +1,78 @@
+import argparse
+import glob
+import os
+import shutil
+import site
+import subprocess
+import sys
+
+script_dir = os.getcwd()
+
+
+def run_cmd(cmd, capture_output=False, env=None):
+    """Run `cmd` through the system shell and return the subprocess.CompletedProcess."""
+    return subprocess.run(cmd, shell=True, capture_output=capture_output, env=env)
+
+
+def check_env():
+    # If we have access to conda, we are probably in an environment
+    conda_not_exist = run_cmd("conda", capture_output=True).returncode
+    if conda_not_exist:
+        print("Conda is not installed. Exiting...")
+        sys.exit()
+    
+    # Ensure this is a new environment and not the base environment
+    if os.environ["CONDA_DEFAULT_ENV"] == "base":
+        print("Create an environment for this project and activate it. Exiting...")
+        sys.exit()
+
+
+def install_dependencies():
+    # Select your GPU or, choose to run in CPU mode
+    print("Do you have a GPU (Nvidia)?")
+    print("Enter Y for Yes")
+    print()
+    gpuchoice = input("Input> ").lower()
+
+    # Clone webui to our computer
+    run_cmd("git clone https://github.com/C0untFloyd/bark-gui.git")
+    if gpuchoice == "y":
+        run_cmd("pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118")
+
+    run_cmd("pip install IPython")
+    run_cmd("pip install soundfile")
+    run_cmd("pip install gradio")
+    # Install the webui dependencies
+    update_dependencies()
+
+
+def update_dependencies():
+    """Pull the latest bark-gui sources and (re)install the package from the checkout."""
+    os.chdir("bark-gui")  # relative to the current directory; the process stays there afterwards
+    run_cmd("git pull")
+    run_cmd("python -m pip install .")
+    
+
+def start_app():  # Launch the web UI from inside the bark-gui checkout
+    os.chdir("bark-gui")  # relative path: expects the checkout in the current directory
+    run_cmd('python webui.py -autolaunch')  # subprocess.run waits until the web UI exits
+
+
+if __name__ == "__main__":
+    # Abort early unless conda is available and a non-base environment is active
+    check_env()
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--update', action='store_true', help='Update the web UI.')
+    args = parser.parse_args()
+
+    if args.update:
+        update_dependencies()  # update only; the app is not started in this mode
+    else:
+        # First run: no bark-gui checkout in the current directory yet, so install it
+        if not os.path.exists("bark-gui/"):
+            install_dependencies()
+            os.chdir(script_dir)  # installing changed into bark-gui; go back before start_app
+
+        # Launch the web UI from inside the checkout
+        start_app()
diff --git a/installer/windows_start.bat b/installer/windows_start.bat
new file mode 100644
index 00000000..6ff92362
--- /dev/null
+++ b/installer/windows_start.bat
@@ -0,0 +1,65 @@
+@echo off
+
+cd /D "%~dp0"
+
+echo "%CD%"| findstr /C:" " >nul && echo This script relies on Miniconda which can not be silently installed under a path with spaces. && goto end
+
+set PATH=%PATH%;%SystemRoot%\system32
+
+@rem config
+set INSTALL_DIR=%cd%\installer_files
+set CONDA_ROOT_PREFIX=%cd%\installer_files\conda
+set INSTALL_ENV_DIR=%cd%\installer_files\env
+set MINICONDA_DOWNLOAD_URL=https://repo.anaconda.com/miniconda/Miniconda3-py310_23.1.0-1-Windows-x86_64.exe
+set conda_exists=F
+
+@rem figure out whether conda needs to be installed
+call "%CONDA_ROOT_PREFIX%\_conda.exe" --version >nul 2>&1
+if "%ERRORLEVEL%" EQU "0" set conda_exists=T
+
+@rem (if necessary) install conda into a contained environment
+@rem download conda
+if "%conda_exists%" == "F" (
+	echo Downloading Miniconda from %MINICONDA_DOWNLOAD_URL% to %INSTALL_DIR%\miniconda_installer.exe
+
+	mkdir "%INSTALL_DIR%"
+	call curl -Lk "%MINICONDA_DOWNLOAD_URL%" > "%INSTALL_DIR%\miniconda_installer.exe" || ( echo. && echo Miniconda failed to download. && goto end )
+
+	echo Installing Miniconda to %CONDA_ROOT_PREFIX%
+	start /wait "" "%INSTALL_DIR%\miniconda_installer.exe" /InstallationType=JustMe /NoShortcuts=1 /AddToPath=0 /RegisterPython=0 /NoRegistry=1 /S /D=%CONDA_ROOT_PREFIX%
+
+	@rem test the conda binary
+	echo Miniconda version:
+	call "%CONDA_ROOT_PREFIX%\_conda.exe" --version || ( echo. && echo Miniconda not found. && goto end )
+)
+
+@rem create the installer env; skipped when the env directory already exists
+if not exist "%INSTALL_ENV_DIR%" (
+  echo Packages to install: python=3.10
+  call "%CONDA_ROOT_PREFIX%\_conda.exe" create --no-shortcuts -y -k --prefix "%INSTALL_ENV_DIR%" python=3.10 || ( echo. && echo Conda environment creation failed. && goto end )
+)
+
+@rem check if conda environment was actually created
+if not exist "%INSTALL_ENV_DIR%\python.exe" ( echo. && echo Conda environment is empty. && goto end )
+
+@rem activate installer env
+call "%CONDA_ROOT_PREFIX%\condabin\conda.bat" activate "%INSTALL_ENV_DIR%" || ( echo. && echo Miniconda hook not found. && goto end )
+
+@rem offer an update check if conda was already installed; choice sets ERRORLEVEL to 1 for Y and 2 for N
+if "%conda_exists%" == "T" (
+	choice /C:YN /M:"Check for Updates"
+	IF ERRORLEVEL 1 IF NOT ERRORLEVEL 2 (
+		echo Checking...
+		call python installer.py --update
+	)
+)
+
+@rem setup installer env
+echo Launching Bark GUI
+call python installer.py
+
+echo.
+echo Done!
+
+:end
+pause
diff --git a/parseinput.py b/parseinput.py
index 15b7fc54..27a62f9b 100644
--- a/parseinput.py
+++ b/parseinput.py
@@ -4,7 +4,7 @@
 #import nltk
 
 # Chunked generation originally from https://github.com/serp-ai/bark-with-voice-clone
-def split_and_recombine_text(text, desired_length=100, max_length=150):
+def split_and_recombine_text(text, desired_length=150, max_length=200):
     # return nltk.sent_tokenize(text)
 
     # from https://github.com/neonbjb/tortoise-tts
diff --git a/webui.py b/webui.py
index c555354c..8a345285 100644
--- a/webui.py
+++ b/webui.py
@@ -82,14 +82,20 @@ def generate_text_to_speech(text, selected_speaker, text_temp, waveform_temp, qu
     use_last_generation_as_history = "Use last generation as history" in complete_settings
     progress(0, desc="Generating")
 
-    silence = np.zeros(int(0.25 * SAMPLE_RATE), dtype=np.float32)  # quarter second of silence
+    silenceshort = np.zeros(int(0.25 * SAMPLE_RATE), dtype=np.float32)  # quarter second of silence
+    silencelong = np.zeros(int(0.50 * SAMPLE_RATE), dtype=np.float32)  # half a second of silence
 
     all_parts = []
     text = text.lstrip()
     if is_ssml(text):
         list_speak = create_clips_from_ssml(text)
+        prev_speaker = None
         for i, clip in tqdm(enumerate(list_speak), total=len(list_speak)):
             selected_speaker = clip[0]
+            # Add pause break between speakers
+            if i > 0 and selected_speaker != prev_speaker:
+                all_parts += [silencelong.copy()]
+            prev_speaker = selected_speaker
             text = clip[1]
             text = saxutils.unescape(text)
             if selected_speaker == "None":
@@ -99,7 +105,7 @@ def generate_text_to_speech(text, selected_speaker, text_temp, waveform_temp, qu
             audio_array = generate_audio(text, selected_speaker, text_temp, waveform_temp)
             if len(list_speak) > 1:
                 save_wav(audio_array, create_filename(OUTPUTFOLDER, "audioclip",".wav"))
-            all_parts += [audio_array, silence.copy()]
+            all_parts += [audio_array]
     else:
         texts = split_and_recombine_text(text)
         for i, text in tqdm(enumerate(texts), total=len(texts)):
@@ -139,7 +145,10 @@ def generate_text_to_speech(text, selected_speaker, text_temp, waveform_temp, qu
                           full_generation['fine_prompt'])
                 # loading voice from custom folder needs to have extension
                 voice_name = voice_name + ".npz"
-            all_parts += [audio_array, silence.copy()]
+            all_parts += [audio_array]
+            # Add short pause between sentences
+            if text[-1] in "!?.\n" and i > 1:
+                all_parts += [silenceshort.copy()]
 
     # save & play audio
     result = create_filename(OUTPUTFOLDER, "final",".wav")
@@ -234,9 +243,10 @@ def convert_text_to_ssml(text, selected_speaker):
 	for file in files:
 		if(file.endswith(".npz")):
 			pathpart = root.replace("./bark/assets/prompts", "")
-			if len(pathpart) < 1:
-				pathpart = "/"
-			speakers_list.append(os.path.join(pathpart, file[:-4]))
+			name = os.path.join(pathpart, file[:-4])
+			if name.startswith("/") or name.startswith("\\"):
+				name = name[1:]
+			speakers_list.append(name)
 
 speakers_list = sorted(speakers_list, key=lambda x: x.lower())
 speakers_list.insert(0, 'None')
@@ -244,7 +254,7 @@ def convert_text_to_ssml(text, selected_speaker):
 # Create Gradio Blocks
 
 with gr.Blocks(title="Bark Enhanced Gradio GUI", mode="Bark Enhanced") as barkgui:
-    gr.Markdown("### [Bark Enhanced](https://github.com/C0untFloyd/bark-gui)")
+    gr.Markdown("### [Bark Enhanced v0.4.0](https://github.com/C0untFloyd/bark-gui)")
     with gr.Tab("TTS"):
         with gr.Row():
             with gr.Column():
@@ -281,13 +291,7 @@ def convert_text_to_ssml(text, selected_speaker):
                 gr.Markdown("[Voice Prompt Library](https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c)")
                 speaker = gr.Dropdown(speakers_list, value=speakers_list[0], label="Voice")
             with gr.Column():
-                text_temp = gr.Slider(
-                    0.1,
-                    1.0,
-                    value=0.7,
-                    label="Generation Temperature",
-                    info="1.0 more diverse, 0.1 more conservative"
-                )
+                text_temp = gr.Slider(0.1, 1.0, value=0.6, label="Generation Temperature", info="1.0 more diverse, 0.1 more conservative")
                 waveform_temp = gr.Slider(0.1, 1.0, value=0.7, label="Waveform temperature", info="1.0 more diverse, 0.1 more conservative")
 
         with gr.Row():
@@ -300,7 +304,7 @@ def convert_text_to_ssml(text, selected_speaker):
 
         with gr.Row():
             with gr.Column():
-                tts_create_button = gr.Button("Create")
+                tts_create_button = gr.Button("Generate")
             with gr.Column():
                 hidden_checkbox = gr.Checkbox(visible=False)
                 button_delete_files = gr.Button("Clear output folder")