From 64a71237c57868bd6b4235808fa388e9a6485afa Mon Sep 17 00:00:00 2001 From: C0untFloyd <131583554+C0untFloyd@users.noreply.github.com> Date: Wed, 3 May 2023 16:06:49 +0200 Subject: [PATCH] - BUGFIX: Joining prompt names in subfolders - Release with Windows Installer - Installer .bat includes automatic updater - Trying to detect voice changes, adding short pause breaks inbetween --- README.md | 32 +++++++++++---- installer/installer.py | 78 +++++++++++++++++++++++++++++++++++++ installer/windows_start.bat | 65 +++++++++++++++++++++++++++++++ parseinput.py | 2 +- webui.py | 34 +++++++++------- 5 files changed, 188 insertions(+), 23 deletions(-) create mode 100644 installer/installer.py create mode 100644 installer/windows_start.bat diff --git a/README.md b/README.md index 0cdae368..711ec121 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Bark GUI -[Changelog](#-changelog) • [Example](#-example-input) • [Installation](#-installation) • [Usage](#-usage) • [FAQ](#-faq) +[Changelog](#changelog) • [Example](#example-input) • [Installation](#installation) • [Usage](#usage) • [FAQ](#faq) A Gradio Web UI for an extended - easy to use - Bark Version, focused on Windows running locally. @@ -30,19 +30,21 @@ We know NOW that in the early years of the twentieth century, this world was bei ### Installation +For Windows you can now use the released 1-click installer. This will download and install everything +in a handy miniconda environment. +For other operating systems, or if you would rather do this yourself: + - `git clone https://github.com/C0untFloyd/bark-gui` - `pip install .` - `pip install gradio` - -- (optional for audio playback) `pip install soundfile` -- (optional) install Torch with CUDA for much faster generation e.g. 
`pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117 --force-reinstall` - +- `pip install soundfile` +- (optional but necessary for fast generation) install Torch with CUDA `pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117 --force-reinstall` ### Usage -- Linux `python webui.py (optional arguments)` -- Windows Use the `StartBark.bat` +- Windows Either run the `windows_start.bat` from the Installer or use the `StartBark.bat` +- Linux `python webui.py (and optional commandline arguments)` #### Commandline Arguments: @@ -96,6 +98,7 @@ As you can see every new line will be split as a potential new line for another Afterwards you could change one of the speaker names (name="en_speaker_0) to create dialogues with fixed voices. If the number of words exceeds the max for Bark generation, new lines will be created reusing the voice from the sentence before that. +Clicking on the `Generate` Button will automatically detect if this is SSML input or just plain text. @@ -122,6 +125,13 @@ You probably have outdated Torch/CUDA Drivers. Try re-installing them: Use commandline argument `-enablemps` to make Bark use it. +**Q:** How much VRAM do I need to have this run on GPU? + +Running this on GPU is currently only possible on NVIDIA Cards with at least 2 GB VRAM. Below 8 GB you +would probably need to use the smaller models and if you are still having memory problems, you would need +to use the `-offloadcpu` commandline argument, which tries to offload as much as possible to your system RAM. + + **Q:** Why are there voice changes in the resulting audio files? Because (from my limited understanding) this is a similar stochastic model as other GPT-Style Models, where each output is based on a previous one. 
@@ -149,6 +159,14 @@ I'm doing this basically for myself but I'm glad if you enjoy my experiments too ### Changelog +**03.05.2023** First Release v0.4.0 + +- BUGFIX: Joining prompt names in subfolders +- Release with Windows Installer (lifted & modified from [Oobabooga](https://github.com/oobabooga/one-click-installers)) +- Installer .bat includes automatic updater +- Trying to detect voice changes, adding short pause breaks in between + + **02.05.2023** - Merged all changes from base Suno branch diff --git a/installer/installer.py b/installer/installer.py new file mode 100644 index 00000000..6b0d036d --- /dev/null +++ b/installer/installer.py @@ -0,0 +1,78 @@ +import argparse +import glob +import os +import shutil +import site +import subprocess +import sys + +script_dir = os.getcwd() + + +def run_cmd(cmd, capture_output=False, env=None): + # Run shell commands + return subprocess.run(cmd, shell=True, capture_output=capture_output, env=env) + + +def check_env(): + # If we have access to conda, we are probably in an environment + conda_not_exist = run_cmd("conda", capture_output=True).returncode + if conda_not_exist: + print("Conda is not installed. Exiting...") + sys.exit() + + # Ensure this is a new environment and not the base environment + if os.environ["CONDA_DEFAULT_ENV"] == "base": + print("Create an environment for this project and activate it. 
Exiting...") + sys.exit() + + +def install_dependencies(): + # Select your GPU or, choose to run in CPU mode + print("Do you have a GPU (Nvidia)?") + print("Enter Y for Yes") + print() + gpuchoice = input("Input> ").lower() + + # Clone webui to our computer + run_cmd("git clone https://github.com/C0untFloyd/bark-gui.git") + if gpuchoice == "y": + run_cmd("pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118") + + run_cmd("pip install IPython") + run_cmd("pip install soundfile") + run_cmd("pip install gradio") + # Install the webui dependencies + update_dependencies() + + +def update_dependencies(): + os.chdir("bark-gui") + run_cmd("git pull") + # Installs/Updates dependencies from all requirements.txt + run_cmd("python -m pip install .") + + +def start_app(): + os.chdir("bark-gui") + run_cmd('python webui.py -autolaunch') + + +if __name__ == "__main__": + # Verifies we are in a conda environment + check_env() + + parser = argparse.ArgumentParser() + parser.add_argument('--update', action='store_true', help='Update the web UI.') + args = parser.parse_args() + + if args.update: + update_dependencies() + else: + # If webui has already been installed, skip and run + if not os.path.exists("bark-gui/"): + install_dependencies() + os.chdir(script_dir) + + # Run the model with webui + start_app() diff --git a/installer/windows_start.bat b/installer/windows_start.bat new file mode 100644 index 00000000..6ff92362 --- /dev/null +++ b/installer/windows_start.bat @@ -0,0 +1,65 @@ +@echo off + +cd /D "%~dp0" + +echo "%CD%"| findstr /C:" " >nul && echo This script relies on Miniconda which can not be silently installed under a path with spaces. 
&& goto end + +set PATH=%PATH%;%SystemRoot%\system32 + +@rem config +set INSTALL_DIR=%cd%\installer_files +set CONDA_ROOT_PREFIX=%cd%\installer_files\conda +set INSTALL_ENV_DIR=%cd%\installer_files\env +set MINICONDA_DOWNLOAD_URL=https://repo.anaconda.com/miniconda/Miniconda3-py310_23.1.0-1-Windows-x86_64.exe +set conda_exists=F + +@rem figure out whether git and conda needs to be installed +call "%CONDA_ROOT_PREFIX%\_conda.exe" --version >nul 2>&1 +if "%ERRORLEVEL%" EQU "0" set conda_exists=T + +@rem (if necessary) install git and conda into a contained environment +@rem download conda +if "%conda_exists%" == "F" ( + echo Downloading Miniconda from %MINICONDA_DOWNLOAD_URL% to %INSTALL_DIR%\miniconda_installer.exe + + mkdir "%INSTALL_DIR%" + call curl -Lk "%MINICONDA_DOWNLOAD_URL%" > "%INSTALL_DIR%\miniconda_installer.exe" || ( echo. && echo Miniconda failed to download. && goto end ) + + echo Installing Miniconda to %CONDA_ROOT_PREFIX% + start /wait "" "%INSTALL_DIR%\miniconda_installer.exe" /InstallationType=JustMe /NoShortcuts=1 /AddToPath=0 /RegisterPython=0 /NoRegistry=1 /S /D=%CONDA_ROOT_PREFIX% + + @rem test the conda binary + echo Miniconda version: + call "%CONDA_ROOT_PREFIX%\_conda.exe" --version || ( echo. && echo Miniconda not found. && goto end ) +) + +@rem create the installer env +if not exist "%INSTALL_ENV_DIR%" ( + echo Packages to install: %PACKAGES_TO_INSTALL% + call "%CONDA_ROOT_PREFIX%\_conda.exe" create --no-shortcuts -y -k --prefix "%INSTALL_ENV_DIR%" python=3.10 || ( echo. && echo Conda environment creation failed. && goto end ) +) + +@rem check if conda environment was actually created +if not exist "%INSTALL_ENV_DIR%\python.exe" ( echo. && echo Conda environment is empty. && goto end ) + +@rem activate installer env +call "%CONDA_ROOT_PREFIX%\condabin\conda.bat" activate "%INSTALL_ENV_DIR%" || ( echo. && echo Miniconda hook not found. 
&& goto end ) + +@rem always ask for update check +if "%conda_exists%" == "T" ( + choice /C:YN /M:"Check for Updates" + IF ERRORLEVEL == 1 ( + echo Checking... + call python installer.py --update + ) +) + +@rem setup installer env +echo Launching Bark GUI +call python installer.py + +echo. +echo Done! + +:end +pause diff --git a/parseinput.py b/parseinput.py index 15b7fc54..27a62f9b 100644 --- a/parseinput.py +++ b/parseinput.py @@ -4,7 +4,7 @@ #import nltk # Chunked generation originally from https://github.com/serp-ai/bark-with-voice-clone -def split_and_recombine_text(text, desired_length=100, max_length=150): +def split_and_recombine_text(text, desired_length=150, max_length=200): # return nltk.sent_tokenize(text) # from https://github.com/neonbjb/tortoise-tts diff --git a/webui.py b/webui.py index c555354c..8a345285 100644 --- a/webui.py +++ b/webui.py @@ -82,14 +82,20 @@ def generate_text_to_speech(text, selected_speaker, text_temp, waveform_temp, qu use_last_generation_as_history = "Use last generation as history" in complete_settings progress(0, desc="Generating") - silence = np.zeros(int(0.25 * SAMPLE_RATE), dtype=np.float32) # quarter second of silence + silenceshort = np.zeros(int(0.25 * SAMPLE_RATE), dtype=np.float32) # quarter second of silence + silencelong = np.zeros(int(0.50 * SAMPLE_RATE), dtype=np.float32) # half a second of silence all_parts = [] text = text.lstrip() if is_ssml(text): list_speak = create_clips_from_ssml(text) + prev_speaker = None for i, clip in tqdm(enumerate(list_speak), total=len(list_speak)): selected_speaker = clip[0] + # Add pause break between speakers + if i > 0 and selected_speaker != prev_speaker: + all_parts += [silencelong.copy()] + prev_speaker = selected_speaker text = clip[1] text = saxutils.unescape(text) if selected_speaker == "None": @@ -99,7 +105,7 @@ def generate_text_to_speech(text, selected_speaker, text_temp, waveform_temp, qu audio_array = generate_audio(text, selected_speaker, text_temp, waveform_temp) if 
len(list_speak) > 1: save_wav(audio_array, create_filename(OUTPUTFOLDER, "audioclip",".wav")) - all_parts += [audio_array, silence.copy()] + all_parts += [audio_array] else: texts = split_and_recombine_text(text) for i, text in tqdm(enumerate(texts), total=len(texts)): @@ -139,7 +145,10 @@ def generate_text_to_speech(text, selected_speaker, text_temp, waveform_temp, qu full_generation['fine_prompt']) # loading voice from custom folder needs to have extension voice_name = voice_name + ".npz" - all_parts += [audio_array, silence.copy()] + all_parts += [audio_array] + # Add short pause between sentences + if text[-1] in "!?.\n" and i > 1: + all_parts += [silenceshort.copy()] # save & play audio result = create_filename(OUTPUTFOLDER, "final",".wav") @@ -234,9 +243,10 @@ def convert_text_to_ssml(text, selected_speaker): for file in files: if(file.endswith(".npz")): pathpart = root.replace("./bark/assets/prompts", "") - if len(pathpart) < 1: - pathpart = "/" - speakers_list.append(os.path.join(pathpart, file[:-4])) + name = os.path.join(pathpart, file[:-4]) + if name.startswith("/") or name.startswith("\\"): + name = name[1:] + speakers_list.append(name) speakers_list = sorted(speakers_list, key=lambda x: x.lower()) speakers_list.insert(0, 'None') @@ -244,7 +254,7 @@ def convert_text_to_ssml(text, selected_speaker): # Create Gradio Blocks with gr.Blocks(title="Bark Enhanced Gradio GUI", mode="Bark Enhanced") as barkgui: - gr.Markdown("### [Bark Enhanced](https://github.com/C0untFloyd/bark-gui)") + gr.Markdown("### [Bark Enhanced v0.4.0](https://github.com/C0untFloyd/bark-gui)") with gr.Tab("TTS"): with gr.Row(): with gr.Column(): @@ -281,13 +291,7 @@ def convert_text_to_ssml(text, selected_speaker): gr.Markdown("[Voice Prompt Library](https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c)") speaker = gr.Dropdown(speakers_list, value=speakers_list[0], label="Voice") with gr.Column(): - text_temp = gr.Slider( - 0.1, - 1.0, - 
value=0.7, - label="Generation Temperature", - info="1.0 more diverse, 0.1 more conservative" - ) + text_temp = gr.Slider(0.1, 1.0, value=0.6, label="Generation Temperature", info="1.0 more diverse, 0.1 more conservative") waveform_temp = gr.Slider(0.1, 1.0, value=0.7, label="Waveform temperature", info="1.0 more diverse, 0.1 more conservative") with gr.Row(): @@ -300,7 +304,7 @@ def convert_text_to_ssml(text, selected_speaker): with gr.Row(): with gr.Column(): - tts_create_button = gr.Button("Create") + tts_create_button = gr.Button("Generate") with gr.Column(): hidden_checkbox = gr.Checkbox(visible=False) button_delete_files = gr.Button("Clear output folder")