From 3c3571e4be0547eaf5fc97bbb9e6bb8a762c06dc Mon Sep 17 00:00:00 2001 From: Marvin Weigand Date: Mon, 1 May 2023 15:13:22 +0200 Subject: [PATCH 1/3] Added possibility to set a static seed --- .gitignore | 6 +++++- bark/__init__.py | 2 +- bark/api.py | 45 +++++++++++++++++++++++++++++++++++++++++++++ webui.py | 5 +++-- 4 files changed, 54 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index b2c82bf1..fdab925e 100644 --- a/.gitignore +++ b/.gitignore @@ -12,4 +12,8 @@ __pycache__/ /docs/magis.wav /docs/harald_24000.wav /docs/harald.wav -/models \ No newline at end of file +/models +.venv/* +build/* +Outputs/* +suno_bark.egg-info/* \ No newline at end of file diff --git a/bark/__init__.py b/bark/__init__.py index e0b17c8b..4349f797 100644 --- a/bark/__init__.py +++ b/bark/__init__.py @@ -1,2 +1,2 @@ -from .api import generate_audio, text_to_semantic, semantic_to_waveform, save_as_prompt +from .api import generate_audio, text_to_semantic, semantic_to_waveform, save_as_prompt, set_seed from .generation import SAMPLE_RATE, preload_models diff --git a/bark/api.py b/bark/api.py index e1c75566..f3df0fa1 100644 --- a/bark/api.py +++ b/bark/api.py @@ -2,6 +2,10 @@ import numpy as np +import torch +import random +import os + from .generation import codec_decode, generate_coarse, generate_fine, generate_text_semantic @@ -123,3 +127,44 @@ def generate_audio( else: audio_arr = out return audio_arr + +def set_seed(seed: int = 0): + """Set the seed + + seed = 0 Generate a random seed + seed = -1 Disable deterministic algorithms + 0 < seed < 2**32 Set the seed + + Args: + seed: integer to use as seed + + Returns: + integer used as seed + """ + + original_seed = seed + + # See for more informations: https://pytorch.org/docs/stable/notes/randomness.html + if seed == -1: + # Disable deterministic + torch.backends.cudnn.deterministic = False + torch.backends.cudnn.benchmark = True + else: + # Enable deterministic + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + if seed <= 0: + # Generate random seed + # Use default_rng() because it is independent of np.random.seed() + seed = np.random.default_rng().integers(1, 2**32 - 1) + + assert(0 < seed and seed < 2**32) + + np.random.seed(seed) + random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + os.environ["PYTHONHASHSEED"] = str(seed) + + return original_seed if original_seed != 0 else seed diff --git a/webui.py b/webui.py index 7fe992d8..e715762b 100644 --- a/webui.py +++ b/webui.py @@ -8,7 +8,7 @@ import logging import torch -from bark import SAMPLE_RATE, generate_audio +from bark import SAMPLE_RATE, generate_audio, set_seed from bark.clonevoice import clone_voice from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic from scipy.io.wavfile import write as write_wav @@ -157,6 +157,7 @@ def generate_text_to_speech(text, selected_speaker, text_temp, waveform_temp, qu all_parts = [] for i, text in tqdm(enumerate(texts), total=len(texts)): + set_seed(432) if quick_generation == True: print(f"\nGenerating Text ({i+1}/{len(texts)}) -> `{text}`") audio_array = generate_audio(text, selected_speaker, text_temp, waveform_temp) @@ -180,7 +181,7 @@ def generate_text_to_speech(text, selected_speaker, text_temp, waveform_temp, qu output_full=True, ) i+=1 - + set_seed(-1) if len(texts) > 1: save_wav(audio_array, create_filename(OUTPUTFOLDER, "audioclip",".wav")) From b244611a7d1e80b4d1f40f18cec9db4d2c0402b6 Mon Sep 17 00:00:00 2001 From: Marvin Weigand Date: Wed, 3 May 2023 18:41:00 +0200 Subject: [PATCH 2/3] proper seed setting --- webui.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/webui.py b/webui.py index 7da24ff5..25ec3d84 100644 --- a/webui.py +++ b/webui.py @@ -104,6 +104,7 @@ def generate_text_to_speech(text, selected_speaker, text_temp, waveform_temp, qu else: texts = split_and_recombine_text(text) for i, text in tqdm(enumerate(texts), total=len(texts)): + set_seed(423) print(f"\nGenerating Text ({i+1}/{len(texts)}) -> {selected_speaker}:`{text}`") if quick_generation == True: audio_array = generate_audio(text, selected_speaker, text_temp, waveform_temp) @@ -141,7 +142,7 @@ def generate_text_to_speech(text, selected_speaker, text_temp, waveform_temp, qu # loading voice from custom folder needs to have extension voice_name = voice_name + ".npz" all_parts += [audio_array, silence.copy()] - + set_seed(-1) # save & play audio result = create_filename(OUTPUTFOLDER, "final",".wav") save_wav(np.concatenate(all_parts), result) From 9a64d0c63edd3e2bd8e72e7f85e34903b0cea40c Mon Sep 17 00:00:00 2001 From: Marvin Weigand Date: Wed, 3 May 2023 20:35:28 +0200 Subject: [PATCH 3/3] Integrated option to set random seed within gradio UI --- webui.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/webui.py b/webui.py index b78908ac..7ba13a7e 100644 --- a/webui.py +++ b/webui.py @@ -54,7 +54,7 @@ def generate_with_settings(text_prompt, semantic_temp=0.7, semantic_top_k=50, se return full_generation, codec_decode(x_fine_gen) return codec_decode(x_fine_gen) -def generate_text_to_speech(text, selected_speaker, text_temp, waveform_temp, quick_generation, complete_settings, progress=gr.Progress(track_tqdm=True)): +def generate_text_to_speech(text, selected_speaker, text_temp, waveform_temp, quick_generation, complete_settings, random_seed_number, progress=gr.Progress(track_tqdm=True)): if text == None or len(text) < 1: raise gr.Error('No text entered!') @@ -80,6 +80,8 @@ def generate_text_to_speech(text, selected_speaker, text_temp, waveform_temp, qu use_coarse_history_prompt = "Use coarse history" in complete_settings use_fine_history_prompt = "Use fine history" in complete_settings use_last_generation_as_history = "Use last generation as history" in complete_settings + random_seed = int(random_seed_number) + progress(0, desc="Generating") silenceshort = np.zeros(int(0.25 * SAMPLE_RATE), dtype=np.float32) # quarter second of silence @@ -92,7 +94,7 @@ def generate_text_to_speech(text, selected_speaker, text_temp, waveform_temp, qu prev_speaker = None for i, clip in tqdm(enumerate(list_speak), total=len(list_speak)): # set seed for consistent generation - set_seed(423) + set_seed(random_seed) selected_speaker = clip[0] # Add pause break between speakers if i > 0 and selected_speaker != prev_speaker: @@ -112,7 +114,7 @@ def generate_text_to_speech(text, selected_speaker, text_temp, waveform_temp, qu texts = split_and_recombine_text(text) for i, text in tqdm(enumerate(texts), total=len(texts)): # set seed for consistent generation - set_seed(423) + set_seed(random_seed) print(f"\nGenerating Text ({i+1}/{len(texts)}) -> {selected_speaker}:`{text}`") if quick_generation == True: audio_array = generate_audio(text, selected_speaker, text_temp, waveform_temp) @@ -154,7 +156,7 @@ def generate_text_to_speech(text, selected_speaker, text_temp, waveform_temp, qu if text[-1] in "!?.\n" and i > 1: all_parts += [silenceshort.copy()] - all_parts += [audio_array, silence.copy()] + #all_parts += [audio_array, silencelong.copy()] # reset seed set_seed(-1) @@ -306,9 +308,12 @@ def convert_text_to_ssml(text, selected_speaker): with gr.Column(): quick_gen_checkbox = gr.Checkbox(label="Quick Generation", value=True) with gr.Column(): - settings_checkboxes = ["Use semantic history", "Use coarse history", "Use fine history", "Use last generation as history"] - complete_settings = gr.CheckboxGroup(choices=settings_checkboxes, value=settings_checkboxes, label="Detailed Generation Settings", type="value", interactive=True, visible=False) - quick_gen_checkbox.change(fn=on_quick_gen_changed, inputs=quick_gen_checkbox, outputs=complete_settings) + with gr.Row(): + settings_checkboxes = ["Use semantic history", "Use coarse history", "Use fine history", "Use last generation as history"] + complete_settings = gr.CheckboxGroup(choices=settings_checkboxes, value=settings_checkboxes, label="Detailed Generation Settings", type="value", interactive=True, visible=False) + random_seed_number = gr.inputs.Number(label="Random Seed", default=-1) + #random_seed_settings = gr.NumberGroup([random_seed_number], label="Random Seed Settings", interactive=True, visible=False) + quick_gen_checkbox.change(fn=on_quick_gen_changed, inputs=quick_gen_checkbox, outputs=[complete_settings]) with gr.Row(): with gr.Column(): @@ -329,7 +334,7 @@ def convert_text_to_ssml(text, selected_speaker): dummy = gr.Text(label="Progress") convert_to_ssml_button.click(convert_text_to_ssml, inputs=[input_text, speaker],outputs=input_text) - tts_create_button.click(generate_text_to_speech, inputs=[input_text, speaker, text_temp, waveform_temp, quick_gen_checkbox, complete_settings],outputs=output_audio) + tts_create_button.click(generate_text_to_speech, inputs=[input_text, speaker, text_temp, waveform_temp, quick_gen_checkbox, complete_settings, random_seed_number],outputs=output_audio) # Javascript hack to display modal confirmation dialog js = "(x) => confirm('Are you sure? This will remove all files from output folder')" button_delete_files.click(None, None, hidden_checkbox, _js=js)