Commit 2530c5b

LLM-FIX

Dartvauder committed Oct 4, 2024
1 parent ca9ea12

Showing 3 changed files with 9 additions and 6 deletions.
13 changes: 7 additions & 6 deletions LaunchFile/app.py
@@ -901,16 +901,16 @@ def load_qwen2_audio_model():
     return processor, model


-def process_qwen2_audio(processor, model, audio_file, prompt):
+def process_qwen2_audio(processor, model, input_audio_mm, prompt):
     conversation = [
         {'role': 'system', 'content': 'You are a helpful assistant.'},
         {"role": "user", "content": [
-            {"type": "audio", "audio_url": audio_file},
+            {"type": "audio", "audio_url": input_audio_mm},
             {"type": "text", "text": prompt},
         ]},
     ]
     text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
-    audio, _ = librosa.load(audio_file, sr=processor.feature_extractor.sampling_rate)
+    audio, _ = librosa.load(input_audio_mm, sr=processor.feature_extractor.sampling_rate)
     inputs = processor(text=text, audios=[audio], return_tensors="pt", padding=True)
     inputs.input_ids = inputs.input_ids.to("cuda")
     generate_ids = model.generate(**inputs, max_length=256)
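
For reference, here is a minimal, self-contained sketch of the Qwen2-Audio inference flow that process_qwen2_audio follows. It assumes the Qwen/Qwen2-Audio-7B-Instruct checkpoint and a CUDA device, and the final decode step (cut off in the hunk above) is reconstructed from the standard transformers usage rather than copied from the repository, so treat it as an illustration, not the app's exact code.

import librosa
from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration

# Assumed checkpoint and device; requires transformers, librosa and a CUDA GPU.
model_id = "Qwen/Qwen2-Audio-7B-Instruct"
processor = AutoProcessor.from_pretrained(model_id)
model = Qwen2AudioForConditionalGeneration.from_pretrained(model_id).to("cuda")

def run_qwen2_audio(audio_path: str, prompt: str) -> str:
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": [
            {"type": "audio", "audio_url": audio_path},
            {"type": "text", "text": prompt},
        ]},
    ]
    # Render the chat template, load the waveform at the feature extractor's
    # sampling rate, and pack text + audio into a single batch of model inputs.
    text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
    audio, _ = librosa.load(audio_path, sr=processor.feature_extractor.sampling_rate)
    inputs = processor(text=text, audios=[audio], return_tensors="pt", padding=True).to("cuda")
    generate_ids = model.generate(**inputs, max_length=256)
    # Drop the prompt tokens so only the newly generated answer is decoded.
    generate_ids = generate_ids[:, inputs.input_ids.size(1):]
    return processor.batch_decode(generate_ids, skip_special_tokens=True)[0]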
@@ -1024,7 +1024,7 @@ def get_languages():
     }


-def generate_text_and_speech(input_text, system_prompt, input_audio, llm_model_type, llm_model_name, llm_lora_model_name, enable_web_search, enable_libretranslate, target_lang, enable_openparse, pdf_file, enable_multimodal, input_image, input_video, enable_tts,
+def generate_text_and_speech(input_text, system_prompt, input_audio, llm_model_type, llm_model_name, llm_lora_model_name, enable_web_search, enable_libretranslate, target_lang, enable_openparse, pdf_file, enable_multimodal, input_image, input_video, input_audio_mm, enable_tts,
                              llm_settings_html, max_new_tokens, max_length, min_length, n_ctx, n_batch, temperature, top_p, min_p, typical_p, top_k,
                              do_sample, early_stopping, stopping, repetition_penalty, frequency_penalty, presence_penalty, length_penalty, no_repeat_ngram_size, num_beams, num_return_sequences, chat_history_format, tts_settings_html, speaker_wav, language, tts_temperature, tts_top_p, tts_top_k, tts_speed, tts_repetition_penalty, tts_length_penalty, output_format):
     global chat_history, chat_dir, tts_model, whisper_model
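
The signature change matters because Gradio passes input components to the handler positionally: the new input_audio_mm parameter has to sit in the same slot of the function signature that the new audio component occupies in the inputs list. A toy illustration of that constraint, with made-up components rather than the app's real layout:

import gradio as gr

# Hypothetical mini-handler: the order of `inputs` below must match the
# parameter order of `handler`, so adding gr.Audio means adding a parameter
# in the corresponding position.
def handler(prompt, input_image, input_video, input_audio_mm, enable_tts):
    return f"audio file: {input_audio_mm}, tts enabled: {enable_tts}"

demo = gr.Interface(
    fn=handler,
    inputs=[
        gr.Textbox(label="Prompt"),
        gr.Image(label="Image", type="filepath"),
        gr.Video(label="Video"),
        gr.Audio(label="Audio", type="filepath"),  # new input, same slot as in the signature
        gr.Checkbox(label="Enable TTS"),
    ],
    outputs=gr.Textbox(label="Result"),
)
# demo.launch()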
@@ -1217,7 +1217,7 @@ def image_to_base64_data_uri(image_path):
                     return None, None, "Qwen2-Audio is not supported with llama model type."
                 else:
                     try:
-                        response = process_qwen2_audio(processor, model, input_audio, prompt)
+                        response = process_qwen2_audio(processor, model, input_audio_mm, prompt)
                         if not chat_history or chat_history[-1][1] is not None:
                             chat_history.append([prompt, ""])
                         chat_history[-1][1] = response
@@ -8863,6 +8863,7 @@ def reload_interface():
         gr.Checkbox(label=_("Enable Multimodal", lang), value=False),
         gr.Image(label=_("Upload your image (for Multimodal)", lang), type="filepath"),
         gr.Video(label=_("Upload your video (for Multimodal)", lang)),
+        gr.Audio(label=_("Upload your audio (for Multimodal)", lang), type="filepath"),
         gr.Checkbox(label=_("Enable TTS", lang), value=False),
         gr.HTML(_("<h3>LLM Settings</h3>", lang)),
         gr.Slider(minimum=256, maximum=32768, value=512, step=1, label=_("Max tokens", lang)),
@@ -11490,7 +11491,7 @@ def reload_interface():
     dropdowns_to_update = [
         chat_interface.input_components[4],
         chat_interface.input_components[5],
-        chat_interface.input_components[38],
+        chat_interface.input_components[40],
         tts_stt_interface.input_components[3],
         txt2img_interface.input_components[2],
         txt2img_interface.input_components[4],
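
The index bump in dropdowns_to_update is the other side of the same coin: components are addressed by their position in input_components, so inserting new inputs earlier in the list shifts the position of everything after them. A small hypothetical example of the shift (not the app's actual components or indices):

import gradio as gr

# Before: the model dropdown sits at index 2 of the inputs list.
inputs_before = [
    gr.Image(label="Image", type="filepath"),                    # index 0
    gr.Video(label="Video"),                                     # index 1
    gr.Dropdown(choices=["model-a", "model-b"], label="Model"),  # index 2
]

# After inserting a new audio input, the same dropdown moves to index 3,
# so any hard-coded reference like input_components[2] must become [3].
inputs_after = [
    gr.Image(label="Image", type="filepath"),                    # index 0
    gr.Video(label="Video"),                                     # index 1
    gr.Audio(label="Audio", type="filepath"),                    # index 2 (new)
    gr.Dropdown(choices=["model-a", "model-b"], label="Model"),  # index 3 (shifted)
]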
1 change: 1 addition & 0 deletions translations/ru.json
@@ -7,6 +7,7 @@
     "LLM and TTS Settings": "LLM и TTS настройки",
     "TTS and STT Settings": "TTS и STT настройки",
     "Upload your video (for Multimodal)": "Загрузите ваше видео (для мультимодального режима)",
+    "Upload your audio (for Multimodal)": "Загрузите свое аудио (для мультимодального режима)",
     "Max tokens": "Максимум токенов",
     "Min length": "Минимальная длина",
     "Context size (N_CTX) for llama type models": "Размер контекста (N_CTX) для моделей типа llama",
1 change: 1 addition & 0 deletions translations/zh.json
@@ -7,6 +7,7 @@
     "LLM and TTS Settings": "LLM和TTS设置",
     "TTS and STT Settings": "TTS和STT设置",
     "Upload your video (for Multimodal)": "上传您的视频(用于多模态)",
+    "Upload your audio (for Multimodal)": "上传您的音频(用于多模态)",
     "Max tokens": "最大令牌数",
     "Min length": "最小长度",
     "Context size (N_CTX) for llama type models": "llama类型模型的上下文大小 (N_CTX)",
