Lema_voice_edge_tts.py
import os
import time
import tkinter as tk
from tkinter import messagebox, filedialog
from llama_cpp import Llama
# agents imports
from agents.function import _FunctionAgent as FunctionAgent
from agents.routing import _RoutingAgent as RoutingAgent
from agents.chat import _ChatAgent as ChatAgent
from agents.speech_to_text import SpeechToText
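# Package layout implied by the imports above (assumed from the import paths;
# adjust to match the actual repository):
#   agents/
#       function.py        -> _FunctionAgent
#       routing.py         -> _RoutingAgent
#       chat.py            -> _ChatAgent
#       speech_to_text.py  -> SpeechToText
# KMP_DUPLICATE_LIB_OK below works around the Intel OpenMP "libiomp5 already
# initialized" crash that can occur when two native libraries (e.g. llama.cpp
# and the speech model runtime) each bundle their own OpenMP.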
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
class ModelPathSelector(tk.Tk):
    def __init__(self):
        super().__init__()
        self.title("LLM Path")
        # Get the screen width and height
        screen_width = self.winfo_screenwidth()
        screen_height = self.winfo_screenheight()
        # Calculate the x and y coordinates to center the window
        x = (screen_width / 2) - (350 / 2)
        y = (screen_height / 2) - (100 / 2)
        # Set the geometry of the window
        self.geometry(f'350x100+{int(x)}+{int(y)}')
        # Set the focus on the window
        self.focus_force()
        self.label = tk.Label(self, text="Select your GGUF file:")
        self.label.pack()
        self.entry = tk.Entry(self, width=55)
        self.entry.pack()
        button_frame = tk.Frame(self)
        self.browse_button = tk.Button(button_frame, text="Browse", command=self.browse_model_path,
                                       bg='orange', fg='black', font=('Helvetica', 10, 'bold'))
        self.browse_button.pack(side=tk.LEFT)
        self.confirm_button = tk.Button(button_frame, text="Confirm", command=self.confirm_model_path,
                                        bg='lightgreen', fg='black', font=('Helvetica', 10, 'bold'))
        self.confirm_button.pack(side=tk.LEFT)
        button_frame.pack()
    def browse_model_path(self):
        model_path = filedialog.askopenfilename()
        self.entry.delete(0, tk.END)
        self.entry.insert(0, model_path)
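    # Optional refinement (a sketch using tkinter's standard filetypes
    # argument): pre-filter the picker to GGUF files while still allowing
    # any file to be chosen:
    #
    #     model_path = filedialog.askopenfilename(
    #         filetypes=[("GGUF models", "*.gguf"), ("All files", "*.*")])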
    def confirm_model_path(self):
        model_path = self.entry.get()
        if os.path.exists(model_path):
            global llm
            ''' Args:
            model_path: Path to the model.
            n_gpu_layers: Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded.
            split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options.
            main_gpu: Interpretation depends on split_mode. LLAMA_SPLIT_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_LAYER: ignored.
            tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split.
            vocab_only: Only load the vocabulary, no weights.
            use_mmap: Use mmap if possible.
            use_mlock: Force the system to keep the model in RAM.
            kv_overrides: Key-value overrides for the model.
            seed: RNG seed, -1 for random.
            n_ctx: Text context, 0 = from model.
            n_batch: Prompt processing maximum batch size.
            n_threads: Number of threads to use for generation.
            n_threads_batch: Number of threads to use for batch processing.
            rope_scaling_type: RoPE scaling type, from `enum llama_rope_scaling_type`. ref: https://github.com/ggerganov/llama.cpp/pull/2054
            pooling_type: Pooling type, from `enum llama_pooling_type`.
            rope_freq_base: RoPE base frequency, 0 = from model.
            rope_freq_scale: RoPE frequency scaling factor, 0 = from model.
            yarn_ext_factor: YaRN extrapolation mix factor, negative = from model.
            yarn_attn_factor: YaRN magnitude scaling factor.
            yarn_beta_fast: YaRN low correction dim.
            yarn_beta_slow: YaRN high correction dim.
            yarn_orig_ctx: YaRN original context size.
            logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs.
            embedding: Embedding mode only.
            offload_kqv: Offload K, Q, V to GPU.
            flash_attn: Use flash attention.
            last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
            lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
            lora_path: Path to a LoRA file to apply to the model.
            numa: NUMA policy.
            chat_format: String specifying the chat format to use when calling create_chat_completion.
            chat_handler: Optional chat handler to use when calling create_chat_completion.
            draft_model: Optional draft model to use for speculative decoding.
            tokenizer: Optional tokenizer to override the default tokenizer from llama.cpp.
            verbose: Print verbose output to stderr.
            type_k: KV cache data type for K (default: f16).
            type_v: KV cache data type for V (default: f16). '''
            # type_k/type_v take ggml_type values: 0 = f32, 1 = f16, 2 = q4_0
            llm = Llama(model_path=model_path, n_gpu_layers=-1, n_ctx=8192, n_batch=120,
                        flash_attn=True, type_k=1, type_v=1)
            self.destroy()
        else:
            messagebox.showerror("Error", "Invalid model path")
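# A more conservative configuration (a sketch assuming the same
# llama-cpp-python API as above) for machines without a usable GPU;
# n_gpu_layers=0 keeps every layer on the CPU:
#
#     llm = Llama(model_path=model_path, n_gpu_layers=0, n_ctx=4096)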
class Application:
    def __init__(self, llm):
        self.llm = llm
        self.listening = True
        self.speech_to_text = SpeechToText()  # loading to VRAM
        # Construct each agent once and reuse it for every message
        self.chat_agent = ChatAgent(llm)
        self.function_agent = FunctionAgent(llm)
        self.routing_agent = RoutingAgent(llm)
        self.current_message = ""
        self.user_input_history = []
        self.history_index = 0
    def _process_message(self, user_input):
        # Route the transcribed input either to a tool call or to plain chat
        selected_agent = self.routing_agent.select_agent(user_input)
        if "function_call" in selected_agent:
            command = self.function_agent.extract_command(user_input)
            print(command)
            # Commands that are executed directly, without a chat reply first
            _no_chat_agent = ["set_timer", "current_time", "current_date"]
            if command in _no_chat_agent:
                self.function_agent._execute_function(command, user_input)
            else:
                self.chat_agent.send_message(user_input)
                self.function_agent._execute_function(command, user_input)
        else:
            self.chat_agent.send_message(user_input)
        self._listen_for_input()
    def _listen_for_input(self):
        try:
            user_input = self.speech_to_text.start_listening()
            if user_input:
                # Record the input before processing it, since _process_message
                # re-enters this listener and would delay the append
                self.user_input_history.append(user_input)
                self.history_index = len(self.user_input_history) - 1
                self._process_message(user_input)
        except Exception as e:
            print('exception:', e)
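    # Note: _process_message and _listen_for_input call each other, so every
    # utterance adds a stack frame and a very long session could eventually
    # hit Python's recursion limit. An iterative alternative (a sketch; it
    # assumes _process_message no longer tail-calls _listen_for_input):
    #
    #     def run(self):
    #         while self.listening:
    #             user_input = self.speech_to_text.start_listening()
    #             if user_input:
    #                 self._process_message(user_input)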
    def on_closing(self):
        self.listening = False

if __name__ == "__main__":
    selector = ModelPathSelector()
    selector.mainloop()
    # If the window was closed without confirming a path, llm was never created
    if 'llm' not in globals():
        raise SystemExit("No model selected.")
    app = Application(llm)
    app.chat_agent.send_message("Hello.")
    app._listen_for_input()
    while True:
        # Keep the application running until it's stopped manually;
        # sleep so the loop doesn't busy-wait on a full CPU core
        time.sleep(0.1)
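# Rough run recipe (dependency names are assumptions based on the imports and
# the file name; check the repository's own requirements):
#   pip install llama-cpp-python edge-tts
#   python Lema_voice_edge_tts.py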