@@ -25,14 +25,15 @@ def generate(self, state: AppState, data: Query) -> ServerSentEvent:
         an endpoint for generating text directly from the LLM model
         """
         chat = state.chat
+        tokeniser = chat.tokeniser

-        prompt = chat.tokeniser.apply_chat_template(
+        prompt = tokeniser.apply_chat_template(
             [{'role': 'user', 'content': data.query}],
             tokenize=False,
             add_generation_prompt=True,
         )

-        return ServerSentEvent(chat.generate(chat.tokeniser(prompt).tokens()))
+        return ServerSentEvent(chat.generate(tokeniser(prompt).tokens()))

     @post('/benchmark', sync_to_thread=True)
     def benchmark(self, state: AppState, data: Query) -> Benchmark:
@@ -42,15 +43,17 @@ def benchmark(self, state: AppState, data: Query) -> Benchmark:
         an endpoint for benchmarking the LLM model
         """
         chat = state.chat
+        tokeniser = chat.tokeniser
+
         message: Message = {'role': 'user', 'content': data.query}
-        prompt = chat.tokeniser.apply_chat_template([message], add_generation_prompt=True, tokenize=False)
-        tokenised_prompt = chat.tokeniser(prompt).tokens()
+        prompt = tokeniser.apply_chat_template([message], add_generation_prompt=True, tokenize=False)
+        tokenised_prompt = tokeniser(prompt).tokens()

         start = perf_counter()
         response = ''.join(chat.generate(tokenised_prompt))
         total_time = perf_counter() - start

-        output_tokens = chat.tokeniser(response).tokens()
+        output_tokens = tokeniser(response).tokens()
         total_tokens = len(tokenised_prompt) + len(chat) + len(output_tokens)

         return Benchmark(
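For reference, a minimal sketch of exercising the refactored /benchmark handler from a client once the (apparently Litestar) app is running; the host, port, timeout, and example query text are assumptions, while the route and the query field name are taken from the handler above.

    # Hypothetical client call; assumes the app listens on localhost:8000 and that
    # the Query payload serialises as {'query': ...} (field name taken from data.query).
    import httpx

    reply = httpx.post(
        'http://localhost:8000/benchmark',
        json={'query': 'Summarise what a tokeniser does in one sentence.'},
        timeout=60.0,  # generation runs synchronously in a worker thread and can be slow
    )
    reply.raise_for_status()
    print(reply.json())  # the Benchmark payload returned by the handler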