import asyncio
import logging
import warnings
from queue import Queue

import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse

from utils.payloads import InferencePayload
from utils.streamer import CustomStreamer
from utils.loader import ITREXLoader
from utils.generation import start_generation

app = FastAPI()

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
warnings.filterwarnings("ignore")
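
# For reference, `InferencePayload` (defined in utils/payloads.py) is presumably
# a Pydantic model along these lines -- the field names are inferred from how
# the payload is used in the /query-stream/ endpoint below:
#
#   from pydantic import BaseModel
#
#   class InferencePayload(BaseModel):
#       query: str
#       selected_model: str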
# Load the model and tokenizer once at startup, not per request
# (the per-request queue and streamer are created in the endpoint below)
logger.info("loading Intel/neural-chat-7b-v1-1")
neural_chat_7B_v1_1_Model, neural_chat_7B_v1_1_Tokenizer = ITREXLoader(
    "Intel/neural-chat-7b-v1-1"
)
# Generation initiator and streaming response producer
async def response_generator(query, model, tokenizer, streamer, streamer_queue):
    """Kick off generation and yield streamed chunks as they arrive."""
    start_generation(query, model, tokenizer, streamer)
    while True:
        # Queue.get() blocks; the sleep below hands control back to the
        # event loop between chunks
        value = streamer_queue.get()
        if value is None:  # sentinel placed on the queue when generation ends
            break
        yield value
        streamer_queue.task_done()
        await asyncio.sleep(0.1)
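
# `start_generation` (utils/generation.py) is expected to launch
# model.generate() in a background thread so the streamer can feed the queue
# while the coroutine above drains it. A minimal sketch under that assumption,
# not the project's actual implementation:
#
#   from threading import Thread
#
#   def start_generation(query, model, tokenizer, streamer):
#       inputs = tokenizer(query, return_tensors="pt")
#       kwargs = dict(**inputs, streamer=streamer, max_new_tokens=512)
#       Thread(target=model.generate, kwargs=kwargs).start()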
@app.get("/ping")
async def ping():
"""Ping server to determine status
Returns
-------
API response
response from server on health status
"""
return {"message":"Server is Running"}
@app.get("/query-stream/")
async def stream(payload: InferencePayload):
    logger.info(f"Query received: {payload.query}")
    model = payload.selected_model
    if model == "Intel/neural-chat-7b-v1-1":
        logger.info("Intel/neural-chat-7b-v1-1 selected for inference")
        # A fresh queue and streamer per request keeps concurrent streams separate
        neural_chat_7B_v1_1_streamer_queue = Queue()
        neural_chat_7B_v1_1_Streamer = CustomStreamer(
            neural_chat_7B_v1_1_streamer_queue, neural_chat_7B_v1_1_Tokenizer, True
        )
        return StreamingResponse(
            response_generator(payload.query, neural_chat_7B_v1_1_Model,
                               neural_chat_7B_v1_1_Tokenizer, neural_chat_7B_v1_1_Streamer,
                               neural_chat_7B_v1_1_streamer_queue),
            media_type="text/event-stream",
        )
    raise HTTPException(status_code=400, detail=f"Unknown model: {model}")
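
# `CustomStreamer` (utils/streamer.py) is expected to push each decoded text
# chunk onto the queue and a final `None` sentinel when generation ends, which
# is the contract response_generator relies on. A minimal sketch, assuming it
# subclasses transformers.TextStreamer:
#
#   from transformers import TextStreamer
#
#   class CustomStreamer(TextStreamer):
#       def __init__(self, queue, tokenizer, skip_prompt, **kwargs):
#           super().__init__(tokenizer, skip_prompt, **kwargs)
#           self._queue = queue
#
#       def on_finalized_text(self, text, stream_end=False):
#           self._queue.put(text)      # hand each decoded chunk to the consumer
#           if stream_end:
#               self._queue.put(None)  # sentinel: generation finished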
if __name__ == "__main__":
    uvicorn.run("serve:app", host="0.0.0.0", port=8080, log_level="info")
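
# Example client for the streaming endpoint (illustrative; the exact payload
# schema comes from utils/payloads.py). The JSON body is sent with GET here
# because that is how the route is registered above:
#
#   import requests
#
#   with requests.get(
#       "http://localhost:8080/query-stream/",
#       json={"query": "Hello", "selected_model": "Intel/neural-chat-7b-v1-1"},
#       stream=True,
#   ) as resp:
#       for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
#           print(chunk, end="", flush=True)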