Using llama2 chat model
souradipp76 committed Apr 24, 2024
1 parent 91d806f commit 5887109
Showing 6 changed files with 87 additions and 17 deletions.
5 changes: 3 additions & 2 deletions doc_generator/index/createVectorStore.py
@@ -2,11 +2,12 @@
from pathlib import Path
from typing import List

-from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_core.document_loaders import BaseLoader
from doc_generator.utils.HNSWLib import HNSWLib
+from doc_generator.utils.LLMUtils import get_embeddings
+from doc_generator.types import LLMModels

def processFile(filePath: str) -> Document:
def read_file(path):
@@ -57,5 +58,5 @@ def createVectorStore(root: str, output: str) -> None:
textSplitter = RecursiveCharacterTextSplitter(chunk_size=8000, chunk_overlap=100)
docs = textSplitter.split_documents(rawDocs)
# Create the vectorstore
-vectorStore = HNSWLib.from_documents(docs, OpenAIEmbeddings())
+vectorStore = HNSWLib.from_documents(docs, get_embeddings(LLMModels.LLAMA2_7B_CHAT_GPTQ))
vectorStore.save(output)
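
For reference, the indexing entry point keeps its original signature; only the embedding backend changes. A minimal usage sketch, with illustrative paths that are not taken from the commit:

from doc_generator.index.createVectorStore import createVectorStore

# Build an HNSWLib vector store for a local repository; after this commit the
# embeddings are computed locally via the Llama 2 configuration instead of OpenAI.
createVectorStore(root="./my-project", output="./output/docs/data")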
4 changes: 2 additions & 2 deletions doc_generator/index/processRepository.py
@@ -3,7 +3,7 @@
import os
from pathlib import Path
-from langchain_openai import ChatOpenAI

+from langchain_experimental.chat_models import Llama2Chat
import tiktoken

from doc_generator.types import (
@@ -40,7 +40,7 @@ def write_file(path, content):
with open(path, "w", encoding="utf-8") as file:
file.write(content)

-def callLLM(prompt: str, model: Llama2Chat):
+def callLLM(prompt: str, model: Llama2Chat):
return model.invoke(prompt)

def isModel(model):
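
The new callLLM signature expects a Llama2Chat instance such as the one registered in the models table in LLMUtils.py (shown further down). A hedged sketch of driving it directly; importing the module-level models table this way and the prompt text are illustrative, not part of the commit:

from doc_generator.types import LLMModels
from doc_generator.utils.LLMUtils import models
from doc_generator.index.processRepository import callLLM

# Look up the quantized Llama 2 chat model and send it one prompt;
# Llama2Chat is a LangChain chat model, so .invoke() returns a chat message.
details = models[LLMModels.LLAMA2_7B_CHAT_GPTQ]
print(callLLM("Summarize what this repository does.", details.llm))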
4 changes: 2 additions & 2 deletions doc_generator/query/__init__.py
@@ -1,12 +1,12 @@
import os
from prompt_toolkit import prompt
from prompt_toolkit.shortcuts import clear
-from langchain_openai import OpenAIEmbeddings
from markdown2 import markdown

from doc_generator.types import AutodocRepoConfig, AutodocUserConfig
from doc_generator.utils.HNSWLib import HNSWLib
from doc_generator.utils.createChatChain import make_chain
+from doc_generator.utils.LLMUtils import get_embeddings

chat_history = []

@@ -16,7 +16,7 @@ def display_welcome_message(project_name):

def query(repo_config: AutodocRepoConfig, user_confg: AutodocUserConfig):
data_path = os.path.join(repo_config.output, 'docs', 'data')
-embeddings = OpenAIEmbeddings()
+embeddings = get_embeddings(repo_config.llms[0])
vector_store = HNSWLib.load(data_path, embeddings)
chain = make_chain(repo_config.name,
repo_config.repository_url,
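
Because the index is now built with get_embeddings, querying must reload the store with the same embedding backend, which is exactly what the repo_config.llms[0] lookup above does. A minimal sketch of that pairing; the data path is illustrative:

from doc_generator.types import LLMModels
from doc_generator.utils.HNSWLib import HNSWLib
from doc_generator.utils.LLMUtils import get_embeddings

# Reload the index with the same embedding family it was created with.
embeddings = get_embeddings(LLMModels.LLAMA2_7B_CHAT_GPTQ)
vector_store = HNSWLib.load("./output/docs/data", embeddings)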
3 changes: 2 additions & 1 deletion doc_generator/types.py
@@ -6,6 +6,7 @@ class LLMModels(str, Enum):
GPT3 = "gpt-3.5-turbo"
GPT4 = "gpt-4"
GPT432k = "gpt-4-32k"
+LLAMA2_7B_CHAT_GPTQ = "TheBloke/Llama-2-7B-Chat-GPTQ"

class Priority(str, Enum):
COST = 'cost'
@@ -100,7 +101,7 @@ def __init__(self, input_path: str, project_name: str, process_file: Optional[Ca

class LLMModelDetails:
def __init__(self, name: LLMModels, input_cost_per_1k_tokens: float, output_cost_per_1k_tokens: float,
-max_length: int, llm: ChatOpenAI, input_tokens: int, output_tokens: int, succeeded: int,
+max_length: int, llm: Llama2Chat, input_tokens: int, output_tokens: int, succeeded: int,
failed: int, total: int):
self.name = name
self.input_cost_per_1k_tokens = input_cost_per_1k_tokens
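
Because LLMModels subclasses str, the new member can be passed directly wherever a Hugging Face repo id string is expected (for example to the from_pretrained calls in LLMUtils.py below). A quick illustration of that behavior:

from doc_generator.types import LLMModels

# The enum member compares equal to its underlying repo id string.
model_id = LLMModels.LLAMA2_7B_CHAT_GPTQ
assert model_id == "TheBloke/Llama-2-7B-Chat-GPTQ"
print(model_id.value)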
67 changes: 65 additions & 2 deletions doc_generator/utils/LLMUtils.py
@@ -1,6 +1,11 @@
import os
+import torch

-from langchain_openai import ChatOpenAI
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain_openai import OpenAIEmbeddings, ChatOpenAI
+from langchain_experimental.chat_models import Llama2Chat
+from langchain import HuggingFacePipeline
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, pipeline, BitsAndBytesConfig
from doc_generator.types import LLMModelDetails, LLMModels

models = {
@@ -39,6 +44,36 @@
succeeded=0,
failed=0,
total=0
),
+    LLMModels.LLAMA2_7B_CHAT_GPTQ: LLMModelDetails(
+        name=LLMModels.LLAMA2_7B_CHAT_GPTQ,
+        input_cost_per_1k_tokens=0,
+        output_cost_per_1k_tokens=0,
+        max_length=4096,
+        llm=Llama2Chat(llm=HuggingFacePipeline(pipeline=pipeline(
+            "text-generation",
+            model=AutoModelForCausalLM.from_pretrained(
+                LLMModels.LLAMA2_7B_CHAT_GPTQ,
+                quantization_config=BitsAndBytesConfig(
+                    load_in_4bit=True,
+                    bnb_4bit_quant_type='nf4',
+                    bnb_4bit_use_double_quant=True,
+                    bnb_4bit_compute_dtype=torch.bfloat16
+                ),
+                torch_dtype=torch.float16,
+                trust_remote_code=True,
+                device_map="auto"
+            ),
+            tokenizer=AutoTokenizer.from_pretrained(LLMModels.LLAMA2_7B_CHAT_GPTQ, use_fast=True),
+            generation_config=AutoConfig.from_pretrained(
+                LLMModels.LLAMA2_7B_CHAT_GPTQ,
+            ),
+        ), model_kwargs={"temperature": 0})),
+        input_tokens=0,
+        output_tokens=0,
+        succeeded=0,
+        failed=0,
+        total=0
+    )
}

@@ -69,10 +104,38 @@ def print_model_details(models):
for item in all_results:
print(item)

-def total_index_cost_estimate(models):
+def total_index_cost_estimate(model):
total_cost = sum(
(model.input_tokens / 1000) * model.input_cost_per_1k_tokens +
(model.output_tokens / 1000) * model.output_cost_per_1k_tokens
for model in models.values()
)
return total_cost

+def get_embeddings(model):
+    if model == LLMModels.LLAMA2_7B_CHAT_GPTQ:
+        return HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2",
+                                     model_kwargs={"device": "cuda"},
+                                     encode_kwargs={"normalize_embeddings": True},
+                                     )
+    else:
+        return OpenAIEmbeddings()
+
+def get_chat_model(model_name, model_kwargs):
+    return Llama2Chat(llm=HuggingFacePipeline(pipeline=pipeline(
+        "text-generation",
+        model=AutoModelForCausalLM.from_pretrained(
+            model_name,
+            quantization_config=BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_quant_type='nf4',
+                bnb_4bit_use_double_quant=True,
+                bnb_4bit_compute_dtype=torch.bfloat16
+            ),
+            torch_dtype=torch.float16,
+            trust_remote_code=True,
+            device_map="auto"
+        ),
+        tokenizer=AutoTokenizer.from_pretrained(model_name, use_fast=True),
+        generation_config=AutoConfig.from_pretrained(model_name),
+    ), model_kwargs=model_kwargs))
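
Taken together, get_chat_model and get_embeddings let the rest of the pipeline run without an OpenAI key, at the cost of loading a 4-bit quantized model onto a GPU. A hedged usage sketch, assuming a CUDA device and the bitsandbytes/GPTQ dependencies are installed; the prompt text is illustrative:

from doc_generator.types import LLMModels
from doc_generator.utils.LLMUtils import get_chat_model, get_embeddings

# Instantiate the quantized Llama 2 chat model and the matching local embeddings.
chat_model = get_chat_model(LLMModels.LLAMA2_7B_CHAT_GPTQ, {"temperature": 0.2})
embeddings = get_embeddings(LLMModels.LLAMA2_7B_CHAT_GPTQ)

print(chat_model.invoke("Explain what a vector store is in one sentence."))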
21 changes: 13 additions & 8 deletions doc_generator/utils/createChatChain.py
@@ -3,6 +3,7 @@
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
+from doc_generator.utils.LLMUtils import get_chat_model

# Define the prompt template for condensing the follow-up question
condense_prompt = PromptTemplate.from_template(
@@ -38,19 +39,23 @@ def make_qa_prompt(project_name, repository_url, content_type, chat_prompt, targ
def make_chain(project_name, repository_url, content_type, chat_prompt, target_audience, vectorstore, llms, on_token_stream=None):
llm = llms[1] if len(llms) > 1 else llms[0]
question_generator = LLMChain(
-llm=ChatOpenAI(temperature=0.1, model_name=llm),
+# llm=ChatOpenAI(temperature=0.1, model_name=llm),
+llm=get_chat_model(llm, {"temperature": 0.1}),
prompt=condense_prompt
)

qa_prompt = make_qa_prompt(project_name, repository_url, content_type, chat_prompt, target_audience)
doc_chain = load_qa_chain(
-llm=ChatOpenAI(temperature=0.2,
-    model_name=llm,
-    streaming=bool(on_token_stream),
-    model_kwargs={
-        "frequency_penalty": 0.0,
-        "presence_penalty": 0.0,
-    }),
+# llm=ChatOpenAI(temperature=0.2,
+#     model_name=llm,
+#     streaming=bool(on_token_stream),
+#     model_kwargs={
+#         "frequency_penalty": 0.0,
+#         "presence_penalty": 0.0,
+#     }),
+llm=get_chat_model(llm, {
+    "temperature": 0.2
+}),
prompt=qa_prompt
)
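
With both chains now built through get_chat_model, the conversational chain can be assembled end to end without OpenAI. A minimal sketch of calling make_chain with the new model enum; all literal values below are placeholders, not taken from the commit:

from doc_generator.types import LLMModels
from doc_generator.utils.HNSWLib import HNSWLib
from doc_generator.utils.LLMUtils import get_embeddings
from doc_generator.utils.createChatChain import make_chain

llms = [LLMModels.LLAMA2_7B_CHAT_GPTQ]
vector_store = HNSWLib.load("./output/docs/data", get_embeddings(llms[0]))
chain = make_chain(
    "doc_generator",                         # project_name (placeholder)
    "https://github.com/user/repo",          # repository_url (placeholder)
    "code",                                  # content_type
    "Answer questions about the codebase.",  # chat_prompt
    "developers",                            # target_audience
    vector_store,
    llms,
)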
