Using llama2 chat model
souradipp76 committed Apr 24, 2024
1 parent 91d806f commit 5887109
Showing 6 changed files with 87 additions and 17 deletions.
5 changes: 3 additions & 2 deletions doc_generator/index/createVectorStore.py
@@ -2,11 +2,12 @@
from pathlib import Path
from typing import List

-from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_core.document_loaders import BaseLoader
from doc_generator.utils.HNSWLib import HNSWLib
+from doc_generator.utils.LLMUtils import get_embeddings
+from doc_generator.types import LLMModels

def processFile(filePath: str) -> Document:
def read_file(path):
@@ -57,5 +58,5 @@ def createVectorStore(root: str, output: str) -> None:
textSplitter = RecursiveCharacterTextSplitter(chunk_size=8000, chunk_overlap=100)
docs = textSplitter.split_documents(rawDocs)
# Create the vectorstore
-vectorStore = HNSWLib.from_documents(docs, OpenAIEmbeddings())
+vectorStore = HNSWLib.from_documents(docs, get_embeddings(LLMModels.LLAMA2_7B_CHAT_GPTQ))
vectorStore.save(output)
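
For reference, the indexing entry point keeps its original signature; only the embedding backend changes. A minimal usage sketch, with illustrative paths that are not taken from the commit:

from doc_generator.index.createVectorStore import createVectorStore

# Build an HNSWLib vector store for a local repository; after this commit the
# embeddings are computed locally via the Llama 2 configuration instead of OpenAI.
createVectorStore(root="./my-project", output="./output/docs/data")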
4 changes: 2 additions & 2 deletions doc_generator/index/processRepository.py
@@ -3,7 +3,7 @@
import os
from pathlib import Path
-from langchain_openai import ChatOpenAI

+from langchain_experimental.chat_models import Llama2Chat
import tiktoken

from doc_generator.types import (
@@ -40,7 +40,7 @@ def write_file(path, content):
with open(path, "w", encoding="utf-8") as file:
file.write(content)

-def callLLM(prompt: str, model: Llama2Chat):
+def callLLM(prompt: str, model: Llama2Chat):
return model.invoke(prompt)

def isModel(model):
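
The new callLLM signature expects a Llama2Chat instance such as the one registered in the models table in LLMUtils.py (shown further down). A hedged sketch of driving it directly; importing the module-level models table this way and the prompt text are illustrative, not part of the commit:

from doc_generator.types import LLMModels
from doc_generator.utils.LLMUtils import models
from doc_generator.index.processRepository import callLLM

# Look up the quantized Llama 2 chat model and send it one prompt;
# Llama2Chat is a LangChain chat model, so .invoke() returns a chat message.
details = models[LLMModels.LLAMA2_7B_CHAT_GPTQ]
print(callLLM("Summarize what this repository does.", details.llm))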
4 changes: 2 additions & 2 deletions doc_generator/query/__init__.py
@@ -1,12 +1,12 @@
import os
from prompt_toolkit import prompt
from prompt_toolkit.shortcuts import clear
-from langchain_openai import OpenAIEmbeddings
from markdown2 import markdown

from doc_generator.types import AutodocRepoConfig, AutodocUserConfig
from doc_generator.utils.HNSWLib import HNSWLib
from doc_generator.utils.createChatChain import make_chain
+from doc_generator.utils.LLMUtils import get_embeddings

chat_history = []

@@ -16,7 +16,7 @@ def display_welcome_message(project_name):

def query(repo_config: AutodocRepoConfig, user_confg: AutodocUserConfig):
data_path = os.path.join(repo_config.output, 'docs', 'data')
-embeddings = OpenAIEmbeddings()
+embeddings = get_embeddings(repo_config.llms[0])
vector_store = HNSWLib.load(data_path, embeddings)
chain = make_chain(repo_config.name,
repo_config.repository_url,
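
Because the index is now built with get_embeddings, querying must reload the store with the same embedding backend, which is exactly what the repo_config.llms[0] lookup above does. A minimal sketch of that pairing; the data path is illustrative:

from doc_generator.types import LLMModels
from doc_generator.utils.HNSWLib import HNSWLib
from doc_generator.utils.LLMUtils import get_embeddings

# Reload the index with the same embedding family it was created with.
embeddings = get_embeddings(LLMModels.LLAMA2_7B_CHAT_GPTQ)
vector_store = HNSWLib.load("./output/docs/data", embeddings)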
3 changes: 2 additions & 1 deletion doc_generator/types.py
@@ -6,6 +6,7 @@ class LLMModels(str, Enum):
GPT3 = "gpt-3.5-turbo"
GPT4 = "gpt-4"
GPT432k = "gpt-4-32k"
+LLAMA2_7B_CHAT_GPTQ = "TheBloke/Llama-2-7B-Chat-GPTQ"

class Priority(str, Enum):
COST = 'cost'
@@ -100,7 +101,7 @@ def __init__(self, input_path: str, project_name: str, process_file: Optional[Ca

class LLMModelDetails:
def __init__(self, name: LLMModels, input_cost_per_1k_tokens: float, output_cost_per_1k_tokens: float,
-max_length: int, llm: ChatOpenAI, input_tokens: int, output_tokens: int, succeeded: int,
+max_length: int, llm: Llama2Chat, input_tokens: int, output_tokens: int, succeeded: int,
failed: int, total: int):
self.name = name
self.input_cost_per_1k_tokens = input_cost_per_1k_tokens
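
Because LLMModels subclasses str, the new member can be passed directly wherever a Hugging Face repo id string is expected (for example to the from_pretrained calls in LLMUtils.py below). A quick illustration of that behavior:

from doc_generator.types import LLMModels

# The enum member compares equal to its underlying repo id string.
model_id = LLMModels.LLAMA2_7B_CHAT_GPTQ
assert model_id == "TheBloke/Llama-2-7B-Chat-GPTQ"
print(model_id.value)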
67 changes: 65 additions & 2 deletions doc_generator/utils/LLMUtils.py
@@ -1,6 +1,11 @@
import os
+import torch

-from langchain_openai import ChatOpenAI
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain_openai import OpenAIEmbeddings, ChatOpenAI
+from langchain_experimental.chat_models import Llama2Chat
+from langchain import HuggingFacePipeline
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, pipeline, BitsAndBytesConfig
from doc_generator.types import LLMModelDetails, LLMModels

models = {
@@ -39,6 +44,36 @@
succeeded=0,
failed=0,
total=0
),
+    LLMModels.LLAMA2_7B_CHAT_GPTQ: LLMModelDetails(
+        name=LLMModels.LLAMA2_7B_CHAT_GPTQ,
+        input_cost_per_1k_tokens=0,
+        output_cost_per_1k_tokens=0,
+        max_length=4096,
+        llm=Llama2Chat(llm=HuggingFacePipeline(pipeline=pipeline(
+            "text-generation",
+            model=AutoModelForCausalLM.from_pretrained(
+                LLMModels.LLAMA2_7B_CHAT_GPTQ,
+                quantization_config=BitsAndBytesConfig(
+                    load_in_4bit=True,
+                    bnb_4bit_quant_type='nf4',
+                    bnb_4bit_use_double_quant=True,
+                    bnb_4bit_compute_dtype=torch.bfloat16
+                ),
+                torch_dtype=torch.float16,
+                trust_remote_code=True,
+                device_map="auto"
+            ),
+            tokenizer=AutoTokenizer.from_pretrained(LLMModels.LLAMA2_7B_CHAT_GPTQ, use_fast=True),
+            generation_config=AutoConfig.from_pretrained(
+                LLMModels.LLAMA2_7B_CHAT_GPTQ,
+            ),
+        ), model_kwargs={"temperature": 0})),
+        input_tokens=0,
+        output_tokens=0,
+        succeeded=0,
+        failed=0,
+        total=0
+    )
}

@@ -69,10 +104,38 @@ def print_model_details(models):
for item in all_results:
print(item)

-def total_index_cost_estimate(models):
+def total_index_cost_estimate(model):
total_cost = sum(
(model.input_tokens / 1000) * model.input_cost_per_1k_tokens +
(model.output_tokens / 1000) * model.output_cost_per_1k_tokens
for model in models.values()
)
return total_cost

+def get_embeddings(model):
+    if model == LLMModels.LLAMA2_7B_CHAT_GPTQ:
+        return HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2",
+                                     model_kwargs={"device": "cuda"},
+                                     encode_kwargs={"normalize_embeddings": True},
+                                     )
+    else:
+        return OpenAIEmbeddings()
+
+def get_chat_model(model_name, model_kwargs):
+    return Llama2Chat(llm=HuggingFacePipeline(pipeline=pipeline(
+        "text-generation",
+        model=AutoModelForCausalLM.from_pretrained(
+            model_name,
+            quantization_config=BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_quant_type='nf4',
+                bnb_4bit_use_double_quant=True,
+                bnb_4bit_compute_dtype=torch.bfloat16
+            ),
+            torch_dtype=torch.float16,
+            trust_remote_code=True,
+            device_map="auto"
+        ),
+        tokenizer=AutoTokenizer.from_pretrained(model_name, use_fast=True),
+        generation_config=AutoConfig.from_pretrained(model_name),
+    ), model_kwargs=model_kwargs))
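
Taken together, get_chat_model and get_embeddings let the rest of the pipeline run without an OpenAI key, at the cost of loading a 4-bit quantized model onto a GPU. A hedged usage sketch, assuming a CUDA device and the bitsandbytes/GPTQ dependencies are installed; the prompt text is illustrative:

from doc_generator.types import LLMModels
from doc_generator.utils.LLMUtils import get_chat_model, get_embeddings

# Instantiate the quantized Llama 2 chat model and the matching local embeddings.
chat_model = get_chat_model(LLMModels.LLAMA2_7B_CHAT_GPTQ, {"temperature": 0.2})
embeddings = get_embeddings(LLMModels.LLAMA2_7B_CHAT_GPTQ)

print(chat_model.invoke("Explain what a vector store is in one sentence."))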
21 changes: 13 additions & 8 deletions doc_generator/utils/createChatChain.py
@@ -3,6 +3,7 @@
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
+from doc_generator.utils.LLMUtils import get_chat_model

# Define the prompt template for condensing the follow-up question
condense_prompt = PromptTemplate.from_template(
@@ -38,19 +39,23 @@ def make_qa_prompt(project_name, repository_url, content_type, chat_prompt, targ
def make_chain(project_name, repository_url, content_type, chat_prompt, target_audience, vectorstore, llms, on_token_stream=None):
llm = llms[1] if len(llms) > 1 else llms[0]
question_generator = LLMChain(
-llm=ChatOpenAI(temperature=0.1, model_name=llm),
+# llm=ChatOpenAI(temperature=0.1, model_name=llm),
+llm=get_chat_model(llm, {"temperature": 0.1}),
prompt=condense_prompt
)

qa_prompt = make_qa_prompt(project_name, repository_url, content_type, chat_prompt, target_audience)
doc_chain = load_qa_chain(
-llm=ChatOpenAI(temperature=0.2,
-    model_name=llm,
-    streaming=bool(on_token_stream),
-    model_kwargs={
-        "frequency_penalty": 0.0,
-        "presence_penalty": 0.0,
-    }),
+# llm=ChatOpenAI(temperature=0.2,
+#     model_name=llm,
+#     streaming=bool(on_token_stream),
+#     model_kwargs={
+#         "frequency_penalty": 0.0,
+#         "presence_penalty": 0.0,
+#     }),
+llm=get_chat_model(llm, {
+    "temperature": 0.2
+}),
prompt=qa_prompt
)
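
With both chains now built through get_chat_model, the conversational chain can be assembled end to end without OpenAI. A minimal sketch of calling make_chain with the new model enum; all literal values below are placeholders, not taken from the commit:

from doc_generator.types import LLMModels
from doc_generator.utils.HNSWLib import HNSWLib
from doc_generator.utils.LLMUtils import get_embeddings
from doc_generator.utils.createChatChain import make_chain

llms = [LLMModels.LLAMA2_7B_CHAT_GPTQ]
vector_store = HNSWLib.load("./output/docs/data", get_embeddings(llms[0]))
chain = make_chain(
    "doc_generator",                         # project_name (placeholder)
    "https://github.com/user/repo",          # repository_url (placeholder)
    "code",                                  # content_type
    "Answer questions about the codebase.",  # chat_prompt
    "developers",                            # target_audience
    vector_store,
    llms,
)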
