From 3bae482a185269ad20735f5a908d187d577e83cd Mon Sep 17 00:00:00 2001 From: Michael Antoun Date: Sat, 11 Nov 2023 17:20:12 -0800 Subject: [PATCH] functionality to run script on all new entries --- document_analysis.py | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/document_analysis.py b/document_analysis.py index 573c149..6317f34 100644 --- a/document_analysis.py +++ b/document_analysis.py @@ -1,10 +1,12 @@ from VectorDatabase import Lantern, Publication, Fragment from google_sheets import SheetsApiClient +from prompts import get_qbi_hackathon_prompt from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain.chat_models import ChatOpenAI from langchain.chains import RetrievalQA +from langchain.embeddings.openai import OpenAIEmbeddings from langchain import PromptTemplate from datetime import date @@ -15,9 +17,15 @@ class DocumentAnalyzer: """ def __init__(self): - # self.lantern = Lantern() + self.lantern = Lantern() self.sheets = SheetsApiClient() + self.llm = LlmHandler() + def analyze_all_unread(self): + """pulls all new files from Lantern database, evaluates them, and publishes results to google sheets + """ + publications = lantern.getUnreadPublications() + self.process_publications(publications) def process_publications(self, publications: [Publication]): """takes a list of publications, applies retrievalQA and processes responses @@ -67,19 +75,22 @@ def update_spreadsheet(rows: [], hits: int, notify=True): sheets.notify_arthur(message=msg) - def analyze_publication(self, publication: Publication): - """leaving this blank for now because i think the way these are stored is changing + def analyze_publication(self, text_embeddings: []): + """poses a question about the document, processes the result and returns it + NOTE: for now, only uses the hackathon question, might add more later Args: - publication (Publication): publication to be analyzed + text_embeddings ([]): list of (embedding, text) pairs from document to be analyzed Returns: bool: classification of response to query as positive (True) or negative (False) str: response from chatGPT """ - #faissIndex = FAISS.from_embeddings(text_embeddings=text_embeddings, embedding=open_ai_emb) - #result = llm.evaluate_queries(faissIndex, query) - response = None + # NOTE: These very likely need to change + open_ai_emb = OpenAIEmbeddings() + query = get_qbi_hackathon_prompt() + faissIndex = FAISS.from_embeddings(text_embeddings=text_embeddings, embedding=open_ai_emb) + response = self.llm.evaluate_queries(faissIndex, query)[0] return self.classify_response(response), response @staticmethod @@ -131,24 +142,24 @@ def evaluate_queries(self, embedding, queries): ) template = """ {query}? """ - response = [] + responses = [] for q in queries: prompt = PromptTemplate( input_variables=["query"], template=template, ) - response.append(chatbot.run( + responses.append(chatbot.run( prompt.format(query=q) )) - return response + return responses def main(): - x = DocumentAnalyzer() - l = LlmHandler() + document_analyzer = DocumentAnalyzer() + document_analyzer.analyze_all_unread() #analyzes all new files in lantern db if __name__ == '__main__': main() \ No newline at end of file