From fb26688d826e02e4abebc262106dab913c14ee50 Mon Sep 17 00:00:00 2001
From: Michael Antoun
Date: Sat, 11 Nov 2023 14:15:08 -0800
Subject: [PATCH 1/7] deleted extra copy

---
 context_retrieve.py | 125 --------------------------------------------
 1 file changed, 125 deletions(-)
 delete mode 100644 context_retrieve.py

diff --git a/context_retrieve.py b/context_retrieve.py
deleted file mode 100644
index f4c4fc2..0000000
--- a/context_retrieve.py
+++ /dev/null
@@ -1,125 +0,0 @@
-import os
-import openai
-from langchain.document_loaders.csv_loader import CSVLoader
-from langchain.embeddings.openai import OpenAIEmbeddings
-from langchain.text_splitter import CharacterTextSplitter
-from langchain.vectorstores import FAISS
-from langchain.document_loaders import TextLoader
-
-from langchain.embeddings.openai import OpenAIEmbeddings
-from langchain.vectorstores import FAISS
-from langchain.chat_models import ChatOpenAI
-from langchain.chains import RetrievalQA
-from langchain import PromptTemplate
-
-import re
-import requests
-import xml.etree.ElementTree as ET
-
-from fragment import Fragment
-from VectorDatabase import Latern
-
-
-# OpenAI Setup
-OPEN_API_KEY = "sk-REDACTED"
-# openai.api_key = os.getenv(openai_api_key)
-os.environ['OPENAI_API_KEY'] = OPEN_API_KEY
-
-def getPmcPaper(pmcid):
-    """
-    """
-    url = f'https://www.ebi.ac.uk/europepmc/webservices/rest/{pmcid}/fullTextXML'
-    req = requests.get(url)
-    res = req.text
-    return res
-
-def extractMethodsFromPmcPaper(paper):
-    """
-    """
-    tree = ET.fromstring(paper)
-    mtext = []
-    for sec in tree.iter('sec'):
-        for title in sec.iter('title'):
-            if isinstance(title.text, str):
-                if re.search('methods', title.text, re.IGNORECASE):
-                    mtext.extend(list(sec.itertext()))
-    return " ".join(mtext)
-
-def preprocess(input_text):
-    """
-    """
-    processed_data = input_text.replace("\n","")
-    return processed_data
-
-def get_embeddings(fname):
-    """
-    """
-    loader = TextLoader(fname)
-    documents = loader.load()
-    text_splitter = CharacterTextSplitter(separator = ".",chunk_size = 1000, chunk_overlap=0)
-
-    docs = text_splitter.split_documents(documents)
-
-    emb = OpenAIEmbeddings()
-    input_texts = [d.page_content for d in docs]
-
-    input_embeddings = emb.embed_documents(input_texts)
-    text_embeddings = list(zip(input_texts, input_embeddings))
-
-    return text_embeddings, emb
-
-def saveFassIndex(fname, sname, ):
-    """
-    """
-    txt_embs, emb = get_embeddings(docs)
-    faissIndex = FAISS.from_embeddings(text_embeddings=txt_embs, embedding=emb)
-    faissIndex.save_local(sname)
-    # faissIndex = FAISS.from_documents(docs, OpenAIEmbeddings())
-    # faissIndex.save_local("input_doc")
-
-def Query(input_query, faiss_obj):
-    chatbot = RetrievalQA.from_chain_type(
-        llm=ChatOpenAI(
-            openai_api_key=OPEN_API_KEY,
-            temperature=0, model_name="gpt-3.5-turbo", max_tokens=50
-        ),
-        chain_type="stuff",
-        retriever=faiss_obj.as_retriever(search_type="similarity", search_kwargs={"k":1})
-    )
-    template = """ {query}?
-    """
-    prompt = PromptTemplate(
-        input_variables=["query"],
-        template=template,
-    )
-    print(chatbot.run(
-        prompt.format(query=input_query)
-    ))
-
-
-def main():
-    text = getPmcPaper(pmcid)
-
-    methods_text = preprocess(extractMethodsFromPmcPaper(text))
-    fname = 'input_file.txt'
-    sname = 'input_doc'
-    with open(fname, 'w') as file:
-        file.write(methods_text)
-    # print(methods_text)
-    txt_embs, emb = get_embeddings(fname)
-
-    fragments = []
-    for txt, embs in txt_embs:
-        fragment = Fragment(pmcid, 'methods', txt, embs)
-        fragments.append(fragment)
-
-    latern = Latern()
-    latern.insertEmbeddings(fragments)
-
-    # retreieve. PMC
-    faissIndex = FAISS.from_embeddings(text_embeddings=txt_embs, embedding=emb)
-    inp_query = "Does the paper report a new structure of a biomolecule or biomolecular complex modeled using experimental data"
-    Query(inp_query, faissIndex)
-
-if __name__ == '__main__':
-    main()
\ No newline at end of file
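For reference, the script deleted above implemented the repo's retrieval flow end to end: fetch a paper's full-text XML from Europe PMC, extract the methods section, chunk and embed the text, build a FAISS index from the (text, vector) pairs, and answer a question with a RetrievalQA chain. A minimal sketch of that flow using only the langchain calls visible in the diff; the pmcid, the placeholder chunks, and the question are illustrative, and OPENAI_API_KEY is assumed to be set in the environment:

import requests
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

# Fetch full-text XML from Europe PMC, as getPmcPaper() did (illustrative id)
pmcid = "PMC1234567"
xml_text = requests.get(
    f"https://www.ebi.ac.uk/europepmc/webservices/rest/{pmcid}/fullTextXML"
).text

# Embed text chunks and build a FAISS index from the (text, vector) pairs;
# real chunks would come from splitting the extracted methods section
chunks = ["methods chunk one", "methods chunk two"]
emb = OpenAIEmbeddings()
pairs = list(zip(chunks, emb.embed_documents(chunks)))
index = FAISS.from_embeddings(text_embeddings=pairs, embedding=emb)

# Answer a question over the single nearest chunk, as the Query() helper did
qa = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo", max_tokens=50),
    chain_type="stuff",
    retriever=index.as_retriever(search_type="similarity", search_kwargs={"k": 1}),
)
print(qa.run("Does the paper report a new structure of a biomolecule?"))

Building the index with FAISS.from_embeddings rather than FAISS.from_documents lets the same (text, vector) pairs be both stored in the vector database and indexed without re-embedding.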
From f8b7b6ca9d147af5183b7453a1ae85b33361d5b4 Mon Sep 17 00:00:00 2001
From: Michael Antoun
Date: Sat, 11 Nov 2023 14:21:51 -0800
Subject: [PATCH 2/7] Moved database entities into same file

---
 VectorDatabase.py    | 37 ++++++++++++++++++++++++++++++++++++-
 analysis.py          |  3 +--
 database_entities.py | 34 ----------------------------------
 hackathon_runner.py  |  4 +---
 tests/test.py        |  4 +---
 5 files changed, 39 insertions(+), 43 deletions(-)
 delete mode 100644 database_entities.py

diff --git a/VectorDatabase.py b/VectorDatabase.py
index 6d6a799..f75485b 100644
--- a/VectorDatabase.py
+++ b/VectorDatabase.py
@@ -302,4 +302,39 @@ def get_embeddings_for_pub(self, id):
             texts.append(fragment.content)
             embeddings.append(fragment.vector)
         text_embeddings = list(zip(texts, embeddings))
-        return text_embeddings
\ No newline at end of file
+        return text_embeddings
+
+# Class to represent a publication with attributes id, title, pmc, pubmed, and doi
+class Publication:
+
+    id = ""
+    title = ""
+    pmc = ""
+    pubmed = ""
+    doi = ""
+
+    def __init__(self, id, title, pmc, pubmed, doi):
+        self.id = id  # (DOI) Unique identifier for the publication
+        self.title = title  # Title of the publication
+        self.pmc = pmc  # PubMed Central (PMC) Link
+        self.pubmed = pubmed  # PubMed Link
+        self.doi = doi  # Digital Object Identifier (DOI) Link for the publication
+
+# Class to represent a fragment of a publication with attributes id, header, content, and vector
+class Fragment:
+
+
+    # Class variables to store default values for attributes
+    id = ""
+    header = ""
+    content = ""
+    vector = ""
+
+    def __init__(self, id, header, content, vector):
+        # Constructor to initialize the attributes of the Fragment object
+
+        # Set the attributes of the object with the values provided during instantiation
+        self.id = id  # (DOI) Unique identifier for the fragment
+        self.header = header  # Header or title of the fragment
+        self.content = content  # Content or text of the fragment
+        self.vector = vector  # Vector representation of the fragment

diff --git a/analysis.py b/analysis.py
index aa60539..7a3be99 100644
--- a/analysis.py
+++ b/analysis.py
@@ -1,6 +1,5 @@
-from VectorDatabase import Lantern
-from database_entities import Publication, Fragment
+from VectorDatabase import Lantern, Publication, Fragment
 from google_sheets import SheetsApiClient
 
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 

diff --git a/database_entities.py b/database_entities.py
deleted file mode 100644
index 9de5295..0000000
--- a/database_entities.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# Class to represent a publication with attributes id, title, pmc, pubmed, and doi
-class Publication:
-
-    id = ""
-    title = ""
-    pmc = ""
-    pubmed = ""
-    doi = ""
-
-    def __init__(self, id, title, pmc, pubmed, doi):
-        self.id = id  # (DOI) Unique identifier for the publication
-        self.title = title  # Title of the publication
-        self.pmc = pmc  # PubMed Central (PMC) Link
-        self.pubmed = pubmed  # PubMed Link
-        self.doi = doi  # Digital Object Identifier (DOI) Link for the publication
-
-# Class to represent a fragment of a publication with attributes id, header, content, and vector
-class Fragment:
-
-
-    # Class variables to store default values for attributes
-    id = ""
-    header = ""
-    content = ""
-    vector = ""
-
-    def __init__(self, id, header, content, vector):
-        # Constructor to initialize the attributes of the Fragment object
-
-        # Set the attributes of the object with the values provided during instantiation
-        self.id = id  # (DOI) Unique identifier for the fragment
-        self.header = header  # Header or title of the fragment
-        self.content = content  # Content or text of the fragment
-        self.vector = vector  # Vector representation of the fragment

diff --git a/hackathon_runner.py b/hackathon_runner.py
index 5acdc64..761c6e7 100644
--- a/hackathon_runner.py
+++ b/hackathon_runner.py
@@ -5,9 +5,7 @@
 from paperscraper.pdf import save_pdf
 from paperscraper.get_dumps import biorxiv
-from fragment import Fragment
-from publication import Publication
-from VectorDatabase import Lantern
+from VectorDatabase import Lantern, Fragment, Publication
 import openai
 from langchain.document_loaders.csv_loader import CSVLoader
 from langchain.embeddings.openai import OpenAIEmbeddings

diff --git a/tests/test.py b/tests/test.py
index 8347787..b782d81 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -1,6 +1,4 @@
-from fragment import Fragment
-from publication import Publication
-from VectorDatabase import Latern
+from VectorDatabase import Lantern, Fragment, Publication
 from tqdm.auto import tqdm
 from sentence_transformers import SentenceTransformer
 import torch
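With this patch, Publication and Fragment live next to the Lantern store that persists them, so callers use the single combined import shown in the updated files. A short usage sketch, assuming the insertEmbeddings interface that the deleted context_retrieve.py called; the DOI and the short vector are placeholders:

from VectorDatabase import Lantern, Fragment, Publication

# One (text, vector) pair per chunk; real vectors come from an embedding model
text_embeddings = [("methods section chunk", [0.1, 0.2, 0.3])]

# Wrap each chunk in a Fragment keyed by the paper's identifier
fragments = [Fragment("10.1000/example-doi", "methods", text, vector)
             for text, vector in text_embeddings]

lantern = Lantern()
lantern.insertEmbeddings(fragments)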
From 873a7b6945ff7dc2c3ddbf1ae6dcc8db8371a831 Mon Sep 17 00:00:00 2001
From: Michael Antoun
Date: Sat, 11 Nov 2023 14:23:34 -0800
Subject: [PATCH 3/7] renamed file

---
 analysis.py => document_analysis.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename analysis.py => document_analysis.py (100%)

diff --git a/analysis.py b/document_analysis.py
similarity index 100%
rename from analysis.py
rename to document_analysis.py

From 239308cdb76dc343e8e892aa24e0cdcce830498c Mon Sep 17 00:00:00 2001
From: Michael Antoun
Date: Sat, 11 Nov 2023 14:32:39 -0800
Subject: [PATCH 4/7] removed duplicated code

---
 document_analysis.py | 30 +-----------------------------
 1 file changed, 1 insertion(+), 29 deletions(-)

diff --git a/document_analysis.py b/document_analysis.py
index 7a3be99..da87509 100644
--- a/document_analysis.py
+++ b/document_analysis.py
@@ -10,26 +10,7 @@
 class DocumentAnalyzer:
     """Takes in a list of publications to analyze, then prompts the chatbot, processes the response, aggregates the results,
     and reports the results to the spreadsheet
-    """
-
-    keywords_groups = {
-        'CX-MS': ['cross-link', 'crosslink', 'XL-MS', 'CX-MS', 'CL-MS', 'XLMS', 'CXMS', 'CLMS', "chemical crosslinking mass spectrometry", 'photo-crosslinking', 'crosslinking restraints', 'crosslinking-derived restraints', 'chemical crosslinking', 'in vivo crosslinking', 'crosslinking data'],
-        'HDX': ['Hydrogen–deuterium exchange mass spectrometry', 'Hydrogen/deuterium exchange mass spectrometry' 'HDX', 'HDXMS', 'HDX-MS'],
-        'EPR': ['electron paramagnetic resonance spectroscopy', 'EPR', 'DEER', "Double electron electron resonance spectroscopy"],
-        'FRET': ['FRET', "forster resonance energy transfer", "fluorescence resonance energy transfer"],
-        'AFM': ['AFM', "atomic force microscopy" ],
-        'SAS': ['SAS', 'SAXS', 'SANS', "Small angle solution scattering", "solution scattering", "SEC-SAXS", "SEC-SAS", "SASBDB", "Small angle X-ray scattering", "Small angle neutron scattering"],
-        '3DGENOME': ['HiC', 'Hi-C', "chromosome conformation capture"],
-        'Y2H': ['Y2H', "yeast two-hybrid"],
-        'DNA_FOOTPRINTING': ["DNA Footprinting", "hydroxyl radical footprinting"],
-        'XRAY_TOMOGRAPHY': ["soft x-ray tomography"],
-        'FTIR': ["FTIR", "Infrared spectroscopy", "Fourier-transform infrared spectroscopy"],
-        'FLUORESCENCE': ["Fluorescence imaging", "fluorescence microscopy", "TIRF"],
-        'EVOLUTION': ['coevolution', "evolutionary covariance"],
-        'PREDICTED': ["predicted contacts"],
-        'INTEGRATIVE': ["integrative structure", "hybrid structure", "integrative modeling", "hybrid modeling"],
-        'SHAPE': ['Hydroxyl Acylation analyzed by Primer Extension']
-    }
+    """
 
     def __init__(self):
         # self.lantern = Lantern()
@@ -128,15 +109,6 @@ def paper_about_cryoem(text_embeddings: []):
         """
         return any(re.search("cryo-?em", text, re.IGNORECASE) for text, _ in embeddings)
 
-    @staticmethod
-    def methods_string():
-        methods_string = ''
-        for i, (k, v) in enumerate(DocumentAnalyzer.keywords_groups.items()):
-            if i > 0:
-                methods_string += ' or '
-            methods_string += f'{k} ({", ".join(v)})'
-        return methods_string
-
 class LlmHandler:
     """pulled this straight from the hackathon code, should work though
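The methods_string helper removed above flattened keywords_groups into a single prompt clause, e.g. 'CX-MS (cross-link, crosslink, ...) or HDX (...)'. A runnable sketch of the same join over a trimmed two-group table, assuming the de-duplicated copy keeps the structure shown in the deletion:

# Trimmed version of the removed table; the real one maps 16 method groups
keywords_groups = {
    'CX-MS': ['cross-link', 'crosslink', 'XL-MS'],
    'HDX': ['HDX', 'HDXMS', 'HDX-MS'],
}

def methods_string(groups):
    # Render each group as "NAME (kw1, kw2, ...)" and join groups with " or "
    return ' or '.join(f'{k} ({", ".join(v)})' for k, v in groups.items())

print(methods_string(keywords_groups))
# -> CX-MS (cross-link, crosslink, XL-MS) or HDX (HDX, HDXMS, HDX-MS)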
From ce7a5a83502d265e650d595051ef4404b824725e Mon Sep 17 00:00:00 2001
From: Michael Antoun
Date: Sat, 11 Nov 2023 15:03:24 -0800
Subject: [PATCH 5/7] added config file

---
 config.json | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 config.json

diff --git a/config.json b/config.json
new file mode 100644
index 0000000..0a315c3
--- /dev/null
+++ b/config.json
@@ -0,0 +1,4 @@
+{
+    "Emails": [],
+    "DEBUG": false
+}
\ No newline at end of file

From ae06ec1c29f7fa36dab773a5ebf589ba34bb4989 Mon Sep 17 00:00:00 2001
From: Michael Antoun
Date: Sat, 11 Nov 2023 15:03:33 -0800
Subject: [PATCH 6/7] fixed date hardcode

---
 document_analysis.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/document_analysis.py b/document_analysis.py
index da87509..fe4a283 100644
--- a/document_analysis.py
+++ b/document_analysis.py
@@ -6,6 +6,7 @@
 from langchain.chat_models import ChatOpenAI
 from langchain.chains import RetrievalQA
 from langchain import PromptTemplate
+from datetime import date
 
 class DocumentAnalyzer:
     """Takes in a list of publications to analyze, then prompts the chatbot, processes the response, aggregates the results,
@@ -37,7 +38,8 @@ def process_publications(self, publications: [Publication]):
             else:
                 #print('paper not about cryo-em')
                 pass
-            rows.append([pub.doi, pub.title, "11-2-2023", "11-5-2023", "", int(classification), response, ""])
+            # add date if it's added
+            rows.append([pub.doi, pub.title, "", str(date.today()), "", int(classification), response, ""])
 
         self.update_spreadsheet(rows, hits)
 
@@ -115,7 +117,6 @@ class LlmHandler:
     """
 
     def __init__(self):
-        self.text_splitter = RecursiveCharacterTextSplitter(separators = ["\n\n", "\n", ".", ","], chunk_size=300, chunk_overlap=100)
         self.llm=ChatOpenAI(
             temperature=0, model_name="gpt-4", max_tokens=300, request_timeout = 30, max_retries=3
         )
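config.json now carries the notification list and a debug switch, and the hardcoded spreadsheet dates above are replaced with date.today(). The patch only adds the file, so the loader below is a sketch: read_config is a hypothetical helper name, and the way the two keys are consumed is illustrative:

import json
from datetime import date

def read_config(path="config.json"):
    # Parse the JSON config added in this series: an "Emails" list
    # and a "DEBUG" boolean
    with open(path) as f:
        return json.load(f)

config = read_config()
if config["DEBUG"]:
    print(f"debug run on {date.today()}; would notify {config['Emails']}")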
From 2ff4a7a5e5509736b5290f07ef017e1cecb7dc43 Mon Sep 17 00:00:00 2001
From: Michael Antoun
Date: Sat, 11 Nov 2023 15:24:24 -0800
Subject: [PATCH 7/7] noop

---
 document_analysis.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/document_analysis.py b/document_analysis.py
index fe4a283..573c149 100644
--- a/document_analysis.py
+++ b/document_analysis.py
@@ -8,6 +8,7 @@
 from langchain import PromptTemplate
 from datetime import date
 
+
 class DocumentAnalyzer:
     """Takes in a list of publications to analyze, then prompts the chatbot, processes the response, aggregates the results,
     and reports the results to the spreadsheet