From fb26688d826e02e4abebc262106dab913c14ee50 Mon Sep 17 00:00:00 2001
From: Michael Antoun
Date: Sat, 11 Nov 2023 14:15:08 -0800
Subject: [PATCH 1/7] deleted extra copy

---
 context_retrieve.py | 125 --------------------------------------------
 1 file changed, 125 deletions(-)
 delete mode 100644 context_retrieve.py

diff --git a/context_retrieve.py b/context_retrieve.py
deleted file mode 100644
index f4c4fc2..0000000
--- a/context_retrieve.py
+++ /dev/null
@@ -1,125 +0,0 @@
-import os
-import openai
-from langchain.document_loaders.csv_loader import CSVLoader
-from langchain.embeddings.openai import OpenAIEmbeddings
-from langchain.text_splitter import CharacterTextSplitter
-from langchain.vectorstores import FAISS
-from langchain.document_loaders import TextLoader
-
-from langchain.embeddings.openai import OpenAIEmbeddings
-from langchain.vectorstores import FAISS
-from langchain.chat_models import ChatOpenAI
-from langchain.chains import RetrievalQA
-from langchain import PromptTemplate
-
-import re
-import requests
-import xml.etree.ElementTree as ET
-
-from fragment import Fragment
-from VectorDatabase import Latern
-
-
-# OpenAI Setup
-OPEN_API_KEY = "sk-REDACTED"
-# openai.api_key = os.getenv(openai_api_key)
-os.environ['OPENAI_API_KEY'] = OPEN_API_KEY
-
-def getPmcPaper(pmcid):
-    """
-    """
-    url = f'https://www.ebi.ac.uk/europepmc/webservices/rest/{pmcid}/fullTextXML'
-    req = requests.get(url)
-    res = req.text
-    return res
-
-def extractMethodsFromPmcPaper(paper):
-    """
-    """
-    tree = ET.fromstring(paper)
-    mtext = []
-    for sec in tree.iter('sec'):
-        for title in sec.iter('title'):
-            if isinstance(title.text, str):
-                if re.search('methods', title.text, re.IGNORECASE):
-                    mtext.extend(list(sec.itertext()))
-    return " ".join(mtext)
-
-def preprocess(input_text):
-    """
-    """
-    processed_data = input_text.replace("\n","")
-    return processed_data
-
-def get_embeddings(fname):
-    """
-    """
-    loader = TextLoader(fname)
-    documents = loader.load()
-    text_splitter = CharacterTextSplitter(separator = ".",chunk_size = 1000, chunk_overlap=0)
-
-    docs = text_splitter.split_documents(documents)
-
-    emb = OpenAIEmbeddings()
-    input_texts = [d.page_content for d in docs]
-
-    input_embeddings = emb.embed_documents(input_texts)
-    text_embeddings = list(zip(input_texts, input_embeddings))
-
-    return text_embeddings, emb
-
-def saveFassIndex(fname, sname, ):
-    """
-    """
-    txt_embs, emb = get_embeddings(docs)
-    faissIndex = FAISS.from_embeddings(text_embeddings=txt_embs, embedding=emb)
-    faissIndex.save_local(sname)
-    # faissIndex = FAISS.from_documents(docs, OpenAIEmbeddings())
-    # faissIndex.save_local("input_doc")
-
-def Query(input_query, faiss_obj):
-    chatbot = RetrievalQA.from_chain_type(
-        llm=ChatOpenAI(
-            openai_api_key=OPEN_API_KEY,
-            temperature=0, model_name="gpt-3.5-turbo", max_tokens=50
-        ),
-        chain_type="stuff",
-        retriever=faiss_obj.as_retriever(search_type="similarity", search_kwargs={"k":1})
-    )
-    template = """ {query}?
-    """
-    prompt = PromptTemplate(
-        input_variables=["query"],
-        template=template,
-    )
-    print(chatbot.run(
-        prompt.format(query=input_query)
-    ))
-
-
-def main():
-    text = getPmcPaper(pmcid)
-
-    methods_text = preprocess(extractMethodsFromPmcPaper(text))
-    fname = 'input_file.txt'
-    sname = 'input_doc'
-    with open(fname, 'w') as file:
-        file.write(methods_text)
-    # print(methods_text)
-    txt_embs, emb = get_embeddings(fname)
-
-    fragments = []
-    for txt, embs in txt_embs:
-        fragment = Fragment(pmcid, 'methods', txt, embs)
-        fragments.append(fragment)
-
-    latern = Latern()
-    latern.insertEmbeddings(fragments)
-
-    # retreieve. PMC
-    faissIndex = FAISS.from_embeddings(text_embeddings=txt_embs, embedding=emb)
-    inp_query = "Does the paper report a new structure of a biomolecule or biomolecular complex modeled using experimental data"
-    Query(inp_query, faissIndex)
-
-if __name__ == '__main__':
-    main()
\ No newline at end of file
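For reference, the script deleted above implemented the repo's retrieval flow end to end: fetch a paper's full-text XML from Europe PMC, extract the methods section, chunk and embed the text, build a FAISS index from the (text, vector) pairs, and answer a question with a RetrievalQA chain. A minimal sketch of that flow using only the langchain calls visible in the diff; the pmcid, the placeholder chunks, and the question are illustrative, and OPENAI_API_KEY is assumed to be set in the environment:

import requests
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

# Fetch full-text XML from Europe PMC, as getPmcPaper() did (illustrative id)
pmcid = "PMC1234567"
xml_text = requests.get(
    f"https://www.ebi.ac.uk/europepmc/webservices/rest/{pmcid}/fullTextXML"
).text

# Embed text chunks and build a FAISS index from the (text, vector) pairs;
# real chunks would come from splitting the extracted methods section
chunks = ["methods chunk one", "methods chunk two"]
emb = OpenAIEmbeddings()
pairs = list(zip(chunks, emb.embed_documents(chunks)))
index = FAISS.from_embeddings(text_embeddings=pairs, embedding=emb)

# Answer a question over the single nearest chunk, as the Query() helper did
qa = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo", max_tokens=50),
    chain_type="stuff",
    retriever=index.as_retriever(search_type="similarity", search_kwargs={"k": 1}),
)
print(qa.run("Does the paper report a new structure of a biomolecule?"))

Building the index with FAISS.from_embeddings rather than FAISS.from_documents lets the same (text, vector) pairs be both stored in the vector database and indexed without re-embedding.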
From f8b7b6ca9d147af5183b7453a1ae85b33361d5b4 Mon Sep 17 00:00:00 2001
From: Michael Antoun
Date: Sat, 11 Nov 2023 14:21:51 -0800
Subject: [PATCH 2/7] Moved database entities into same file

---
 VectorDatabase.py    | 37 ++++++++++++++++++++++++++++++++++++-
 analysis.py          |  3 +--
 database_entities.py | 34 ----------------------------------
 hackathon_runner.py  |  4 +---
 tests/test.py        |  4 +---
 5 files changed, 39 insertions(+), 43 deletions(-)
 delete mode 100644 database_entities.py

diff --git a/VectorDatabase.py b/VectorDatabase.py
index 6d6a799..f75485b 100644
--- a/VectorDatabase.py
+++ b/VectorDatabase.py
@@ -302,4 +302,39 @@ def get_embeddings_for_pub(self, id):
             texts.append(fragment.content)
             embeddings.append(fragment.vector)
         text_embeddings = list(zip(texts, embeddings))
-        return text_embeddings
\ No newline at end of file
+        return text_embeddings
+
+# Class to represent a publication with attributes id, title, pmc, pubmed, and doi
+class Publication:
+
+    id = ""
+    title = ""
+    pmc = ""
+    pubmed = ""
+    doi = ""
+
+    def __init__(self, id, title, pmc, pubmed, doi):
+        self.id = id  # (DOI) Unique identifier for the publication
+        self.title = title  # Title of the publication
+        self.pmc = pmc  # PubMed Central (PMC) Link
+        self.pubmed = pubmed  # PubMed Link
+        self.doi = doi  # Digital Object Identifier (DOI) Link for the publication
+
+# Class to represent a fragment of a publication with attributes id, header, content, and vector
+class Fragment:
+
+
+    # Class variables to store default values for attributes
+    id = ""
+    header = ""
+    content = ""
+    vector = ""
+
+    def __init__(self, id, header, content, vector):
+        # Constructor to initialize the attributes of the Fragment object
+
+        # Set the attributes of the object with the values provided during instantiation
+        self.id = id  # (DOI) Unique identifier for the fragment
+        self.header = header  # Header or title of the fragment
+        self.content = content  # Content or text of the fragment
+        self.vector = vector  # Vector representation of the fragment

diff --git a/analysis.py b/analysis.py
index aa60539..7a3be99 100644
--- a/analysis.py
+++ b/analysis.py
@@ -1,6 +1,5 @@
-from VectorDatabase import Lantern
-from database_entities import Publication, Fragment
+from VectorDatabase import Lantern, Publication, Fragment
 from google_sheets import SheetsApiClient
 
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 

diff --git a/database_entities.py b/database_entities.py
deleted file mode 100644
index 9de5295..0000000
--- a/database_entities.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# Class to represent a publication with attributes id, title, pmc, pubmed, and doi
-class Publication:
-
-    id = ""
-    title = ""
-    pmc = ""
-    pubmed = ""
-    doi = ""
-
-    def __init__(self, id, title, pmc, pubmed, doi):
-        self.id = id  # (DOI) Unique identifier for the publication
-        self.title = title  # Title of the publication
-        self.pmc = pmc  # PubMed Central (PMC) Link
-        self.pubmed = pubmed  # PubMed Link
-        self.doi = doi  # Digital Object Identifier (DOI) Link for the publication
-
-# Class to represent a fragment of a publication with attributes id, header, content, and vector
-class Fragment:
-
-
-    # Class variables to store default values for attributes
-    id = ""
-    header = ""
-    content = ""
-    vector = ""
-
-    def __init__(self, id, header, content, vector):
-        # Constructor to initialize the attributes of the Fragment object
-
-        # Set the attributes of the object with the values provided during instantiation
-        self.id = id  # (DOI) Unique identifier for the fragment
-        self.header = header  # Header or title of the fragment
-        self.content = content  # Content or text of the fragment
-        self.vector = vector  # Vector representation of the fragment

diff --git a/hackathon_runner.py b/hackathon_runner.py
index 5acdc64..761c6e7 100644
--- a/hackathon_runner.py
+++ b/hackathon_runner.py
@@ -5,9 +5,7 @@
 from paperscraper.pdf import save_pdf
 from paperscraper.get_dumps import biorxiv
-from fragment import Fragment
-from publication import Publication
-from VectorDatabase import Lantern
+from VectorDatabase import Lantern, Fragment, Publication
 import openai
 from langchain.document_loaders.csv_loader import CSVLoader
 from langchain.embeddings.openai import OpenAIEmbeddings

diff --git a/tests/test.py b/tests/test.py
index 8347787..b782d81 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -1,6 +1,4 @@
-from fragment import Fragment
-from publication import Publication
-from VectorDatabase import Latern
+from VectorDatabase import Lantern, Fragment, Publication
 from tqdm.auto import tqdm
 from sentence_transformers import SentenceTransformer
 import torch
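With this patch, Publication and Fragment live next to the Lantern store that persists them, so callers use the single combined import shown in the updated files. A short usage sketch, assuming the insertEmbeddings interface that the deleted context_retrieve.py called; the DOI and the short vector are placeholders:

from VectorDatabase import Lantern, Fragment, Publication

# One (text, vector) pair per chunk; real vectors come from an embedding model
text_embeddings = [("methods section chunk", [0.1, 0.2, 0.3])]

# Wrap each chunk in a Fragment keyed by the paper's identifier
fragments = [Fragment("10.1000/example-doi", "methods", text, vector)
             for text, vector in text_embeddings]

lantern = Lantern()
lantern.insertEmbeddings(fragments)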
From 873a7b6945ff7dc2c3ddbf1ae6dcc8db8371a831 Mon Sep 17 00:00:00 2001
From: Michael Antoun
Date: Sat, 11 Nov 2023 14:23:34 -0800
Subject: [PATCH 3/7] renamed file

---
 analysis.py => document_analysis.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename analysis.py => document_analysis.py (100%)

diff --git a/analysis.py b/document_analysis.py
similarity index 100%
rename from analysis.py
rename to document_analysis.py

From 239308cdb76dc343e8e892aa24e0cdcce830498c Mon Sep 17 00:00:00 2001
From: Michael Antoun
Date: Sat, 11 Nov 2023 14:32:39 -0800
Subject: [PATCH 4/7] removed duplicated code

---
 document_analysis.py | 30 +-----------------------------
 1 file changed, 1 insertion(+), 29 deletions(-)

diff --git a/document_analysis.py b/document_analysis.py
index 7a3be99..da87509 100644
--- a/document_analysis.py
+++ b/document_analysis.py
@@ -10,26 +10,7 @@
 class DocumentAnalyzer:
     """Takes in a list of publications to analyze, then prompts the chatbot, processes the response, aggregates the results,
     and reports the results to the spreadsheet
-    """
-
-    keywords_groups = {
-        'CX-MS': ['cross-link', 'crosslink', 'XL-MS', 'CX-MS', 'CL-MS', 'XLMS', 'CXMS', 'CLMS', "chemical crosslinking mass spectrometry", 'photo-crosslinking', 'crosslinking restraints', 'crosslinking-derived restraints', 'chemical crosslinking', 'in vivo crosslinking', 'crosslinking data'],
-        'HDX': ['Hydrogen–deuterium exchange mass spectrometry', 'Hydrogen/deuterium exchange mass spectrometry' 'HDX', 'HDXMS', 'HDX-MS'],
-        'EPR': ['electron paramagnetic resonance spectroscopy', 'EPR', 'DEER', "Double electron electron resonance spectroscopy"],
-        'FRET': ['FRET', "forster resonance energy transfer", "fluorescence resonance energy transfer"],
-        'AFM': ['AFM', "atomic force microscopy" ],
-        'SAS': ['SAS', 'SAXS', 'SANS', "Small angle solution scattering", "solution scattering", "SEC-SAXS", "SEC-SAS", "SASBDB", "Small angle X-ray scattering", "Small angle neutron scattering"],
-        '3DGENOME': ['HiC', 'Hi-C', "chromosome conformation capture"],
-        'Y2H': ['Y2H', "yeast two-hybrid"],
-        'DNA_FOOTPRINTING': ["DNA Footprinting", "hydroxyl radical footprinting"],
-        'XRAY_TOMOGRAPHY': ["soft x-ray tomography"],
-        'FTIR': ["FTIR", "Infrared spectroscopy", "Fourier-transform infrared spectroscopy"],
-        'FLUORESCENCE': ["Fluorescence imaging", "fluorescence microscopy", "TIRF"],
-        'EVOLUTION': ['coevolution', "evolutionary covariance"],
-        'PREDICTED': ["predicted contacts"],
-        'INTEGRATIVE': ["integrative structure", "hybrid structure", "integrative modeling", "hybrid modeling"],
-        'SHAPE': ['Hydroxyl Acylation analyzed by Primer Extension']
-    }
+    """
 
     def __init__(self):
         # self.lantern = Lantern()
@@ -128,15 +109,6 @@ def paper_about_cryoem(text_embeddings: []):
         """
         return any(re.search("cryo-?em", text, re.IGNORECASE) for text, _ in embeddings)
 
-    @staticmethod
-    def methods_string():
-        methods_string = ''
-        for i, (k, v) in enumerate(DocumentAnalyzer.keywords_groups.items()):
-            if i > 0:
-                methods_string += ' or '
-            methods_string += f'{k} ({", ".join(v)})'
-        return methods_string
-
 class LlmHandler:
     """pulled this straight from the hackathon code, should work though
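The methods_string helper removed above flattened keywords_groups into a single prompt clause, e.g. 'CX-MS (cross-link, crosslink, ...) or HDX (...)'. A runnable sketch of the same join over a trimmed two-group table, assuming the de-duplicated copy keeps the structure shown in the deletion:

# Trimmed version of the removed table; the real one maps 16 method groups
keywords_groups = {
    'CX-MS': ['cross-link', 'crosslink', 'XL-MS'],
    'HDX': ['HDX', 'HDXMS', 'HDX-MS'],
}

def methods_string(groups):
    # Render each group as "NAME (kw1, kw2, ...)" and join groups with " or "
    return ' or '.join(f'{k} ({", ".join(v)})' for k, v in groups.items())

print(methods_string(keywords_groups))
# -> CX-MS (cross-link, crosslink, XL-MS) or HDX (HDX, HDXMS, HDX-MS)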
From ce7a5a83502d265e650d595051ef4404b824725e Mon Sep 17 00:00:00 2001
From: Michael Antoun
Date: Sat, 11 Nov 2023 15:03:24 -0800
Subject: [PATCH 5/7] added config file

---
 config.json | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 config.json

diff --git a/config.json b/config.json
new file mode 100644
index 0000000..0a315c3
--- /dev/null
+++ b/config.json
@@ -0,0 +1,4 @@
+{
+    "Emails": [],
+    "DEBUG": false
+}
\ No newline at end of file

From ae06ec1c29f7fa36dab773a5ebf589ba34bb4989 Mon Sep 17 00:00:00 2001
From: Michael Antoun
Date: Sat, 11 Nov 2023 15:03:33 -0800
Subject: [PATCH 6/7] fixed date hardcode

---
 document_analysis.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/document_analysis.py b/document_analysis.py
index da87509..fe4a283 100644
--- a/document_analysis.py
+++ b/document_analysis.py
@@ -6,6 +6,7 @@
 from langchain.chat_models import ChatOpenAI
 from langchain.chains import RetrievalQA
 from langchain import PromptTemplate
+from datetime import date
 
 class DocumentAnalyzer:
     """Takes in a list of publications to analyze, then prompts the chatbot, processes the response, aggregates the results,
@@ -37,7 +38,8 @@ def process_publications(self, publications: [Publication]):
             else:
                 #print('paper not about cryo-em')
                 pass
-            rows.append([pub.doi, pub.title, "11-2-2023", "11-5-2023", "", int(classification), response, ""])
+            # add date if it's added
+            rows.append([pub.doi, pub.title, "", str(date.today()), "", int(classification), response, ""])
 
         self.update_spreadsheet(rows, hits)
 
@@ -115,7 +117,6 @@ class LlmHandler:
     """
 
     def __init__(self):
-        self.text_splitter = RecursiveCharacterTextSplitter(separators = ["\n\n", "\n", ".", ","], chunk_size=300, chunk_overlap=100)
         self.llm=ChatOpenAI(
             temperature=0, model_name="gpt-4", max_tokens=300, request_timeout = 30, max_retries=3
         )
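config.json now carries the notification list and a debug switch, and the hardcoded spreadsheet dates above are replaced with date.today(). The patch only adds the file, so the loader below is a sketch: read_config is a hypothetical helper name, and the way the two keys are consumed is illustrative:

import json
from datetime import date

def read_config(path="config.json"):
    # Parse the JSON config added in this series: an "Emails" list
    # and a "DEBUG" boolean
    with open(path) as f:
        return json.load(f)

config = read_config()
if config["DEBUG"]:
    print(f"debug run on {date.today()}; would notify {config['Emails']}")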
From 2ff4a7a5e5509736b5290f07ef017e1cecb7dc43 Mon Sep 17 00:00:00 2001
From: Michael Antoun
Date: Sat, 11 Nov 2023 15:24:24 -0800
Subject: [PATCH 7/7] noop

---
 document_analysis.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/document_analysis.py b/document_analysis.py
index fe4a283..573c149 100644
--- a/document_analysis.py
+++ b/document_analysis.py
@@ -8,6 +8,7 @@
 from langchain import PromptTemplate
 from datetime import date
 
+
 class DocumentAnalyzer:
     """Takes in a list of publications to analyze, then prompts the chatbot, processes the response, aggregates the results,
     and reports the results to the spreadsheet