From 59802995b07dd6d713ee40309d54b0368b1d5f26 Mon Sep 17 00:00:00 2001 From: Michael Antoun <antounmichael@yahoo.com> Date: Fri, 10 Nov 2023 20:55:13 -0800 Subject: [PATCH 01/31] changed authentication method --- .gitignore | 1 + google_sheets.py | 101 +++++++++++++++++++++++++++++------------------ 2 files changed, 63 insertions(+), 39 deletions(-) diff --git a/.gitignore b/.gitignore index 662ecf4..10b4283 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ input_doc/index.faiss __pycache__ ./lantern data +google_sheets_credentials.json diff --git a/google_sheets.py b/google_sheets.py index 10533d0..522f8bd 100644 --- a/google_sheets.py +++ b/google_sheets.py @@ -1,61 +1,84 @@ +import os import gspread -class SpreadsheetUpdater(): - - credentials = { - "type": "service_account", - "project_id": "durable-sky-396700", - "private_key_id": "5735737884fa17f981beceb424001445e2476ae3", - "private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQDcN1SaQ3pgvu/Z\nYL4VHQSoztL5gUBtDFHvICFGLMBLwxj1SI9zWeI0uH4QvW32QkqTwnYO8cW2jCpZ\n7Lu2ZDdW5copVt3p7GCacYC++hYjH8Y13FSotE0yWpxh8qIQexzgTcHenrCr8nAd\nhkeHyNwwUpmjOASIqOtIHj7cGqp43jxMSuwh8fK94ef+Aemo5+7h+tXlHqwMIFap\nyjdE4TNdJ+mYp2nm17PUYiP0Y+WbYEOeeo29So9P/Ir1gMpH5Fyu3RcBI7jloFZd\nM8Hfrj17KKMnVOKItfnYeLlFaBDSfgYP17v9NUfzHVUSDEa2T67mSfUG63aIiYSm\nXrtAuignAgMBAAECggEACDHe6hnjIfQfazcLm8mHdNvnEFxCkExKRQ9f1AN/HGw9\nyR+47UTF0DE7yVYWed8gDon8Aef2JyoY7ioksILfzeuhld9vq3BqbK59aTeK2PL2\n80yOfsCtTSRmEWPWeBQjKcDhaAfLva2F7CaLeH59aY1WJLOSJ57xmOHXQP8uozsm\nm8dMs1PgEELl2B2zc+6JtHnWH2CAxZiA2b9yh+iZi3kiaJyLIW1bgx4U3suDnsFc\n+Igk+AYsIZ4UuPwFxlb+2mvYpiZd/Br0ASnBDQvgXDA4Xlu8wBeukfun8VZOviE4\nFjdxYkHMLeCsu15Xsc3E3UOt8wIXbr6b9Wi7mitY4QKBgQD7oMvTGl3SMMBynYHx\nYbbqW15UksGx6oPXBeUsqmCc64qBTiTZwLh2gY0TOELa0Evlf5kVtEVvMPviFynf\nvEvEc7ZV9rqkpgD1YRk2oi98wgAPG2/xU/asdSNblLVT/tK21/a7agDuyh6CfRO6\nfzQf4GYITKjtI34kjkEYa4Y6TwKBgQDgCtGNu2TpqKhle6v59EQzyB5aUEg4LdJn\n+YTGppohtbtbbW2N4nhoOi+ibLtN0dIDetfSdZtXe7CC13WSVS7T9QAOC/u6g0rj\nQstqktfUUyasIPYKdWe64rNtNJkIW+x+bgz2p8fOTGKwkTSFUFtPYwvajwIUv6Zc\n2/Vjtt82qQKBgBKLcTonsU5yZVyNGyyNBQwUm8kj376bCAhq2M8H54LpIRYSiki6\nGV4yghEujk7OFync041z8cIWHBo3ltB0cikSVhfTzUGhMmTjORZ7sYBCU/rJDOD+\nTSm8oFR5izubhjAPjpGVaGgw4TrAuRl/knne8eYesDx55ywOh+Gi2wulAoGBANdW\nrqnKtyi6mfjI0LhzpmYa78mgpnmQ2U5kjtEc6sKB2S38VLNuPIr5ejVkyvb2OCRu\nGyjHL2L7mOF51CCtTVAeiUn3DKHtdbpPxhKOR3Jl5aLGH5ZX2DbRlOHfD0PwjrPK\ndR1SkIJh+u1484E7hjgcnBUbJUXqGy3foNGRwKPZAoGAZ1Ig6vyIbZk9Lnh08COS\nOQ6JrTEdDCfr1i3CapHAW+rN6oHlM+S7PmzTFuxrWhGAHDWDOBczrPa+ohUAmLWa\niSJDC+bBJvj/L0jD4qIm39ifDCSyZfoAkshvpEPe010tw3IuO64pV9wowbwyu+wN\nieOoIE/RPaDtfFb2IZG7pGA=\n-----END PRIVATE KEY-----\n", - "client_email": "csv-updater@durable-sky-396700.iam.gserviceaccount.com", - "client_id": "116349894744257971396", - "auth_uri": "https://accounts.google.com/o/oauth2/auth", - "token_uri": "https://oauth2.googleapis.com/token", - "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", - "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/csv-updater%40durable-sky-396700.iam.gserviceaccount.com", - "universe_domain": "googleapis.com" - } +class SpreadsheetUpdater: SPREADSHEET_NAME = "PDB-DEV_ChatGPT" - - def __init__(self): - self.gc = gspread.service_account_from_dict(type(self).credentials) - sh = self.gc.open(type(self).SPREADSHEET_NAME) - self.worksheet = sh.get_worksheet(0) + SCHEMA = [ + "DOI", + "Title", + "date of publishing", + "date of analysis", + "authors", + "classification", + "methods used", + "software", + ] + + def 
__init__(self): + self.connect() + self.spreadsheet = self.client.open(type(self).SPREADSHEET_NAME) + self.worksheet = self.spreadsheet.get_worksheet(0) + + def connect(self): + try: + secret_file = os.path.join(os.getcwd(), "google_sheets_credentials.json") + self.client = gspread.service_account(secret_file) + except OSError as e: + print(e) def append_row(self, row: [str]): - ''' - Adds a row to the spreadsheet, must follow schema: - ['DOI', 'Title', 'date of publishing', 'date of analysis', 'authors', 'classification', 'methods used', 'software'] - ''' + """ + Adds a row to the spreadsheet, must follow SCHEMA: + """ self._check_row(row) self.worksheet.append_row(row) - + def append_rows(self, rows: [[str]]): - ''' - Adds a list of rows to the spreadsheet, each row must follow schema: - ['DOI', 'Title', 'date of publishing', 'date of analysis', 'authors', 'classification', 'methods used', 'software'] - ''' + """ + Adds a list of rows to the spreadsheet, each row must follow SCHEMA: + """ for row in rows: self._check_row(row) self.worksheet.append_rows(rows) - - @staticmethod - def _check_row(row: [str]): - if len(row) != 8: - raise ValueError("Row must have 8 fields in the order specified") def notify_arthur(self, message: str): - self.sh.share('aozalevsky@gmail.com', perm_type='user', role='writer', notify=True, email_message=message) + """ + Args: + message (str): _description_ + """ + self.spreadsheet.share( + "aozalevsky@gmail.com", + perm_type="user", + role="writer", + notify=True, + email_message=message, + ) + @staticmethod + def _check_row(row: [str]): + if len(row) != len(SpreadsheetUpdater.SCHEMA): + raise ValueError( + f"Row must have {len(SpreadsheetUpdater.SCHEMA)} fields in the order specified\n{SpreadsheetUpdater.SCHEMA}" + ) def main(): spread = SpreadsheetUpdater() - dummy_row = ['DOI', 'Title', 'date of publishing', 'date of analysis', 'authors', 'classification', 'methods used', 'software'] + dummy_row = [ + "DOI", + "Title", + "date of publishing", + "date of analysis", + "authors", + "classification", + "methods used", + "software", + ] spread.append_row(dummy_row) spread.append_rows([dummy_row, dummy_row, dummy_row]) - spread.notify_arthur("testing out my dope code") + # spread.notify_arthur("testing out the code") + -if __name__ == '__main__': - main() \ No newline at end of file +if __name__ == "__main__": + main() From 6522b4bf3d110d435d6021dd993975314d457ffd Mon Sep 17 00:00:00 2001 From: Michael Antoun <antounmichael@yahoo.com> Date: Fri, 10 Nov 2023 22:09:04 -0800 Subject: [PATCH 02/31] added comments --- google_sheets.py | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/google_sheets.py b/google_sheets.py index 522f8bd..123e117 100644 --- a/google_sheets.py +++ b/google_sheets.py @@ -1,8 +1,12 @@ import os import gspread +import typing - -class SpreadsheetUpdater: +class SheetsApiClient: + """interface for all functionality with google sheets + enables connection, append, and notification + """ + SPREADSHEET_NAME = "PDB-DEV_ChatGPT" SCHEMA = [ "DOI", @@ -21,6 +25,8 @@ def __init__(self): self.worksheet = self.spreadsheet.get_worksheet(0) def connect(self): + """connects to Google Sheets API service using private key file + """ try: secret_file = os.path.join(os.getcwd(), "google_sheets_credentials.json") self.client = gspread.service_account(secret_file) @@ -43,9 +49,9 @@ def append_rows(self, rows: [[str]]): self.worksheet.append_rows(rows) def notify_arthur(self, message: str): - """ + """Shares the 
spreadsheet with arthur, along with the message in an email Args: - message (str): _description_ + message (str): """ self.spreadsheet.share( "aozalevsky@gmail.com", @@ -56,15 +62,24 @@ def notify_arthur(self, message: str): ) @staticmethod - def _check_row(row: [str]): - if len(row) != len(SpreadsheetUpdater.SCHEMA): + def _check_row(row: []): + """Checks row + + Args: + row ([]): row of values to be added to worksheet + + Raises: + ValueError: number of values in rows doesn't match schema + """ + if len(row) != len(SheetsApiClient.SCHEMA): raise ValueError( - f"Row must have {len(SpreadsheetUpdater.SCHEMA)} fields in the order specified\n{SpreadsheetUpdater.SCHEMA}" + f"Row must have {len(SheetsApiClient.SCHEMA)} fields in the order specified\n{SheetsApiClient.SCHEMA}" ) def main(): - spread = SpreadsheetUpdater() + # some test code which initializes the client, then appends rows to the worksheet, then pings arthur + spread = SheetsApiClient() dummy_row = [ "DOI", "Title", From d54a6be13ed34a870482c346207b877d6aa28539 Mon Sep 17 00:00:00 2001 From: Michael Antoun <antounmichael@yahoo.com> Date: Fri, 10 Nov 2023 22:16:31 -0800 Subject: [PATCH 03/31] protected main function from import --- context_retrieve.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/context_retrieve.py b/context_retrieve.py index aa7c5d2..f4c4fc2 100644 --- a/context_retrieve.py +++ b/context_retrieve.py @@ -120,5 +120,6 @@ def main(): faissIndex = FAISS.from_embeddings(text_embeddings=txt_embs, embedding=emb) inp_query = "Does the paper report a new structure of a biomolecule or biomolecular complex modeled using experimental data" Query(inp_query, faissIndex) - -main() \ No newline at end of file + +if __name__ == '__main__': + main() \ No newline at end of file From fcd652719bc92d3fe876f257b4abee2bf5be4e17 Mon Sep 17 00:00:00 2001 From: Michael Antoun <antounmichael@yahoo.com> Date: Fri, 10 Nov 2023 22:46:39 -0800 Subject: [PATCH 04/31] added flag option --- VectorDatabase.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/VectorDatabase.py b/VectorDatabase.py index 395bdc6..adf1921 100644 --- a/VectorDatabase.py +++ b/VectorDatabase.py @@ -231,6 +231,8 @@ def getAllFragmentsOfPublication(self, id): """ Retrieves unread publications from the 'publications' table. + Parameters: + - delete_unread_entries: bool, decides if entries are deleted from the "unread" table Returns: - List[Publication], a list of Publication objects representing the unread publications. Notes: @@ -238,7 +240,7 @@ def getAllFragmentsOfPublication(self, id): - Clears the 'unread' table after retrieving the unread publications. 
""" - def getUnreadPublications(self): + def getUnreadPublications(self, delete_unread_entries=True): conn = self.conn cursor = conn.cursor() @@ -247,7 +249,9 @@ def getUnreadPublications(self): publications = cursor.fetchall() - cursor.execute('DELETE FROM unread;') + if delete_unread_entries: + cursor.execute('DELETE FROM unread;') + conn.commit() cursor.close() From 99cb90e790e41340febb205e7211ed0fe69b360f Mon Sep 17 00:00:00 2001 From: Michael Antoun <antounmichael@yahoo.com> Date: Fri, 10 Nov 2023 23:37:30 -0800 Subject: [PATCH 05/31] combined classes into one file --- VectorDatabase.py | 4 +--- fragment.py => database_entities.py | 16 ++++++++++++++++ publication.py | 15 --------------- 3 files changed, 17 insertions(+), 18 deletions(-) rename fragment.py => database_entities.py (58%) delete mode 100644 publication.py diff --git a/VectorDatabase.py b/VectorDatabase.py index adf1921..9447abb 100644 --- a/VectorDatabase.py +++ b/VectorDatabase.py @@ -1,7 +1,5 @@ import psycopg2 -from fragment import Fragment -from publication import Publication - +from database_entities import Fragment, Publication # Lantern class that exposes functionality of database to application class Lantern: diff --git a/fragment.py b/database_entities.py similarity index 58% rename from fragment.py rename to database_entities.py index fad1193..9de5295 100644 --- a/fragment.py +++ b/database_entities.py @@ -1,3 +1,19 @@ +# Class to represent a publication with attributes id, title, pmc, pubmed, and doi +class Publication: + + id = "" + title = "" + pmc = "" + pubmed = "" + doi = "" + + def __init__(self, id, title, pmc, pubmed, doi): + self.id = id # (DOI) Unique identifier for the publication + self.title = title # Title of the publication + self.pmc = pmc # PubMed Central (PMC) Link + self.pubmed = pubmed # PubMed Link + self.doi = doi # Digital Object Identifier (DOI) Link for the publication + # Class to represent a fragment of a publication with attributes id, header, content, and vector class Fragment: diff --git a/publication.py b/publication.py deleted file mode 100644 index 5cb96a8..0000000 --- a/publication.py +++ /dev/null @@ -1,15 +0,0 @@ -# Class to represent a publication with attributes id, title, pmc, pubmed, and doi -class Publication: - - id = "" - title = "" - pmc = "" - pubmed = "" - doi = "" - - def __init__(self, id, title, pmc, pubmed, doi): - self.id = id # (DOI) Unique identifier for the publication - self.title = title # Title of the publication - self.pmc = pmc # PubMed Central (PMC) Link - self.pubmed = pubmed # PubMed Link - self.doi = doi # Digital Object Identifier (DOI) Link for the publication From 186f468d9ab1529c06a393e285cf9e726b65f266 Mon Sep 17 00:00:00 2001 From: Michael Antoun <antounmichael@yahoo.com> Date: Sat, 11 Nov 2023 00:03:35 -0800 Subject: [PATCH 06/31] added convenience function --- VectorDatabase.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/VectorDatabase.py b/VectorDatabase.py index 9447abb..6d6a799 100644 --- a/VectorDatabase.py +++ b/VectorDatabase.py @@ -283,3 +283,23 @@ def publicationExists(self, id): cursor.close() return count[0] == 1 + + """ + Fetches the content and embeddings of a publication by id + Parameters: + - id: Text, the unique identifier of the publication. 
+ Returns: + - [(text, embedding)] content of a publication's embeddings + Notes: + """ + def get_embeddings_for_pub(self, id): + texts = [] + embeddings = [] + if not self.publicationExists(id): + return + fragments = self.getAllFragmentsOfPublication(id) + for fragment in fragments: + texts.append(fragment.content) + embeddings.append(fragment.vector) + text_embeddings = list(zip(texts, embeddings)) + return text_embeddings \ No newline at end of file From 1f7d85e78c02883f7bc561c0635e05ac2477c9ca Mon Sep 17 00:00:00 2001 From: Michael Antoun <antounmichael@yahoo.com> Date: Sat, 11 Nov 2023 01:08:08 -0800 Subject: [PATCH 07/31] added code to handle analysis --- analysis.py | 180 +++++++++++++++++++++++++++++ hackathon_runner.py | 270 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 450 insertions(+) create mode 100644 analysis.py create mode 100644 hackathon_runner.py diff --git a/analysis.py b/analysis.py new file mode 100644 index 0000000..c71e800 --- /dev/null +++ b/analysis.py @@ -0,0 +1,180 @@ + +from VectorDatabase import Lantern +from database_entities import Publication, Fragment +from google_sheets import SheetsApiClient + +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain.chat_models import ChatOpenAI +from langchain.chains import RetrievalQA +from langchain import PromptTemplate + +class DocumentAnalyzer: + """sfdaf + """ + + keywords_groups = { + 'CX-MS': ['cross-link', 'crosslink', 'XL-MS', 'CX-MS', 'CL-MS', 'XLMS', 'CXMS', 'CLMS', "chemical crosslinking mass spectrometry", 'photo-crosslinking', 'crosslinking restraints', 'crosslinking-derived restraints', 'chemical crosslinking', 'in vivo crosslinking', 'crosslinking data'], + 'HDX': ['Hydrogen–deuterium exchange mass spectrometry', 'Hydrogen/deuterium exchange mass spectrometry' 'HDX', 'HDXMS', 'HDX-MS'], + 'EPR': ['electron paramagnetic resonance spectroscopy', 'EPR', 'DEER', "Double electron electron resonance spectroscopy"], + 'FRET': ['FRET', "forster resonance energy transfer", "fluorescence resonance energy transfer"], + 'AFM': ['AFM', "atomic force microscopy" ], + 'SAS': ['SAS', 'SAXS', 'SANS', "Small angle solution scattering", "solution scattering", "SEC-SAXS", "SEC-SAS", "SASBDB", "Small angle X-ray scattering", "Small angle neutron scattering"], + '3DGENOME': ['HiC', 'Hi-C', "chromosome conformation capture"], + 'Y2H': ['Y2H', "yeast two-hybrid"], + 'DNA_FOOTPRINTING': ["DNA Footprinting", "hydroxyl radical footprinting"], + 'XRAY_TOMOGRAPHY': ["soft x-ray tomography"], + 'FTIR': ["FTIR", "Infrared spectroscopy", "Fourier-transform infrared spectroscopy"], + 'FLUORESCENCE': ["Fluorescence imaging", "fluorescence microscopy", "TIRF"], + 'EVOLUTION': ['coevolution', "evolutionary covariance"], + 'PREDICTED': ["predicted contacts"], + 'INTEGRATIVE': ["integrative structure", "hybrid structure", "integrative modeling", "hybrid modeling"], + 'SHAPE': ['Hydroxyl Acylation analyzed by Primer Extension'] + } + + def __init__(self): + # self.lantern = Lantern() + self.sheets = SheetsApiClient() + + + def process_publications(self, publications: [Publication]): + """takes a list of publications, applies retrievalQA and processes responses + NOTE: completely untested, just refactored code from hackathon + + Args: + publications ([]): list of publications + """ + query = [f"You are reading a materials and methods section of a scientific paper. Here is the list of structural biology methods {methods_string}.\n\n Did the authors use any methods from the list? 
\n\n Answer with Yes or No followed by the names of the methods."] + + rows = [] + hits = 0 + for pub in publications: + text_embeddings = self.lantern.get_embeddings_for_pub(pub.id) + classification, response = 0, '' + if self.paper_about_cryoem(text_embeddings): + classification, response = self.analyze_publication(text_embeddings) + hits += classification + else: + #print('paper not about cryo-em') + pass + rows.append([pub.doi, pub.title, "11-2-2023", "11-5-2023", "", int(classification), response, ""]) + + self.update_spreadsheet(rows, hits) + + def update_spreadsheet(rows: [], hits: int, notify=True): + """pushes a list of rows to the spreadsheet and notifies via email + + Args: + rows ([]): rows of data to be uploaded to sheet + hits (int): number of positive classifications in the rows + notify (bool): notify via email if True + """ + + if hits > len(rows): + raise ValueError(f"Number of hits ({hits}) is greater than the number of entries ({len(rows)}), sus") + + #print(rows) + self.sheets.append_rows(rows) + msg = f""" + This batch of paper analysis has concluded. + {len(rows)} papers were analyzed in total over the date range 11/2 - 11/3 + {hits} {"were" if ((hits>0) or (hits == 0)) else was} classified as having multi-method structural data""" + + if notify: + sheets.notify_arthur(message=msg) + + + def analyze_publication(self, publication: Publication): + """leaving this blank for now because i think the way these are stored is changing + + Args: + publication (Publication): publication to be analyzed + + Returns: + bool: classification of response to query as positive (True) or negative (False) + str: response from chatGPT + """ + #faissIndex = FAISS.from_embeddings(text_embeddings=text_embeddings, embedding=open_ai_emb) + #result = llm.evaluate_queries(faissIndex, query) + response = None + return self.classify_response(response), response + + @staticmethod + def classify_response(response: str): + """converting text response from GPT into boolean + + Args: + response (str): response from ChatGPT to the query + + Returns: + bool: True if answer to question is "yes" + """ + if result == None: + return False + # this was used to filter out cases where ChatGPT said "Yes, Cryo-EM was used..." 
which is wrong because we asked it about + # inclusion of non-cryo-em stuff + #if "cryo" in response.lower(): + # return (False, None) + return response.lower().startswith('yes') + + @staticmethod + def paper_about_cryoem(text_embeddings: []): + """checks if the string "cryoem" or "cryo-em" is present in the text + + Args: + text_embeddings [(text, embedding)]: text and embeddings of a publication + + Returns: + bool: True if the text mentions cryo-em + """ + return any(re.search("cryo-?em", text, re.IGNORECASE) for text, _ in embeddings) + + @staticmethod + def methods_string(): + methods_string = '' + for i, (k, v) in enumerate(DocumentAnalyzer.keywords_groups.items()): + if i > 0: + methods_string += ' or ' + methods_string += f'{k} ({", ".join(v)})' + return methods_string + + +class LlmHandler: + """pulled this straight from the hackathon code, should work though + """ + + def __init__(self): + self.text_splitter = RecursiveCharacterTextSplitter(separators = ["\n\n", "\n", ".", ","], chunk_size=300, chunk_overlap=100) + self.llm=ChatOpenAI( + temperature=0, model_name="gpt-4", max_tokens=300, request_timeout = 30, max_retries=3 + ) + + + def evaluate_queries(self, embedding, queries): + chatbot = RetrievalQA.from_chain_type( + llm=self.llm, + chain_type="stuff", + retriever=embedding.as_retriever(search_type="similarity", search_kwargs={"k":3}) + ) + + template = """ {query}? """ + response = [] + for q in queries: + prompt = PromptTemplate( + input_variables=["query"], + template=template, + ) + + response.append(chatbot.run( + prompt.format(query=q) + )) + return response + + + + +def main(): + x = DocumentAnalyzer() + l = LlmHandler() + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/hackathon_runner.py b/hackathon_runner.py new file mode 100644 index 0000000..5acdc64 --- /dev/null +++ b/hackathon_runner.py @@ -0,0 +1,270 @@ + +import os +import pandas as pd +import PyPDF2 +from paperscraper.pdf import save_pdf +from paperscraper.get_dumps import biorxiv + +from fragment import Fragment +from publication import Publication +from VectorDatabase import Lantern +import openai +from langchain.document_loaders.csv_loader import CSVLoader +from langchain.embeddings.openai import OpenAIEmbeddings +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain.vectorstores import FAISS +from langchain.document_loaders import TextLoader + +from langchain.embeddings.openai import OpenAIEmbeddings +from langchain.vectorstores import FAISS +from langchain.chat_models import ChatOpenAI +from langchain.chains import RetrievalQA +from langchain import PromptTemplate +import PyPDF2 + +keywords_groups = { + 'CX-MS': ['cross-link', 'crosslink', 'XL-MS', 'CX-MS', 'CL-MS', 'XLMS', 'CXMS', 'CLMS', "chemical crosslinking mass spectrometry", 'photo-crosslinking', 'crosslinking restraints', 'crosslinking-derived restraints', 'chemical crosslinking', 'in vivo crosslinking', 'crosslinking data'], + 'HDX': ['Hydrogen–deuterium exchange mass spectrometry', 'Hydrogen/deuterium exchange mass spectrometry' 'HDX', 'HDXMS', 'HDX-MS'], + 'EPR': ['electron paramagnetic resonance spectroscopy', 'EPR', 'DEER', "Double electron electron resonance spectroscopy"], + 'FRET': ['FRET', "forster resonance energy transfer", "fluorescence resonance energy transfer"], + 'AFM': ['AFM', "atomic force microscopy" ], + 'SAS': ['SAS', 'SAXS', 'SANS', "Small angle solution scattering", "solution scattering", "SEC-SAXS", "SEC-SAS", "SASBDB", "Small angle X-ray scattering", "Small angle 
neutron scattering"], + '3DGENOME': ['HiC', 'Hi-C', "chromosome conformation capture"], + 'Y2H': ['Y2H', "yeast two-hybrid"], + 'DNA_FOOTPRINTING': ["DNA Footprinting", "hydroxyl radical footprinting"], + 'XRAY_TOMOGRAPHY': ["soft x-ray tomography"], + 'FTIR': ["FTIR", "Infrared spectroscopy", "Fourier-transform infrared spectroscopy"], + 'FLUORESCENCE': ["Fluorescence imaging", "fluorescence microscopy", "TIRF"], + 'EVOLUTION': ['coevolution', "evolutionary covariance"], + 'PREDICTED': ["predicted contacts"], + 'INTEGRATIVE': ["integrative structure", "hybrid structure", "integrative modeling", "hybrid modeling"], + 'SHAPE': ['Hydroxyl Acylation analyzed by Primer Extension'] +} + +import re + +class LlmHandler: + + def __init__(self): + self.text_splitter = RecursiveCharacterTextSplitter(separators = ["\n\n", "\n", ".", ","], chunk_size=300, chunk_overlap=100) + self.llm=ChatOpenAI( + openai_api_key=openai_api_key, + temperature=0, model_name="gpt-4", max_tokens=300, request_timeout = 30, max_retries=3 + ) + + + def evaluate_queries(self, embedding, queries): + chatbot = RetrievalQA.from_chain_type( + llm=self.llm, + chain_type="stuff", + retriever=embedding.as_retriever(search_type="similarity", search_kwargs={"k":3}) + ) + + template = """ {query}? """ + response = [] + for q in queries: + prompt = PromptTemplate( + input_variables=["query"], + template=template, + ) + + response.append(chatbot.run( + prompt.format(query=q) + )) + return response + + +llm = LlmHandler() + +methods_string = '' +for i, (k, v) in enumerate(keywords_groups.items()): + if i > 0: + methods_string += ' or ' + methods_string += f'{k} ({", ".join(v)})' + + +def get_embeddings(fname): + """ + """ + loader = TextLoader(fname) + documents = loader.load() + text_splitter = RecursiveCharacterTextSplitter(separators = ["\n\n", "\n", ".", ","],chunk_size = 300, chunk_overlap=100) + docs = text_splitter.split_documents(documents) + + emb = OpenAIEmbeddings() + input_texts = [d.page_content for d in docs] + + input_embeddings = emb.embed_documents(input_texts) + text_embeddings = list(zip(input_texts, input_embeddings)) + return text_embeddings, emb + +def retreiveTextFromPdf(inp_file): + + + json = pd.read_json(path_or_buf=inp_file, lines=True) + lantern = Lantern() + + for n, doi in enumerate(json['doi']): + #print(n, doi) + + + ##NOTE: This is for example purpose only + if n > 0: + break + + if lantern.publicationExists(doi): + continue + + paper_data = {'doi': doi} + doi = doi.replace("/", "-") + pdf_dir = './papers/' + if not os.path.exists(pdf_dir): + os.mkdir(pdf_dir) + + pdfsavefile='./papers/' + doi +'.pdf' + save_pdf(paper_data, filepath=pdfsavefile) + + # creating a pdf reader object + reader = PyPDF2.PdfReader(pdfsavefile) + save_txt_path = 'scrapped_txts/' + if not os.path.exists(save_txt_path): + os.mkdir(save_txt_path) + extract_text = '' + for page in reader.pages: + extract_text+=page.extract_text() + + txt_file = str('{}.txt'.format(doi)) + with open(save_txt_path+txt_file, 'w') as file: + file.write(extract_text) + + + txt_embs, emb = get_embeddings(save_txt_path+txt_file) + + fragments = [] + for txt, embs in txt_embs: + fragment = Fragment(doi, 'methods', txt, embs) + fragments.append(fragment) + + title = "" + pmc = "" + pubmed = "" + + publication = Publication(doi, title, pmc, pubmed, doi) + + lantern.insertEmbeddings(fragments) + lantern.insertPublication(publication) + + os.remove(pdfsavefile) + + +def add_publication_by_doi(doi): + lantern = Lantern() + if lantern.publicationExists(doi): + 
return + + paper_data = {'doi': doi} + doi = doi.replace("/", "-") + pdf_dir = './papers/' + if not os.path.exists(pdf_dir): + os.mkdir(pdf_dir) + + pdfsavefile='./papers/' + doi +'.pdf' + save_pdf(paper_data, filepath=pdfsavefile) + + # creating a pdf reader object + reader = PyPDF2.PdfReader(pdfsavefile) + save_txt_path = 'scrapped_txts/' + if not os.path.exists(save_txt_path): + os.mkdir(save_txt_path) + extract_text = '' + for page in reader.pages: + extract_text+=page.extract_text() + + txt_file = str('{}.txt'.format(doi)) + with open(save_txt_path+txt_file, 'w') as file: + file.write(extract_text) + + + txt_embs, emb = get_embeddings(save_txt_path+txt_file) + + fragments = [] + for txt, embs in txt_embs: + fragment = Fragment(doi, 'methods', txt, embs) + fragments.append(fragment) + + title = "" + pmc = "" + pubmed = "" + + publication = Publication(doi, title, pmc, pubmed, doi) + + lantern.insertEmbeddings(fragments) + lantern.insertPublication(publication) + #print(fragments) + os.remove(pdfsavefile) + + +def process_result(result): + if result == None: + return (False, None) + for response in result: + if "cryo" in response.lower(): + return (False, None) + return (response.lower().startswith('yes'), response) + +lantern = Lantern() +def get_embeddings_for_pub(id): + input_texts = [] + input_embeddings = [] + if lantern.publicationExists(id): + fragments = lantern.getAllFragmentsOfPublication(id) + for fragment in fragments: + input_texts.append(fragment.content) + input_embeddings.append(fragment.vector) + text_embeddings = list(zip(input_texts, input_embeddings)) + return text_embeddings + +def main(): + open_ai_emb = OpenAIEmbeddings() + #add_publication_by_doi('10.1101/2023.10.31.564925') + #add_publication_by_doi('10.1101/2023.03.03.531047') + query = [f"You are reading a materials and methods section of a scientific paper. Here is the list of structural biology methods {methods_string}.\n\n Did the authors use any methods from the list? \n\n Answer with Yes or No followed by the names of the methods."] + lantern = Lantern() + publications = lantern.getUnreadPublication() + + all_results = [] + rows = [] + hits = 0 + for pub in publications[5:]: + text_embeddings = get_embeddings_for_pub(pub.id) + flag = False + for text, _ in text_embeddings: + if re.search("cryo-?em", text, re.IGNORECASE): + flag = True + break + if flag: + faissIndex = FAISS.from_embeddings(text_embeddings=text_embeddings, embedding=open_ai_emb) + result = llm.evaluate_queries(faissIndex, query) + classification, response = process_result(result) + hits += classification + else: + classification, response = process_result(None) + #print('paper not about cryo-em') + rows.append([pub.doi, pub.title, "11-2-2023", "11-5-2023", "", int(classification), response, ""]) + + from google_sheets import SpreadsheetUpdater + gs = SpreadsheetUpdater() + print(rows) + gs.append_rows(rows) + msg = f""" + This batch of paper analysis has concluded. 
+ {len(rows)} papers were analyzed in total over the date range 11/2 - 11/3 + {hits} {"were" if ((hits>0) or (hits == 0)) else was} classified as having multi-method structural data +""" + print(msg) + gs.notify_arthur(message=msg) + + +main() + + From 0debdf0c29096498d47275c684a7918a1b097eb4 Mon Sep 17 00:00:00 2001 From: Michael Antoun <antounmichael@yahoo.com> Date: Sat, 11 Nov 2023 01:11:19 -0800 Subject: [PATCH 08/31] logic fix --- analysis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analysis.py b/analysis.py index c71e800..49f90ad 100644 --- a/analysis.py +++ b/analysis.py @@ -77,7 +77,7 @@ def update_spreadsheet(rows: [], hits: int, notify=True): msg = f""" This batch of paper analysis has concluded. {len(rows)} papers were analyzed in total over the date range 11/2 - 11/3 - {hits} {"were" if ((hits>0) or (hits == 0)) else was} classified as having multi-method structural data""" + {hits} {"were" if (hits != 1) else "was"} classified as having multi-method structural data""" if notify: sheets.notify_arthur(message=msg) From aed0f64961c91c1e4ce7eb05f489074a0f00b61b Mon Sep 17 00:00:00 2001 From: Michael Antoun <antounmichael@yahoo.com> Date: Sat, 11 Nov 2023 01:23:55 -0800 Subject: [PATCH 09/31] edited docstring --- analysis.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/analysis.py b/analysis.py index 49f90ad..aa60539 100644 --- a/analysis.py +++ b/analysis.py @@ -9,7 +9,8 @@ from langchain import PromptTemplate class DocumentAnalyzer: - """sfdaf + """Takes in a list of publications to analyze, then prompts the chatbot, processes the response, aggregates the results, + and reports the results to the spreadsheet """ keywords_groups = { From 9aee017fe60e2071b8d0e151d1ee39954c10c0f4 Mon Sep 17 00:00:00 2001 From: Arthur Zalevsky <aozalevsky@gmail.com> Date: Sat, 11 Nov 2023 11:38:45 -0800 Subject: [PATCH 10/31] Moved all tests to a dedicated folder --- latern_test.ipynb => tests/latern_test.ipynb | 0 test.py => tests/test.py | 0 testing.py => tests/testing.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename latern_test.ipynb => tests/latern_test.ipynb (100%) rename test.py => tests/test.py (100%) rename testing.py => tests/testing.py (100%) diff --git a/latern_test.ipynb b/tests/latern_test.ipynb similarity index 100% rename from latern_test.ipynb rename to tests/latern_test.ipynb diff --git a/test.py b/tests/test.py similarity index 100% rename from test.py rename to tests/test.py diff --git a/testing.py b/tests/testing.py similarity index 100% rename from testing.py rename to tests/testing.py From d6517e8af08822e863a1072a3dfaa69d328259b4 Mon Sep 17 00:00:00 2001 From: Arthur Zalevsky <aozalevsky@gmail.com> Date: Sat, 11 Nov 2023 11:43:05 -0800 Subject: [PATCH 11/31] Create pylint.yml --- .github/workflows/pylint.yml | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 .github/workflows/pylint.yml diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml new file mode 100644 index 0000000..383e65c --- /dev/null +++ b/.github/workflows/pylint.yml @@ -0,0 +1,23 @@ +name: Pylint + +on: [push] + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.8", "3.9", "3.10"] + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pylint + - name: 
Analysing the code with pylint + run: | + pylint $(git ls-files '*.py') From bc2f629f4e8a503adf6ba2eedab1d28c699be5ae Mon Sep 17 00:00:00 2001 From: Arthur Zalevsky <aozalevsky@gmail.com> Date: Sat, 11 Nov 2023 11:51:17 -0800 Subject: [PATCH 12/31] Moved db dump/restore script to the README.md --- get_database.sh | 3 --- 1 file changed, 3 deletions(-) delete mode 100755 get_database.sh diff --git a/get_database.sh b/get_database.sh deleted file mode 100755 index 3790b3f..0000000 --- a/get_database.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -sudo -u postgres pg_dump structdb > structdb.sql \ No newline at end of file From 7a5ac99b9088144c2d3e1195214ea2db67985061 Mon Sep 17 00:00:00 2001 From: Arthur Zalevsky <aozalevsky@gmail.com> Date: Sat, 11 Nov 2023 11:56:48 -0800 Subject: [PATCH 13/31] Moved ipynbs to test, because they were used for testing purposes and should be cleaned/deleted --- ScrapperPipeline.ipynb => tests/ScrapperPipeline.ipynb | 0 faiss.ipynb => tests/faiss.ipynb | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename ScrapperPipeline.ipynb => tests/ScrapperPipeline.ipynb (100%) rename faiss.ipynb => tests/faiss.ipynb (100%) diff --git a/ScrapperPipeline.ipynb b/tests/ScrapperPipeline.ipynb similarity index 100% rename from ScrapperPipeline.ipynb rename to tests/ScrapperPipeline.ipynb diff --git a/faiss.ipynb b/tests/faiss.ipynb similarity index 100% rename from faiss.ipynb rename to tests/faiss.ipynb From 812f69ecbfcdfe884fab49124761ca4468052860 Mon Sep 17 00:00:00 2001 From: Arthur Zalevsky <aozalevsky@gmail.com> Date: Sat, 11 Nov 2023 12:55:51 -0800 Subject: [PATCH 14/31] Moved prompts and methods keywords to a dedicated file --- prompts.py | 150 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 prompts.py diff --git a/prompts.py b/prompts.py new file mode 100644 index 0000000..d3196d1 --- /dev/null +++ b/prompts.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +"""A collection of various PDB-related prompts +""" + +# A list of abbreviated names and synonyms +# for various biophysical methonds +# that are typically used for integrative modeling + +METHODS_KEYWORDS = { + 'CX-MS': [ + 'cross-link', 'crosslink', + 'XL-MS', 'CX-MS', 'CL-MS', 'XLMS', 'CXMS', 'CLMS', + "chemical crosslinking mass spectrometry", + 'photo-crosslinking', 'crosslinking restraints', + 'crosslinking-derived restraints', 'chemical crosslinking', + 'in vivo crosslinking', 'crosslinking data', + ], + + 'HDX': [ + 'Hydrogen–deuterium exchange mass spectrometry', + 'Hydrogen/deuterium exchange mass spectrometry' + 'HDX', 'HDXMS', 'HDX-MS', + ], + + 'EPR': [ + 'electron paramagnetic resonance spectroscopy', + 'EPR', 'DEER', + "Double electron electron resonance spectroscopy", + ], + + 'FRET': [ + 'FRET', + "forster resonance energy transfer", + "fluorescence resonance energy transfer", + ], + + 'AFM': [ + 'AFM', "atomic force microscopy", + ], + + 'SAS': [ + 'SAS', 'SAXS', 'SANS', "Small angle solution scattering", + "solution scattering", "SEC-SAXS", "SEC-SAS", "SASBDB", + "Small angle X-ray scattering", "Small angle neutron scattering", + ], + + '3DGENOME': [ + 'HiC', 'Hi-C', "chromosome conformation capture", + ], + + 'Y2H': [ + 'Y2H', + "yeast two-hybrid", + ], + + 'DNA_FOOTPRINTING': [ + "DNA Footprinting", + "hydroxyl radical footprinting", + ], + + 'XRAY_TOMOGRAPHY': [ + "soft x-ray tomography", + ], + + 'FTIR': [ + "FTIR", "Infrared spectroscopy", + "Fourier-transform infrared spectroscopy", + ], + + 
'FLUORESCENCE': [ + "Fluorescence imaging", + "fluorescence microscopy", "TIRF", + ], + + 'EVOLUTION': [ + 'coevolution', "evolutionary covariance", + ], + + 'PREDICTED': [ + "predicted contacts", + ], + + 'INTEGRATIVE': [ + "integrative structure", "hybrid structure", + "integrative modeling", "hybrid modeling", + ], + + 'SHAPE': [ + 'Hydroxyl Acylation analyzed by Primer Extension', + ] +} + + +def keywords_dict_to_string(keywords: dict) -> str: + """ + Convert dictionary with method keywords and synonyms + to a string + + Example: + + keywords = { + 'AFM': [ + 'AFM', "atomic force microscopy", + ], + + 'SAS': [ + 'SAS', "solution scattering", + ], + } + + Result: + + 'AFM (AFM, atomic force microscopy) or SAS (SAS, solution scattering)' + """ + + methods_string = '' + for i, (k, v) in enumerate(keywords.items()): + if i > 0: + methods_string += ' or ' + methods_string += f'{k} ({", ".join(v)})' + + return methods_string + +def get_qbi_hackathon_prompt(keywords: dict) -> str: + """ + Returns a prompt that was initially developed + during the QBI Hackathon. + """ + + if len(keywords) == 0: + raise(ValueError("Keywords dict can't be empty")) + + methods_string = keywords_dict_to_string(keywords) + + prompt = ( + "You are reading a materials and methods section " + "of a scientific paper. " + f"Here is the list of methods {methods_string}.\n\n" + "Did the authors use any of them? " + "Answer Yes or No, followed by the name(s) of methods. " + "Use only abbreviations." + ) + + return prompt + +if __name__ == '__main__': + # Just call an example function + print(get_qbi_hackathon_prompt(METHODS_KEYWORDS)) From ef9c690432524063e0088485dded43827b4c9f76 Mon Sep 17 00:00:00 2001 From: Arthur Zalevsky <aozalevsky@gmail.com> Date: Sat, 11 Nov 2023 13:01:30 -0800 Subject: [PATCH 15/31] temporary moving this file into tests, but it should soon be deleted --- updated_prompt.py => tests/test_qbi_hackathon_promt.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename updated_prompt.py => tests/test_qbi_hackathon_promt.py (100%) diff --git a/updated_prompt.py b/tests/test_qbi_hackathon_promt.py similarity index 100% rename from updated_prompt.py rename to tests/test_qbi_hackathon_promt.py From 9b4db32c01731890c2b7a4e8f77e9fc77cc14d4b Mon Sep 17 00:00:00 2001 From: Arthur Zalevsky <aozalevsky@gmail.com> Date: Sat, 11 Nov 2023 13:53:17 -0800 Subject: [PATCH 16/31] Forgot to add dump/restore commands to the README.md --- README.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e98a55a..b20c1b0 100644 --- a/README.md +++ b/README.md @@ -24,4 +24,14 @@ Latern creates the following two tables in the database: ## Usage -VectorDatabase file, which has class Latern, provides the main functionality for the vector database. For example, you can insert an embedding with the insertEmbedding(). \ No newline at end of file +VectorDatabase file, which has class Latern, provides the main functionality for the vector database. For example, you can insert an embedding with the insertEmbedding(). 
+ +## Dumping/restoring the database + +To dump the database for the backup/transfer one can use built-in Postgres command [`pg_dump`](https://www.postgresql.org/docs/current/backup-dump.html): + +`sudo -u postgres pg_dump structdb > structdb.sql` + +to restore the database from dump: + +`sudo -u postgres psql structdb < structdb.sql` From 43dab99c2eee1ee874ee38fe87b80b4d7d36c543 Mon Sep 17 00:00:00 2001 From: Arthur Zalevsky <aozalevsky@gmail.com> Date: Sat, 11 Nov 2023 14:09:14 -0800 Subject: [PATCH 17/31] This is also a deprecated dev/test snippet --- tests/context_retrieve.py | 125 ++++++++++++++++++++++++++++++++++++++ tests/input_file.txt | 1 + 2 files changed, 126 insertions(+) create mode 100644 tests/context_retrieve.py create mode 100644 tests/input_file.txt diff --git a/tests/context_retrieve.py b/tests/context_retrieve.py new file mode 100644 index 0000000..f4c4fc2 --- /dev/null +++ b/tests/context_retrieve.py @@ -0,0 +1,125 @@ +import os +import openai +from langchain.document_loaders.csv_loader import CSVLoader +from langchain.embeddings.openai import OpenAIEmbeddings +from langchain.text_splitter import CharacterTextSplitter +from langchain.vectorstores import FAISS +from langchain.document_loaders import TextLoader + +from langchain.embeddings.openai import OpenAIEmbeddings +from langchain.vectorstores import FAISS +from langchain.chat_models import ChatOpenAI +from langchain.chains import RetrievalQA +from langchain import PromptTemplate + +import re +import requests +import xml.etree.ElementTree as ET + +from fragment import Fragment +from VectorDatabase import Latern + + + +# OpenAI Setup +OPEN_API_KEY = "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" +# openai.api_key = os.getenv(openai_api_key) +os.environ['OPENAI_API_KEY'] = OPEN_API_KEY + +def getPmcPaper(pmcid): + """ + """ + url = f'https://www.ebi.ac.uk/europepmc/webservices/rest/{pmcid}/fullTextXML' + req = requests.get(url) + res = req.text + return res + +def extractMethodsFromPmcPaper(paper): + """ + """ + tree = ET.fromstring(paper) + mtext = [] + for sec in tree.iter('sec'): + for title in sec.iter('title'): + if isinstance(title.text, str): + if re.search('methods', title.text, re.IGNORECASE): + mtext.extend(list(sec.itertext())) + return " ".join(mtext) + +def preprocess(input_text): + """ + """ + processed_data = input_text.replace("\n","") + return processed_data + +def get_embeddings(fname): + """ + """ + loader = TextLoader(fname) + documents = loader.load() + text_splitter = CharacterTextSplitter(separator = ".",chunk_size = 1000, chunk_overlap=0) + + docs = text_splitter.split_documents(documents) + + emb = OpenAIEmbeddings() + input_texts = [d.page_content for d in docs] + + input_embeddings = emb.embed_documents(input_texts) + text_embeddings = list(zip(input_texts, input_embeddings)) + + return text_embeddings, emb + +def saveFassIndex(fname, sname, ): + """ + """ + txt_embs, emb = get_embeddings(docs) + faissIndex = FAISS.from_embeddings(text_embeddings=txt_embs, embedding=emb) + faissIndex.save_local(sname) + # faissIndex = FAISS.from_documents(docs, OpenAIEmbeddings()) + # faissIndex.save_local("input_doc") + +def Query(input_query, faiss_obj): + chatbot = RetrievalQA.from_chain_type( + llm=ChatOpenAI( + openai_api_key=OPEN_API_KEY, + temperature=0, model_name="gpt-3.5-turbo", max_tokens=50 + ), + chain_type="stuff", + retriever=faiss_obj.as_retriever(search_type="similarity", search_kwargs={"k":1}) + ) + template = """ {query}? 
""" + prompt = PromptTemplate( + input_variables=["query"], + template=template, + ) + print(chatbot.run( + prompt.format(query=input_query) + )) + + +def main(): + text = getPmcPaper(pmcid) + + methods_text = preprocess(extractMethodsFromPmcPaper(text)) + fname = 'input_file.txt' + sname = 'input_doc' + with open(fname, 'w') as file: + file.write(methods_text) + # print(methods_text) + txt_embs, emb = get_embeddings(fname) + + fragments = [] + for txt, embs in txt_embs: + fragment = Fragment(pmcid, 'methods', txt, embs) + fragments.append(fragment) + + latern = Latern() + latern.insertEmbeddings(fragments) + + # retreieve. PMC + faissIndex = FAISS.from_embeddings(text_embeddings=txt_embs, embedding=emb) + inp_query = "Does the paper report a new structure of a biomolecule or biomolecular complex modeled using experimental data" + Query(inp_query, faissIndex) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/tests/input_file.txt b/tests/input_file.txt new file mode 100644 index 0000000..1eea3d8 --- /dev/null +++ b/tests/input_file.txt @@ -0,0 +1 @@ +Methods Mouse line generation and validation All animal work was performed in accordance with approved Yale IACUC protocols (#2019–11167 and #2020–07271). The HACNS1 and chimpanzee ortholog lines were generated at the Yale Genome Editing Center using standard gene targeting techniques in mouse ES cells 71 . C57BL/6J- A w−J /J mouse ES cells, generated by the Yale Genome Editing Center from C57BL/6J- A w−J /J mice obtained from The Jackson Laboratory (RRID:IMSR_JAX:000051), were edited by electroporation of a GFP cloning vector containing human (1241 bp) or chimpanzee (1240 bp) sequence flanked by C57BL/6 J mouse sequence homology arms, floxed pPGKneo vector, and diphtheria toxin sequence (Supplementary Fig. 1 A) 72 . The genomic coordinates of the human (hg19; https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.13/ ), chimpanzee (panTro4; https://www.ncbi.nlm.nih.gov/assembly/GCF_000001515.6/ ), and mouse (mm9; https://www.ncbi.nlm.nih.gov/assembly/GCF_000001635.18/ ) sequences used in the editing constructs, including the mouse homology arm sequences, are listed in Supplementary Data 1 73 . Positive clones were karyotyped and only clones exhibiting a normal mouse karyotype were used for blastocyst injection. Resulting G0 chimeras were backcrossed to wild type C57BL/6 J (RRID: IMSR_JAX:000664) and crossed with an actin-Cre C57BL/6 J mouse line to remove the neo cassette. All mice used in our analysis were from F9 or later generations. Mice were maintained in a Yale Animal Resources Center (YARC) managed facility under a standard 12 h light/dark cycle and environmental monitoring according to YARC policies and procedures. Genotyping primers specific to HACNS1 , chimpanzee, and mouse orthologs are listed in Supplementary Data 10 . Cloning primers listed in Supplementary Data 10 were used to amplify edited loci for cloning and Sanger sequencing for comparison to the hg19 or panTro4 sequence. Sanger sequencing data is available at http://noonan.ycga.yale.edu/noonan_public/Dutrow_HACNS1/ . The sequence identity between the human (hg19, chr2:236773456-236774696) and chimpanzee alleles (panTro4, chr2B:241105291-241106530) is 98.2% (22 substitutions total, of which 15 are fixed in humans). Human-specific substitutions were defined as fixed if the derived allele frequency in dbSNP (v153) was >=0.9999 and if the ancestral sequence state was conserved between chimpanzee, rhesus macaque, orangutan, and marmoset. 
We provide a detailed analysis of sequence differences between the human, chimpanzee and mouse orthologs in the Supplemental Note (Supplementary Materials). HACNS1-GBX2 locus TAD coordinates (hg19 chr2:236655261-237135261) are from H1 human ES cell Hi-C data; HACNS1 and GBX2 are present in the same TAD and GBX2 is the only annotated protein-coding gene whose promoter is included in this TAD 32 . Copy number verification qPCR was performed using genomic DNA from three F9 mice from each line using Power SYBR Green Mastermix (Thermo Fisher Scientific #4368577) and the StepOnePlus Real-Time PCR System (Applied Biosystems) with primers listed in Supplementary Data 10 . All biological replicates of each genotype were run in triplicate and Ct values of each were normalized to a control region on a different chromosome (see Supplementary Data 10 ). Primary qPCR results are available as Source Data. Chromatin Immunoprecipitation, ChIP-qPCR and ChIP-seq Tissue for chromatin preparation was collected from E11.5 forelimb and hindlimb bud pairs or pharyngeal arch tissue from HACNS1 and chimpanzee ortholog line heterozygous crosses to obtain pooled, litter matched limb bud or pharyngeal arch samples for all three genotypes ( HACNS1 homozygous, chimpanzee ortholog line, and wild type). Two biological replicates were used per genotype per tissue, each with tissue pooled from three embryos. Pooled tissue was crosslinked and sonicated as previously described 74 . Chromatin for each genotype, tissue, and replicate was used for H3K27ac or H3K4me2 immunoprecipitation with 7.5 μg antibody and ~5 μg tissue per ChIP assay using Active Motif #39133 (RRID: AB_2561016) and Active Motif #39913 (RRID: AB_2614976) as previously described 74 , 75 . ChIP-qPCR was performed using Power SYBR Green Mastermix (Thermo Fisher Scientific #4368577) with primers listed in Supplementary Data 11 . Samples were sequenced (2 × 100 bp) using standard Illumina protocols on an Illumina HiSeq 4000 (RRID: SCR_016386). To control for batch effects, all samples of the same tissue type were multiplexed and sequenced on a single lane. Reference genomes edited to replace the mouse ortholog of HACNS1 with the human or chimpanzee sequence were built using Bowtie (v2.2.8; RRID: SCR_005476) 76 . ChIP-seq raw reads were aligned to the mm9, mm9 with chimpanzee ortholog, or humanized mm9 reference genome using Bowtie with -sensitive and -no-unal settings. GC composition was assessed using fastQC and showed that GC content and bias were consistent across all experiments 77 , 78 . Tag directories for each experiment were generated using makeTagDirectory in HOMER with default settings and standard normalization to 10 million tags, and were used to generate bigwig files for visualization with makeUCSCfile 23 . All peaks were called with HOMER (v4.9.1 RRID: SCR_010881) using default settings for -histone (IP vs input fold change = 4, p = 0.0001, peak size = 500, minDist = 1000) 23 . All differential peaks were called with DESeq2 implemented in HOMER using getDifferentialPeaksReplicates.pl with default settings (fold change cutoff = 2, FDR cutoff = 5%); briefly, reads from each comparison are pooled, with ChIP and inputs pooled separately, such that new peaks are called and used for quantitative comparison between genotypes 23 , 24 . The complete datasets of all peaks tested in differential analyses can be found at http://noonan.ycga.yale.edu/noonan_public/Dutrow_HACNS1/ . 
RNA extraction and RT-qPCR E11-E12 embryos were collected from six HACNS1 homozygous, chimpanzee ortholog line, or wild type litters generated by crossing homozygous animals for each line. All embryos within each genotype group were ordered based on stage (>70 total embryos) and were divided into six timepoint groups per genotype consisting of forelimb or hindlimb buds from 4-6 pooled embryos per time point per genotype per tissue. RNA was purified using the Qiagen miRNeasy Kit (#74106). Invitrogen Superscript III Reverse Transcription Kit (#18080-051) was used to prepare cDNA from each sample. qPCR with the resulting cDNA was performed using Power SYBR Green Mastermix (Thermo Fisher Scientific #4368577). All samples were analyzed in triplicate using primers listed in Supplementary Data 12 and Ct values of Gbx2 were normalized to Hprt1 . Primary RT-qPCR results are available as Source Data. Whole mount in situ hybridization E11-E12 mouse embryos were collected from HACNS1 homozygous ( n = 7 litters), chimpanzee ortholog line ( n = 8 litters), and wild type ( n = 12 litters) homozygous crosses. Embryos were fixed and hybridized with the same preparation of antisense Gbx2 mRNA probe under identical conditions as previously described 78 , 79 . The Gbx2 probe used for hybridization contains the full mouse consensus CDS sequence (CCDS15150.1); NCBI CCDS Release 23; https://www.ncbi.nlm.nih.gov/projects/CCDS/CcdsBrowse.cgi?REQUEST=ALLFIELDS&DATA=CCDS15150.1&ORGANISM=10090&BUILDS=CURRENTBUILDS (NCBI CCDS Release 23 CCDS15150.1). The 178 embryos (55 from the HACNS1 knock-in line, 52 from the chimpanzee ortholog line, and 71 from wild type) were divided into temporally-ordered sextiles within the E11-E12 window (~40–48 somites, although we did not rely on somite counts for staging) based on measurement of crown-rump length for each individual embryo 35 . For the data shown in Fig. 3B , embryos were assessed for staining pattern by three individuals blinded to genotype under a stereo microscope (Leica S6D). For the data shown in Supplementary Fig. 3A, B embryos were annotated by a single scorer blinded to genotype. The scoring scheme was based on previous studies, notably to assess whole-mount gene expression patterns as described in the VISTA Enhancer Browser ( http://enhancer.lbl.gov/ ) 10 , 34 , 36 . Embryos were assigned to one of eleven categories of Gbx2 expression pattern based on the anterior-posterior and proximal-distal localization of staining as well the intensity (strong versus weak) of staining: 1: anterior and posterior (AP); 2: anterior distal and posterior distal (APD); 3: distal (D); 4: anterior distal (AD); 5: anterior (A); 6: weak anterior and posterior (APL); 7: weak anterior (AL); 8: weak distal (DL); 9: weak anterior and posterior distal (APDL); 10: weak anterior distal (ADL); 11: no staining (N). Categories were merged for clarity in Fig. 3B in the following manner: categories 1–3: anterior and posterior; categories 4–5: anterior only; categories 6–10: weak staining. See Fig. 3B for representative images of staining patterns illustrating the scoring scheme used for qualitative assessment of expression. Representative images were taken using a Zeiss Stemi 2000-C stereomicroscope fitted with an AxioCam MRc5 digital camera and Zeiss AxioVision software. Images and associated annotations are available as Source Data. 
Single-cell RNA-sequencing Sample preparation Tissue for scRNA-seq was collected at E11.5 from two human ortholog line homozygous litters, two chimpanzee ortholog line homozygous litters, and two wild type litters. Embryos were staged as previously described in order to obtain samples from stage-matched T3 embryos from each genotype. Left hindlimb buds from three embryos per genotype per replicate were pooled. Following dissection, the tissue was immediately placed in CMFSG saline–glucose solution (1x Calcium–magnesium-free phosphate buffered saline from Thermo Fisher Scientific #21-040-CV with 0.1% glucose from Corning 45% Glucose #45001-116) on ice. Gibco TrypLE Express digestion solution was used for cellular dissociation (Thermo Fisher Scientific # 2605010). The dissociation reaction was stopped using 1xDMEM (ATCC 30–2002) with 10% heat-inactivated Fetal Bovine Serum (Sigma-Aldrich #F4135). The dissociated cells were filtered through a 40 μM strainer and harvested by centrifugation at 4 °C. Cells were washed and resuspended in 1x Calcium–magnesium-free phosphate buffered saline (Thermo Fisher Scientific #21-040-CV) with 0.04% BSA (Sigma-Aldrich #SRE0036). Cell number and viability were estimated on a Countess II Automated Cell Counter prior to library preparation of 10,000 cells (estimated cell recovery from 16,000 input cells) per sample using Chromium Single Cell 3ʹ GEM, Library & Gel Bead Kit v3 (10X Genomics PN-1000075). Libraries were sequenced (2 × 75 bp) on an Illumina HiSeq 4000 (RRID: SCR_016386). To control for batch effects, all samples were multiplexed across all lanes. Count matrices were produced from raw sequencing data using the Cell Ranger v3.0.2 package from 10X Genomics (RRID: SCR_017344). Data filtering and preprocessing Matrices from the 10x Cell Ranger platform were filtered and preprocessed using Seurat v3.0.1 (RRID: SCR_016341) 38 . Prior to the generation of Seurat objects, Xist gene counts were eliminated in order to avoid clustering by sex within mixed sample populations. Genes expressed in fewer than 5 cells per sample were removed. Cells with greater than 7.5% or 2% counts from mitochondrial genes or hemoglobin genes, respectively, were removed. Cells with total gene count (nGene) z-scores less than -1 (corresponding to ~700 or fewer detected genes) or greater than 4 (corresponding to ~6000 or greater detected genes) were removed, as were cells with total UMI count (nUMI) z-scores greater than 7 (corresponding to ~50,000 or greater detected UMIs; see Supplementary Fig. 5 ). One chimpanzee ortholog line replicate was removed during pre-processing due to high overall mitochondrial gene expression, indicative of low viability. Prior to data integration, expression values from each sample were normalized based on library size for pre-processing purposes only using the Seurat tool NormalizeData 38 . Louvain clustering as implemented in Seurat was performed for pre-processing purposes only using FindVariableFeatures, ScaleData, RunPCA, FindNeighbors, and FindClusters in order to remove endothelial cell clusters ( Cd34 -positive and Pf4 -positive), clusters characterized by aberrant mitochondrial gene expression (low mt-Co1 ), and transcriptionally distinct clusters containing fewer than 30 cells per sample 38 , 42 . The numbers of cells remaining after pre-processing for each sample are listed in Supplementary Data 13 . 
Data normalization and integration All subsequent normalization and integration steps after pre-processing were performed with raw counts for all cells retained after pre-processing (see Supplementary Data 13 ). Cell cycle scores were computed using CellCycleScoring in Seurat to regress out the difference between G2M and S phases, effectively preserving differences between cycling and non-cycling cells while reducing differences related to cell cycle amongst proliferating cells 38 . In addition to cell cycle scores, percent mitochondrial gene expression and nUMI values were regressed using SCTransform (SCT) in order to reduce the effects of sequencing depth and minor differences in mitochondrial DNA expression related to viability 38 , 80 . All SCT normalized datasets containing all genes from individual samples were integrated using SelectIntegrationFeatures, PrepSCTIntegration, FindIntegrationAnchors, and IntegrateData 38 , 80 . Following integration, the combined dataset was randomly down-sampled to contain a maximum of 10,000 cells per genotype prior to embedding and clustering using SubsetData in Seurat 38 . PCA, UMAP, and Louvain clustering were implemented in Seurat using RunPCA, RunUMAP, FindNeighbors, and FindClusters 38 , 41 . Percentages of cells belonging to each Louvain cluster are shown in Supplementary Data 13 . Normalized data from all samples combined were used for imputation using ALRA with default settings for the purposes of data visualization as shown in Fig. 4A–D , Supplementary Fig. 4A, B , and Fig. 5C, D 81 . Marker gene expression was compared between ALRA-imputed and unimputed data to establish that imputation did not substantially distort marker gene expression patterns in our dataset (Supplementary Fig. 4 , Supplementary Data 13 ). Data normalization and integration, UMAP embedding, and Louvain clustering were performed prior to imputation. The threshold for identifying Gbx2 -positive cells was set as an imputed Gbx2 expression value greater than 0.1. This threshold was also used for identifying percentages of marker gene-positive cells in unimputed and imputed data as shown in Supplementary Data 13 . All gene expression scaling and centering for visualization purposes was performed on normalized imputed or unimputed data using the Seurat ScaleData function with default parameters (scale.max = 10) 38 . MELD, MAGIC, kNN-DREMI analyses Cells belonging to mesenchymal cell clusters (clusters 1–4, see Fig. 4A, C ) from all genotypes were used for MELD, MAGIC, kNN-DREMI, and Gene Set Enrichment Analysis (GSEA). Scaled data matrices from the Seurat object integrated assay were loaded using scprep for MELD, MAGIC, and kNN-DREMI ( https://github.com/krishnaswamylab/scprep ). MELD and MAGIC both denoise scRNA-seq data using graphs to model cellular state space. The same graph signal was used for both MELD and MAGIC as calculated by graphtools (1.5.2) with n_pca = 20, decay = 40, and knn = 10. MELD was run on one-hot vectors for each genotype independently using default parameters 55 . MAGIC was performed using the same graph signal as MELD 54 . We used the kNN-DREMI implementation provided in scprep and kNN-DREMI was run on MAGIC-imputed data 53 . kNN-DREMI analysis was used in order to identify genes with expression levels associated with either Gbx2 expression in humanized hindlimb or cells with increased humanized RL as calculated using MELD. 
MAGIC was employed only for the purpose of generating denoised gene expression values for kNN-DREMI analysis of gene-gene relationships but was not used for data visualization, clustering, or sample-associated density estimation using MELD. Gene set enrichment analysis GSEA was performed using topGO v.2.34.0 (RRID: SCR_014798) on all expressed genes that were ranked by Gbx2- DREMI or humanized RL-DREMI score from the aforementioned humanized mesenchymal cell kNN-DREMI analysis 82 . Significant nodes were identified using a Kolmogorov–Smirnov test and the algorithm = “elim” argument. Ontologies listed in Supplementary Data 5 and 6 are the top 30 nodes with fewer than 100 annotated genes (to remove non-specific categories) and at least one gene in the top 20% of DREMI scores. Heatmap hierarchical clustering was performed using pheatmap v1.0.12 (RRID: SCR_016418) 83 . Skeletal staining E18.5 skeletons from two litters from each of HACNS1 homozygous, chimpanzee ortholog line, and wild type homozygous crosses ( n = 48 embryos) were stained with Alcian Blue and Alizarin Red as previously described 71 . Skeletons were imaged under a stereo microscope (Leica S6D) and measured by a single scorer blinded to genotype using ImageJ 2.0.0. Bone and cartilage lengths of the forelimb and hindlimb pelvic girdle, stylopod, zeugopod, and autopod were measured blinded to genotype using ImageJ 2.0.0. Forelimb measurements include metacarpals 1–5 (cartilage), proximal phalanges 1–5 (cartilage), intermediate phalanges 2–5 (cartilage), distal phalanges 1–5 (cartilage), scapula (bone and cartilage), humerus (bone and cartilage), radius (bone and cartilage), and ulna (bone and cartilage). Hindlimb measurements include metatarsals 1–5 (cartilage), proximal phalanges 1–5 (cartilage), intermediate phalanges 2–5 (cartilage), distal phalanges 1–5 (cartilage), tibia (bone and cartilage), femur (bone and cartilage), pelvis (cartilage), ilium (bone), ischium (bone), pubis (bone), fibula (bone), calcaneum (cartilage), and talus (cartilage). Digit length was calculated as the sum of all metacarpal/metatarsal and phalanx segments. Raw measurements and digit length were normalized to the length of ossified humerus or femur for forelimb or hindlimb digits, respectively. Phalange to metacarpal ratio was calculated as the ratio of the sum of the phalange lengths of each digit to the corresponding metacarpal/metatarsal segment. Interdigital ratios were calculated using raw digit lengths. Raw measurements and images are available as Source Data. ANOVA analysis for gene expression and morphometry ANOVA analysis was performed with the lme4 package in R (RRID: SCR_015654) using default parameters to dissect the effects of genotype on limb segment length (morphometric data) 84 . We calculated the effects of genotype, litter, sex, forelimb versus hindlimb, digit number, and right versus left (RL) on normalized digit length, phalange to metacarpal ratio and interdigital ratio (Length Ratio ~ Genotype * (1 | Genotype/Litter) * Sex * Limb * Digit * (1 | RL) * (1 | Litter/Embryo) * (1 | Sex/Embryo) * (1 | Genotype/Embryo)) . Correction for multiple comparisons was performed using the Benjamini & Hochberg method 85 . Statistics and reproducibility All ChIP-seq findings were validated using ChIP-qPCR of both the sequenced samples as well as additional biological replicates. Specificity of H3K27ac and H3K4me2 antibodies was validated by the authors using dot blot analysis. 
Additional validation measures including dot blot analysis and ChIP-qPCR were performed by Active Motif ( https://www.activemotif.com/documents/tds/39133.pdf and https://www.activemotif.com/documents/tds/39913.pdf ). RT-qPCR results shown in Supplementary Fig. 3C were validated with additional biological and technical replicates. All attempts at replication were successful. No statistical methods were used to predetermine sample size for ChIP-seq and RT-qPCR analyses and no data were excluded from these analyses. All samples prepared for ChIP-seq, RT-qPCR, ISH, scRNA-seq, or morphometric analysis as shown in the final figures were treated identically and processed in parallel. No statistical methods were used to predetermine optimal sample sizes for morphometric and ISH analyses. Instead, morphometric studies and ISH analyses were done using large sample sizes: limb samples from 48 embryos for morphometry and over 100 embryos obtained from multiple litters for each genotype for ISH analyses. For ISH and morphometric analyses, no data were excluded from the analyses; missing data values indicate samples that could not be evaluated/measured due to damage to the specimen. One scRNA-seq replicate from the chimpanzee ortholog line was excluded based on high overall mitochondrial gene expression, indicative of low sample quality based on preestablished filtering metrics. Qualitative analysis of ISH results was performed using a blinded approach by randomizing embryo identification numbers prior to annotation as described in the Methods. Morphometric data was collected blinded to genotype by randomizing sample identification numbers. ChIP-seq, RT-qPCR, and scRNA-seq were performed without group allocation blinding as all biological and technical replicates were processed identically and in parallel and no qualitative analyses were required for these experiments. We did not consider the sex of embryonic samples as a variable in our studies. Reporting summary Further information on research design is available in the Nature Research Reporting Summary linked to this article. 
\ No newline at end of file From fb26688d826e02e4abebc262106dab913c14ee50 Mon Sep 17 00:00:00 2001 From: Michael Antoun <antounmichael@yahoo.com> Date: Sat, 11 Nov 2023 14:15:08 -0800 Subject: [PATCH 18/31] deleted extra copy --- context_retrieve.py | 125 -------------------------------------------- 1 file changed, 125 deletions(-) delete mode 100644 context_retrieve.py diff --git a/context_retrieve.py b/context_retrieve.py deleted file mode 100644 index f4c4fc2..0000000 --- a/context_retrieve.py +++ /dev/null @@ -1,125 +0,0 @@ -import os -import openai -from langchain.document_loaders.csv_loader import CSVLoader -from langchain.embeddings.openai import OpenAIEmbeddings -from langchain.text_splitter import CharacterTextSplitter -from langchain.vectorstores import FAISS -from langchain.document_loaders import TextLoader - -from langchain.embeddings.openai import OpenAIEmbeddings -from langchain.vectorstores import FAISS -from langchain.chat_models import ChatOpenAI -from langchain.chains import RetrievalQA -from langchain import PromptTemplate - -import re -import requests -import xml.etree.ElementTree as ET - -from fragment import Fragment -from VectorDatabase import Latern - - - -# OpenAI Setup -OPEN_API_KEY = "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" -# openai.api_key = os.getenv(openai_api_key) -os.environ['OPENAI_API_KEY'] = OPEN_API_KEY - -def getPmcPaper(pmcid): - """ - """ - url = f'https://www.ebi.ac.uk/europepmc/webservices/rest/{pmcid}/fullTextXML' - req = requests.get(url) - res = req.text - return res - -def extractMethodsFromPmcPaper(paper): - """ - """ - tree = ET.fromstring(paper) - mtext = [] - for sec in tree.iter('sec'): - for title in sec.iter('title'): - if isinstance(title.text, str): - if re.search('methods', title.text, re.IGNORECASE): - mtext.extend(list(sec.itertext())) - return " ".join(mtext) - -def preprocess(input_text): - """ - """ - processed_data = input_text.replace("\n","") - return processed_data - -def get_embeddings(fname): - """ - """ - loader = TextLoader(fname) - documents = loader.load() - text_splitter = CharacterTextSplitter(separator = ".",chunk_size = 1000, chunk_overlap=0) - - docs = text_splitter.split_documents(documents) - - emb = OpenAIEmbeddings() - input_texts = [d.page_content for d in docs] - - input_embeddings = emb.embed_documents(input_texts) - text_embeddings = list(zip(input_texts, input_embeddings)) - - return text_embeddings, emb - -def saveFassIndex(fname, sname, ): - """ - """ - txt_embs, emb = get_embeddings(docs) - faissIndex = FAISS.from_embeddings(text_embeddings=txt_embs, embedding=emb) - faissIndex.save_local(sname) - # faissIndex = FAISS.from_documents(docs, OpenAIEmbeddings()) - # faissIndex.save_local("input_doc") - -def Query(input_query, faiss_obj): - chatbot = RetrievalQA.from_chain_type( - llm=ChatOpenAI( - openai_api_key=OPEN_API_KEY, - temperature=0, model_name="gpt-3.5-turbo", max_tokens=50 - ), - chain_type="stuff", - retriever=faiss_obj.as_retriever(search_type="similarity", search_kwargs={"k":1}) - ) - template = """ {query}? 
""" - prompt = PromptTemplate( - input_variables=["query"], - template=template, - ) - print(chatbot.run( - prompt.format(query=input_query) - )) - - -def main(): - text = getPmcPaper(pmcid) - - methods_text = preprocess(extractMethodsFromPmcPaper(text)) - fname = 'input_file.txt' - sname = 'input_doc' - with open(fname, 'w') as file: - file.write(methods_text) - # print(methods_text) - txt_embs, emb = get_embeddings(fname) - - fragments = [] - for txt, embs in txt_embs: - fragment = Fragment(pmcid, 'methods', txt, embs) - fragments.append(fragment) - - latern = Latern() - latern.insertEmbeddings(fragments) - - # retreieve. PMC - faissIndex = FAISS.from_embeddings(text_embeddings=txt_embs, embedding=emb) - inp_query = "Does the paper report a new structure of a biomolecule or biomolecular complex modeled using experimental data" - Query(inp_query, faissIndex) - -if __name__ == '__main__': - main() \ No newline at end of file From f8b7b6ca9d147af5183b7453a1ae85b33361d5b4 Mon Sep 17 00:00:00 2001 From: Michael Antoun <antounmichael@yahoo.com> Date: Sat, 11 Nov 2023 14:21:51 -0800 Subject: [PATCH 19/31] Moved database entities into same file --- VectorDatabase.py | 37 ++++++++++++++++++++++++++++++++++++- analysis.py | 3 +-- database_entities.py | 34 ---------------------------------- hackathon_runner.py | 4 +--- tests/test.py | 4 +--- 5 files changed, 39 insertions(+), 43 deletions(-) delete mode 100644 database_entities.py diff --git a/VectorDatabase.py b/VectorDatabase.py index 6d6a799..f75485b 100644 --- a/VectorDatabase.py +++ b/VectorDatabase.py @@ -302,4 +302,39 @@ def get_embeddings_for_pub(self, id): texts.append(fragment.content) embeddings.append(fragment.vector) text_embeddings = list(zip(texts, embeddings)) - return text_embeddings \ No newline at end of file + return text_embeddings + +# Class to represent a publication with attributes id, title, pmc, pubmed, and doi +class Publication: + + id = "" + title = "" + pmc = "" + pubmed = "" + doi = "" + + def __init__(self, id, title, pmc, pubmed, doi): + self.id = id # (DOI) Unique identifier for the publication + self.title = title # Title of the publication + self.pmc = pmc # PubMed Central (PMC) Link + self.pubmed = pubmed # PubMed Link + self.doi = doi # Digital Object Identifier (DOI) Link for the publication + +# Class to represent a fragment of a publication with attributes id, header, content, and vector +class Fragment: + + + # Class variables to store default values for attributes + id = "" + header = "" + content = "" + vector = "" + + def __init__(self, id, header, content, vector): + # Constructor to initialize the attributes of the Fragment object + + # Set the attributes of the object with the values provided during instantiation + self.id = id # (DOI) Unique identifier for the fragment + self.header = header # Header or title of the fragment + self.content = content # Content or text of the fragment + self.vector = vector # Vector representation of the fragment diff --git a/analysis.py b/analysis.py index aa60539..7a3be99 100644 --- a/analysis.py +++ b/analysis.py @@ -1,6 +1,5 @@ -from VectorDatabase import Lantern -from database_entities import Publication, Fragment +from VectorDatabase import Lantern, Publication, Fragment from google_sheets import SheetsApiClient from langchain.text_splitter import RecursiveCharacterTextSplitter diff --git a/database_entities.py b/database_entities.py deleted file mode 100644 index 9de5295..0000000 --- a/database_entities.py +++ /dev/null @@ -1,34 +0,0 @@ -# Class to represent a 
publication with attributes id, title, pmc, pubmed, and doi -class Publication: - - id = "" - title = "" - pmc = "" - pubmed = "" - doi = "" - - def __init__(self, id, title, pmc, pubmed, doi): - self.id = id # (DOI) Unique identifier for the publication - self.title = title # Title of the publication - self.pmc = pmc # PubMed Central (PMC) Link - self.pubmed = pubmed # PubMed Link - self.doi = doi # Digital Object Identifier (DOI) Link for the publication - -# Class to represent a fragment of a publication with attributes id, header, content, and vector -class Fragment: - - - # Class variables to store default values for attributes - id = "" - header = "" - content = "" - vector = "" - - def __init__(self, id, header, content, vector): - # Constructor to initialize the attributes of the Fragment object - - # Set the attributes of the object with the values provided during instantiation - self.id = id # (DOI) Unique identifier for the fragment - self.header = header # Header or title of the fragment - self.content = content # Content or text of the fragment - self.vector = vector # Vector representation of the fragment diff --git a/hackathon_runner.py b/hackathon_runner.py index 5acdc64..761c6e7 100644 --- a/hackathon_runner.py +++ b/hackathon_runner.py @@ -5,9 +5,7 @@ from paperscraper.pdf import save_pdf from paperscraper.get_dumps import biorxiv -from fragment import Fragment -from publication import Publication -from VectorDatabase import Lantern +from VectorDatabase import Lantern, Fragment, Publication import openai from langchain.document_loaders.csv_loader import CSVLoader from langchain.embeddings.openai import OpenAIEmbeddings diff --git a/tests/test.py b/tests/test.py index 8347787..b782d81 100644 --- a/tests/test.py +++ b/tests/test.py @@ -1,6 +1,4 @@ -from fragment import Fragment -from publication import Publication -from VectorDatabase import Latern +from VectorDatabase import Lantern, Fragment, Publication from tqdm.auto import tqdm from sentence_transformers import SentenceTransformer import torch From 873a7b6945ff7dc2c3ddbf1ae6dcc8db8371a831 Mon Sep 17 00:00:00 2001 From: Michael Antoun <antounmichael@yahoo.com> Date: Sat, 11 Nov 2023 14:23:34 -0800 Subject: [PATCH 20/31] renamed file --- analysis.py => document_analysis.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename analysis.py => document_analysis.py (100%) diff --git a/analysis.py b/document_analysis.py similarity index 100% rename from analysis.py rename to document_analysis.py From 239308cdb76dc343e8e892aa24e0cdcce830498c Mon Sep 17 00:00:00 2001 From: Michael Antoun <antounmichael@yahoo.com> Date: Sat, 11 Nov 2023 14:32:39 -0800 Subject: [PATCH 21/31] removed duplicated code --- document_analysis.py | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/document_analysis.py b/document_analysis.py index 7a3be99..da87509 100644 --- a/document_analysis.py +++ b/document_analysis.py @@ -10,26 +10,7 @@ class DocumentAnalyzer: """Takes in a list of publications to analyze, then prompts the chatbot, processes the response, aggregates the results, and reports the results to the spreadsheet - """ - - keywords_groups = { - 'CX-MS': ['cross-link', 'crosslink', 'XL-MS', 'CX-MS', 'CL-MS', 'XLMS', 'CXMS', 'CLMS', "chemical crosslinking mass spectrometry", 'photo-crosslinking', 'crosslinking restraints', 'crosslinking-derived restraints', 'chemical crosslinking', 'in vivo crosslinking', 'crosslinking data'], - 'HDX': ['Hydrogen–deuterium exchange mass spectrometry', 
'Hydrogen/deuterium exchange mass spectrometry' 'HDX', 'HDXMS', 'HDX-MS'], - 'EPR': ['electron paramagnetic resonance spectroscopy', 'EPR', 'DEER', "Double electron electron resonance spectroscopy"], - 'FRET': ['FRET', "forster resonance energy transfer", "fluorescence resonance energy transfer"], - 'AFM': ['AFM', "atomic force microscopy" ], - 'SAS': ['SAS', 'SAXS', 'SANS', "Small angle solution scattering", "solution scattering", "SEC-SAXS", "SEC-SAS", "SASBDB", "Small angle X-ray scattering", "Small angle neutron scattering"], - '3DGENOME': ['HiC', 'Hi-C', "chromosome conformation capture"], - 'Y2H': ['Y2H', "yeast two-hybrid"], - 'DNA_FOOTPRINTING': ["DNA Footprinting", "hydroxyl radical footprinting"], - 'XRAY_TOMOGRAPHY': ["soft x-ray tomography"], - 'FTIR': ["FTIR", "Infrared spectroscopy", "Fourier-transform infrared spectroscopy"], - 'FLUORESCENCE': ["Fluorescence imaging", "fluorescence microscopy", "TIRF"], - 'EVOLUTION': ['coevolution', "evolutionary covariance"], - 'PREDICTED': ["predicted contacts"], - 'INTEGRATIVE': ["integrative structure", "hybrid structure", "integrative modeling", "hybrid modeling"], - 'SHAPE': ['Hydroxyl Acylation analyzed by Primer Extension'] - } + """ def __init__(self): # self.lantern = Lantern() @@ -128,15 +109,6 @@ def paper_about_cryoem(text_embeddings: []): """ return any(re.search("cryo-?em", text, re.IGNORECASE) for text, _ in embeddings) - @staticmethod - def methods_string(): - methods_string = '' - for i, (k, v) in enumerate(DocumentAnalyzer.keywords_groups.items()): - if i > 0: - methods_string += ' or ' - methods_string += f'{k} ({", ".join(v)})' - return methods_string - class LlmHandler: """pulled this straight from the hackathon code, should work though From ce7a5a83502d265e650d595051ef4404b824725e Mon Sep 17 00:00:00 2001 From: Michael Antoun <antounmichael@yahoo.com> Date: Sat, 11 Nov 2023 15:03:24 -0800 Subject: [PATCH 22/31] added config file --- config.json | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 config.json diff --git a/config.json b/config.json new file mode 100644 index 0000000..0a315c3 --- /dev/null +++ b/config.json @@ -0,0 +1,4 @@ +{ + "Emails": [], + "DEBUG": false +} \ No newline at end of file From ae06ec1c29f7fa36dab773a5ebf589ba34bb4989 Mon Sep 17 00:00:00 2001 From: Michael Antoun <antounmichael@yahoo.com> Date: Sat, 11 Nov 2023 15:03:33 -0800 Subject: [PATCH 23/31] fixed date hardcode --- document_analysis.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/document_analysis.py b/document_analysis.py index da87509..fe4a283 100644 --- a/document_analysis.py +++ b/document_analysis.py @@ -6,6 +6,7 @@ from langchain.chat_models import ChatOpenAI from langchain.chains import RetrievalQA from langchain import PromptTemplate +from datetime import date class DocumentAnalyzer: """Takes in a list of publications to analyze, then prompts the chatbot, processes the response, aggregates the results, @@ -37,7 +38,8 @@ def process_publications(self, publications: [Publication]): else: #print('paper not about cryo-em') pass - rows.append([pub.doi, pub.title, "11-2-2023", "11-5-2023", "", int(classification), response, ""]) + # add date if it's added + rows.append([pub.doi, pub.title, "", str(date.today()), "", int(classification), response, ""]) self.update_spreadsheet(rows, hits) @@ -115,7 +117,6 @@ class LlmHandler: """ def __init__(self): - self.text_splitter = RecursiveCharacterTextSplitter(separators = ["\n\n", "\n", ".", ","], chunk_size=300, chunk_overlap=100) self.llm=ChatOpenAI( 
temperature=0, model_name="gpt-4", max_tokens=300, request_timeout = 30, max_retries=3 ) From 2ff4a7a5e5509736b5290f07ef017e1cecb7dc43 Mon Sep 17 00:00:00 2001 From: Michael Antoun <antounmichael@yahoo.com> Date: Sat, 11 Nov 2023 15:24:24 -0800 Subject: [PATCH 24/31] noop --- document_analysis.py | 1 + 1 file changed, 1 insertion(+) diff --git a/document_analysis.py b/document_analysis.py index fe4a283..573c149 100644 --- a/document_analysis.py +++ b/document_analysis.py @@ -8,6 +8,7 @@ from langchain import PromptTemplate from datetime import date + class DocumentAnalyzer: """Takes in a list of publications to analyze, then prompts the chatbot, processes the response, aggregates the results, and reports the results to the spreadsheet From c23dd0856d1073b66d46c4e1770bb98ff0dbfc73 Mon Sep 17 00:00:00 2001 From: Arthur Zalevsky <aozalevsky@gmail.com> Date: Sat, 11 Nov 2023 15:28:16 -0800 Subject: [PATCH 25/31] Already moved them to tests --- context_retrieve.py | 125 -------------------------------------------- input_file.txt | 1 - 2 files changed, 126 deletions(-) delete mode 100644 context_retrieve.py delete mode 100644 input_file.txt diff --git a/context_retrieve.py b/context_retrieve.py deleted file mode 100644 index f4c4fc2..0000000 --- a/context_retrieve.py +++ /dev/null @@ -1,125 +0,0 @@ -import os -import openai -from langchain.document_loaders.csv_loader import CSVLoader -from langchain.embeddings.openai import OpenAIEmbeddings -from langchain.text_splitter import CharacterTextSplitter -from langchain.vectorstores import FAISS -from langchain.document_loaders import TextLoader - -from langchain.embeddings.openai import OpenAIEmbeddings -from langchain.vectorstores import FAISS -from langchain.chat_models import ChatOpenAI -from langchain.chains import RetrievalQA -from langchain import PromptTemplate - -import re -import requests -import xml.etree.ElementTree as ET - -from fragment import Fragment -from VectorDatabase import Latern - - - -# OpenAI Setup -OPEN_API_KEY = "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" -# openai.api_key = os.getenv(openai_api_key) -os.environ['OPENAI_API_KEY'] = OPEN_API_KEY - -def getPmcPaper(pmcid): - """ - """ - url = f'https://www.ebi.ac.uk/europepmc/webservices/rest/{pmcid}/fullTextXML' - req = requests.get(url) - res = req.text - return res - -def extractMethodsFromPmcPaper(paper): - """ - """ - tree = ET.fromstring(paper) - mtext = [] - for sec in tree.iter('sec'): - for title in sec.iter('title'): - if isinstance(title.text, str): - if re.search('methods', title.text, re.IGNORECASE): - mtext.extend(list(sec.itertext())) - return " ".join(mtext) - -def preprocess(input_text): - """ - """ - processed_data = input_text.replace("\n","") - return processed_data - -def get_embeddings(fname): - """ - """ - loader = TextLoader(fname) - documents = loader.load() - text_splitter = CharacterTextSplitter(separator = ".",chunk_size = 1000, chunk_overlap=0) - - docs = text_splitter.split_documents(documents) - - emb = OpenAIEmbeddings() - input_texts = [d.page_content for d in docs] - - input_embeddings = emb.embed_documents(input_texts) - text_embeddings = list(zip(input_texts, input_embeddings)) - - return text_embeddings, emb - -def saveFassIndex(fname, sname, ): - """ - """ - txt_embs, emb = get_embeddings(docs) - faissIndex = FAISS.from_embeddings(text_embeddings=txt_embs, embedding=emb) - faissIndex.save_local(sname) - # faissIndex = FAISS.from_documents(docs, OpenAIEmbeddings()) - # faissIndex.save_local("input_doc") - -def Query(input_query, 
faiss_obj): - chatbot = RetrievalQA.from_chain_type( - llm=ChatOpenAI( - openai_api_key=OPEN_API_KEY, - temperature=0, model_name="gpt-3.5-turbo", max_tokens=50 - ), - chain_type="stuff", - retriever=faiss_obj.as_retriever(search_type="similarity", search_kwargs={"k":1}) - ) - template = """ {query}? """ - prompt = PromptTemplate( - input_variables=["query"], - template=template, - ) - print(chatbot.run( - prompt.format(query=input_query) - )) - - -def main(): - text = getPmcPaper(pmcid) - - methods_text = preprocess(extractMethodsFromPmcPaper(text)) - fname = 'input_file.txt' - sname = 'input_doc' - with open(fname, 'w') as file: - file.write(methods_text) - # print(methods_text) - txt_embs, emb = get_embeddings(fname) - - fragments = [] - for txt, embs in txt_embs: - fragment = Fragment(pmcid, 'methods', txt, embs) - fragments.append(fragment) - - latern = Latern() - latern.insertEmbeddings(fragments) - - # retreieve. PMC - faissIndex = FAISS.from_embeddings(text_embeddings=txt_embs, embedding=emb) - inp_query = "Does the paper report a new structure of a biomolecule or biomolecular complex modeled using experimental data" - Query(inp_query, faissIndex) - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/input_file.txt b/input_file.txt deleted file mode 100644 index 1eea3d8..0000000 --- a/input_file.txt +++ /dev/null @@ -1 +0,0 @@ -Methods Mouse line generation and validation All animal work was performed in accordance with approved Yale IACUC protocols (#2019–11167 and #2020–07271). The HACNS1 and chimpanzee ortholog lines were generated at the Yale Genome Editing Center using standard gene targeting techniques in mouse ES cells 71 . C57BL/6J- A w−J /J mouse ES cells, generated by the Yale Genome Editing Center from C57BL/6J- A w−J /J mice obtained from The Jackson Laboratory (RRID:IMSR_JAX:000051), were edited by electroporation of a GFP cloning vector containing human (1241 bp) or chimpanzee (1240 bp) sequence flanked by C57BL/6 J mouse sequence homology arms, floxed pPGKneo vector, and diphtheria toxin sequence (Supplementary Fig. 1 A) 72 . The genomic coordinates of the human (hg19; https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.13/ ), chimpanzee (panTro4; https://www.ncbi.nlm.nih.gov/assembly/GCF_000001515.6/ ), and mouse (mm9; https://www.ncbi.nlm.nih.gov/assembly/GCF_000001635.18/ ) sequences used in the editing constructs, including the mouse homology arm sequences, are listed in Supplementary Data 1 73 . Positive clones were karyotyped and only clones exhibiting a normal mouse karyotype were used for blastocyst injection. Resulting G0 chimeras were backcrossed to wild type C57BL/6 J (RRID: IMSR_JAX:000664) and crossed with an actin-Cre C57BL/6 J mouse line to remove the neo cassette. All mice used in our analysis were from F9 or later generations. Mice were maintained in a Yale Animal Resources Center (YARC) managed facility under a standard 12 h light/dark cycle and environmental monitoring according to YARC policies and procedures. Genotyping primers specific to HACNS1 , chimpanzee, and mouse orthologs are listed in Supplementary Data 10 . Cloning primers listed in Supplementary Data 10 were used to amplify edited loci for cloning and Sanger sequencing for comparison to the hg19 or panTro4 sequence. Sanger sequencing data is available at http://noonan.ycga.yale.edu/noonan_public/Dutrow_HACNS1/ . 
The sequence identity between the human (hg19, chr2:236773456-236774696) and chimpanzee alleles (panTro4, chr2B:241105291-241106530) is 98.2% (22 substitutions total, of which 15 are fixed in humans). Human-specific substitutions were defined as fixed if the derived allele frequency in dbSNP (v153) was >=0.9999 and if the ancestral sequence state was conserved between chimpanzee, rhesus macaque, orangutan, and marmoset. We provide a detailed analysis of sequence differences between the human, chimpanzee and mouse orthologs in the Supplemental Note (Supplementary Materials). HACNS1-GBX2 locus TAD coordinates (hg19 chr2:236655261-237135261) are from H1 human ES cell Hi-C data; HACNS1 and GBX2 are present in the same TAD and GBX2 is the only annotated protein-coding gene whose promoter is included in this TAD 32 . Copy number verification qPCR was performed using genomic DNA from three F9 mice from each line using Power SYBR Green Mastermix (Thermo Fisher Scientific #4368577) and the StepOnePlus Real-Time PCR System (Applied Biosystems) with primers listed in Supplementary Data 10 . All biological replicates of each genotype were run in triplicate and Ct values of each were normalized to a control region on a different chromosome (see Supplementary Data 10 ). Primary qPCR results are available as Source Data. Chromatin Immunoprecipitation, ChIP-qPCR and ChIP-seq Tissue for chromatin preparation was collected from E11.5 forelimb and hindlimb bud pairs or pharyngeal arch tissue from HACNS1 and chimpanzee ortholog line heterozygous crosses to obtain pooled, litter matched limb bud or pharyngeal arch samples for all three genotypes ( HACNS1 homozygous, chimpanzee ortholog line, and wild type). Two biological replicates were used per genotype per tissue, each with tissue pooled from three embryos. Pooled tissue was crosslinked and sonicated as previously described 74 . Chromatin for each genotype, tissue, and replicate was used for H3K27ac or H3K4me2 immunoprecipitation with 7.5 μg antibody and ~5 μg tissue per ChIP assay using Active Motif #39133 (RRID: AB_2561016) and Active Motif #39913 (RRID: AB_2614976) as previously described 74 , 75 . ChIP-qPCR was performed using Power SYBR Green Mastermix (Thermo Fisher Scientific #4368577) with primers listed in Supplementary Data 11 . Samples were sequenced (2 × 100 bp) using standard Illumina protocols on an Illumina HiSeq 4000 (RRID: SCR_016386). To control for batch effects, all samples of the same tissue type were multiplexed and sequenced on a single lane. Reference genomes edited to replace the mouse ortholog of HACNS1 with the human or chimpanzee sequence were built using Bowtie (v2.2.8; RRID: SCR_005476) 76 . ChIP-seq raw reads were aligned to the mm9, mm9 with chimpanzee ortholog, or humanized mm9 reference genome using Bowtie with -sensitive and -no-unal settings. GC composition was assessed using fastQC and showed that GC content and bias were consistent across all experiments 77 , 78 . Tag directories for each experiment were generated using makeTagDirectory in HOMER with default settings and standard normalization to 10 million tags, and were used to generate bigwig files for visualization with makeUCSCfile 23 . All peaks were called with HOMER (v4.9.1 RRID: SCR_010881) using default settings for -histone (IP vs input fold change = 4, p = 0.0001, peak size = 500, minDist = 1000) 23 . 
All differential peaks were called with DESeq2 implemented in HOMER using getDifferentialPeaksReplicates.pl with default settings (fold change cutoff = 2, FDR cutoff = 5%); briefly, reads from each comparison are pooled, with ChIP and inputs pooled separately, such that new peaks are called and used for quantitative comparison between genotypes 23 , 24 . The complete datasets of all peaks tested in differential analyses can be found at http://noonan.ycga.yale.edu/noonan_public/Dutrow_HACNS1/ . RNA extraction and RT-qPCR E11-E12 embryos were collected from six HACNS1 homozygous, chimpanzee ortholog line, or wild type litters generated by crossing homozygous animals for each line. All embryos within each genotype group were ordered based on stage (>70 total embryos) and were divided into six timepoint groups per genotype consisting of forelimb or hindlimb buds from 4-6 pooled embryos per time point per genotype per tissue. RNA was purified using the Qiagen miRNeasy Kit (#74106). Invitrogen Superscript III Reverse Transcription Kit (#18080-051) was used to prepare cDNA from each sample. qPCR with the resulting cDNA was performed using Power SYBR Green Mastermix (Thermo Fisher Scientific #4368577). All samples were analyzed in triplicate using primers listed in Supplementary Data 12 and Ct values of Gbx2 were normalized to Hprt1 . Primary RT-qPCR results are available as Source Data. Whole mount in situ hybridization E11-E12 mouse embryos were collected from HACNS1 homozygous ( n = 7 litters), chimpanzee ortholog line ( n = 8 litters), and wild type ( n = 12 litters) homozygous crosses. Embryos were fixed and hybridized with the same preparation of antisense Gbx2 mRNA probe under identical conditions as previously described 78 , 79 . The Gbx2 probe used for hybridization contains the full mouse consensus CDS sequence (CCDS15150.1); NCBI CCDS Release 23; https://www.ncbi.nlm.nih.gov/projects/CCDS/CcdsBrowse.cgi?REQUEST=ALLFIELDS&DATA=CCDS15150.1&ORGANISM=10090&BUILDS=CURRENTBUILDS (NCBI CCDS Release 23 CCDS15150.1). The 178 embryos (55 from the HACNS1 knock-in line, 52 from the chimpanzee ortholog line, and 71 from wild type) were divided into temporally-ordered sextiles within the E11-E12 window (~40–48 somites, although we did not rely on somite counts for staging) based on measurement of crown-rump length for each individual embryo 35 . For the data shown in Fig. 3B , embryos were assessed for staining pattern by three individuals blinded to genotype under a stereo microscope (Leica S6D). For the data shown in Supplementary Fig. 3A, B embryos were annotated by a single scorer blinded to genotype. The scoring scheme was based on previous studies, notably to assess whole-mount gene expression patterns as described in the VISTA Enhancer Browser ( http://enhancer.lbl.gov/ ) 10 , 34 , 36 . Embryos were assigned to one of eleven categories of Gbx2 expression pattern based on the anterior-posterior and proximal-distal localization of staining as well the intensity (strong versus weak) of staining: 1: anterior and posterior (AP); 2: anterior distal and posterior distal (APD); 3: distal (D); 4: anterior distal (AD); 5: anterior (A); 6: weak anterior and posterior (APL); 7: weak anterior (AL); 8: weak distal (DL); 9: weak anterior and posterior distal (APDL); 10: weak anterior distal (ADL); 11: no staining (N). Categories were merged for clarity in Fig. 3B in the following manner: categories 1–3: anterior and posterior; categories 4–5: anterior only; categories 6–10: weak staining. See Fig. 
3B for representative images of staining patterns illustrating the scoring scheme used for qualitative assessment of expression. Representative images were taken using a Zeiss Stemi 2000-C stereomicroscope fitted with an AxioCam MRc5 digital camera and Zeiss AxioVision software. Images and associated annotations are available as Source Data. Single-cell RNA-sequencing Sample preparation Tissue for scRNA-seq was collected at E11.5 from two human ortholog line homozygous litters, two chimpanzee ortholog line homozygous litters, and two wild type litters. Embryos were staged as previously described in order to obtain samples from stage-matched T3 embryos from each genotype. Left hindlimb buds from three embryos per genotype per replicate were pooled. Following dissection, the tissue was immediately placed in CMFSG saline–glucose solution (1x Calcium–magnesium-free phosphate buffered saline from Thermo Fisher Scientific #21-040-CV with 0.1% glucose from Corning 45% Glucose #45001-116) on ice. Gibco TrypLE Express digestion solution was used for cellular dissociation (Thermo Fisher Scientific # 2605010). The dissociation reaction was stopped using 1xDMEM (ATCC 30–2002) with 10% heat-inactivated Fetal Bovine Serum (Sigma-Aldrich #F4135). The dissociated cells were filtered through a 40 μM strainer and harvested by centrifugation at 4 °C. Cells were washed and resuspended in 1x Calcium–magnesium-free phosphate buffered saline (Thermo Fisher Scientific #21-040-CV) with 0.04% BSA (Sigma-Aldrich #SRE0036). Cell number and viability were estimated on a Countess II Automated Cell Counter prior to library preparation of 10,000 cells (estimated cell recovery from 16,000 input cells) per sample using Chromium Single Cell 3ʹ GEM, Library & Gel Bead Kit v3 (10X Genomics PN-1000075). Libraries were sequenced (2 × 75 bp) on an Illumina HiSeq 4000 (RRID: SCR_016386). To control for batch effects, all samples were multiplexed across all lanes. Count matrices were produced from raw sequencing data using the Cell Ranger v3.0.2 package from 10X Genomics (RRID: SCR_017344). Data filtering and preprocessing Matrices from the 10x Cell Ranger platform were filtered and preprocessed using Seurat v3.0.1 (RRID: SCR_016341) 38 . Prior to the generation of Seurat objects, Xist gene counts were eliminated in order to avoid clustering by sex within mixed sample populations. Genes expressed in fewer than 5 cells per sample were removed. Cells with greater than 7.5% or 2% counts from mitochondrial genes or hemoglobin genes, respectively, were removed. Cells with total gene count (nGene) z-scores less than -1 (corresponding to ~700 or fewer detected genes) or greater than 4 (corresponding to ~6000 or greater detected genes) were removed, as were cells with total UMI count (nUMI) z-scores greater than 7 (corresponding to ~50,000 or greater detected UMIs; see Supplementary Fig. 5 ). One chimpanzee ortholog line replicate was removed during pre-processing due to high overall mitochondrial gene expression, indicative of low viability. Prior to data integration, expression values from each sample were normalized based on library size for pre-processing purposes only using the Seurat tool NormalizeData 38 . 
Louvain clustering as implemented in Seurat was performed for pre-processing purposes only using FindVariableFeatures, ScaleData, RunPCA, FindNeighbors, and FindClusters in order to remove endothelial cell clusters ( Cd34 -positive and Pf4 -positive), clusters characterized by aberrant mitochondrial gene expression (low mt-Co1 ), and transcriptionally distinct clusters containing fewer than 30 cells per sample 38 , 42 . The numbers of cells remaining after pre-processing for each sample are listed in Supplementary Data 13 . Data normalization and integration All subsequent normalization and integration steps after pre-processing were performed with raw counts for all cells retained after pre-processing (see Supplementary Data 13 ). Cell cycle scores were computed using CellCycleScoring in Seurat to regress out the difference between G2M and S phases, effectively preserving differences between cycling and non-cycling cells while reducing differences related to cell cycle amongst proliferating cells 38 . In addition to cell cycle scores, percent mitochondrial gene expression and nUMI values were regressed using SCTransform (SCT) in order to reduce the effects of sequencing depth and minor differences in mitochondrial DNA expression related to viability 38 , 80 . All SCT normalized datasets containing all genes from individual samples were integrated using SelectIntegrationFeatures, PrepSCTIntegration, FindIntegrationAnchors, and IntegrateData 38 , 80 . Following integration, the combined dataset was randomly down-sampled to contain a maximum of 10,000 cells per genotype prior to embedding and clustering using SubsetData in Seurat 38 . PCA, UMAP, and Louvain clustering were implemented in Seurat using RunPCA, RunUMAP, FindNeighbors, and FindClusters 38 , 41 . Percentages of cells belonging to each Louvain cluster are shown in Supplementary Data 13 . Normalized data from all samples combined were used for imputation using ALRA with default settings for the purposes of data visualization as shown in Fig. 4A–D , Supplementary Fig. 4A, B , and Fig. 5C, D 81 . Marker gene expression was compared between ALRA-imputed and unimputed data to establish that imputation did not substantially distort marker gene expression patterns in our dataset (Supplementary Fig. 4 , Supplementary Data 13 ). Data normalization and integration, UMAP embedding, and Louvain clustering were performed prior to imputation. The threshold for identifying Gbx2 -positive cells was set as an imputed Gbx2 expression value greater than 0.1. This threshold was also used for identifying percentages of marker gene-positive cells in unimputed and imputed data as shown in Supplementary Data 13 . All gene expression scaling and centering for visualization purposes was performed on normalized imputed or unimputed data using the Seurat ScaleData function with default parameters (scale.max = 10) 38 . MELD, MAGIC, kNN-DREMI analyses Cells belonging to mesenchymal cell clusters (clusters 1–4, see Fig. 4A, C ) from all genotypes were used for MELD, MAGIC, kNN-DREMI, and Gene Set Enrichment Analysis (GSEA). Scaled data matrices from the Seurat object integrated assay were loaded using scprep for MELD, MAGIC, and kNN-DREMI ( https://github.com/krishnaswamylab/scprep ). MELD and MAGIC both denoise scRNA-seq data using graphs to model cellular state space. The same graph signal was used for both MELD and MAGIC as calculated by graphtools (1.5.2) with n_pca = 20, decay = 40, and knn = 10. 
MELD was run on one-hot vectors for each genotype independently using default parameters 55 . MAGIC was performed using the same graph signal as MELD 54 . We used the kNN-DREMI implementation provided in scprep and kNN-DREMI was run on MAGIC-imputed data 53 . kNN-DREMI analysis was used in order to identify genes with expression levels associated with either Gbx2 expression in humanized hindlimb or cells with increased humanized RL as calculated using MELD. MAGIC was employed only for the purpose of generating denoised gene expression values for kNN-DREMI analysis of gene-gene relationships but was not used for data visualization, clustering, or sample-associated density estimation using MELD. Gene set enrichment analysis GSEA was performed using topGO v.2.34.0 (RRID: SCR_014798) on all expressed genes that were ranked by Gbx2- DREMI or humanized RL-DREMI score from the aforementioned humanized mesenchymal cell kNN-DREMI analysis 82 . Significant nodes were identified using a Kolmogorov–Smirnov test and the algorithm = “elim” argument. Ontologies listed in Supplementary Data 5 and 6 are the top 30 nodes with fewer than 100 annotated genes (to remove non-specific categories) and at least one gene in the top 20% of DREMI scores. Heatmap hierarchical clustering was performed using pheatmap v1.0.12 (RRID: SCR_016418) 83 . Skeletal staining E18.5 skeletons from two litters from each of HACNS1 homozygous, chimpanzee ortholog line, and wild type homozygous crosses ( n = 48 embryos) were stained with Alcian Blue and Alizarin Red as previously described 71 . Skeletons were imaged under a stereo microscope (Leica S6D) and measured by a single scorer blinded to genotype using ImageJ 2.0.0. Bone and cartilage lengths of the forelimb and hindlimb pelvic girdle, stylopod, zeugopod, and autopod were measured blinded to genotype using ImageJ 2.0.0. Forelimb measurements include metacarpals 1–5 (cartilage), proximal phalanges 1–5 (cartilage), intermediate phalanges 2–5 (cartilage), distal phalanges 1–5 (cartilage), scapula (bone and cartilage), humerus (bone and cartilage), radius (bone and cartilage), and ulna (bone and cartilage). Hindlimb measurements include metatarsals 1–5 (cartilage), proximal phalanges 1–5 (cartilage), intermediate phalanges 2–5 (cartilage), distal phalanges 1–5 (cartilage), tibia (bone and cartilage), femur (bone and cartilage), pelvis (cartilage), ilium (bone), ischium (bone), pubis (bone), fibula (bone), calcaneum (cartilage), and talus (cartilage). Digit length was calculated as the sum of all metacarpal/metatarsal and phalanx segments. Raw measurements and digit length were normalized to the length of ossified humerus or femur for forelimb or hindlimb digits, respectively. Phalange to metacarpal ratio was calculated as the ratio of the sum of the phalange lengths of each digit to the corresponding metacarpal/metatarsal segment. Interdigital ratios were calculated using raw digit lengths. Raw measurements and images are available as Source Data. ANOVA analysis for gene expression and morphometry ANOVA analysis was performed with the lme4 package in R (RRID: SCR_015654) using default parameters to dissect the effects of genotype on limb segment length (morphometric data) 84 . 
We calculated the effects of genotype, litter, sex, forelimb versus hindlimb, digit number, and right versus left (RL) on normalized digit length, phalange to metacarpal ratio and interdigital ratio (Length Ratio ~ Genotype * (1 | Genotype/Litter) * Sex * Limb * Digit * (1 | RL) * (1 | Litter/Embryo) * (1 | Sex/Embryo) * (1 | Genotype/Embryo)) . Correction for multiple comparisons was performed using the Benjamini & Hochberg method 85 . Statistics and reproducibility All ChIP-seq findings were validated using ChIP-qPCR of both the sequenced samples as well as additional biological replicates. Specificity of H3K27ac and H3K4me2 antibodies was validated by the authors using dot blot analysis. Additional validation measures including dot blot analysis and ChIP-qPCR were performed by Active Motif ( https://www.activemotif.com/documents/tds/39133.pdf and https://www.activemotif.com/documents/tds/39913.pdf ). RT-qPCR results shown in Supplementary Fig. 3C were validated with additional biological and technical replicates. All attempts at replication were successful. No statistical methods were used to predetermine sample size for ChIP-seq and RT-qPCR analyses and no data were excluded from these analyses. All samples prepared for ChIP-seq, RT-qPCR, ISH, scRNA-seq, or morphometric analysis as shown in the final figures were treated identically and processed in parallel. No statistical methods were used to predetermine optimal sample sizes for morphometric and ISH analyses. Instead, morphometric studies and ISH analyses were done using large sample sizes: limb samples from 48 embryos for morphometry and over 100 embryos obtained from multiple litters for each genotype for ISH analyses. For ISH and morphometric analyses, no data were excluded from the analyses; missing data values indicate samples that could not be evaluated/measured due to damage to the specimen. One scRNA-seq replicate from the chimpanzee ortholog line was excluded based on high overall mitochondrial gene expression, indicative of low sample quality based on preestablished filtering metrics. Qualitative analysis of ISH results was performed using a blinded approach by randomizing embryo identification numbers prior to annotation as described in the Methods. Morphometric data was collected blinded to genotype by randomizing sample identification numbers. ChIP-seq, RT-qPCR, and scRNA-seq were performed without group allocation blinding as all biological and technical replicates were processed identically and in parallel and no qualitative analyses were required for these experiments. We did not consider the sex of embryonic samples as a variable in our studies. Reporting summary Further information on research design is available in the Nature Research Reporting Summary linked to this article. 
\ No newline at end of file From 3bae482a185269ad20735f5a908d187d577e83cd Mon Sep 17 00:00:00 2001 From: Michael Antoun <antounmichael@yahoo.com> Date: Sat, 11 Nov 2023 17:20:12 -0800 Subject: [PATCH 26/31] functionality to run script on all new entries --- document_analysis.py | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/document_analysis.py b/document_analysis.py index 573c149..6317f34 100644 --- a/document_analysis.py +++ b/document_analysis.py @@ -1,10 +1,12 @@ from VectorDatabase import Lantern, Publication, Fragment from google_sheets import SheetsApiClient +from prompts import get_qbi_hackathon_prompt from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain.chat_models import ChatOpenAI from langchain.chains import RetrievalQA +from langchain.embeddings.openai import OpenAIEmbeddings from langchain import PromptTemplate from datetime import date @@ -15,9 +17,15 @@ class DocumentAnalyzer: """ def __init__(self): - # self.lantern = Lantern() + self.lantern = Lantern() self.sheets = SheetsApiClient() + self.llm = LlmHandler() + def analyze_all_unread(self): + """pulls all new files from Lantern database, evaluates them, and publishes results to google sheets + """ + publications = lantern.getUnreadPublications() + self.process_publications(publications) def process_publications(self, publications: [Publication]): """takes a list of publications, applies retrievalQA and processes responses @@ -67,19 +75,22 @@ def update_spreadsheet(rows: [], hits: int, notify=True): sheets.notify_arthur(message=msg) - def analyze_publication(self, publication: Publication): - """leaving this blank for now because i think the way these are stored is changing + def analyze_publication(self, text_embeddings: []): + """poses a question about the document, processes the result and returns it + NOTE: for now, only uses the hackathon question, might add more later Args: - publication (Publication): publication to be analyzed + text_embeddings ([]): list of (embedding, text) pairs from document to be analyzed Returns: bool: classification of response to query as positive (True) or negative (False) str: response from chatGPT """ - #faissIndex = FAISS.from_embeddings(text_embeddings=text_embeddings, embedding=open_ai_emb) - #result = llm.evaluate_queries(faissIndex, query) - response = None + # NOTE: These very likely need to change + open_ai_emb = OpenAIEmbeddings() + query = get_qbi_hackathon_prompt() + faissIndex = FAISS.from_embeddings(text_embeddings=text_embeddings, embedding=open_ai_emb) + response = self.llm.evaluate_queries(faissIndex, query)[0] return self.classify_response(response), response @staticmethod @@ -131,24 +142,24 @@ def evaluate_queries(self, embedding, queries): ) template = """ {query}? 
""" - response = [] + responses = [] for q in queries: prompt = PromptTemplate( input_variables=["query"], template=template, ) - response.append(chatbot.run( + responses.append(chatbot.run( prompt.format(query=q) )) - return response + return responses def main(): - x = DocumentAnalyzer() - l = LlmHandler() + document_analyzer = DocumentAnalyzer() + document_analyzer.analyze_all_unread() #analyzes all new files in lantern db if __name__ == '__main__': main() \ No newline at end of file From 832ff11243550c8ac77d10fa82214e33d2854051 Mon Sep 17 00:00:00 2001 From: Michael Antoun <antounmichael@yahoo.com> Date: Sat, 11 Nov 2023 18:33:37 -0800 Subject: [PATCH 27/31] allow any number of email addresses --- document_analysis.py | 94 +++++++++++++++++++++++--------------------- google_sheets.py | 29 ++++++++------ hackathon_runner.py | 2 +- 3 files changed, 66 insertions(+), 59 deletions(-) diff --git a/document_analysis.py b/document_analysis.py index 6317f34..be85901 100644 --- a/document_analysis.py +++ b/document_analysis.py @@ -1,7 +1,8 @@ +import re -from VectorDatabase import Lantern, Publication, Fragment +from VectorDatabase import Lantern, Publication from google_sheets import SheetsApiClient -from prompts import get_qbi_hackathon_prompt +from prompts import get_qbi_hackathon_prompt, METHODS_KEYWORDS from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain.chat_models import ChatOpenAI @@ -9,22 +10,26 @@ from langchain.embeddings.openai import OpenAIEmbeddings from langchain import PromptTemplate from datetime import date +from langchain.vectorstores import FAISS class DocumentAnalyzer: - """Takes in a list of publications to analyze, then prompts the chatbot, processes the response, aggregates the results, - and reports the results to the spreadsheet - """ - + """Takes in a list of publications to analyze, then prompts the chatbot, processes the response, + aggregates the results, and reports the results to the spreadsheet + """ + def __init__(self): self.lantern = Lantern() self.sheets = SheetsApiClient() self.llm = LlmHandler() - + + self.email_addresses = [] + self.notification_via_email = True + def analyze_all_unread(self): """pulls all new files from Lantern database, evaluates them, and publishes results to google sheets """ - publications = lantern.getUnreadPublications() + publications = self.lantern.getUnreadPublications() self.process_publications(publications) def process_publications(self, publications: [Publication]): @@ -34,46 +39,45 @@ def process_publications(self, publications: [Publication]): Args: publications ([]): list of publications """ - query = [f"You are reading a materials and methods section of a scientific paper. Here is the list of structural biology methods {methods_string}.\n\n Did the authors use any methods from the list? 
\n\n Answer with Yes or No followed by the names of the methods."] rows = [] hits = 0 for pub in publications: text_embeddings = self.lantern.get_embeddings_for_pub(pub.id) classification, response = 0, '' - if self.paper_about_cryoem(text_embeddings): + if self.paper_about_cryoem(text_embeddings): classification, response = self.analyze_publication(text_embeddings) hits += classification else: - #print('paper not about cryo-em') + # print('paper not about cryo-em') pass # add date if it's added rows.append([pub.doi, pub.title, "", str(date.today()), "", int(classification), response, ""]) self.update_spreadsheet(rows, hits) - - def update_spreadsheet(rows: [], hits: int, notify=True): + + def update_spreadsheet(self, rows: [], hits: int): """pushes a list of rows to the spreadsheet and notifies via email Args: rows ([]): rows of data to be uploaded to sheet hits (int): number of positive classifications in the rows - notify (bool): notify via email if True """ - if hits > len(rows): raise ValueError(f"Number of hits ({hits}) is greater than the number of entries ({len(rows)}), sus") - - #print(rows) + self.sheets.append_rows(rows) + + if self.notification_via_email: + self.email(hits, len(rows)) + + def email(self, hits: int, total: int): msg = f""" - This batch of paper analysis has concluded. - {len(rows)} papers were analyzed in total over the date range 11/2 - 11/3 - {hits} {"were" if (hits != 1) else "was"} classified as having multi-method structural data""" +This batch of paper analysis has concluded. +{total} papers were analyzed in total over the date range 11/2 - 11/3 +{hits} {"were" if (hits != 1) else "was"} classified as having multi-method structural data""" - if notify: - sheets.notify_arthur(message=msg) - + self.sheets.email(msg, self.email_addresses) def analyze_publication(self, text_embeddings: []): """poses a question about the document, processes the result and returns it @@ -88,9 +92,9 @@ def analyze_publication(self, text_embeddings: []): """ # NOTE: These very likely need to change open_ai_emb = OpenAIEmbeddings() - query = get_qbi_hackathon_prompt() - faissIndex = FAISS.from_embeddings(text_embeddings=text_embeddings, embedding=open_ai_emb) - response = self.llm.evaluate_queries(faissIndex, query)[0] + query = get_qbi_hackathon_prompt(METHODS_KEYWORDS) + faiss_index = FAISS.from_embeddings(text_embeddings=text_embeddings, embedding=open_ai_emb) + response = self.llm.evaluate_queries(faiss_index, query)[0] return self.classify_response(response), response @staticmethod @@ -103,14 +107,16 @@ def classify_response(response: str): Returns: bool: True if answer to question is "yes" """ - if result == None: + if response is None: return False - # this was used to filter out cases where ChatGPT said "Yes, Cryo-EM was used..." 
which is wrong because we asked it about + # this was used to filter out cases where ChatGPT said "Yes, Cryo-EM was used...", + # which is wrong because we asked it about # inclusion of non-cryo-em stuff - #if "cryo" in response.lower(): + # + # if "cryo" in response.lower(): # return (False, None) return response.lower().startswith('yes') - + @staticmethod def paper_about_cryoem(text_embeddings: []): """checks if the string "cryoem" or "cryo-em" is present in the text @@ -121,26 +127,25 @@ def paper_about_cryoem(text_embeddings: []): Returns: bool: True if the text mentions cryo-em """ - return any(re.search("cryo-?em", text, re.IGNORECASE) for text, _ in embeddings) + return any(re.search("cryo-?em", text, re.IGNORECASE) for text, _ in text_embeddings) class LlmHandler: - """pulled this straight from the hackathon code, should work though + """Handles creation of langchain and evaluation of queries """ def __init__(self): - self.llm=ChatOpenAI( - temperature=0, model_name="gpt-4", max_tokens=300, request_timeout = 30, max_retries=3 - ) - - + self.llm = ChatOpenAI( + temperature=0, model_name="gpt-4", max_tokens=300, request_timeout=30, max_retries=3 + ) + def evaluate_queries(self, embedding, queries): chatbot = RetrievalQA.from_chain_type( - llm=self.llm, - chain_type="stuff", - retriever=embedding.as_retriever(search_type="similarity", search_kwargs={"k":3}) + llm=self.llm, + chain_type="stuff", + retriever=embedding.as_retriever(search_type="similarity", search_kwargs={"k": 3}) ) - + template = """ {query}? """ responses = [] for q in queries: @@ -155,11 +160,10 @@ def evaluate_queries(self, embedding, queries): return responses - - def main(): document_analyzer = DocumentAnalyzer() - document_analyzer.analyze_all_unread() #analyzes all new files in lantern db + document_analyzer.analyze_all_unread() # analyzes all new files in lantern db + if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/google_sheets.py b/google_sheets.py index 123e117..9feccf8 100644 --- a/google_sheets.py +++ b/google_sheets.py @@ -1,6 +1,6 @@ import os import gspread -import typing + class SheetsApiClient: """interface for all functionality with google sheets @@ -20,16 +20,17 @@ class SheetsApiClient: ] def __init__(self): - self.connect() + self.client = self.connect() self.spreadsheet = self.client.open(type(self).SPREADSHEET_NAME) self.worksheet = self.spreadsheet.get_worksheet(0) - def connect(self): + @staticmethod + def connect(): """connects to Google Sheets API service using private key file """ try: secret_file = os.path.join(os.getcwd(), "google_sheets_credentials.json") - self.client = gspread.service_account(secret_file) + return gspread.service_account(secret_file) except OSError as e: print(e) @@ -48,18 +49,20 @@ def append_rows(self, rows: [[str]]): self._check_row(row) self.worksheet.append_rows(rows) - def notify_arthur(self, message: str): + def email(self, message: str, email_addresses: [str]): """Shares the spreadsheet with arthur, along with the message in an email Args: - message (str): + message (str): message to be sent + email_addresses ([str]): recipients of notification """ - self.spreadsheet.share( - "aozalevsky@gmail.com", - perm_type="user", - role="writer", - notify=True, - email_message=message, - ) + for email_address in email_addresses: + self.spreadsheet.share( + email_address, + perm_type="user", + role="writer", + notify=True, + email_message=message, + ) @staticmethod def _check_row(row: []): diff --git a/hackathon_runner.py 
b/hackathon_runner.py index 761c6e7..27b8319 100644 --- a/hackathon_runner.py +++ b/hackathon_runner.py @@ -260,7 +260,7 @@ def main(): {hits} {"were" if ((hits>0) or (hits == 0)) else was} classified as having multi-method structural data """ print(msg) - gs.notify_arthur(message=msg) + gs.email(message=msg) main() From 9502a5a5edae0b84de1ed7c3b9e6ef6fdd1b1ebe Mon Sep 17 00:00:00 2001 From: Michael Antoun <antounmichael@yahoo.com> Date: Sat, 11 Nov 2023 18:37:55 -0800 Subject: [PATCH 28/31] added warning --- google_sheets.py | 1 + 1 file changed, 1 insertion(+) diff --git a/google_sheets.py b/google_sheets.py index 9feccf8..b93f978 100644 --- a/google_sheets.py +++ b/google_sheets.py @@ -44,6 +44,7 @@ def append_row(self, row: [str]): def append_rows(self, rows: [[str]]): """ Adds a list of rows to the spreadsheet, each row must follow SCHEMA: + WARNING: Assumes that the [rows] list will never exceed the maximum throughput of one api call """ for row in rows: self._check_row(row) From dbe4cf54a50f392580b16401f01cf4a3e85cb591 Mon Sep 17 00:00:00 2001 From: Michael Antoun <antounmichael@yahoo.com> Date: Sat, 11 Nov 2023 18:51:28 -0800 Subject: [PATCH 29/31] changed default spreadsheet permission to read only --- google_sheets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/google_sheets.py b/google_sheets.py index b93f978..0d0e38d 100644 --- a/google_sheets.py +++ b/google_sheets.py @@ -60,7 +60,7 @@ def email(self, message: str, email_addresses: [str]): self.spreadsheet.share( email_address, perm_type="user", - role="writer", + role="reader", notify=True, email_message=message, ) From b567551f41fd435ecd755b88c9867f0959789451 Mon Sep 17 00:00:00 2001 From: Michael Antoun <antounmichael@yahoo.com> Date: Sat, 11 Nov 2023 19:07:24 -0800 Subject: [PATCH 30/31] moved code around to fix error --- VectorDatabase.py | 76 +++++++++++++++++++++++------------------------ 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/VectorDatabase.py b/VectorDatabase.py index f75485b..f924c0d 100644 --- a/VectorDatabase.py +++ b/VectorDatabase.py @@ -1,5 +1,39 @@ import psycopg2 -from database_entities import Fragment, Publication + + +# Class to represent a publication with attributes id, title, pmc, pubmed, and doi +class Publication: + id = "" + title = "" + pmc = "" + pubmed = "" + doi = "" + + def __init__(self, id, title, pmc, pubmed, doi): + self.id = id # (DOI) Unique identifier for the publication + self.title = title # Title of the publication + self.pmc = pmc # PubMed Central (PMC) Link + self.pubmed = pubmed # PubMed Link + self.doi = doi # Digital Object Identifier (DOI) Link for the publication + + +# Class to represent a fragment of a publication with attributes id, header, content, and vector +class Fragment: + # Class variables to store default values for attributes + id = "" + header = "" + content = "" + vector = "" + + def __init__(self, id, header, content, vector): + # Constructor to initialize the attributes of the Fragment object + + # Set the attributes of the object with the values provided during instantiation + self.id = id # (DOI) Unique identifier for the fragment + self.header = header # Header or title of the fragment + self.content = content # Content or text of the fragment + self.vector = vector # Vector representation of the fragment + # Lantern class that exposes functionality of database to application class Lantern: @@ -249,7 +283,7 @@ def getUnreadPublications(self, delete_unread_entries=True): if delete_unread_entries: 
cursor.execute('DELETE FROM unread;') - + conn.commit() cursor.close() @@ -292,49 +326,15 @@ def publicationExists(self, id): - [(text, embedding)] content of a publication's embeddings Notes: """ + def get_embeddings_for_pub(self, id): texts = [] embeddings = [] if not self.publicationExists(id): - return + return fragments = self.getAllFragmentsOfPublication(id) for fragment in fragments: texts.append(fragment.content) embeddings.append(fragment.vector) text_embeddings = list(zip(texts, embeddings)) return text_embeddings - -# Class to represent a publication with attributes id, title, pmc, pubmed, and doi -class Publication: - - id = "" - title = "" - pmc = "" - pubmed = "" - doi = "" - - def __init__(self, id, title, pmc, pubmed, doi): - self.id = id # (DOI) Unique identifier for the publication - self.title = title # Title of the publication - self.pmc = pmc # PubMed Central (PMC) Link - self.pubmed = pubmed # PubMed Link - self.doi = doi # Digital Object Identifier (DOI) Link for the publication - -# Class to represent a fragment of a publication with attributes id, header, content, and vector -class Fragment: - - - # Class variables to store default values for attributes - id = "" - header = "" - content = "" - vector = "" - - def __init__(self, id, header, content, vector): - # Constructor to initialize the attributes of the Fragment object - - # Set the attributes of the object with the values provided during instantiation - self.id = id # (DOI) Unique identifier for the fragment - self.header = header # Header or title of the fragment - self.content = content # Content or text of the fragment - self.vector = vector # Vector representation of the fragment From 02fc7b47b6e1abfe5e6e5be4c6a53e958d97f968 Mon Sep 17 00:00:00 2001 From: Michael Antoun <antounmichael@yahoo.com> Date: Sat, 11 Nov 2023 19:12:22 -0800 Subject: [PATCH 31/31] added config parsing to decide email addresses and operation mode --- config.json | 2 +- document_analysis.py | 27 ++++++++++++++++++++++++--- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/config.json b/config.json index 0a315c3..f40fb96 100644 --- a/config.json +++ b/config.json @@ -1,4 +1,4 @@ { - "Emails": [], + "emails": ["aozalevsky@gmail.com", "steveurkel99@gmail.com"], "DEBUG": false } \ No newline at end of file diff --git a/document_analysis.py b/document_analysis.py index be85901..0326e24 100644 --- a/document_analysis.py +++ b/document_analysis.py @@ -1,3 +1,4 @@ +import json import re from VectorDatabase import Lantern, Publication @@ -18,13 +19,33 @@ class DocumentAnalyzer: aggregates the results, and reports the results to the spreadsheet """ + CONFIG_PATH = "./config.json" + def __init__(self): self.lantern = Lantern() self.sheets = SheetsApiClient() self.llm = LlmHandler() - self.email_addresses = [] - self.notification_via_email = True + self.email_addresses, self.notification_via_email = self.parse_config() + + @staticmethod + def parse_config(): + try: + with open(DocumentAnalyzer.CONFIG_PATH, 'r') as config_file: + config_data = json.load(config_file) + + # Extracting fields from the config_data + my_list = config_data.get('emails', []) # Default to an empty list if 'my_list' is not present + my_bool = config_data.get('DEBUG', False) # Default to False if 'my_bool' is not present + + return my_list, my_bool + + except FileNotFoundError: + print(f"Config file '{DocumentAnalyzer.CONFIG_PATH}' not found. 
Using defaults (no email addresses)") + return [], False + except json.JSONDecodeError as e: + print(f"Error decoding JSON in '{DocumentAnalyzer.CONFIG_PATH}': {e}") + return None, None def analyze_all_unread(self): """pulls all new files from Lantern database, evaluates them, and publishes results to google sheets @@ -162,7 +183,7 @@ def evaluate_queries(self, embedding, queries): def main(): document_analyzer = DocumentAnalyzer() - document_analyzer.analyze_all_unread() # analyzes all new files in lantern db + #document_analyzer.analyze_all_unread() # analyzes all new files in lantern db if __name__ == '__main__':