Skip to content

Commit

Permalink
Merge branch 'feature/flatten_and_llm' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
dermatologist committed Jul 3, 2024
2 parents ec5b1a9 + bac2b15 commit 987f112
Show file tree
Hide file tree
Showing 20 changed files with 1,103 additions and 42 deletions.
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,16 @@ Virtual flattened view of *FHIR Bundle / ndjson / FHIR server / BigQuery!*
:fire: **FHIRy** is a [python](https://www.python.org/) package to facilitate health data analytics and machine learning by converting a folder of [FHIR bundles](https://www.hl7.org/fhir/bundle.html)/ndjson from [bulk data export](https://hl7.org/fhir/uv/bulkdata/export/index.html) into a [pandas](https://pandas.pydata.org/docs/user_guide/index.html) data frame for analysis. You can import the dataframe
into ML packages such as Tensorflow and PyTorch. **FHIRy also supports FHIR server search and FHIR tables on BigQuery.**

## UPDATE
Recently added support for **LLM based natural language queries** of FHIR bundles/ndjson using [llama-index](examples/llm_example.py). Please install the llm extras from the develop branch as follows. Please be cognizant of the privacy issues with publicly hosted LLMs. Any feedback will be highly appreciated. [See usage](examples/llm_example.py)!

```
git clone -b develop https://github.com/dermatologist/fhiry.git
cd fhiry
pip install -e .[llm]
```
[See usage](examples/llm_example.py).

Test this with the [synthea sample](https://synthea.mitre.org/downloads) or the downloaded ndjson from the [SMART Bulk data server](https://bulk-data.smarthealthit.org/). Use the 'Discussions' tab above for feature requests.

:sparkles: Checkout [this template](https://github.com/dermatologist/kedro-multimodal) for Multimodal machine learning in healthcare!
Expand Down
14 changes: 7 additions & 7 deletions dev-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ babel==2.9.1
# via sphinx
backports-entry-points-selectable==1.1.0
# via virtualenv
certifi==2023.11.17
certifi==2024.6.2
# via
# -c requirements.txt
# requests
Expand All @@ -37,7 +37,7 @@ filelock==3.0.12
# via
# tox
# virtualenv
idna==3.6
idna==3.7
# via
# -c requirements.txt
# requests
Expand All @@ -63,7 +63,7 @@ mdurl==0.1.2
# via markdown-it-py
myst-parser[linkify]==0.18.1
# via -r dev-requirements.in
packaging==23.2
packaging==24.1
# via
# -c requirements.txt
# pytest
Expand All @@ -88,15 +88,15 @@ pytest==7.1.2
# pytest-cov
pytest-cov==3.0.0
# via -r dev-requirements.in
pytz==2023.3.post1
pytz==2024.1
# via
# -c requirements.txt
# babel
pyyaml==6.0.1
# via myst-parser
recommonmark==0.7.1
# via -r dev-requirements.in
requests==2.31.0
requests==2.32.3
# via
# -c requirements.txt
# responses
Expand Down Expand Up @@ -142,11 +142,11 @@ tox==3.25.0
# via -r dev-requirements.in
types-toml==0.10.8.1
# via responses
typing-extensions==4.8.0
typing-extensions==4.12.2
# via myst-parser
uc-micro-py==1.0.2
# via linkify-it-py
urllib3==2.1.0
urllib3==2.2.2
# via
# -c requirements.txt
# requests
Expand Down
25 changes: 25 additions & 0 deletions examples/llm_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""Example of using LLMs with FHIRy
git clone https://github.com/dermatologist/fhiry.git@develop
cd fhiry
pip install -e .[llm]
"""
# Import any LLMs that llama_index supports and you have access to
# Require OpenAI API key to use OpenAI LLMs
from llama_index.llms import Vertex
from fhiry.fhirsearch import Fhirsearch

fs = Fhirsearch(fhir_base_url = "https://hapi.fhir.org/baseR4/")
df = fs.search(resource_type = "Condition", search_parameters = {})
# print(df.info())

# Create a Vertex LLM
llm = Vertex(
model="chat-bison"
)
query = "How many patients have a disease like rheumatoid arthritis?"
_command = fs.llm_query(query, llm)

print(_command)

print(df["resource.code.text"].str.contains("rheumatoid arthritis").sum())

1 change: 0 additions & 1 deletion notes/work1.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from types import CodeType
import pandas as pd
import numpy as np
import fhir.resources
with open('/gpfs/fs0/scratch/a/archer/beapen/home/scratch/fhiry/data/fhir/Aaafhir.json', 'r') as f:
json_in = f.read()

Expand Down
62 changes: 33 additions & 29 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,91 +4,95 @@
#
# pip-compile
#
cachetools==5.3.2
cachetools==5.3.3
# via google-auth
certifi==2023.11.17
certifi==2024.6.2
# via requests
charset-normalizer==3.3.2
# via requests
db-dtypes==1.1.1
db-dtypes==1.2.0
# via fhiry (setup.py)
google-api-core[grpc]==2.14.0
google-api-core[grpc]==2.19.1
# via
# google-api-core
# google-cloud-bigquery
# google-cloud-core
google-auth==2.23.4
google-auth==2.31.0
# via
# google-api-core
# google-cloud-bigquery
# google-cloud-core
google-cloud-bigquery==3.13.0
google-cloud-bigquery==3.25.0
# via fhiry (setup.py)
google-cloud-core==2.3.3
google-cloud-core==2.4.1
# via google-cloud-bigquery
google-crc32c==1.5.0
# via google-resumable-media
google-resumable-media==2.6.0
google-resumable-media==2.7.1
# via google-cloud-bigquery
googleapis-common-protos==1.61.0
googleapis-common-protos==1.63.2
# via
# google-api-core
# grpcio-status
grpcio==1.59.3
grpcio==1.64.1
# via
# google-api-core
# google-cloud-bigquery
# grpcio-status
grpcio-status==1.59.3
grpcio-status==1.64.1
# via google-api-core
idna==3.6
idna==3.7
# via requests
numpy==1.26.2
numpy==1.26.4
# via
# db-dtypes
# fhiry (setup.py)
# pandas
# pyarrow
packaging==23.2
packaging==24.1
# via
# db-dtypes
# google-cloud-bigquery
pandas==2.1.3
pandas==2.2.2
# via
# db-dtypes
# fhiry (setup.py)
proto-plus==1.22.3
# via google-cloud-bigquery
protobuf==4.25.1
prodict==0.8.18
# via fhiry (setup.py)
proto-plus==1.24.0
# via google-api-core
protobuf==5.27.2
# via
# google-api-core
# google-cloud-bigquery
# googleapis-common-protos
# grpcio-status
# proto-plus
pyarrow==14.0.1
pyarrow==16.1.0
# via db-dtypes
pyasn1==0.5.1
pyasn1==0.6.0
# via
# pyasn1-modules
# rsa
pyasn1-modules==0.3.0
pyasn1-modules==0.4.0
# via google-auth
python-dateutil==2.8.2
python-dateutil==2.9.0.post0
# via
# google-cloud-bigquery
# pandas
pytz==2023.3.post1
pytz==2024.1
# via pandas
requests==2.31.0
requests==2.32.3
# via
# google-api-core
# google-cloud-bigquery
rsa==4.9
# via google-auth
six==1.16.0
# via python-dateutil
tqdm==4.66.1
timeago==1.0.16
# via fhiry (setup.py)
tqdm==4.66.4
# via fhiry (setup.py)
tzdata==2023.3
tzdata==2024.1
# via pandas
urllib3==2.1.0
urllib3==2.2.2
# via requests
7 changes: 7 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ install_requires =
db-dtypes
google-cloud-bigquery
tqdm
timeago
prodict
numpy<2

[options.packages.find]
where = src
Expand All @@ -67,6 +70,10 @@ exclude =
# Add here additional requirements for extra features, to install with:
# `pip install fhiry[PDF]` like:
# PDF = ReportLab; RXP
llm =
llama-index
langchain==0.0.350
overrides

# Add here test requirements (semicolon/line-separated)
testing =
Expand Down
1 change: 1 addition & 0 deletions src/fhiry/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from .fhirndjson import Fhirndjson
from .fhirsearch import Fhirsearch
from .bqsearch import BQsearch
from .flattenfhir import FlattenFhir

if sys.version_info[:2] >= (3, 8):
# TODO: Import directly (no need for conditional) when `python_requires = >= 3.8`
Expand Down
51 changes: 51 additions & 0 deletions src/fhiry/base_fhiry.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,16 @@
https://opensource.org/licenses/MIT
"""

from typing import Any
import pandas as pd
import json


def default_output_processor(
    output: str, df: pd.DataFrame, **output_kwargs: Any
) -> str:
    """Identity output processor: hand the LLM output back unchanged.

    Matches llama_index's ``output_processor`` callback signature; the
    dataframe and extra keyword arguments are accepted but ignored.
    """
    return output

class BaseFhiry(object):
def __init__(self, config_json=None):
self._df = None
Expand Down Expand Up @@ -135,3 +142,47 @@ def process_list(self, myList):
elif 'display' in entry:
myCodes.append(entry['display'])
return myCodes

def llm_query(self, query, llm, embed_model=None, verbose=True):
    """Execute a natural-language query against the dataframe using llama_index.

    Args:
        query (str): The natural language query.
        llm (Any): The language model to use.
        embed_model (str, optional): HuggingFace embedding model name.
            Defaults to None, which selects "BAAI/bge-small-en-v1.5".
        verbose (bool, optional): Verbose query engine or not. Defaults to True.

    Raises:
        Exception: llama_index (the ``llm`` extra) is not installed.
        Exception: Dataframe is empty.

    Returns:
        Any: Result of running the query through PandasQueryEngine.
    """
    # BUG FIX: the previous version set its "installed" flag to True *before*
    # attempting the imports inside the try block, so a failed import left the
    # flag True and execution continued to a NameError instead of the intended
    # "llama_index not installed" error. Raise directly on ImportError instead.
    try:
        from langchain.embeddings import HuggingFaceEmbeddings
        from llama_index import ServiceContext
        from llama_index.query_engine import PandasQueryEngine
    except ImportError as err:
        raise Exception("llama_index not installed") from err
    if self._df is None:
        raise Exception("Dataframe is empty")
    # Fall back to a small, widely available embedding model when none is given.
    model_name = "BAAI/bge-small-en-v1.5" if embed_model is None else embed_model
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    service_context = ServiceContext.from_defaults(
        llm=llm,
        embed_model=embeddings,
    )
    query_engine = PandasQueryEngine(
        df=self._df,
        service_context=service_context,
        output_processor=default_output_processor,
        verbose=verbose,
    )
    return query_engine.query(query)
3 changes: 2 additions & 1 deletion src/fhiry/fhirsearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ def search(self, resource_type="Patient", search_parameters={}):
else:
df = pd.DataFrame(columns=[])

return df
self._df = df
return self._df



Expand Down
Loading

0 comments on commit 987f112

Please sign in to comment.