Skip to content

Commit

Permalink
Merge branch 'feature/flatten_and_llm' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
dermatologist committed Jul 3, 2024
2 parents ec5b1a9 + bac2b15 commit 987f112
Show file tree
Hide file tree
Showing 20 changed files with 1,103 additions and 42 deletions.
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,16 @@ Virtual flattened view of *FHIR Bundle / ndjson / FHIR server / BigQuery!*
:fire: **FHIRy** is a [python](https://www.python.org/) package to facilitate health data analytics and machine learning by converting a folder of [FHIR bundles](https://www.hl7.org/fhir/bundle.html)/ndjson from [bulk data export](https://hl7.org/fhir/uv/bulkdata/export/index.html) into a [pandas](https://pandas.pydata.org/docs/user_guide/index.html) data frame for analysis. You can import the dataframe
into ML packages such as Tensorflow and PyTorch. **FHIRy also supports FHIR server search and FHIR tables on BigQuery.**

## UPDATE
Recently added support for **LLM based natural language queries** of FHIR bundles/ndjson using [llama-index](examples/llm_example.py). Please install the llm extras from the develop branch as follows. Please be cognizant of the privacy issues with publicly hosted LLMs. Any feedback will be highly appreciated. [See usage](examples/llm_example.py)!

```
git clone -b develop https://github.com/dermatologist/fhiry.git
cd fhiry
pip install -e .[llm]
```
[See usage](examples/llm_example.py).

Test this with the [synthea sample](https://synthea.mitre.org/downloads) or the downloaded ndjson from the [SMART Bulk data server](https://bulk-data.smarthealthit.org/). Use the 'Discussions' tab above for feature requests.

:sparkles: Checkout [this template](https://github.com/dermatologist/kedro-multimodal) for Multimodal machine learning in healthcare!
Expand Down
14 changes: 7 additions & 7 deletions dev-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ babel==2.9.1
# via sphinx
backports-entry-points-selectable==1.1.0
# via virtualenv
certifi==2023.11.17
certifi==2024.6.2
# via
# -c requirements.txt
# requests
Expand All @@ -37,7 +37,7 @@ filelock==3.0.12
# via
# tox
# virtualenv
idna==3.6
idna==3.7
# via
# -c requirements.txt
# requests
Expand All @@ -63,7 +63,7 @@ mdurl==0.1.2
# via markdown-it-py
myst-parser[linkify]==0.18.1
# via -r dev-requirements.in
packaging==23.2
packaging==24.1
# via
# -c requirements.txt
# pytest
Expand All @@ -88,15 +88,15 @@ pytest==7.1.2
# pytest-cov
pytest-cov==3.0.0
# via -r dev-requirements.in
pytz==2023.3.post1
pytz==2024.1
# via
# -c requirements.txt
# babel
pyyaml==6.0.1
# via myst-parser
recommonmark==0.7.1
# via -r dev-requirements.in
requests==2.31.0
requests==2.32.3
# via
# -c requirements.txt
# responses
Expand Down Expand Up @@ -142,11 +142,11 @@ tox==3.25.0
# via -r dev-requirements.in
types-toml==0.10.8.1
# via responses
typing-extensions==4.8.0
typing-extensions==4.12.2
# via myst-parser
uc-micro-py==1.0.2
# via linkify-it-py
urllib3==2.1.0
urllib3==2.2.2
# via
# -c requirements.txt
# requests
Expand Down
25 changes: 25 additions & 0 deletions examples/llm_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""Example of using LLMs with FHIRy
git clone https://github.com/dermatologist/fhiry.git@develop
cd fhiry
pip install -e .[llm]
"""
# Import any LLMs that llama_index supports and you have access to
# Require OpenAI API key to use OpenAI LLMs
from llama_index.llms import Vertex
from fhiry.fhirsearch import Fhirsearch

fs = Fhirsearch(fhir_base_url = "https://hapi.fhir.org/baseR4/")
df = fs.search(resource_type = "Condition", search_parameters = {})
# print(df.info())

# Create a Vertex LLM
llm = Vertex(
model="chat-bison"
)
query = "How many patients have a disease like rheumatoid arthritis?"
_command = fs.llm_query(query, llm)

print(_command)

print(df["resource.code.text"].str.contains("rheumatoid arthritis").sum())

1 change: 0 additions & 1 deletion notes/work1.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from types import CodeType
import pandas as pd
import numpy as np
import fhir.resources
with open('/gpfs/fs0/scratch/a/archer/beapen/home/scratch/fhiry/data/fhir/Aaafhir.json', 'r') as f:
json_in = f.read()

Expand Down
62 changes: 33 additions & 29 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,91 +4,95 @@
#
# pip-compile
#
cachetools==5.3.2
cachetools==5.3.3
# via google-auth
certifi==2023.11.17
certifi==2024.6.2
# via requests
charset-normalizer==3.3.2
# via requests
db-dtypes==1.1.1
db-dtypes==1.2.0
# via fhiry (setup.py)
google-api-core[grpc]==2.14.0
google-api-core[grpc]==2.19.1
# via
# google-api-core
# google-cloud-bigquery
# google-cloud-core
google-auth==2.23.4
google-auth==2.31.0
# via
# google-api-core
# google-cloud-bigquery
# google-cloud-core
google-cloud-bigquery==3.13.0
google-cloud-bigquery==3.25.0
# via fhiry (setup.py)
google-cloud-core==2.3.3
google-cloud-core==2.4.1
# via google-cloud-bigquery
google-crc32c==1.5.0
# via google-resumable-media
google-resumable-media==2.6.0
google-resumable-media==2.7.1
# via google-cloud-bigquery
googleapis-common-protos==1.61.0
googleapis-common-protos==1.63.2
# via
# google-api-core
# grpcio-status
grpcio==1.59.3
grpcio==1.64.1
# via
# google-api-core
# google-cloud-bigquery
# grpcio-status
grpcio-status==1.59.3
grpcio-status==1.64.1
# via google-api-core
idna==3.6
idna==3.7
# via requests
numpy==1.26.2
numpy==1.26.4
# via
# db-dtypes
# fhiry (setup.py)
# pandas
# pyarrow
packaging==23.2
packaging==24.1
# via
# db-dtypes
# google-cloud-bigquery
pandas==2.1.3
pandas==2.2.2
# via
# db-dtypes
# fhiry (setup.py)
proto-plus==1.22.3
# via google-cloud-bigquery
protobuf==4.25.1
prodict==0.8.18
# via fhiry (setup.py)
proto-plus==1.24.0
# via google-api-core
protobuf==5.27.2
# via
# google-api-core
# google-cloud-bigquery
# googleapis-common-protos
# grpcio-status
# proto-plus
pyarrow==14.0.1
pyarrow==16.1.0
# via db-dtypes
pyasn1==0.5.1
pyasn1==0.6.0
# via
# pyasn1-modules
# rsa
pyasn1-modules==0.3.0
pyasn1-modules==0.4.0
# via google-auth
python-dateutil==2.8.2
python-dateutil==2.9.0.post0
# via
# google-cloud-bigquery
# pandas
pytz==2023.3.post1
pytz==2024.1
# via pandas
requests==2.31.0
requests==2.32.3
# via
# google-api-core
# google-cloud-bigquery
rsa==4.9
# via google-auth
six==1.16.0
# via python-dateutil
tqdm==4.66.1
timeago==1.0.16
# via fhiry (setup.py)
tqdm==4.66.4
# via fhiry (setup.py)
tzdata==2023.3
tzdata==2024.1
# via pandas
urllib3==2.1.0
urllib3==2.2.2
# via requests
7 changes: 7 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ install_requires =
db-dtypes
google-cloud-bigquery
tqdm
timeago
prodict
numpy<2

[options.packages.find]
where = src
Expand All @@ -67,6 +70,10 @@ exclude =
# Add here additional requirements for extra features, to install with:
# `pip install fhiry[PDF]` like:
# PDF = ReportLab; RXP
llm =
llama-index
langchain==0.0.350
overrides

# Add here test requirements (semicolon/line-separated)
testing =
Expand Down
1 change: 1 addition & 0 deletions src/fhiry/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from .fhirndjson import Fhirndjson
from .fhirsearch import Fhirsearch
from .bqsearch import BQsearch
from .flattenfhir import FlattenFhir

if sys.version_info[:2] >= (3, 8):
# TODO: Import directly (no need for conditional) when `python_requires = >= 3.8`
Expand Down
51 changes: 51 additions & 0 deletions src/fhiry/base_fhiry.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,16 @@
https://opensource.org/licenses/MIT
"""

from typing import Any
import pandas as pd
import json


def default_output_processor(
    output: str, df: pd.DataFrame, **output_kwargs: Any
) -> str:
    """Identity output processor: hand the LLM output back unchanged.

    Matches llama_index's ``output_processor`` callback signature; the
    dataframe and extra keyword arguments are accepted but ignored.
    """
    return output

class BaseFhiry(object):
def __init__(self, config_json=None):
self._df = None
Expand Down Expand Up @@ -135,3 +142,47 @@ def process_list(self, myList):
elif 'display' in entry:
myCodes.append(entry['display'])
return myCodes

def llm_query(self, query, llm, embed_model=None, verbose=True):
    """Execute a natural-language query against the dataframe using llama_index.

    Args:
        query (str): The natural language query.
        llm (Any): The language model to use.
        embed_model (str, optional): HuggingFace embedding model name.
            Defaults to None, which selects "BAAI/bge-small-en-v1.5".
        verbose (bool, optional): Verbose query engine or not. Defaults to True.

    Raises:
        Exception: llama_index (the ``llm`` extra) is not installed.
        Exception: Dataframe is empty.

    Returns:
        Any: Result of running the query through PandasQueryEngine.
    """
    # BUG FIX: the previous version set its "installed" flag to True *before*
    # attempting the imports inside the try block, so a failed import left the
    # flag True and execution continued to a NameError instead of the intended
    # "llama_index not installed" error. Raise directly on ImportError instead.
    try:
        from langchain.embeddings import HuggingFaceEmbeddings
        from llama_index import ServiceContext
        from llama_index.query_engine import PandasQueryEngine
    except ImportError as err:
        raise Exception("llama_index not installed") from err
    if self._df is None:
        raise Exception("Dataframe is empty")
    # Fall back to a small, widely available embedding model when none is given.
    model_name = "BAAI/bge-small-en-v1.5" if embed_model is None else embed_model
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    service_context = ServiceContext.from_defaults(
        llm=llm,
        embed_model=embeddings,
    )
    query_engine = PandasQueryEngine(
        df=self._df,
        service_context=service_context,
        output_processor=default_output_processor,
        verbose=verbose,
    )
    return query_engine.query(query)
3 changes: 2 additions & 1 deletion src/fhiry/fhirsearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ def search(self, resource_type="Patient", search_parameters={}):
else:
df = pd.DataFrame(columns=[])

return df
self._df = df
return self._df



Expand Down
Loading

0 comments on commit 987f112

Please sign in to comment.