Merge pull request #34 from cvs-health/release-branch/v0.1.2
Release PR: v0.1.2
dylanbouchard authored Nov 11, 2024
2 parents 4c3dae8 + b2d1469 commit 6563ba2
Showing 21 changed files with 903 additions and 1,018 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
@@ -28,7 +28,7 @@ jobs:
python-version: ${{matrix.python-version}}

- name: Install dependencies
- run: python -m pip install pytest pytest-asyncio .
+ run: python -m pip install pytest pytest-asyncio langchain-openai .

- name: Run tests
run: pytest -v
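
A plausible reason for the new CI dependency (an assumption based on the notebook changes in this PR, not stated in the diff): the updated demos construct an Azure OpenAI chat model directly, so the test environment needs langchain-openai importable. A minimal sketch of such a construction, with placeholder values:

# Hypothetical illustration only; all values are placeholders.
from langchain_openai import AzureChatOpenAI

llm = AzureChatOpenAI(
    deployment_name="my-deployment",                    # placeholder
    openai_api_key="placeholder-key",                   # placeholder
    azure_endpoint="https://example.openai.azure.com",  # placeholder
    openai_api_version="2024-02-01",                    # placeholder
    temperature=1,
)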
128 changes: 68 additions & 60 deletions README.md

Large diffs are not rendered by default.

Binary file modified assets/images/use_case_framework.PNG
127 changes: 41 additions & 86 deletions examples/evaluations/classification/classification_metrics_demo.ipynb

Large diffs are not rendered by default.

111 changes: 46 additions & 65 deletions examples/evaluations/recommendation/recommendation_metrics_demo.ipynb

Large diffs are not rendered by default.

92 changes: 65 additions & 27 deletions examples/evaluations/text_generation/auto_eval_demo.ipynb
@@ -30,11 +30,20 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"metadata": {
"tags": []
},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/envs/brand-new3/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"# Run if python-dotenv not installed\n",
"# import sys\n",
@@ -45,7 +54,6 @@
"\n",
"import pandas as pd\n",
"from dotenv import find_dotenv, load_dotenv\n",
"from langchain_openai import AzureChatOpenAI\n",
"\n",
"from langfair.auto import AutoEval\n",
"\n",
@@ -54,7 +62,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {
"tags": []
},
@@ -81,7 +89,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"metadata": {
"tags": []
},
@@ -103,7 +111,7 @@
" \"#Person1#: Watsup, ladies! Y'll looking'fine tonight. May I have this dance?\\\\n#Person2#: He's cute! He looks like Tiger Woods! But, I can't dance. . .\\\\n#Person1#: It's all good. I'll show you all the right moves. My name's Malik.\\\\n#Person2#: Nice to meet you. I'm Wen, and this is Nikki.\\\\n#Person1#: How you feeling', vista? Mind if I take your friend'round the dance floor?\\\\n#Person2#: She doesn't mind if you don't mind getting your feet stepped on.\\\\n#Person1#: Right. Cool! Let's go!\\n\"]"
]
},
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@@ -122,7 +130,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"metadata": {
"tags": []
},
@@ -143,8 +151,9 @@
"A list of input prompts for the model.\n",
"- `responses` - (**list of strings, default=None**)\n",
"A list of generated output from an LLM. If not available, responses are computed using the model.\n",
"- `langchain_llm` (**langchain llm (Runnable), default=None**) A langchain llm object to get passed to LLMChain `llm` argument.\n",
"- `langchain_llm` (**langchain llm (Runnable), default=None**) A langchain llm object to get passed to LLMChain `llm` argument. \n",
"- `max_calls_per_min` (**int, default=None**) Specifies how many api calls to make per minute to avoid a rate limit error. By default, no limit is specified.\n",
"- `suppressed_exceptions` (**tuple, default=None**) Specifies which exceptions to handle as 'Unable to get response' rather than raising the exception\n",
"- `metrics` - (**dict or list of str, default is all metrics**)\n",
"Specifies which metrics to evaluate.\n",
"- `toxicity_device` - (**str or torch.device input or torch.device object, default=\"cpu\"**)\n",
@@ -172,7 +181,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Instantiate an `AutoEval` class object and provide prompts, model name or LLM object, and responses (optional) as inputs. "
"Below we use LangFair's `AutoEval` class to conduct a comprehensive bias and fairness assessment for our text generation/summarization use case. To instantiate the `AutoEval` class, provide prompts and LangChain LLM object. We provide two examples of LangChain LLMs below, but these can be replaced with a LangChain LLM of your choice.\n",
"\n",
"**Important:** When installing community packages for LangChain, please ensure that the package version is compatible with `langchain<0.2.0`. Incompatibility may lead to unexpected errors or issues."
]
},
{
@@ -183,6 +194,30 @@
},
"outputs": [],
"source": [
"# # Run if langchain-google-vertexai not installed (must be compatible with langchain<0.2.0). Note: kernel restart may be required.\n",
"# import sys\n",
"# !{sys.executable} -m pip install langchain-google-vertexai==0.1.3\n",
"\n",
"# # Example with Gemini-Pro on VertexAI\n",
"# from langchain_google_vertexai import VertexAI\n",
"# llm = VertexAI(model_name='gemini-pro', temperature=1)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Run if langchain-openai not installed (must be compatible with langchain<0.2.0)\n",
"# import sys\n",
"# !{sys.executable} -m pip install langchain-openai==0.1.6\n",
"\n",
"# Example with AzureChatOpenAI\n",
"import openai\n",
"from langchain_openai import AzureChatOpenAI\n",
"llm = AzureChatOpenAI(\n",
" deployment_name=DEPLOYMENT_NAME,\n",
" openai_api_key=API_KEY,\n",
@@ -195,7 +230,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 9,
"metadata": {
"tags": []
},
@@ -206,6 +241,7 @@
"auto_object = AutoEval(\n",
" prompts=prompts, # small sample used as an example; in practice, a bigger sample should be used\n",
" langchain_llm=llm,\n",
" suppressed_exceptions=(openai.BadRequestError, ValueError), # this suppresses content filtering errors \n",
" neutralize_tokens=True\n",
" # toxicity_device=device # use if GPU is available\n",
")"
@@ -215,12 +251,14 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Call `evaluate` method to compute scores corresponding to supported metrics."
"Call `evaluate` method to compute scores corresponding to supported metrics.\n",
"\n",
"Note that this may take some time due to evaluation being computationally intensive. Consider using GPU acceleration for faster processing."
]
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 10,
"metadata": {
"tags": []
},
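
A hedged sketch of the call this cell makes: in LangFair's demo notebooks, `AutoEval.evaluate` is asynchronous and awaited directly (notebook cells support top-level await), and `print_results` is assumed to be the helper that renders the formatted summary shown further below.

# Run the full assessment and print the summary (assumed helper names).
results = await auto_object.evaluate()
auto_object.print_results()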
@@ -277,7 +315,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 11,
"metadata": {
"tags": []
},
@@ -287,21 +325,21 @@
"output_type": "stream",
"text": [
"\u001b[1m1. Toxicity Assessment\u001b[0m \n",
"- Toxic Fraction 0.0028 \n",
"- Expected Maximum Toxicity 0.0141 \n",
"- Toxic Fraction 0.0024 \n",
"- Expected Maximum Toxicity 0.0131 \n",
"- Toxicity Probability 0.0100 \n",
"\u001b[1m2. Stereotype Assessment\u001b[0m \n",
"- Stereotype Association 0.3577 \n",
"- Cooccurrence Bias 0.4055 \n",
"- Stereotype Fraction - gender 0.1928 \n",
"- Expected Maximum Stereotype - gender 0.3587 \n",
"- Stereotype Probability - gender 0.5400 \n",
"- Stereotype Association 0.3534 \n",
"- Cooccurrence Bias 0.5349 \n",
"- Stereotype Fraction - gender 0.1892 \n",
"- Expected Maximum Stereotype - gender 0.3604 \n",
"- Stereotype Probability - gender 0.5800 \n",
"\u001b[1m3. Counterfactual Assessment\u001b[0m \n",
" male-female \n",
"- Cosine Similarity 0.8706 \n",
"- RougeL Similarity 0.5249 \n",
"- Bleu Similarity 0.2907 \n",
"- Sentiment Bias 0.0034 \n",
"- Cosine Similarity 0.8403 \n",
"- RougeL Similarity 0.5065 \n",
"- Bleu Similarity 0.2784 \n",
"- Sentiment Bias 0.0040 \n",
"\n"
]
}
@@ -624,15 +662,15 @@
],
"metadata": {
"environment": {
"kernel": "langfair",
"kernel": "langfair0.1.2-beta",
"name": "workbench-notebooks.m125",
"type": "gcloud",
"uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/workbench-notebooks:m125"
},
"kernelspec": {
"display_name": "langfair",
"display_name": "langfair0.1.2-beta",
"language": "python",
"name": "langfair"
"name": "langfair0.1.2-beta"
},
"language_info": {
"codemirror_mode": {
395 changes: 176 additions & 219 deletions examples/evaluations/text_generation/counterfactual_metrics_demo.ipynb

Large diffs are not rendered by default.

233 changes: 111 additions & 122 deletions examples/evaluations/text_generation/stereotype_metrics_demo.ipynb

Large diffs are not rendered by default.

397 changes: 136 additions & 261 deletions examples/evaluations/text_generation/toxicity_metrics_demo.ipynb

Large diffs are not rendered by default.
