diff --git a/aif360/datasets/law_school_gpa_dataset.py b/aif360/datasets/law_school_gpa_dataset.py index eac3656c..0d5c992a 100644 --- a/aif360/datasets/law_school_gpa_dataset.py +++ b/aif360/datasets/law_school_gpa_dataset.py @@ -1,36 +1,24 @@ -import os import pandas as pd +from sklearn.model_selection import train_test_split from aif360.datasets import RegressionDataset -try: - import tempeh.configurations as tc -except ImportError as error: - from logging import warning - warning("{}: LawSchoolGPADataset will be unavailable. To install, run:\n" - "pip install 'aif360[LawSchoolGPA]'".format(error)) +from aif360.sklearn.datasets.lawschool_dataset import LSAC_URL class LawSchoolGPADataset(RegressionDataset): - """Law School GPA dataset. - - See https://github.com/microsoft/tempeh for details. - """ + """Law School GPA dataset.""" def __init__(self, dep_var_name='zfygpa', - protected_attribute_names=['race'], - privileged_classes=[['white']], + protected_attribute_names=['race', 'gender'], + privileged_classes=[['white'], ['male']], instance_weights_name=None, categorical_features=[], na_values=[], custom_preprocessing=None, metadata=None): """See :obj:`RegressionDataset` for a description of the arguments.""" - dataset = tc.datasets["lawschool_gpa"]() - X_train,X_test = dataset.get_X(format=pd.DataFrame) - y_train, y_test = dataset.get_y(format=pd.Series) - A_train, A_test = dataset.get_sensitive_features(name='race', - format=pd.Series) - all_train = pd.concat([X_train, y_train, A_train], axis=1) - all_test = pd.concat([X_test, y_test, A_test], axis=1) - - df = pd.concat([all_train, all_test], axis=0) + df = pd.read_sas(LSAC_URL, encoding="utf-8") + df.race = df.race1.where(df.race1.isin(['black', 'white'])) + df.gender = df.gender.fillna("female") + df = df[["race", "gender", "lsat", "ugpa", "zfygpa"]].dropna() + df = pd.concat(train_test_split(df, test_size=0.33, random_state=123)) super(LawSchoolGPADataset, self).__init__(df=df, dep_var_name=dep_var_name, diff 
--git a/aif360/sklearn/datasets/__init__.py b/aif360/sklearn/datasets/__init__.py
index 525b1431..8c0f1049 100644
--- a/aif360/sklearn/datasets/__init__.py
+++ b/aif360/sklearn/datasets/__init__.py
@@ -12,4 +12,4 @@
 from aif360.sklearn.datasets.openml_datasets import fetch_adult, fetch_german, fetch_bank
 from aif360.sklearn.datasets.compas_dataset import fetch_compas
 from aif360.sklearn.datasets.meps_datasets import fetch_meps
-from aif360.sklearn.datasets.tempeh_datasets import fetch_lawschool_gpa
+from aif360.sklearn.datasets.lawschool_dataset import fetch_lawschool_gpa
diff --git a/aif360/sklearn/datasets/lawschool_dataset.py b/aif360/sklearn/datasets/lawschool_dataset.py
new file mode 100644
index 00000000..66a11bfa
--- /dev/null
+++ b/aif360/sklearn/datasets/lawschool_dataset.py
@@ -0,0 +1,102 @@
+from io import BytesIO
+import os
+import urllib.request
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
+
+from aif360.sklearn.datasets.utils import standardize_dataset, Dataset
+
+
+# cache location
+DATA_HOME_DEFAULT = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                                 '..', 'data', 'raw')
+LSAC_URL = "https://github.com/jkomiyama/fairregresion/raw/master/dataset/law/lsac.sas7bdat"
+
+
+def fetch_lawschool_gpa(subset="all", *, data_home=None, cache=True,
+                        binary_race=True, fillna_gender="female",
+                        usecols=["race", "gender", "lsat", "ugpa"],
+                        dropcols=None, numeric_only=False, dropna=True):
+    """Load the Law School GPA dataset.
+
+    Optionally binarizes 'race' to 'white' (privileged) or 'black' (unprivileged).
+    The other protected attribute is gender ('male' is privileged and 'female'
+    is unprivileged). The outcome variable is standardized first year GPA
+    ('zfygpa'). Note: this is a continuous variable, i.e., a regression task.
+
+    Args:
+        subset ({'train', 'test', or 'all'}, optional): Select the dataset to
+            load: 'train' for the training set, 'test' for the test set, 'all'
+            for both.
+        data_home (string, optional): Specify another download and cache folder
+            for the datasets. By default all AIF360 datasets are stored in
+            'aif360/sklearn/data/raw' subfolders.
+        cache (bool): Whether to cache downloaded datasets.
+        binary_race (bool, optional): Keep only 'black' and 'white' as race
+            categories. Any other value becomes NA and is therefore removed
+            when `dropna` is True.
+        fillna_gender (str or None, optional): Fill NA values for gender with
+            this value. If `None`, leave as NA. Note: this is used for backward-
+            compatibility with tempeh and may be dropped in later versions.
+        usecols (single label or list-like, optional): Feature column(s) to
+            keep. All others are dropped.
+        dropcols (single label or list-like, optional): Feature column(s) to
+            drop.
+        numeric_only (bool): Drop all non-numeric feature columns.
+        dropna (bool): Drop rows with NAs.
+
+    Returns:
+        namedtuple: Tuple containing X and y for the Law School GPA dataset
+            accessible by index or name.
+
+    Raises:
+        ValueError: If `subset` is not one of 'train', 'test', or 'all'.
+    """
+    if subset not in {'train', 'test', 'all'}:
+        raise ValueError("subset must be either 'train', 'test', or 'all'; "
+                         "cannot be {}".format(subset))
+
+    cache_path = os.path.join(data_home or DATA_HOME_DEFAULT,
+                              os.path.basename(LSAC_URL))
+    if cache and os.path.isfile(cache_path):
+        df = pd.read_sas(cache_path, encoding="utf-8")
+    else:
+        # close the HTTP response as soon as the payload has been read
+        with urllib.request.urlopen(LSAC_URL) as response:
+            data = response.read()
+        if cache:
+            os.makedirs(os.path.dirname(cache_path), exist_ok=True)
+            with open(cache_path, 'wb') as f:
+                f.write(data)
+        # no file extension to infer from when reading from memory
+        df = pd.read_sas(BytesIO(data), format="sas7bdat", encoding="utf-8")
+
+    # item assignment works whether or not the column already exists
+    df['race'] = df['race1'].astype('category')
+    if binary_race:
+        # values outside the listed categories become NA
+        df['race'] = df['race'].cat.set_categories(
+            ['black', 'white'], ordered=True)
+
+    # for backwards-compatibility with tempeh
+    if fillna_gender is not None:
+        df['gender'] = df['gender'].fillna(fillna_gender)
+    df['gender'] = df['gender'].astype('category').cat.set_categories(
+        ['female', 'male'], ordered=True)
+
+    ds = standardize_dataset(df, prot_attr=['race', 'gender'], target='zfygpa',
+                             usecols=usecols, dropcols=dropcols,
+                             numeric_only=numeric_only, dropna=dropna)
+
+    # for backwards-compatibility with tempeh
+    train_X, test_X, train_y, test_y = train_test_split(
+        *ds, test_size=0.33, random_state=123)
+    if subset == "train":
+        return Dataset(train_X, train_y)
+    elif subset == "test":
+        return Dataset(test_X, test_y)
+    else:
+        X = pd.concat([train_X, test_X], axis=0)
+        y = pd.concat([train_y, test_y], axis=0)
+        return Dataset(X, y)
diff --git a/aif360/sklearn/datasets/tempeh_datasets.py b/aif360/sklearn/datasets/tempeh_datasets.py
deleted file mode 100644
index 7adae048..00000000
--- a/aif360/sklearn/datasets/tempeh_datasets.py
+++ /dev/null
@@ -1,59 +0,0 @@
-import pandas as pd
-try:
-    import tempeh.configurations as tc
-except ImportError as error:
-    from logging import warning
-    warning("{}: fetch_lawschool_gpa will be unavailable. To install, run:\n"
-            "pip install 'aif360[LawSchoolGPA]'".format(error))
-
-from aif360.sklearn.datasets.utils import standardize_dataset
-
-
-def fetch_lawschool_gpa(subset="all", *, usecols=None, dropcols=None,
-                        numeric_only=False, dropna=True):
-    """Load the Law School GPA dataset
-
-    Note:
-        By default, the data is downloaded from tempeh. See
-        https://github.com/microsoft/tempeh for details.
-
-    Args:
-        subset ({'train', 'test', or 'all'}, optional): Select the dataset to
-            load: 'train' for the training set, 'test' for the test set, 'all'
-            for both.
-        usecols (single label or list-like, optional): Feature column(s) to
-            keep. All others are dropped.
-        dropcols (single label or list-like, optional): Feature column(s) to
-            drop.
-        numeric_only (bool): Drop all non-numeric feature columns.
-        dropna (bool): Drop rows with NAs. FIXME: NAs already dropped by tempeh
-
-    Returns:
-        namedtuple: Tuple containing X, y, and sample_weights for the Law School
-            GPA dataset accessible by index or name.
- """ - if subset not in {'train', 'test', 'all'}: - raise ValueError("subset must be either 'train', 'test', or 'all'; " - "cannot be {}".format(subset)) - - dataset = tc.datasets["lawschool_gpa"]() - X_train, X_test = dataset.get_X(format=pd.DataFrame) - y_train, y_test = dataset.get_y(format=pd.Series) - A_train, A_test = dataset.get_sensitive_features(name='race', - format=pd.Series) - all_train = pd.concat([X_train, y_train, A_train], axis=1) - all_test = pd.concat([X_test, y_test, A_test], axis=1) - - if subset == "train": - df = all_train - elif subset == "test": - df = all_test - else: - df = pd.concat([all_train, all_test], axis=0) - - df.race = df.race.astype('category').cat.set_categories( - ['black', 'white'], ordered=True) - - return standardize_dataset(df, prot_attr='race', target='zfygpa', - usecols=usecols, dropcols=dropcols, - numeric_only=numeric_only, dropna=dropna) diff --git a/examples/sklearn/demo_grid_search_reduction_regression_sklearn.ipynb b/examples/sklearn/demo_grid_search_reduction_regression_sklearn.ipynb index d9fec458..ea47f6d5 100644 --- a/examples/sklearn/demo_grid_search_reduction_regression_sklearn.ipynb +++ b/examples/sklearn/demo_grid_search_reduction_regression_sklearn.ipynb @@ -2,12 +2,12 @@ "cells": [ { "cell_type": "markdown", - "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://github.com/Trusted-AI/AIF360/blob/master/examples/sklearn/demo_grid_search_reduction_regression_sklearn.ipynb)" - ], "metadata": { "id": "fgfyb_2c9WL4" - } + }, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://github.com/Trusted-AI/AIF360/blob/master/examples/sklearn/demo_grid_search_reduction_regression_sklearn.ipynb)" + ] }, { "cell_type": "markdown", @@ -23,73 +23,24 @@ }, { "cell_type": "code", - "source": [ - "#Install aif360\n", - "#Install Reductions from Fairlearn\n", - "#Install Low School GPA dataset\n", - "!pip install 
aif360[Reductions,LawSchoolGPA]" - ], + "execution_count": null, "metadata": { - "id": "4Ad7nC129i8f", - "outputId": "0ce37c29-d5a5-4682-968e-bb33cea9de19", "colab": { "base_uri": "https://localhost:8080/" - } + }, + "id": "4Ad7nC129i8f", + "outputId": "0ce37c29-d5a5-4682-968e-bb33cea9de19" }, - "execution_count": 10, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", - "Requirement already satisfied: aif360[LawSchoolGPA,Reductions] in /usr/local/lib/python3.7/dist-packages (0.5.0)\n", - "Requirement already satisfied: scipy>=1.2.0 in /usr/local/lib/python3.7/dist-packages (from aif360[LawSchoolGPA,Reductions]) (1.7.3)\n", - "Requirement already satisfied: scikit-learn>=1.0 in /usr/local/lib/python3.7/dist-packages (from aif360[LawSchoolGPA,Reductions]) (1.0.2)\n", - "Requirement already satisfied: numpy>=1.16 in /usr/local/lib/python3.7/dist-packages (from aif360[LawSchoolGPA,Reductions]) (1.21.6)\n", - "Requirement already satisfied: pandas>=0.24.0 in /usr/local/lib/python3.7/dist-packages (from aif360[LawSchoolGPA,Reductions]) (1.3.5)\n", - "Requirement already satisfied: matplotlib in /usr/local/lib/python3.7/dist-packages (from aif360[LawSchoolGPA,Reductions]) (3.2.2)\n", - "Requirement already satisfied: fairlearn~=0.7 in /usr/local/lib/python3.7/dist-packages (from aif360[LawSchoolGPA,Reductions]) (0.7.0)\n", - "Requirement already satisfied: tempeh in /usr/local/lib/python3.7/dist-packages (from aif360[LawSchoolGPA,Reductions]) (0.1.12)\n", - "Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.24.0->aif360[LawSchoolGPA,Reductions]) (2022.2.1)\n", - "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.24.0->aif360[LawSchoolGPA,Reductions]) (2.8.2)\n", - "Requirement already satisfied: six>=1.5 in 
/usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas>=0.24.0->aif360[LawSchoolGPA,Reductions]) (1.15.0)\n", - "Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from scikit-learn>=1.0->aif360[LawSchoolGPA,Reductions]) (1.1.0)\n", - "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn>=1.0->aif360[LawSchoolGPA,Reductions]) (3.1.0)\n", - "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->aif360[LawSchoolGPA,Reductions]) (1.4.4)\n", - "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->aif360[LawSchoolGPA,Reductions]) (3.0.9)\n", - "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/dist-packages (from matplotlib->aif360[LawSchoolGPA,Reductions]) (0.11.0)\n", - "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from kiwisolver>=1.0.1->matplotlib->aif360[LawSchoolGPA,Reductions]) (4.1.1)\n", - "Requirement already satisfied: pytest in /usr/local/lib/python3.7/dist-packages (from tempeh->aif360[LawSchoolGPA,Reductions]) (3.6.4)\n", - "Requirement already satisfied: memory-profiler in /usr/local/lib/python3.7/dist-packages (from tempeh->aif360[LawSchoolGPA,Reductions]) (0.60.0)\n", - "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from tempeh->aif360[LawSchoolGPA,Reductions]) (2.23.0)\n", - "Requirement already satisfied: shap in /usr/local/lib/python3.7/dist-packages (from tempeh->aif360[LawSchoolGPA,Reductions]) (0.41.0)\n", - "Requirement already satisfied: psutil in /usr/local/lib/python3.7/dist-packages (from memory-profiler->tempeh->aif360[LawSchoolGPA,Reductions]) (5.4.8)\n", - "Requirement already satisfied: py>=1.5.0 in /usr/local/lib/python3.7/dist-packages (from 
pytest->tempeh->aif360[LawSchoolGPA,Reductions]) (1.11.0)\n", - "Requirement already satisfied: more-itertools>=4.0.0 in /usr/local/lib/python3.7/dist-packages (from pytest->tempeh->aif360[LawSchoolGPA,Reductions]) (8.14.0)\n", - "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from pytest->tempeh->aif360[LawSchoolGPA,Reductions]) (57.4.0)\n", - "Requirement already satisfied: attrs>=17.4.0 in /usr/local/lib/python3.7/dist-packages (from pytest->tempeh->aif360[LawSchoolGPA,Reductions]) (22.1.0)\n", - "Requirement already satisfied: pluggy<0.8,>=0.5 in /usr/local/lib/python3.7/dist-packages (from pytest->tempeh->aif360[LawSchoolGPA,Reductions]) (0.7.1)\n", - "Requirement already satisfied: atomicwrites>=1.0 in /usr/local/lib/python3.7/dist-packages (from pytest->tempeh->aif360[LawSchoolGPA,Reductions]) (1.4.1)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->tempeh->aif360[LawSchoolGPA,Reductions]) (1.24.3)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->tempeh->aif360[LawSchoolGPA,Reductions]) (3.0.4)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->tempeh->aif360[LawSchoolGPA,Reductions]) (2022.6.15)\n", - "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->tempeh->aif360[LawSchoolGPA,Reductions]) (2.10)\n", - "Requirement already satisfied: numba in /usr/local/lib/python3.7/dist-packages (from shap->tempeh->aif360[LawSchoolGPA,Reductions]) (0.56.2)\n", - "Requirement already satisfied: cloudpickle in /usr/local/lib/python3.7/dist-packages (from shap->tempeh->aif360[LawSchoolGPA,Reductions]) (1.5.0)\n", - "Requirement already satisfied: slicer==0.0.7 in /usr/local/lib/python3.7/dist-packages (from shap->tempeh->aif360[LawSchoolGPA,Reductions]) (0.0.7)\n", - 
"Requirement already satisfied: tqdm>4.25.0 in /usr/local/lib/python3.7/dist-packages (from shap->tempeh->aif360[LawSchoolGPA,Reductions]) (4.64.1)\n", - "Requirement already satisfied: packaging>20.9 in /usr/local/lib/python3.7/dist-packages (from shap->tempeh->aif360[LawSchoolGPA,Reductions]) (21.3)\n", - "Requirement already satisfied: llvmlite<0.40,>=0.39.0dev0 in /usr/local/lib/python3.7/dist-packages (from numba->shap->tempeh->aif360[LawSchoolGPA,Reductions]) (0.39.1)\n", - "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from numba->shap->tempeh->aif360[LawSchoolGPA,Reductions]) (4.12.0)\n", - "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->numba->shap->tempeh->aif360[LawSchoolGPA,Reductions]) (3.8.1)\n" - ] - } + "outputs": [], + "source": [ + "#Install aif360\n", + "#Install Reductions from Fairlearn\n", + "!pip install aif360[Reductions]" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "id": "sivw3vma71DE" }, @@ -130,33 +81,20 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": { - "id": "7Xqg6Yn771DG", - "outputId": "00fda049-0c6a-4c86-9ee8-f49b9e2d5161", "colab": { "base_uri": "https://localhost:8080/", "height": 238 - } + }, + "id": "7Xqg6Yn771DG", + "outputId": "00fda049-0c6a-4c86-9ee8-f49b9e2d5161" }, "outputs": [ { - "output_type": "execute_result", "data": { - "text/plain": [ - " lsat ugpa race\n", - "race \n", - "0 38.0 3.3 0\n", - "1 34.0 4.0 1\n", - "1 34.0 3.9 1\n", - "1 45.0 3.3 1\n", - "1 39.0 2.5 1" - ], "text/html": [ - "\n", - "