change source for lawschool gpa dataset

* remove tempeh requirement

Signed-off-by: Samuel Hoffman <hoffman.sc@gmail.com>
Trusted-AI · Feb 20, 2024 · 6acbe7d · 6acbe7d
1 parent ab7e52a
commit 6acbe7d
Show file tree

Hide file tree

Showing 8 changed files with 215 additions and 398 deletions.
diff --git a/aif360/datasets/law_school_gpa_dataset.py b/aif360/datasets/law_school_gpa_dataset.py
@@ -1,36 +1,24 @@
-import os
 import pandas as pd
+from sklearn.model_selection import train_test_split
 from aif360.datasets import RegressionDataset
-try:
-    import tempeh.configurations as tc
-except ImportError as error:
-    from logging import warning
-    warning("{}: LawSchoolGPADataset will be unavailable. To install, run:\n"
-            "pip install 'aif360[LawSchoolGPA]'".format(error))
+from aif360.sklearn.datasets.lawschool_dataset import LSAC_URL
 
 class LawSchoolGPADataset(RegressionDataset):
-    """Law School GPA dataset.
-
-    See https://github.com/microsoft/tempeh for details.
-    """
+    """Law School GPA dataset."""
 
     def __init__(self, dep_var_name='zfygpa',
-                 protected_attribute_names=['race'],
-                 privileged_classes=[['white']],
+                 protected_attribute_names=['race', 'gender'],
+                 privileged_classes=[['white'], ['male']],
                  instance_weights_name=None,
                  categorical_features=[],
                  na_values=[], custom_preprocessing=None,
                  metadata=None):
         """See :obj:`RegressionDataset` for a description of the arguments."""
-        dataset = tc.datasets["lawschool_gpa"]()
-        X_train,X_test = dataset.get_X(format=pd.DataFrame)
-        y_train, y_test = dataset.get_y(format=pd.Series)
-        A_train, A_test = dataset.get_sensitive_features(name='race',
-                                                         format=pd.Series)
-        all_train = pd.concat([X_train, y_train, A_train], axis=1)
-        all_test = pd.concat([X_test, y_test, A_test], axis=1)
-
-        df = pd.concat([all_train, all_test], axis=0)
+        df = pd.read_sas(LSAC_URL, encoding="utf-8")
+        df.race = df.race1.where(df.race1.isin(['black', 'white']))
+        df.gender = df.gender.fillna("female")
+        df = df[["race", "gender", "lsat", "ugpa", "zfygpa"]].dropna()
+        df = pd.concat(train_test_split(df, test_size=0.33, random_state=123))
 
         super(LawSchoolGPADataset, self).__init__(df=df,
             dep_var_name=dep_var_name,

diff --git a/aif360/sklearn/datasets/__init__.py b/aif360/sklearn/datasets/__init__.py
@@ -12,4 +12,4 @@
 from aif360.sklearn.datasets.openml_datasets import fetch_adult, fetch_german, fetch_bank
 from aif360.sklearn.datasets.compas_dataset import fetch_compas
 from aif360.sklearn.datasets.meps_datasets import fetch_meps
-from aif360.sklearn.datasets.tempeh_datasets import fetch_lawschool_gpa
+from aif360.sklearn.datasets.lawschool_dataset import fetch_lawschool_gpa
diff --git a/aif360/sklearn/datasets/lawschool_dataset.py b/aif360/sklearn/datasets/lawschool_dataset.py
@@ -0,0 +1,89 @@
+from io import BytesIO
+import os
+import urllib
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
+
+from aif360.sklearn.datasets.utils import standardize_dataset, Dataset
+
+
+# Default cache location: the 'data/raw' folder one level above this module's
+# directory. Used by fetch_lawschool_gpa when no `data_home` is given.
+DATA_HOME_DEFAULT = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                                 '..', 'data', 'raw')
+# Remote location of the LSAC data file (SAS 'sas7bdat' format).
+# NOTE(review): the 'fairregresion' spelling is part of the URL as written --
+# verify against the upstream repository before "correcting" it.
+LSAC_URL = "https://github.com/jkomiyama/fairregresion/raw/master/dataset/law/lsac.sas7bdat"
+
+def fetch_lawschool_gpa(subset="all", *, data_home=None, cache=True,
+                        binary_race=True, fillna_gender="female",
+                        usecols=["race", "gender", "lsat", "ugpa"],
+                        dropcols=None, numeric_only=False, dropna=True):
+    """Load the Law School GPA dataset.
+
+    Optionally binarizes 'race' to 'white' (privileged) or 'black' (unprivileged).
+    The other protected attribute is gender ('male' is privileged and 'female'
+    is unprivileged). The outcome variable is standardized first year GPA
+    ('zfygpa'). Note: this is a continuous variable, i.e., a regression task.
+
+    The raw file is read from the local cache when available; otherwise it is
+    downloaded from ``LSAC_URL`` (and written to the cache if `cache` is true).
+
+    Args:
+        subset ({'train', 'test', 'all'}, optional): Select the dataset to
+            load: 'train' for the training set, 'test' for the test set, 'all'
+            for both (training rows followed by test rows).
+        data_home (string, optional): Specify another download and cache folder
+            for the datasets. By default all AIF360 datasets are stored in
+            'aif360/sklearn/data/raw' subfolders.
+        cache (bool): Whether to cache downloaded datasets.
+        binary_race (bool, optional): Restrict the 'race' categories to 'black'
+            and 'white'; any other value becomes NA (and is presumably removed
+            downstream when `dropna` is true).
+        fillna_gender (str or None, optional): Fill NA values for gender with
+            this value. If `None`, leave as NA. Note: this is used for backward-
+            compatibility with tempeh and may be dropped in later versions.
+        usecols (single label or list-like, optional): Feature column(s) to
+            keep. All others are dropped.
+        dropcols (single label or list-like, optional): Feature column(s) to
+            drop.
+        numeric_only (bool): Drop all non-numeric feature columns.
+        dropna (bool): Drop rows with NAs.
+
+    Returns:
+        namedtuple: Tuple containing X, y, and sample_weights for the Law School
+        GPA dataset accessible by index or name.
+
+    Raises:
+        ValueError: If `subset` is not one of 'train', 'test', or 'all'.
+    """
+    if subset not in {'train', 'test', 'all'}:
+        raise ValueError("subset must be either 'train', 'test', or 'all'; "
+                         "cannot be {}".format(subset))
+
+    cache_path = os.path.join(data_home or DATA_HOME_DEFAULT,
+                              os.path.basename(LSAC_URL))
+    if cache and os.path.isfile(cache_path):
+        df = pd.read_sas(cache_path, encoding="utf-8")
+    else:
+        # Import the submodule explicitly: a bare ``import urllib`` does not
+        # guarantee that ``urllib.request`` has been loaded.
+        from urllib.request import urlopen
+
+        # Close the HTTP response deterministically once the payload is read.
+        with urlopen(LSAC_URL) as response:
+            data = response.read()
+        if cache:
+            os.makedirs(os.path.dirname(cache_path), exist_ok=True)
+            with open(cache_path, 'wb') as f:
+                f.write(data)
+        df = pd.read_sas(BytesIO(data), format="sas7bdat", encoding="utf-8")
+
+    # Item assignment always writes a DataFrame column; attribute assignment
+    # (``df.race = ...``) only does so when the column already exists and
+    # otherwise silently sets a plain Python attribute.
+    df["race"] = df["race1"].astype('category')
+    if binary_race:
+        df["race"] = df["race"].cat.set_categories(['black', 'white'],
+                                                   ordered=True)
+
+    # for backwards-compatibility with tempeh
+    if fillna_gender is not None:
+        df["gender"] = df["gender"].fillna(fillna_gender)
+    df["gender"] = df["gender"].astype('category').cat.set_categories(
+        ['female', 'male'], ordered=True)
+
+    ds = standardize_dataset(df, prot_attr=['race', 'gender'], target='zfygpa',
+                             usecols=usecols, dropcols=dropcols,
+                             numeric_only=numeric_only, dropna=dropna)
+
+    # for backwards-compatibility with tempeh: fixed split size and seed
+    train_X, test_X, train_y, test_y = train_test_split(
+        *ds, test_size=0.33, random_state=123)
+    if subset == "train":
+        return Dataset(train_X, train_y)
+    elif subset == "test":
+        return Dataset(test_X, test_y)
+    else:
+        X = pd.concat([train_X, test_X], axis=0)
+        y = pd.concat([train_y, test_y], axis=0)
+        return Dataset(X, y)
diff --git a/aif360/sklearn/datasets/tempeh_datasets.py b/aif360/sklearn/datasets/tempeh_datasets.py