Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change source for Law School GPA dataset #510

Merged
merged 1 commit into from
Feb 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 10 additions & 22 deletions aif360/datasets/law_school_gpa_dataset.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,24 @@
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from aif360.datasets import RegressionDataset
try:
import tempeh.configurations as tc
except ImportError as error:
from logging import warning
warning("{}: LawSchoolGPADataset will be unavailable. To install, run:\n"
"pip install 'aif360[LawSchoolGPA]'".format(error))
from aif360.sklearn.datasets.lawschool_dataset import LSAC_URL

class LawSchoolGPADataset(RegressionDataset):
"""Law School GPA dataset.

See https://github.com/microsoft/tempeh for details.
"""
"""Law School GPA dataset."""

def __init__(self, dep_var_name='zfygpa',
protected_attribute_names=['race'],
privileged_classes=[['white']],
protected_attribute_names=['race', 'gender'],
privileged_classes=[['white'], ['male']],
instance_weights_name=None,
categorical_features=[],
na_values=[], custom_preprocessing=None,
metadata=None):
"""See :obj:`RegressionDataset` for a description of the arguments."""
dataset = tc.datasets["lawschool_gpa"]()
X_train,X_test = dataset.get_X(format=pd.DataFrame)
y_train, y_test = dataset.get_y(format=pd.Series)
A_train, A_test = dataset.get_sensitive_features(name='race',
format=pd.Series)
all_train = pd.concat([X_train, y_train, A_train], axis=1)
all_test = pd.concat([X_test, y_test, A_test], axis=1)

df = pd.concat([all_train, all_test], axis=0)
df = pd.read_sas(LSAC_URL, encoding="utf-8")
df.race = df.race1.where(df.race1.isin(['black', 'white']))
df.gender = df.gender.fillna("female")
df = df[["race", "gender", "lsat", "ugpa", "zfygpa"]].dropna()
df = pd.concat(train_test_split(df, test_size=0.33, random_state=123))

super(LawSchoolGPADataset, self).__init__(df=df,
dep_var_name=dep_var_name,
Expand Down
2 changes: 1 addition & 1 deletion aif360/sklearn/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,4 @@
from aif360.sklearn.datasets.openml_datasets import fetch_adult, fetch_german, fetch_bank
from aif360.sklearn.datasets.compas_dataset import fetch_compas
from aif360.sklearn.datasets.meps_datasets import fetch_meps
from aif360.sklearn.datasets.tempeh_datasets import fetch_lawschool_gpa
from aif360.sklearn.datasets.lawschool_dataset import fetch_lawschool_gpa
89 changes: 89 additions & 0 deletions aif360/sklearn/datasets/lawschool_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
from io import BytesIO
import os
import urllib

import pandas as pd
from sklearn.model_selection import train_test_split

from aif360.sklearn.datasets.utils import standardize_dataset, Dataset


# Default cache directory for downloaded raw data files: 'data/raw' one level
# above the directory that contains this module.
DATA_HOME_DEFAULT = os.path.join(os.path.dirname(os.path.abspath(__file__)),
'..', 'data', 'raw')
# Download location of the LSAC data file (SAS7BDAT format). The basename of
# this URL is also used as the cache file name.
# NOTE(review): 'fairregresion' appears to be the repository's actual
# spelling -- confirm before "correcting" it.
LSAC_URL = "https://github.com/jkomiyama/fairregresion/raw/master/dataset/law/lsac.sas7bdat"

def fetch_lawschool_gpa(subset="all", *, data_home=None, cache=True,
                        binary_race=True, fillna_gender="female",
                        usecols=["race", "gender", "lsat", "ugpa"],
                        dropcols=None, numeric_only=False, dropna=True):
    """Load the Law School GPA dataset.

    The raw SAS file is read from the local cache when available; otherwise
    it is downloaded from ``LSAC_URL`` (and written to the cache if `cache`
    is True).

    Optionally binarizes 'race' to 'white' (privileged) or 'black'
    (unprivileged). The other protected attribute is gender ('male' is
    privileged and 'female' is unprivileged). The outcome variable is
    standardized first year GPA ('zfygpa'). Note: this is a continuous
    variable, i.e., a regression task.

    The data is always split with ``train_test_split(test_size=0.33,
    random_state=123)``; ``subset='all'`` returns the training rows followed
    by the test rows (i.e., in shuffled order, not the file order).

    Args:
        subset ({'train', 'test', or 'all'}, optional): Select the dataset to
            load: 'train' for the training set, 'test' for the test set, 'all'
            for both.
        data_home (string, optional): Specify another download and cache
            folder for the datasets. By default all AIF360 datasets are
            stored in 'aif360/sklearn/data/raw' subfolders.
        cache (bool): Whether to read from and write to the local cache. If
            False, the file is downloaded on every call.
        binary_race (bool, optional): Keep only 'black' and 'white' as race
            categories; every other value becomes NA (and is therefore
            removed when `dropna` is True).
        fillna_gender (str or None, optional): Fill NA values for gender with
            this value. If `None`, leave as NA. Note: this is used for
            backward-compatibility with tempeh and may be dropped in later
            versions.
        usecols (single label or list-like, optional): Feature column(s) to
            keep. All others are dropped.
        dropcols (single label or list-like, optional): Feature column(s) to
            drop.
        numeric_only (bool): Drop all non-numeric feature columns.
        dropna (bool): Drop rows with NAs.

    Returns:
        namedtuple: ``Dataset(X, y)`` for the requested subset of the Law
        School GPA dataset, accessible by index or name.

    Raises:
        ValueError: If `subset` is not one of 'train', 'test' or 'all'.
    """
    # `import urllib` alone does not guarantee that the `request` submodule
    # is loaded, so import it explicitly where it is used.
    import urllib.request

    if subset not in {'train', 'test', 'all'}:
        raise ValueError("subset must be either 'train', 'test', or 'all'; "
                         "cannot be {}".format(subset))

    cache_path = os.path.join(data_home or DATA_HOME_DEFAULT,
                              os.path.basename(LSAC_URL))
    if cache and os.path.isfile(cache_path):
        df = pd.read_sas(cache_path, encoding="utf-8")
    else:
        # Close the HTTP response deterministically once the payload is read.
        with urllib.request.urlopen(LSAC_URL) as response:
            data = response.read()
        if cache:
            os.makedirs(os.path.dirname(cache_path), exist_ok=True)
            with open(cache_path, 'wb') as f:
                f.write(data)
        # The format cannot be inferred from an in-memory buffer, so it is
        # passed explicitly.
        df = pd.read_sas(BytesIO(data), format="sas7bdat", encoding="utf-8")

    # Item assignment (rather than attribute assignment) guarantees that a
    # column is written even if it does not exist yet.
    df['race'] = df['race1'].astype('category')
    if binary_race:
        # Values outside the listed categories become NA.
        df['race'] = df['race'].cat.set_categories(['black', 'white'],
                                                   ordered=True)

    # for backwards-compatibility with tempeh
    if fillna_gender is not None:
        df['gender'] = df['gender'].fillna(fillna_gender)
    df['gender'] = df['gender'].astype('category').cat.set_categories(
        ['female', 'male'], ordered=True)

    ds = standardize_dataset(df, prot_attr=['race', 'gender'], target='zfygpa',
                             usecols=usecols, dropcols=dropcols,
                             numeric_only=numeric_only, dropna=dropna)

    # for backwards-compatibility with tempeh
    train_X, test_X, train_y, test_y = train_test_split(
        *ds, test_size=0.33, random_state=123)
    if subset == "train":
        return Dataset(train_X, train_y)
    if subset == "test":
        return Dataset(test_X, test_y)
    # subset == "all": training rows first, then test rows.
    X = pd.concat([train_X, test_X], axis=0)
    y = pd.concat([train_y, test_y], axis=0)
    return Dataset(X, y)
59 changes: 0 additions & 59 deletions aif360/sklearn/datasets/tempeh_datasets.py

This file was deleted.

Loading
Loading