diff --git a/.circleci/config.yml b/.circleci/config.yml
deleted file mode 100644
index b13344f..0000000
--- a/.circleci/config.yml
+++ /dev/null
@@ -1,16 +0,0 @@
-version: '2.1'
-orbs:
- python: circleci/python@2.0.3
-
-workflows:
- main:
- jobs:
- - python/test:
- pkg-manager: pip
- test-tool: pytest
- test-tool-args: tests
- version: '3.7'
- setup:
- - python/install-packages:
- pip-dependency-file: requirements-dev.txt
- pkg-manager: pip
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
new file mode 100644
index 0000000..62c0d7e
--- /dev/null
+++ b/.github/workflows/ci.yaml
@@ -0,0 +1,37 @@
+name: Tests
+on:
+ push:
+ branches:
+ - "master"
+ - "develop"
+ pull_request:
+ branches:
+ - "master"
+ - "develop"
+jobs:
+ pytest:
+ runs-on: ubuntu-latest
+
+ steps:
+ - uses: actions/checkout@v3
+ - name: Set up Python
+ uses: actions/setup-python@v4
+ with:
+ python-version: "3.7"
+ cache: "pip"
+
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install flake8 pytest
+ if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+ if [ -f requirements-dev.txt ]; then pip install -r requirements-dev.txt; fi
+ - name: Lint with flake8
+ run: |
+ # stop the build if there are Python syntax errors or undefined names
+ flake8 wefe --count --select=E9,F63,F7,F82 --show-source --statistics
+ # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+ flake8 wefe --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+ - name: Test with pytest
+ run: |
+ pytest tests
diff --git a/.gitignore b/.gitignore
index 38703dd..40a15ab 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,11 +7,12 @@ __pycache__/
*.so
# scikit-learn specific
-doc/_build/
-doc/auto_examples/
-doc/modules/generated/
-doc/datasets/generated/
-doc/api/generated/
+docs/_build/
+docs/_build/*
+docs/auto_examples/
+docs/modules/generated/
+docs/datasets/generated/
+docs/api/generated/
# Distribution / packaging
@@ -62,8 +63,9 @@ coverage.xml
*.log
# Sphinx documentation
-doc/_build/
-doc/generated/
+docs/_build/
+docs/generated/
+docs/results/
# PyBuilder
target/
@@ -74,19 +76,21 @@ target/
# jupyter
.ipynb_checkpoints/
-.results/*
-.results
+# notebook execution results
+results/*
+results
+docs/user_guide/gender_debiased_glove.kv
+# mypy cache
.mypy_cache
-./doc/results/
-
-develop.ipynb
-
+# conda deploy
conda-deploy/
conda_deploy/
*.csv
*.xls
-doc/user_guide/gender_debiased_glove.kv
\ No newline at end of file
+# coverage files
+cov.xml
+test-results/junit.xml
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
new file mode 100644
index 0000000..ade5e71
--- /dev/null
+++ b/.readthedocs.yaml
@@ -0,0 +1,16 @@
+version: 2
+
+formats:
+ - epub
+ - pdf
+
+sphinx:
+ configuration: docs/conf.py
+
+python:
+ version: "3.7"
+ install:
+ - requirements: requirements.txt
+ - requirements: requirements-dev.txt
+ - method: pip
+ path: .
diff --git a/.readthedocs.yml b/.readthedocs.yml
deleted file mode 100644
index 8fbae9f..0000000
--- a/.readthedocs.yml
+++ /dev/null
@@ -1,9 +0,0 @@
-formats:
- - epub
- - pdf
-requirements_file: requirements.txt
-python:
- version: 3.7
- install:
- - requirements: requirements.txt
- - requirements: requirements-dev.txt
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
index 6f22c3d..f4b0791 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,27 +1,21 @@
-Copyright (c) 2016, Vighnesh Birodkar and scikit-learn-contrib contributors
-All rights reserved.
+MIT License
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
+Copyright (c) 2022 WEFE Team
-* Redistributions of source code must retain the above copyright notice, this
- list of conditions and the following disclaimer.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
-* Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
-* Neither the name of project-template nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.rst b/README.rst
index 2fc3dfb..600296b 100644
--- a/README.rst
+++ b/README.rst
@@ -1,28 +1,24 @@
.. -*- mode: rst -*-
-|ReadTheDocs|_ |CircleCI|_ |Conda|_ |CondaLatestRelease|_ |CondaVersion|_
+|License|_ |GithubActions|_ |ReadTheDocs|_ |Downloads|_ |Pypy|_ |CondaVersion|_
+.. |License| image:: https://img.shields.io/github/license/dccuchile/wefe
+.. _License: https://github.com/dccuchile/wefe/blob/master/LICENSE
.. |ReadTheDocs| image:: https://readthedocs.org/projects/wefe/badge/?version=latest
.. _ReadTheDocs: https://wefe.readthedocs.io/en/latest/?badge=latest
+.. |GithubActions| image:: https://github.com/dccuchile/wefe/actions/workflows/ci.yaml/badge.svg?branch=master
+.. _GithubActions: https://github.com/dccuchile/wefe/actions
-.. |CircleCI| image:: https://circleci.com/gh/dccuchile/wefe.svg?style=shield
-.. _CircleCI: https://circleci.com/gh/dccuchile/wefe.svg?style=shield
-
-
-.. |Conda| image:: https://anaconda.org/pbadilla/wefe/badges/installer/conda.svg
-.. _Conda: https://anaconda.org/pbadilla/wefe/badges/installer/conda.svg
-
-
-.. |CondaLatestRelease| image:: https://anaconda.org/pbadilla/wefe/badges/latest_release_date.svg
-.. _CondaLatestRelease: https://anaconda.org/pbadilla/wefe/badges/latest_release_date.svg
+.. |Downloads| image:: https://pepy.tech/badge/wefe
+.. _Downloads: https://pepy.tech/project/wefe
+.. |Pypy| image:: https://badge.fury.io/py/wefe.svg
+.. _Pypy: https://pypi.org/project/wefe/
.. |CondaVersion| image:: https://anaconda.org/pbadilla/wefe/badges/version.svg
-.. _CondaVersion: https://anaconda.org/pbadilla/wefe/badges/version.svg
-
-
+.. _CondaVersion: https://anaconda.org/pbadilla/wefe
WEFE: The Word Embedding Fairness Evaluation Framework
@@ -133,21 +129,34 @@ To compile the documentation, run:
Changelog
=========
-NEW DEVELOP VERSION
+Version 0.4.0
-------------------
+- 3 new bias mitigation methods (debias) implemented: Double Hard Debias, Half
+ Sibling Regression and Repulsion Attraction Neutralization.
+- The library documentation has been restructured.
+ Now, the documentation is divided into user guide and theoretical framework.
+ The user guide does not contain theoretical information.
+ Instead, theoretical documentation can be found in the conceptual guides.
+- Improved API documentation and examples. Added multilingual examples contributed
+ by the community.
+- The user guides are fully executable because they are now on notebooks.
+- There was also an important improvement in the API documentation and in metrics and
+ debias examples.
+- Improved library testing mechanisms for metrics and debias methods.
- Fixed wrong repr of query. Now the sets are in the correct order.
-- Greatly improved library testing mechanisms.
-- Improved project documentation. Now, the documentation is divided into user guide and
- theoretical framework. In addition, the user guides are fully executable because they
- are now on notebooks.
+- Implemented repr for WordEmbeddingModel.
+- Testing CI moved from CircleCI to GitHub Actions.
+- License changed to MIT.
Version 0.3.2
-------------
-- Fixed RNSB bug where the classification labels were interchanged and could produce erroneous results when the attributes are of different sizes.
+- Fixed RNSB bug where the classification labels were interchanged and could produce
+ erroneous results when the attributes are of different sizes.
- Fixed RNSB replication notebook
- Update of WEFE case study scores.
- Improved documentation examples for WEAT, RNSB, RIPA.
-- Holdout parameter added to RNSB, which allows to indicate whether or not a holdout is performed when training the classifier.
+- Holdout parameter added to RNSB, which allows indicating whether or not a holdout
+ is performed when training the classifier.
- Improved printing of the RNSB evaluation.
Version 0.3.1
@@ -155,16 +164,22 @@ Version 0.3.1
- Update WEFE original case study
- Hotfix: Several bug fixes for execute WEFE original Case Study.
- fetch_eds top_n_race_occupations argument set to 10.
-- Preprocessing: get_embeddings_from_set now returns a list with the lost preprocessed words instead of the original ones.
+- Preprocessing: get_embeddings_from_set now returns a list with the lost
+ preprocessed words instead of the original ones.
Version 0.3.0
-------------
- Implemented Bolukbasi et al. 2016 Hard Debias.
- Implemented Thomas Manzini et al. 2019 Multiclass Hard Debias.
- Implemented a fetch function to retrieve gn-glove female-male word sets.
-- Moved the transformation logic of words, sets and queries to embeddings to its own module: preprocessing
-- Enhanced the preprocessor_args and secondary_preprocessor_args metric preprocessing parameters to an list of preprocessors `preprocessors` together with the parameter `strategy` indicating whether to consider all the transformed words (`'all'`) or only the first one encountered (`'first'`).
-- Renamed WordEmbeddingModel attributes ```model``` and ```model_name``` to ```wv``` and ```name``` respectively.
+- Moved the transformation logic of words, sets and queries to embeddings to its own
+ module: preprocessing
+- Enhanced the preprocessor_args and secondary_preprocessor_args metric
+ preprocessing parameters to a list of preprocessors `preprocessors` together with
+ the parameter `strategy` indicating whether to consider all the transformed words
+ (`'all'`) or only the first one encountered (`'first'`).
+- Renamed WordEmbeddingModel attributes ```model``` and ```model_name``` to
+ ```wv``` and ```name``` respectively.
- Renamed every run_query ```word_embedding``` argument to ```model``` in every metric.
@@ -179,21 +194,30 @@ Version 0.2.1
- Compatibility fixes.
-
Version 0.2.0
--------------
-- Renamed optional ```run_query``` parameter ```warn_filtered_words``` to `warn_not_found_words`.
-- Added ```word_preprocessor_args``` parameter to ```run_query``` that allow specifying transformations prior to searching for words in word embeddings.
-- Added ```secondary_preprocessor_args``` parameter to ```run_query``` which allows specifying a second pre-processor transformation to words before searching them in word embeddings. It is not necessary to specify the first preprocessor to use this one.
-- Implemented ```__getitem__``` function in ```WordEmbeddingModel```. This method allows obtaining an embedding from a word from the model stored in the instance using indexers.
+- Renamed optional ```run_query``` parameter ```warn_filtered_words``` to
+ `warn_not_found_words`.
+- Added ```word_preprocessor_args``` parameter to ```run_query``` that allows specifying
+ transformations prior to searching for words in word embeddings.
+- Added ```secondary_preprocessor_args``` parameter to ```run_query``` which allows
+ specifying a second pre-processor transformation to words before searching them in
+ word embeddings. It is not necessary to specify the first preprocessor to use this
+ one.
+- Implemented ```__getitem__``` function in ```WordEmbeddingModel```. This method
+ allows obtaining an embedding from a word from the model stored in the instance
+ using indexers.
- Removed underscore from class and instance variable names.
-- Improved type and verification exception messages when creating objects and executing methods.
-- Fix an error that appeared when calculating rankings with two columns of aggregations with the same name.
+- Improved type and verification exception messages when creating objects and executing
+ methods.
+- Fix an error that appeared when calculating rankings with two columns of aggregations
+ with the same name.
- Ranking correlations are now calculated using pandas ```corr``` method.
- Changed metric template, name and short_names to class variables.
- Implemented ```random_state``` in RNSB to allow replication of the experiments.
-- run_query now returns as a result the default metric requested in the parameters and all calculated values that may be useful in the other variables of the dictionary.
+- run_query now returns as a result the default metric requested in the parameters
+ and all calculated values that may be useful in the other variables of the dictionary.
- Fixed problem with api documentation: now it shows methods of the classes.
- Implemented p-value for WEAT
diff --git a/cov.xml b/cov.xml
deleted file mode 100644
index b4141df..0000000
--- a/cov.xml
+++ /dev/null
@@ -1,1601 +0,0 @@
-
-
-
-
-
- /home/pablo/Proyectos/WEFE/wefe/wefe
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/doc/_static/css/project-template.css b/doc/_static/css/project-template.css
deleted file mode 100644
index 29392fc..0000000
--- a/doc/_static/css/project-template.css
+++ /dev/null
@@ -1,16 +0,0 @@
-@import url("theme.css");
-
-.highlight a {
- text-decoration: underline;
-}
-
-.deprecated p {
- padding: 10px 7px 10px 10px;
- color: #b94a48;
- background-color: #F3E5E5;
- border: 1px solid #eed3d7;
-}
-
-.deprecated p span.versionmodified {
- font-weight: bold;
-}
\ No newline at end of file
diff --git a/doc/user_guide/measurement.ipynb b/doc/user_guide/measurement.ipynb
deleted file mode 100644
index 1fc012c..0000000
--- a/doc/user_guide/measurement.ipynb
+++ /dev/null
@@ -1,1243 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "(bias measurement)=\n",
- "\n",
- "# Bias Measurement\n",
- "\n",
- "The following guide is designed to present the more general details on\n",
- "using the package to measure bias. The following sections show:\n",
- "\n",
- "* how to run a simple query using ``Glove`` embedding model.\n",
- "* how to run multiple queries on multiple embeddings.\n",
- "* how to compare the results obtained from running multiple\n",
- " sets of queries on multiple embeddings using different metrics\n",
- " through ranking calculation.\n",
- "* how to calculate the correlations between the\n",
- " rankings obtained.\n",
- "\n",
- "\n",
- ":::{warning}\n",
- "\n",
- " To accurately study and reduce biases contained in word embeddings, queries may\n",
- " contain words that could be offensive to certain groups or individuals.\n",
- " The relationships studied between these words DO NOT represent the\n",
- " ideas, thoughts or beliefs of the authors of this library. \n",
- " This warning applies to all documentation.\n",
- "\n",
- ":::\n",
- "\n",
- "\n",
- ":::{note}\n",
- "\n",
- "If you are not familiar with the concepts of query, target and attribute\n",
- "set, please visit the {ref}`measurement framework`\n",
- "on the library’s conceptual guides. These concepts are widely used in the\n",
- "following sections.\n",
- "\n",
- ":::\n",
- "\n",
- ":::{note}\n",
- "\n",
- "For a list of metrics implemented in WEFE, refer to the\n",
- "[metrics section](metrics-API) of the API reference. \n",
- "\n",
- ":::\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "## Run a Query\n",
- "\n",
- "The following subsections explains how to run a simple query that\n",
- "measures gender bias on\n",
- "[Glove](https://nlp.stanford.edu/projects/glove/). The example uses\n",
- "the Word Embedding Association Test ({class}`~wefe.metrics.WEAT.WEAT`) metric\n",
- "quantifying the bias in the embeddings model. Below we show the three usual steps for\n",
- "performing a query in WEFE:\n",
- "\n",
- ":::{note}\n",
- "\n",
- "{class}`~wefe.metrics.WEAT.WEAT` is a fairness metric that quantifies the relationship\n",
- "between two sets of target words (sets of words intended to denote a social\n",
- "groups as men and women) and two sets of attribute words (sets of words\n",
- "representing some attitude, characteristic, trait, occupational field,\n",
- "etc. that can be associated with individuals from any social group). \n",
- "\n",
- "The closer its value is to 0, the less biased the model is. \n",
- "\n",
- "Visit the metrics documentation ({class}`~wefe.metrics.WEAT.WEAT`) for more information.\n",
- "\n",
- "\n",
- ":::"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Load a word embeddings model as a ``WordEmbeddingModel`` object.\n",
- "\n",
- "Load the word embedding model and then wrap it using a\n",
- "{class}`~wefe.word_embedding_model.WordEmbeddingModel` (class that allows WEFE to handle the models).\n",
- "\n",
- "WEFE bases all its operations on word embeddings using Gensim’s\n",
- "``KeyedVectors`` interface. Any model that can be loaded using\n",
- "``KeyedVectors`` will be compatible with WEFE. The following example uses a 25-dim pre-trained ``Glove`` model using a\n",
- "twitter dataset loaded using [gensim-data](https://github.com/RaRe-Technologies/gensim-data/)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import gensim.downloader as api\n",
- "\n",
- "from wefe.datasets import load_weat\n",
- "from wefe.metrics import WEAT\n",
- "from wefe.query import Query\n",
- "from wefe.word_embedding_model import WordEmbeddingModel\n",
- "\n",
- "twitter_25 = api.load(\"glove-twitter-25\")\n",
- "# WordEmbeddingModel receives as first argument a KeyedVectors model\n",
- "# and the second argument the model name.\n",
- "model = WordEmbeddingModel(twitter_25, \"glove twitter dim=25\")\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Create the query using a ``Query`` object\n",
- "\n",
- "Define the target and attribute word sets and create a {class}`~wefe.query.Query` object\n",
- "that contains them.\n",
- "\n",
- "For this initial example, a query is used to study the association\n",
- "between gender with respect to family and career. The words used are\n",
- "taken from the set of words used in the *Semantics derived automatically\n",
- "from language corpora contain human-like biases* paper, which are\n",
- "included in the ``datasets`` module."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "gender_query = Query(\n",
- " target_sets=[\n",
- " [\"female\", \"woman\", \"girl\", \"sister\", \"she\", \"her\", \"hers\", \"daughter\"],\n",
- " [\"male\", \"man\", \"boy\", \"brother\", \"he\", \"him\", \"his\", \"son\"],\n",
- " ],\n",
- " attribute_sets=[\n",
- " [\n",
- " \"home\",\n",
- " \"parents\",\n",
- " \"children\",\n",
- " \"family\",\n",
- " \"cousins\",\n",
- " \"marriage\",\n",
- " \"wedding\",\n",
- " \"relatives\",\n",
- " ],\n",
- " [\n",
- " \"executive\",\n",
- " \"management\",\n",
- " \"professional\",\n",
- " \"corporation\",\n",
- " \"salary\",\n",
- " \"office\",\n",
- " \"business\",\n",
- " \"career\",\n",
- " ],\n",
- " ],\n",
- " target_sets_names=[\"Female terms\", \"Male Terms\"],\n",
- " attribute_sets_names=[\"Family\", \"Careers\"],\n",
- ")\n",
- "\n",
- "gender_query\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Run the Query\n",
- "\n",
- "Instantiate the metric that you will use and then execute ``run_query``\n",
- "with the parameters created in the previous steps.\n",
- "\n",
- "Any bias measurement process at WEFE consists of the following steps:\n",
- "\n",
- "1. Metric arguments checking.\n",
- "2. Transform the word sets into word embeddings.\n",
- "3. Calculate the metric.\n",
- "\n",
- "In this case we use the {class}`~wefe.metrics.WEAT.WEAT` metric (proposed in the\n",
- "same paper of the set of words used in the query)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "metric = WEAT()\n",
- "result = metric.run_query(gender_query, model)\n",
- "result\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "By default, the results are a ``dict`` containing the query name (in the\n",
- "key ``query_name``) and the calculated value of the metric in the\n",
- "``result`` key. It also contains a key with the name and the value of\n",
- "the calculated metric (which is duplicated in the “results” key).\n",
- "\n",
- "Depending on the metric class used, the result ``dict`` can also return\n",
- "more metrics, detailed word-by-word values or other statistics like\n",
- "p-values. Also some metrics allow you to change the default value in\n",
- "results.\n",
- "\n",
- "Details of all the metrics implemented, their parameters and\n",
- "examples of execution can be found at [metrics section](metrics-API)."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Run Query Arguments\n",
- "\n",
- "Each metric allows varying the behavior of ``run_query`` according to\n",
- "different parameters. There are parameters to customize the\n",
- "transformation of the sets of words to sets of embeddings, others to\n",
- "warn errors or modify which calculation method the metric use.\n",
- "\n",
- ":::{note}\n",
- "\n",
- "Each metric implements the `run_query` method with different arguments. \n",
- "Visit their API documentation for more information.\n",
- ":::\n",
- "\n",
- "For example, ``run_query`` can be instructed to ``return effect_size``\n",
- "in the ``result`` key by setting ``return_effect_size`` as ``True``.\n",
- "Note that this parameter is only of the class ``WEAT``.\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "weat = WEAT()\n",
- "result = weat.run_query(gender_query, model, return_effect_size=True)\n",
- "result\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "You can also request ``run_query`` to run the statistical significance\n",
- "calculation by setting ``calculate_p_value`` as ``True``. This checks\n",
- "how many queries generated from permutations (controlled by the\n",
- "parameter ``p_value_iterations``) of the target sets obtain values\n",
- "greater than those obtained by the original query.\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "weat = WEAT()\n",
- "result = weat.run_query(\n",
- " gender_query, model, calculate_p_value=True, p_value_iterations=5000\n",
- ")\n",
- "result"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Out of Vocabulary Words\n",
- "\n",
- "It is common in the literature to find bias tests whose tagret sets are\n",
- "common names of social groups. These names are commonly cased and may\n",
- "contain special characters. There are several embedding models whose\n",
- "words are not cased or do not have accents or other special characters,\n",
- "as for example, in ``Glove``. This implies that a query with target sets\n",
- "composed by names executed in ``Glove`` (without any preprocessing of\n",
- "the words) could produce erroneous results because WEFE will not be able\n",
- "to find the names in the model vocabulary.\n",
- "\n",
- "\n",
- ":::{note}\n",
- "\n",
- "\n",
- "Some well-known word sets are already provided by the package and can be\n",
- "easily loaded by the user through the [datasets](datasets-API) module. From here on,\n",
- "the tutorial use the words defined in the study *Semantics derived\n",
- "automatically from language corpora contain human-like biases*, the same\n",
- "that proposed the {class}`~wefe.metrics.WEAT.WEAT` metric.\n",
- "\n",
- ":::"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# load the weat word sets.\n",
- "word_sets = load_weat()\n",
- "\n",
- "# print a set of european american common names.\n",
- "print(word_sets[\"european_american_names_5\"])"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The following query compares European-American and African-American\n",
- "names with respect to pleasant and unpleasant attributes.\n",
- "\n",
- "\n",
- ":::{note}\n",
- "\n",
- "\n",
- "It can be indicated to ``run_query`` to log the words that were lost in\n",
- "the transformation to vectors by using the parameter\n",
- "``warn_not_found_words`` as ``True``.\n",
- "\n",
- ":::"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "ethnicity_query = Query(\n",
- " [word_sets[\"european_american_names_5\"], word_sets[\"african_american_names_5\"]],\n",
- " [word_sets[\"pleasant_5\"], word_sets[\"unpleasant_5\"]],\n",
- " [\"European american names\", \"African american names\"],\n",
- " [\"Pleasant\", \"Unpleasant\"],\n",
- ")\n",
- "result = weat.run_query(ethnicity_query, model, warn_not_found_words=True,)\n",
- "result\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- ":::{warning}\n",
- "\n",
- "If more than 20% of the words from any of the word sets of the query are\n",
- "lost during the transformation to embeddings, the result of the metric\n",
- "will be ``np.nan``. This behavior can be changed using a float number\n",
- "parameter called ``lost_vocabulary_threshold``.\n",
- "\n",
- ":::"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Word Preprocessors\n",
- "\n",
- "``run_queries`` allows preprocessing each word before they are searched in the model's \n",
- "vocabulary.through the parameter ``preprocessors``. (list of one or more preprocessor).\n",
- "This parameter accepts a list of individual preprocessors, which are defined below:\n",
- "\n",
- "A ``preprocessor`` is a dictionary that specifies what processing(s) are \n",
- "performed on each word before its looked up in the model vocabulary.\n",
- "For example, the ``preprocessor``\n",
- "``{'lowecase': True, 'strip_accents': True}`` allows you to lowercase\n",
- "and remove the accent from each word before searching for them in the\n",
- "model vocabulary. Note that an empty dictionary ``{}`` indicates that no\n",
- "preprocessing is done.\n",
- "\n",
- "The possible options for a preprocessor are:\n",
- "\n",
- "* ``lowercase``: ``bool``. Indicates that the words are transformed to lowercase.\n",
- "* ``uppercase``: ``bool``. Indicates that the words are transformed to uppercase.\n",
- "* ``titlecase``: ``bool``. Indicates that the words are transformed to titlecase.\n",
- "* ``strip_accents``: ``bool``, ``{'ascii', 'unicode'}``: Specifies that the accents of the words are eliminated. The stripping type can be specified. True uses ‘unicode’ by default.\n",
- "* ``preprocessor``: ``Callable``. It receives a function that operates on each word. In the case of specifying a function, it overrides the default preprocessor (i.e., the previous options stop working).\n",
- "\n",
- "\n",
- "A list of preprocessor options allows searching for several\n",
- "variants of the words into the model. For example, the preprocessors\n",
- "``[{}, {\"lowercase\": True, \"strip_accents\": True}]``\n",
- "``{}`` allows first to search for the original words in the vocabulary of the model. \n",
- "In case some of them are not found, ``{\"lowercase\": True, \"strip_accents\": True}`` \n",
- "is executed on these words and then they are searched in the model vocabulary.\n",
- "\n",
- "By default (in case there is more than one preprocessor in the list) the first \n",
- "preprocessed word found in the embeddings model is used. \n",
- "This behavior can be controlled by the ``strategy`` parameter of ``run_query``.\n",
- "\n",
- "In the following example, we provide a list with only one\n",
- "preprocessor that instructs ``run_query`` to lowercase and remove all\n",
- "accents from every word before they are searched in the embeddings\n",
- "model.\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "weat = WEAT()\n",
- "result = weat.run_query(\n",
- " ethnicity_query,\n",
- " model,\n",
- " preprocessors=[{\"lowercase\": True, \"strip_accents\": True}],\n",
- " warn_not_found_words=True,\n",
- ")\n",
- "result"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "It may happen that it is more important to find the original word and in\n",
- "the case of not finding it, then preprocess it and look it up in the\n",
- "vocabulary. This behavior can be specified in ``preprocessors`` list by\n",
- "first specifying an empty preprocessor ``{}`` and then the preprocessor\n",
- "that converts to lowercase and removes accents.\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "weat = WEAT()\n",
- "result = weat.run_query(\n",
- " ethnicity_query,\n",
- " model,\n",
- " preprocessors=[\n",
- " {}, # empty preprocessor, search for the original words.\n",
- " {\n",
- " \"lowercase\": True,\n",
- " \"strip_accents\": True,\n",
- " }, # search for lowercase and no accent words.\n",
- " ],\n",
- " warn_not_found_words=True,\n",
- ")\n",
- "\n",
- "result"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The number of preprocessing steps can be increased as needed. For\n",
- "example, we can complex the above preprocessor to first search for the\n",
- "original words, then for the lowercase words, and finally for the\n",
- "lowercase words without accents.\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "weat = WEAT()\n",
- "result = weat.run_query(\n",
- " ethnicity_query,\n",
- " model,\n",
- " preprocessors=[\n",
- " {}, # first step: empty preprocessor, search for the original words.\n",
- " {\"lowercase\": True,}, # second step: search for lowercase.\n",
- " {\n",
- " \"lowercase\": True,\n",
- " \"strip_accents\": True,\n",
- " }, # third step: search for lowercase and no accent words.\n",
- " ],\n",
- " warn_not_found_words=True,\n",
- ")\n",
- "\n",
- "result"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "It is also possible to change the behavior of the search by including\n",
- "not only the first word, but all the words generated by the\n",
- "preprocessors. This can be controlled by specifying the parameter\n",
- "``strategy=all``."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "weat = WEAT()\n",
- "result = weat.run_query(\n",
- " ethnicity_query,\n",
- " model,\n",
- " preprocessors=[\n",
- " {}, # first step: empty preprocessor, search for the original words.\n",
- " {\"lowercase\": True,}, # second step: search for lowercase .\n",
- " {\"uppercase\": True,}, # third step: search for uppercase.\n",
- " ],\n",
- " strategy=\"all\",\n",
- " warn_not_found_words=True,\n",
- ")\n",
- "\n",
- "result\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Running Multiple Queries\n",
- "\n",
- "\n",
- "It is usual to want to test many queries of some bias criterion (gender,\n",
- "ethnicity, religion, politics, socioeconomic, among others) on several\n",
- "models at the same time. Trying to use ``run_query`` on each pair\n",
- "embedding-query can be a bit complex and could require extra work to\n",
- "implement.\n",
- "\n",
- "This is why the library also implements a function to test multiple\n",
- "queries on various word embedding models in a single call: the\n",
- "``run_queries`` util.\n",
- "\n",
- "The following code shows how to run various gender queries on ``Glove``\n",
- "embedding models with different dimensions trained from the Twitter\n",
- "dataset. The queries are executed using {class}`~wefe.metrics.WEAT.WEAT` metric.\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import gensim.downloader as api\n",
- "\n",
- "from wefe.datasets import load_weat\n",
- "from wefe.metrics import RNSB, WEAT\n",
- "from wefe.query import Query\n",
- "from wefe.utils import run_queries\n",
- "from wefe.word_embedding_model import WordEmbeddingModel"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Load the models\n",
- "\n",
- "Load three different Glove Twitter embedding models. These models were\n",
- "trained using the same dataset varying the number of embedding\n",
- "dimensions."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "model_1 = WordEmbeddingModel(api.load(\"glove-twitter-25\"), \"glove twitter dim=25\")\n",
- "model_2 = WordEmbeddingModel(api.load(\"glove-twitter-50\"), \"glove twitter dim=50\")\n",
- "model_3 = WordEmbeddingModel(api.load(\"glove-twitter-100\"), \"glove twitter dim=100\")\n",
- "\n",
- "models = [model_1, model_2, model_3]\n",
- "\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Load the word sets and create the queries\n",
- "\n",
- "Now, we load the ``WEAT`` word set and create three queries. The\n",
- "three queries are intended to measure gender bias.\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Load the WEAT word sets\n",
- "word_sets = load_weat()\n",
- "\n",
- "# Create gender queries\n",
- "gender_query_1 = Query(\n",
- " [word_sets[\"male_terms\"], word_sets[\"female_terms\"]],\n",
- " [word_sets[\"career\"], word_sets[\"family\"]],\n",
- " [\"Male terms\", \"Female terms\"],\n",
- " [\"Career\", \"Family\"],\n",
- ")\n",
- "\n",
- "gender_query_2 = Query(\n",
- " [word_sets[\"male_terms\"], word_sets[\"female_terms\"]],\n",
- " [word_sets[\"science\"], word_sets[\"arts\"]],\n",
- " [\"Male terms\", \"Female terms\"],\n",
- " [\"Science\", \"Arts\"],\n",
- ")\n",
- "\n",
- "gender_query_3 = Query(\n",
- " [word_sets[\"male_terms\"], word_sets[\"female_terms\"]],\n",
- " [word_sets[\"math\"], word_sets[\"arts_2\"]],\n",
- " [\"Male terms\", \"Female terms\"],\n",
- " [\"Math\", \"Arts\"],\n",
- ")\n",
- "\n",
- "gender_queries = [gender_query_1, gender_query_2, gender_query_3]\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Run the queries on all Word Embeddings using WEAT.\n",
- "\n",
- "To run the list of queries and models, we call ``run_queries`` using the\n",
- "parameters defined in the previous step. The mandatory parameters of the\n",
- "function are 3:\n",
- "\n",
- "- a metric,\n",
- "- a list of queries, and,\n",
- "- a list of embedding models.\n",
- "\n",
- "It is also possible to provide a name for the criterion studied in this\n",
- "set of queries through the parameter ``queries_set_name``.\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "WEAT_gender_results = run_queries(\n",
- " WEAT, gender_queries, models, queries_set_name=\"Gender Queries\"\n",
- ")\n",
- "WEAT_gender_results\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Setting metric params\n",
- "\n",
- "There is a whole column that has no results. As the warnings point out,\n",
- "when transforming the words of the sets into embeddings, there is a loss\n",
- "of words that is greater than the allowed by the parameter\n",
- "``lost_vocabulary_threshold``. In this case, it would be very useful to\n",
- "use the word preprocessors seen above.\n",
- "\n",
- "``run_queries``, accept specific parameters for each metric. These extra\n",
- "parameters for the metric can be passed through ``metric_params``\n",
- "parameter. In this case, a ``preprocessor`` is provided to lowercase the\n",
- "words before searching for them in the models’ vocabularies.\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "\n",
- "WEAT_gender_results = run_queries(\n",
- " WEAT,\n",
- " gender_queries,\n",
- " models,\n",
- " metric_params={\"preprocessors\": [{\"lowercase\": True}]},\n",
- " queries_set_name=\"Gender Queries\",\n",
- ")\n",
- "\n",
- "WEAT_gender_results"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "No query was null in these results.\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "### Plot the results in a barplot\n",
- "\n",
- "\n",
- "The library also provides an easy way to plot the results obtained from\n",
- "a ``run_queries`` execution into a [plotly](https://plotly.com/python/) braplot."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from wefe.utils import plot_queries_results, run_queries\n",
- "\n",
- "# Plot the results\n",
- "plot_queries_results(WEAT_gender_results).show()\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Aggregating Results\n",
- "\n",
- "The execution of ``run_queries`` provided many results evaluating the\n",
- "gender bias in the tested embeddings. However, these results alone do\n",
- "not comprehensively report the biases observed in all of these queries.\n",
- "One way to obtain an overall view of bias is by aggregating results by\n",
- "model.\n",
- "\n",
- "For WEAT, a simple way to aggregate the results is to average their\n",
- "absolute values. When running ``run_queries``, it is possible to specify\n",
- "that the results be aggregated by model by setting ``aggregate_results``\n",
- "as ``True``\n",
- "\n",
- "The aggregation function can be specified through the\n",
- "``aggregation_function`` parameter. This parameter accepts a list of\n",
- "predefined aggregations as well as a custom function that operates on\n",
- "the results dataframe. The aggregation functions available are:\n",
- "\n",
- "- Average ``avg``.\n",
- "- Average of the absolute values ``abs_avg``.\n",
- "- Sum ``sum``.\n",
- "- Sum of the absolute values, ``abs_sum``.\n",
- "\n",
- ":::{note}\n",
- "\n",
- "Notice that some functions are more appropriate for certain metrics. For\n",
- "metrics returning only positive numbers, all the previous aggregation\n",
- "functions would be OK. In contrast, metrics that return real values\n",
- "(e.g., {class}`~wefe.metrics.WEAT.WEAT` , {class}`~wefe.metrics.RND.RND` , etc…), \n",
- "aggregation functions such as sum would make positive and negative outputs to cancel \n",
- "each other.\n",
- "\n",
- ":::"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "WEAT_gender_results_agg = run_queries(\n",
- " WEAT,\n",
- " gender_queries,\n",
- " models,\n",
- " metric_params={\"preprocessors\": [{\"lowercase\": True}]},\n",
- " aggregate_results=True,\n",
- " aggregation_function=\"abs_avg\",\n",
- " queries_set_name=\"Gender Queries\",\n",
- ")\n",
- "WEAT_gender_results_agg\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "plot_queries_results(WEAT_gender_results_agg).show()\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "It is also possible to ask the function to return only the aggregated\n",
- "results using the parameter ``return_only_aggregation``\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "WEAT_gender_results_only_agg = run_queries(\n",
- " WEAT,\n",
- " gender_queries,\n",
- " models,\n",
- " metric_params={\"preprocessors\": [{\"lowercase\": True}]},\n",
- " aggregate_results=True,\n",
- " aggregation_function=\"abs_avg\",\n",
- " return_only_aggregation=True,\n",
- " queries_set_name=\"Gender Queries\",\n",
- ")\n",
- "WEAT_gender_results_only_agg\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "fig = plot_queries_results(WEAT_gender_results_only_agg)\n",
- "fig.show()\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Model Ranking\n",
- "\n",
- "It may be desirable to obtain an overall view of the bias by model using\n",
- "different metrics or bias criteria. While the aggregate values can be\n",
- "compared directly, two problems are likely to be encountered:\n",
- "\n",
- "1. One type of bias criterion can dominate the other because of\n",
- " significant differences in magnitude.\n",
- "\n",
- "2. Different metrics can operate on different scales, which makes them\n",
- " difficult to compare.\n",
- "\n",
- "To show these problems, suppose we have:\n",
- "\n",
- "- Two sets of queries: one that explores gender biases and\n",
- " another that explores ethnicity biases.\n",
- "- Three ``Glove`` models of 25, 50 and 100 dimensions trained on the same\n",
- " twitter dataset.\n",
- "\n",
- "Then we run ``run_queries`` on this set of model-queries using \n",
- "{class}`~wefe.metrics.WEAT.WEAT`, and to corroborate the results obtained, we also use \n",
- "Relative Negative Sentiment Bias ({class}`~wefe.metrics.RNSB.RNSB`).\n",
- "\n",
- "1. The first problem occurs when the bias scores obtained from one set\n",
- " of queries are much higher than those from the other set, even when\n",
- " the same metric is used.\n",
- "\n",
- "When executing ``run_queries`` with the gender and ethnicity queries on\n",
- "the models described above, the results obtained are as follows:\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "| model_name | WEAT: Gender Queries average of abs values score | WEAT: Ethnicity Queries average of abs values score |\n",
- "|----------------------------------------|-------------------------------------------------------------------------|------------------------------------------------------------------------|\n",
- "| glove twitter dim=25 | 0.210556 | 2.64632 |\n",
- "| glove twitter dim=50 | 0.292373 | 1.87431 |\n",
- "| glove twitter dim=100 | 0.225116 | 1.78469 |"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "As can be seen, the results of ethnicity bias are much greater than\n",
- "those of gender.\n",
- "\n",
- "2. The second problem is when different metrics return results on\n",
- " different scales of magnitude.\n",
- "\n",
- "When executing ``run_queries`` with the gender queries and models\n",
- "described above using both WEAT and RNSB, the results obtained are as\n",
- "follows:"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "| model_name | WEAT: Gender Queries average of abs values score | RNSB: Gender Queries average of abs values score |\n",
- "|----------------------------------------|-------------------------------------------------------------------------|-------------------------------------------------------------------------|\n",
- "| glove twitter dim=25 | 0.210556 | 0.032673 |\n",
- "| glove twitter dim=50 | 0.292373 | 0.049429 |\n",
- "| glove twitter dim=100 | 0.225116 | 0.0312772 |"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "We can see differences between the results of both metrics of an order\n",
- "of magnitude.\n",
- "\n",
- "One solution to this problem is to create **rankings**. Rankings focus on the relative\n",
- "differences reported by the metrics (for different models) instead of focusing on the\n",
- "absolute values.\n",
- "\n",
- "The following guide show how to create rankings that evaluate\n",
- "gender bias and ethnicity.\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Gender Bias Model Ranking"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# define the queries\n",
- "gender_query_1 = Query(\n",
- " [word_sets[\"male_terms\"], word_sets[\"female_terms\"]],\n",
- " [word_sets[\"career\"], word_sets[\"family\"]],\n",
- " [\"Male terms\", \"Female terms\"],\n",
- " [\"Career\", \"Family\"],\n",
- ")\n",
- "gender_query_2 = Query(\n",
- " [word_sets[\"male_terms\"], word_sets[\"female_terms\"]],\n",
- " [word_sets[\"science\"], word_sets[\"arts\"]],\n",
- " [\"Male terms\", \"Female terms\"],\n",
- " [\"Science\", \"Arts\"],\n",
- ")\n",
- "gender_query_3 = Query(\n",
- " [word_sets[\"male_terms\"], word_sets[\"female_terms\"]],\n",
- " [word_sets[\"math\"], word_sets[\"arts_2\"]],\n",
- " [\"Male terms\", \"Female terms\"],\n",
- " [\"Math\", \"Arts\"],\n",
- ")\n",
- "\n",
- "gender_queries = [gender_query_1, gender_query_2, gender_query_3]\n",
- "\n",
- "# run the queries using WEAT\n",
- "WEAT_gender_results = run_queries(\n",
- " WEAT,\n",
- " gender_queries,\n",
- " models,\n",
- " metric_params={\"preprocessors\": [{\"lowercase\": True}]},\n",
- " aggregate_results=True,\n",
- " return_only_aggregation=True,\n",
- " queries_set_name=\"Gender Queries\",\n",
- ")\n",
- "\n",
- "# run the queries using WEAT effect size\n",
- "WEAT_EZ_gender_results = run_queries(\n",
- " WEAT,\n",
- " gender_queries,\n",
- " models,\n",
- " metric_params={\"preprocessors\": [{\"lowercase\": True}], \"return_effect_size\": True,},\n",
- " aggregate_results=True,\n",
- " return_only_aggregation=True,\n",
- " queries_set_name=\"Gender Queries\",\n",
- ")\n",
- "\n",
- "# run the queries using RNSB\n",
- "RNSB_gender_results = run_queries(\n",
- " RNSB,\n",
- " gender_queries,\n",
- " models,\n",
- " metric_params={\"preprocessors\": [{\"lowercase\": True}]},\n",
- " aggregate_results=True,\n",
- " return_only_aggregation=True,\n",
- " queries_set_name=\"Gender Queries\",\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The rankings can be calculated by means of the ``create_ranking``\n",
- "function. This function receives as input results from running\n",
- "``run_queries`` and assumes that the last column contains the aggregated\n",
- "values."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from wefe.utils import create_ranking\n",
- "\n",
- "# create the ranking\n",
- "gender_ranking = create_ranking(\n",
- " [WEAT_gender_results, WEAT_EZ_gender_results, RNSB_gender_results]\n",
- ")\n",
- "\n",
- "gender_ranking\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Ethnicity Bias Model Ranking\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "\n",
- "# define the queries\n",
- "ethnicity_query_1 = Query(\n",
- " [word_sets[\"european_american_names_5\"], word_sets[\"african_american_names_5\"]],\n",
- " [word_sets[\"pleasant_5\"], word_sets[\"unpleasant_5\"]],\n",
- " [\"European Names\", \"African Names\"],\n",
- " [\"Pleasant\", \"Unpleasant\"],\n",
- ")\n",
- "\n",
- "ethnicity_query_2 = Query(\n",
- " [word_sets[\"european_american_names_7\"], word_sets[\"african_american_names_7\"]],\n",
- " [word_sets[\"pleasant_9\"], word_sets[\"unpleasant_9\"]],\n",
- " [\"European Names\", \"African Names\"],\n",
- " [\"Pleasant 2\", \"Unpleasant 2\"],\n",
- ")\n",
- "\n",
- "ethnicity_queries = [ethnicity_query_1, ethnicity_query_2]\n",
- "\n",
- "# run the queries using WEAT\n",
- "WEAT_ethnicity_results = run_queries(\n",
- " WEAT,\n",
- " ethnicity_queries,\n",
- " models,\n",
- " metric_params={\"preprocessors\": [{\"lowercase\": True}]},\n",
- " aggregate_results=True,\n",
- " return_only_aggregation=True,\n",
- " queries_set_name=\"Ethnicity Queries\",\n",
- ")\n",
- "\n",
- "# run the queries using WEAT effect size\n",
- "WEAT_EZ_ethnicity_results = run_queries(\n",
- " WEAT,\n",
- " ethnicity_queries,\n",
- " models,\n",
- " metric_params={\"preprocessors\": [{\"lowercase\": True}], \"return_effect_size\": True,},\n",
- " aggregate_results=True,\n",
- " return_only_aggregation=True,\n",
- " queries_set_name=\"Ethnicity Queries\",\n",
- ")\n",
- "\n",
- "# run the queries using RNSB\n",
- "RNSB_ethnicity_results = run_queries(\n",
- " RNSB,\n",
- " ethnicity_queries,\n",
- " models,\n",
- " metric_params={\"preprocessors\": [{\"lowercase\": True}]},\n",
- " aggregate_results=True,\n",
- " return_only_aggregation=True,\n",
- " queries_set_name=\"Ethnicity Queries\",\n",
- ")\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# create the ranking\n",
- "ethnicity_ranking = create_ranking(\n",
- " [WEAT_ethnicity_results, WEAT_EZ_gender_results, RNSB_ethnicity_results]\n",
- ")\n",
- "\n",
- "ethnicity_ranking\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Plotting the rankings\n",
- "\n",
- "It is possible to graph the rankings in barplots using the\n",
- "``plot_ranking`` function. The generated figure shows the accumulated\n",
- "rankings for each embedding model. Each bar represents the sum of the\n",
- "rankings obtained by each embedding. Each color within a bar represents\n",
- "a different criterion-metric ranking."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from wefe.utils import plot_ranking\n",
- "\n",
- "fig = plot_ranking(gender_ranking)\n",
- "fig.show()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "fig = plot_ranking(ethnicity_ranking)\n",
- "fig.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Correlating Rankings\n",
- "\n",
- "Having obtained rankings by metric for each embeddings, it would be\n",
- "ideal to see and analyze the degree of agreement between them.\n",
- "\n",
- "A high concordance between the rankings allows us to state with some certainty that \n",
- "all metrics evaluated the embedding models in a similar way and therefore, \n",
- "that the ordering of embeddings by bias calculated makes sense.\n",
- "On the other hand, a low degree of agreement shows the opposite: the rankings do not \n",
- "allow to clearly establish which embedding is less biased than another.\n",
- "\n",
- "The level of concordance of the rankings can be evaluated by calculating\n",
- "correlations.WEFE provides ``calculate_ranking_correlations`` to\n",
- "calculate the correlations between rankings.\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from wefe.utils import calculate_ranking_correlations, plot_ranking_correlations\n",
- "\n",
- "correlations = calculate_ranking_correlations(gender_ranking)\n",
- "correlations\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- ":::{note}\n",
- "\n",
- "``calculate_ranking_correlations`` uses the ``corr()`` ``pandas``\n",
- "dataframe method. The type of correlation that is calculated can be changed \n",
- "through the method parameter. The available options are:\n",
- "``'pearson'``, ``'spearman'``, ``'kendall'``. By default, the spearman\n",
- "correlation is calculated.\n",
- "\n",
- ":::\n",
- "\n",
- "In this example, Kendall’s correlation is used.\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "calculate_ranking_correlations(gender_ranking, method=\"kendall\")\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "WEFE also provides a function for graphing the correlations:\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "correlation_fig = plot_ranking_correlations(correlations)\n",
- "correlation_fig.show()\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "In this case, only two of the three rankings show similar results.\n",
- "\n"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3.8.13 ('wefe')",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.13"
- },
- "orig_nbformat": 4,
- "vscode": {
- "interpreter": {
- "hash": "37d01894bb315c73bf6fde5551d8a97078996f38b23395695bd1998fb0ae5507"
- }
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/doc/Makefile b/docs/Makefile
similarity index 97%
rename from doc/Makefile
rename to docs/Makefile
index ce9aedf..ef2103c 100644
--- a/doc/Makefile
+++ b/docs/Makefile
@@ -181,4 +181,4 @@ xml:
pseudoxml:
$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
@echo
- @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
+ @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
\ No newline at end of file
diff --git a/docs/_static/css/theme_overrides.css b/docs/_static/css/theme_overrides.css
new file mode 100644
index 0000000..a67f5cf
--- /dev/null
+++ b/docs/_static/css/theme_overrides.css
@@ -0,0 +1,48 @@
+/* override table width restrictions */
+.wy-table-responsive table td,
+.wy-table-responsive table th {
+ white-space: normal;
+}
+
+.wy-table-responsive {
+ margin-bottom: 24px;
+ max-width: 100%;
+ overflow: visible;
+}
+
+.dataframe {
+ margin-left: auto;
+ margin-right: auto;
+ border: none;
+ border-collapse: collapse;
+ border-spacing: 0;
+ color: rgba(0, 0, 0, 0.87);
+ font-size: 12px;
+ table-layout: fixed;
+ margin-bottom: 24px;
+}
+
+.dataframe thead {
+ border-bottom: 1px solid rgba(0, 0, 0, 0.87);
+ vertical-align: bottom;
+}
+
+.dataframe tr,
+.dataframe th,
+.dataframe td {
+ text-align: right;
+ vertical-align: middle;
+ padding: 0.5em 0.5em;
+ line-height: normal;
+ white-space: normal;
+ max-width: none;
+ border: none;
+}
+
+.dataframe th {
+ font-weight: bold;
+}
+
+.dataframe tbody tr:nth-child(odd) {
+ background: #f5f5f5;
+}
\ No newline at end of file
diff --git a/doc/_static/js/copybutton.js b/docs/_static/js/copybutton.js
similarity index 100%
rename from doc/_static/js/copybutton.js
rename to docs/_static/js/copybutton.js
diff --git a/doc/_templates/class.rst b/docs/_templates/class.rst
similarity index 79%
rename from doc/_templates/class.rst
rename to docs/_templates/class.rst
index b90be1c..62f57cc 100644
--- a/doc/_templates/class.rst
+++ b/docs/_templates/class.rst
@@ -11,6 +11,8 @@
.. automethod:: __init__
{% endblock %}
+.. include:: {{module}}.{{objname}}.examples
+
.. raw:: html
diff --git a/doc/_templates/function.rst b/docs/_templates/function.rst
similarity index 100%
rename from doc/_templates/function.rst
rename to docs/_templates/function.rst
diff --git a/doc/_templates/numpydoc_docstring.py b/docs/_templates/numpydoc_docstring.py
similarity index 100%
rename from doc/_templates/numpydoc_docstring.py
rename to docs/_templates/numpydoc_docstring.py
diff --git a/doc/api/api.rst b/docs/api/api.rst
similarity index 87%
rename from doc/api/api.rst
rename to docs/api/api.rst
index 7ebc99e..da2fcd7 100644
--- a/doc/api/api.rst
+++ b/docs/api/api.rst
@@ -56,11 +56,11 @@ This list contains the debiasing methods implemented so far in WEFE.
:toctree: generated/
:template: class.rst
- wefe.debias.HardDebias
- wefe.debias.MulticlassHardDebias
- wefe.debias.RepulsionAttractionNeutralization
- wefe.debias.DoubleHardDebias
- wefe.debias.HalfSiblingRegression
+ wefe.debias.hard_debias.HardDebias
+ wefe.debias.multiclass_hard_debias.MulticlassHardDebias
+ wefe.debias.repulsion_attraction_neutralization.RepulsionAttractionNeutralization
+ wefe.debias.double_hard_debias.DoubleHardDebias
+ wefe.debias.half_sibling_regression.HalfSiblingRegression
.. _datasets-API:
diff --git a/doc/conceptual_guides/measurement_framework.rst b/docs/conceptual_guides/measurement_framework.rst
similarity index 99%
rename from doc/conceptual_guides/measurement_framework.rst
rename to docs/conceptual_guides/measurement_framework.rst
index f970280..1734179 100644
--- a/doc/conceptual_guides/measurement_framework.rst
+++ b/docs/conceptual_guides/measurement_framework.rst
@@ -6,6 +6,7 @@ Measurement Framework
Below we present the main aspects of the measurement framework developed at WEFE.
.. note::
+
If you want to see tutorials on how to apply queries, visit :ref:`bias measurement`
in the User Guide.
@@ -124,7 +125,7 @@ query, word2vec embeddings and the WEAT metric.
:alt: Bias measurement diagram.
To see the implementation of this query using WEFE, refer to
-the `Quick start `_ section.
+the :ref:`Quick start <quick start>` section.
Metrics Implemented So Far
--------------------------
diff --git a/doc/conceptual_guides/mitigation_framework.rst b/docs/conceptual_guides/mitigation_framework.rst
similarity index 87%
rename from doc/conceptual_guides/mitigation_framework.rst
rename to docs/conceptual_guides/mitigation_framework.rst
index df36d63..988bb77 100644
--- a/doc/conceptual_guides/mitigation_framework.rst
+++ b/docs/conceptual_guides/mitigation_framework.rst
@@ -1,9 +1,11 @@
+.. _mitigation framework:
+
Mitigation Framework
====================
.. note::
If you want to see tutorials on how to mitigate (debias) bias in word embedding
- models, visit :ref:`bias mitigation` in the User Guide.
+ models, visit :ref:`Bias Mitigation <bias mitigation>` in the User Guide.
WEFE standardizes all mitigation methods through an interface inherited from
@@ -12,8 +14,9 @@ WEFE standardizes all mitigation methods through an interface inherited from
Fit method
----------
-The first step ``fit``, consists in learning the corresponding mitigation transformation
-which usually corresponds to a matrix projection of the embedding space.
+The first step ``fit``, consists in learning the corresponding mitigation
+transformation, which usually corresponds to a matrix projection of the
+embedding space.
This method is quite flexible: it can accept multiple sets of words and other parameters.
Transform method
@@ -44,4 +47,4 @@ WEFE implements the following bias mitigation (debias) metrics:
- :class:`~wefe.debias.half_sibling_regression.HalfSiblingRegression`,
Except for ``MulticlassHardDebias``, all methods are limited to binary criteria,
-such gender.
\ No newline at end of file
+such as gender.
\ No newline at end of file
diff --git a/doc/conceptual_guides/references.rst b/docs/conceptual_guides/references.rst
similarity index 100%
rename from doc/conceptual_guides/references.rst
rename to docs/conceptual_guides/references.rst
diff --git a/doc/conf.py b/docs/conf.py
similarity index 93%
rename from doc/conf.py
rename to docs/conf.py
index 3676d38..f8cbe9e 100644
--- a/doc/conf.py
+++ b/docs/conf.py
@@ -33,20 +33,18 @@
extensions = [
"sphinx.ext.autodoc",
"sphinx.ext.autosummary",
- "numpydoc",
"sphinx.ext.doctest",
"sphinx.ext.intersphinx",
"sphinx.ext.viewcode",
- # "sphinx_gallery.gen_gallery",
+ "numpydoc",
+ "sphinx_gallery.gen_gallery",
"sphinx.ext.todo",
"sphinx.ext.mathjax",
"sphinx.ext.ifconfig",
- "myst_nb",
+ "sphinx.ext.napoleon",
]
-mathjax_path = ""
-
# this is needed for some reason...
# see https://github.com/numpy/numpydoc/issues/69
numpydoc_show_class_members = False
@@ -60,13 +58,7 @@
mathjax_path = ""
else:
extensions.append("sphinx.ext.mathjax")
- mathjax_path = "https://cdn.jsdelivr.net/npm/mathjax@3/es5/" "tex-chtml.js"
-
-# from distutils.version import LooseVersion
-# if LooseVersion(sphinx.__version__) < LooseVersion('1.4'):
-# extensions.append('sphinx.ext.pngmath')
-# else:
-# extensions.append('sphinx.ext.imgmath')
+ mathjax_path = "https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-chtml.js"
autodoc_default_flags = ["members", "inherited-members"]
@@ -80,20 +72,8 @@
# source_suffix = ".rst"
source_suffix = {
".rst": "restructuredtext",
- ".ipynb": "myst-nb",
- ".myst": "myst-nb",
}
-myst_enable_extensions = [
- "amsmath",
- "colon_fence",
- "deflist",
- "dollarmath",
- "html_image",
-]
-myst_url_schemes = ("http", "https", "mailto")
-nb_execution_mode = "cache"
-nb_execution_timeout = 120
# The encoding of source files.
# source_encoding = 'utf-8-sig'
@@ -138,7 +118,7 @@
# default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
-# add_function_parentheses = True
+add_function_parentheses = False
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
@@ -154,7 +134,6 @@
# Custom style
# html_style = "css/project-template.css"
-
# A list of ignored prefixes for module index sorting.
# modindex_common_prefix = []
@@ -196,7 +175,9 @@
html_static_path = ["_static"]
# html_context = {
-# "css_files": ["_static/theme_overrides.css",], # override wide tables in RTD theme
+# "css_files": [
+# "_static/css/theme_overrides.css", # overrides for wide tables in RTD theme
+# ],
# }
# Add any extra paths that contain custom files (such as robots.txt or
@@ -256,7 +237,12 @@
# The font size ('10pt', '11pt' or '12pt').
#'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
- #'preamble': '',
+ "preamble": r"""
+ \usepackage{amsmath}\usepackage{amsfonts}\usepackage{bm}
+ \usepackage{morefloats}\usepackage{enumitem} \setlistdepth{10}
+ \let\oldhref\href
+ \renewcommand{\href}[2]{\oldhref{#1}{\hbox{#2}}}
+ """
}
# Grouping the document tree into LaTeX files. List of tuples
@@ -350,3 +336,4 @@
def setup(app):
# a copy button to copy snippet of code from the documentation
app.add_js_file("js/copybutton.js")
+ app.add_css_file("css/theme_overrides.css")
diff --git a/doc/examples/multilingual.rst b/docs/examples/multilingual.rst
similarity index 100%
rename from doc/examples/multilingual.rst
rename to docs/examples/multilingual.rst
diff --git a/doc/examples/replications.rst b/docs/examples/replications.rst
similarity index 97%
rename from doc/examples/replications.rst
rename to docs/examples/replications.rst
index 9c292d4..503acc7 100644
--- a/doc/examples/replications.rst
+++ b/docs/examples/replications.rst
@@ -24,6 +24,8 @@ reproduces the experiments performed in the following paper:
the original paper. However, our results are still very similar to those
in the original paper.
+
+
A transparent framework for evaluating unintended demographic bias in word embeddings (RNSB)
============================================================================================
diff --git a/doc/examples/wefe_case_study.rst b/docs/examples/wefe_case_study.rst
similarity index 100%
rename from doc/examples/wefe_case_study.rst
rename to docs/examples/wefe_case_study.rst
diff --git a/doc/getting_started/about.rst b/docs/getting_started/about.rst
similarity index 78%
rename from doc/getting_started/about.rst
rename to docs/getting_started/about.rst
index 009985e..07081e9 100644
--- a/doc/getting_started/about.rst
+++ b/docs/getting_started/about.rst
@@ -2,29 +2,46 @@
About
=====
-*Word Embedding Fairness Evaluation* (WEFE) is an open source library for
-measuring an mitigating bias in word embedding models.
-It generalizes many existing fairness metrics into a unified framework and
-provides a standard interface for:
+*Word Embedding Fairness Evaluation* (WEFE) is an open source library that implements
+many fairness metrics and mitigation methods (debias) in a unified framework.
+It also provides a standard interface for designing new ones.
-- Encapsulating existing fairness metrics from previous work and designing
- new ones.
+The main goal of the library is to provide a ready-to-use tool that allows the
+user to run bias measures and mitigation methods in a straightforward manner
+through well-designed and documented interfaces.
+
+In bias measurement, WEFE provides a standard interface for:
+
+- Encapsulating existing fairness metrics.
- Encapsulating the test words used by fairness metrics into standard
objects called queries.
- Computing a fairness metric on a given pre-trained word embedding model
using user-given queries.
-WEFE also standardizes the process of mitigating bias through an interface similar
-to the ``scikit-learn`` ``fit-transform``.
-This standardization separates the mitigation process into two stages:
-- The logic of calculating the transformation to be performed on the model (``fit``).
-- The execution of the mitigation transformation on the model (``transform``).
+On the other hand, WEFE standardizes all mitigation methods through an interface
+inherited from `scikit-learn <https://scikit-learn.org/>`_ basic data transformations:
+the ``fit-transform`` interface. This standardization separates the mitigation
+process into two stages:
+
+- The first step, ``fit``, learns the corresponding mitigation transformation.
+- The ``transform`` method applies the transformation learned in the previous step
+ to words residing in the original embedding space.
+
+.. note::
+
+ To learn more about the measurement or mitigation framework, visit
+ :ref:`measurement framework` or
+ :ref:`mitigation framework` respectively, in the Conceptual Guides Section.
+
+ For practical tutorials on how to measure or mitigate bias, visit
+ :ref:`bias measurement` or :ref:`bias mitigation` respectively
+ in the WEFE User Guide.
Motivation and objectives
=========================
-Word Embeddings models are a core component in almost all NLP downstream systems.
+Word Embedding models are a core component in almost all NLP downstream systems.
Several studies have shown that they are prone to inherit stereotypical social
biases from the corpus they were built on.
The common method for quantifying bias is to use a metric that calculates the
@@ -104,10 +121,11 @@ Roadmap
We expect in the future to:
+- Implement a measurement framework for contextualized embedding models.
- Implement new queries on different criteria.
- Create a single script that evaluates different embedding models under different bias criteria.
- From the previous script, rank as many embeddings available on the web as possible.
-- Implement a visualization module.
+- Implement a simple visualization module.
- Implement p-values mixin that applies for all metrics that accept two targets.
License
@@ -148,6 +166,7 @@ You are also welcome to do a pull request or publish an issue in the
Acknowledgments
===============
+
This work was funded by the
`Millennium Institute for Foundational Research on Data (IMFD) `_.
It is also sponsored by `National Center of Artificial Intelligence of Chile (CENIA) `_.
\ No newline at end of file
diff --git a/doc/getting_started/quick_start.rst b/docs/getting_started/quick_start.rst
similarity index 93%
rename from doc/getting_started/quick_start.rst
rename to docs/getting_started/quick_start.rst
index f897453..3ea3da9 100644
--- a/doc/getting_started/quick_start.rst
+++ b/docs/getting_started/quick_start.rst
@@ -1,3 +1,5 @@
+.. _quick_start:
+
===========
Quick Start
===========
@@ -98,5 +100,4 @@ with the parameters created in the past steps. In this case we use the
A score greater than 0 indicates that there is indeed a biased relationship between
women and the arts with respect to men and science.
-For more advanced usage, visit user the `User Guide `_
-section.
+For more advanced usage, visit the :ref:`bias measurement` section in the User Guide.
diff --git a/doc/images/WEAT_replication.png b/docs/images/WEAT_replication.png
similarity index 100%
rename from doc/images/WEAT_replication.png
rename to docs/images/WEAT_replication.png
diff --git a/doc/images/conceptnet_rnsb.png b/docs/images/conceptnet_rnsb.png
similarity index 100%
rename from doc/images/conceptnet_rnsb.png
rename to docs/images/conceptnet_rnsb.png
diff --git a/doc/images/diagram_1.png b/docs/images/diagram_1.png
similarity index 100%
rename from doc/images/diagram_1.png
rename to docs/images/diagram_1.png
diff --git a/doc/images/diagram_2.png b/docs/images/diagram_2.png
similarity index 100%
rename from doc/images/diagram_2.png
rename to docs/images/diagram_2.png
diff --git a/doc/images/fair_rnsb.png b/docs/images/fair_rnsb.png
similarity index 100%
rename from doc/images/fair_rnsb.png
rename to docs/images/fair_rnsb.png
diff --git a/doc/images/glove_rnsb.png b/docs/images/glove_rnsb.png
similarity index 100%
rename from doc/images/glove_rnsb.png
rename to docs/images/glove_rnsb.png
diff --git a/docs/images/measurement_user_guide/output_40_0.png b/docs/images/measurement_user_guide/output_40_0.png
new file mode 100644
index 0000000..af82d58
Binary files /dev/null and b/docs/images/measurement_user_guide/output_40_0.png differ
diff --git a/docs/images/measurement_user_guide/output_43_0.png b/docs/images/measurement_user_guide/output_43_0.png
new file mode 100644
index 0000000..82a9e8d
Binary files /dev/null and b/docs/images/measurement_user_guide/output_43_0.png differ
diff --git a/docs/images/measurement_user_guide/output_46_0.png b/docs/images/measurement_user_guide/output_46_0.png
new file mode 100644
index 0000000..70e033b
Binary files /dev/null and b/docs/images/measurement_user_guide/output_46_0.png differ
diff --git a/docs/images/measurement_user_guide/output_60_0.png b/docs/images/measurement_user_guide/output_60_0.png
new file mode 100644
index 0000000..7cf8131
Binary files /dev/null and b/docs/images/measurement_user_guide/output_60_0.png differ
diff --git a/docs/images/measurement_user_guide/output_61_0.png b/docs/images/measurement_user_guide/output_61_0.png
new file mode 100644
index 0000000..d32910a
Binary files /dev/null and b/docs/images/measurement_user_guide/output_61_0.png differ
diff --git a/docs/images/measurement_user_guide/output_67_0.png b/docs/images/measurement_user_guide/output_67_0.png
new file mode 100644
index 0000000..d05ed99
Binary files /dev/null and b/docs/images/measurement_user_guide/output_67_0.png differ
diff --git a/doc/index.rst b/docs/index.rst
similarity index 90%
rename from doc/index.rst
rename to docs/index.rst
index 63791b4..35c2cec 100644
--- a/doc/index.rst
+++ b/docs/index.rst
@@ -21,8 +21,8 @@ the detailed API documentation and extensive examples.
:maxdepth: 2
:caption: User Guide
- user_guide/measurement.ipynb
- user_guide/mitigation.ipynb
+ user_guide/measurement_user_guide.rst
+ user_guide/mitigation_user_guide.rst
user_guide/loading_embeddings.rst
.. toctree::
diff --git a/doc/make.bat b/docs/make.bat
similarity index 100%
rename from doc/make.bat
rename to docs/make.bat
diff --git a/doc/user_guide/contribute.rst b/docs/user_guide/contribute.rst
similarity index 96%
rename from doc/user_guide/contribute.rst
rename to docs/user_guide/contribute.rst
index bb85420..f5956cd 100644
--- a/doc/user_guide/contribute.rst
+++ b/docs/user_guide/contribute.rst
@@ -314,7 +314,7 @@ Using the steps previously seen, a sample metric is implemented:
titlecase.
* ``strip_accents``: ``bool``, ``{'ascii', 'unicode'}``: Specifies that
the accents of the words are eliminated. The stripping type can be
- specified. True uses ‘unicode’ by default.
+ specified. True uses 'unicode' by default.
* ``preprocessor``: ``Callable``. It receives a function that operates
on each word. In the case of specifying a function, it overrides the
default preprocessor (i.e., the previous options stop working).
@@ -322,14 +322,14 @@ Using the steps previously seen, a sample metric is implemented:
A list of preprocessor options allows searching for several
variants of the words into the model. For example, the preprocessors
``[{}, {"lowercase": True, "strip_accents": True}]``
- ``{}`` allows first to search for the original words in the vocabulary of
+ ``{}`` allows searching first for the original words in the vocabulary of
the model. In case some of them are not found,
``{"lowercase": True, "strip_accents": True}`` is executed on these words
and then they are searched in the model vocabulary.
strategy : str, optional
The strategy indicates how it will use the preprocessed words: 'first' will
- include only the first transformed word found. all' will include all
+ include only the first transformed word found. 'all' will include all
transformed words found, by default "first".
normalize : bool, optional
@@ -527,7 +527,7 @@ the above.
titlecase.
* ``strip_accents``: ``bool``, ``{'ascii', 'unicode'}``: Specifies that
the accents of the words are eliminated. The stripping type can be
- specified. True uses ‘unicode’ by default.
+ specified. True uses 'unicode' by default.
* ``preprocessor``: ``Callable``. It receives a function that operates
on each word. In the case of specifying a function, it overrides the
default preprocessor (i.e., the previous options stop working).
@@ -535,13 +535,13 @@ the above.
A list of preprocessor options allows searching for several
variants of the words into the model. For example, the preprocessors
``[{}, {"lowercase": True, "strip_accents": True}]``
- ``{}`` allows first to search for the original words in the vocabulary of the model.
+ ``{}`` allows searching first for the original words in the vocabulary of the model.
In case some of them are not found, ``{"lowercase": True, "strip_accents": True}``
is executed on these words and then they are searched in the model vocabulary.
strategy : str, optional
The strategy indicates how it will use the preprocessed words: 'first' will
- include only the first transformed word found. all' will include all
+ include only the first transformed word found. 'all' will include all
transformed words found, by default "first".
normalize : bool, optional
diff --git a/doc/user_guide/loading_embeddings.rst b/docs/user_guide/loading_embeddings.rst
similarity index 89%
rename from doc/user_guide/loading_embeddings.rst
rename to docs/user_guide/loading_embeddings.rst
index d1ad52c..716fd5b 100644
--- a/doc/user_guide/loading_embeddings.rst
+++ b/docs/user_guide/loading_embeddings.rst
@@ -113,7 +113,7 @@ it can also be used via :code:`FastTextKeyedVectors`.
Flair
=====
-WEFE does not yet support flair interfaces.
+WEFE does not support flair interfaces.
However, you can use static embeddings of flair
(
`Classic Word Embeddings `_
@@ -121,13 +121,14 @@ However, you can use static embeddings of flair
The following code is an example of this:
>>> from flair.embeddings import WordEmbeddings
+>>> from wefe.utils import flair_to_gensim
>>>
->>> glove_embedding = WordEmbeddings('glove') # 100 dim glove
+>>> # could be any of the Classic Word Embeddings model list.
+>>> flair_model_name = "glove"
>>>
->>> # extract KeyedVectors object
->>> glove_keyed_vectors = glove_embedding.precomputed_word_embeddings
->>> glove_100 = WordEmbeddingModel(glove_keyed_vectors, 'glove-100')
+>>> flair_model = flair_to_gensim(WordEmbeddings(flair_model_name))
+>>> wefe_model = WordEmbeddingModel(flair_model, flair_model_name)
>>>
->>> result = weat.run_query(query, glove_100)
+>>> result = weat.run_query(query, wefe_model)
>>> print(result)
{'query_name': 'Male terms and Female terms wrt Career and Family', 'result': 1.0486683}
\ No newline at end of file
diff --git a/docs/user_guide/measurement_user_guide.rst b/docs/user_guide/measurement_user_guide.rst
new file mode 100644
index 0000000..c307069
--- /dev/null
+++ b/docs/user_guide/measurement_user_guide.rst
@@ -0,0 +1,1576 @@
+.. _bias measurement:
+
+Bias Measurement
+================
+
+The following guide is designed to present the more general details on
+using the package to measure bias. The following sections show:
+
+* how to run a simple query using ``Glove`` embedding model.
+* how to run multiple queries on multiple embeddings.
+* how to compare the results obtained from running multiple
+ sets of queries on multiple embeddings using different metrics
+ through ranking calculation.
+* how to calculate the correlations between the
+ rankings obtained.
+
+.. warning::
+
+ To accurately study and reduce biases contained in word embeddings, queries may
+ contain words that could be offensive to certain groups or individuals.
+ The relationships studied between these words DO NOT represent the
+ ideas, thoughts or beliefs of the authors of this library.
+ This warning applies to all documentation.
+
+.. note::
+
+ If you are not familiar with the concepts of query, target and attribute
+ set, please visit the :ref:`measurement framework`
+ on the library’s conceptual guides. These concepts are widely used in the
+ following sections.
+
+.. note::
+
+ For a list of metrics implemented in WEFE, refer to the
+ :ref:`metrics section` of the API reference.
+
+
+Run a Query
+-----------
+
+The following subsections explain how to run a simple query that
+measures gender bias on
+`Glove <https://nlp.stanford.edu/projects/glove/>`_.
+The example uses the Word Embedding Association Test (:class:`~wefe.metrics.WEAT.WEAT`)
+metric quantifying the bias in the embeddings model. Below we show the three usual
+steps for performing a query in WEFE:
+
+.. note::
+
+ :class:`~wefe.metrics.WEAT.WEAT` is a fairness metric that quantifies the relationship
+ between two sets of target words (sets of words intended to denote social
+ groups such as men and women) and two sets of attribute words (sets of words
+ representing some attitude, characteristic, trait, occupational field,
+ etc. that can be associated with individuals from any social group).
+
+ The closer its value is to 0, the less biased the model is.
+
+ Visit the metrics documentation (:class:`~wefe.metrics.WEAT.WEAT`) for more information.
+
+
+Load a word embeddings model as a ``WordEmbeddingModel`` object
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Load the word embedding model and then wrap it using a
+:class:`~wefe.word_embedding_model.WordEmbeddingModel` (class that allows WEFE to handle the models).
+
+WEFE bases all its operations on word embeddings using Gensim’s
+``KeyedVectors`` interface. Any model that can be loaded using
+``KeyedVectors`` will be compatible with WEFE. The following example uses a 25-dim pre-trained ``Glove`` model using a
+twitter dataset loaded using `gensim-data <https://github.com/RaRe-Technologies/gensim-data>`_.
+
+
+.. note::
+ Visit `gensim-data repository`_
+ to find the complete list of published pre-trained models ready to use.
+
+.. code:: ipython3
+
+ import gensim.downloader as api
+
+ from wefe.datasets import load_weat
+ from wefe.metrics import WEAT
+ from wefe.query import Query
+ from wefe.word_embedding_model import WordEmbeddingModel
+
+ twitter_25 = api.load("glove-twitter-25")
+ # WordEmbeddingModel receives as first argument a KeyedVectors model
+ # and the second argument the model name.
+ model = WordEmbeddingModel(twitter_25, "glove twitter dim=25")
+
+Create the query using a ``Query`` object
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Define the target and attribute word sets and create a :class:`~wefe.query.Query` object
+that contains them.
+
+For this initial example, a query is used to study the association
+between gender with respect to family and career. The words used are
+taken from the set of words used in the *Semantics derived automatically
+from language corpora contain human-like biases* paper, which are
+included in the ``datasets`` module.
+
+.. code:: ipython3
+
+ gender_query = Query(
+ target_sets=[
+ ["female", "woman", "girl", "sister", "she", "her", "hers", "daughter"],
+ ["male", "man", "boy", "brother", "he", "him", "his", "son"],
+ ],
+ attribute_sets=[
+ [
+ "home",
+ "parents",
+ "children",
+ "family",
+ "cousins",
+ "marriage",
+ "wedding",
+ "relatives",
+ ],
+ [
+ "executive",
+ "management",
+ "professional",
+ "corporation",
+ "salary",
+ "office",
+ "business",
+ "career",
+ ],
+ ],
+ target_sets_names=["Female terms", "Male Terms"],
+ attribute_sets_names=["Family", "Careers"],
+ )
+
+ gender_query
+
+
+
+
+
+.. parsed-literal::
+
+
+
+
+
+Run the Query
+~~~~~~~~~~~~~
+
+Instantiate the metric that you will use and then execute ``run_query``
+with the parameters created in the previous steps.
+
+Any bias measurement process at WEFE consists of the following steps:
+
+1. Metric arguments checking.
+2. Transform the word sets into word embeddings.
+3. Calculate the metric.
+
+In this case we use the :class:`~wefe.metrics.WEAT.WEAT` metric (proposed in the
+same paper as the word sets used in the query).
+
+.. code:: ipython3
+
+ metric = WEAT()
+ result = metric.run_query(gender_query, model)
+ result
+
+
+
+
+
+.. parsed-literal::
+
+ {'query_name': 'Female terms and Male Terms wrt Family and Careers',
+ 'result': 0.31658412935212255,
+ 'weat': 0.31658412935212255,
+ 'effect_size': 0.6779439085309583,
+ 'p_value': nan}
+
+
+
+By default, the results are a ``dict`` containing the query name (in the
+key ``query_name``) and the calculated value of the metric in the
+``result`` key. It also contains a key with the name and the value of
+the calculated metric (which is duplicated in the ``result`` key).
+
+Depending on the metric class used, the result ``dict`` can also return
+more metrics, detailed word-by-word values or other statistics like
+p-values. Also some metrics allow you to change the default value in
+results.
+
+Details of all the metrics implemented, their parameters and
+examples of execution can be found at :ref:`metrics section`.
+
+Run Query Arguments
+-------------------
+
+Each metric allows varying the behavior of ``run_query`` according to
+different parameters. There are parameters to customize the
+transformation of the sets of words to sets of embeddings, others to
+warn about errors or modify which calculation method the metric uses.
+
+.. note::
+
+ Each metric implements the ``run_query`` method with different arguments.
+ Visit their API documentation for more information.
+
+
+For example, ``run_query`` can be instructed to return ``effect_size``
+in the ``result`` key by setting ``return_effect_size`` to ``True``.
+Note that this parameter is only available in the :class:`~wefe.metrics.WEAT.WEAT` class.
+
+
+.. code:: ipython3
+
+ weat = WEAT()
+ result = weat.run_query(gender_query, model, return_effect_size=True)
+ result
+
+
+
+
+
+.. parsed-literal::
+
+ {'query_name': 'Female terms and Male Terms wrt Family and Careers',
+ 'result': 0.6779439085309583,
+ 'weat': 0.31658412935212255,
+ 'effect_size': 0.6779439085309583,
+ 'p_value': nan}
+
+
+
+You can also request ``run_query`` to run the statistical significance
+calculation by setting ``calculate_p_value`` as ``True``. This checks
+how many queries generated from permutations (controlled by the
+parameter ``p_value_iterations``) of the target sets obtain values
+greater than those obtained by the original query.
+
+.. code:: ipython3
+
+ weat = WEAT()
+ result = weat.run_query(
+ gender_query, model, calculate_p_value=True, p_value_iterations=5000
+ )
+ result
+
+
+
+
+.. parsed-literal::
+
+ {'query_name': 'Female terms and Male Terms wrt Family and Careers',
+ 'result': 0.31658412935212255,
+ 'weat': 0.31658412935212255,
+ 'effect_size': 0.6779439085309583,
+ 'p_value': 0.08418316336732654}
+
+
+
+Out of Vocabulary Words and Word Preprocessors
+----------------------------------------------
+
+It is common in the literature to find bias tests whose target sets are
+common names of social groups. These names are commonly cased and may
+contain special characters. There are several embedding models whose
+words are not cased or do not have accents or other special characters,
+as for example, in ``Glove``. This implies that a query with target sets
+composed by names executed in ``Glove`` (without any preprocessing of
+the words) could produce erroneous results because WEFE will not be able
+to find the names in the model vocabulary.
+
+.. note::
+
+ Some well-known word sets are already provided by the package and can be
+ easily loaded by the user through the :ref:`datasets ` module. From here on,
+ the tutorial uses the words defined in the study *Semantics derived
+ automatically from language corpora contain human-like biases*, the same
+ that proposed the :class:`~wefe.metrics.WEAT.WEAT` metric.
+
+
+.. code:: ipython3
+
+ # load the weat word sets.
+ word_sets = load_weat()
+
+ # print a set of european american common names.
+ print(word_sets["european_american_names_5"])
+
+
+.. parsed-literal::
+
+ ['Adam', 'Harry', 'Josh', 'Roger', 'Alan', 'Frank', 'Justin', 'Ryan', 'Andrew', 'Jack', 'Matthew', 'Stephen', 'Brad', 'Greg', 'Paul', 'Jonathan', 'Peter', 'Amanda', 'Courtney', 'Heather', 'Melanie', 'Sara', 'Amber', 'Katie', 'Betsy', 'Kristin', 'Nancy', 'Stephanie', 'Ellen', 'Lauren', 'Colleen', 'Emily', 'Megan', 'Rachel']
+
+
+The following query compares European-American and African-American
+names with respect to pleasant and unpleasant attributes.
+
+.. note::
+
+ ``run_query`` can be instructed to log the words that were lost in
+ the transformation to vectors by setting the parameter
+ ``warn_not_found_words`` to ``True``.
+
+.. code:: ipython3
+
+ ethnicity_query = Query(
+ [word_sets["european_american_names_5"], word_sets["african_american_names_5"]],
+ [word_sets["pleasant_5"], word_sets["unpleasant_5"]],
+ ["European american names", "African american names"],
+ ["Pleasant", "Unpleasant"],
+ )
+ result = weat.run_query(ethnicity_query, model, warn_not_found_words=True,)
+ result
+
+
+
+.. parsed-literal::
+
+ WARNING:root:The following words from set 'European american names' do not exist within the vocabulary of glove twitter dim=25: ['Adam', 'Harry', 'Josh', 'Roger', 'Alan', 'Frank', 'Justin', 'Ryan', 'Andrew', 'Jack', 'Matthew', 'Stephen', 'Brad', 'Greg', 'Paul', 'Jonathan', 'Peter', 'Amanda', 'Courtney', 'Heather', 'Melanie', 'Sara', 'Amber', 'Katie', 'Betsy', 'Kristin', 'Nancy', 'Stephanie', 'Ellen', 'Lauren', 'Colleen', 'Emily', 'Megan', 'Rachel']
+ WARNING:root:The transformation of 'European american names' into glove twitter dim=25 embeddings lost proportionally more words than specified in 'lost_words_threshold': 1.0 lost with respect to 0.2 maximum loss allowed.
+ WARNING:root:The following words from set 'African american names' do not exist within the vocabulary of glove twitter dim=25: ['Alonzo', 'Jamel', 'Theo', 'Alphonse', 'Jerome', 'Leroy', 'Torrance', 'Darnell', 'Lamar', 'Lionel', 'Tyree', 'Deion', 'Lamont', 'Malik', 'Terrence', 'Tyrone', 'Lavon', 'Marcellus', 'Wardell', 'Nichelle', 'Shereen', 'Ebony', 'Latisha', 'Shaniqua', 'Jasmine', 'Tanisha', 'Tia', 'Lakisha', 'Latoya', 'Yolanda', 'Malika', 'Yvette']
+ WARNING:root:The transformation of 'African american names' into glove twitter dim=25 embeddings lost proportionally more words than specified in 'lost_words_threshold': 1.0 lost with respect to 0.2 maximum loss allowed.
+ ERROR:root:At least one set of 'European american names and African american names wrt Pleasant and Unpleasant' query has proportionally fewer embeddings than allowed by the lost_vocabulary_threshold parameter (0.2). This query will return np.nan.
+
+
+
+
+.. parsed-literal::
+
+ {'query_name': 'European american names and African american names wrt Pleasant and Unpleasant',
+ 'result': nan,
+ 'weat': nan,
+ 'effect_size': nan}
+
+
+
+.. warning::
+
+ If more than 20% of the words from any of the word sets of the query are
+ lost during the transformation to embeddings, the result of the metric
+ will be ``np.nan``. This behavior can be changed using a float number
+ parameter called ``lost_vocabulary_threshold``.
+
+Word Preprocessors
+~~~~~~~~~~~~~~~~~~
+
+Any ``run_query`` method allows preprocessing each word before they are searched in the model's
+vocabulary through the parameter ``preprocessors`` (list of one or more preprocessors).
+This parameter accepts a list of individual preprocessors, which are defined below:
+
+A ``preprocessor`` is a dictionary that specifies what processing(s) are
+performed on each word before it is looked up in the model vocabulary.
+For example, the ``preprocessor``
+``{'lowercase': True, 'strip_accents': True}`` allows you to lowercase
+and remove the accent from each word before searching for them in the
+model vocabulary. Note that an empty dictionary ``{}`` indicates that no
+preprocessing is done.
+
+The possible options for a preprocessor are:
+
+- ``lowercase``: ``bool``. Indicates that the words are transformed to lowercase.
+- ``uppercase``: ``bool``. Indicates that the words are transformed to uppercase.
+- ``titlecase``: ``bool``. Indicates that the words are transformed to titlecase.
+- ``strip_accents``: ``bool``, ``{'ascii', 'unicode'}``: Specifies that the accents of the words
+ are eliminated. The stripping type can be specified. True uses 'unicode' by default.
+- ``preprocessor``: ``Callable``. It receives a function that operates on each word.
+ In the case of specifying a function, it overrides the default preprocessor
+ (i.e., the previous options stop working).
+
+
+A list of preprocessor options allows searching for several
+variants of the words into the model. For example, the preprocessors
+``[{}, {"lowercase": True, "strip_accents": True}]``
+``{}`` allows searching first for the original words in the vocabulary of the model.
+In case some of them are not found, ``{"lowercase": True, "strip_accents": True}``
+is executed on these words and then they are searched in the model vocabulary.
+
+By default (in case there is more than one preprocessor in the list) the first
+preprocessed word found in the embeddings model is used.
+This behavior can be controlled by the ``strategy`` parameter of ``run_query``.
+
+In the following example, we provide a list with only one
+preprocessor that instructs ``run_query`` to lowercase and remove all
+accents from every word before they are searched in the embeddings
+model.
+
+
+.. code:: ipython3
+
+ weat = WEAT()
+ result = weat.run_query(
+ ethnicity_query,
+ model,
+ preprocessors=[{"lowercase": True, "strip_accents": True}],
+ warn_not_found_words=True,
+ )
+ result
+
+
+.. parsed-literal::
+
+ WARNING:root:The following words from set 'African american names' do not exist within the vocabulary of glove twitter dim=25: ['wardell']
+
+
+
+
+.. parsed-literal::
+
+ {'query_name': 'European american names and African american names wrt Pleasant and Unpleasant',
+ 'result': 3.7529150679125456,
+ 'weat': 3.7529150679125456,
+ 'effect_size': 1.2746819330405683,
+ 'p_value': nan}
+
+
+
+It may happen that it is more important to find the original word and in
+the case of not finding it, then preprocess it and look it up in the
+vocabulary. This behavior can be specified in ``preprocessors`` list by
+first specifying an empty preprocessor ``{}`` and then the preprocessor
+that converts to lowercase and removes accents.
+
+
+.. code:: ipython3
+
+ weat = WEAT()
+ result = weat.run_query(
+ ethnicity_query,
+ model,
+ preprocessors=[
+ {}, # empty preprocessor, search for the original words.
+ {
+ "lowercase": True,
+ "strip_accents": True,
+ }, # search for lowercase and no accent words.
+ ],
+ warn_not_found_words=True,
+ )
+
+ result
+
+
+.. parsed-literal::
+
+ WARNING:root:The following words from set 'European american names' do not exist within the vocabulary of glove twitter dim=25: ['Adam', 'Harry', 'Josh', 'Roger', 'Alan', 'Frank', 'Justin', 'Ryan', 'Andrew', 'Jack', 'Matthew', 'Stephen', 'Brad', 'Greg', 'Paul', 'Jonathan', 'Peter', 'Amanda', 'Courtney', 'Heather', 'Melanie', 'Sara', 'Amber', 'Katie', 'Betsy', 'Kristin', 'Nancy', 'Stephanie', 'Ellen', 'Lauren', 'Colleen', 'Emily', 'Megan', 'Rachel']
+ WARNING:root:The following words from set 'African american names' do not exist within the vocabulary of glove twitter dim=25: ['Alonzo', 'Jamel', 'Theo', 'Alphonse', 'Jerome', 'Leroy', 'Torrance', 'Darnell', 'Lamar', 'Lionel', 'Tyree', 'Deion', 'Lamont', 'Malik', 'Terrence', 'Tyrone', 'Lavon', 'Marcellus', 'Wardell', 'wardell', 'Nichelle', 'Shereen', 'Ebony', 'Latisha', 'Shaniqua', 'Jasmine', 'Tanisha', 'Tia', 'Lakisha', 'Latoya', 'Yolanda', 'Malika', 'Yvette']
+
+
+
+
+.. parsed-literal::
+
+ {'query_name': 'European american names and African american names wrt Pleasant and Unpleasant',
+ 'result': 3.7529150679125456,
+ 'weat': 3.7529150679125456,
+ 'effect_size': 1.2746819330405683,
+ 'p_value': nan}
+
+
+
+The number of preprocessing steps can be increased as needed. For
+example, we can extend the above list of preprocessors to first search
+for the original words, then for the lowercase words, and finally for
+the lowercase words without accents.
+
+
+.. code:: ipython3
+
+ weat = WEAT()
+ result = weat.run_query(
+ ethnicity_query,
+ model,
+ preprocessors=[
+ {}, # first step: empty preprocessor, search for the original words.
+ {"lowercase": True,}, # second step: search for lowercase.
+ {
+ "lowercase": True,
+ "strip_accents": True,
+ }, # third step: search for lowercase and no accent words.
+ ],
+ warn_not_found_words=True,
+ )
+
+ result
+
+
+.. parsed-literal::
+
+ WARNING:root:The following words from set 'European american names' do not exist within the vocabulary of glove twitter dim=25: ['Adam', 'Harry', 'Josh', 'Roger', 'Alan', 'Frank', 'Justin', 'Ryan', 'Andrew', 'Jack', 'Matthew', 'Stephen', 'Brad', 'Greg', 'Paul', 'Jonathan', 'Peter', 'Amanda', 'Courtney', 'Heather', 'Melanie', 'Sara', 'Amber', 'Katie', 'Betsy', 'Kristin', 'Nancy', 'Stephanie', 'Ellen', 'Lauren', 'Colleen', 'Emily', 'Megan', 'Rachel']
+ WARNING:root:The following words from set 'African american names' do not exist within the vocabulary of glove twitter dim=25: ['Alonzo', 'Jamel', 'Theo', 'Alphonse', 'Jerome', 'Leroy', 'Torrance', 'Darnell', 'Lamar', 'Lionel', 'Tyree', 'Deion', 'Lamont', 'Malik', 'Terrence', 'Tyrone', 'Lavon', 'Marcellus', 'Wardell', 'wardell', 'wardell', 'Nichelle', 'Shereen', 'Ebony', 'Latisha', 'Shaniqua', 'Jasmine', 'Tanisha', 'Tia', 'Lakisha', 'Latoya', 'Yolanda', 'Malika', 'Yvette']
+
+
+
+
+.. parsed-literal::
+
+ {'query_name': 'European american names and African american names wrt Pleasant and Unpleasant',
+ 'result': 3.7529150679125456,
+ 'weat': 3.7529150679125456,
+ 'effect_size': 1.2746819330405683,
+ 'p_value': nan}
+
+
+
+It is also possible to change the behavior of the search so that it
+includes not only the first word found, but all the words generated by
+the preprocessors. This can be controlled by specifying the parameter
+``strategy="all"``.
+
+.. code:: ipython3
+
+ weat = WEAT()
+ result = weat.run_query(
+ ethnicity_query,
+ model,
+ preprocessors=[
+ {}, # first step: empty preprocessor, search for the original words.
+ {"lowercase": True,}, # second step: search for lowercase .
+ {"uppercase": True,}, # third step: search for uppercase.
+ ],
+ strategy="all",
+ warn_not_found_words=True,
+ )
+
+ result
+
+
+
+.. parsed-literal::
+
+ WARNING:root:The following words from set 'European american names' do not exist within the vocabulary of glove twitter dim=25: ['Adam', 'ADAM', 'Harry', 'HARRY', 'Josh', 'JOSH', 'Roger', 'ROGER', 'Alan', 'ALAN', 'Frank', 'FRANK', 'Justin', 'JUSTIN', 'Ryan', 'RYAN', 'Andrew', 'ANDREW', 'Jack', 'JACK', 'Matthew', 'MATTHEW', 'Stephen', 'STEPHEN', 'Brad', 'BRAD', 'Greg', 'GREG', 'Paul', 'PAUL', 'Jonathan', 'JONATHAN', 'Peter', 'PETER', 'Amanda', 'AMANDA', 'Courtney', 'COURTNEY', 'Heather', 'HEATHER', 'Melanie', 'MELANIE', 'Sara', 'SARA', 'Amber', 'AMBER', 'Katie', 'KATIE', 'Betsy', 'BETSY', 'Kristin', 'KRISTIN', 'Nancy', 'NANCY', 'Stephanie', 'STEPHANIE', 'Ellen', 'ELLEN', 'Lauren', 'LAUREN', 'Colleen', 'COLLEEN', 'Emily', 'EMILY', 'Megan', 'MEGAN', 'Rachel', 'RACHEL']
+ WARNING:root:The following words from set 'African american names' do not exist within the vocabulary of glove twitter dim=25: ['Alonzo', 'ALONZO', 'Jamel', 'JAMEL', 'Theo', 'THEO', 'Alphonse', 'ALPHONSE', 'Jerome', 'JEROME', 'Leroy', 'LEROY', 'Torrance', 'TORRANCE', 'Darnell', 'DARNELL', 'Lamar', 'LAMAR', 'Lionel', 'LIONEL', 'Tyree', 'TYREE', 'Deion', 'DEION', 'Lamont', 'LAMONT', 'Malik', 'MALIK', 'Terrence', 'TERRENCE', 'Tyrone', 'TYRONE', 'Lavon', 'LAVON', 'Marcellus', 'MARCELLUS', 'Wardell', 'wardell', 'WARDELL', 'Nichelle', 'NICHELLE', 'Shereen', 'SHEREEN', 'Ebony', 'EBONY', 'Latisha', 'LATISHA', 'Shaniqua', 'SHANIQUA', 'Jasmine', 'JASMINE', 'Tanisha', 'TANISHA', 'Tia', 'TIA', 'Lakisha', 'LAKISHA', 'Latoya', 'LATOYA', 'Yolanda', 'YOLANDA', 'Malika', 'MALIKA', 'Yvette', 'YVETTE']
+ WARNING:root:The following words from set 'Pleasant' do not exist within the vocabulary of glove twitter dim=25: ['CARESS', 'FREEDOM', 'HEALTH', 'LOVE', 'PEACE', 'CHEER', 'FRIEND', 'HEAVEN', 'LOYAL', 'PLEASURE', 'DIAMOND', 'GENTLE', 'HONEST', 'LUCKY', 'RAINBOW', 'DIPLOMA', 'GIFT', 'HONOR', 'MIRACLE', 'SUNRISE', 'FAMILY', 'HAPPY', 'LAUGHTER', 'PARADISE', 'VACATION']
+ WARNING:root:The following words from set 'Unpleasant' do not exist within the vocabulary of glove twitter dim=25: ['ABUSE', 'CRASH', 'FILTH', 'MURDER', 'SICKNESS', 'ACCIDENT', 'DEATH', 'GRIEF', 'POISON', 'STINK', 'ASSAULT', 'DISASTER', 'HATRED', 'POLLUTE', 'TRAGEDY', 'DIVORCE', 'JAIL', 'POVERTY', 'UGLY', 'CANCER', 'KILL', 'ROTTEN', 'VOMIT', 'AGONY', 'PRISON']
+
+
+
+
+.. parsed-literal::
+
+ {'query_name': 'European american names and African american names wrt Pleasant and Unpleasant',
+ 'result': 3.7529150679125456,
+ 'weat': 3.7529150679125456,
+ 'effect_size': 1.2746819330405683,
+ 'p_value': nan}
+
+
+
+Running Multiple Queries
+------------------------
+
+It is common to want to test many queries of some bias criterion (gender,
+ethnicity, religion, politics, socioeconomic, among others) on several
+models at the same time. Calling ``run_query`` on each embedding-query
+pair can be cumbersome and could require extra work to implement.
+
+This is why WEFE also implements a function to test multiple
+queries on various word embedding models in a single call: the
+:func:`~wefe.utils.run_queries` util.
+
+The following code shows how to run various gender queries on ``Glove``
+embedding models with different dimensions trained from the Twitter
+dataset. The queries are executed using the :class:`~wefe.metrics.WEAT.WEAT` metric.
+
+.. code:: ipython3
+
+ import gensim.downloader as api
+
+ from wefe.datasets import load_weat
+ from wefe.metrics import RNSB, WEAT
+ from wefe.query import Query
+ from wefe.utils import run_queries
+ from wefe.word_embedding_model import WordEmbeddingModel
+
+Load the models
+~~~~~~~~~~~~~~~
+
+Load three different Glove Twitter embedding models. These models were
+trained using the same dataset varying the number of embedding
+dimensions.
+
+.. code:: ipython3
+
+ model_1 = WordEmbeddingModel(api.load("glove-twitter-25"), "glove twitter dim=25")
+ model_2 = WordEmbeddingModel(api.load("glove-twitter-50"), "glove twitter dim=50")
+ model_3 = WordEmbeddingModel(api.load("glove-twitter-100"), "glove twitter dim=100")
+
+ models = [model_1, model_2, model_3]
+
+
+
+Load the word sets and create the queries
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Now, we load the :class:`~wefe.metrics.WEAT.WEAT` word sets and create three queries. The
+three queries are intended to measure gender bias.
+
+
+.. code:: ipython3
+
+ # Load the WEAT word sets
+ word_sets = load_weat()
+
+ # Create gender queries
+ gender_query_1 = Query(
+ [word_sets["male_terms"], word_sets["female_terms"]],
+ [word_sets["career"], word_sets["family"]],
+ ["Male terms", "Female terms"],
+ ["Career", "Family"],
+ )
+
+ gender_query_2 = Query(
+ [word_sets["male_terms"], word_sets["female_terms"]],
+ [word_sets["science"], word_sets["arts"]],
+ ["Male terms", "Female terms"],
+ ["Science", "Arts"],
+ )
+
+ gender_query_3 = Query(
+ [word_sets["male_terms"], word_sets["female_terms"]],
+ [word_sets["math"], word_sets["arts_2"]],
+ ["Male terms", "Female terms"],
+ ["Math", "Arts"],
+ )
+
+ gender_queries = [gender_query_1, gender_query_2, gender_query_3]
+
+
+Run the queries on all Word Embeddings using WEAT
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To run the list of queries and models, we call :func:`~wefe.utils.run_queries` using the
+parameters defined in the previous step. The function has three
+mandatory parameters:
+
+- a metric,
+- a list of queries, and,
+- a list of embedding models.
+
+It is also possible to provide a name for the criterion studied in this
+set of queries through the parameter ``queries_set_name``.
+
+
+.. code:: ipython3
+
+ WEAT_gender_results = run_queries(
+ WEAT, gender_queries, models, queries_set_name="Gender Queries"
+ )
+ WEAT_gender_results
+
+
+
+.. parsed-literal::
+
+ WARNING:root:The transformation of 'Science' into glove twitter dim=25 embeddings lost proportionally more words than specified in 'lost_words_threshold': 0.25 lost with respect to 0.2 maximum loss allowed.
+ ERROR:root:At least one set of 'Male terms and Female terms wrt Science and Arts' query has proportionally fewer embeddings than allowed by the lost_vocabulary_threshold parameter (0.2). This query will return np.nan.
+ WARNING:root:The transformation of 'Science' into glove twitter dim=50 embeddings lost proportionally more words than specified in 'lost_words_threshold': 0.25 lost with respect to 0.2 maximum loss allowed.
+ ERROR:root:At least one set of 'Male terms and Female terms wrt Science and Arts' query has proportionally fewer embeddings than allowed by the lost_vocabulary_threshold parameter (0.2). This query will return np.nan.
+ WARNING:root:The transformation of 'Science' into glove twitter dim=100 embeddings lost proportionally more words than specified in 'lost_words_threshold': 0.25 lost with respect to 0.2 maximum loss allowed.
+ ERROR:root:At least one set of 'Male terms and Female terms wrt Science and Arts' query has proportionally fewer embeddings than allowed by the lost_vocabulary_threshold parameter (0.2). This query will return np.nan.
+
+
+
+
+.. raw:: html
+
+
+
+
+
+
+
query_name
+
Male terms and Female terms wrt Career and Family
+
Male terms and Female terms wrt Science and Arts
+
Male terms and Female terms wrt Math and Arts
+
+
+
model_name
+
+
+
+
+
+
+
+
glove twitter dim=25
+
0.316584
+
NaN
+
-0.022133
+
+
+
glove twitter dim=50
+
0.363743
+
NaN
+
-0.272334
+
+
+
glove twitter dim=100
+
0.385352
+
NaN
+
-0.082544
+
+
+
+
+
+
+
+Setting metric params
+~~~~~~~~~~~~~~~~~~~~~
+
+There is a whole column that has no results. As the warnings point out,
+when transforming the words of the sets into embeddings, the proportion
+of lost words is greater than that allowed by the parameter
+``lost_vocabulary_threshold``. In this case, it would be very useful to
+use the word preprocessors seen above.
+
+:func:`~wefe.utils.run_queries` accepts specific parameters for each metric. These extra
+parameters for the metric can be passed through the ``metric_params``
+parameter. In this case, a preprocessor is provided through the
+``preprocessors`` key to lowercase the words before searching for them
+in the models’ vocabularies.
+
+
+.. code:: ipython3
+
+ WEAT_gender_results = run_queries(
+ WEAT,
+ gender_queries,
+ models,
+ metric_params={"preprocessors": [{"lowercase": True}]},
+ queries_set_name="Gender Queries",
+ )
+
+ WEAT_gender_results
+
+
+
+
+.. raw:: html
+
+
+
+
+
+
+
query_name
+
Male terms and Female terms wrt Career and Family
+
Male terms and Female terms wrt Science and Arts
+
Male terms and Female terms wrt Math and Arts
+
+
+
model_name
+
+
+
+
+
+
+
+
glove twitter dim=25
+
0.316584
+
0.167431
+
-0.033912
+
+
+
glove twitter dim=50
+
0.363743
+
-0.084690
+
-0.307589
+
+
+
glove twitter dim=100
+
0.385352
+
0.099632
+
-0.155790
+
+
+
+
+
+
+
+No query was null in these results.
+
+
+Plot the results in a barplot
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The library also provides an easy way to plot the results obtained from
+a ``run_queries`` execution into a `plotly <https://plotly.com/>`_ barplot.
+
+.. code:: ipython3
+
+ from wefe.utils import plot_queries_results, run_queries
+
+ # Plot the results
+ plot_queries_results(WEAT_gender_results).show()
+
+
+
+
+.. image:: ../images/measurement_user_guide/output_40_0.png
+
+
+Aggregating Results
+-------------------
+
+The execution of :func:`~wefe.utils.run_queries` provided many results evaluating the
+gender bias in the tested embeddings. However, these results alone do
+not comprehensively report the biases observed in all of these queries.
+One way to obtain an overall view of bias is by aggregating results by
+model.
+
+For WEAT, a simple way to aggregate the results is to average their
+absolute values. When running :func:`~wefe.utils.run_queries`, it is possible to specify
+that the results be aggregated by model by setting ``aggregate_results``
+to ``True``.
+
+The aggregation function can be specified through the
+``aggregation_function`` parameter. This parameter accepts a list of
+predefined aggregations as well as a custom function that operates on
+the results dataframe. The aggregation functions available are:
+
+- Average ``avg``.
+- Average of the absolute values ``abs_avg``.
+- Sum ``sum``.
+- Sum of the absolute values, ``abs_sum``.
+
+.. note::
+
+    Notice that some functions are more appropriate for certain metrics. For
+    metrics returning only positive numbers, all the previous aggregation
+    functions would be OK. In contrast, for metrics that return real values
+    (e.g., :class:`~wefe.metrics.WEAT.WEAT`, :class:`~wefe.metrics.RND.RND`, etc.),
+    aggregation functions such as sum would make positive and negative outputs cancel
+    each other out.
+
+.. code:: ipython3
+
+ WEAT_gender_results_agg = run_queries(
+ WEAT,
+ gender_queries,
+ models,
+ metric_params={"preprocessors": [{"lowercase": True}]},
+ aggregate_results=True,
+ aggregation_function="abs_avg",
+ queries_set_name="Gender Queries",
+ )
+ WEAT_gender_results_agg
+
+
+
+
+
+.. raw:: html
+
+
+
+
+
+
+
+
Male terms and Female terms wrt Career and Family
+
Male terms and Female terms wrt Science and Arts
+
Male terms and Female terms wrt Math and Arts
+
WEAT: Gender Queries average of abs values score
+
+
+
model_name
+
+
+
+
+
+
+
+
+
glove twitter dim=25
+
0.316584
+
0.167431
+
-0.033912
+
0.172642
+
+
+
glove twitter dim=50
+
0.363743
+
-0.084690
+
-0.307589
+
0.252007
+
+
+
glove twitter dim=100
+
0.385352
+
0.099632
+
-0.155790
+
0.213591
+
+
+
+
+
+
+
+.. code:: ipython3
+
+ plot_queries_results(WEAT_gender_results_agg).show()
+
+
+
+
+.. image:: ../images/measurement_user_guide/output_43_0.png
+
+
+It is also possible to ask the function to return only the aggregated
+results using the parameter ``return_only_aggregation``.
+
+
+.. code:: ipython3
+
+ WEAT_gender_results_only_agg = run_queries(
+ WEAT,
+ gender_queries,
+ models,
+ metric_params={"preprocessors": [{"lowercase": True}]},
+ aggregate_results=True,
+ aggregation_function="abs_avg",
+ return_only_aggregation=True,
+ queries_set_name="Gender Queries",
+ )
+ WEAT_gender_results_only_agg
+
+
+
+
+
+.. raw:: html
+
+
+
+
+
+
+
+
WEAT: Gender Queries average of abs values score
+
+
+
model_name
+
+
+
+
+
+
glove twitter dim=25
+
0.172642
+
+
+
glove twitter dim=50
+
0.252007
+
+
+
glove twitter dim=100
+
0.213591
+
+
+
+
+
+
+
+.. code:: ipython3
+
+ fig = plot_queries_results(WEAT_gender_results_only_agg)
+ fig.show()
+
+
+
+
+.. image:: ../images/measurement_user_guide/output_46_0.png
+
+
+Model Ranking
+-------------
+
+It may be desirable to obtain an overall view of the bias by model using
+different metrics or bias criteria. While the aggregate values can be
+compared directly, two problems are likely to be encountered:
+
+1. One type of bias criterion can dominate the other because of
+ significant differences in magnitude.
+
+2. Different metrics can operate on different scales, which makes them
+ difficult to compare.
+
+To show these problems, suppose we have:
+
+- Two sets of queries: one that explores gender biases and
+ another that explores ethnicity biases.
+- Three ``Glove`` models of 25, 50 and 100 dimensions trained on the same
+ twitter dataset.
+
+Then we run :func:`~wefe.utils.run_queries` on this set of model-queries using
+:class:`~wefe.metrics.WEAT.WEAT`, and to corroborate the results obtained, we also use
+Relative Negative Sentiment Bias (:class:`~wefe.metrics.RNSB.RNSB`).
+
+1. The first problem occurs when the bias scores obtained from one set
+ of queries are much higher than those from the other set, even when
+ the same metric is used.
+
+When executing :func:`~wefe.utils.run_queries` with the gender and ethnicity queries on
+the models described above, the results obtained are as follows:
+
+
++--------------+---------------------------+---------------------------+
+| model_name | WEAT: Gender Queries | WEAT: Ethnicity Queries |
+| | average of abs values | average of abs values |
+| | score | score |
++==============+===========================+===========================+
+| glove | 0.210556 | 2.64632 |
+| twitter | | |
+| dim=25 | | |
++--------------+---------------------------+---------------------------+
+| glove | 0.292373 | 1.87431 |
+| twitter | | |
+| dim=50 | | |
++--------------+---------------------------+---------------------------+
+| glove | 0.225116 | 1.78469 |
+| twitter | | |
+| dim=100 | | |
++--------------+---------------------------+---------------------------+
+
+As can be seen, the results of ethnicity bias are much greater than
+those of gender.
+
+2. The second problem is when different metrics return results on
+ different scales of magnitude.
+
+When executing :func:`~wefe.utils.run_queries` with the gender queries and models
+described above using both WEAT and RNSB, the results obtained are as
+follows:
+
++--------------+---------------------------+---------------------------+
+| model_name | WEAT: Gender Queries | RNSB: Gender Queries |
+| | average of abs values | average of abs values |
+| | score | score |
++==============+===========================+===========================+
+| glove | 0.210556 | 0.032673 |
+| twitter | | |
+| dim=25 | | |
++--------------+---------------------------+---------------------------+
+| glove | 0.292373 | 0.049429 |
+| twitter | | |
+| dim=50 | | |
++--------------+---------------------------+---------------------------+
+| glove | 0.225116 | 0.0312772 |
+| twitter | | |
+| dim=100 | | |
++--------------+---------------------------+---------------------------+
+
+
+We can see that the results of the two metrics differ by an order of
+magnitude.
+
+One solution to this problem is to create **rankings**. Rankings focus on the relative
+differences reported by the metrics (for different models) instead of focusing on the
+absolute values.
+
+The following guide shows how to create rankings that evaluate
+gender and ethnicity bias.
+
+
+Gender Bias Model Ranking
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code:: ipython3
+
+ # define the queries
+ gender_query_1 = Query(
+ [word_sets["male_terms"], word_sets["female_terms"]],
+ [word_sets["career"], word_sets["family"]],
+ ["Male terms", "Female terms"],
+ ["Career", "Family"],
+ )
+ gender_query_2 = Query(
+ [word_sets["male_terms"], word_sets["female_terms"]],
+ [word_sets["science"], word_sets["arts"]],
+ ["Male terms", "Female terms"],
+ ["Science", "Arts"],
+ )
+ gender_query_3 = Query(
+ [word_sets["male_terms"], word_sets["female_terms"]],
+ [word_sets["math"], word_sets["arts_2"]],
+ ["Male terms", "Female terms"],
+ ["Math", "Arts"],
+ )
+
+ gender_queries = [gender_query_1, gender_query_2, gender_query_3]
+
+ # run the queries using WEAT
+ WEAT_gender_results = run_queries(
+ WEAT,
+ gender_queries,
+ models,
+ metric_params={"preprocessors": [{"lowercase": True}]},
+ aggregate_results=True,
+ return_only_aggregation=True,
+ queries_set_name="Gender Queries",
+ )
+
+ # run the queries using WEAT effect size
+ WEAT_EZ_gender_results = run_queries(
+ WEAT,
+ gender_queries,
+ models,
+ metric_params={"preprocessors": [{"lowercase": True}], "return_effect_size": True,},
+ aggregate_results=True,
+ return_only_aggregation=True,
+ queries_set_name="Gender Queries",
+ )
+
+ # run the queries using RNSB
+ RNSB_gender_results = run_queries(
+ RNSB,
+ gender_queries,
+ models,
+ metric_params={"preprocessors": [{"lowercase": True}]},
+ aggregate_results=True,
+ return_only_aggregation=True,
+ queries_set_name="Gender Queries",
+ )
+
+The rankings can be calculated by means of the :func:`~wefe.utils.create_ranking`
+function. This function receives as input results from running
+:func:`~wefe.utils.run_queries` and assumes that the last column contains the aggregated
+values.
+
+.. code:: ipython3
+
+ from wefe.utils import create_ranking
+
+ # create the ranking
+ gender_ranking = create_ranking(
+ [WEAT_gender_results, WEAT_EZ_gender_results, RNSB_gender_results]
+ )
+
+ gender_ranking
+
+
+
+
+.. raw:: html
+
+
+
+
+
+
+
+
WEAT: Gender Queries average of abs values score (1)
+
WEAT: Gender Queries average of abs values score (2)
WEAT: Ethnicity Queries average of abs values score
+
WEAT: Gender Queries average of abs values score
+
RNSB: Ethnicity Queries average of abs values score
+
+
+
model_name
+
+
+
+
+
+
+
+
glove twitter dim=25
+
3.0
+
1.0
+
3.0
+
+
+
glove twitter dim=50
+
2.0
+
2.0
+
2.0
+
+
+
glove twitter dim=100
+
1.0
+
3.0
+
1.0
+
+
+
+
+
+
+
+Plotting the rankings
+~~~~~~~~~~~~~~~~~~~~~
+
+It is possible to graph the rankings in barplots using the
+:func:`~wefe.utils.plot_ranking` function. The generated figure shows the accumulated
+rankings for each embedding model. Each bar represents the sum of the
+rankings obtained by each embedding. Each color within a bar represents
+a different criterion-metric ranking.
+
+.. code:: ipython3
+
+ from wefe.utils import plot_ranking
+
+ fig = plot_ranking(gender_ranking)
+ fig.show()
+
+
+
+.. image:: ../images/measurement_user_guide/output_60_0.png
+
+
+.. code:: ipython3
+
+ fig = plot_ranking(ethnicity_ranking)
+ fig.show()
+
+
+
+.. image:: ../images/measurement_user_guide/output_61_0.png
+
+
+Correlating Rankings
+~~~~~~~~~~~~~~~~~~~~
+
+Having obtained rankings by metric for each embedding, it would be
+ideal to see and analyze the degree of agreement between them.
+
+A high concordance between the rankings allows us to state with some certainty that
+all metrics evaluated the embedding models in a similar way and, therefore,
+that the ordering of embeddings by calculated bias makes sense.
+On the other hand, a low degree of agreement shows the opposite: the rankings do not
+allow us to clearly establish which embedding is less biased than another.
+
+The level of concordance of the rankings can be evaluated by calculating
+correlations. WEFE provides :func:`~wefe.utils.calculate_ranking_correlations` to
+calculate the correlations between rankings.
+
+.. code:: ipython3
+
+ from wefe.utils import calculate_ranking_correlations, plot_ranking_correlations
+
+ correlations = calculate_ranking_correlations(gender_ranking)
+ correlations
+
+
+
+
+
+.. raw:: html
+
+
+
+
+
+
+
+
WEAT: Gender Queries average of abs values score (1)
+
WEAT: Gender Queries average of abs values score (2)
+
RNSB: Gender Queries average of abs values score
+
+
+
+
+
WEAT: Gender Queries average of abs values score (1)
+
1.0
+
0.5
+
-1.0
+
+
+
WEAT: Gender Queries average of abs values score (2)
+
0.5
+
1.0
+
-0.5
+
+
+
RNSB: Gender Queries average of abs values score
+
-1.0
+
-0.5
+
1.0
+
+
+
+
+
+
+
+.. note::
+
+    ``calculate_ranking_correlations`` uses the ``corr()`` ``pandas``
+    dataframe method. The type of correlation that is calculated can be changed
+    through the ``method`` parameter. The available options are:
+    ``'pearson'``, ``'spearman'``, ``'kendall'``. By default, the Spearman
+    correlation is calculated.
+
+In this example, Kendall’s correlation is used.
+
+.. code:: ipython3
+
+ calculate_ranking_correlations(gender_ranking, method="kendall")
+
+
+
+
+
+.. raw:: html
+
+
+
+
+
+
+
+
WEAT: Gender Queries average of abs values score (1)
+
WEAT: Gender Queries average of abs values score (2)
+
RNSB: Gender Queries average of abs values score
+
+
+
+
+
WEAT: Gender Queries average of abs values score (1)
+
1.000000
+
0.333333
+
-1.000000
+
+
+
WEAT: Gender Queries average of abs values score (2)
+
0.333333
+
1.000000
+
-0.333333
+
+
+
RNSB: Gender Queries average of abs values score
+
-1.000000
+
-0.333333
+
1.000000
+
+
+
+
+
+
+
+WEFE also provides a function for graphing the correlations:
+
+
+.. code:: ipython3
+
+ correlation_fig = plot_ranking_correlations(correlations)
+ correlation_fig.show()
+
+
+
+
+.. image:: ../images/measurement_user_guide/output_67_0.png
+
+
+In this case, only two of the three rankings show similar results.
+
diff --git a/docs/user_guide/mitigation_user_guide.rst b/docs/user_guide/mitigation_user_guide.rst
new file mode 100644
index 0000000..7d3e752
--- /dev/null
+++ b/docs/user_guide/mitigation_user_guide.rst
@@ -0,0 +1,602 @@
+.. _bias mitigation:
+
+Bias Mitigation (Debias)
+========================
+
+The following guide is designed to present the more general details on
+using the package to mitigate (debias) bias in word embedding models.
+The following sections show how to:
+
+- run :class:`~wefe.debias.hard_debias.HardDebias` mitigation method on an
+ embedding model to mitigate gender bias (using the ``fit-transform`` interface).
+- apply the ``target`` parameter when executing the transformation.
+- apply the ``ignore`` parameter when executing the transformation.
+- apply the ``copy`` parameter when executing the transformation.
+- run :class:`~wefe.debias.multiclass_hard_debias.MulticlassHardDebias` mitigation
+ method on a word embedding model to mitigate ethnic bias.
+
+.. note::
+
+ For a list of debias methods implemented in WEFE, refer to the
+ :ref:`debias-API` of the API reference.
+
+.. note::
+
+ If you want to know more about WEFE's standardization of debias methods,
+ visit :ref:`mitigation framework` in the conceptual guides.
+
+
+Hard Debias
+-----------
+
+
+Hard debias is a method that allows mitigating biases through geometric operations
+on embeddings.
+This method is binary because it only allows 2 classes of the same bias criterion,
+such as male or female.
+
+.. note::
+
+ For a multiclass debias (such as for Latinos, Asians and Whites), it is
+ recommended to visit
+ :class:`~wefe.debias.multiclass_hard_debias.MulticlassHardDebias` class.
+
+
+The main idea of this method is:
+
+1. Identify a bias subspace through the defining sets. In the case of gender,
+   these could be e.g. ``[['woman', 'man'], ['she', 'he'], ...]``
+
+2. Neutralize the bias subspace of embeddings that should not be biased.
+
+First, we define a set of words that are legitimately related to the bias
+criterion: the *criterion-specific words*.
+For example, in the case of gender, the *gender-specific* words are:
+``['he', 'his', 'He', 'her', 'she', 'him', 'She', 'man', 'women', 'men', ...]``.
+
+We then assume that all words outside this set should have no relation to the
+bias criterion and thus may be biased (e.g. in the case of gender:
+``['doctor', 'nurse', ...]``). Therefore, this set of words is
+neutralized with respect to the bias subspace found in the previous step.
+
+The neutralization is carried out under the following operation:
+
+- :math:`u` : embedding
+- :math:`v`: bias direction
+
+First calculate the projection of the embedding onto the bias direction.
+
+.. math::
+
+ \text{proj}_v(u) = \frac{(v \cdot u)}{(v \cdot v)} \, v
+
+Then subtract the projection from the embedding.
+
+.. math::
+
+ u' = u - \text{proj}_v(u)
+
+3. Equalize the embeddings with respect to the bias direction.
+
+Given an equalization set (set of word pairs such as ``['she', 'he'],
+['men', 'women'], ...``, but not limited to the definitional set) this step
+executes, for each pair, an equalization with respect to the bias direction.
+That is, it takes both embeddings of the pair and distributes them at the same
+distance from the bias direction, so that neither is closer to the bias direction
+than the other.
+
+
+The fit parameters define how the neutralization will be calculated. In
+Hard Debias, you have to provide the ``definitional_pairs``, the
+``equalize_pairs`` (which could be the same as the definitional pairs) and,
+optionally, a debias ``criterion_name`` (to name the debiased model).
+
+
+The code below shows how to run Hard Debias for gender on the test model
+provided by WEFE (a reduced version of word2vec).
+
+.. code:: ipython3
+
+ from wefe.utils import load_test_model
+
+ model = load_test_model() # load a reduced version of word2vec
+ model
+
+
+
+
+.. parsed-literal::
+
+
+
+
+
+Load the required word sets.
+
+.. code:: ipython3
+
+ from wefe.datasets import fetch_debiaswe
+ from wefe.debias.hard_debias import HardDebias
+
+ debiaswe_wordsets = fetch_debiaswe()
+
+ definitional_pairs = debiaswe_wordsets["definitional_pairs"]
+ equalize_pairs = debiaswe_wordsets["equalize_pairs"]
+ gender_specific = debiaswe_wordsets["gender_specific"]
+
+
+ print(f"definitional_pairs: \n{definitional_pairs}")
+ print(f"equalize_pairs: \n{equalize_pairs}")
+ print(f"gender_specific: \n{gender_specific}")
+ print("-" * 70, "\n")
+
+
+.. parsed-literal::
+
+ definitional_pairs:
+ [['woman', 'man'], ['girl', 'boy'], ['she', 'he'], ['mother', 'father'], ['daughter', 'son'], ['gal', 'guy'], ['female', 'male'], ['her', 'his'], ['herself', 'himself'], ['Mary', 'John']]
+ equalize_pairs:
+ [['monastery', 'convent'], ['spokesman', 'spokeswoman'], ['Catholic_priest', 'nun'], ['Dad', 'Mom'], ['Men', 'Women'], ['councilman', 'councilwoman'], ['grandpa', 'grandma'], ['grandsons', 'granddaughters'], ['prostate_cancer', 'ovarian_cancer'], ['testosterone', 'estrogen'], ['uncle', 'aunt'], ['wives', 'husbands'], ['Father', 'Mother'], ['Grandpa', 'Grandma'], ['He', 'She'], ['boy', 'girl'], ['boys', 'girls'], ['brother', 'sister'], ['brothers', 'sisters'], ['businessman', 'businesswoman'], ['chairman', 'chairwoman'], ['colt', 'filly'], ['congressman', 'congresswoman'], ['dad', 'mom'], ['dads', 'moms'], ['dudes', 'gals'], ['ex_girlfriend', 'ex_boyfriend'], ['father', 'mother'], ['fatherhood', 'motherhood'], ['fathers', 'mothers'], ['fella', 'granny'], ['fraternity', 'sorority'], ['gelding', 'mare'], ['gentleman', 'lady'], ['gentlemen', 'ladies'], ['grandfather', 'grandmother'], ['grandson', 'granddaughter'], ['he', 'she'], ['himself', 'herself'], ['his', 'her'], ['king', 'queen'], ['kings', 'queens'], ['male', 'female'], ['males', 'females'], ['man', 'woman'], ['men', 'women'], ['nephew', 'niece'], ['prince', 'princess'], ['schoolboy', 'schoolgirl'], ['son', 'daughter'], ['sons', 'daughters'], ['twin_brother', 'twin_sister']]
+ gender_specific:
+ ['he', 'his', 'He', 'her', 'she', 'him', 'She', 'man', 'women', 'men', 'His', 'woman', 'spokesman', 'wife', 'himself', 'son', 'mother', 'father', 'chairman', 'daughter', 'husband', 'guy', 'girls', 'girl', 'Her', 'boy', 'King', 'boys', 'brother', 'Chairman', 'spokeswoman', 'female', 'sister', 'Women', 'Man', 'male', 'herself', 'Lions', 'Lady', 'brothers', 'dad', 'actress', 'mom', 'sons', 'girlfriend', 'Kings', 'Men', 'daughters', 'Prince', 'Queen', 'teenager', 'lady', 'Bulls', 'boyfriend', 'sisters', 'Colts', 'mothers', 'Sir', 'king', 'businessman', 'Boys', 'grandmother', 'grandfather', 'deer', 'cousin', 'Woman', 'ladies', 'Girls', 'Father', 'uncle', 'PA', 'Boy', 'Councilman', 'mum', 'Brothers', 'MA', 'males', 'Girl', 'Mom', 'Guy', 'Queens', 'congressman', 'Dad', 'Mother', 'grandson', 'twins', 'bull', 'queen', 'businessmen', 'wives', 'widow', 'nephew', 'bride', 'females', 'aunt', 'Congressman', 'prostate_cancer', 'lesbian', 'chairwoman', 'fathers', 'Son', 'moms', 'Ladies', 'maiden', 'granddaughter', 'younger_brother', 'Princess', 'Guys', 'lads', 'Ma', 'Sons', 'lion', 'Bachelor', 'gentleman', 'fraternity', 'bachelor', 'niece', 'Lion', 'Sister', 'bulls', 'husbands', 'prince', 'colt', 'salesman', 'Bull', 'Sisters', 'hers', 'dude', 'Spokesman', 'beard', 'filly', 'Actress', 'Him', 'princess', 'Brother', 'lesbians', 'councilman', 'actresses', 'Viagra', 'gentlemen', 'stepfather', 'Deer', 'monks', 'Beard', 'Uncle', 'ex_girlfriend', 'lad', 'sperm', 'Daddy', 'testosterone', 'MAN', 'Female', 'nephews', 'maid', 'daddy', 'mare', 'fiance', 'Wife', 'fiancee', 'kings', 'dads', 'waitress', 'Male', 'maternal', 'heroine', 'feminist', 'Mama', 'nieces', 'girlfriends', 'Councilwoman', 'sir', 'stud', 'Mothers', 'mistress', 'lions', 'estranged_wife', 'womb', 'Brotherhood', 'Statesman', 'grandma', 'maternity', 'estrogen', 'ex_boyfriend', 'widows', 'gelding', 'diva', 'teenage_girls', 'nuns', 'Daughter', 'czar', 'ovarian_cancer', 'HE', 'Monk', 'countrymen', 'Grandma', 'teenage_girl', 
'penis', 'bloke', 'nun', 'Husband', 'brides', 'housewife', 'spokesmen', 'suitors', 'menopause', 'monastery', 'patriarch', 'Beau', 'motherhood', 'brethren', 'stepmother', 'Dude', 'prostate', 'Moms', 'hostess', 'twin_brother', 'Colt', 'schoolboy', 'eldest', 'brotherhood', 'Godfather', 'fillies', 'stepson', 'congresswoman', 'Chairwoman', 'Daughters', 'uncles', 'witch', 'Mommy', 'monk', 'viagra', 'paternity', 'suitor', 'chick', 'Pa', 'fiancé', 'sorority', 'macho', 'Spokeswoman', 'businesswoman', 'eldest_son', 'gal', 'statesman', 'schoolgirl', 'fathered', 'goddess', 'hubby', 'mares', 'stepdaughter', 'blokes', 'dudes', 'socialite', 'strongman', 'Witch', 'fiancée', 'uterus', 'grandsons', 'Bride', 'studs', 'mama', 'Aunt', 'godfather', 'hens', 'hen', 'mommy', 'Babe', 'estranged_husband', 'Fathers', 'elder_brother', 'boyhood', 'baritone', 'Diva', 'Lesbian', 'grandmothers', 'grandpa', 'boyfriends', 'feminism', 'countryman', 'stallion', 'heiress', 'queens', 'Grandpa', 'witches', 'aunts', 'semen', 'fella', 'granddaughters', 'chap', 'knight', 'widower', 'Maiden', 'salesmen', 'convent', 'KING', 'vagina', 'beau', 'babe', 'HIS', 'beards', 'handyman', 'twin_sister', 'maids', 'gals', 'housewives', 'Gentlemen', 'horsemen', 'Businessman', 'obstetrics', 'fatherhood', 'beauty_queen', 'councilwoman', 'princes', 'matriarch', 'colts', 'manly', 'ma', 'fraternities', 'Spokesmen', 'pa', 'fellas', 'Gentleman', 'councilmen', 'dowry', 'barbershop', 'Monks', 'WOMAN', 'fraternal', 'ballerina', 'manhood', 'Dads', 'heroines', 'granny', 'gynecologist', 'princesses', 'Goddess', 'yo', 'Granny', 'knights', 'eldest_daughter', 'HER', 'underage_girls', 'masculinity', 'Girlfriend', 'bro', 'Grandmother', 'grandfathers', 'crown_prince', 'Restless', 'paternal', 'Queen_Mother', 'Boyfriend', 'womens', 'Males', 'SHE', 'Countess', 'stepchildren', 'Belles', 'bachelors', 'matron', 'momma', 'Legs', 'maidens', 'goddesses', 'landlady', 'sisterhood', 'Grandfather', 'Fraternity', 'Majesty', 'Babes', 'lass', 
'maternal_grandmother', 'blondes', "ma'am", 'Womens', 'divorcee', 'Momma', 'fathering', 'Effie', 'Lad', 'womanhood', 'missus', 'Sisterhood', 'granddad', 'Mens', 'papa', 'gf', 'sis', 'Husbands', 'Hen', 'womanizer', 'gynecological', 'stepsister', 'Handsome', 'Prince_Charming', 'BOY', 'stepdad', 'teen_ager', 'GIRL', 'dame', 'Sorority', 'beauty_pageants', 'raspy', 'harem', 'maternal_grandfather', 'Hes', 'deliveryman', 'septuagenarian', 'damsel', 'paternal_grandmother', 'paramour', 'paternal_grandparents', 'Nun', 'DAD', 'mothering', 'shes', "HE_'S", 'Nuns', 'teenage_daughters', 'auntie', 'widowed_mother', 'Girlfriends', 'FATHER', 'virile', 'COUPLE', 'grandmas', 'Hubby', 'nan', 'vixen', 'Joan_Crawford', 'stepdaughters', 'endometrial_cancer', 'stepsons', 'loins', 'Grandson', 'Mitchells', 'erections', 'Matron', 'Fella', 'daddies', 'ter', 'Sweetie', 'Dudes', 'Princesses', 'Lads', 'lioness', 'Mamma', 'virility', 'bros', 'womenfolk', 'Heir', 'BROTHERS', 'manliness', 'patriarchs', 'earl', 'sisterly', 'Whore', 'Gynaecology', 'countess', 'convents', 'Oratory', 'witch_doctor', 'mamas', 'yah', 'aunty', 'aunties', 'Heiress', 'lasses', 'Breasts', 'fairer_sex', 'sorority_sisters', 'WIFE', 'Laurels', 'penile', 'nuh', 'mah', 'toms', 'mam', 'Granddad', 'premenopausal_women', 'Granddaddy', 'nana', 'coeds', 'dames', 'herdsman', 'Mammy', 'Fellas', 'Niece', 'menfolk', 'Grandad', 'bloods', 'Gramps', 'damsels', 'Granddaughter', 'mamma', 'concubine', 'Oros', 'Blarney', 'filial', 'broads', 'Ethel_Kennedy', 'ACTRESS', 'Tit', 'fianc', 'Hunk', 'Night_Shift', 'wifey', 'Lothario', 'Holy_Roman_Emperor', 'horse_breeder', 'grandnephew', 'Lewises', 'Muscular', 'feminist_movement', 'Sanan', 'womenâ_€_™', 'Fiancee', 'dowries', 'Carmelite', 'rah', 'n_roller', 'bay_filly', 'belles', 'Uncles', 'PRINCESS', 'womans', 'Homeboy', 'Blokes', 'Charmer', 'codger', 'Delta_Zeta', 'courtesans', 'grandaughter', 'SISTER', 'Highness', 'grandbabies', 'crone', 'Skip_Away', 'noblewoman', 'bf', 'jane', 'philandering_husband', 
'Sisqo', 'mammy', 'daugher', 'director_Skip_Bertman', 'DAUGHTER', 'Royal_Highness', 'mannish', 'spinsters', 'Missus', 'madame', 'Godfathers', 'saleswomen', 'beaus', 'Risha', 'luh', 'sah', 'negligee', 'Womenâ_€_™', 'Hos', 'salesgirl', 'grandmom', 'Grandmas', 'Lawsons', 'countrywomen', 'Booby', 'darlin', 'Sheiks', 'boyz', 'wifes', 'Bayi', 'Il_Duce', 'â_€_œMy', 'fem', 'daugther', 'Potti', 'hussy', 'tch', 'Gelding', 'stemmed_roses', 'Damson', 'puh', 'Tylers', 'neice', 'Mutha', 'GRANDMOTHER', 'youse', 'spurned_lover', 'mae', 'Britt_Ekland', 'clotheshorse', 'Carlita_Kilpatrick', 'Cambest', 'Pretty_Polly', 'banshees', 'male_chauvinist', 'Arliss', 'mommas', 'maidservant', 'Gale_Harold', 'Little_Bo_Peep', 'Cleavers', 'hags', 'blowsy', 'Queen_Elizabeth_I.', 'lassies', 'papas', 'BABE', 'ugly_ducklings', 'Jims', 'hellion', 'Beautician', 'coalminer', 'relaxin', 'El_Mahroug', 'Victoria_Secret_Angel', 'shepherdess', 'Mosco', 'Slacks', 'nanna', 'wifely', 'tomboys', 'LAH', 'hast', 'apo', 'Kaplans', 'milkmaid', 'Robin_Munis', 'John_Barleycorn', 'royal_highness', 'Meanie', 'NAH', 'trollop', 'roh', 'Jewess', 'Sheik_Hamad', 'mumsy', 'Big_Pussy', 'chil_dren', 'Aunt_Bea', 'basso', 'sista', 'girlies', 'nun_Sister', 'chica', 'Bubbas', 'massa', 'Southern_belles', 'Nephews', 'castrations', 'Mister_Ed', 'Grandsons', 'Calaf', 'Malachy_McCourt', 'Shamash', 'hey_hey', 'Harmen', 'sonofabitch', 'Donovans', 'Grannie', 'Kalinka', 'hisself', 'Devean', 'goatherd', 'hinds', 'El_Corredor', 'Kens', 'notorious_womanizer', 'goh', 'Mommas', 'washerwoman', 'Samaira', 'Coo_Coo', 'Governess', 'grandsire', 'PRINCE_WILLIAM', 'gramma', 'him.He', 'Coptic_priest', 'Corbie', 'Kennys', 'thathe', 'Pa_Pa', 'Bristols', 'Hotep', 'snowy_haired', 'El_Prado_Ire', 'Girl_hitmaker', 'Hurleys', 'St._Meinrad', 'sexually_perverted', 'authoress', 'Prudie', 'raven_haired_beauty', 'Bonos', 'domestic_shorthair', 'brothas', 'nymphet', 'Neelma', 'Seita', 'stud_muffin', 'St._Judes', 'yenta', 'bare_shouldered', 'Pinkney_Sr.', 
'PRINCE_CHARLES', 'Bisutti', 'sistas', 'Blanche_Devereaux', 'Momoa', 'Quiff', 'Scotswoman', 'balaclava_clad_men', 'Louis_Leakey', 'dearie', 'vacuum_cleaner_salesman', 'grandads', 'postulant', 'SARAH_JESSICA_PARKER', 'AUNT', 'Prince_Dauntless', 'Dalys', 'Darkie', 'Czar_Nicholas', 'Lion_Hearted', 'Boy_recliner', 'baby_mamas', 'giantess', 'Lawd', 'GRANNY', 'fianc_e', 'Bilqis', 'WCTU', 'famly', 'Ellas', 'feminazis', 'Pentheus', 'MAMAS', 'Town_Criers', 'Saggy', 'youngman', 'grandam', 'divorcé', 'bosomed', 'roon', 'Simmentals', 'eponymous_heroine', 'LEYLAND', "REE'", "cain't", 'Evelynn', "WAH'", 'sistah', 'Horners', 'Elsie_Poncher', 'Coochie', 'rat_terriers', 'Limousins', 'Buchinski', 'Schicchi', 'Carpitcher', 'Khwezi', "HAH'", 'Shazza', 'Mackeson', "ROH'", 'kuya', 'novice_nun', 'Shei', 'Elmasri', 'ladykiller', '6yo', 'Yenta', 'SHEL', 'pater', 'Souse', 'Tahirah', 'comedian_Rodney_Dangerfield', 'Shottle', 'carryin', 'Sath', "fa'afafine", 'royal_consort', 'hus_band', 'maternal_uncles', 'dressing_provocatively', 'dreamgirl', 'millionaire_industrialist', 'Georgie_Girl', 'Must_Be_Obeyed', 'joh', 'Arabian_stallion', 'ahr', 'mso_para_margin_0in', "SOO'", 'Biddles', 'Chincoteague_Volunteer_Fire', 'Lisa_Miceli', 'gorgeous_brunette', 'fiancŽ', 'Moved_fluently', 'Afternoon_Deelites', 'biker_dude', 'Vito_Spatafore', 'MICK_JAGGER', 'Adesida', 'Reineman', 'witz', 'Djamila', 'Glenroe', 'daddys', 'Romanzi', 'gentlewomen', 'Dandie_Dinmont_terrier', 'Excess_Ire', 'By_SYVJ_Staff', 'zan', 'CONFESSIONS', 'Magees', 'wimmin', 'tash', 'Theatrical_Ire', 'Prince_Charmings', 'chocolate_eclair', 'bron', 'daughers', 'Felly', 'fiftyish', 'Spritely', 'GRANDPA', 'distaffer', 'Norbertines', "DAH'", 'leader_Muammar_Gadaffi', 'swains', 'Prince_Tomohito', 'Honneur', 'Soeur', 'jouster', 'Pharaoh_Amenhotep_III', 'QUEEN_ELIZABETH_II', "Ne'er", 'Galileo_Ire', 'Fools_Crow', 'Lannisters', 'Devines', 'gonzales', 'columnist_Ann_Landers', 'Moseleys', 'hiz', 'busch', 'roastee', 'toyboys', 'Sheffields', 'grandaunt', 
'Galvins', 'Giongo', 'geh', 'flame_haired_actress', 'Grammarian', 'Greg_Evigan', 'frontierswoman', 'Debele', 'rabs', 'nymphets', 'aai', 'BREE', 'Shaqs', 'ZAY', 'pappa', 'Housa', 'refrigerator_repairman', 'artificial_inseminations', 'chickie', 'Rippa', 'teenager_Tracy_Turnblad', 'homebred_colt', 'Abigaille', 'hen_pecked_husband', 'businesman', 'her.She', 'Kaikeyi', 'Stittsworth', 'self_proclaimed_redneck', 'Khella', 'NeW', 'Evers_Swindell', 'Asmerom_Gebreselassie', 'Boy_recliners', 'Cliff_Claven', 'Legge_Bourke', 'Costos', "d'_honneur", 'sistahs', 'Cabble', 'sahn', 'CROW_AGENCY_Mont', 'jezebel', 'Harrolds', 'ROSARIO_DAWSON', 'INXS_frontman_Michael_Hutchence', 'Gursikh', 'Dadas', 'VIAGA', 'keen_horsewoman', 'Theodoric', 'Eldery', 'lihn', 'Alice_Kramden', 'Santarina', 'radical_cleric_al_Sadr', 'Curleys', "SY'", 'Fidaa', 'Saptapadi', 'Actor_Sean_Astin', 'Kellita_Smith', 'Doly', 'Libertina', 'Money_McBags', 'Chief_Bearhart', 'choirgirl', 'chestnut_stallion', 'VIGRA', 'BY_JIM_McCONNELL', 'Sal_Vitale', 'Trivia_buffs', 'kumaris', 'fraternal_lodge', 'galpals', 'Borino_Quinn', 'lina', 'LATEST_Rapper', 'Bezar', 'Manro', 'bakla', 'Grisetti', 'blond_bimbo', 'spinster_aunt', 'gurls', 'hiswife', 'paleface', 'Charlye', 'hippie_chicks', 'Khalifas', 'Picture_JUSTIN_SANSON', 'Hepburns', 'yez', 'ALDER', 'Sanussi', 'Lil_Sis', 'McLoughlins', 'Barbra_Jean', 'Lulua', 'thatshe', 'actress_Shohreh_Aghdashloo', 'SIR_ANTHONY_HOPKINS', 'Gloddy', "ZAH'", "ORANGE_'S", 'Danielle_Bimber', 'grandmum', 'Kulkis', 'Brazington', 'Marisa_Lenhard_CFA', 'SIR_JOHN', 'Clareman', 'Aqila', 'Heavily_tattooed', 'Libbys', 'thim', 'elocutionist', 'submissives', 'Inja', 'rahm', 'Agnes_Gooch', 'fake_tits', 'nancy_boys', 'Swaidan', "SHAH'", "ain'ta_bed", 'Shumail_Raj', 'Duchesse', 'diethylstilbestrol_DES', 'colt_foal', 'unfaithful_lover', 'Maseri', 'nevah', 'SAHN', 'Barths', 'Toughkenamon', 'GUEST_STARS', 'him.But', 'Donna_Claspell', 'gingham_dresses', 'Massage_Parlour', 'wae', 'Wasacz', 'Magistra', 'vihl', 
'Smriti_Iraani', 'boyish_haircut', 'workingwoman', 'borthers', 'Capuchin_friars', 'Nejma', 'yes_sirs', 'bivocational_pastor', 'Grafters', 'HOPWOOD', 'Nicknamed_Godzilla', 'yos', 'Berkenfield', 'Missis', 'sitcom_Designing_Women', 'Kafoa', 'trainer_Emma_Lavelle', 'sadomasochistic_dungeon', 'iht', 'desperates', 'predessor', 'wolf_cub', 'indigenous_Peruvians', 'Livia_Soprano', 'troh', 'colt_sired', 'BOND_HILL', 'ihl', 'Drydens', 'rahs', 'Piserchia', 'Sonny_Corinthos', 'bankrobber', 'Fwank', 'feisty_redhead', 'booze_guzzling', 'COOPERS', "actress_Q'orianka_Kilcher", 'Cortezar', 'twe', 'Jacoub', 'Cindy_Iannarelli', 'Hell_Raiser', 'Fondly_referred', 'Bridal_Shoppe', 'Noleta', 'Christinas', 'IAGRA', 'LaTanya_Richardson', 'Sang_Bender', 'Assasins', 'sorrel_gelding', 'septugenarian', 'Hissy', 'Muqtada_al_Sadr_mook', 'Pfeni', 'MADRID_AFX_Banco_Santander', 'tuchis', 'LeVaughn', 'Gadzicki', 'transvestite_hooker', 'Fame_jockey_Laffit', 'nun_Sister_Mary', 'SAMSONOV', 'Mayflower_Madam', 'Shaque', 'well.He', 'Trainer_Julio_Canani', 'sorrel_mare', 'minivehicle_joint_venture', 'wife_Dwina', "Aasiya_AH'_see", 'Baratheon', "Rick_O'Shay", 'Mammies', 'goatie', 'Nell_Gwynne', 'charmingly_awkward', 'Slamma', 'DEHL', 'Lorenzo_Borghese', 'ALMA_Wis.', 'Anne_Scurria', 'father_Peruvians_alternately', 'JULIE_ANDREWS', 'Slim_Pickins', 'Victoria_Secret_stunner', "BY'", 'Sanam_Devdas', 'pronounced_luh', 'Pasha_Selim', '中华', 'rson', 'maternal_grandmothers', 'IOWA_CITY_Ia', 'Madame_de_Tourvel', "JAY'", 'Sheika_Mozah_bint_Nasser', 'Hotsy_Totsy', "D'_Ginto", 'singer_Johnny_Paycheck', 'uterine_prolapse_surgery', 'SCOTTDALE_Pa.', 'AdelaideNow_reports', 'Marcus_Schenkenberg', 'Clyse', 'Obiter_Dicta', 'comic_Sam_Kinison', 'bitties', 'ROCKVILLE_Ind.', 'swimsuit_calendars', 'Decicio_Smith', 'Ma_ma', 'Rie_Miyazawa', 'celibate_chastity', 'gwah', "ZAY'", 'HER_Majesty', 'Defrere', 'Las_Madrinas', '簿_聂_翻', 'Bea_Hamill', 'ARCADIA_Calif._Trainer', 'Bold_Badgett', 'stakes_victress', 'Hoppin_Frog', 'Narumiya', 
'Flayfil', 'hardman_Vinnie_Jones', 'Marilyn_Monroe_lookalike', 'Kivanc_Tatlitug', 'Persis_Khambatta', 'SINKING_SPRING_Pa.', 'len_3rd', 'DEAR_TRYING', 'Farndon_Cheshire', 'Krishna_Madiga', 'daughter_Princess_Chulabhorn', 'Marshall_Rooster_Cogburn', 'Kitty_Kiernan', 'Yokich', 'Jarou', 'Serdaris', 'ee_ay', 'Montifiore', 'Chuderewicz', 'Samuel_Le_Bihan', 'filly_Proud_Spell', 'Umm_Hiba', 'pronounced_koo', 'Sandy_Fonzo', "KOR'", 'Fielder_Civil_kisses', 'Federalsburg_Maryland', 'Nikah_ceremony', 'Brinke_Stevens', 'Yakama_Tribal_Council', 'Capuchin_Father', 'wife_Callista_Bisek', 'Beau_Dare', 'Bedoni', 'Arjun_Punj', 'JOHNNY_KNOXVILLE', 'cap_tain', 'Alderwood_Boys', 'Chi_Eta_Phi', 'ringleader_Charles_Graner', 'Savoies', 'Lalla_Salma', 'Mrs._Potiphar', 'fahn', 'name_Taylor_Sumers', 'Vernita_Green', 'Bollywood_baddie', 'BENBROOK_Texas', 'Assemblyman_Lou_Papan', 'virgin_brides', 'Cho_Eun', 'CATHY_Freeman', 'Uncle_Saul', 'Lao_Brewery', 'Ibo_tribe', 'ruf', 'rival_Edurne_Pasaban', 'Hei_Shangri_La', 'Mommy_dearest', 'interest_Angola_Sonogal', 'Ger_Monsun', 'PUSSYCAT_DOLL', 'Crown_Jewels_Condoms', 'Lord_Marke', 'Patootie', 'Nora_Bey', 'huntin_shootin', 'Minister_Raymond_Tshibanda', 'La_Nina_la_NEEN', 'signature_Whoppers', 'estranged_hubby_Kevin_Federline', "UR'", 'pill_poppin', "GEHR'", 'purebred_Arabians', 'husbandly_duties', 'VIAGRA_TIMING', 'Hereford_heifer', 'hushed_monotone_voice', 'Pola_Uddin', 'Wee_Jimmy_Krankie', 'Kwakwanso', 'Our_Galvinator', 'shoh', 'Codependency_Anonymous_Group', "LA'", "Taufa'ahau", 'Invincible_Spirit_colt', "SAH'_dur", 'MOUNT_CARMEL_Pa.', 'watches_attentively', 'SNL_spinoffs', 'Seth_Nitschke', 'Duns_Berwickshire', 'defendant_Colleen_LaRose', "Silky_O'Sullivan", 'Highcliff_Farm', "REN'", 'Comestar', 'Satisfied_Frog', 'Jai_Maharashtra', 'ATTICA_Ind.', 'lover_Larry_Birkhead', 'Tami_Megal', 'chauvinist_pigs', 'Phi_sorority', 'Micronesian_immigrant', 'Lia_Boldt', 'Sugar_Tits', 'actress_Kathy_Najimy', 'zhoo', 'Colombo_underboss', 'Katsav_accusers', 
'Bess_Houdini', 'rap_mogul_Diddy', 'companions_Khin_Khin', 'Van_Het', 'Mastoi_tribe', 'VITALY', 'ROLLING_STONES_rocker', 'womanizing_cad', 'LILY_COLE', 'paternal_grandfathers', 'Lt._Col._Kurt_Kosmatka', 'Kasseem_Jr.', 'Ji_Ji', 'Wilburforce', 'VIAGRA_DOSE', 'English_Sheepdogs', 'pronounced_Kah', 'Htet_Htet_Oo', 'Brisk_Breeze', 'Eau_du', 'BY_MELANIE_EVANS', 'Neovasc_Medical', 'British_funnyman_RICKY', '4YO_mare', 'Hemaida', 'MONKTON', 'Mrs_Mujuru', 'BaGhana_BaGhana', 'Shaaban_Abdel_Rahim', 'Edward_Jazlowiecki_lawyer', 'Ajman_Stud', 'manly_pharaoh_even', 'Serra_Madeira_Islands', "FRAY'", 'panto_dames', 'Khin_Myo', 'dancer_Karima_El_Mahroug', 'CROWN_Princess', 'Baseball_HOFer', 'Hasta_la_Pasta', 'GIRLS_NEXT_DOOR', 'Benedict_Groeschel', 'Bousamra', 'Ruby_Rubacuori_Ruby', 'Monde_Bleu', 'Un_homme_qui', 'Taylor_Sumers', 'Rapper_EMINEM', 'Joe_Menchetti', "VAY'", 'supermodel_NAOMI_CAMPBELL', 'Supermodel_GISELE_BUNDCHEN', 'Au_Lait', 'Radar_Installed', 'THOMAS_TOWNSHIP_Mich.', 'Rafinesque', 'Herman_Weinrich', 'Abraxas_Antelope', 'raspy_voiced_rocker', 'Manurewa_Cosmopolitan_Club', 'Paraone', 'THE_LEOPARD', 'Boy_Incorporated_LZB', 'Dansili_filly', 'Lumpy_Rutherford', 'unwedded_bliss', 'Bhavna_Sharma', 'Scarvagh', 'en_flagrante', 'Mottu_Maid', 'Dowager_Queen', 'NEEN', 'model_Monika_Zsibrita', 'ROSIE_PEREZ', 'Mattock_Ranger', 'Valorous', 'Surpreme', 'Marwari_businessmen', 'Grandparents_aunts', 'Kimberley_Vlaeminck', 'Lyn_Treece_Boys', 'PDX_Update', 'Virsa_Punjab', 'eyelash_fluttering', 'Pi_fraternity', 'HUNTLEIGH_Mo.', 'novelist_Jilly_Cooper', 'Naha_Shuri_temple', 'Yasmine_Al_Massri', 'Mu_Gamma_Xi', 'Mica_Ertegun', 'Ocleppo', 'VIAGRA_CONTRAINDICATIONS', 'daughter_PEACHES', 'trainer_Geoff_Wragg', 'OVERNIGHT_DELIVERY', 'Fitts_retiree', 'de_Tourvel', 'Lil_Lad', 'north_easterner', 'Aol_Weird_News', 'Somewhat_improbably', 'Sikh_panth', 'Worcester_2m_7f', 'Zainab_Jah', 'OLYMPIC_medalist', 'Enoch_Petrucelly', 'collie_Lassie', "LOW'", 'clumsiness_Holloway', 'ayr', "OHR'", 
'ROLLING_STONES_guitarist', "LAH'_nee", 'Ian_Beefy_Botham', 'Awapuni_trainer', 'Glamorous_Granny', 'Chiang_Ching', 'MidAtlantic_Cardiovascular_Associates', 'Yeke', 'Seaforth_Huron_Expositor', 'Westley_Cary_Elwes', 'Cate_Blanchett_Veronica_Guerin', 'Bellas_Gate', 'witch_Glinda', 'wives_mistresses', 'Woodsville_Walmart', '2YO_colt', 'Manav_Sushant_Singh', 'Pupi_Avati_Il', 'Sigma_Beta_Rho', 'Bishop_Christopher_Senyonjo', 'Vodou_priest', 'Rubel_Chowdhury', 'Claddagh_Ring', "TAH'_duh_al", "al_Sadr_mook_TAH'", 'ROBIN_GIBB', "GAHN'", 'BY_THOMAS_RANSON', 'sister_Carine_Jena', 'Lyphard_mare', 'summa_cum', 'Semenya_grandmother_Maputhi', 'Clare_Nuns', 'Talac', 'sex_hormones_androgens', 'majeste', 'Saint_Ballado_mare', 'Carrie_Huchel', 'Mae_Dok', 'wife_Dieula', 'Earnest_Sirls', 'spoof_bar_mitzvah', 'von_Boetticher', 'Audwin_Mosby', 'Case_presentationWe', 'Vincent_Papandrea', "KRAY'", 'Sergi_Benavent', 'Le_Poisson', 'Von_Cramm', 'Patti_Mell', 'Raymi_Coya', 'Benjamin_BeBe_Winans', 'Nana_Akosua', 'Auld_Acquaintance', 'Desire_Burunga', 'Company_Wrangler_Nestea', 'ask_Krisy_Plourde', 'JUANITA_BYNUM', 'livia', 'GAMB', 'Gail_Rosario_Dawson', 'Ramgarhia_Sikh', 'Catholic_nun_Sister', 'FOUR_WEDDINGS_AND', 'Robyn_Scherer', 'brother_King_Athelstan', 'Santo_Loquasto_Fences', 'Wee_Frees', 'MARISOL', 'Soliloquy_Stakes', 'Whatever_Spoetzl', "Marc'Aurelio", 'mon_petit', 'Sabbar_al_Mashhadani', "KAY'_lee", "m_zah_MAH'", 'BY_TAMI_ALTHOFF', 'hobbit_Samwise_Gamgee', 'Bahiya_Hariri_sister', 'daddy_Larry_Birkhead', 'Sow_Tracey_Ullman', 'coach_Viljo_Nousiainen', 'Carmen_Lebbos', 'conjoined_twins_Zainab', 'Rob_Komosa', 'ample_bosomed', 'Ageing_rocker', 'psychic_Oda']
+ ----------------------------------------------------------------------
+
+
+
+Instantiate and fit the parameters of the debias transformation.
+In the fit stage, parameters such as bias direction are calculated and embeddings are
+prepared for the equalization stage.
+
+.. code:: ipython3
+
+ hd = HardDebias(verbose=False, criterion_name="gender")
+
+ hd.fit(
+ model, definitional_pairs=definitional_pairs, equalize_pairs=equalize_pairs,
+ )
+
+
+
+
+.. parsed-literal::
+
+
+
+
+
+Mitigation Parameters
+~~~~~~~~~~~~~~~~~~~~~
+
+The parameters of the transform method are relatively standard for all
+methods. The most important ones are ``target``, ``ignore`` and
+``copy``.
+
+In the following example we use ``ignore`` and ``copy``, which are
+described below:
+
+- ``ignore`` (by default, ``None``):
+
+ A list of strings that indicates that the debias method will perform
+ the debias in all words except those specified in this list. In case
+ ``ignore`` is not specified or its value is ``None``, the transformation
+ will be performed on all embeddings. This may cause words that are
+ specific to social groups to lose that component (for example,
+ leaving ``'she'`` and ``'he'`` without a gender component).
+
+- ``copy`` (by default ``True``):
+
+ If the value of ``copy`` is ``True``, the method attempts to create a copy of
+ the model and run debias on the copy. If ``False``, the method is
+ applied on the original model, causing the vectors to mutate.
+
+
+.. warning::
+
+ Setting ``copy`` to ``True`` requires at least 2x RAM of
+ the size of the model. Otherwise the execution of the debias may raise
+ ``MemoryError``.
+
+The following transformation is executed using a copy of the model,
+ignoring the words contained in ``gender_specific``.
+
+
+.. code:: ipython3
+
+ gender_debiased_model = hd.transform(model, ignore=gender_specific, copy=True)
+
+
+.. parsed-literal::
+
+ Copy argument is True. Transform will attempt to create a copy of the original model. This may fail due to lack of memory.
+ Model copy created successfully.
+
+
+.. parsed-literal::
+
+ 100%|██████████| 13013/13013 [00:00<00:00, 118668.18it/s]
+
+
+Measuring the Decrease of Bias
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Using the metrics and queries shown in the :ref:`bias measurement` user guide, we
+can measure whether there was a change in the measured gender bias
+between the original model and the debiased model.
+
+.. code:: ipython3
+
+ from wefe.datasets import load_weat
+ from wefe.query import Query
+ from wefe.metrics import WEAT
+
+ weat_wordset = load_weat()
+ weat = WEAT()
+
+
+Next, we measure the gender bias exposed by query 1 (Male terms and Female terms wrt Career and Family) with respect to the debiased model and the original.
+
+.. code:: ipython3
+
+ gender_query_1 = Query(
+ [weat_wordset["male_terms"], weat_wordset["female_terms"]],
+ [weat_wordset["career"], weat_wordset["family"]],
+ ["Male terms", "Female terms"],
+ ["Career", "Family"],
+ )
+ print(gender_query_1, "\n", "-" * 70, "\n")
+
+ biased_results_1 = weat.run_query(gender_query_1, model, normalize=True)
+ debiased_results_1 = weat.run_query(
+ gender_query_1, gender_debiased_model, normalize=True
+ )
+
+ print("Debiased vs Biased (absolute values)")
+ print(
+ round(abs(debiased_results_1["weat"]), 3),
+ "<",
+ round(abs(biased_results_1["weat"]), 3),
+ )
+
+
+
+
+.. parsed-literal::
+
+
+ ----------------------------------------------------------------------
+
+ Debiased vs Biased (absolute values)
+ 0.047 < 0.463
+
+
+The above results show that there was a decrease in the measured gender bias.
+
+Next, we measure the gender bias exposed by query 2 (Male Names and Female Names wrt Pleasant and Unpleasant terms) with respect to the debiased model and the original.
+
+.. code:: ipython3
+
+ gender_query_2 = Query(
+ [weat_wordset["male_names"], weat_wordset["female_names"]],
+ [weat_wordset["pleasant_5"], weat_wordset["unpleasant_5"]],
+ ["Male Names", "Female Names"],
+ ["Pleasant", "Unpleasant"],
+ )
+
+ print(gender_query_2, "\n", "-" * 70, "\n")
+
+ biased_results_2 = weat.run_query(
+ gender_query_2, model, normalize=True, preprocessors=[{}, {"lowercase": True}]
+ )
+ debiased_results_2 = weat.run_query(
+ gender_query_2,
+ gender_debiased_model,
+ normalize=True,
+ preprocessors=[{}, {"lowercase": True}],
+ )
+
+ print("Debiased vs Biased (absolute values)")
+ print(
+ round(abs(debiased_results_2["weat"]), 3),
+ "<",
+ round(abs(biased_results_2["weat"]), 3),
+ )
+
+
+
+
+.. parsed-literal::
+
+
+ ----------------------------------------------------------------------
+
+ Debiased vs Biased (absolute values)
+ 0.055 < 0.074
+
+
+Again, the above results show that there was a decrease in the measured gender bias.
+
+Target Parameter
+~~~~~~~~~~~~~~~~
+
+If a set of words is specified in the ``target`` parameter, the debias method is performed
+only on the embeddings associated with this set.
+If ``None`` is provided, the transformation is performed on all vocabulary
+words except those specified in ``ignore``. By default ``None``.
+
+In the following example, the target parameter is used to execute the transformation
+only on the career and family word set:
+
+.. code:: ipython3
+
+ targets = [
+ "executive",
+ "management",
+ "professional",
+ "corporation",
+ "salary",
+ "office",
+ "business",
+ "career",
+ "home",
+ "parents",
+ "children",
+ "family",
+ "cousins",
+ "marriage",
+ "wedding",
+ "relatives",
+ ]
+
+ hd = HardDebias(verbose=False, criterion_name="gender").fit(
+ model, definitional_pairs=definitional_pairs, equalize_pairs=equalize_pairs,
+ )
+
+ gender_debiased_model = hd.transform(model, target=targets, copy=True)
+
+
+
+.. parsed-literal::
+
+ Copy argument is True. Transform will attempt to create a copy of the original model. This may fail due to lack of memory.
+ Model copy created successfully.
+
+
+.. parsed-literal::
+
+ 100%|██████████| 16/16 [00:00<00:00, 9428.05it/s]
+
+
+Next, a bias test is run on the mitigated embeddings associated with the
+target words.
+
+In this case, the value of the metric is lower on the
+query executed on the mitigated model than on the original one.
+These results indicate that there was a mitigation of bias on embeddings of these words.
+
+
+.. code:: ipython3
+
+ gender_query_1 = Query(
+ [weat_wordset["male_terms"], weat_wordset["female_terms"]],
+ [weat_wordset["career"], weat_wordset["family"]],
+ ["Male terms", "Female terms"],
+ ["Career", "Family"],
+ )
+ print(gender_query_1, "\n", "-" * 70, "\n")
+
+ biased_results_1 = weat.run_query(gender_query_1, model, normalize=True)
+ debiased_results_1 = weat.run_query(
+ gender_query_1, gender_debiased_model, normalize=True
+ )
+
+ print("Debiased vs Biased (absolute values)")
+ print(
+ round(abs(debiased_results_1["weat"]), 3),
+ "<",
+ round(abs(biased_results_1["weat"]), 3),
+ )
+
+
+
+
+.. parsed-literal::
+
+
+ ----------------------------------------------------------------------
+
+ Debiased vs Biased (absolute values)
+ 0.047 < 0.463
+
+
+However, if a bias test is run with words that were outside the ``target``
+word set, the results are almost the same. The slight difference in the
+metric scores lies in the fact that the equalize sets were still
+equalized.
+
+.. warning::
+
+ The equalization process can modify embeddings that have not been marked in the target.
+ In Hard Debias, equalization can be deactivated by delivering an empty
+ equalize set (``[]``).
+
+
+
+.. code:: ipython3
+
+ gender_query_2 = Query(
+ [weat_wordset["male_names"], weat_wordset["female_names"]],
+ [weat_wordset["pleasant_5"], weat_wordset["unpleasant_5"]],
+ ["Male Names", "Female Names"],
+ ["Pleasant", "Unpleasant"],
+ )
+
+ print(gender_query_2, "\n", "-" * 70, "\n")
+
+ biased_results_2 = weat.run_query(
+ gender_query_2, model, normalize=True, preprocessors=[{}, {"lowercase": True}]
+ )
+ debiased_results_2 = weat.run_query(
+ gender_query_2,
+ gender_debiased_model,
+ normalize=True,
+ preprocessors=[{}, {"lowercase": True}],
+ )
+
+ print("Debiased vs Biased (absolute values)")
+ print(
+ round(abs(debiased_results_2["weat"]), 3),
+ ">",
+ round(abs(biased_results_2["weat"]), 3),
+ )
+
+
+
+.. parsed-literal::
+
+
+ ----------------------------------------------------------------------
+
+ Debiased vs Biased (absolute values)
+ 0.08 > 0.074
+
+
+Note that the equalization caused the bias of the debiased model to be slightly larger than the original.
+
+
+Saving the Debiased Model
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To save the mitigated model one must access the ``KeyedVectors`` (the
+gensim object that contains the embeddings) through ``wv`` and then use
+the ``save`` method to store the model in a file.
+
+
+
+.. code:: ipython3
+
+ gender_debiased_model.wv.save("gender_debiased_glove.kv")
+
+
+
+Multiclass Hard Debias
+----------------------
+
+Multiclass Hard Debias is a generalized version of Hard Debias that
+enables multiclass debiasing. Generalized refers to the fact that this
+method extends Hard Debias in order to support more than two types of
+social target sets within the definitional set.
+
+For example, for the case of religion bias, it supports a debias using
+words associated with Christianity, Islam and Judaism.
+
+The usage is very similar to Hard Debias with the difference that the
+``definitional_sets`` can be larger than pairs.
+
+
+.. code:: ipython3
+
+ from wefe.datasets import fetch_debias_multiclass
+ from wefe.debias.multiclass_hard_debias import MulticlassHardDebias
+
+ multiclass_debias_wordsets = fetch_debias_multiclass()
+ weat_wordsets = load_weat()
+ weat = WEAT()
+
+ ethnicity_definitional_sets = multiclass_debias_wordsets["ethnicity_definitional_sets"]
+ ethnicity_equalize_sets = list(
+ multiclass_debias_wordsets["ethnicity_analogy_templates"].values()
+ )
+
+ print(f"ethnicity_definitional_sets: \n{ethnicity_definitional_sets}")
+ print(f"ethnicity_equalize_sets: \n{ethnicity_equalize_sets}")
+ print("-" * 70, "\n")
+
+ mhd = MulticlassHardDebias(verbose=False, criterion_name="ethnicity")
+ mhd.fit(
+ model=model,
+ definitional_sets=ethnicity_definitional_sets,
+ equalize_sets=ethnicity_equalize_sets,
+ )
+
+ ethnicity_debiased_model = mhd.transform(model, copy=True)
+
+
+
+.. parsed-literal::
+
+ ethnicity_definitional_sets:
+ [['black', 'caucasian', 'asian'], ['african', 'caucasian', 'asian'], ['black', 'white', 'asian'], ['africa', 'america', 'asia'], ['africa', 'america', 'china'], ['africa', 'europe', 'asia']]
+ ethnicity_equalize_sets:
+ [['manager', 'executive', 'redneck', 'hillbilly', 'leader', 'farmer'], ['doctor', 'engineer', 'laborer', 'teacher'], ['slave', 'musician', 'runner', 'criminal', 'homeless']]
+ ----------------------------------------------------------------------
+
+ copy argument is True. Transform will attempt to create a copy of the original model. This may fail due to lack of memory.
+ Model copy created successfully.
+
+
+.. parsed-literal::
+
+ 100%|██████████| 13003/13003 [00:00<00:00, 18357.20it/s]
+
+
+Measuring the Decrease of Bias
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following code compares the execution of a query measuring ethnic bias in the original model vs. in the debiased model.
+
+.. code:: ipython3
+
+ ethnicity_query = Query(
+ [
+ multiclass_debias_wordsets["white_terms"],
+ multiclass_debias_wordsets["black_terms"],
+ ],
+ [
+ multiclass_debias_wordsets["white_biased_words"],
+ multiclass_debias_wordsets["black_biased_words"],
+ ],
+ ["european_american_names", "african_american_names"],
+ ["white_biased_words", "black_biased_words"],
+ )
+
+ print(ethnicity_query, "\n", "-" * 70, "\n")
+
+ biased_results = weat.run_query(
+ ethnicity_query, model, normalize=True, preprocessors=[{}, {"lowercase": True}],
+ )
+ debiased_results = weat.run_query(
+ ethnicity_query,
+ ethnicity_debiased_model,
+ normalize=True,
+ preprocessors=[{}, {"lowercase": True}],
+ )
+
+ print("Debiased vs Biased (absolute values)")
+ print(
+ round(abs(debiased_results["weat"]), 3),
+ "<",
+ round(abs(biased_results["weat"]), 3),
+ )
+
+
+
+.. parsed-literal::
+
+
+ ----------------------------------------------------------------------
+
+ Debiased vs Biased (absolute values)
+ 0.08 < 0.074
+
diff --git a/enviornment.yml b/enviornment.yml
deleted file mode 100644
index 5d16887..0000000
--- a/enviornment.yml
+++ /dev/null
@@ -1,10 +0,0 @@
-name: wefe
-dependencies:
- - numpy
- - scipy
- - pandas
- - scikit-learn
- - gensim
- - plotly
- - six
- - semantic_version
diff --git a/examples/Contributing.ipynb b/examples/Contributing.ipynb
index ee61578..1ef6f6f 100644
--- a/examples/Contributing.ipynb
+++ b/examples/Contributing.ipynb
@@ -1,795 +1,795 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "source": [
- "# Contributing"
- ],
- "metadata": {}
- },
- {
- "cell_type": "markdown",
- "source": [
- "# Metric Implementation Guide\n",
- "\n",
- "The following guide will show you how to implement a metric using WEFE.\n",
- "\n",
- "## Create the class \n",
- "\n",
- "The first step is to create the class that will contain the metric. \n",
- "This class must extend the `BaseMetric` class. \n",
- "\n",
- "In the new class you must specify the template (explained below), the name and an abbreviated name or acronym \n",
- "for the metric as class variables.\n",
- "\n",
- "A **template** is a tuple that defines the cardinality of the tagret and attribute sets of a query that can be accepted by the metric. \n",
- "It can take integer values, which require that the target or attribute sets have that cardinality or 'n' in case the metric can operate with 1 or more word sets.\n",
- "Note that this will indicate that all queries that do not comply with the template will be rejected when executed using this metric.\n",
- "\n",
- "Below are some examples of templates:"
- ],
- "metadata": {}
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "source": [
- "# two target sets and one attribute set required to execute this metric.\n",
- "template_1 = (2, 1)\n",
- "\n",
- "# two target sets and two attribute set required to execute this metric.\n",
- "template_2 = (2, 2)\n",
- "\n",
- "# one or more (unlimited) target sets and one attribute set required to execute this metric.\n",
- "template_3 = ('n', 1)"
- ],
- "outputs": [],
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-12-29T18:24:29.760528Z",
- "start_time": "2020-12-29T18:24:29.757008Z"
- }
- }
- },
- {
- "cell_type": "markdown",
- "source": [
- "Once the template is defined, you can create the metric according to the following code scheme:"
- ],
- "metadata": {}
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "source": [
- "from wefe.metrics.base_metric import BaseMetric\n",
- " \n",
- "class ExampleMetric(BaseMetric):\n",
- " metric_template = (2, 1)\n",
- " metric_name = 'Example Metric'\n",
- " metric_short_name = 'EM'"
- ],
- "outputs": [],
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-12-29T18:24:32.036229Z",
- "start_time": "2020-12-29T18:24:29.767042Z"
- }
- }
- },
- {
- "cell_type": "markdown",
- "source": [
- "## Implement `run_query` method\n",
- "\n",
- "The second step is to implement `run_query` method. \n",
- "This method is in charge of storing all the operations to calculate the scores from a `query` and the `word_embedding` model.\n",
- "It must perform 2 basic operations before executing the mathematical calculations: \n",
- "\n",
- "### Validate the parameters:\n",
- "\n",
- "This call checks the main parameters provided to the ``run_query`` and will raise an exception if it finds a problem with them.\n",
- "\n",
- "\n",
- "```python \n",
- "# check the types of the provided arguments.\n",
- "self._check_input(query, model)\n",
- "\n",
- "``` \n",
- " \n",
- " \n",
- "### Transform the Query to Embeddings.\n",
- "\n",
- "This call transforms all the word sets of a query into embeddings.\n",
- "\n",
- "\n",
- "```python\n",
- "# transform query word sets into embeddings\n",
- "embeddings = get_embeddings_from_query(\n",
- " model=model,\n",
- " query=query,\n",
- " lost_vocabulary_threshold=lost_vocabulary_threshold,\n",
- " preprocessors=preprocessors,\n",
- " strategy=strategy,\n",
- " normalize=normalize,\n",
- " warn_not_found_words=warn_not_found_words,\n",
- ")\n",
- "\n",
- "```\n",
- "\n",
- "This step could return either:\n",
- "\n",
- "- ``None`` if any of the sets lost percentage more words than the number of words \n",
- " allowed by ``lost_vocabulary_threshold`` parameter (specified as percentage\n",
- " float). In this case the metric would be expected to return nan in its results.\n",
- "\n",
- " .. code:: python\n",
- "\n",
- "```python\n",
- "# if there is any/some set has less words than the allowed limit,\n",
- "# return the default value (nan)\n",
- "if embeddings is None:\n",
- " return {\n",
- " \"query_name\": query.query_name,\n",
- " \"result\": np.nan,\n",
- " \"metrica_default_value\": np.nan,\n",
- " }\n",
- "\n",
- "```\n",
- "\n",
- "- A tuple otherwise. This tuple contains two values:\n",
- "\n",
- " - A dictionary that maps each target set name to a dictionary\n",
- " containing its words and embeddings.\n",
- " - A dictionary that maps each attribute set name to a dictionary\n",
- " containing its words and embeddings.\n",
- "\n",
- "We can illustrate what the outputs of the previous transformation look\n",
- "like using the following query:\n"
- ],
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-12-17T14:39:02.150641Z",
- "start_time": "2020-12-17T14:39:02.142644Z"
- }
- }
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "source": [
- "from wefe.word_embedding_model import WordEmbeddingModel\n",
- "from wefe.query import Query\n",
- "from wefe.utils import load_test_model # a few embeddings of WEAT experiments\n",
- "from wefe.datasets.datasets import load_weat # the word sets of WEAT experiments\n",
- "from wefe.preprocessing import get_embeddings_from_query\n",
- "\n",
- " \n",
- "weat = load_weat()\n",
- "model = load_test_model()\n",
- "\n",
- "flowers = weat['flowers']\n",
- "weapons = weat['weapons']\n",
- "pleasant = weat['pleasant_5']\n",
- "query = Query([flowers, weapons], [pleasant],\n",
- " ['Flowers', 'Weapons'], ['Pleasant'])\n",
- "\n",
- "embeddings = get_embeddings_from_query(\n",
- " model=model,\n",
- " query=query,\n",
- " # other params...\n",
- ")\n",
- "target_sets, attribute_sets = embeddings"
- ],
- "outputs": [
- {
- "output_type": "stream",
- "name": "stderr",
- "text": [
- "INFO:gensim.models.keyedvectors:loading projection weights from C:\\Users\\pablo\\Desktop\\DesarrolloWEFE\\wefe\\wefe\\datasets\\data\\weat_w2v.txt\n",
- "DEBUG:smart_open.smart_open_lib:{'uri': 'C:\\\\Users\\\\pablo\\\\Desktop\\\\DesarrolloWEFE\\\\wefe\\\\wefe\\\\datasets\\\\data\\\\weat_w2v.txt', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'transport_params': None}\n",
- "INFO:gensim.models.keyedvectors:loaded (347, 300) matrix from C:\\Users\\pablo\\Desktop\\DesarrolloWEFE\\wefe\\wefe\\datasets\\data\\weat_w2v.txt\n"
- ]
- }
- ],
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-12-29T18:24:32.312423Z",
- "start_time": "2020-12-29T18:24:32.039213Z"
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Contributing"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Metric Implementation Guide\n",
+ "\n",
+ "The following guide will show you how to implement a metric using WEFE.\n",
+ "\n",
+ "## Create the class \n",
+ "\n",
+ "The first step is to create the class that will contain the metric. \n",
+ "This class must extend the `BaseMetric` class. \n",
+ "\n",
+ "In the new class you must specify the template (explained below), the name and an abbreviated name or acronym \n",
+ "for the metric as class variables.\n",
+ "\n",
+ "A **template** is a tuple that defines the cardinality of the target and attribute sets of a query that can be accepted by the metric. \n",
+ "It can take integer values, which require that the target or attribute sets have that cardinality or 'n' in case the metric can operate with 1 or more word sets.\n",
+ "Note that this will indicate that all queries that do not comply with the template will be rejected when executed using this metric.\n",
+ "\n",
+ "Below are some examples of templates:"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "source": [
+ "# two target sets and one attribute set required to execute this metric.\n",
+ "template_1 = (2, 1)\n",
+ "\n",
+ "# two target sets and two attribute set required to execute this metric.\n",
+ "template_2 = (2, 2)\n",
+ "\n",
+ "# one or more (unlimited) target sets and one attribute set required to execute this metric.\n",
+ "template_3 = ('n', 1)"
+ ],
+ "outputs": [],
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-12-29T18:24:29.760528Z",
+ "start_time": "2020-12-29T18:24:29.757008Z"
+ }
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Once the template is defined, you can create the metric according to the following code scheme:"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "source": [
+ "from wefe.metrics.base_metric import BaseMetric\n",
+ " \n",
+ "class ExampleMetric(BaseMetric):\n",
+ " metric_template = (2, 1)\n",
+ " metric_name = 'Example Metric'\n",
+ " metric_short_name = 'EM'"
+ ],
+ "outputs": [],
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-12-29T18:24:32.036229Z",
+ "start_time": "2020-12-29T18:24:29.767042Z"
+ }
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Implement `run_query` method\n",
+ "\n",
+ "The second step is to implement `run_query` method. \n",
+ "This method is in charge of storing all the operations to calculate the scores from a `query` and the `word_embedding` model.\n",
+ "It must perform 2 basic operations before executing the mathematical calculations: \n",
+ "\n",
+ "### Validate the parameters:\n",
+ "\n",
+ "This call checks the main parameters provided to the ``run_query`` and will raise an exception if it finds a problem with them.\n",
+ "\n",
+ "\n",
+ "```python \n",
+ "# check the types of the provided arguments.\n",
+ "self._check_input(query, model)\n",
+ "\n",
+ "``` \n",
+ " \n",
+ " \n",
+ "### Transform the Query to Embeddings.\n",
+ "\n",
+ "This call transforms all the word sets of a query into embeddings.\n",
+ "\n",
+ "\n",
+ "```python\n",
+ "# transform query word sets into embeddings\n",
+ "embeddings = get_embeddings_from_query(\n",
+ " model=model,\n",
+ " query=query,\n",
+ " lost_vocabulary_threshold=lost_vocabulary_threshold,\n",
+ " preprocessors=preprocessors,\n",
+ " strategy=strategy,\n",
+ " normalize=normalize,\n",
+ " warn_not_found_words=warn_not_found_words,\n",
+ ")\n",
+ "\n",
+ "```\n",
+ "\n",
+ "This step could return either:\n",
+ "\n",
+ "- ``None`` if any of the sets lost proportionally more words than the limit \n",
+ " allowed by the ``lost_vocabulary_threshold`` parameter (specified as a percentage\n",
+ " float). In this case the metric would be expected to return nan in its results.\n",
+ "\n",
+ " .. code:: python\n",
+ "\n",
+ "```python\n",
+ "# if there is any/some set has less words than the allowed limit,\n",
+ "# return the default value (nan)\n",
+ "if embeddings is None:\n",
+ " return {\n",
+ " \"query_name\": query.query_name,\n",
+ " \"result\": np.nan,\n",
+ " \"metrica_default_value\": np.nan,\n",
+ " }\n",
+ "\n",
+ "```\n",
+ "\n",
+ "- A tuple otherwise. This tuple contains two values:\n",
+ "\n",
+ " - A dictionary that maps each target set name to a dictionary\n",
+ " containing its words and embeddings.\n",
+ " - A dictionary that maps each attribute set name to a dictionary\n",
+ " containing its words and embeddings.\n",
+ "\n",
+ "We can illustrate what the outputs of the previous transformation look\n",
+ "like using the following query:\n"
+ ],
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-12-17T14:39:02.150641Z",
+ "start_time": "2020-12-17T14:39:02.142644Z"
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "source": [
+ "from wefe.word_embedding_model import WordEmbeddingModel\n",
+ "from wefe.query import Query\n",
+ "from wefe.utils import load_test_model # a few embeddings of WEAT experiments\n",
+ "from wefe.datasets.datasets import load_weat # the word sets of WEAT experiments\n",
+ "from wefe.preprocessing import get_embeddings_from_query\n",
+ "\n",
+ " \n",
+ "weat = load_weat()\n",
+ "model = load_test_model()\n",
+ "\n",
+ "flowers = weat['flowers']\n",
+ "weapons = weat['weapons']\n",
+ "pleasant = weat['pleasant_5']\n",
+ "query = Query([flowers, weapons], [pleasant],\n",
+ " ['Flowers', 'Weapons'], ['Pleasant'])\n",
+ "\n",
+ "embeddings = get_embeddings_from_query(\n",
+ " model=model,\n",
+ " query=query,\n",
+ " # other params...\n",
+ ")\n",
+ "target_sets, attribute_sets = embeddings"
+ ],
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "INFO:gensim.models.keyedvectors:loading projection weights from C:\\Users\\pablo\\Desktop\\DesarrolloWEFE\\wefe\\wefe\\datasets\\data\\weat_w2v.txt\n",
+ "DEBUG:smart_open.smart_open_lib:{'uri': 'C:\\\\Users\\\\pablo\\\\Desktop\\\\DesarrolloWEFE\\\\wefe\\\\wefe\\\\datasets\\\\data\\\\weat_w2v.txt', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'transport_params': None}\n",
+ "INFO:gensim.models.keyedvectors:loaded (347, 300) matrix from C:\\Users\\pablo\\Desktop\\DesarrolloWEFE\\wefe\\wefe\\datasets\\data\\weat_w2v.txt\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-12-29T18:24:32.312423Z",
+ "start_time": "2020-12-29T18:24:32.039213Z"
+ },
+ "scrolled": true
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "If you inspect `target_sets`, it would look like the following dictionary:\n",
+ "\n",
+ "```python \n",
+ "{\n",
+ " 'Flowers': {\n",
+ " 'aster': array([-0.22167969, 0.52734375, 0.01745605, ...], dtype=float32),\n",
+ " 'clover': array([-0.03442383, 0.19042969, -0.17089844, ...], dtype=float32),\n",
+ " 'hyacinth': array([-0.01391602, 0.3828125, -0.21679688, ...], dtype=float32),\n",
+ " ...\n",
+ " },\n",
+ " 'Weapons': {\n",
+ " 'arrow': array([0.18164062, 0.125, -0.12792969. ...], dtype=float32),\n",
+ " 'club': array([-0.04907227, -0.07421875, -0.0390625, ...], dtype=float32),\n",
+ " 'gun': array([0.05566406, 0.15039062, 0.33398438, ...], dtype=float32),\n",
+ " 'missile': array([4.7874451e-04, 5.1953125e-01, -1.3809204e-03, ...], dtype=float32),\n",
+ " ...\n",
+ " }\n",
+ "}\n",
+ "```\n",
+ "\n",
+ "And `attribute_sets` would look like:\n",
+ "```python\n",
+ "{\n",
+ " 'Pleasant': {\n",
+ " 'caress': array([0.2578125, -0.22167969, 0.11669922], dtype=float32),\n",
+ " 'freedom': array([0.26757812, -0.078125, 0.09326172], dtype=float32),\n",
+ " 'health': array([-0.07421875, 0.11279297, 0.09472656], dtype=float32),\n",
+ " ...\n",
+ " }\n",
+ "}\n",
+ "```\n",
+ "\n",
+ "The idea of keeping a mapping between set names, words and their embeddings is that\n",
+ "there are some metrics that can calculate sub-metrics at different levels and that can\n",
+ "be useful for further use.\n",
+ "\n"
+ ],
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-12-22T16:20:49.335341Z",
+ "start_time": "2020-12-22T16:20:49.325329Z"
+ }
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Example Metric\n",
+ "\n",
+ "Using the steps previously seen, a sample metric is implemented:"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "source": [
+ "from typing import Any, Dict, Union, List, Callable\n",
+ "\n",
+ "import numpy as np\n",
+ "\n",
+ "from wefe.metrics.base_metric import BaseMetric\n",
+ "from wefe.query import Query\n",
+ "from wefe.word_embedding_model import WordEmbeddingModel\n",
+ "\n",
+ "\n",
+ "class ExampleMetric(BaseMetric):\n",
+ "\n",
+ " # replace with the parameters of your metric\n",
+ " metric_template = (2, 1) # cardinalities of the targets and attributes sets that your metric will accept.\n",
+ " metric_name = 'Example Metric' \n",
+ " metric_short_name = 'EM'\n",
+ "\n",
+ " def run_query(\n",
+ " self,\n",
+ " query: Query,\n",
+ " model: WordEmbeddingModel,\n",
+ " lost_vocabulary_threshold: float = 0.2,\n",
+ " preprocessors: List[Dict[str, Union[str, bool, Callable]]] = [{}],\n",
+ " strategy: str = \"first\",\n",
+ " normalize: bool = False,\n",
+ " warn_not_found_words: bool = False,\n",
+ " *args: Any,\n",
+ " **kwargs: Any,\n",
+ " ) -> Dict[str, Any]:\n",
+ " \"\"\"Calculate the Example Metric metric over the provided parameters.\n",
+ "\n",
+ " Parameters\n",
+ " ----------\n",
+ " query : Query\n",
+ " A Query object that contains the target and attribute word sets to \n",
+ " be tested.\n",
+ "\n",
+ " model : WordEmbeddingModel\n",
+ " A WordEmbeddingModel object that contains certain word embedding \n",
+ " pretrained model.\n",
+ " \n",
+ " lost_vocabulary_threshold : float, optional\n",
+ " Specifies the proportional limit of words that any set of the query is\n",
+ " allowed to lose when transforming its words into embeddings.\n",
+ " In the case that any set of the query loses proportionally more words\n",
+ " than this limit, the result values will be np.nan, by default 0.2\n",
+ "\n",
+ " preprocessors : List[Dict[str, Union[str, bool, Callable]]]\n",
+ " A list with preprocessor options.\n",
+ "\n",
+ " A ``preprocessor`` is a dictionary that specifies what processing(s) are\n",
+ " performed on each word before its looked up in the model vocabulary.\n",
+ " For example, the ``preprocessor``\n",
+ " ``{'lowercase': True, 'strip_accents': True}`` allows you to lowercase\n",
+ " and remove the accent from each word before searching for them in the\n",
+ " model vocabulary. Note that an empty dictionary ``{}`` indicates that no\n",
+ " preprocessing is done.\n",
+ "\n",
+ " The possible options for a preprocessor are:\n",
+ "\n",
+ " * ``lowercase``: ``bool``. Indicates that the words are transformed to\n",
+ " lowercase.\n",
+ " * ``uppercase``: ``bool``. Indicates that the words are transformed to\n",
+ " uppercase.\n",
+ " * ``titlecase``: ``bool``. Indicates that the words are transformed to\n",
+ " titlecase.\n",
+ " * ``strip_accents``: ``bool``, ``{'ascii', 'unicode'}``: Specifies that\n",
+ " the accents of the words are eliminated. The stripping type can be\n",
+ " specified. True uses 'unicode' by default.\n",
+ " * ``preprocessor``: ``Callable``. It receives a function that operates\n",
+ " on each word. In the case of specifying a function, it overrides the\n",
+ " default preprocessor (i.e., the previous options stop working).\n",
+ "\n",
+ " A list of preprocessor options allows searching for several\n",
+ " variants of the words into the model. For example, the preprocessors\n",
+ " ``[{}, {\"lowercase\": True, \"strip_accents\": True}]``\n",
+ " ``{}`` allows searching first for the original words in the vocabulary of the model. \n",
+ " In case some of them are not found, ``{\"lowercase\": True, \"strip_accents\": True}`` \n",
+ " is executed on these words and then they are searched in the model vocabulary.\n",
+ "\n",
+ " strategy : str, optional\n",
+ " The strategy indicates how it will use the preprocessed words: 'first' will\n",
+ " include only the first transformed word found. 'all' will include all\n",
+ " transformed words found, by default \"first\".\n",
+ "\n",
+ " normalize : bool, optional\n",
+ " True indicates that embeddings will be normalized, by default False\n",
+ "\n",
+ " warn_not_found_words : bool, optional\n",
+ " Specifies if the function will warn (in the logger)\n",
+ " the words that were not found in the model's vocabulary\n",
+ " , by default False.\n",
+ "\n",
+ " Returns\n",
+ " -------\n",
+ " Dict[str, Any]\n",
+ " A dictionary with the query name, the resulting score of the metric, \n",
+ " and other scores.\n",
+ " \"\"\"\n",
+ " # check the types of the provided arguments (only the defaults).\n",
+ " self._check_input(query, model)\n",
+ "\n",
+ " # transform query word sets into embeddings\n",
+ " embeddings = get_embeddings_from_query(\n",
+ " model=model,\n",
+ " query=query,\n",
+ " lost_vocabulary_threshold=lost_vocabulary_threshold,\n",
+ " preprocessors=preprocessors,\n",
+ " strategy=strategy,\n",
+ " normalize=normalize,\n",
+ " warn_not_found_words=warn_not_found_words,\n",
+ " )\n",
+ "\n",
+ " # if there is any/some set has less words than the allowed limit,\n",
+ " # return the default value (nan)\n",
+ " if embeddings is None:\n",
+ " return {\n",
+ " 'query_name': query.query_name, # the name of the evaluated query\n",
+ " 'result': np.nan, # the result of the metric\n",
+ " 'em': np.nan, # result of the calculated metric (recommended)\n",
+ " 'other_metric' : np.nan, # another metric calculated (optional)\n",
+ " 'results_by_word' : np.nan, # if available, values by word (optional)\n",
+ " # ...\n",
+ " }\n",
+ "\n",
+ " # get the targets and attribute sets transformed into embeddings.\n",
+ " target_sets, attribute_sets = embeddings\n",
+ "\n",
+ " # commonly, you only will need the embeddings of the sets.\n",
+ " # this can be obtained by using:\n",
+ " target_embeddings = list(target_sets.values())\n",
+ " attribute_embeddings = list(attribute_sets.values())\n",
+ "\n",
+ " \n",
+ " \"\"\"\n",
+ " # From here, the code can vary quite a bit depending on what you need.\n",
+ " # It is recommended to calculate the metric operations in another method(s).\n",
+ " results = calc_metric() \n",
+ " \n",
+ " # The final step is to return query and result. \n",
+ " # You can return other scores, metrics by word or metrics by set, etc.\n",
+ " return {\n",
+ " 'query_name': query.query_name, # the name of the evaluated query\n",
+ " 'result': results.metric, # the result of the metric\n",
+ " 'em': results.metric # result of the calculated metric (recommended)\n",
+ " 'other_metric' : results.other_metric # Another metric calculated (optional)\n",
+ " 'another_results' : results.details_by_set # if available, values by word (optional),\n",
+ " ...\n",
+ " }\n",
+ " \"\"\"\n"
+ ],
+ "outputs": [],
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-12-29T18:24:32.324424Z",
+ "start_time": "2020-12-29T18:24:32.315426Z"
+ }
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "This is what the transformed :code:`target_embeddings_dict` would look like:"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Implement the logic of the metric\n",
+ "\n",
+ "\n",
+ "Suppose we want to implement an extremely simple three-step metric, where:\n",
+ "\n",
+ "1. We calculate the average of all the sets,\n",
+ "2. Then, calculate the cosine distance between the target set averages and the \n",
+ " attribute average.\n",
+ "3. Subtract these distances.\n",
+ "\n",
+ "To do this, we create a new method :code:`_calc_metric` in which, using\n",
+ "the array of embedding dict objects as input, we will implement the above."
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "source": [
+ "from typing import Any, Dict, Union, List, Callable\n",
+ "\n",
+ "from scipy.spatial import distance\n",
+ "import numpy as np\n",
+ "\n",
+ "from wefe.metrics import BaseMetric\n",
+ "from wefe.query import Query\n",
+ "from wefe.word_embedding_model import WordEmbeddingModel\n",
+ "from wefe.preprocessing import get_embeddings_from_query\n",
+ "\n",
+ "class ExampleMetric(BaseMetric):\n",
+ "\n",
+ " # replace with the parameters of your metric\n",
+ " metric_template = (\n",
+ " 2, 1\n",
+ " ) # cardinalities of the targets and attributes sets that your metric will accept.\n",
+ " metric_name = 'Example Metric'\n",
+ " metric_short_name = 'EM'\n",
+ "\n",
+ " def _calc_metric(self, target_embeddings, attribute_embeddings):\n",
+ " \"\"\"Calculates the metric.\n",
+ "\n",
+ " Parameters\n",
+ " ----------\n",
+ " target_embeddings : np.array\n",
+ " An array with dicts. Each dict represents a target set. \n",
+ " A dict is composed with a word and its embedding as key, value respectively.\n",
+ " attribute_embeddings : np.array\n",
+ " An array with dicts. Each dict represents an attribute set. \n",
+ " A dict is composed with a word and its embedding as key, value respectively.\n",
+ "\n",
+ " Returns\n",
+ " -------\n",
+ " np.float\n",
+ " The value of the calculated metric.\n",
+ " \"\"\"\n",
+ "\n",
+ " # get the embeddings from the dicts\n",
+ " target_embeddings_0 = np.array(list(target_embeddings[0].values()))\n",
+ " target_embeddings_1 = np.array(list(target_embeddings[1].values()))\n",
+ "\n",
+ " attribute_embeddings_0 = np.array(\n",
+ " list(attribute_embeddings[0].values()))\n",
+ "\n",
+ " # calculate the average embedding by target and attribute set.\n",
+ " target_embeddings_0_avg = np.mean(target_embeddings_0, axis=0)\n",
+ " target_embeddings_1_avg = np.mean(target_embeddings_1, axis=0)\n",
+ " attribute_embeddings_0_avg = np.mean(attribute_embeddings_0, axis=0)\n",
+ "\n",
+ " # calculate the distances between the target sets and the attribute set\n",
+ " dist_target_0_attr = distance.cosine(target_embeddings_0_avg,\n",
+ " attribute_embeddings_0_avg)\n",
+ " dist_target_1_attr = distance.cosine(target_embeddings_1_avg,\n",
+ " attribute_embeddings_0_avg)\n",
+ "\n",
+ " # subtract the distances\n",
+ " metric_result = dist_target_0_attr - dist_target_1_attr\n",
+ " return metric_result\n",
+ "\n",
+ " def run_query(\n",
+ " self,\n",
+ " query: Query,\n",
+ " model: WordEmbeddingModel,\n",
+ " lost_vocabulary_threshold: float = 0.2,\n",
+ " preprocessors: List[Dict[str, Union[str, bool, Callable]]] = [{}],\n",
+ " strategy: str = \"first\",\n",
+ " normalize: bool = False,\n",
+ " warn_not_found_words: bool = False,\n",
+ " *args: Any,\n",
+ " **kwargs: Any,\n",
+ " ) -> Dict[str, Any]:\n",
+ " \"\"\"Calculate the Example Metric metric over the provided parameters.\n",
+ "\n",
+ " Parameters\n",
+ " ----------\n",
+ " query : Query\n",
+ " A Query object that contains the target and attribute word sets to \n",
+ " be tested.\n",
+ "\n",
+ " model : WordEmbeddingModel\n",
+ " A WordEmbeddingModel object that contains certain word embedding \n",
+ " pretrained model.\n",
+ " \n",
+ " lost_vocabulary_threshold : float, optional\n",
+ " Specifies the proportional limit of words that any set of the query is\n",
+ " allowed to lose when transforming its words into embeddings.\n",
+ " In the case that any set of the query loses proportionally more words\n",
+ " than this limit, the result values will be np.nan, by default 0.2\n",
+ "\n",
+ " preprocessors : List[Dict[str, Union[str, bool, Callable]]]\n",
+ " A list with preprocessor options.\n",
+ "\n",
+ " A ``preprocessor`` is a dictionary that specifies what processing(s) are\n",
+ " performed on each word before it is looked up in the model vocabulary.\n",
+ " For example, the ``preprocessor``\n",
+ " ``{'lowercase': True, 'strip_accents': True}`` allows you to lowercase\n",
+ " and remove the accent from each word before searching for them in the\n",
+ " model vocabulary. Note that an empty dictionary ``{}`` indicates that no\n",
+ " preprocessing is done.\n",
+ "\n",
+ " The possible options for a preprocessor are:\n",
+ "\n",
+ " * ``lowercase``: ``bool``. Indicates that the words are transformed to\n",
+ " lowercase.\n",
+ " * ``uppercase``: ``bool``. Indicates that the words are transformed to\n",
+ " uppercase.\n",
+ " * ``titlecase``: ``bool``. Indicates that the words are transformed to\n",
+ " titlecase.\n",
+ " * ``strip_accents``: ``bool``, ``{'ascii', 'unicode'}``: Specifies that\n",
+ " the accents of the words are eliminated. The stripping type can be\n",
+ " specified. True uses 'unicode' by default.\n",
+ " * ``preprocessor``: ``Callable``. It receives a function that operates\n",
+ " on each word. In the case of specifying a function, it overrides the\n",
+ " default preprocessor (i.e., the previous options stop working).\n",
+ "\n",
+ " A list of preprocessor options allows searching for several\n",
+ " variants of the words in the model. For example, given the preprocessors\n",
+ " ``[{}, {\"lowercase\": True, \"strip_accents\": True}]``, the first one,\n",
+ " ``{}``, allows first searching for the original words in the vocabulary of the model. \n",
+ " In case some of them are not found, ``{\"lowercase\": True, \"strip_accents\": True}`` \n",
+ " is executed on these words and then they are searched in the model vocabulary.\n",
+ "\n",
+ " strategy : str, optional\n",
+ " The strategy indicates how it will use the preprocessed words: 'first' will\n",
+ " include only the first transformed word found. 'all' will include all\n",
+ " transformed words found, by default \"first\".\n",
+ "\n",
+ " normalize : bool, optional\n",
+ " True indicates that embeddings will be normalized, by default False\n",
+ "\n",
+ " warn_not_found_words : bool, optional\n",
+ " Specifies whether the function will log a warning listing\n",
+ " the words that were not found in the model's vocabulary,\n",
+ " by default False.\n",
+ "\n",
+ " Returns\n",
+ " -------\n",
+ " Dict[str, Any]\n",
+ " A dictionary with the query name, the resulting score of the metric, \n",
+ " and other scores.\n",
+ " \"\"\"\n",
+ " # check the types of the provided arguments (only the defaults).\n",
+ " self._check_input(query, model)\n",
+ "\n",
+ " # transform query word sets into embeddings\n",
+ " embeddings = get_embeddings_from_query(\n",
+ " model=model,\n",
+ " query=query,\n",
+ " lost_vocabulary_threshold=lost_vocabulary_threshold,\n",
+ " preprocessors=preprocessors,\n",
+ " strategy=strategy,\n",
+ " normalize=normalize,\n",
+ " warn_not_found_words=warn_not_found_words,\n",
+ " )\n",
+ "\n",
+ " # if any set lost proportionally more words than the allowed threshold,\n",
+ " # return the default value (nan)\n",
+ " if embeddings is None:\n",
+ " return {\n",
+ " 'query_name': query.query_name, # the name of the evaluated query\n",
+ " 'result': np.nan, # the result of the metric\n",
+ " 'em': np.nan, # result of the calculated metric (recommended)\n",
+ " 'other_metric' : np.nan, # another metric calculated (optional)\n",
+ " 'results_by_word' : np.nan, # if available, values by word (optional)\n",
+ " # ...\n",
+ " }\n",
+ "\n",
+ " # get the targets and attribute sets transformed into embeddings.\n",
+ " target_sets, attribute_sets = embeddings\n",
+ "\n",
+ " # commonly, you will only need the embeddings of the sets.\n",
+ " # this can be obtained by using:\n",
+ " target_embeddings = list(target_sets.values())\n",
+ " attribute_embeddings = list(attribute_sets.values())\n",
+ "\n",
+ " result = self._calc_metric(target_embeddings, attribute_embeddings)\n",
+ "\n",
+ " # return the results.\n",
+ " return {\"query_name\": query.query_name, \"result\": result, 'em': result}"
+ ],
+ "outputs": [],
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-12-29T18:24:32.343454Z",
+ "start_time": "2020-12-29T18:24:32.326425Z"
+ }
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Now, let's try it out:"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "source": [
+ "from wefe.query import Query\n",
+ "from wefe.utils import load_test_model # a few embeddings of WEAT experiments\n",
+ "from wefe.datasets.datasets import load_weat # the word sets of WEAT experiments\n",
+ "\n",
+ "weat = load_weat()\n",
+ "model = load_test_model()\n",
+ "\n",
+ "flowers = weat['flowers']\n",
+ "weapons = weat['weapons']\n",
+ "pleasant = weat['pleasant_5']\n",
+ "query = Query([flowers, weapons], [pleasant], ['Flowers', 'Weapons'],\n",
+ " ['Pleasant'])\n",
+ "\n",
+ "\n",
+ "results = ExampleMetric().run_query(query, model)\n",
+ "print(results)"
+ ],
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "INFO:gensim.utils:loading KeyedVectors object from /home/pablo/wefe/wefe/datasets/data/test_model.kv\n",
+ "DEBUG:smart_open.smart_open_lib:{'uri': '/home/pablo/wefe/wefe/datasets/data/test_model.kv', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'transport_params': None}\n",
+ "INFO:gensim.utils:setting ignored attribute vectors_norm to None\n",
+ "DEBUG:gensim.utils:starting a new internal lifecycle event log for KeyedVectors\n",
+ "INFO:gensim.utils:KeyedVectors lifecycle event {'fname': '/home/pablo/wefe/wefe/datasets/data/test_model.kv', 'datetime': '2021-08-13T11:43:41.890605', 'gensim': '4.0.1', 'python': '3.7.10 (default, Jun 4 2021, 14:48:32) \\n[GCC 7.5.0]', 'platform': 'Linux-5.10.16.3-microsoft-standard-WSL2-x86_64-with-debian-bullseye-sid', 'event': 'loaded'}\n"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "{'query_name': 'Flowers and Weapons wrt Pleasant', 'result': -0.10210171341896057, 'em': -0.10210171341896057}\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-12-29T18:24:32.475099Z",
+ "start_time": "2020-12-29T18:24:32.345453Z"
+ }
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "\n",
+ "We have completely defined a new metric.\n",
+ "Congratulations!\n",
+ "\n",
+ "\n",
+ "**Note**\n",
+ "\n",
+ "Some comments regarding the implementation of new metrics:\n",
+ "\n",
+ "- Note that the returned object must necessarily be a ``dict`` instance \n",
+ " containing the ``result`` and ``query_name`` key-values. Otherwise you \n",
+ " will not be able to run query batches using utility functions like \n",
+ " ``run_queries``.\n",
+ "- ``run_query`` can receive additional parameters. Simply add them to the \n",
+ " function signature. These parameters can also be used when running the \n",
+ " metric from the ``run_queries`` utility function.\n",
+ "- We recommend implementing the logic of the metric separated from the \n",
+ " ``run_query`` function. In other words, implement the logic in a \n",
+ " ``calc_your_metric`` function that receives the dictionaries with the \n",
+ " necessary embeddings and parameters.\n",
+ "- The file where ``ExampleMetric`` is located can be found inside the \n",
+ " metrics folder of the [repository](https://github.com/dccuchile/wefe/blob/master/wefe/metrics/example_metric.py/)\n",
+ "\n"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Mitigation Method Implementation Guide"
+ ],
+ "metadata": {}
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3.7.10 64-bit ('wefe': conda)"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.10"
+ },
+ "toc": {
+ "base_numbering": 1,
+ "nav_menu": {},
+ "number_sections": true,
+ "sideBar": true,
+ "skip_h1_title": false,
+ "title_cell": "Table of Contents",
+ "title_sidebar": "Contents",
+ "toc_cell": false,
+ "toc_position": {},
+ "toc_section_display": true,
+ "toc_window_display": false
+ },
+ "varInspector": {
+ "cols": {
+ "lenName": 16,
+ "lenType": 16,
+ "lenVar": 40
+ },
+ "kernels_config": {
+ "python": {
+ "delete_cmd_postfix": "",
+ "delete_cmd_prefix": "del ",
+ "library": "var_list.py",
+ "varRefreshCmd": "print(var_dic_list())"
+ },
+ "r": {
+ "delete_cmd_postfix": ") ",
+ "delete_cmd_prefix": "rm(",
+ "library": "var_list.r",
+ "varRefreshCmd": "cat(var_dic_list()) "
+ }
+ },
+ "types_to_exclude": [
+ "module",
+ "function",
+ "builtin_function_or_method",
+ "instance",
+ "_Feature"
+ ],
+ "window_display": false
+ },
+ "interpreter": {
+ "hash": "d84ebd0cc0ccc3ceb6e99e895c29aecce94e907fb1f0ef7f0ceb697eb7e914cf"
+ }
},
- "scrolled": true
- }
- },
- {
- "cell_type": "markdown",
- "source": [
- "If you inspect `target_sets`, it would look like the following dictionary:\n",
- "\n",
- "```python \n",
- "{\n",
- " 'Flowers': {\n",
- " 'aster': array([-0.22167969, 0.52734375, 0.01745605, ...], dtype=float32),\n",
- " 'clover': array([-0.03442383, 0.19042969, -0.17089844, ...], dtype=float32),\n",
- " 'hyacinth': array([-0.01391602, 0.3828125, -0.21679688, ...], dtype=float32),\n",
- " ...\n",
- " },\n",
- " 'Weapons': {\n",
- " 'arrow': array([0.18164062, 0.125, -0.12792969. ...], dtype=float32),\n",
- " 'club': array([-0.04907227, -0.07421875, -0.0390625, ...], dtype=float32),\n",
- " 'gun': array([0.05566406, 0.15039062, 0.33398438, ...], dtype=float32),\n",
- " 'missile': array([4.7874451e-04, 5.1953125e-01, -1.3809204e-03, ...], dtype=float32),\n",
- " ...\n",
- " }\n",
- "}\n",
- "```\n",
- "\n",
- "And `attribute_sets` would look like:\n",
- "```python\n",
- "{\n",
- " 'Pleasant': {\n",
- " 'caress': array([0.2578125, -0.22167969, 0.11669922], dtype=float32),\n",
- " 'freedom': array([0.26757812, -0.078125, 0.09326172], dtype=float32),\n",
- " 'health': array([-0.07421875, 0.11279297, 0.09472656], dtype=float32),\n",
- " ...\n",
- " }\n",
- "}\n",
- "```\n",
- "\n",
- "The idea of keeping a mapping between set names, words and their embeddings is that\n",
- "there are some metrics that can calculate sub-metrics at different levels and that can\n",
- "be useful for further use.\n",
- "\n"
- ],
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-12-22T16:20:49.335341Z",
- "start_time": "2020-12-22T16:20:49.325329Z"
- }
- }
- },
- {
- "cell_type": "markdown",
- "source": [
- "## Example Metric\n",
- "\n",
- "Using the steps previously seen, a sample metric is implemented:"
- ],
- "metadata": {}
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "source": [
- "from typing import Any, Dict, Union, List, Callable\n",
- "\n",
- "import numpy as np\n",
- "\n",
- "from wefe.metrics.base_metric import BaseMetric\n",
- "from wefe.query import Query\n",
- "from wefe.word_embedding_model import WordEmbeddingModel\n",
- "\n",
- "\n",
- "class ExampleMetric(BaseMetric):\n",
- "\n",
- " # replace with the parameters of your metric\n",
- " metric_template = (2, 1) # cardinalities of the targets and attributes sets that your metric will accept.\n",
- " metric_name = 'Example Metric' \n",
- " metric_short_name = 'EM'\n",
- "\n",
- " def run_query(\n",
- " self,\n",
- " query: Query,\n",
- " model: WordEmbeddingModel,\n",
- " lost_vocabulary_threshold: float = 0.2,\n",
- " preprocessors: List[Dict[str, Union[str, bool, Callable]]] = [{}],\n",
- " strategy: str = \"first\",\n",
- " normalize: bool = False,\n",
- " warn_not_found_words: bool = False,\n",
- " *args: Any,\n",
- " **kwargs: Any,\n",
- " ) -> Dict[str, Any]:\n",
- " \"\"\"Calculate the Example Metric metric over the provided parameters.\n",
- "\n",
- " Parameters\n",
- " ----------\n",
- " query : Query\n",
- " A Query object that contains the target and attribute word sets to \n",
- " be tested.\n",
- "\n",
- " word_embedding : WordEmbeddingModel\n",
- " A WordEmbeddingModel object that contains certain word embedding \n",
- " pretrained model.\n",
- " \n",
- " lost_vocabulary_threshold : float, optional\n",
- " Specifies the proportional limit of words that any set of the query is\n",
- " allowed to lose when transforming its words into embeddings.\n",
- " In the case that any set of the query loses proportionally more words\n",
- " than this limit, the result values will be np.nan, by default 0.2\n",
- "\n",
- " preprocessors : List[Dict[str, Union[str, bool, Callable]]]\n",
- " A list with preprocessor options.\n",
- "\n",
- " A ``preprocessor`` is a dictionary that specifies what processing(s) are\n",
- " performed on each word before its looked up in the model vocabulary.\n",
- " For example, the ``preprocessor``\n",
- " ``{'lowecase': True, 'strip_accents': True}`` allows you to lowercase\n",
- " and remove the accent from each word before searching for them in the\n",
- " model vocabulary. Note that an empty dictionary ``{}`` indicates that no\n",
- " preprocessing is done.\n",
- "\n",
- " The possible options for a preprocessor are:\n",
- "\n",
- " * ``lowercase``: ``bool``. Indicates that the words are transformed to\n",
- " lowercase.\n",
- " * ``uppercase``: ``bool``. Indicates that the words are transformed to\n",
- " uppercase.\n",
- " * ``titlecase``: ``bool``. Indicates that the words are transformed to\n",
- " titlecase.\n",
- " * ``strip_accents``: ``bool``, ``{'ascii', 'unicode'}``: Specifies that\n",
- " the accents of the words are eliminated. The stripping type can be\n",
- " specified. True uses ‘unicode’ by default.\n",
- " * ``preprocessor``: ``Callable``. It receives a function that operates\n",
- " on each word. In the case of specifying a function, it overrides the\n",
- " default preprocessor (i.e., the previous options stop working).\n",
- "\n",
- " A list of preprocessor options allows searching for several\n",
- " variants of the words into the model. For example, the preprocessors\n",
- " ``[{}, {\"lowercase\": True, \"strip_accents\": True}]``\n",
- " ``{}`` allows first to search for the original words in the vocabulary of the model. \n",
- " In case some of them are not found, ``{\"lowercase\": True, \"strip_accents\": True}`` \n",
- " is executed on these words and then they are searched in the model vocabulary.\n",
- "\n",
- " strategy : str, optional\n",
- " The strategy indicates how it will use the preprocessed words: 'first' will\n",
- " include only the first transformed word found. all' will include all\n",
- " transformed words found, by default \"first\".\n",
- "\n",
- " normalize : bool, optional\n",
- " True indicates that embeddings will be normalized, by default False\n",
- "\n",
- " warn_not_found_words : bool, optional\n",
- " Specifies if the function will warn (in the logger)\n",
- " the words that were not found in the model's vocabulary\n",
- " , by default False.\n",
- "\n",
- " Returns\n",
- " -------\n",
- " Dict[str, Any]\n",
- " A dictionary with the query name, the resulting score of the metric, \n",
- " and other scores.\n",
- " \"\"\"\n",
- " # check the types of the provided arguments (only the defaults).\n",
- " self._check_input(query, model)\n",
- "\n",
- " # transform query word sets into embeddings\n",
- " embeddings = get_embeddings_from_query(\n",
- " model=model,\n",
- " query=query,\n",
- " lost_vocabulary_threshold=lost_vocabulary_threshold,\n",
- " preprocessors=preprocessors,\n",
- " strategy=strategy,\n",
- " normalize=normalize,\n",
- " warn_not_found_words=warn_not_found_words,\n",
- " )\n",
- "\n",
- " # if there is any/some set has less words than the allowed limit,\n",
- " # return the default value (nan)\n",
- " if embeddings is None:\n",
- " return {\n",
- " 'query_name': query.query_name, # the name of the evaluated query\n",
- " 'result': np.nan, # the result of the metric\n",
- " 'em': np.nan, # result of the calculated metric (recommended)\n",
- " 'other_metric' : np.nan, # another metric calculated (optional)\n",
- " 'results_by_word' : np.nan, # if available, values by word (optional)\n",
- " # ...\n",
- " }\n",
- "\n",
- " # get the targets and attribute sets transformed into embeddings.\n",
- " target_sets, attribute_sets = embeddings\n",
- "\n",
- " # commonly, you only will need the embeddings of the sets.\n",
- " # this can be obtained by using:\n",
- " target_embeddings = list(target_sets.values())\n",
- " attribute_embeddings = list(attribute_sets.values())\n",
- "\n",
- " \n",
- " \"\"\"\n",
- " # From here, the code can vary quite a bit depending on what you need.\n",
- " # It is recommended to calculate the metric operations in another method(s).\n",
- " results = calc_metric() \n",
- " \n",
- " # The final step is to return query and result. \n",
- " # You can return other scores, metrics by word or metrics by set, etc.\n",
- " return {\n",
- " 'query_name': query.query_name, # the name of the evaluated query\n",
- " 'result': results.metric, # the result of the metric\n",
- " 'em': results.metric # result of the calculated metric (recommended)\n",
- " 'other_metric' : results.other_metric # Another metric calculated (optional)\n",
- " 'another_results' : results.details_by_set # if available, values by word (optional),\n",
- " ...\n",
- " }\n",
- " \"\"\"\n"
- ],
- "outputs": [],
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-12-29T18:24:32.324424Z",
- "start_time": "2020-12-29T18:24:32.315426Z"
- }
- }
- },
- {
- "cell_type": "markdown",
- "source": [
- "This is what the transformed :code:`target_embeddings_dict` would look like:"
- ],
- "metadata": {}
- },
- {
- "cell_type": "markdown",
- "source": [
- "## Implement the logic of the metric\n",
- "\n",
- "\n",
- "Suppose we want to implement an extremely simple three-step metric, where:\n",
- "\n",
- "1. We calculate the average of all the sets,\n",
- "2. Then, calculate the cosine distance between the target set averages and the \n",
- " attribute average.\n",
- "3. Subtract these distances.\n",
- "\n",
- "To do this, we create a new method :code:`_calc_metric` in which, using\n",
- "the array of embedding dict objects as input, we will implement the above."
- ],
- "metadata": {}
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "source": [
- "from typing import Any, Dict, Union, List, Callable\n",
- "\n",
- "from scipy.spatial import distance\n",
- "import numpy as np\n",
- "\n",
- "from wefe.metrics import BaseMetric\n",
- "from wefe.query import Query\n",
- "from wefe.word_embedding_model import WordEmbeddingModel\n",
- "from wefe.preprocessing import get_embeddings_from_query\n",
- "\n",
- "class ExampleMetric(BaseMetric):\n",
- "\n",
- " # replace with the parameters of your metric\n",
- " metric_template = (\n",
- " 2, 1\n",
- " ) # cardinalities of the targets and attributes sets that your metric will accept.\n",
- " metric_name = 'Example Metric'\n",
- " metric_short_name = 'EM'\n",
- "\n",
- " def _calc_metric(self, target_embeddings, attribute_embeddings):\n",
- " \"\"\"Calculates the metric.\n",
- "\n",
- " Parameters\n",
- " ----------\n",
- " target_embeddings : np.array\n",
- " An array with dicts. Each dict represents an target set. \n",
- " A dict is composed with a word and its embedding as key, value respectively.\n",
- " attribute_embeddings : np.array\n",
- " An array with dicts. Each dict represents an attribute set. \n",
- " A dict is composed with a word and its embedding as key, value respectively.\n",
- "\n",
- " Returns\n",
- " -------\n",
- " np.float\n",
- " The value of the calculated metric.\n",
- " \"\"\"\n",
- "\n",
- " # get the embeddings from the dicts\n",
- " target_embeddings_0 = np.array(list(target_embeddings[0].values()))\n",
- " target_embeddings_1 = np.array(list(target_embeddings[1].values()))\n",
- "\n",
- " attribute_embeddings_0 = np.array(\n",
- " list(attribute_embeddings[0].values()))\n",
- "\n",
- " # calculate the average embedding by target and attribute set.\n",
- " target_embeddings_0_avg = np.mean(target_embeddings_0, axis=0)\n",
- " target_embeddings_1_avg = np.mean(target_embeddings_1, axis=0)\n",
- " attribute_embeddings_0_avg = np.mean(attribute_embeddings_0, axis=0)\n",
- "\n",
- " # calculate the distances between the target sets and the attribute set\n",
- " dist_target_0_attr = distance.cosine(target_embeddings_0_avg,\n",
- " attribute_embeddings_0_avg)\n",
- " dist_target_1_attr = distance.cosine(target_embeddings_1_avg,\n",
- " attribute_embeddings_0_avg)\n",
- "\n",
- " # subtract the distances\n",
- " metric_result = dist_target_0_attr - dist_target_1_attr\n",
- " return metric_result\n",
- "\n",
- " def run_query(\n",
- " self,\n",
- " query: Query,\n",
- " model: WordEmbeddingModel,\n",
- " lost_vocabulary_threshold: float = 0.2,\n",
- " preprocessors: List[Dict[str, Union[str, bool, Callable]]] = [{}],\n",
- " strategy: str = \"first\",\n",
- " normalize: bool = False,\n",
- " warn_not_found_words: bool = False,\n",
- " *args: Any,\n",
- " **kwargs: Any,\n",
- " ) -> Dict[str, Any]:\n",
- " \"\"\"Calculate the Example Metric metric over the provided parameters.\n",
- "\n",
- " Parameters\n",
- " ----------\n",
- " query : Query\n",
- " A Query object that contains the target and attribute word sets to \n",
- " be tested.\n",
- "\n",
- " word_embedding : WordEmbeddingModel\n",
- " A WordEmbeddingModel object that contains certain word embedding \n",
- " pretrained model.\n",
- " \n",
- " lost_vocabulary_threshold : float, optional\n",
- " Specifies the proportional limit of words that any set of the query is\n",
- " allowed to lose when transforming its words into embeddings.\n",
- " In the case that any set of the query loses proportionally more words\n",
- " than this limit, the result values will be np.nan, by default 0.2\n",
- "\n",
- " preprocessors : List[Dict[str, Union[str, bool, Callable]]]\n",
- " A list with preprocessor options.\n",
- "\n",
- " A ``preprocessor`` is a dictionary that specifies what processing(s) are\n",
- " performed on each word before its looked up in the model vocabulary.\n",
- " For example, the ``preprocessor``\n",
- " ``{'lowecase': True, 'strip_accents': True}`` allows you to lowercase\n",
- " and remove the accent from each word before searching for them in the\n",
- " model vocabulary. Note that an empty dictionary ``{}`` indicates that no\n",
- " preprocessing is done.\n",
- "\n",
- " The possible options for a preprocessor are:\n",
- "\n",
- " * ``lowercase``: ``bool``. Indicates that the words are transformed to\n",
- " lowercase.\n",
- " * ``uppercase``: ``bool``. Indicates that the words are transformed to\n",
- " uppercase.\n",
- " * ``titlecase``: ``bool``. Indicates that the words are transformed to\n",
- " titlecase.\n",
- " * ``strip_accents``: ``bool``, ``{'ascii', 'unicode'}``: Specifies that\n",
- " the accents of the words are eliminated. The stripping type can be\n",
- " specified. True uses ‘unicode’ by default.\n",
- " * ``preprocessor``: ``Callable``. It receives a function that operates\n",
- " on each word. In the case of specifying a function, it overrides the\n",
- " default preprocessor (i.e., the previous options stop working).\n",
- "\n",
- " A list of preprocessor options allows searching for several\n",
- " variants of the words into the model. For example, the preprocessors\n",
- " ``[{}, {\"lowercase\": True, \"strip_accents\": True}]``\n",
- " ``{}`` allows first searching for the original words in the vocabulary of the model. \n",
- " In case some of them are not found, ``{\"lowercase\": True, \"strip_accents\": True}`` \n",
- " is executed on these words and then they are searched in the model vocabulary.\n",
- "\n",
- " strategy : str, optional\n",
- " The strategy indicates how it will use the preprocessed words: 'first' will\n",
- " include only the first transformed word found. all' will include all\n",
- " transformed words found, by default \"first\".\n",
- "\n",
- " normalize : bool, optional\n",
- " True indicates that embeddings will be normalized, by default False\n",
- "\n",
- " warn_not_found_words : bool, optional\n",
- " Specifies if the function will warn (in the logger)\n",
- " the words that were not found in the model's vocabulary\n",
- " , by default False.\n",
- "\n",
- " Returns\n",
- " -------\n",
- " Dict[str, Any]\n",
- " A dictionary with the query name, the resulting score of the metric, \n",
- " and other scores.\n",
- " \"\"\"\n",
- " # check the types of the provided arguments (only the defaults).\n",
- " self._check_input(query, model)\n",
- "\n",
- " # transform query word sets into embeddings\n",
- " embeddings = get_embeddings_from_query(\n",
- " model=model,\n",
- " query=query,\n",
- " lost_vocabulary_threshold=lost_vocabulary_threshold,\n",
- " preprocessors=preprocessors,\n",
- " strategy=strategy,\n",
- " normalize=normalize,\n",
- " warn_not_found_words=warn_not_found_words,\n",
- " )\n",
- "\n",
- " # if there is any/some set has less words than the allowed limit,\n",
- " # return the default value (nan)\n",
- " if embeddings is None:\n",
- " return {\n",
- " 'query_name': query.query_name, # the name of the evaluated query\n",
- " 'result': np.nan, # the result of the metric\n",
- " 'em': np.nan, # result of the calculated metric (recommended)\n",
- " 'other_metric' : np.nan, # another metric calculated (optional)\n",
- " 'results_by_word' : np.nan, # if available, values by word (optional)\n",
- " # ...\n",
- " }\n",
- "\n",
- " # get the targets and attribute sets transformed into embeddings.\n",
- " target_sets, attribute_sets = embeddings\n",
- "\n",
- " # commonly, you only will need the embeddings of the sets.\n",
- " # this can be obtained by using:\n",
- " target_embeddings = list(target_sets.values())\n",
- " attribute_embeddings = list(attribute_sets.values())\n",
- "\n",
- " result = self._calc_metric(target_embeddings, attribute_embeddings)\n",
- "\n",
- " # return the results.\n",
- " return {\"query_name\": query.query_name, \"result\": result, 'em': result}"
- ],
- "outputs": [],
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-12-29T18:24:32.343454Z",
- "start_time": "2020-12-29T18:24:32.326425Z"
- }
- }
- },
- {
- "cell_type": "markdown",
- "source": [
- "Now, let's try it out:"
- ],
- "metadata": {}
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "source": [
- "from wefe.query import Query\n",
- "from wefe.utils import load_test_model # a few embeddings of WEAT experiments\n",
- "from wefe.datasets.datasets import load_weat # the word sets of WEAT experiments\n",
- "\n",
- "weat = load_weat()\n",
- "model = load_test_model()\n",
- "\n",
- "flowers = weat['flowers']\n",
- "weapons = weat['weapons']\n",
- "pleasant = weat['pleasant_5']\n",
- "query = Query([flowers, weapons], [pleasant], ['Flowers', 'Weapons'],\n",
- " ['Pleasant'])\n",
- "\n",
- "\n",
- "results = ExampleMetric().run_query(query, model)\n",
- "print(results)"
- ],
- "outputs": [
- {
- "output_type": "stream",
- "name": "stderr",
- "text": [
- "INFO:gensim.utils:loading KeyedVectors object from /home/pablo/wefe/wefe/datasets/data/test_model.kv\n",
- "DEBUG:smart_open.smart_open_lib:{'uri': '/home/pablo/wefe/wefe/datasets/data/test_model.kv', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'transport_params': None}\n",
- "INFO:gensim.utils:setting ignored attribute vectors_norm to None\n",
- "DEBUG:gensim.utils:starting a new internal lifecycle event log for KeyedVectors\n",
- "INFO:gensim.utils:KeyedVectors lifecycle event {'fname': '/home/pablo/wefe/wefe/datasets/data/test_model.kv', 'datetime': '2021-08-13T11:43:41.890605', 'gensim': '4.0.1', 'python': '3.7.10 (default, Jun 4 2021, 14:48:32) \\n[GCC 7.5.0]', 'platform': 'Linux-5.10.16.3-microsoft-standard-WSL2-x86_64-with-debian-bullseye-sid', 'event': 'loaded'}\n"
- ]
- },
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "{'query_name': 'Flowers and Weapons wrt Pleasant', 'result': -0.10210171341896057, 'em': -0.10210171341896057}\n"
- ]
- }
- ],
- "metadata": {
- "ExecuteTime": {
- "end_time": "2020-12-29T18:24:32.475099Z",
- "start_time": "2020-12-29T18:24:32.345453Z"
- }
- }
- },
- {
- "cell_type": "markdown",
- "source": [
- "\n",
- "We have completely defined a new metric.\n",
- "Congratulations!\n",
- "\n",
- "\n",
- "**Note**\n",
- "\n",
- "Some comments regarding the implementation of new metrics:\n",
- "\n",
- "- Note that the returned object must necessarily be a ``dict`` instance \n",
- " containing the ``result`` and ``query_name`` key-values. Otherwise you \n",
- " will not be able to run query batches using utility functions like \n",
- " ``run_queries``.\n",
- "- ``run_query`` can receive additional parameters. Simply add them to the \n",
- " function signature. These parameters can also be used when running the \n",
- " metric from the ``run_queries`` utility function.\n",
- "- We recommend implementing the logic of the metric separated from the \n",
- " ``run_query`` function. In other words, implement the logic in a \n",
- " ``calc_your_metric`` function that receives the dictionaries with the \n",
- " necessary embeddings and parameters.\n",
- "- The file where ``ExampleMetric`` is located can be found inside the \n",
- " distances folder of the [repository](https://github.com/dccuchile/wefe/blob/master/wefe/metrics/example_metric.py/)\n",
- "\n"
- ],
- "metadata": {}
- },
- {
- "cell_type": "markdown",
- "source": [
- "# Mitigation Method Implementation Guide"
- ],
- "metadata": {}
- }
- ],
- "metadata": {
- "kernelspec": {
- "name": "python3",
- "display_name": "Python 3.7.10 64-bit ('wefe': conda)"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.10"
- },
- "toc": {
- "base_numbering": 1,
- "nav_menu": {},
- "number_sections": true,
- "sideBar": true,
- "skip_h1_title": false,
- "title_cell": "Table of Contents",
- "title_sidebar": "Contents",
- "toc_cell": false,
- "toc_position": {},
- "toc_section_display": true,
- "toc_window_display": false
- },
- "varInspector": {
- "cols": {
- "lenName": 16,
- "lenType": 16,
- "lenVar": 40
- },
- "kernels_config": {
- "python": {
- "delete_cmd_postfix": "",
- "delete_cmd_prefix": "del ",
- "library": "var_list.py",
- "varRefreshCmd": "print(var_dic_list())"
- },
- "r": {
- "delete_cmd_postfix": ") ",
- "delete_cmd_prefix": "rm(",
- "library": "var_list.r",
- "varRefreshCmd": "cat(var_dic_list()) "
- }
- },
- "types_to_exclude": [
- "module",
- "function",
- "builtin_function_or_method",
- "instance",
- "_Feature"
- ],
- "window_display": false
- },
- "interpreter": {
- "hash": "d84ebd0cc0ccc3ceb6e99e895c29aecce94e907fb1f0ef7f0ceb697eb7e914cf"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
+ "nbformat": 4,
+ "nbformat_minor": 4
}
\ No newline at end of file
diff --git a/examples/measurement_user_guide.ipynb b/examples/measurement_user_guide.ipynb
new file mode 100644
index 0000000..e22d29d
--- /dev/null
+++ b/examples/measurement_user_guide.ipynb
@@ -0,0 +1,2128 @@
+{
+ "cells": [
+ {
+ "cell_type": "raw",
+ "metadata": {},
+ "source": [
+ ".. _bias measurement:\n",
+ "\n",
+ "Bias Measurement\n",
+ "================\n",
+ "\n",
+ "The following guide is designed to present the more general details on\n",
+ "using the package to measure bias. The following sections show:\n",
+ "\n",
+ "* how to run a simple query using a ``Glove`` embedding model.\n",
+ "* how to run multiple queries on multiple embeddings.\n",
+ "* how to compare the results obtained from running multiple\n",
+ " sets of queries on multiple embeddings using different metrics\n",
+ " through ranking calculation.\n",
+ "* how to calculate the correlations between the\n",
+ " rankings obtained.\n",
+ "\n",
+ ".. warning::\n",
+ "\n",
+ " To accurately study and reduce biases contained in word embeddings, queries may\n",
+ " contain words that could be offensive to certain groups or individuals.\n",
+ " The relationships studied between these words DO NOT represent the\n",
+ " ideas, thoughts or beliefs of the authors of this library. \n",
+ " This warning applies to all documentation.\n",
+ "\n",
+ ".. note::\n",
+ "\n",
+ " If you are not familiar with the concepts of query, target and attribute\n",
+ " set, please visit the :ref:`measurement framework`\n",
+ " on the library’s conceptual guides. These concepts are widely used in the\n",
+ " following sections.\n",
+ "\n",
+ ".. note::\n",
+ "\n",
+ " For a list of metrics implemented in WEFE, refer to the\n",
+ " :ref:`metrics section` of the API reference. \n"
+ ]
+ },
+ {
+ "cell_type": "raw",
+ "metadata": {},
+ "source": [
+ "Run a Query\n",
+ "-----------\n",
+ "\n",
+ "The following subsections explain how to run a simple query that\n",
+ "measures gender bias on\n",
+ "`Glove <https://nlp.stanford.edu/projects/glove/>`_. The example uses\n",
+ "the Word Embedding Association Test (:class:`~wefe.metrics.WEAT.WEAT`) metric\n",
+ "to quantify the bias in the embedding model. Below we show the three usual steps for\n",
+ "performing a query in WEFE:\n",
+ "\n",
+ ".. note::\n",
+ "\n",
+ " :class:`~wefe.metrics.WEAT.WEAT` is a fairness metric that quantifies the relationship\n",
+ " between two sets of target words (sets of words intended to denote\n",
+ " social groups such as men and women) and two sets of attribute words (sets of words\n",
+ " representing some attitude, characteristic, trait, occupational field,\n",
+ " etc. that can be associated with individuals from any social group). \n",
+ "\n",
+ " The closer its value is to 0, the less biased the model is. \n",
+ "\n",
+ " Visit the metrics documentation (:class:`~wefe.metrics.WEAT.WEAT`) for more information.\n"
+ ]
+ },
+ {
+ "cell_type": "raw",
+ "metadata": {},
+ "source": [
+ "Load a word embeddings model as a ``WordEmbeddingModel`` object\n",
+ "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n",
+ "\n",
+ "Load the word embedding model and then wrap it using a\n",
+ ":class:`~wefe.word_embedding_model.WordEmbeddingModel` (class that allows WEFE to handle the models).\n",
+ "\n",
+ "WEFE bases all its operations on word embeddings using Gensim’s\n",
+ "``KeyedVectors`` interface. Any model that can be loaded using\n",
+ "``KeyedVectors`` will be compatible with WEFE. The following example uses a 25-dim ``Glove`` model pre-trained on a\n",
+ "Twitter dataset, loaded using `gensim-data <https://github.com/RaRe-Technologies/gensim-data>`_."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import plotly.io as pio\n",
+ "png_renderer = pio.renderers[\"png\"]\n",
+ "png_renderer.scale = 2\n",
+ "\n",
+ "pio.renderers.default = \"png\"\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Renderers configuration\n",
+ "-----------------------\n",
+ " Default renderer: 'png'\n",
+ " Available renderers:\n",
+ " ['plotly_mimetype', 'jupyterlab', 'nteract', 'vscode',\n",
+ " 'notebook', 'notebook_connected', 'kaggle', 'azure', 'colab',\n",
+ " 'cocalc', 'databricks', 'json', 'png', 'jpeg', 'jpg', 'svg',\n",
+ " 'pdf', 'browser', 'firefox', 'chrome', 'chromium', 'iframe',\n",
+ " 'iframe_connected', 'sphinx_gallery', 'sphinx_gallery_png']"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pio.renderers"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import gensim.downloader as api\n",
+ "\n",
+ "from wefe.datasets import load_weat\n",
+ "from wefe.metrics import WEAT\n",
+ "from wefe.query import Query\n",
+ "from wefe.word_embedding_model import WordEmbeddingModel\n",
+ "\n",
+ "twitter_25 = api.load(\"glove-twitter-25\")\n",
+ "# WordEmbeddingModel receives as first argument a KeyedVectors model\n",
+ "# and the second argument the model name.\n",
+ "model = WordEmbeddingModel(twitter_25, \"glove twitter dim=25\")"
+ ]
+ },
+ {
+ "cell_type": "raw",
+ "metadata": {},
+ "source": [
+ "Create the query using a ``Query`` object\n",
+ "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n",
+ "\n",
+ "Define the target and attribute word sets and create a :class:`~wefe.query.Query` object\n",
+ "that contains them.\n",
+ "\n",
+ "For this initial example, a query is used to study the association\n",
+ "between gender with respect to family and career. The words used are\n",
+ "taken from the set of words used in the *Semantics derived automatically\n",
+ "from language corpora contain human-like biases* paper, which are\n",
+ "included in the ``datasets`` module."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "gender_query = Query(\n",
+ " target_sets=[\n",
+ " [\"female\", \"woman\", \"girl\", \"sister\", \"she\", \"her\", \"hers\", \"daughter\"],\n",
+ " [\"male\", \"man\", \"boy\", \"brother\", \"he\", \"him\", \"his\", \"son\"],\n",
+ " ],\n",
+ " attribute_sets=[\n",
+ " [\n",
+ " \"home\",\n",
+ " \"parents\",\n",
+ " \"children\",\n",
+ " \"family\",\n",
+ " \"cousins\",\n",
+ " \"marriage\",\n",
+ " \"wedding\",\n",
+ " \"relatives\",\n",
+ " ],\n",
+ " [\n",
+ " \"executive\",\n",
+ " \"management\",\n",
+ " \"professional\",\n",
+ " \"corporation\",\n",
+ " \"salary\",\n",
+ " \"office\",\n",
+ " \"business\",\n",
+ " \"career\",\n",
+ " ],\n",
+ " ],\n",
+ " target_sets_names=[\"Female terms\", \"Male Terms\"],\n",
+ " attribute_sets_names=[\"Family\", \"Careers\"],\n",
+ ")\n",
+ "\n",
+ "gender_query\n"
+ ]
+ },
+ {
+ "cell_type": "raw",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "Run the Query\n",
+ "~~~~~~~~~~~~~\n",
+ "\n",
+ "Instantiate the metric that you will use and then execute ``run_query``\n",
+ "with the parameters created in the previous steps.\n",
+ "\n",
+ "Any bias measurement process at WEFE consists of the following steps:\n",
+ "\n",
+ "1. Metric arguments checking.\n",
+ "2. Transform the word sets into word embeddings.\n",
+ "3. Calculate the metric.\n",
+ "\n",
+ "In this case we use the :class:`~wefe.metrics.WEAT.WEAT` metric (proposed in the\n",
+ "same paper of the set of words used in the query)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'query_name': 'Female terms and Male Terms wrt Family and Careers',\n",
+ " 'result': 0.31658412935212255,\n",
+ " 'weat': 0.31658412935212255,\n",
+ " 'effect_size': 0.6779439085309583,\n",
+ " 'p_value': nan}"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "metric = WEAT()\n",
+ "result = metric.run_query(gender_query, model)\n",
+ "result\n"
+ ]
+ },
+ {
+ "cell_type": "raw",
+ "metadata": {},
+ "source": [
+ "By default, the results are a ``dict`` containing the query name (in the\n",
+ "key ``query_name``) and the calculated value of the metric in the\n",
+ "``result`` key. It also contains a key with the name and the value of\n",
+ "the calculated metric (which is duplicated in the ``result`` key).\n",
+ "\n",
+ "Depending on the metric class used, the result ``dict`` can also return\n",
+ "more metrics, detailed word-by-word values or other statistics like\n",
+ "p-values. Also some metrics allow you to change the default value in\n",
+ "results.\n",
+ "\n",
+ "Details of all the metrics implemented, their parameters and\n",
+ "examples of execution can be found at :ref:`metrics section`."
+ ]
+ },
+ {
+ "cell_type": "raw",
+ "metadata": {},
+ "source": [
+ "Run Query Arguments\n",
+ "-------------------\n",
+ "\n",
+ "Each metric allows varying the behavior of ``run_query`` according to\n",
+ "different parameters. There are parameters to customize the\n",
+ "transformation of the sets of words to sets of embeddings, others to\n",
+ "warn about errors or modify which calculation method the metric uses.\n",
+ "\n",
+ ".. note::\n",
+ "\n",
+ " Each metric implements the ``run_query`` method with different arguments. \n",
+ " Visit their API documentation for more information.\n",
+ "\n",
+ "\n",
+ "For example, ``run_query`` can be instructed to return ``effect_size``\n",
+ "in the ``result`` key by setting ``return_effect_size`` as ``True``.\n",
+ "Note that this parameter is only available in the class :class:`~wefe.metrics.WEAT.WEAT`.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'query_name': 'Female terms and Male Terms wrt Family and Careers',\n",
+ " 'result': 0.6779439085309583,\n",
+ " 'weat': 0.31658412935212255,\n",
+ " 'effect_size': 0.6779439085309583,\n",
+ " 'p_value': nan}"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "weat = WEAT()\n",
+ "result = weat.run_query(gender_query, model, return_effect_size=True)\n",
+ "result\n"
+ ]
+ },
+ {
+ "cell_type": "raw",
+ "metadata": {},
+ "source": [
+ "You can also request ``run_query`` to run the statistical significance\n",
+ "calculation by setting ``calculate_p_value`` as ``True``. This checks\n",
+ "how many queries generated from permutations (controlled by the\n",
+ "parameter ``p_value_iterations``) of the target sets obtain values\n",
+ "greater than those obtained by the original query."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'query_name': 'Female terms and Male Terms wrt Family and Careers',\n",
+ " 'result': 0.31658412935212255,\n",
+ " 'weat': 0.31658412935212255,\n",
+ " 'effect_size': 0.6779439085309583,\n",
+ " 'p_value': 0.08418316336732654}"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "weat = WEAT()\n",
+ "result = weat.run_query(\n",
+ " gender_query, model, calculate_p_value=True, p_value_iterations=5000\n",
+ ")\n",
+ "result"
+ ]
+ },
+ {
+ "cell_type": "raw",
+ "metadata": {},
+ "source": [
+ "Out of Vocabulary Words and Word Preprocessors\n",
+ "----------------------------------------------\n",
+ "\n",
+ "It is common in the literature to find bias tests whose target sets are\n",
+ "common names of social groups. These names are commonly cased and may\n",
+ "contain special characters. There are several embedding models whose\n",
+ "words are not cased or do not have accents or other special characters,\n",
+ "as for example, in ``Glove``. This implies that a query with target sets\n",
+ "composed by names executed in ``Glove`` (without any preprocessing of\n",
+ "the words) could produce erroneous results because WEFE will not be able\n",
+ "to find the names in the model vocabulary.\n",
+ "\n",
+ ".. note::\n",
+ "\n",
+ " Some well-known word sets are already provided by the package and can be\n",
+ " easily loaded by the user through the :ref:`datasets` module. From here on,\n",
+ " the tutorial uses the words defined in the study *Semantics derived\n",
+ " automatically from language corpora contain human-like biases*, the same\n",
+ " that proposed the :class:`~wefe.metrics.WEAT.WEAT` metric.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['Adam', 'Harry', 'Josh', 'Roger', 'Alan', 'Frank', 'Justin', 'Ryan', 'Andrew', 'Jack', 'Matthew', 'Stephen', 'Brad', 'Greg', 'Paul', 'Jonathan', 'Peter', 'Amanda', 'Courtney', 'Heather', 'Melanie', 'Sara', 'Amber', 'Katie', 'Betsy', 'Kristin', 'Nancy', 'Stephanie', 'Ellen', 'Lauren', 'Colleen', 'Emily', 'Megan', 'Rachel']\n"
+ ]
+ }
+ ],
+ "source": [
+ "# load the weat word sets.\n",
+ "word_sets = load_weat()\n",
+ "\n",
+ "# print a set of european american common names.\n",
+ "print(word_sets[\"european_american_names_5\"])"
+ ]
+ },
+ {
+ "cell_type": "raw",
+ "metadata": {},
+ "source": [
+ "The following query compares European-American and African-American\n",
+ "names with respect to pleasant and unpleasant attributes.\n",
+ "\n",
+ ".. note::\n",
+ "\n",
+ " ``run_query`` can be instructed to log the words that were lost in\n",
+ " the transformation to vectors by setting the parameter\n",
+ " ``warn_not_found_words`` to ``True``."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING:root:The following words from set 'European american names' do not exist within the vocabulary of glove twitter dim=25: ['Adam', 'Harry', 'Josh', 'Roger', 'Alan', 'Frank', 'Justin', 'Ryan', 'Andrew', 'Jack', 'Matthew', 'Stephen', 'Brad', 'Greg', 'Paul', 'Jonathan', 'Peter', 'Amanda', 'Courtney', 'Heather', 'Melanie', 'Sara', 'Amber', 'Katie', 'Betsy', 'Kristin', 'Nancy', 'Stephanie', 'Ellen', 'Lauren', 'Colleen', 'Emily', 'Megan', 'Rachel']\n",
+ "WARNING:root:The transformation of 'European american names' into glove twitter dim=25 embeddings lost proportionally more words than specified in 'lost_words_threshold': 1.0 lost with respect to 0.2 maximum loss allowed.\n",
+ "WARNING:root:The following words from set 'African american names' do not exist within the vocabulary of glove twitter dim=25: ['Alonzo', 'Jamel', 'Theo', 'Alphonse', 'Jerome', 'Leroy', 'Torrance', 'Darnell', 'Lamar', 'Lionel', 'Tyree', 'Deion', 'Lamont', 'Malik', 'Terrence', 'Tyrone', 'Lavon', 'Marcellus', 'Wardell', 'Nichelle', 'Shereen', 'Ebony', 'Latisha', 'Shaniqua', 'Jasmine', 'Tanisha', 'Tia', 'Lakisha', 'Latoya', 'Yolanda', 'Malika', 'Yvette']\n",
+ "WARNING:root:The transformation of 'African american names' into glove twitter dim=25 embeddings lost proportionally more words than specified in 'lost_words_threshold': 1.0 lost with respect to 0.2 maximum loss allowed.\n",
+ "ERROR:root:At least one set of 'European american names and African american names wrt Pleasant and Unpleasant' query has proportionally fewer embeddings than allowed by the lost_vocabulary_threshold parameter (0.2). This query will return np.nan.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "{'query_name': 'European american names and African american names wrt Pleasant and Unpleasant',\n",
+ " 'result': nan,\n",
+ " 'weat': nan,\n",
+ " 'effect_size': nan}"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ethnicity_query = Query(\n",
+ " [word_sets[\"european_american_names_5\"], word_sets[\"african_american_names_5\"]],\n",
+ " [word_sets[\"pleasant_5\"], word_sets[\"unpleasant_5\"]],\n",
+ " [\"European american names\", \"African american names\"],\n",
+ " [\"Pleasant\", \"Unpleasant\"],\n",
+ ")\n",
+ "result = weat.run_query(ethnicity_query, model, warn_not_found_words=True,)\n",
+ "result\n"
+ ]
+ },
+ {
+ "cell_type": "raw",
+ "metadata": {},
+ "source": [
+ ".. warning::\n",
+ "\n",
+ " If more than 20% of the words from any of the word sets of the query are\n",
+ " lost during the transformation to embeddings, the result of the metric\n",
+ " will be ``np.nan``. This behavior can be changed using a float number\n",
+ " parameter called ``lost_vocabulary_threshold``."
+ ]
+ },
+ {
+ "cell_type": "raw",
+ "metadata": {},
+ "source": [
+ "Word Preprocessors\n",
+ "~~~~~~~~~~~~~~~~~~\n",
+ "\n",
+ "Any ``run_query`` method allows preprocessing each word before they are searched in the model's \n",
+ "vocabulary through the parameter ``preprocessors`` (list of one or more preprocessor).\n",
+ "This parameter accepts a list of individual preprocessors, which are defined below:\n",
+ "\n",
+ "A ``preprocessor`` is a dictionary that specifies what processing(s) are \n",
+ "performed on each word before it is looked up in the model vocabulary.\n",
+ "For example, the ``preprocessor``\n",
+ "``{'lowercase': True, 'strip_accents': True}`` allows you to lowercase\n",
+ "and remove the accent from each word before searching for them in the\n",
+ "model vocabulary. Note that an empty dictionary ``{}`` indicates that no\n",
+ "preprocessing is done.\n",
+ "\n",
+ "The possible options for a preprocessor are:\n",
+ "\n",
+ "- ``lowercase``: ``bool``. Indicates that the words are transformed to lowercase.\n",
+ "- ``uppercase``: ``bool``. Indicates that the words are transformed to uppercase.\n",
+ "- ``titlecase``: ``bool``. Indicates that the words are transformed to titlecase.\n",
+ "- ``strip_accents``: ``bool``, ``{'ascii', 'unicode'}``: Specifies that the accents of the words\n",
+ " are eliminated. The stripping type can be specified. True uses 'unicode' by default.\n",
+ "- ``preprocessor``: ``Callable``. It receives a function that operates on each word. \n",
+ " In the case of specifying a function, it overrides the default preprocessor \n",
+ " (i.e., the previous options stop working).\n",
+ "\n",
+ "\n",
+ "A list of preprocessor options allows searching for several\n",
+ "variants of the words in the model. For example, with the preprocessors\n",
+ "``[{}, {\"lowercase\": True, \"strip_accents\": True}]``,\n",
+ "the first preprocessor ``{}`` searches for the original words in the vocabulary of the model. \n",
+ "In case some of them are not found, ``{\"lowercase\": True, \"strip_accents\": True}`` \n",
+ "is applied to these words and then they are searched for in the model vocabulary.\n",
+ "\n",
+ "By default (in case there is more than one preprocessor in the list) the first \n",
+ "preprocessed word found in the embeddings model is used. \n",
+ "This behavior can be controlled by the ``strategy`` parameter of ``run_query``.\n",
+ "\n",
+ "In the following example, we provide a list with only one\n",
+ "preprocessor that instructs ``run_query`` to lowercase and remove all\n",
+ "accents from every word before they are searched in the embeddings\n",
+ "model.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING:root:The following words from set 'African american names' do not exist within the vocabulary of glove twitter dim=25: ['wardell']\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "{'query_name': 'European american names and African american names wrt Pleasant and Unpleasant',\n",
+ " 'result': 3.7529150679125456,\n",
+ " 'weat': 3.7529150679125456,\n",
+ " 'effect_size': 1.2746819330405683,\n",
+ " 'p_value': nan}"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "weat = WEAT()\n",
+ "result = weat.run_query(\n",
+ " ethnicity_query,\n",
+ " model,\n",
+ " preprocessors=[{\"lowercase\": True, \"strip_accents\": True}],\n",
+ " warn_not_found_words=True,\n",
+ ")\n",
+ "result"
+ ]
+ },
+ {
+ "cell_type": "raw",
+ "metadata": {},
+ "source": [
+ "It may happen that it is more important to find the original word and in\n",
+ "the case of not finding it, then preprocess it and look it up in the\n",
+ "vocabulary. This behavior can be specified in ``preprocessors`` list by\n",
+ "first specifying an empty preprocessor ``{}`` and then the preprocessor\n",
+ "that converts to lowercase and removes accents.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING:root:The following words from set 'European american names' do not exist within the vocabulary of glove twitter dim=25: ['Adam', 'Harry', 'Josh', 'Roger', 'Alan', 'Frank', 'Justin', 'Ryan', 'Andrew', 'Jack', 'Matthew', 'Stephen', 'Brad', 'Greg', 'Paul', 'Jonathan', 'Peter', 'Amanda', 'Courtney', 'Heather', 'Melanie', 'Sara', 'Amber', 'Katie', 'Betsy', 'Kristin', 'Nancy', 'Stephanie', 'Ellen', 'Lauren', 'Colleen', 'Emily', 'Megan', 'Rachel']\n",
+ "WARNING:root:The following words from set 'African american names' do not exist within the vocabulary of glove twitter dim=25: ['Alonzo', 'Jamel', 'Theo', 'Alphonse', 'Jerome', 'Leroy', 'Torrance', 'Darnell', 'Lamar', 'Lionel', 'Tyree', 'Deion', 'Lamont', 'Malik', 'Terrence', 'Tyrone', 'Lavon', 'Marcellus', 'Wardell', 'wardell', 'Nichelle', 'Shereen', 'Ebony', 'Latisha', 'Shaniqua', 'Jasmine', 'Tanisha', 'Tia', 'Lakisha', 'Latoya', 'Yolanda', 'Malika', 'Yvette']\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "{'query_name': 'European american names and African american names wrt Pleasant and Unpleasant',\n",
+ " 'result': 3.7529150679125456,\n",
+ " 'weat': 3.7529150679125456,\n",
+ " 'effect_size': 1.2746819330405683,\n",
+ " 'p_value': nan}"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "weat = WEAT()\n",
+ "result = weat.run_query(\n",
+ " ethnicity_query,\n",
+ " model,\n",
+ " preprocessors=[\n",
+ " {}, # empty preprocessor, search for the original words.\n",
+ " {\n",
+ " \"lowercase\": True,\n",
+ " \"strip_accents\": True,\n",
+ " }, # search for lowercase and no accent words.\n",
+ " ],\n",
+ " warn_not_found_words=True,\n",
+ ")\n",
+ "\n",
+ "result"
+ ]
+ },
+ {
+ "cell_type": "raw",
+ "metadata": {},
+ "source": [
+ "The number of preprocessing steps can be increased as needed. For\n",
+ "example, we can extend the above preprocessor to first search for the\n",
+ "original words, then for the lowercase words, and finally for the\n",
+ "lowercase words without accents.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING:root:The following words from set 'European american names' do not exist within the vocabulary of glove twitter dim=25: ['Adam', 'Harry', 'Josh', 'Roger', 'Alan', 'Frank', 'Justin', 'Ryan', 'Andrew', 'Jack', 'Matthew', 'Stephen', 'Brad', 'Greg', 'Paul', 'Jonathan', 'Peter', 'Amanda', 'Courtney', 'Heather', 'Melanie', 'Sara', 'Amber', 'Katie', 'Betsy', 'Kristin', 'Nancy', 'Stephanie', 'Ellen', 'Lauren', 'Colleen', 'Emily', 'Megan', 'Rachel']\n",
+ "WARNING:root:The following words from set 'African american names' do not exist within the vocabulary of glove twitter dim=25: ['Alonzo', 'Jamel', 'Theo', 'Alphonse', 'Jerome', 'Leroy', 'Torrance', 'Darnell', 'Lamar', 'Lionel', 'Tyree', 'Deion', 'Lamont', 'Malik', 'Terrence', 'Tyrone', 'Lavon', 'Marcellus', 'Wardell', 'wardell', 'wardell', 'Nichelle', 'Shereen', 'Ebony', 'Latisha', 'Shaniqua', 'Jasmine', 'Tanisha', 'Tia', 'Lakisha', 'Latoya', 'Yolanda', 'Malika', 'Yvette']\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "{'query_name': 'European american names and African american names wrt Pleasant and Unpleasant',\n",
+ " 'result': 3.7529150679125456,\n",
+ " 'weat': 3.7529150679125456,\n",
+ " 'effect_size': 1.2746819330405683,\n",
+ " 'p_value': nan}"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "weat = WEAT()\n",
+ "result = weat.run_query(\n",
+ " ethnicity_query,\n",
+ " model,\n",
+ " preprocessors=[\n",
+ " {}, # first step: empty preprocessor, search for the original words.\n",
+ " {\"lowercase\": True,}, # second step: search for lowercase.\n",
+ " {\n",
+ " \"lowercase\": True,\n",
+ " \"strip_accents\": True,\n",
+ " }, # third step: search for lowercase and no accent words.\n",
+ " ],\n",
+ " warn_not_found_words=True,\n",
+ ")\n",
+ "\n",
+ "result"
+ ]
+ },
+ {
+ "cell_type": "raw",
+ "metadata": {},
+ "source": [
+ "It is also possible to change the behavior of the search by including\n",
+ "not only the first word, but all the words generated by the\n",
+ "preprocessors. This can be controlled by specifying the parameter\n",
+ "``strategy=\"all\"``."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING:root:The following words from set 'European american names' do not exist within the vocabulary of glove twitter dim=25: ['Adam', 'ADAM', 'Harry', 'HARRY', 'Josh', 'JOSH', 'Roger', 'ROGER', 'Alan', 'ALAN', 'Frank', 'FRANK', 'Justin', 'JUSTIN', 'Ryan', 'RYAN', 'Andrew', 'ANDREW', 'Jack', 'JACK', 'Matthew', 'MATTHEW', 'Stephen', 'STEPHEN', 'Brad', 'BRAD', 'Greg', 'GREG', 'Paul', 'PAUL', 'Jonathan', 'JONATHAN', 'Peter', 'PETER', 'Amanda', 'AMANDA', 'Courtney', 'COURTNEY', 'Heather', 'HEATHER', 'Melanie', 'MELANIE', 'Sara', 'SARA', 'Amber', 'AMBER', 'Katie', 'KATIE', 'Betsy', 'BETSY', 'Kristin', 'KRISTIN', 'Nancy', 'NANCY', 'Stephanie', 'STEPHANIE', 'Ellen', 'ELLEN', 'Lauren', 'LAUREN', 'Colleen', 'COLLEEN', 'Emily', 'EMILY', 'Megan', 'MEGAN', 'Rachel', 'RACHEL']\n",
+ "WARNING:root:The following words from set 'African american names' do not exist within the vocabulary of glove twitter dim=25: ['Alonzo', 'ALONZO', 'Jamel', 'JAMEL', 'Theo', 'THEO', 'Alphonse', 'ALPHONSE', 'Jerome', 'JEROME', 'Leroy', 'LEROY', 'Torrance', 'TORRANCE', 'Darnell', 'DARNELL', 'Lamar', 'LAMAR', 'Lionel', 'LIONEL', 'Tyree', 'TYREE', 'Deion', 'DEION', 'Lamont', 'LAMONT', 'Malik', 'MALIK', 'Terrence', 'TERRENCE', 'Tyrone', 'TYRONE', 'Lavon', 'LAVON', 'Marcellus', 'MARCELLUS', 'Wardell', 'wardell', 'WARDELL', 'Nichelle', 'NICHELLE', 'Shereen', 'SHEREEN', 'Ebony', 'EBONY', 'Latisha', 'LATISHA', 'Shaniqua', 'SHANIQUA', 'Jasmine', 'JASMINE', 'Tanisha', 'TANISHA', 'Tia', 'TIA', 'Lakisha', 'LAKISHA', 'Latoya', 'LATOYA', 'Yolanda', 'YOLANDA', 'Malika', 'MALIKA', 'Yvette', 'YVETTE']\n",
+ "WARNING:root:The following words from set 'Pleasant' do not exist within the vocabulary of glove twitter dim=25: ['CARESS', 'FREEDOM', 'HEALTH', 'LOVE', 'PEACE', 'CHEER', 'FRIEND', 'HEAVEN', 'LOYAL', 'PLEASURE', 'DIAMOND', 'GENTLE', 'HONEST', 'LUCKY', 'RAINBOW', 'DIPLOMA', 'GIFT', 'HONOR', 'MIRACLE', 'SUNRISE', 'FAMILY', 'HAPPY', 'LAUGHTER', 'PARADISE', 'VACATION']\n",
+ "WARNING:root:The following words from set 'Unpleasant' do not exist within the vocabulary of glove twitter dim=25: ['ABUSE', 'CRASH', 'FILTH', 'MURDER', 'SICKNESS', 'ACCIDENT', 'DEATH', 'GRIEF', 'POISON', 'STINK', 'ASSAULT', 'DISASTER', 'HATRED', 'POLLUTE', 'TRAGEDY', 'DIVORCE', 'JAIL', 'POVERTY', 'UGLY', 'CANCER', 'KILL', 'ROTTEN', 'VOMIT', 'AGONY', 'PRISON']\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "{'query_name': 'European american names and African american names wrt Pleasant and Unpleasant',\n",
+ " 'result': 3.7529150679125456,\n",
+ " 'weat': 3.7529150679125456,\n",
+ " 'effect_size': 1.2746819330405683,\n",
+ " 'p_value': nan}"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "weat = WEAT()\n",
+ "result = weat.run_query(\n",
+ " ethnicity_query,\n",
+ " model,\n",
+ " preprocessors=[\n",
+ " {}, # first step: empty preprocessor, search for the original words.\n",
+ " {\"lowercase\": True,}, # second step: search for lowercase .\n",
+ " {\"uppercase\": True,}, # third step: search for uppercase.\n",
+ " ],\n",
+ " strategy=\"all\",\n",
+ " warn_not_found_words=True,\n",
+ ")\n",
+ "\n",
+ "result\n"
+ ]
+ },
+ {
+ "cell_type": "raw",
+ "metadata": {},
+ "source": [
+ "Running Multiple Queries\n",
+ "------------------------\n",
+ "\n",
+ "It is usual to want to test many queries of some bias criterion (gender,\n",
+ "ethnicity, religion, politics, socioeconomic, among others) on several\n",
+ "models at the same time. Trying to use ``run_query`` on each pair\n",
+ "embedding-query can be a bit complex and could require extra work to\n",
+ "implement.\n",
+ "\n",
+ "This is why WEFE also implements a function to test multiple\n",
+ "queries on various word embedding models in a single call: the\n",
+ ":func:`~wefe.utils.run_queries` util.\n",
+ "\n",
+ "The following code shows how to run various gender queries on ``Glove``\n",
+ "embedding models with different dimensions trained from the Twitter\n",
+ "dataset. The queries are executed using :class:`~wefe.metrics.WEAT.WEAT` metric."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import gensim.downloader as api\n",
+ "\n",
+ "from wefe.datasets import load_weat\n",
+ "from wefe.metrics import RNSB, WEAT\n",
+ "from wefe.query import Query\n",
+ "from wefe.utils import run_queries\n",
+ "from wefe.word_embedding_model import WordEmbeddingModel"
+ ]
+ },
+ {
+ "cell_type": "raw",
+ "metadata": {},
+ "source": [
+ "Load the models\n",
+ "~~~~~~~~~~~~~~~\n",
+ "\n",
+ "Load three different Glove Twitter embedding models. These models were\n",
+ "trained using the same dataset varying the number of embedding\n",
+ "dimensions."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model_1 = WordEmbeddingModel(api.load(\"glove-twitter-25\"), \"glove twitter dim=25\")\n",
+ "model_2 = WordEmbeddingModel(api.load(\"glove-twitter-50\"), \"glove twitter dim=50\")\n",
+ "model_3 = WordEmbeddingModel(api.load(\"glove-twitter-100\"), \"glove twitter dim=100\")\n",
+ "\n",
+ "models = [model_1, model_2, model_3]\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "raw",
+ "metadata": {},
+ "source": [
+ "Load the word sets and create the queries\n",
+ "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n",
+ "\n",
+ "Now, we load the :class:`~wefe.metrics.WEAT.WEAT` word set and create three queries. The\n",
+ "three queries are intended to measure gender bias.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load the WEAT word sets\n",
+ "word_sets = load_weat()\n",
+ "\n",
+ "# Create gender queries\n",
+ "gender_query_1 = Query(\n",
+ " [word_sets[\"male_terms\"], word_sets[\"female_terms\"]],\n",
+ " [word_sets[\"career\"], word_sets[\"family\"]],\n",
+ " [\"Male terms\", \"Female terms\"],\n",
+ " [\"Career\", \"Family\"],\n",
+ ")\n",
+ "\n",
+ "gender_query_2 = Query(\n",
+ " [word_sets[\"male_terms\"], word_sets[\"female_terms\"]],\n",
+ " [word_sets[\"science\"], word_sets[\"arts\"]],\n",
+ " [\"Male terms\", \"Female terms\"],\n",
+ " [\"Science\", \"Arts\"],\n",
+ ")\n",
+ "\n",
+ "gender_query_3 = Query(\n",
+ " [word_sets[\"male_terms\"], word_sets[\"female_terms\"]],\n",
+ " [word_sets[\"math\"], word_sets[\"arts_2\"]],\n",
+ " [\"Male terms\", \"Female terms\"],\n",
+ " [\"Math\", \"Arts\"],\n",
+ ")\n",
+ "\n",
+ "gender_queries = [gender_query_1, gender_query_2, gender_query_3]\n"
+ ]
+ },
+ {
+ "cell_type": "raw",
+ "metadata": {},
+ "source": [
+ "Run the queries on all Word Embeddings using WEAT\n",
+ "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n",
+ "\n",
+ "To run the list of queries and models, we call :func:`~wefe.utils.run_queries` using the\n",
+ "parameters defined in the previous step. The mandatory parameters of the\n",
+ "function are 3:\n",
+ "\n",
+ "- a metric,\n",
+ "- a list of queries, and,\n",
+ "- a list of embedding models.\n",
+ "\n",
+ "It is also possible to provide a name for the criterion studied in this\n",
+ "set of queries through the parameter ``queries_set_name``.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING:root:The transformation of 'Science' into glove twitter dim=25 embeddings lost proportionally more words than specified in 'lost_words_threshold': 0.25 lost with respect to 0.2 maximum loss allowed.\n",
+ "ERROR:root:At least one set of 'Male terms and Female terms wrt Science and Arts' query has proportionally fewer embeddings than allowed by the lost_vocabulary_threshold parameter (0.2). This query will return np.nan.\n",
+ "WARNING:root:The transformation of 'Science' into glove twitter dim=50 embeddings lost proportionally more words than specified in 'lost_words_threshold': 0.25 lost with respect to 0.2 maximum loss allowed.\n",
+ "ERROR:root:At least one set of 'Male terms and Female terms wrt Science and Arts' query has proportionally fewer embeddings than allowed by the lost_vocabulary_threshold parameter (0.2). This query will return np.nan.\n",
+ "WARNING:root:The transformation of 'Science' into glove twitter dim=100 embeddings lost proportionally more words than specified in 'lost_words_threshold': 0.25 lost with respect to 0.2 maximum loss allowed.\n",
+ "ERROR:root:At least one set of 'Male terms and Female terms wrt Science and Arts' query has proportionally fewer embeddings than allowed by the lost_vocabulary_threshold parameter (0.2). This query will return np.nan.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
query_name
\n",
+ "
Male terms and Female terms wrt Career and Family
\n",
+ "
Male terms and Female terms wrt Science and Arts
\n",
+ "
Male terms and Female terms wrt Math and Arts
\n",
+ "
\n",
+ "
\n",
+ "
model_name
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
glove twitter dim=25
\n",
+ "
0.316584
\n",
+ "
NaN
\n",
+ "
-0.022133
\n",
+ "
\n",
+ "
\n",
+ "
glove twitter dim=50
\n",
+ "
0.363743
\n",
+ "
NaN
\n",
+ "
-0.272334
\n",
+ "
\n",
+ "
\n",
+ "
glove twitter dim=100
\n",
+ "
0.385352
\n",
+ "
NaN
\n",
+ "
-0.082544
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "query_name Male terms and Female terms wrt Career and Family \\\n",
+ "model_name \n",
+ "glove twitter dim=25 0.316584 \n",
+ "glove twitter dim=50 0.363743 \n",
+ "glove twitter dim=100 0.385352 \n",
+ "\n",
+ "query_name Male terms and Female terms wrt Science and Arts \\\n",
+ "model_name \n",
+ "glove twitter dim=25 NaN \n",
+ "glove twitter dim=50 NaN \n",
+ "glove twitter dim=100 NaN \n",
+ "\n",
+ "query_name Male terms and Female terms wrt Math and Arts \n",
+ "model_name \n",
+ "glove twitter dim=25 -0.022133 \n",
+ "glove twitter dim=50 -0.272334 \n",
+ "glove twitter dim=100 -0.082544 "
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "WEAT_gender_results = run_queries(\n",
+ " WEAT, gender_queries, models, queries_set_name=\"Gender Queries\"\n",
+ ")\n",
+ "WEAT_gender_results\n"
+ ]
+ },
+ {
+ "cell_type": "raw",
+ "metadata": {},
+ "source": [
+ "Setting metric params\n",
+ "~~~~~~~~~~~~~~~~~~~~~\n",
+ "\n",
+ "There is a whole column that has no results. As the warnings point out,\n",
+ "when transforming the words of the sets into embeddings, there is a loss\n",
+ "of words that is greater than that allowed by the parameter\n",
+ "``lost_vocabulary_threshold``. In this case, it would be very useful to\n",
+ "use the word preprocessors seen above.\n",
+ "\n",
+ ":func:`~wefe.utils.run_queries` accepts specific parameters for each metric. These extra\n",
+ "parameters for the metric can be passed through ``metric_params``\n",
+ "parameter. In this case, a ``preprocessor`` is provided to lowercase the\n",
+ "words before searching for them in the models’ vocabularies.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
query_name
\n",
+ "
Male terms and Female terms wrt Career and Family
\n",
+ "
Male terms and Female terms wrt Science and Arts
\n",
+ "
Male terms and Female terms wrt Math and Arts
\n",
+ "
\n",
+ "
\n",
+ "
model_name
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
glove twitter dim=25
\n",
+ "
0.316584
\n",
+ "
0.167431
\n",
+ "
-0.033912
\n",
+ "
\n",
+ "
\n",
+ "
glove twitter dim=50
\n",
+ "
0.363743
\n",
+ "
-0.084690
\n",
+ "
-0.307589
\n",
+ "
\n",
+ "
\n",
+ "
glove twitter dim=100
\n",
+ "
0.385352
\n",
+ "
0.099632
\n",
+ "
-0.155790
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "query_name Male terms and Female terms wrt Career and Family \\\n",
+ "model_name \n",
+ "glove twitter dim=25 0.316584 \n",
+ "glove twitter dim=50 0.363743 \n",
+ "glove twitter dim=100 0.385352 \n",
+ "\n",
+ "query_name Male terms and Female terms wrt Science and Arts \\\n",
+ "model_name \n",
+ "glove twitter dim=25 0.167431 \n",
+ "glove twitter dim=50 -0.084690 \n",
+ "glove twitter dim=100 0.099632 \n",
+ "\n",
+ "query_name Male terms and Female terms wrt Math and Arts \n",
+ "model_name \n",
+ "glove twitter dim=25 -0.033912 \n",
+ "glove twitter dim=50 -0.307589 \n",
+ "glove twitter dim=100 -0.155790 "
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "WEAT_gender_results = run_queries(\n",
+ " WEAT,\n",
+ " gender_queries,\n",
+ " models,\n",
+ " metric_params={\"preprocessors\": [{\"lowercase\": True}]},\n",
+ " queries_set_name=\"Gender Queries\",\n",
+ ")\n",
+ "\n",
+ "WEAT_gender_results"
+ ]
+ },
+ {
+ "cell_type": "raw",
+ "metadata": {},
+ "source": [
+ "No query was null in these results.\n"
+ ]
+ },
+ {
+ "cell_type": "raw",
+ "metadata": {},
+ "source": [
+ "Plot the results in a barplot\n",
+ "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n",
+ "\n",
+ "The library also provides an easy way to plot the results obtained from\n",
+ "a ``run_queries`` execution into a `plotly <https://plotly.com/>`_ barplot."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": ""
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from wefe.utils import plot_queries_results, run_queries\n",
+ "\n",
+ "# Plot the results\n",
+ "plot_queries_results(WEAT_gender_results).show()\n"
+ ]
+ },
+ {
+ "cell_type": "raw",
+ "metadata": {},
+ "source": [
+ "Aggregating Results\n",
+ "-------------------\n",
+ "\n",
+ "The execution of :func:`~wefe.utils.run_queries` provided many results evaluating the\n",
+ "gender bias in the tested embeddings. However, these results alone do\n",
+ "not comprehensively report the biases observed in all of these queries.\n",
+ "One way to obtain an overall view of bias is by aggregating results by\n",
+ "model.\n",
+ "\n",
+ "For WEAT, a simple way to aggregate the results is to average their\n",
+ "absolute values. When running :func:`~wefe.utils.run_queries`, it is possible to specify\n",
+ "that the results be aggregated by model by setting ``aggregate_results``\n",
+ "as ``True``.\n",
+ "\n",
+ "The aggregation function can be specified through the\n",
+ "``aggregation_function`` parameter. This parameter accepts a list of\n",
+ "predefined aggregations as well as a custom function that operates on\n",
+ "the results dataframe. The aggregation functions available are:\n",
+ "\n",
+ "- Average ``avg``.\n",
+ "- Average of the absolute values ``abs_avg``.\n",
+ "- Sum ``sum``.\n",
+ "- Sum of the absolute values, ``abs_sum``.\n",
+ "\n",
+ ".. note::\n",
+ "\n",
+ " Notice that some functions are more appropriate for certain metrics. For\n",
+ " metrics returning only positive numbers, all the previous aggregation\n",
+ " functions would be OK. In contrast, for metrics that return real values\n",
+ " (e.g., :class:`~wefe.metrics.WEAT.WEAT`, :class:`~wefe.metrics.RND.RND`, etc.),\n",
+ " aggregation functions such as sum would make positive and negative outputs cancel\n",
+ " each other."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
WEAT: Ethnicity Queries average of abs values score
\n",
+ "
WEAT: Gender Queries average of abs values score
\n",
+ "
RNSB: Ethnicity Queries average of abs values score
\n",
+ "
\n",
+ "
\n",
+ "
model_name
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
glove twitter dim=25
\n",
+ "
3.0
\n",
+ "
1.0
\n",
+ "
3.0
\n",
+ "
\n",
+ "
\n",
+ "
glove twitter dim=50
\n",
+ "
2.0
\n",
+ "
2.0
\n",
+ "
2.0
\n",
+ "
\n",
+ "
\n",
+ "
glove twitter dim=100
\n",
+ "
1.0
\n",
+ "
3.0
\n",
+ "
1.0
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " WEAT: Ethnicity Queries average of abs values score \\\n",
+ "model_name \n",
+ "glove twitter dim=25 3.0 \n",
+ "glove twitter dim=50 2.0 \n",
+ "glove twitter dim=100 1.0 \n",
+ "\n",
+ " WEAT: Gender Queries average of abs values score \\\n",
+ "model_name \n",
+ "glove twitter dim=25 1.0 \n",
+ "glove twitter dim=50 2.0 \n",
+ "glove twitter dim=100 3.0 \n",
+ "\n",
+ " RNSB: Ethnicity Queries average of abs values score \n",
+ "model_name \n",
+ "glove twitter dim=25 3.0 \n",
+ "glove twitter dim=50 2.0 \n",
+ "glove twitter dim=100 1.0 "
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# create the ranking\n",
+ "ethnicity_ranking = create_ranking(\n",
+ " [WEAT_ethnicity_results, WEAT_EZ_gender_results, RNSB_ethnicity_results]\n",
+ ")\n",
+ "\n",
+ "ethnicity_ranking\n"
+ ]
+ },
+ {
+ "cell_type": "raw",
+ "metadata": {},
+ "source": [
+ "Plotting the rankings\n",
+ "~~~~~~~~~~~~~~~~~~~~~\n",
+ "\n",
+ "It is possible to graph the rankings in barplots using the\n",
+ ":func:`~wefe.utils.plot_ranking` function. The generated figure shows the accumulated\n",
+ "rankings for each embedding model. Each bar represents the sum of the\n",
+ "rankings obtained by each embedding. Each color within a bar represents\n",
+ "a different criterion-metric ranking."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": ""
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from wefe.utils import plot_ranking\n",
+ "\n",
+ "fig = plot_ranking(gender_ranking)\n",
+ "fig.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": ""
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "fig = plot_ranking(ethnicity_ranking)\n",
+ "fig.show()"
+ ]
+ },
+ {
+ "cell_type": "raw",
+ "metadata": {},
+ "source": [
+ "Correlating Rankings\n",
+ "~~~~~~~~~~~~~~~~~~~~\n",
+ "\n",
+ "Having obtained rankings by metric for each embedding, it would be\n",
+ "ideal to see and analyze the degree of agreement between them.\n",
+ "\n",
+ "A high concordance between the rankings allows us to state with some certainty that \n",
+ "all metrics evaluated the embedding models in a similar way and therefore, \n",
+ "that the ordering of embeddings by bias calculated makes sense.\n",
+ "On the other hand, a low degree of agreement shows the opposite: the rankings do not \n",
+ "allow us to clearly establish which embedding is less biased than another.\n",
+ "\n",
+ "The level of concordance of the rankings can be evaluated by calculating\n",
+ "correlations. WEFE provides :func:`~wefe.utils.calculate_ranking_correlations` to\n",
+ "calculate the correlations between rankings."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
WEAT: Gender Queries average of abs values score (1)
\n",
+ "
WEAT: Gender Queries average of abs values score (2)
\n",
+ "
RNSB: Gender Queries average of abs values score
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
WEAT: Gender Queries average of abs values score (1)
\n",
+ "
1.0
\n",
+ "
0.5
\n",
+ "
-1.0
\n",
+ "
\n",
+ "
\n",
+ "
WEAT: Gender Queries average of abs values score (2)
\n",
+ "
0.5
\n",
+ "
1.0
\n",
+ "
-0.5
\n",
+ "
\n",
+ "
\n",
+ "
RNSB: Gender Queries average of abs values score
\n",
+ "
-1.0
\n",
+ "
-0.5
\n",
+ "
1.0
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " WEAT: Gender Queries average of abs values score (1) \\\n",
+ "WEAT: Gender Queries average of abs values scor... 1.0 \n",
+ "WEAT: Gender Queries average of abs values scor... 0.5 \n",
+ "RNSB: Gender Queries average of abs values score -1.0 \n",
+ "\n",
+ " WEAT: Gender Queries average of abs values score (2) \\\n",
+ "WEAT: Gender Queries average of abs values scor... 0.5 \n",
+ "WEAT: Gender Queries average of abs values scor... 1.0 \n",
+ "RNSB: Gender Queries average of abs values score -0.5 \n",
+ "\n",
+ " RNSB: Gender Queries average of abs values score \n",
+ "WEAT: Gender Queries average of abs values scor... -1.0 \n",
+ "WEAT: Gender Queries average of abs values scor... -0.5 \n",
+ "RNSB: Gender Queries average of abs values score 1.0 "
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from wefe.utils import calculate_ranking_correlations, plot_ranking_correlations\n",
+ "\n",
+ "correlations = calculate_ranking_correlations(gender_ranking)\n",
+ "correlations\n"
+ ]
+ },
+ {
+ "cell_type": "raw",
+ "metadata": {},
+ "source": [
+ ".. note::\n",
+ "\n",
+ " ``calculate_ranking_correlations`` uses the ``corr()`` ``pandas``\n",
+ " dataframe method. The type of correlation that is calculated can be changed \n",
+ " through the method parameter. The available options are:\n",
+ " ``'pearson'``, ``'spearman'``, ``'kendall'``. By default, the spearman\n",
+ " correlation is calculated.\n",
+ "\n",
+ "In this example, Kendall’s correlation is used."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
WEAT: Gender Queries average of abs values score (1)
\n",
+ "
WEAT: Gender Queries average of abs values score (2)
\n",
+ "
RNSB: Gender Queries average of abs values score
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
WEAT: Gender Queries average of abs values score (1)
\n",
+ "
1.000000
\n",
+ "
0.333333
\n",
+ "
-1.000000
\n",
+ "
\n",
+ "
\n",
+ "
WEAT: Gender Queries average of abs values score (2)
\n",
+ "
0.333333
\n",
+ "
1.000000
\n",
+ "
-0.333333
\n",
+ "
\n",
+ "
\n",
+ "
RNSB: Gender Queries average of abs values score
\n",
+ "
-1.000000
\n",
+ "
-0.333333
\n",
+ "
1.000000
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " WEAT: Gender Queries average of abs values score (1) \\\n",
+ "WEAT: Gender Queries average of abs values scor... 1.000000 \n",
+ "WEAT: Gender Queries average of abs values scor... 0.333333 \n",
+ "RNSB: Gender Queries average of abs values score -1.000000 \n",
+ "\n",
+ " WEAT: Gender Queries average of abs values score (2) \\\n",
+ "WEAT: Gender Queries average of abs values scor... 0.333333 \n",
+ "WEAT: Gender Queries average of abs values scor... 1.000000 \n",
+ "RNSB: Gender Queries average of abs values score -0.333333 \n",
+ "\n",
+ " RNSB: Gender Queries average of abs values score \n",
+ "WEAT: Gender Queries average of abs values scor... -1.000000 \n",
+ "WEAT: Gender Queries average of abs values scor... -0.333333 \n",
+ "RNSB: Gender Queries average of abs values score 1.000000 "
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "calculate_ranking_correlations(gender_ranking, method=\"kendall\")\n"
+ ]
+ },
+ {
+ "cell_type": "raw",
+ "metadata": {},
+ "source": [
+ "WEFE also provides a function for graphing the correlations:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": ""
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "correlation_fig = plot_ranking_correlations(correlations)\n",
+ "correlation_fig.show()\n"
+ ]
+ },
+ {
+ "cell_type": "raw",
+ "metadata": {},
+ "source": [
+ "In this case, only two of the three rankings show similar results.\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.13"
+ },
+ "vscode": {
+ "interpreter": {
+ "hash": "37d01894bb315c73bf6fde5551d8a97078996f38b23395695bd1998fb0ae5507"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
\ No newline at end of file
diff --git a/doc/user_guide/mitigation.ipynb b/examples/mitigation_user_guide.ipynb
similarity index 91%
rename from doc/user_guide/mitigation.ipynb
rename to examples/mitigation_user_guide.ipynb
index 60a6d02..c93bbc3 100644
--- a/doc/user_guide/mitigation.ipynb
+++ b/examples/mitigation_user_guide.ipynb
@@ -1,49 +1,46 @@
{
"cells": [
{
- "cell_type": "markdown",
+ "cell_type": "raw",
"metadata": {},
"source": [
- "(bias mitigation)=\n",
+ ".. _bias mitigation:\n",
"\n",
- "# Bias Mitigation (Debias)\n",
+ "Bias Mitigation (Debias)\n",
+ "========================\n",
"\n",
"The following guide is designed to present the more general details on \n",
"using the package to mitigate (debias) bias in word embedding models. \n",
"The following sections show:\n",
"\n",
- "- run {class}`~wefe.debias.hard_debias.HardDebias` mitigation method on an\n",
+ "- run :class:`~wefe.debias.hard_debias.HardDebias` mitigation method on an\n",
" embedding model to mitigate gender bias (using the ``fit-transform`` interface).\n",
"- apply the ``target`` parameter when executing the transformation.\n",
"- apply the ``ignore`` parameter when executing the transformation.\n",
"- apply the ``copy`` parameter when executing the transformation.\n",
- "- run {class}`~wefe.debias.multiclass_hard_debias.MulticlassHardDebias` mitigation \n",
+ "- run :class:`~wefe.debias.multiclass_hard_debias.MulticlassHardDebias` mitigation \n",
" method on an word embedding model to mitigate ethnic bias.\n",
"\n",
- ":::{note}\n",
+ ".. note::\n",
"\n",
- "For a list of metrics implemented in WEFE, refer to the\n",
- "[debias section](debias-API) of the API reference. \n",
+ " For a list of debias methods implemented in WEFE, refer to the\n",
+ " :ref:`debias section <debias-API>` of the API reference. \n",
"\n",
- ":::\n",
"\n",
+ "Hard Debias\n",
+ "-----------\n",
"\n",
- "## Hard Debias\n",
"\n",
"Hard debias is a method that allows mitigating biases through geometric operations on embeddings. \n",
"This method is binary because it only allows 2 classes of the same bias criterion,\n",
"such as male or female.\n",
"\n",
- ":::{note}\n",
- "\n",
- "For a multiclass debias (such as for Latinos, Asians and Whites), it is\n",
- "recommended to visit\n",
- "{class}`~wefe.debias.multiclass_hard_debias.MulticlassHardDebias` class.\n",
- "\n",
- ":::\n",
- "\n",
- "\n",
+ ".. note::\n",
"\n",
+ " For a multiclass debias (such as for Latinos, Asians and Whites), it is\n",
+ " recommended to visit\n",
+ " :class:`~wefe.debias.multiclass_hard_debias.MulticlassHardDebias` class.\n",
+ " \n",
"\n",
"The main idea of this method is:\n",
"\n",
@@ -51,6 +48,7 @@
"these could be e.g. ``[['woman', 'man'], ['she', 'he'], ...]``\n",
"\n",
"2. Neutralize the bias subspace of embeddings that should not be biased.\n",
+ "\n",
"First, it is defined a set of words that are correct to be related to the bias\n",
"criterion: the *criterion specific gender words*.\n",
"For example, in the case of gender, *gender specific* words are:\n",
@@ -64,25 +62,29 @@
"\n",
"The neutralization is carried out under the following operation:\n",
"\n",
- "- $u$ : embedding\n",
- "- $v$ : bias direction\n",
+ "- :math:`u` : embedding\n",
+ "- :math:`v`: bias direction\n",
"\n",
"First calculate the projection of the embedding on the bias subspace.\n",
"\n",
+ ".. math::\n",
"\n",
- "$$\\text{bias_subspace} = \\frac{v \\cdot (v \\cdot u)}{(v \\cdot v)}$$\n",
+ " \\text{bias subspace} = \\frac{v \\cdot (v \\cdot u)}{(v \\cdot v)}\n",
"\n",
"Then subtract the projection from the embedding.\n",
"\n",
- "$$u' = u - \\text{bias_subspace}$$\n",
+ ".. math::\n",
+ "\n",
+ " u' = u - \\text{bias subspace}\n",
+ "\n",
+ "3. Equalize the embeddings with respect to the bias direction.\n",
"\n",
- " 3. Equalizate the embeddings with respect to the bias direction.\n",
- " Given an equalization set (set of word pairs such as ``['she', 'he'],\n",
- " ['men', 'women'], ...``, but not limited to the definitional set) this step\n",
- " executes, for each pair, an equalization with respect to the bias direction.\n",
- " That is, it takes both embeddings of the pair and distributes them at the same\n",
- " distance from the bias direction, so that neither is closer to the bias direction\n",
- " than the other.\n",
+ "Given an equalization set (set of word pairs such as ``['she', 'he'],\n",
+ "['men', 'women'], ...``, but not limited to the definitional set) this step\n",
+ "executes, for each pair, an equalization with respect to the bias direction.\n",
+ "That is, it takes both embeddings of the pair and distributes them at the same\n",
+ "distance from the bias direction, so that neither is closer to the bias direction\n",
+ "than the other.\n",
"\n",
"\n",
"The fit parameters define how the neutralization will be calculated. In\n",
@@ -119,7 +121,7 @@
]
},
{
- "cell_type": "markdown",
+ "cell_type": "raw",
"metadata": {},
"source": [
"Load the required word sets."
@@ -163,7 +165,7 @@
]
},
{
- "cell_type": "markdown",
+ "cell_type": "raw",
"metadata": {},
"source": [
"Instantiate and fit the parameters of the debias transformation.\n",
@@ -196,10 +198,11 @@
]
},
{
- "cell_type": "markdown",
+ "cell_type": "raw",
"metadata": {},
"source": [
- "### Mitigation Parameters\n",
+ "Mitigation Parameters\n",
+ "~~~~~~~~~~~~~~~~~~~~~\n",
"\n",
"The parameters of the transform method are relatively standard for all\n",
"methods. The most important ones are ``target``, ``ignore`` and\n",
@@ -224,7 +227,10 @@
" the model and run debias on the copy. If ``False``, the method is\n",
" applied on the original model, causing the vectors to mutate.\n",
"\n",
- " **WARNING:** Setting copy with ``True`` requires at least 2x RAM of\n",
+ "\n",
+ ".. warning::\n",
+ "\n",
+ " Setting copy with ``True`` requires at least 2x RAM of\n",
" the size of the model. Otherwise the execution of the debias may raise\n",
" ``MemoryError``.\n",
"\n",
@@ -258,13 +264,14 @@
]
},
{
- "cell_type": "markdown",
+ "cell_type": "raw",
"metadata": {},
"source": [
- "### Measuring the Decrease of Bias\n",
+ "Measuring the Decrease of Bias\n",
+ "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n",
"\n",
- "Using the metrics displayed in the {ref}`bias measurement` user guide, we\n",
- "can measure whether or not there was a change in the measured gender bias\n",
+ "Using the metrics and queries shown in the :ref:`bias measurement` user guide, we\n",
+ "can measure whether there was a change in the measured gender bias\n",
"between the original model and the debiased model."
]
},
@@ -283,7 +290,7 @@
]
},
{
- "cell_type": "markdown",
+ "cell_type": "raw",
"metadata": {},
"source": [
"Next, we measure the gender bias exposed by query 1 (Male terms and Female terms wrt Career and Family) with respect to the debiased model and the original."
@@ -332,7 +339,7 @@
]
},
{
- "cell_type": "markdown",
+ "cell_type": "raw",
"metadata": {},
"source": [
"The above results show that there was a decrease in the measured gender bias.\n",
@@ -389,17 +396,18 @@
]
},
{
- "cell_type": "markdown",
+ "cell_type": "raw",
"metadata": {},
"source": [
"Again, the above results show that there was a decrease in the measured gender bias."
]
},
{
- "cell_type": "markdown",
+ "cell_type": "raw",
"metadata": {},
"source": [
- "### Target Parameter\n",
+ "Target Parameter\n",
+ "~~~~~~~~~~~~~~~~\n",
"\n",
"If a set of words is specified in ``target`` parameter, the debias method is performed\n",
"only on the embeddings associated with this set. \n",
@@ -459,7 +467,7 @@
]
},
{
- "cell_type": "markdown",
+ "cell_type": "raw",
"metadata": {},
"source": [
"Next, a bias test is run on the mitigated embeddings associated with the\n",
@@ -513,7 +521,7 @@
]
},
{
- "cell_type": "markdown",
+ "cell_type": "raw",
"metadata": {},
"source": [
"However, if a bias test is run with words that were outside the ``target``\n",
@@ -521,13 +529,12 @@
"metric scores lies in the fact that the equalize sets were still\n",
"equalized.\n",
"\n",
- ":::{warning}\n",
- "\n",
- "The equalization process can modify embeddings that have not been marked in the target.\n",
+ ".. warning::\n",
"\n",
- "Equalization can be deactivated by delivering an empty equalize set (``[]``)\n",
- "\n",
- ":::"
+ " The equalization process can modify embeddings that have not been marked in the target.\n",
+ " In Hard Debias, equalization can be deactivated by delivering an empty \n",
+ " equalize set (``[]``).\n",
+ "\n"
]
},
{
@@ -578,18 +585,19 @@
]
},
{
- "cell_type": "markdown",
+ "cell_type": "raw",
"metadata": {},
"source": [
"Note that the equalization caused the bias of the debiased model to be slightly larger than the original."
]
},
{
- "cell_type": "markdown",
+ "cell_type": "raw",
"metadata": {},
"source": [
"\n",
- "### Saving the Debiased Model\n",
+ "Saving the Debiased Model\n",
+ "~~~~~~~~~~~~~~~~~~~~~~~~~\n",
"\n",
"To save the mitigated model one must access the ``KeyedVectors`` (the\n",
"gensim object that contains the embeddings) through ``wv`` and then use\n",
@@ -608,10 +616,11 @@
]
},
{
- "cell_type": "markdown",
+ "cell_type": "raw",
"metadata": {},
"source": [
- "## Multiclass Hard Debias\n",
+ "Multiclass Hard Debias\n",
+ "----------------------\n",
"\n",
"Multiclass Hard Debias is a generalized version of Hard Debias that\n",
"enables multiclass debiasing. Generalized refers to the fact that this\n",
@@ -680,10 +689,13 @@
]
},
{
- "cell_type": "markdown",
+ "cell_type": "raw",
"metadata": {},
"source": [
- "### Measuring the Decrease of Bias\n"
+ "Measuring the Decrease of Bias\n",
+ "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n",
+ "\n",
+ "The following code compares the execution of a query measuring ethnic bias in the original model vs. in the debiased model."
]
},
{
@@ -738,6 +750,63 @@
" round(abs(biased_results_2[\"weat\"]), 3),\n",
")\n"
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# import the modules\n",
+ "from wefe.query import Query\n",
+ "from wefe.word_embedding_model import WordEmbeddingModel\n",
+ "from wefe.metrics.WEAT import WEAT\n",
+ "import gensim.downloader as api\n",
+ "\n",
+ "# load glove\n",
+ "twitter_25 = api.load('glove-twitter-25')\n",
+ "model = WordEmbeddingModel(twitter_25, 'glove twitter dim=25')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create the word sets\n",
+ "target_sets = [['she', 'woman', 'girl'], ['he', 'man', 'boy']]\n",
+ "target_sets_names = ['Female Terms', 'Male Terms']\n",
+ "\n",
+ "attribute_sets = [['poetry','dance','literature'], ['math', 'physics', 'chemistry']]\n",
+ "attribute_sets_names = ['Arts', 'Science']\n",
+ "\n",
+ "# create the query\n",
+ "query = Query(target_sets, attribute_sets, target_sets_names,\n",
+ " attribute_sets_names)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'query_name': 'Female Terms and Male Terms wrt Arts and Science', 'result': 0.25956966479619337, 'weat': 0.25956966479619337, 'effect_size': 1.4524819039631838, 'p_value': nan}\n"
+ ]
+ }
+ ],
+ "source": [
+ "# instance a WEAT metric\n",
+ "weat = WEAT()\n",
+ "result = weat.run_query(query, model)\n",
+ "print(result)\n",
+ "\n",
+ "\n"
+ ]
}
],
"metadata": {
@@ -756,7 +825,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.13"
+ "version": "3.7.13"
},
"vscode": {
"interpreter": {
@@ -765,5 +834,5 @@
}
},
"nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
}
diff --git a/requirements-dev.txt b/requirements-dev.txt
index df9d23d..72cb71c 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,11 +1,13 @@
-pytest
+pytest>=7.0.0
pytest-cov
coverage
-flake8
-black
-mypy
-sphinx
-sphinx-gallery
-sphinx-rtd-theme
-numpydoc
-torch
\ No newline at end of file
+# flake8==5.0.4
+black==22.6.0
+isort==5.10.1
+mypy==0.812
+Sphinx==5.0.2
+sphinx-gallery==0.11.1
+sphinx-rtd-theme==1.0.0
+numpydoc==1.4.0
+docutils==0.16
+torch==1.12.1
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 40dd8ea..20a4782 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,10 +1,10 @@
-numpy
-scipy
+numpy>=1.20
+scipy>=1.3.2
scikit-learn>=0.23.2
-pandas
+pandas>=0.25.1
gensim>=3.8.3
-plotly
-six
-requests
-semantic_version
-tqdm
\ No newline at end of file
+plotly>=4.2.0
+requests>=2.22.0
+tqdm>=4.30.0
+semantic_version>=2.8.0
+six
\ No newline at end of file
diff --git a/setup.py b/setup.py
index e409e8d..c066e36 100644
--- a/setup.py
+++ b/setup.py
@@ -52,8 +52,7 @@
"Programming Language :: Python :: 3.10",
]
EXTRAS_REQUIRE = {
- "tests": ["pytest", "pytest-cov"],
- "docs": ["sphinx", "sphinx-gallery", "sphinx_rtd_theme", "numpydoc", "matplotlib"],
+ "pytorch": ["torch"],
}
setup(
diff --git a/test-results/junit.xml b/test-results/junit.xml
deleted file mode 100644
index 7bc6698..0000000
--- a/test-results/junit.xml
+++ /dev/null
@@ -1 +0,0 @@
-
\ No newline at end of file
diff --git a/tests/debias/__init__.py b/tests/debias/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/conftest.py b/tests/debias/conftest.py
similarity index 61%
rename from tests/conftest.py
rename to tests/debias/conftest.py
index d1aa95e..35f4344 100644
--- a/tests/conftest.py
+++ b/tests/debias/conftest.py
@@ -2,9 +2,7 @@
from typing import Dict, List, Union
import numpy as np
-import pkg_resources
import pytest
-from gensim.models import KeyedVectors
from wefe.datasets.datasets import fetch_debias_multiclass, fetch_debiaswe, load_weat
from wefe.query import Query
from wefe.utils import load_test_model
@@ -15,13 +13,6 @@
# -------------------------------------------------------------------------------------
-@pytest.fixture
-def keyed_vector_model() -> KeyedVectors:
-
- test_model = KeyedVectors.load("./wefe/datasets/data/test_model.kv")
- return test_model
-
-
@pytest.fixture
def model() -> WordEmbeddingModel:
"""Load a subset of Word2vec as a testing model.
@@ -99,149 +90,6 @@ def mhd_ethnicity_equalize_sets(multiclass_debias_wordsets) -> List[List[str]]:
return list(multiclass_debias_wordsets["ethnicity_analogy_templates"].values())
-# -------------------------------------------------------------------------------------
-# Queries
-# -------------------------------------------------------------------------------------
-
-
-@pytest.fixture
-def query_2t1a_1(weat_wordsets: Dict[str, List[str]]) -> Query:
- weat_wordsets = load_weat()
-
- query = Query(
- [weat_wordsets["flowers"], weat_wordsets["insects"]],
- [weat_wordsets["pleasant_5"]],
- ["Flowers", "Insects"],
- ["Pleasant"],
- )
- return query
-
-
-@pytest.fixture
-def query_2t2a_1(weat_wordsets: Dict[str, List[str]]) -> Query:
- """Generate a Flower and Insects wrt Pleasant vs Unpleasant test query.
-
- Parameters
- ----------
- weat_wordsets : Dict[str, List[str]]
- The word sets used in WEAT original work.
-
- Returns
- -------
- Query
- The generated query.
- """
- query = Query(
- [weat_wordsets["flowers"], weat_wordsets["insects"]],
- [weat_wordsets["pleasant_5"], weat_wordsets["unpleasant_5"]],
- ["Flowers", "Insects"],
- ["Pleasant", "Unpleasant"],
- )
- return query
-
-
-@pytest.fixture
-def query_2t2a_uppercase(weat_wordsets: Dict[str, List[str]]) -> Query:
- """Generate a Flower and Insects wrt Pleasant vs Unpleasant test query.
-
- Parameters
- ----------
- weat_wordsets : Dict[str, List[str]]
- The word sets used in WEAT original work.
-
- Returns
- -------
- Query
- The generated query.
- """
- query = Query(
- [
- [s.upper() for s in weat_wordsets["flowers"]],
- [s.upper() for s in weat_wordsets["insects"]],
- ],
- [
- [s.upper() for s in weat_wordsets["pleasant_5"]],
- [s.upper() for s in weat_wordsets["unpleasant_5"]],
- ],
- ["Flowers", "Insects"],
- ["Pleasant", "Unpleasant"],
- )
- return query
-
-
-@pytest.fixture
-def query_3t2a_1(weat_wordsets: Dict[str, List[str]]) -> Query:
- query = Query(
- [
- weat_wordsets["flowers"],
- weat_wordsets["insects"],
- weat_wordsets["instruments"],
- ],
- [weat_wordsets["pleasant_5"], weat_wordsets["unpleasant_5"]],
- ["Flowers", "Weapons", "Instruments"],
- ["Pleasant", "Unpleasant"],
- )
-
- return query
-
-
-@pytest.fixture
-def query_4t2a_1(weat_wordsets: Dict[str, List[str]]) -> Query:
- query = Query(
- [
- weat_wordsets["flowers"],
- weat_wordsets["insects"],
- weat_wordsets["instruments"],
- weat_wordsets["weapons"],
- ],
- [weat_wordsets["pleasant_5"], weat_wordsets["unpleasant_5"]],
- ["Flowers", "Insects", "Instruments", "Weapons"],
- ["Pleasant", "Unpleasant"],
- )
-
- return query
-
-
-@pytest.fixture
-def query_1t4_1(weat_wordsets: Dict[str, List[str]]) -> Query:
- query = Query(
- [weat_wordsets["flowers"]],
- [
- weat_wordsets["pleasant_5"],
- weat_wordsets["pleasant_9"],
- weat_wordsets["unpleasant_5"],
- weat_wordsets["unpleasant_9"],
- ],
- ["Flowers"],
- ["Pleasant 5 ", "Pleasant 9", "Unpleasant 5", "Unpleasant 9"],
- )
- return query
-
-
-@pytest.fixture
-def query_2t1a_lost_vocab_1(weat_wordsets: Dict[str, List[str]]) -> Query:
- query = Query(
- [["bla", "asd"], weat_wordsets["insects"]],
- [weat_wordsets["pleasant_5"]],
- ["Flowers", "Insects"],
- ["Pleasant"],
- )
-
- return query
-
-
-@pytest.fixture
-def query_2t2a_lost_vocab_1(weat_wordsets: Dict[str, List[str]]) -> Query:
- query = Query(
- [["bla", "asd"], weat_wordsets["insects"]],
- [weat_wordsets["pleasant_5"], weat_wordsets["unpleasant_5"]],
- ["Flowers", "Insects"],
- ["Pleasant", "Unpleasant"],
- )
-
- return query
-
-
# --------------------------------------------------------------------------------------
# 2 target 2 attribute gender, ethnicity and religion queries
# --------------------------------------------------------------------------------------
diff --git a/tests/debias/test_base_debias.py b/tests/debias/test_base_debias.py
index 0835aea..653685c 100644
--- a/tests/debias/test_base_debias.py
+++ b/tests/debias/test_base_debias.py
@@ -25,30 +25,36 @@ def test_check_transform_args_wrong_inputs(
# type checking function
with pytest.raises(
- TypeError, match=r"model should be a WordEmbeddingModel instance, got .*",
+ TypeError,
+ match=r"model should be a WordEmbeddingModel instance, got .*",
):
bd._check_transform_args(None)
with pytest.raises(
- TypeError, match=r"target should be None or a list of strings, got .*",
+ TypeError,
+ match=r"target should be None or a list of strings, got .*",
):
bd._check_transform_args(model, target=1)
with pytest.raises(
- TypeError, match=r"All elements in target should be strings, .*",
+ TypeError,
+ match=r"All elements in target should be strings, .*",
):
bd._check_transform_args(model, target=gender_specific + [10])
with pytest.raises(
- TypeError, match=r"ignore should be None or a list of strings, got .*",
+ TypeError,
+ match=r"ignore should be None or a list of strings, got .*",
):
bd._check_transform_args(model, ignore=1)
with pytest.raises(
- TypeError, match=r"All elements in ignore should be strings, .*",
+ TypeError,
+ match=r"All elements in ignore should be strings, .*",
):
bd._check_transform_args(model, ignore=gender_specific + [10])
with pytest.raises(
- TypeError, match=r"copy should be a bool, got .*",
+ TypeError,
+ match=r"copy should be a bool, got .*",
):
bd._check_transform_args(model, copy=None)
diff --git a/tests/debias/test_double_hard_debias.py b/tests/debias/test_double_hard_debias.py
index 40802f5..ce76ff6 100644
--- a/tests/debias/test_double_hard_debias.py
+++ b/tests/debias/test_double_hard_debias.py
@@ -14,21 +14,25 @@ def test_double_hard_debias_checks(
):
with pytest.raises(
- TypeError, match=r"verbose should be a bool, got .*",
+ TypeError,
+ match=r"verbose should be a bool, got .*",
):
DoubleHardDebias(verbose=1)
with pytest.raises(
- TypeError, match=r"n_words should be int, got: .*",
+ TypeError,
+ match=r"n_words should be int, got: .*",
):
DoubleHardDebias(n_words=2.3)
with pytest.raises(
- TypeError, match=r"n_components should be int, got: .*",
+ TypeError,
+ match=r"n_components should be int, got: .*",
):
DoubleHardDebias(n_components=2.3)
with pytest.raises(
- TypeError, match=r"incremental_pca should be a bool, got .*",
+ TypeError,
+ match=r"incremental_pca should be a bool, got .*",
):
DoubleHardDebias(incremental_pca=1)
@@ -56,7 +60,8 @@ def test_double_hard_debias_checks(
model, definitional_pairs + [["word1"]], bias_representation=["he", "she"]
)
with pytest.raises(
- Exception, match=r"bias_representation words not in model",
+ Exception,
+ match=r"bias_representation words not in model",
):
DoubleHardDebias().fit(
model,
@@ -75,7 +80,9 @@ def test_double_hard_debias(
weat = WEAT()
- dhd = DoubleHardDebias(criterion_name="gender",)
+ dhd = DoubleHardDebias(
+ criterion_name="gender",
+ )
dhd.fit(
model, definitional_pairs=definitional_pairs, bias_representation=["he", "she"]
)
@@ -108,7 +115,10 @@ def test_double_hard_debias_target_param(
):
weat = WEAT()
- dhd = DoubleHardDebias(verbose=True, criterion_name="gender",)
+ dhd = DoubleHardDebias(
+ verbose=True,
+ criterion_name="gender",
+ )
attribute_words = weat_wordsets["career"] + weat_wordsets["family"]
attribute_words.remove("family")
@@ -152,7 +162,10 @@ def test_multiclass_hard_debias_ignore_param(
weat_wordsets: Dict[str, List[str]],
):
weat = WEAT()
- dhd = DoubleHardDebias(verbose=True, criterion_name="gender",)
+ dhd = DoubleHardDebias(
+ verbose=True,
+ criterion_name="gender",
+ )
targets = weat_wordsets["male_names"] + weat_wordsets["female_names"]
attributes = weat_wordsets["pleasant_5"] + weat_wordsets["unpleasant_5"]
@@ -193,7 +206,9 @@ def test_double_hard_debias_copy_param(
biased_results_q2 = weat.run_query(gender_query_2, model, normalize=True)
# Test inplace (copy = False)
- dhd = DoubleHardDebias(criterion_name="gender",)
+ dhd = DoubleHardDebias(
+ criterion_name="gender",
+ )
dhd.fit(
model, definitional_pairs=definitional_pairs, bias_representation=["he", "she"]
)
@@ -242,7 +257,9 @@ def test_multiclass_hard_debias_verbose(
assert "Copy argument is True. Transform will attempt to create a copy" in out
assert "Executing debias" in out
- dhd = DoubleHardDebias(criterion_name="gender",)
+ dhd = DoubleHardDebias(
+ criterion_name="gender",
+ )
dhd.fit(
model, definitional_pairs=definitional_pairs, bias_representation=["he", "she"]
)
@@ -251,4 +268,3 @@ def test_multiclass_hard_debias_verbose(
assert model == gender_debiased_w2v
assert model.wv == gender_debiased_w2v.wv
assert model.name == gender_debiased_w2v.name
-
diff --git a/tests/debias/test_half_sibling_regression.py b/tests/debias/test_half_sibling_regression.py
index 812dc14..5498cbd 100644
--- a/tests/debias/test_half_sibling_regression.py
+++ b/tests/debias/test_half_sibling_regression.py
@@ -11,7 +11,8 @@
def test_half_sibling_checks(model):
with pytest.raises(
- TypeError, match=r"verbose should be a bool, got .*",
+ TypeError,
+ match=r"verbose should be a bool, got .*",
):
HalfSiblingRegression(verbose=1)
@@ -24,7 +25,9 @@ def test_half_sibling_regression_class(
):
weat = WEAT()
- hsr = HalfSiblingRegression(criterion_name="gender",)
+ hsr = HalfSiblingRegression(
+ criterion_name="gender",
+ )
hsr.fit(model, definitional_words=gender_specific)
gender_debiased_w2v = hsr.transform(model, copy=True)
@@ -55,7 +58,9 @@ def test_half_sibling_regression_target_param(
):
weat = WEAT()
- hsr = HalfSiblingRegression(criterion_name="gender",)
+ hsr = HalfSiblingRegression(
+ criterion_name="gender",
+ )
attribute_words = weat_wordsets["career"] + weat_wordsets["family"]
attribute_words.remove("family")
@@ -97,7 +102,10 @@ def test_half_sibling_regression_ignore_param(
):
weat = WEAT()
- hsr = HalfSiblingRegression(verbose=True, criterion_name="gender",)
+ hsr = HalfSiblingRegression(
+ verbose=True,
+ criterion_name="gender",
+ )
targets = weat_wordsets["male_names"] + weat_wordsets["female_names"]
attributes = weat_wordsets["pleasant_5"] + weat_wordsets["unpleasant_5"]
@@ -141,7 +149,10 @@ def test_double_hard_debias_copy_param(
biased_results_q2 = weat.run_query(gender_query_2, model, normalize=True)
# Test inplace (copy = False)
- hsr = HalfSiblingRegression(verbose=True, criterion_name="gender",)
+ hsr = HalfSiblingRegression(
+ verbose=True,
+ criterion_name="gender",
+ )
hsr.fit(model, definitional_words=gender_specific)
gender_debiased_w2v = hsr.transform(model, ignore=gender_specific, copy=False)
@@ -163,7 +174,9 @@ def test_double_hard_debias_copy_param(
def test_verbose(
- model: WordEmbeddingModel, gender_specific: List[str], capsys,
+ model: WordEmbeddingModel,
+ gender_specific: List[str],
+ capsys,
):
# -----------------------------------------------------------------
@@ -187,7 +200,9 @@ def test_verbose(
# -----------------------------------------------------------------
# Test inplace (copy = False)
- hsr = HalfSiblingRegression(criterion_name="gender",)
+ hsr = HalfSiblingRegression(
+ criterion_name="gender",
+ )
hsr.fit(model, definitional_words=gender_specific)
gender_debiased_w2v = hsr.transform(model, copy=False)
diff --git a/tests/debias/test_hard_debias.py b/tests/debias/test_hard_debias.py
index dad52ff..86884c9 100644
--- a/tests/debias/test_hard_debias.py
+++ b/tests/debias/test_hard_debias.py
@@ -10,11 +10,13 @@
def test_hard_debias_param_checks(
- model: WordEmbeddingModel, definitional_pairs: List[List[str]],
+ model: WordEmbeddingModel,
+ definitional_pairs: List[List[str]],
):
with pytest.raises(
- TypeError, match=r"verbose should be a bool, got .*",
+ TypeError,
+ match=r"verbose should be a bool, got .*",
):
HardDebias(verbose=1)
@@ -26,7 +28,8 @@ def test_hard_debias_param_checks(
),
):
HardDebias().fit(
- model, definitional_pairs + [["word1", "word2", "word3"]],
+ model,
+ definitional_pairs + [["word1", "word2", "word3"]],
)
with pytest.raises(
@@ -37,7 +40,8 @@ def test_hard_debias_param_checks(
),
):
HardDebias().fit(
- model, definitional_pairs + [["word1"]],
+ model,
+ definitional_pairs + [["word1"]],
)
@@ -56,7 +60,9 @@ def test_hard_debias_class(
# Gender Debias
hd = HardDebias(criterion_name="gender")
hd.fit(
- model, definitional_pairs=definitional_pairs, equalize_pairs=equalize_pairs,
+ model,
+ definitional_pairs=definitional_pairs,
+ equalize_pairs=equalize_pairs,
)
gender_debiased_w2v = hd.transform(model, ignore=gender_specific, copy=True)
@@ -98,7 +104,9 @@ def test_hard_debias_target_param(
attribute_words.remove("executive")
gender_debiased_w2v = hd.fit(
- model, definitional_pairs=definitional_pairs, equalize_pairs=equalize_pairs,
+ model,
+ definitional_pairs=definitional_pairs,
+ equalize_pairs=equalize_pairs,
).transform(model, target=attribute_words, copy=True)
# test gender query 1, debias was only applied to the target words
@@ -146,7 +154,9 @@ def test_hard_debias_ignore_param(
ignore = targets + attributes
gender_debiased_w2v = hd.fit(
- model, definitional_pairs, equalize_pairs=equalize_pairs,
+ model,
+ definitional_pairs,
+ equalize_pairs=equalize_pairs,
).transform(model, ignore=ignore, copy=True)
# test gender query 1,none of their words were debiased.
@@ -180,7 +190,9 @@ def test_hard_debias_verbose_param(
# Test verbose
hd = HardDebias(verbose=True)
gender_debiased_w2v = hd.fit(
- model, definitional_pairs, equalize_pairs=equalize_pairs,
+ model,
+ definitional_pairs,
+ equalize_pairs=equalize_pairs,
).transform(model, ignore=gender_specific, copy=True)
out = capsys.readouterr().out
@@ -226,9 +238,13 @@ def test_hard_debias_copy_param(
biased_results_q1 = weat.run_query(gender_query_1, model, normalize=True)
biased_results_q2 = weat.run_query(gender_query_2, model, normalize=True)
- hd = HardDebias(criterion_name="gender",)
+ hd = HardDebias(
+ criterion_name="gender",
+ )
hd.fit(
- model, definitional_pairs=definitional_pairs, equalize_pairs=equalize_pairs,
+ model,
+ definitional_pairs=definitional_pairs,
+ equalize_pairs=equalize_pairs,
)
gender_debiased_w2v = hd.transform(model, ignore=gender_specific, copy=False)
diff --git a/tests/debias/test_multiclass_hard_debias.py b/tests/debias/test_multiclass_hard_debias.py
index 624f585..ebf2c4b 100644
--- a/tests/debias/test_multiclass_hard_debias.py
+++ b/tests/debias/test_multiclass_hard_debias.py
@@ -10,7 +10,8 @@
def test_multiclass_hard_debias_param_checks(
- model: WordEmbeddingModel, definitional_pairs: List[List[str]],
+ model: WordEmbeddingModel,
+ definitional_pairs: List[List[str]],
):
with pytest.raises(
@@ -135,7 +136,11 @@ def test_multiclass_hard_debias_target_param(
model,
definitional_sets=mhd_gender_definitional_sets,
equalize_sets=mhd_gender_equalize_sets,
- ).transform(model, target=attribute_words, copy=True,)
+ ).transform(
+ model,
+ target=attribute_words,
+ copy=True,
+ )
assert model.name == "test_w2v"
assert gender_debiased_w2v.name == "test_w2v_debiased"
diff --git a/tests/debias/test_repulsion_attraction_neutralization.py b/tests/debias/test_repulsion_attraction_neutralization.py
index 2cddf09..dc5f084 100644
--- a/tests/debias/test_repulsion_attraction_neutralization.py
+++ b/tests/debias/test_repulsion_attraction_neutralization.py
@@ -24,7 +24,9 @@ def test_hard_debias_target_param(
attribute_words.remove("family")
attribute_words.remove("executive")
- ran = RepulsionAttractionNeutralization(criterion_name="gender",)
+ ran = RepulsionAttractionNeutralization(
+ criterion_name="gender",
+ )
ran.fit(model, definitional_pairs=definitional_pairs)
gender_debiased_w2v = ran.transform(model, target=attribute_words, copy=True)
diff --git a/tests/metrics/__init__.py b/tests/metrics/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/metrics/conftest.py b/tests/metrics/conftest.py
new file mode 100644
index 0000000..68add16
--- /dev/null
+++ b/tests/metrics/conftest.py
@@ -0,0 +1,149 @@
+"""Test configurations and fixtures."""
+from typing import Dict, List
+
+import pytest
+from wefe.datasets.datasets import load_weat
+from wefe.query import Query
+from wefe.utils import load_test_model
+from wefe.word_embedding_model import WordEmbeddingModel
+
+# -------------------------------------------------------------------------------------
+# Models
+# -------------------------------------------------------------------------------------
+
+
+@pytest.fixture
+def model() -> WordEmbeddingModel:
+ """Load a subset of Word2vec as a testing model.
+
+ Returns
+ -------
+ WordEmbeddingModel
+ The loaded testing model.
+ """
+ return load_test_model()
+
+
+# -------------------------------------------------------------------------------------
+# Word sets
+# -------------------------------------------------------------------------------------
+
+
+@pytest.fixture
+def weat_wordsets() -> Dict[str, List[str]]:
+ """Load the word sets used in WEAT original work.
+
+ Returns
+ -------
+ Dict[str, List[str]]
+ A dictionary that maps a word set name to a set of words.
+ """
+ weat_wordsets = load_weat()
+ return weat_wordsets
+
+
+@pytest.fixture
+def query_2t1a_1(weat_wordsets: Dict[str, List[str]]) -> Query:
+ query = Query(
+ [weat_wordsets["flowers"], weat_wordsets["insects"]],
+ [weat_wordsets["pleasant_5"]],
+ ["Flowers", "Insects"],
+ ["Pleasant"],
+ )
+ return query
+
+
+@pytest.fixture
+def query_2t2a_1(weat_wordsets: Dict[str, List[str]]) -> Query:
+ """Generate a Flower and Insects wrt Pleasant vs Unpleasant test query.
+
+ Parameters
+ ----------
+ weat_wordsets : Dict[str, List[str]]
+ The word sets used in WEAT original work.
+
+ Returns
+ -------
+ Query
+ The generated query.
+ """
+ query = Query(
+ [weat_wordsets["flowers"], weat_wordsets["insects"]],
+ [weat_wordsets["pleasant_5"], weat_wordsets["unpleasant_5"]],
+ ["Flowers", "Insects"],
+ ["Pleasant", "Unpleasant"],
+ )
+ return query
+
+
+@pytest.fixture
+def query_3t2a_1(weat_wordsets: Dict[str, List[str]]) -> Query:
+ query = Query(
+ [
+ weat_wordsets["flowers"],
+ weat_wordsets["insects"],
+ weat_wordsets["instruments"],
+ ],
+ [weat_wordsets["pleasant_5"], weat_wordsets["unpleasant_5"]],
+ ["Flowers", "Weapons", "Instruments"],
+ ["Pleasant", "Unpleasant"],
+ )
+
+ return query
+
+
+@pytest.fixture
+def query_4t2a_1(weat_wordsets: Dict[str, List[str]]) -> Query:
+ query = Query(
+ [
+ weat_wordsets["flowers"],
+ weat_wordsets["insects"],
+ weat_wordsets["instruments"],
+ weat_wordsets["weapons"],
+ ],
+ [weat_wordsets["pleasant_5"], weat_wordsets["unpleasant_5"]],
+ ["Flowers", "Insects", "Instruments", "Weapons"],
+ ["Pleasant", "Unpleasant"],
+ )
+
+ return query
+
+
+@pytest.fixture
+def query_1t4_1(weat_wordsets: Dict[str, List[str]]) -> Query:
+ query = Query(
+ [weat_wordsets["flowers"]],
+ [
+ weat_wordsets["pleasant_5"],
+ weat_wordsets["pleasant_9"],
+ weat_wordsets["unpleasant_5"],
+ weat_wordsets["unpleasant_9"],
+ ],
+ ["Flowers"],
+ ["Pleasant 5 ", "Pleasant 9", "Unpleasant 5", "Unpleasant 9"],
+ )
+ return query
+
+
+@pytest.fixture
+def query_2t1a_lost_vocab_1(weat_wordsets: Dict[str, List[str]]) -> Query:
+ query = Query(
+ [["bla", "asd"], weat_wordsets["insects"]],
+ [weat_wordsets["pleasant_5"]],
+ ["Flowers", "Insects"],
+ ["Pleasant"],
+ )
+
+ return query
+
+
+@pytest.fixture
+def query_2t2a_lost_vocab_1(weat_wordsets: Dict[str, List[str]]) -> Query:
+ query = Query(
+ [["bla", "asd"], weat_wordsets["insects"]],
+ [weat_wordsets["pleasant_5"], weat_wordsets["unpleasant_5"]],
+ ["Flowers", "Insects"],
+ ["Pleasant", "Unpleasant"],
+ )
+
+ return query
diff --git a/tests/metrics/test_RND.py b/tests/metrics/test_RND.py
index 8a38ddb..87481cb 100644
--- a/tests/metrics/test_RND.py
+++ b/tests/metrics/test_RND.py
@@ -13,7 +13,7 @@ def check_RND_result_keys(results: Dict[str, Any]):
"query_name",
"result",
"rnd",
- "distances_by_word",
+ "distance_by_word",
]
@@ -25,7 +25,7 @@ def check_RND_result_values(results: Dict[str, Any]):
assert isinstance(results["result"], np.number)
assert isinstance(results["rnd"], np.number)
- distances_by_word = results["distances_by_word"]
+ distances_by_word = results["distance_by_word"]
assert isinstance(distances_by_word, dict)
assert len(distances_by_word) > 0
for word, distance in distances_by_word.items():
@@ -69,7 +69,10 @@ def test_RND_lost_vocabulary_threshold(
):
rnd = RND()
- result = rnd.run_query(query_2t1a_lost_vocab_1, model,)
+ result = rnd.run_query(
+ query_2t1a_lost_vocab_1,
+ model,
+ )
check_RND_result_keys(result)
assert result["query_name"] == "Flowers and Insects wrt Pleasant"
@@ -77,5 +80,5 @@ def test_RND_lost_vocabulary_threshold(
assert np.isnan(result["result"])
assert np.isnan(result["rnd"])
- assert isinstance(result["distances_by_word"], dict)
- assert len(result["distances_by_word"]) == 0
+ assert isinstance(result["distance_by_word"], dict)
+ assert len(result["distance_by_word"]) == 0
diff --git a/tests/metrics/test_base_metric.py b/tests/metrics/test_base_metric.py
index ddcb43a..986fa49 100644
--- a/tests/metrics/test_base_metric.py
+++ b/tests/metrics/test_base_metric.py
@@ -5,7 +5,9 @@
def test_base_metric_input_checking(
- model: WordEmbeddingModel, query_2t2a_1: Query, query_3t2a_1: Query,
+ model: WordEmbeddingModel,
+ query_2t2a_1: Query,
+ query_3t2a_1: Query,
):
# Create and configure base metric testing.
# disable abstract methods.
@@ -45,7 +47,8 @@ def test_base_metric_input_checking(
def test_validate_old_preprocessor_args_inputs(
- model: WordEmbeddingModel, query_2t2a_1: Query,
+ model: WordEmbeddingModel,
+ query_2t2a_1: Query,
):
# instance test metric
@@ -105,5 +108,7 @@ def test_run_query(model: WordEmbeddingModel, query_2t2a_1: Query):
base_metric.metric_name = "Test Metric"
base_metric.metric_short_name = "TM"
- with pytest.raises(NotImplementedError,):
+ with pytest.raises(
+ NotImplementedError,
+ ):
base_metric.run_query(query_2t2a_1, model)
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index f6de983..cb4a3dd 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -1,6 +1,11 @@
-from wefe.datasets.datasets import (fetch_debias_multiclass, fetch_debiaswe,
- fetch_eds, fetch_gn_glove, load_bingliu,
- load_weat)
+from wefe.datasets.datasets import (
+ fetch_debias_multiclass,
+ fetch_debiaswe,
+ fetch_eds,
+ fetch_gn_glove,
+ load_bingliu,
+ load_weat,
+)
def test_load_bingliu():
diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py
index de980e7..7af0201 100644
--- a/tests/test_preprocessing.py
+++ b/tests/test_preprocessing.py
@@ -3,6 +3,7 @@
import numpy as np
import pytest
+from wefe.datasets.datasets import load_weat
from wefe.preprocessing import (
_warn_not_found_words,
get_embeddings_from_query,
@@ -11,8 +12,87 @@
preprocess_word,
)
from wefe.query import Query
+from wefe.utils import load_test_model
from wefe.word_embedding_model import WordEmbeddingModel
+
+@pytest.fixture
+def model() -> WordEmbeddingModel:
+ """Load a subset of Word2vec as a testing model.
+
+ Returns
+ -------
+ WordEmbeddingModel
+ The loaded testing model.
+ """
+ return load_test_model()
+
+
+@pytest.fixture
+def query_2t2a_1(weat_wordsets: Dict[str, List[str]]) -> Query:
+ """Generate a Flower and Insects wrt Pleasant vs Unpleasant test query.
+
+ Parameters
+ ----------
+ weat_wordsets : Dict[str, List[str]]
+ The word sets used in WEAT original work.
+
+ Returns
+ -------
+ Query
+ The generated query.
+ """
+ query = Query(
+ [weat_wordsets["flowers"], weat_wordsets["insects"]],
+ [weat_wordsets["pleasant_5"], weat_wordsets["unpleasant_5"]],
+ ["Flowers", "Insects"],
+ ["Pleasant", "Unpleasant"],
+ )
+ return query
+
+
+@pytest.fixture
+def weat_wordsets() -> Dict[str, List[str]]:
+ """Load the word sets used in WEAT original work.
+
+ Returns
+ -------
+ Dict[str, List[str]]
+ A dictionary that maps a word set name to a set of words.
+ """
+ weat_wordsets = load_weat()
+ return weat_wordsets
+
+
+@pytest.fixture
+def query_2t2a_uppercase(weat_wordsets: Dict[str, List[str]]) -> Query:
+ """Generate the upper-cased Flowers and Insects wrt Pleasant vs Unpleasant query.
+
+ Parameters
+ ----------
+ weat_wordsets : Dict[str, List[str]]
+ The word sets used in WEAT original work.
+
+ Returns
+ -------
+ Query
+ The generated query.
+ """
+ query = Query(
+ [
+ [s.upper() for s in weat_wordsets["flowers"]],
+ [s.upper() for s in weat_wordsets["insects"]],
+ ],
+ [
+ [s.upper() for s in weat_wordsets["pleasant_5"]],
+ [s.upper() for s in weat_wordsets["unpleasant_5"]],
+ ],
+ ["Flowers", "Insects"],
+ ["Pleasant", "Unpleasant"],
+ )
+ return query
+
+
# --------------------------------------------------------------------------------------
# test preprocess_word
# --------------------------------------------------------------------------------------
@@ -270,7 +350,8 @@ def test_get_embeddings_from_sets_type_checkings(model):
# Test types and value checking.
with pytest.raises(
- TypeError, match=(r"model should be a WordEmbeddingModel instance, got None"),
+ TypeError,
+ match=(r"model should be a WordEmbeddingModel instance, got None"),
):
get_embeddings_from_tuples(None, [["he"]])
@@ -311,19 +392,22 @@ def test_get_embeddings_from_sets_type_checkings(model):
get_embeddings_from_tuples(model, [["she", 1]])
with pytest.raises(
- TypeError, match=r"sets_name should be a string or None, got:.*",
+ TypeError,
+ match=r"sets_name should be a string or None, got:.*",
):
get_embeddings_from_tuples(model, [["she", "he"]], 0)
with pytest.raises(
- TypeError, match=r"warn_lost_sets should be a bool, got:.*",
+ TypeError,
+ match=r"warn_lost_sets should be a bool, got:.*",
):
get_embeddings_from_tuples(
model, [["she", "he"]], "definning", warn_lost_sets=None
)
with pytest.raises(
- TypeError, match=r"verbose should be a bool, got:.*",
+ TypeError,
+ match=r"verbose should be a bool, got:.*",
):
get_embeddings_from_tuples(
model, [["she", "he"]], "definning", True, verbose=None
@@ -545,7 +629,9 @@ def test_get_embeddings_from_query(
def test_get_embeddings_from_query_oov_warns(
- caplog, model: WordEmbeddingModel, weat_wordsets: Dict[str, List[str]],
+ caplog,
+ model: WordEmbeddingModel,
+ weat_wordsets: Dict[str, List[str]],
):
# check lost words warning when warn_not_found_words is True
diff --git a/tests/test_query.py b/tests/test_query.py
index 0fc21f0..53ba8ea 100644
--- a/tests/test_query.py
+++ b/tests/test_query.py
@@ -2,11 +2,23 @@
from typing import Dict, List
import pytest
-
from wefe.datasets.datasets import load_weat
from wefe.query import Query
+@pytest.fixture
+def weat_wordsets() -> Dict[str, List[str]]:
+ """Load the word sets used in WEAT original work.
+
+ Returns
+ -------
+ Dict[str, List[str]]
+ A dictionary that maps a word set name to a set of words.
+ """
+ weat_wordsets = load_weat()
+ return weat_wordsets
+
+
def test_create_query_input_checks():
# target sets None
@@ -343,3 +355,153 @@ def test_wrong_target_and_attribute_sets_and_names(caplog):
["Flowers"],
["Pleasant", "asdf"],
)
+
+
+def test_query__repr__():
+
+ weat_word_set = load_weat()
+ query = Query(
+ [weat_word_set["flowers"], weat_word_set["insects"]],
+ [weat_word_set["pleasant_5"]],
+ ["Flowers", "Insects"],
+ ["Pleasant"],
+ )
+
+ assert query.__repr__() == (
+ ""
+ )
+
+ query_2 = Query([[]], [])
+
+ assert (
+ query_2.__repr__()
+ == ""
+ )
+
+ del query.target_sets
+ assert query.__repr__() == ""
+
+
+def test_query_dict():
+
+ weat_word_set = load_weat()
+ query = Query(
+ [weat_word_set["flowers"], weat_word_set["insects"]],
+ [weat_word_set["pleasant_5"]],
+ ["Flowers", "Insects"],
+ ["Pleasant"],
+ )
+
+ assert query.dict() == {
+ "target_sets": [
+ [
+ "aster",
+ "clover",
+ "hyacinth",
+ "marigold",
+ "poppy",
+ "azalea",
+ "crocus",
+ "iris",
+ "orchid",
+ "rose",
+ "bluebell",
+ "daffodil",
+ "lilac",
+ "pansy",
+ "tulip",
+ "buttercup",
+ "daisy",
+ "lily",
+ "peony",
+ "violet",
+ "carnation",
+ "gladiola",
+ "magnolia",
+ "petunia",
+ "zinnia",
+ ],
+ [
+ "ant",
+ "caterpillar",
+ "flea",
+ "locust",
+ "spider",
+ "bedbug",
+ "centipede",
+ "fly",
+ "maggot",
+ "tarantula",
+ "bee",
+ "cockroach",
+ "gnat",
+ "mosquito",
+ "termite",
+ "beetle",
+ "cricket",
+ "hornet",
+ "moth",
+ "wasp",
+ "blackfly",
+ "dragonfly",
+ "horsefly",
+ "roach",
+ "weevil",
+ ],
+ ],
+ "attribute_sets": [
+ [
+ "caress",
+ "freedom",
+ "health",
+ "love",
+ "peace",
+ "cheer",
+ "friend",
+ "heaven",
+ "loyal",
+ "pleasure",
+ "diamond",
+ "gentle",
+ "honest",
+ "lucky",
+ "rainbow",
+ "diploma",
+ "gift",
+ "honor",
+ "miracle",
+ "sunrise",
+ "family",
+ "happy",
+ "laughter",
+ "paradise",
+ "vacation",
+ ]
+ ],
+ "target_sets_names": ["Flowers", "Insects"],
+ "attribute_sets_names": ["Pleasant"],
+ "query_name": "Flowers and Insects wrt Pleasant",
+ "template": (2, 1),
+ }
+
+ query_2 = Query([[]], [])
+
+ assert query_2.dict() == {
+ "target_sets": [[]],
+ "attribute_sets": [],
+ "target_sets_names": ["Target set 0"],
+ "attribute_sets_names": [],
+ "query_name": "Target set 0",
+ "template": (1, 0),
+ }
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 91a4865..c2d38be 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -8,8 +8,12 @@
from wefe.datasets import load_weat
from wefe.metrics import RND, WEAT
from wefe.query import Query
-from wefe.utils import (calculate_ranking_correlations, create_ranking,
- load_test_model, run_queries)
+from wefe.utils import (
+ calculate_ranking_correlations,
+ create_ranking,
+ load_test_model,
+ run_queries,
+)
from wefe.word_embedding_model import WordEmbeddingModel
gensim_version = semantic_version.Version.coerce(gensim.__version__)
diff --git a/tests/test_word_embedding_model.py b/tests/test_word_embedding_model.py
index c3ef4ac..a1a0af1 100644
--- a/tests/test_word_embedding_model.py
+++ b/tests/test_word_embedding_model.py
@@ -10,6 +10,13 @@
gensim_version = semantic_version.Version.coerce(gensim.__version__)
+@pytest.fixture
+def test_keyed_vectors() -> KeyedVectors:
+
+ test_model = KeyedVectors.load("./wefe/datasets/data/test_model.kv")
+ return test_model
+
+
def test__init__():
# Test types verifications
@@ -62,11 +69,11 @@ def test__init__():
assert model.vocab_prefix == "\\c\\en"
-def test__eq__(keyed_vector_model: gensim.models.KeyedVectors):
- model_1 = WordEmbeddingModel(keyed_vector_model, "w2v")
- model_2 = WordEmbeddingModel(keyed_vector_model, "w2v_2")
- model_3_prefix_a = WordEmbeddingModel(keyed_vector_model, "w2v_3", vocab_prefix="a")
- model_3_prefix_b = WordEmbeddingModel(keyed_vector_model, "w2v_3", vocab_prefix="b")
+def test__eq__(test_keyed_vectors: gensim.models.KeyedVectors):
+ model_1 = WordEmbeddingModel(test_keyed_vectors, "w2v")
+ model_2 = WordEmbeddingModel(test_keyed_vectors, "w2v_2")
+ model_3_prefix_a = WordEmbeddingModel(test_keyed_vectors, "w2v_3", vocab_prefix="a")
+ model_3_prefix_b = WordEmbeddingModel(test_keyed_vectors, "w2v_3", vocab_prefix="b")
assert model_1 != ""
@@ -83,14 +90,17 @@ def test__eq__(keyed_vector_model: gensim.models.KeyedVectors):
assert model_3_prefix_b != model_3_prefix_a
-def test__contains__(model: WordEmbeddingModel):
+def test__contains__(test_keyed_vectors: gensim.models.KeyedVectors):
+ model = WordEmbeddingModel(test_keyed_vectors, "w2v")
+
assert "men" in model
assert "asdf" not in model
assert None not in model
assert 0 not in model
-def test__getitem__(model: WordEmbeddingModel):
+def test__getitem__(test_keyed_vectors: gensim.models.KeyedVectors):
+ model = WordEmbeddingModel(test_keyed_vectors, "w2v")
embedding = model["ASDF"]
assert embedding is None
@@ -100,6 +110,33 @@ def test__getitem__(model: WordEmbeddingModel):
assert embedding.shape == (300,)
+def test__repr__(test_keyed_vectors: gensim.models.KeyedVectors):
+
+ model_1 = WordEmbeddingModel(test_keyed_vectors, "w2v")
+ model_1_no_name = WordEmbeddingModel(test_keyed_vectors)
+ model_1_prefix_a = WordEmbeddingModel(test_keyed_vectors, "w2v", vocab_prefix="a")
+ model_1_no_name_prefix_a = WordEmbeddingModel(test_keyed_vectors, vocab_prefix="a")
+
+ assert (
+ model_1.__repr__()
+ == ""
+ )
+ assert model_1_no_name.__repr__() == (
+ ""
+ )
+ assert model_1_prefix_a.__repr__() == (
+ ""
+ )
+ assert model_1_no_name_prefix_a.__repr__() == (
+ ""
+ )
+
+ del model_1.name
+ assert model_1.__repr__() == ""
+
+
def test__init__with_w2v_model():
if gensim_version.major >= 4:
@@ -127,7 +164,9 @@ def test__init_with_fast_model():
# -------------------------------------------------------------------------------------
-def test_normalize_embeddings(model: WordEmbeddingModel):
+def test_normalize_embeddings(test_keyed_vectors: gensim.models.KeyedVectors):
+ model = WordEmbeddingModel(test_keyed_vectors, "w2v")
+
# test unnormalized embeddings
for word in model.vocab:
assert np.abs(np.linalg.norm(model[word]) - 1.0) > 0.000001
@@ -146,7 +185,8 @@ def test_normalize_embeddings(model: WordEmbeddingModel):
# -------------------------------------------------------------------------------------
-def test_update_embedding(model: WordEmbeddingModel):
+def test_update_embedding(test_keyed_vectors: gensim.models.KeyedVectors):
+ model = WordEmbeddingModel(test_keyed_vectors, "w2v")
new_embedding = np.ones(300, dtype=model.wv.vectors.dtype)
model.update("The", new_embedding)
@@ -178,7 +218,9 @@ def test_update_embedding(model: WordEmbeddingModel):
# -------------------------------------------------------------------------------------
-def test_update_embeddings(model):
+def test_update_embeddings(test_keyed_vectors: gensim.models.KeyedVectors):
+ model = WordEmbeddingModel(test_keyed_vectors, "w2v")
+
words = ["The", "in"]
embeddings = [np.ones(300, dtype=np.float32), np.ones(300, dtype=np.float32) * -1]
@@ -202,11 +244,13 @@ def test_update_embeddings(model):
model.batch_update(None, embeddings)
with pytest.raises(
- TypeError, match=r"embeddings should be a list, tuple or np.array, got:.*",
+ TypeError,
+ match=r"embeddings should be a list, tuple or np.array, got:.*",
):
model.batch_update(words, None)
with pytest.raises(
- ValueError, match=r"words and embeddings must have the same size, got:.*",
+ ValueError,
+ match=r"words and embeddings must have the same size, got:.*",
):
model.batch_update(words + ["is"], embeddings)
diff --git a/wefe/_version.py b/wefe/_version.py
index f9aa3e1..6a9beea 100644
--- a/wefe/_version.py
+++ b/wefe/_version.py
@@ -1 +1 @@
-__version__ = "0.3.2"
+__version__ = "0.4.0"
diff --git a/wefe/datasets/__init__.py b/wefe/datasets/__init__.py
index 7773142..3f5de20 100644
--- a/wefe/datasets/__init__.py
+++ b/wefe/datasets/__init__.py
@@ -1,3 +1,4 @@
+# flake8: noqa
from wefe.datasets.datasets import (
fetch_debias_multiclass,
fetch_debiaswe,
diff --git a/wefe/debias/__init__.py b/wefe/debias/__init__.py
index 334f35f..e69de29 100644
--- a/wefe/debias/__init__.py
+++ b/wefe/debias/__init__.py
@@ -1,7 +0,0 @@
-from wefe.debias.double_hard_debias import DoubleHardDebias
-from wefe.debias.half_sibling_regression import HalfSiblingRegression
-from wefe.debias.hard_debias import HardDebias
-from wefe.debias.multiclass_hard_debias import MulticlassHardDebias
-from wefe.debias.repulsion_attraction_neutralization import (
- RepulsionAttractionNeutralization,
-)
diff --git a/wefe/debias/base_debias.py b/wefe/debias/base_debias.py
index c222df0..9156061 100644
--- a/wefe/debias/base_debias.py
+++ b/wefe/debias/base_debias.py
@@ -8,8 +8,15 @@
class BaseDebias:
"""Mixin class for implement any debias method in WEFE."""
+ # The name of the method.
+ name: str
+
@abstractmethod
- def fit(self, model: WordEmbeddingModel, **fit_params,) -> "BaseDebias":
+ def fit(
+ self,
+ model: WordEmbeddingModel,
+ **fit_params,
+ ) -> "BaseDebias":
"""Fit the transformation.
Parameters
@@ -38,9 +45,10 @@ def transform(
model : WordEmbeddingModel
The word embedding model to debias.
target : Optional[List[str]], optional
- If a set of words is specified in target, the debias method will be performed
- only on the word embeddings of this set. In the case of provide `None`, the
- debias will be performed on all words (except those specified in ignore).
+ If a set of words is specified in target, the debias method will be
+ performed only on the word embeddings of this set. If `None` is provided,
+ the debias will be performed on all words (except those specified
+ in ignore).
by default `None`.
ignore : Optional[List[str]], optional
If target is `None` and a set of words is specified in ignore, the debias
@@ -148,7 +156,10 @@ def _check_transform_args(
raise TypeError(f"copy should be a bool, got {copy}.")
def _check_sets_sizes(
- self, sets: List[List[str]], set_name: str, set_size: Union[int, str],
+ self,
+ sets: List[List[str]],
+ set_name: str,
+ set_size: Union[int, str],
):
if len(sets) == 0:
diff --git a/wefe/debias/double_hard_debias.py b/wefe/debias/double_hard_debias.py
index a2a90b7..377dd07 100644
--- a/wefe/debias/double_hard_debias.py
+++ b/wefe/debias/double_hard_debias.py
@@ -69,7 +69,8 @@ class DoubleHardDebias(BaseDebias):
>>> debiased_model = dhd.transform(
... model=model, ignore=gender_specific
... )
- Copy argument is True. Transform will attempt to create a copy of the original model. This may fail due to lack of memory.
+ Copy argument is True. Transform will attempt to create a copy of the original
+ model. This may fail due to lack of memory.
Model copy created successfully.
If you want the debiased to be performed over a specific set of words you can
@@ -78,7 +79,8 @@ class DoubleHardDebias(BaseDebias):
>>> debiased_model = dhd.transform(
... model=model, target = ['doctor','nurse','programmer','teacher']
... )
- Copy argument is True. Transform will attempt to create a copy of the original model. This may fail due to lack of memory.
+ Copy argument is True. Transform will attempt to create a copy of the original
+ model. This may fail due to lack of memory.
Model copy created successfully.
References
@@ -163,7 +165,9 @@ def __init__(
self.n_components = n_components
def _check_sets_size(
- self, sets: List[List[str]], set_name: str,
+ self,
+ sets: List[List[str]],
+ set_name: str,
):
for idx, set_ in enumerate(sets):
@@ -176,7 +180,7 @@ def _check_sets_size(
f"got {len(set_)} words, expected 2."
)
- def _similarity(self, u: List[np.ndarray], v: List[np.ndarray]) -> np.array:
+ def _similarity(self, u: List[np.ndarray], v: List[np.ndarray]) -> np.ndarray:
return 1 - pairwise_distances(u, v, metric="cosine")
def _bias_by_projection(
@@ -290,7 +294,9 @@ def _drop_frecuency_features(
return droped_frecuencies
def _identify_bias_subspace(
- self, defining_pairs_embeddings, verbose: bool = False,
+ self,
+ defining_pairs_embeddings,
+ verbose: bool = False,
) -> PCA:
matrix = []
@@ -362,6 +368,7 @@ def fit(
model: WordEmbeddingModel,
definitional_pairs: List[List[str]],
bias_representation: List[str],
+ **fit_params,
) -> BaseDebias:
"""Compute the bias direction and obtain the principal components of the entire
set of vectors.
@@ -383,6 +390,7 @@ def fit(
BaseDebias
The debias method fitted.
"""
+
self.definitional_pairs = definitional_pairs
self._check_sets_sizes(self.definitional_pairs, "definitional", set_size=2)
@@ -429,8 +437,8 @@ def fit(
def transform(
self,
model: WordEmbeddingModel,
- target: List[str] = None,
- ignore: List[str] = [],
+ target: Optional[List[str]] = None,
+ ignore: Optional[List[str]] = [],
copy: bool = True,
) -> WordEmbeddingModel:
"""Execute hard debias over the provided model.
@@ -459,7 +467,9 @@ def transform(
"""
# check if the following attributes exist in the object.
self._check_transform_args(
- model=model, ignore=ignore, copy=copy,
+ model=model,
+ ignore=ignore,
+ copy=copy,
)
check_is_fitted(
self,
diff --git a/wefe/debias/half_sibling_regression.py b/wefe/debias/half_sibling_regression.py
index 324f4b7..2476db5 100644
--- a/wefe/debias/half_sibling_regression.py
+++ b/wefe/debias/half_sibling_regression.py
@@ -5,7 +5,6 @@
import numpy as np
from tqdm import tqdm
-
from wefe.debias.base_debias import BaseDebias
from wefe.preprocessing import get_embeddings_from_tuples
from wefe.utils import check_is_fitted
@@ -13,7 +12,7 @@
class HalfSiblingRegression(BaseDebias):
- """Half Sibling Debias method.
+ r"""Half Sibling Debias method.
This method proposes to learn spurious gender information via causal
inference by utilizing the statistical dependency between gender-biased
@@ -40,7 +39,7 @@ class HalfSiblingRegression(BaseDebias):
.. math::
- W = ((V_d)^T V_d + \\alpha I)^{-1} (V_d)^TV_n
+ W = ((V_d)^T V_d + \alpha I)^{-1} (V_d)^TV_n
2. Compute the gender information:
@@ -86,7 +85,8 @@ class HalfSiblingRegression(BaseDebias):
... )
>>> # execute the debias on the words not included in the gender definition set
>>> debiased_model = hsr.transform(model = model)
- Copy argument is True. Transform will attempt to create a copy of the original model. This may fail due to lack of memory.
+ Copy argument is True. Transform will attempt to create a copy of the original
+ model. This may fail due to lack of memory.
Model copy created successfully.
>>>
>>>
@@ -95,7 +95,8 @@ class HalfSiblingRegression(BaseDebias):
>>> debiased_model = hsr.transform(
... model=model, target=["doctor", "nurse", "programmer"]
... )
- Copy argument is True. Transform will attempt to create a copy of the original model. This may fail due to lack of memory.
+ Copy argument is True. Transform will attempt to create a copy of the original
+ model. This may fail due to lack of memory.
Model copy created successfully.
>>>
>>> # if you want to exclude a set of words from the debias process
@@ -103,7 +104,8 @@ class HalfSiblingRegression(BaseDebias):
>>> debiased_model = hsr.transform(
... model=model, ignore=["dress", "beard", "niece", "nephew"]
... )
- Copy argument is True. Transform will attempt to create a copy of the original model. This may fail due to lack of memory.
+ Copy argument is True. Transform will attempt to create a copy of the original
+ model. This may fail due to lack of memory.
Model copy created successfully.
References
diff --git a/wefe/debias/hard_debias.py b/wefe/debias/hard_debias.py
index 3158193..7da2dd8 100644
--- a/wefe/debias/hard_debias.py
+++ b/wefe/debias/hard_debias.py
@@ -15,7 +15,7 @@
class HardDebias(BaseDebias):
- """Hard Debias debiasing method.
+ r"""Hard Debias debiasing method.
Hard debias is a method that allows mitigating biases through geometric operations
on embeddings.
@@ -38,7 +38,8 @@ class HardDebias(BaseDebias):
First, it is defined a set of words that are correct to be related to the bias
criterion: the *criterion specific gender words*.
For example, in the case of gender, *gender specific* words are:
- ``['he', 'his', 'He', 'her', 'she', 'him', 'him', 'She', 'man', 'women', 'men'...]``.
+ ``['he', 'his', 'He', 'her', 'she', 'him', 'him', 'She', 'man', 'women', 'men',
+ ...]``.
Then, it is defined that all words outside this set should have no relation to the
bias criterion and thus have the possibility of being biased. (e.g. for the case of
@@ -55,13 +56,13 @@ class HardDebias(BaseDebias):
.. math::
- \\text{bias_subspace} = \\frac{v \\cdot (v \\cdot u)}{(v \\cdot v)}
+ \text{bias subspace} = \frac{v \cdot (v \cdot u)}{(v \cdot v)}
Then subtract the projection from the embedding.
.. math::
- u' = u - \\text{bias_subspace}
+ u' = u - \text{bias subspace}
3. Equalizate the embeddings with respect to the bias direction.
Given an equalization set (set of word pairs such as ``['she', 'he'],
@@ -139,7 +140,8 @@ class HardDebias(BaseDebias):
References
----------
- | [1]: Bolukbasi, T., Chang, K. W., Zou, J. Y., Saligrama, V., & Kalai, A. T. (2016).
+ | [1]: Bolukbasi, T., Chang, K. W., Zou, J. Y., Saligrama, V., & Kalai, A. T.
+ (2016).
| Man is to computer programmer as woman is to homemaker? debiasing word embeddings.
| Advances in Neural Information Processing Systems.
| [2]: https://github.com/tolga-b/debiaswe
@@ -166,8 +168,8 @@ def __init__(
by default False.
criterion_name : Optional[str], optional
The name of the criterion for which the debias is being executed,
- e.g., 'Gender'. This will indicate the name of the model returning transform,
- by default None
+ e.g., 'Gender'. This will indicate the name of the model returning
+ transform, by default None
"""
# check verbose
if not isinstance(verbose, bool):
@@ -181,22 +183,10 @@ def __init__(
else:
raise ValueError(f"criterion_name should be str, got: {criterion_name}")
- def _check_sets_size(
- self, sets: List[List[str]], set_name: str,
- ):
-
- for idx, set_ in enumerate(sets):
- if len(set_) != 2:
- adverb = "less" if len(set_) < 2 else "more"
-
- raise ValueError(
- f"The {set_name} pair at position {idx} ({set_}) has {adverb} "
- f"words than allowed by {self.name}: "
- f"got {len(set_)} words, expected 2."
- )
-
def _identify_bias_subspace(
- self, definning_pairs_embeddings: List[EmbeddingDict], verbose: bool = False,
+ self,
+ definning_pairs_embeddings: List[EmbeddingDict],
+ verbose: bool = False,
) -> PCA:
matrix = []
@@ -285,7 +275,10 @@ def _equalize(
):
(
(word_a, embedding_a),
- (word_b, embedding_b,),
+ (
+ word_b,
+ embedding_b,
+ ),
) = equalize_pair_embeddings.items()
y = self._drop((embedding_a + embedding_b) / 2, bias_direction)
@@ -353,7 +346,8 @@ def fit(
print("Identifying the bias subspace.")
self.pca_ = self._identify_bias_subspace(
- self.definitional_pairs_embeddings_, self.verbose,
+ self.definitional_pairs_embeddings_,
+ self.verbose,
)
self.bias_direction_ = self.pca_.components_[0]
@@ -411,12 +405,13 @@ def transform(
model : WordEmbeddingModel
The word embedding model to debias.
target : Optional[List[str]], optional
- If a set of words is specified in target, the debias method will be performed
- only on the word embeddings of this set. If `None` is provided, the
- debias will be performed on all words (except those specified in ignore).
+ If a set of words is specified in target, the debias method will be
+ performed only on the word embeddings of this set. If `None` is provided,
+ the debias will be performed on all words (except those specified in
+ ignore).
Note that some words that are not in target may be modified due to the
equalization process.
- By default `None`.
+ By default `None`.
ignore : Optional[List[str]], optional
If target is `None` and a set of words is specified in ignore, the debias
method will perform the debias in all words except those specified in this
@@ -441,7 +436,10 @@ def transform(
# Check types and if the method is fitted
self._check_transform_args(
- model=model, target=target, ignore=ignore, copy=copy,
+ model=model,
+ target=target,
+ ignore=ignore,
+ copy=copy,
)
# check if the following attributes exist in the object.
@@ -501,7 +499,9 @@ def transform(
print("Equalizing embeddings.")
self._equalize(
- model, self.equalize_pairs_embeddings_, self.bias_direction_,
+ model,
+ self.equalize_pairs_embeddings_,
+ self.bias_direction_,
)
if self.verbose:
diff --git a/wefe/debias/multiclass_hard_debias.py b/wefe/debias/multiclass_hard_debias.py
index daaf6fc..51f979e 100644
--- a/wefe/debias/multiclass_hard_debias.py
+++ b/wefe/debias/multiclass_hard_debias.py
@@ -29,7 +29,7 @@ class MulticlassHardDebias(BaseDebias):
For more information on the use of mitigation methods, visit
:ref:`bias mitigation` in the User Guide.
- The following example shows how to run an ethnicity debias based on Black,
+ The following example shows how to run an ethnicity debias based on Black,
Asian and Caucasian groups.
>>> from wefe.datasets import fetch_debias_multiclass, load_weat
@@ -116,7 +116,8 @@ def __init__(
)
def _identify_bias_subspace(
- self, definning_sets_embeddings: List[EmbeddingDict],
+ self,
+ definning_sets_embeddings: List[EmbeddingDict],
) -> PCA:
matrix = []
@@ -149,27 +150,6 @@ def _project_onto_subspace(self, vector, subspace):
v_b += np.dot(vector.transpose(), component) * component
return v_b
- def _get_target(
- self, model: WordEmbeddingModel, target: Optional[List[str]] = None,
- ) -> List[str]:
-
- definitional_words = np.array(self.definitional_sets_).flatten().tolist()
-
- if target is not None:
- # keep only words in the model's vocab.
- target = list(
- filter(
- lambda x: x in model.vocab and x not in definitional_words, target,
- )
- )
- else:
- # indicate that all words are canditates to neutralize.
- target = list(
- filter(lambda x: x not in definitional_words, model.vocab.keys(),)
- )
-
- return target
-
def _neutralize(
self,
model: WordEmbeddingModel,
@@ -277,7 +257,9 @@ def fit(
# Identify the bias subspace using the definning sets.
if self.verbose:
print("Identifying the bias subspace.")
- self.pca_ = self._identify_bias_subspace(self.definitional_sets_embeddings_,)
+ self.pca_ = self._identify_bias_subspace(
+ self.definitional_sets_embeddings_,
+ )
self.bias_subspace_ = self.pca_.components_[: self.pca_num_components_]
# ------------------------------------------------------------------------------
@@ -311,9 +293,10 @@ def transform(
model : WordEmbeddingModel
The word embedding model to debias.
target : Optional[List[str]], optional
- If a set of words is specified in target, the debias method will be performed
- only on the word embeddings of this set. If `None` is provided, the
- debias will be performed on all words (except those specified in ignore).
+ If a set of words is specified in target, the debias method will be
+ performed only on the word embeddings of this set. If `None` is provided,
+ the debias will be performed on all words (except those specified in
+ ignore).
Note that some words that are not in target may be modified due to the
equalization process.
by default `None`.
@@ -338,7 +321,10 @@ def transform(
The debiased embedding model.
"""
self._check_transform_args(
- model=model, target=target, ignore=ignore, copy=copy,
+ model=model,
+ target=target,
+ ignore=ignore,
+ copy=copy,
)
check_is_fitted(
diff --git a/wefe/debias/repulsion_attraction_neutralization.py b/wefe/debias/repulsion_attraction_neutralization.py
index a02fe04..3cb4d7a 100644
--- a/wefe/debias/repulsion_attraction_neutralization.py
+++ b/wefe/debias/repulsion_attraction_neutralization.py
@@ -1,6 +1,6 @@
"""Repulsion Attraction Neutralization WEFE implementation."""
from copy import deepcopy
-from typing import Any, Callable, Dict, List, Optional, Sequence
+from typing import Any, Dict, List, Optional, Sequence
import numpy as np
from sklearn.decomposition import PCA
@@ -41,7 +41,7 @@ def __init__(
repulsion_set: List[np.ndarray]
Set of embeddings to be repeled from word
bias_direction: np.array
-
+
"""
super(RAN_Loss, self).__init__()
self.w = w
@@ -142,7 +142,7 @@ def _forward(self):
class RepulsionAttractionNeutralization(BaseDebias):
- """Repulsion Attraction Neutralization method.
+ r"""Repulsion Attraction Neutralization method.
.. warning::
@@ -188,7 +188,7 @@ class RepulsionAttractionNeutralization(BaseDebias):
.. math::
- F_r(w_d) = \\sum |cos(w_d,n_i)| / |S|
+ F_r(w_d) = \sum |cos(w_d,n_i)| / |S|
.. math::
@@ -202,7 +202,7 @@ class RepulsionAttractionNeutralization(BaseDebias):
.. math::
- F(w_d) = \\lambda_1 F_r(w_d) + \\lambda_2 F_a(w_d) + \\lambda_3 F_n(w_d)
+ F(w_d) = \lambda_1 F_r(w_d) + \lambda_2 F_a(w_d) + \lambda_3 F_n(w_d)
In the original implementation is define a preserve set :math:`(V_p)` corresponding
to words for which gender carries semantic importance, this words are not
@@ -210,7 +210,7 @@ class RepulsionAttractionNeutralization(BaseDebias):
In WEFE this words would be the ones included in the ignore parameter of the
transform method. The words that are not present in :math:`V_p` are the ones to be
- included in the debias process and form part of the debias set :math:`(V_d)`,
+ included in the debias process and form part of the debias set :math:`(V_d)`,
in WEFE this words can be specified in the target parameter of the transform method.
Examples
@@ -239,7 +239,8 @@ class RepulsionAttractionNeutralization(BaseDebias):
>>> debiased_model = ran.transform(
... model = model, target = ['doctor','nurse','programmer']
... )
- Copy argument is True. Transform will attempt to create a copyof the original model. This may fail due to lack of memory.
+ Copy argument is True. Transform will attempt to create a copyof the original model.
+ This may fail due to lack of memory.
Model copy created successfully.
>>> # if you don't want a set of words to be debiased include them in the ignore set
>>> gender_specific = debiaswe_wordsets["gender_specific"]
@@ -332,7 +333,9 @@ def __init__(
self.learning_rate = learning_rate
def _identify_bias_subspace(
- self, defining_pairs_embeddings: List[EmbeddingDict], verbose: bool = False,
+ self,
+ defining_pairs_embeddings: List[EmbeddingDict],
+ verbose: bool = False,
) -> PCA:
matrix = []
@@ -357,7 +360,9 @@ def _identify_bias_subspace(
return pca
def _check_sets_size(
- self, sets: Sequence[Sequence[str]], set_name: str,
+ self,
+ sets: Sequence[Sequence[str]],
+ set_name: str,
):
for idx, set_ in enumerate(sets):
@@ -443,7 +448,15 @@ def _debias(
weights: List[float],
) -> torch.Tensor:
- ran = RAN(model, word, w_b, w, repulsion_set, bias_direction, weights,)
+ ran = RAN(
+ model,
+ word,
+ w_b,
+ w,
+ repulsion_set,
+ bias_direction,
+ weights,
+ )
optimizer = torch.optim.Adam(ran.parameters(), lr=learning_rate)
for epoch in range(epochs):
optimizer.zero_grad()
@@ -458,7 +471,9 @@ def _init_vector(self, model: WordEmbeddingModel, word: str) -> torch.Tensor:
return torch.FloatTensor(np.array(v))
def fit(
- self, model: WordEmbeddingModel, definitional_pairs: Sequence[Sequence[str]],
+ self,
+ model: WordEmbeddingModel,
+ definitional_pairs: Sequence[Sequence[str]],
) -> BaseDebias:
"""Compute the bias direction.
@@ -501,7 +516,8 @@ def fit(
print("Identifying the bias subspace.")
self.pca_ = self._identify_bias_subspace(
- self.definitional_pairs_embeddings_, self.verbose,
+ self.definitional_pairs_embeddings_,
+ self.verbose,
)
self.bias_direction_ = self.pca_.components_[0]
return self
diff --git a/wefe/metrics/ECT.py b/wefe/metrics/ECT.py
index 356f9ac..c3b337c 100644
--- a/wefe/metrics/ECT.py
+++ b/wefe/metrics/ECT.py
@@ -4,6 +4,7 @@
import numpy as np
from scipy.spatial.distance import cosine
from scipy.stats import spearmanr
+
from wefe.metrics.base_metric import BaseMetric
from wefe.preprocessing import get_embeddings_from_query
from wefe.query import Query
@@ -17,7 +18,7 @@ class ECT(BaseMetric):
It calculates the average target group vectors, measures the cosine similarity of
each to a list of attribute words and calculates the correlation of the resulting
similarity lists.
-
+
Values closer to 1 are better as they represent less bias.
The general steps of the test, as defined in [1], are as follows:
@@ -89,7 +90,7 @@ def run_query(
titlecase.
* ``strip_accents``: ``bool``, ``{'ascii', 'unicode'}``: Specifies that
the accents of the words are eliminated. The stripping type can be
- specified. True uses ‘unicode’ by default.
+ specified. True uses 'unicode' by default.
* ``preprocessor``: ``Callable``. It receives a function that operates
on each word. In the case of specifying a function, it overrides the
default preprocessor (i.e., the previous options stop working).
@@ -97,14 +98,14 @@ def run_query(
A list of preprocessor options allows you to search for several
variants of the words into the model. For example, the preprocessors
``[{}, {"lowercase": True, "strip_accents": True}]``
- ``{}`` allows first to search for the original words in the vocabulary of
+ ``{}`` allows searching first for the original words in the vocabulary of
the model. In case some of them are not found,
``{"lowercase": True, "strip_accents": True}`` is executed on these words
and then they are searched in the model vocabulary.
strategy : str, optional
The strategy indicates how it will use the preprocessed words: 'first' will
- include only the first transformed word found. all' will include all
+ include only the first transformed word found. 'all' will include all
transformed words found, by default "first".
normalize : bool, optional
diff --git a/wefe/metrics/MAC.py b/wefe/metrics/MAC.py
index 45d6cee..363c40e 100644
--- a/wefe/metrics/MAC.py
+++ b/wefe/metrics/MAC.py
@@ -3,7 +3,6 @@
import numpy as np
from scipy.spatial import distance
-
from wefe.metrics.base_metric import BaseMetric
from wefe.preprocessing import get_embeddings_from_query
from wefe.query import Query
@@ -23,9 +22,9 @@ class MAC(BaseMetric):
For each target set:
For each word embedding in the target set:
For each attribute set:
- Calculate the cosine similarity of the target embedding and each attribute
- embedding of the set.
- Calculate the mean of the cosines similarities and store it in a array.
+ Calculate the cosine similarity of the target embedding and each attribute
+ embedding of the set.
+ Calculate the mean of the cosine similarities and store it in an array.
Average all the mean cosine similarities and return the calculated score.
The closer the value is to 1, the less biased the query will be.
@@ -140,7 +139,7 @@ def run_query(
titlecase.
* ``strip_accents``: ``bool``, ``{'ascii', 'unicode'}``: Specifies that
the accents of the words are eliminated. The stripping type can be
- specified. True uses ‘unicode’ by default.
+ specified. True uses 'unicode' by default.
* ``preprocessor``: ``Callable``. It receives a function that operates
on each word. In the case of specifying a function, it overrides the
default preprocessor (i.e., the previous options stop working).
@@ -148,14 +147,14 @@ def run_query(
A list of preprocessor options allows you to search for several
variants of the words into the model. For example, the preprocessors
``[{}, {"lowercase": True, "strip_accents": True}]``
- ``{}`` allows first to search for the original words in the vocabulary of
+ ``{}`` allows searching first for the original words in the vocabulary of
the model. In case some of them are not found,
``{"lowercase": True, "strip_accents": True}`` is executed on these words
and then they are searched in the model vocabulary.
strategy : str, optional
The strategy indicates how it will use the preprocessed words: 'first' will
- include only the first transformed word found. all' will include all
+ include only the first transformed word found. 'all' will include all
transformed words found, by default "first".
normalize : bool, optional
@@ -174,6 +173,12 @@ def run_query(
Examples
--------
+ The following example shows how to run a query that measures gender bias using
+ MAC.
+ Note that the results return both the result of the metric and the cosine
+ distance of each target embedding with respect to the average embedding
+ of each attribute set.
+
>>> from wefe.metrics import MAC
>>> from wefe.query import Query
>>> from wefe.utils import load_test_model
@@ -200,28 +205,233 @@ def run_query(
>>> model = load_test_model()
>>>
>>> # instance the metric and run the query
- >>> MAC().run_query(query, model) # doctest: +SKIP
- {'query_name': 'Female terms and Male Terms wrt Family and Careers',
- 'result': 0.8416415235615204,
- 'mac': 0.8416415235615204,
- 'targets_eval': {'Female terms': {'female': {'Family': 0.9185737599618733,
- 'Careers': 0.916069650076679},
- 'woman': {'Family': 0.752434104681015, 'Careers': 0.9377805145923048},
- 'girl': {'Family': 0.707457959651947, 'Careers': 0.9867974997032434},
- 'sister': {'Family': 0.5973392464220524, 'Careers': 0.9482253392925486},
- 'she': {'Family': 0.7872791914269328, 'Careers': 0.9161583095556125},
- 'her': {'Family': 0.7883057091385126, 'Careers': 0.9237247597193345},
- 'hers': {'Family': 0.7385367527604103, 'Careers': 0.9480051446007565},
- 'daughter': {'Family': 0.5472579970955849, 'Careers': 0.9277344475267455}},
- 'Male Terms': {'male': {'Family': 0.8735092766582966,
- 'Careers': 0.9468009045813233},
- 'man': {'Family': 0.8249392118304968, 'Careers': 0.9350165261421353},
- 'boy': {'Family': 0.7106057899072766, 'Careers': 0.9879048476286698},
- 'brother': {'Family': 0.6280269809067249, 'Careers': 0.9477180293761194},
- 'he': {'Family': 0.8693044614046812, 'Careers': 0.8771287016716087},
- 'him': {'Family': 0.8230192996561527, 'Careers': 0.888683641096577},
- 'his': {'Family': 0.8876195731572807, 'Careers': 0.8920885202242061},
- 'son': {'Family': 0.5764635019004345, 'Careers': 0.9220191016211174}}}}
+ >>> MAC().run_query(query, model)
+ {
+ "query_name": "Female terms and Male Terms wrt Family and Careers",
+ "result": 0.8416415235615204,
+ "mac": 0.8416415235615204,
+ "targets_eval": {
+ "Female terms": {
+ "female": {"Family": 0.9185737599618733, "Careers": 0.916069650076679},
+ "woman": {"Family": 0.752434104681015, "Careers": 0.9377805145923048},
+ "girl": {"Family": 0.707457959651947, "Careers": 0.9867974997032434},
+ "sister": {"Family": 0.5973392464220524, "Careers": 0.9482253392925486},
+ "she": {"Family": 0.7872791914269328, "Careers": 0.9161583095556125},
+ "her": {"Family": 0.7883057091385126, "Careers": 0.9237247597193345},
+ "hers": {"Family": 0.7385367527604103, "Careers": 0.9480051446007565},
+ "daughter": {"Family": 0.5472579970955849, "Careers": 0.9277344475267455},
+ },
+ "Male Terms": {
+ "male": {"Family": 0.8735092766582966, "Careers": 0.9468009045813233},
+ "man": {"Family": 0.8249392118304968, "Careers": 0.9350165261421353},
+ "boy": {"Family": 0.7106057899072766, "Careers": 0.9879048476286698},
+ "brother": {"Family": 0.6280269809067249, "Careers": 0.9477180293761194},
+ "he": {"Family": 0.8693044614046812, "Careers": 0.8771287016716087},
+ "him": {"Family": 0.8230192996561527, "Careers": 0.888683641096577},
+ "his": {"Family": 0.8876195731572807, "Careers": 0.8920885202242061},
+ "son": {"Family": 0.5764635019004345, "Careers": 0.9220191016211174},
+ },
+ },
+ }
+
+ MAC was originally designed to accept more than two target sets.
+ The following example shows how to run a Query that measures ethnic bias
+ (black, white and Asian) with respect to biased concepts commonly associated
+ with these groups:
+
+ >>> from wefe.word_embedding_model import WordEmbeddingModel
+ >>> from wefe.query import Query
+ >>> from wefe.metrics import MAC
+ >>>
+ >>> ethnicity_query = Query(
+ ... target_sets=[
+ ... ["black", "africa"],
+ ... ["caucasian", "america"],
+ ... ["asian", "asia"]],
+ ... attribute_sets=[
+ ... ["slave", "musician", "runner", "criminal", "homeless"],
+ ... ["manager", "executive", "redneck", "hillbilly", "leader", "farmer"],
+ ... ["doctor", "engineer", "laborer", "teacher"],
+ ... ],
+ ... target_sets_names=["Black words", "White words", "Asian words"],
+ ... attribute_sets_names=[
+ ... "Black biased words",
+ ... "White biased words",
+ ... "Asian biased words",
+ ... ],
+ ... )
+ >>>
+ >>> import gensim.downloader as api  # load word2vec from the gensim api
+ >>> model = WordEmbeddingModel(api.load("word2vec-google-news-300"), "word2vec")
+ >>>
+ >>> # instance the metric and run the query
+ >>> MAC().run_query(ethnicity_query, model)
+ {
+ "query_name": "Black words, White words and Asian words wrt Black biased words, White biased words and Asian biased words",
+ "result": 0.9462675075454171,
+ "mac": 0.9462675075454171,
+ "targets_eval": {
+ "Black words": {
+ "black": {
+ "Black biased words": 0.887664645537734,
+ "White biased words": 0.8847345071844757,
+ "Asian biased words": 0.9767352174967527,
+ },
+ "africa": {
+ "Black biased words": 0.9635645077563822,
+ "White biased words": 0.9766113509734472,
+ "Asian biased words": 1.0130788520909846,
+ },
+ },
+ "White words": {
+ "caucasian": {
+ "Black biased words": 0.8768320322036743,
+ "White biased words": 0.8743524154027303,
+ "Asian biased words": 0.9000119741540402,
+ },
+ "america": {
+ "Black biased words": 0.9680319786071777,
+ "White biased words": 0.9214469141637286,
+ "Asian biased words": 0.994440613896586,
+ },
+ },
+ "Asian words": {
+ "asian": {
+ "Black biased words": 0.965478494623676,
+ "White biased words": 0.9149698968976736,
+ "Asian biased words": 0.9534151484258473,
+ },
+ "asia": {
+ "Black biased words": 0.972597711533308,
+ "White biased words": 0.9664777771686204,
+ "Asian biased words": 1.0223710977006704,
+ },
+ },
+ },
+ }
+
+ We can also test words that represent religious groups and beliefs:
+
+ >>> from wefe.word_embedding_model import WordEmbeddingModel
+ >>> from wefe.query import Query
+ >>> from wefe.metrics import MAC
+ >>> from wefe.utils import load_test_model
+ >>> import gensim.downloader as api
+ >>>
+ >>> # load word2vec from gensim api.
+ >>> model = WordEmbeddingModel(api.load("word2vec-google-news-300"), "word2vec")
+ >>>
+ >>> # create the query
+ >>> religion_query = Query(
+ ... target_sets=[
+ ... ["judaism", "jew", "synagogue", "torah", "rabbi"],
+ ... ["christianity", "christian", "church", "bible", "priest"],
+ ... ["islam", "muslim", "mosque", "quran", "imam"],
+ ... ],
+ ... attribute_sets=[
+ ... ["greedy", "cheap", "hairy", "liberal"],
+ ... ["judgmental", "conservative", "familial"],
+ ... ["violent", "terrorist", "dirty", "uneducated"],
+ ... ],
+ ... target_sets_names=["judaism", "christianity", "islam"],
+ ... attribute_sets_names=[
+ ... "jew biased words",
+ ... "christian biased words",
+ ... "musilm biased words",
+ ... ],
+ ... )
+ >>>
+ >>> # instance the metric and run the query
+ >>> MAC().run_query(religion_query, model, warn_not_found_words=True)
+ {
+ "query_name": "judaism, christianity and islam wrt jew biased words, christian biased words and musilm biased words",
+ "result": 0.8589896201628209,
+ "mac": 0.8589896201628209,
+ "targets_eval": {
+ "judaism": {
+ "judaism": {
+ "jew biased words": 0.8744675349444151,
+ "christian biased words": 0.815421904126803,
+ "musilm biased words": 0.8894469570368528,
+ },
+ "jew": {
+ "jew biased words": 0.7810277417302132,
+ "christian biased words": 0.8705306425690651,
+ "musilm biased words": 0.8410659478977323,
+ },
+ "synagogue": {
+ "jew biased words": 0.9586692564189434,
+ "christian biased words": 0.8717945317427317,
+ "musilm biased words": 0.9161230166791938,
+ },
+ "torah": {
+ "jew biased words": 0.9311909799580462,
+ "christian biased words": 0.8741760378082594,
+ "musilm biased words": 0.9664547641441459,
+ },
+ "rabbi": {
+ "jew biased words": 0.9022729225689545,
+ "christian biased words": 0.8595656901597977,
+ "musilm biased words": 0.9270578834693879,
+ },
+ },
+ "christianity": {
+ "christianity": {
+ "jew biased words": 0.8192066270858049,
+ "christian biased words": 0.783344641327858,
+ "musilm biased words": 0.808249220252037,
+ },
+ "christian": {
+ "jew biased words": 0.8092729989439249,
+ "christian biased words": 0.7565138414502144,
+ "musilm biased words": 0.7822588048875332,
+ },
+ "church": {
+ "jew biased words": 0.934008174444898,
+ "christian biased words": 0.8065384129683176,
+ "musilm biased words": 0.8915035352110863,
+ },
+ "bible": {
+ "jew biased words": 0.8507496938109398,
+ "christian biased words": 0.8642359959194437,
+ "musilm biased words": 0.8490688409656286,
+ },
+ "priest": {
+ "jew biased words": 0.9257305036298931,
+ "christian biased words": 0.861459826429685,
+ "musilm biased words": 0.8620996568351984,
+ },
+ },
+ "islam": {
+ "islam": {
+ "jew biased words": 0.8377434946596622,
+ "christian biased words": 0.8127042253812155,
+ "musilm biased words": 0.7525370791554451,
+ },
+ "muslim": {
+ "jew biased words": 0.8212915528565645,
+ "christian biased words": 0.8246404901146889,
+ "musilm biased words": 0.7299829311668873,
+ },
+ "mosque": {
+ "jew biased words": 0.9514421001076698,
+ "christian biased words": 0.8898302918920914,
+ "musilm biased words": 0.8566081328317523,
+ },
+ "quran": {
+ "jew biased words": 0.913289612159133,
+ "christian biased words": 0.8723569065332413,
+ "musilm biased words": 0.8311764020472765,
+ },
+ "imam": {
+ "jew biased words": 0.9434488633705769,
+ "christian biased words": 0.8907990877827009,
+ "musilm biased words": 0.8431751518510282,
+ },
+ },
+ },
+ }
+
"""
# check the types of the provided arguments (only the defaults).
self._check_input(query, model, locals())
diff --git a/wefe/metrics/RIPA.py b/wefe/metrics/RIPA.py
index 347e78f..74b191b 100644
--- a/wefe/metrics/RIPA.py
+++ b/wefe/metrics/RIPA.py
@@ -2,6 +2,7 @@
from typing import Any, Callable, Dict, List, Tuple, Union
import numpy as np
+
from wefe.metrics.base_metric import BaseMetric
from wefe.preprocessing import get_embeddings_from_query
from wefe.query import Query
@@ -186,7 +187,7 @@ def run_query(
titlecase.
* ``strip_accents``: ``bool``, ``{'ascii', 'unicode'}``: Specifies that
the accents of the words are eliminated. The stripping type can be
- specified. True uses ‘unicode’ by default.
+ specified. True uses 'unicode' by default.
* ``preprocessor``: ``Callable``. It receives a function that operates
on each word. In the case of specifying a function, it overrides the
default preprocessor (i.e., the previous options stop working).
@@ -194,14 +195,14 @@ def run_query(
A list of preprocessor options allows you to search for several
variants of the words into the model. For example, the preprocessors
``[{}, {"lowercase": True, "strip_accents": True}]``
- ``{}`` allows first to search for the original words in the vocabulary of
+ ``{}`` allows searching first for the original words in the vocabulary of
the model. In case some of them are not found,
``{"lowercase": True, "strip_accents": True}`` is executed on these words
and then they are searched in the model vocabulary.
strategy : str, optional
The strategy indicates how it will use the preprocessed words: 'first' will
- include only the first transformed word found. all' will include all
+ include only the first transformed word found. 'all' will include all
transformed words found, by default "first".
normalize : bool, optional
diff --git a/wefe/metrics/RND.py b/wefe/metrics/RND.py
index 5dfa977..5b8fe4e 100644
--- a/wefe/metrics/RND.py
+++ b/wefe/metrics/RND.py
@@ -3,7 +3,6 @@
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
-
from wefe.metrics.base_metric import BaseMetric
from wefe.preprocessing import get_embeddings_from_query
from wefe.query import Query
@@ -96,13 +95,13 @@ def __calc_rnd(
# by word
distance_by_words[attribute_words[attribute_word_index]] = current_distance
- sorted_distances_by_word = {
+ sorted_distance_by_word = {
k: v for k, v in sorted(distance_by_words.items(), key=lambda item: item[1])
}
# calculate the average of the distances and return
mean_distance = sum_of_distances / len(distance_by_words)
- return mean_distance, sorted_distances_by_word
+ return mean_distance, sorted_distance_by_word
def run_query(
self,
@@ -152,7 +151,7 @@ def run_query(
titlecase.
* ``strip_accents``: ``bool``, ``{'ascii', 'unicode'}``: Specifies that
the accents of the words are eliminated. The stripping type can be
- specified. True uses ‘unicode’ by default.
+ specified. True uses 'unicode' by default.
* ``preprocessor``: ``Callable``. It receives a function that operates
on each word. In the case of specifying a function, it overrides the
default preprocessor (i.e., the previous options stop working).
@@ -160,14 +159,14 @@ def run_query(
A list of preprocessor options allows you to search for several
variants of the words into the model. For example, the preprocessors
``[{}, {"lowercase": True, "strip_accents": True}]``
- ``{}`` allows first to search for the original words in the vocabulary of
+ ``{}`` allows searching first for the original words in the vocabulary of
the model. In case some of them are not found,
``{"lowercase": True, "strip_accents": True}`` is executed on these words
and then they are searched in the model vocabulary.
strategy : str, optional
The strategy indicates how it will use the preprocessed words: 'first' will
- include only the first transformed word found. all' will include all
+ include only the first transformed word found. 'all' will include all
transformed words found, by default "first".
normalize : bool, optional
@@ -214,11 +213,11 @@ def run_query(
>>> model = load_test_model()
>>>
>>> # instance the metric and run the query
- >>> RND().run_query(query, model) # doctest: +SKIP
+ >>> RND().run_query(query, model)
{'query_name': 'Female terms and Male Terms wrt Family',
'result': 0.030381828546524048,
'rnd': 0.030381828546524048,
- 'distances_by_word': {'wedding': -0.1056304,
+ 'distance_by_word': {'wedding': -0.1056304,
'marriage': -0.10163283,
'children': -0.068374634,
'parents': 0.00097084045,
@@ -228,14 +227,14 @@ def run_query(
'home': 0.1733501}}
>>>
- If you want the embeddings to be normalized before calculating the metrics
+ If you want the embeddings to be normalized before calculating the metrics,
use the normalize parameter as True before executing the query.
- >>> RND().run_query(query, model, normalize=True) # doctest: +SKIP
+ >>> RND().run_query(query, model, normalize=True)
{'query_name': 'Female terms and Male Terms wrt Family',
'result': -0.006278775632381439,
'rnd': -0.006278775632381439,
- 'distances_by_word': {'children': -0.05244279,
+ 'distance_by_word': {'children': -0.05244279,
'wedding': -0.04642248,
'marriage': -0.04268837,
'parents': -0.022358716,
@@ -244,14 +243,14 @@ def run_query(
'home': 0.04009247,
'cousins': 0.044702888}}
- If you want to use cosine distance instead of euclidean norm
+ If you want to use cosine distance instead of Euclidean norm
use the distance parameter as 'cos' before executing the query.
- >>> RND().run_query(query, model, normalize=True, distance='cos') # doctest: +SKIP
+ >>> RND().run_query(query, model, normalize=True, distance='cos')
{'query_name': 'Female terms and Male Terms wrt Family',
'result': 0.03643466345965862,
'rnd': 0.03643466345965862,
- 'distances_by_word': {'cousins': -0.035989374,
+ 'distance_by_word': {'cousins': -0.035989374,
'home': -0.026971221,
'family': -0.009296179,
'relatives': 0.015690982,
@@ -281,7 +280,7 @@ def run_query(
"query_name": query.query_name,
"result": np.nan,
"rnd": np.nan,
- "distances_by_word": {},
+ "distance_by_word": {},
}
# get the targets and attribute sets transformed into embeddings.
@@ -298,7 +297,7 @@ def run_query(
# get a list with the transformed attribute words
attribute_0_words = list(attribute_embeddings[0].keys())
- rnd, distances_by_word = self.__calc_rnd(
+ rnd, distance_by_word = self.__calc_rnd(
target_0_embeddings,
target_1_embeddings,
attribute_0_embeddings,
@@ -310,5 +309,5 @@ def run_query(
"query_name": query.query_name,
"result": rnd,
"rnd": rnd,
- "distances_by_word": distances_by_word,
+ "distance_by_word": distance_by_word,
}
diff --git a/wefe/metrics/RNSB.py b/wefe/metrics/RNSB.py
index e868a26..023e80b 100644
--- a/wefe/metrics/RNSB.py
+++ b/wefe/metrics/RNSB.py
@@ -9,7 +9,6 @@
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
-
from wefe.metrics.base_metric import BaseMetric
from wefe.preprocessing import get_embeddings_from_query
from wefe.query import Query
@@ -17,19 +16,20 @@
class RNSB(BaseMetric):
- """Relative Relative Negative Sentiment Bias (RNSB).
+ r"""Relative Negative Sentiment Bias (RNSB).
The metric was originally proposed in "A transparent framework for evaluating
unintended demographic bias in word embeddings" [1].
This metric is based on measuring bias through word sentiment.
- The main idea is that if there were no bias, all words should be equally negative.
+ The main idea is that if there were no bias, all words should be equally negative.
Therefore, its procedure is based on calculating how negative the words in
the target sets are.
For this purpose, RNSB trains a classifier that assigns a probability to each
word of belonging to the negative class (in the original work the classifier is
- trained using `Bing Liu's lexicon `_
+ trained using
+ :func:`~wefe.datasets.load_bingliu`
of positive and negative words).
Then, it generates a probability distribution with the probabilities calculated in
the previous step and compares them to the uniform distribution
@@ -43,14 +43,14 @@ class RNSB(BaseMetric):
in the original RNSB work.
RNSB receives as input queries with two attribute sets :math:`A_1` and
- :math:`A_2` and two or more target sets. Thus has a template (tuple of numbers that
+ :math:`A_2` and two or more target sets, thus has a template (tuple of numbers that
defines the allowed target and attribute sets in the query)
- of the form :math:`s=(N,2)` with :math:`N\\geq 2`.
+ of the form :math:`s=(N,2)` with :math:`N\geq 2`.
- Given a query :math:`Q=(\\{T_1,T_2,\\ldots,T_n\\},\\{A_1,A_2\\})` RNSB is
+ Given a query :math:`Q=(\{T_1,T_2,\ldots,T_n\},\{A_1,A_2\})` RNSB is
calculated under the following steps:
- 1. First constructs a binary classifier :math:`C_{(A_1,A_2)}(\\cdot)` using
+ 1. First constructs a binary classifier :math:`C_{(A_1,A_2)}(\cdot)` using
set :math:`A_1` as training examples for the negative class, and :math:`A_2` as
training examples for the positive class.
@@ -59,17 +59,20 @@ class RNSB(BaseMetric):
association of :math:`w` with respect to :math:`A_2` (value
:math:`1-C_{(A_1,A_2)}(w)` is the degree of association with :math:`A_1`).
- 3. Then, the metric construct a probability distribution :math:`P(\\cdot)` over all
- the words :math:`w` in :math:`T_1\\cup \\cdots \\cup T_n`, by computing
- :math:`C_{(A_1,A_2)}(w)` and normalizing it to ensure that :math:`\\sum_w P(w)=1`.
+ 3. Then, the metric constructs a probability distribution :math:`P(\cdot)` over all
+ the words :math:`w` in :math:`T_1\cup \cdots \cup T_n`, by computing
+ :math:`C_{(A_1,A_2)}(w)` and normalizing it to ensure that
+ :math:`\sum_w P(w)=1`.
- 4. Finally RNSB is calculated as the distance between :math:`P(\\cdot)` and
- the uniform distribution :math:`Y(\\cdot)` using the KL-divergence.
+ 4. Finally RNSB is calculated as the distance between :math:`P(\cdot)` and
+ the uniform distribution :math:`Y(\cdot)` using the KL-divergence.
- The main idea behind RNSB is that the more that :math:`P(\\cdot)` resembles a
+ The main idea behind RNSB is that the more that :math:`P(\cdot)` resembles a
uniform distribution, the less biased the word embedding model is.
Thus, the optimal value is 0.
+ You can see the full paper replication in the Previous Studies Replication section.
+
References
----------
| [1]: Chris Sweeney and Maryam Najafian. A transparent framework for evaluating
@@ -88,7 +91,7 @@ def _train_classifier(
self,
attribute_embeddings_dict: List[Dict[str, np.ndarray]],
estimator: BaseEstimator = LogisticRegression,
- estimator_params: Dict[str, Any] = {"solver": "liblinear", "max_iter": 10000},
+ estimator_params: Dict[str, Any] = {"max_iter": 10000},
random_state: Union[int, None] = None,
holdout: bool = True,
print_model_evaluation: bool = False,
@@ -102,11 +105,10 @@ def _train_classifier(
estimator : BaseEstimator, optional
A scikit-learn classifier class that implements predict_proba function,
- by default None,
+ by default LogisticRegression,
estimator_params : dict, optional
- Parameters that will use the classifier, by default { 'solver': 'liblinear',
- 'max_iter': 10000, }
+ Parameters that will use the classifier, by default { 'max_iter': 10000, }
random_state : Union[int, None], optional
A seed that allows making the execution of the query reproducible, by
@@ -157,15 +159,15 @@ def _train_classifier(
if num_train_positive_examples == 1:
raise Exception(
"After splitting the dataset using train_test_split "
- "(with test_size=0.1), the first attribute remained with 0 training "
- "examples."
+ "(with test_size=0.1), the first attribute remained with 0 "
+ "training examples."
)
if num_train_negative_examples < 1:
raise Exception(
"After splitting the dataset using train_test_split "
- "(with test_size=0.1), the second attribute remained with 0 training "
- "examples."
+ "(with test_size=0.1), the second attribute remained with 0 "
+ "training examples."
)
estimator = estimator(**estimator_params)
@@ -310,24 +312,27 @@ def run_query(
n_iterations : int, optional
When provided, it tells the metric to run the specified number of times
and then average its results. This functionality is indicated to
- strengthen the results obtained, by default 1.
+ strengthen the results obtained.
+ Note that random_state cannot be specified together with n_iterations > 1,
+ as every iteration would then produce the same result,
+ by default 1.
random_state : Union[int, None], optional
- Seed that allow making the execution of the query reproducible.
+ Seed that allows making the execution of the query reproducible.
Warning: if a random_state other than None is provided along with
n_iterations, each iteration will split the dataset and train a
classifier associated to the same seed, so the results of each iteration
- will always be the same , by default None.
+ will always be the same, by default None.
holdout: bool, optional
True indicates that a holdout (split attributes in train/test sets) will
be executed before running the model training.
- This option allows to evaluate the performance of the classifier
+ This option allows for evaluating the performance of the classifier
(can be printed using print_model_evaluation=True) at the cost of training
the classifier with fewer examples. False disables this functionality.
- Note that holdout divides into 80%train and 20% test, performs a shuffle
- and tries to maintain the original ratio of the classes via stratify param.
- by default True
+ Note that holdout divides into 80% train and 20% test, performs a shuffle
+ and tries to maintain the original ratio of the classes via stratify param,
+ by default True.
print_model_evaluation : bool, optional
Indicates whether the classifier evaluation is printed after the
@@ -354,7 +359,7 @@ def run_query(
titlecase.
* ``strip_accents``: ``bool``, ``{'ascii', 'unicode'}``: Specifies that
the accents of the words are eliminated. The stripping type can be
- specified. True uses ‘unicode’ by default.
+ specified. True uses 'unicode' by default.
* ``preprocessor``: ``Callable``. It receives a function that operates
on each word. In the case of specifying a function, it overrides the
default preprocessor (i.e., the previous options stop working).
@@ -362,14 +367,14 @@ def run_query(
A list of preprocessor options allows you to search for several
variants of the words into the model. For example, the preprocessors
``[{}, {"lowercase": True, "strip_accents": True}]``
- ``{}`` allows first to search for the original words in the vocabulary of
+ ``{}`` allows searching first for the original words in the vocabulary of
the model. In case some of them are not found,
``{"lowercase": True, "strip_accents": True}`` is executed on these words
and then they are searched in the model vocabulary.
strategy : str, optional
The strategy indicates how it will use the preprocessed words: 'first' will
- include only the first transformed word found. all' will include all
+ include only the first transformed word found. 'all' will include all
transformed words found, by default "first".
normalize : bool, optional
@@ -389,7 +394,10 @@ def run_query(
Examples
--------
The following example shows how to run a query that measures gender
- bias using RNSB:
+ bias using RNSB.
+ Note that by default the RNSB score is returned plus the negative class
+ probabilities for each word and its distribution (the above probabilities
+ normalized to 1).
>>> from wefe.query import Query
>>> from wefe.utils import load_test_model
@@ -416,58 +424,58 @@ def run_query(
>>> model = load_test_model()
>>>
>>> # instance the metric and run the query
- >>> RNSB().run_query(query, model) # doctest: +SKIP
+ >>> RNSB().run_query(query, model)
{
- 'query_name': 'Female terms and Male Terms wrt Family and Careers',
- 'result': 0.02899395368025491,
- 'rnsb': 0.02899395368025491,
- 'negative_sentiment_probabilities': {
- 'female': 0.43272977959940667,
- 'woman': 0.6951544646603257,
- 'girl': 0.8141335128074891,
- 'sister': 0.8472896023561901,
- 'she': 0.5718048693637721,
- 'her': 0.5977365245684795,
- 'hers': 0.6939932357393684,
- 'daughter': 0.8887895021296551,
- 'male': 0.5511334216620132,
- 'man': 0.584603563015763,
- 'boy': 0.8129431089763982,
- 'brother': 0.8331301278277582,
- 'he': 0.4420145415672582,
- 'him': 0.5139776652415698,
- 'his': 0.44459083129125154,
- 'son': 0.8483699001061482
+ "query_name": "Female terms and Male Terms wrt Family and Careers",
+ "result": 0.10769060995617141,
+ "rnsb": 0.10769060995617141,
+ "negative_sentiment_probabilities": {
+ "female": 0.5742192708509877,
+ "woman": 0.32330898567978306,
+ "girl": 0.17573260129841273,
+ "sister": 0.15229835340332343,
+ "she": 0.3761328719677399,
+ "her": 0.35739995104539557,
+ "hers": 0.2911542159275662,
+ "daughter": 0.11714195753410628,
+ "male": 0.4550779245077232,
+ "man": 0.39826729589696475,
+ "boy": 0.17445392462199483,
+ "brother": 0.16517694979156405,
+ "he": 0.5044892468050808,
+ "him": 0.45426103796811057,
+ "his": 0.5013980699813614,
+ "son": 0.1509265229834842,
+ },
+ "negative_sentiment_distribution": {
+ "female": 0.11103664779476699,
+ "woman": 0.0625181838962096,
+ "girl": 0.033981372529543176,
+ "sister": 0.02944989742595416,
+ "she": 0.07273272658861042,
+ "her": 0.06911034599602082,
+ "hers": 0.0563004234950174,
+ "daughter": 0.022651713275710483,
+ "male": 0.08799831316676666,
+ "man": 0.07701285503210042,
+ "boy": 0.033734115115920706,
+ "brother": 0.031940228635376634,
+ "he": 0.09755296914839981,
+ "him": 0.08784035200525282,
+ "his": 0.09695522899987082,
+ "son": 0.029184626894479145,
},
- 'negative_sentiment_distribution': {
- 'female': 0.04093015763103808,
- 'woman': 0.06575184597373163,
- 'girl': 0.07700559236475293,
- 'sister': 0.08014169261861909,
- 'she': 0.05408470722518866,
- 'her': 0.05653747748783378,
- 'hers': 0.0656420100321782,
- 'daughter': 0.0840670000956609,
- 'male': 0.052129478690471215,
- 'man': 0.055295283832909777,
- 'boy': 0.07689299688658582,
- 'brother': 0.07880240525790659,
- 'he': 0.04180836566946482,
- 'him': 0.04861506614276754,
- 'his': 0.04205204648247447,
- 'son': 0.0802438736084164
- }
}
If you want to perform a holdout to evaluate (defualt option) the model
- and print the evaluation, use the params `holdout=True` and
- `print_model_evaluation=True`
+ and print the evaluation, use the params ``holdout=True`` and
+ ``print_model_evaluation=True``
>>> RNSB().run_query(
... query,
... model,
... holdout=True,
- ... print_model_evaluation=True) # doctest: +SKIP
+ ... print_model_evaluation=True)
"Classification Report:"
" precision recall f1-score support"
" "
@@ -478,48 +486,49 @@ def run_query(
" macro avg 1.00 1.00 1.00 4"
"weighted avg 1.00 1.00 1.00 4"
{
- 'query_name': 'Female terms and Male Terms wrt Family and Careers',
- 'result': 0.028622532697549753,
- 'rnsb': 0.028622532697549753,
- 'negative_sentiment_probabilities': {
- 'female': 0.4253580834091863,
- 'woman': 0.7001106999668327,
- 'girl': 0.8332271657179001,
- 'sister': 0.8396986674252397,
- 'she': 0.603565156083575,
- 'her': 0.6155296658190583,
- 'hers': 0.7147102319731146,
- 'daughter': 0.884829695542309,
- 'male': 0.5368167185683463,
- 'man': 0.5884385611055519,
- 'boy': 0.8132056992854114,
- 'brother': 0.8270792128939456,
- 'he': 0.4500708786239489,
- 'him': 0.49965355723589994,
- 'his': 0.45394634194580535,
- 'son': 0.8450690196299462
+ "query_name": "Female terms and Male Terms wrt Family and Careers",
+ "result": 0.09400726375514418,
+ "rnsb": 0.09400726375514418,
+ "negative_sentiment_probabilities": {
+ "female": 0.5583010801302075,
+ "woman": 0.3159147912504866,
+ "girl": 0.20753840501109977,
+ "sister": 0.16020059726421976,
+ "she": 0.4266765171984158,
+ "her": 0.4066467259229203,
+ "hers": 0.32435655424005905,
+ "daughter": 0.13318012193912765,
+ "male": 0.44129601598998147,
+ "man": 0.42681869843678866,
+ "boy": 0.21830517614567535,
+ "brother": 0.2037443178553553,
+ "he": 0.5655603842644314,
+ "him": 0.512466010254818,
+ "his": 0.5689713390373838,
+ "son": 0.18364286185769785,
},
- 'negative_sentiment_distribution': {
- 'female': 0.04000994319670431,
- 'woman': 0.0658536664275202,
- 'girl': 0.07837483962483958,
- 'sister': 0.07898356066672689,
- 'she': 0.05677241964432896,
- 'her': 0.057897822860029945,
- 'hers': 0.06722692455767754,
- 'daughter': 0.08322866600691568,
- 'male': 0.05049394205657851,
- 'man': 0.055349585027011844,
- 'boy': 0.07649158463116877,
- 'brother': 0.07779655217044128,
- 'he': 0.04233447297841125,
- 'him': 0.04699830853762932,
- 'his': 0.04269900599992016,
- 'son': 0.07948870561409564
- }
- }
+ "negative_sentiment_distribution": {
+ "female": 0.09875108690481095,
+ "woman": 0.05587832464521873,
+ "girl": 0.0367089439708001,
+ "sister": 0.02833593497428311,
+ "she": 0.07546961904547295,
+ "her": 0.07192679290859644,
+ "hers": 0.05737148541506475,
+ "daughter": 0.023556611770367462,
+ "male": 0.0780554843555203,
+ "man": 0.07549476775523993,
+ "boy": 0.038613347150078775,
+ "brother": 0.03603785404499525,
+ "he": 0.1000350969111323,
+ "him": 0.09064387893111858,
+ "his": 0.10063841921015712,
+ "son": 0.032482352007143285,
+ },
+ }
+
- If you want to disable the holdout, use the param `holdout=False`.
+ If you want to disable the holdout, use the param ``holdout=False``.
>>> # instance the metric and run the query
>>> RNSB().run_query(
@@ -529,50 +538,249 @@ def run_query(
... print_model_evaluation=True) # doctest: +SKIP
"Holdout is disabled. No evaluation was performed."
{
- 'query_name': 'Female terms and Male Terms wrt Family and Careers',
- 'result': 0.03171747070323668,
- 'rnsb': 0.03171747070323668,
- 'negative_sentiment_probabilities': {
- 'female': 0.41846552820545985,
- 'woman': 0.7104860753714863,
- 'girl': 0.8325507470146775,
- 'sister': 0.8634309153859019,
- 'she': 0.593223646607777,
- 'her': 0.6138756234516175,
- 'hers': 0.7205687956033292,
- 'daughter': 0.8964129106245865,
- 'male': 0.545075356696542,
- 'man': 0.5856674025396198,
- 'boy': 0.8184955986780176,
- 'brother': 0.8392921127806534,
- 'he': 0.43437306199747594,
- 'him': 0.4974336520424158,
- 'his': 0.4342254305877148,
- 'son': 0.851969666735826
+ "query_name": "Female terms and Male Terms wrt Family and Careers",
+ "result": 0.12921977967420623,
+ "rnsb": 0.12921977967420623,
+ "negative_sentiment_probabilities": {
+ "female": 0.5815344717945401,
+ "woman": 0.28951392462851366,
+ "girl": 0.16744925298532254,
+ "sister": 0.1365690846140981,
+ "she": 0.40677635339222296,
+ "her": 0.3861243765483825,
+ "hers": 0.2794312043966708,
+ "daughter": 0.1035870893754135,
+ "male": 0.45492464330345805,
+ "man": 0.4143325974603802,
+ "boy": 0.18150440132198242,
+ "brother": 0.1607078872193466,
+ "he": 0.5656269380025241,
+ "him": 0.5025663479575841,
+ "his": 0.5657745694122852,
+ "son": 0.14803033326417403,
+ },
+ "negative_sentiment_distribution": {
+ "female": 0.10881083995606983,
+ "woman": 0.05417091306830872,
+ "girl": 0.03133140811261611,
+ "sister": 0.025553423794525788,
+ "she": 0.0761118709786697,
+ "her": 0.07224768225706711,
+ "hers": 0.05228433658715302,
+ "daughter": 0.019382166922557734,
+ "male": 0.08512089128923325,
+ "man": 0.07752571883094242,
+ "boy": 0.03396126510372403,
+ "brother": 0.030070032034284336,
+ "he": 0.10583438336150637,
+ "him": 0.09403512449772648,
+ "his": 0.1058620066555313,
+ "son": 0.027697936550083888,
},
- 'negative_sentiment_distribution': {
- 'female': 0.03927208494188834,
- 'woman': 0.0666775818349327,
- 'girl': 0.07813308731881921,
- 'sister': 0.0810311243458957,
- 'she': 0.055672756461026464,
- 'her': 0.05761089983046311,
- 'hers': 0.06762382332604978,
- 'daughter': 0.08412641327954143,
- 'male': 0.05115414356760721,
- 'man': 0.05496361929467757,
- 'boy': 0.07681404203995185,
- 'brother': 0.07876574991858241,
- 'he': 0.04076497259018534,
- 'him': 0.04668307260513937,
- 'his': 0.04075111770161401,
- 'son': 0.07995551094362546
- }
}
+
+
+ Since each run of RNSB may give a different result due to the random
+ ``train_test_split`` and random initializations, RNSB can be requested to
+ run many times and return the average of all runs through the
+ parameter ``n_iterations``.
+ This makes it potentially more stable and robust to outlier runs.
+
+ >>> RNSB().run_query(query, model, n_iterations=1000)
+ {
+ "query_name": "Female terms and Male Terms wrt Family and Careers",
+ "result": 0.09649701346914233,
+ "rnsb": 0.09649701346914233,
+ "negative_sentiment_probabilities": {
+ "female": 0.5618993210534083,
+ "woman": 0.31188456697468364,
+ "girl": 0.1968846981458747,
+ "sister": 0.1666990161087616,
+ "she": 0.4120315698794307,
+ "her": 0.3956786125532543,
+ "hers": 0.3031550094192968,
+ "daughter": 0.13259627603249385,
+ "male": 0.45579890258209677,
+ "man": 0.4210218238530363,
+ "boy": 0.2104231329680286,
+ "brother": 0.18879207133574177,
+ "he": 0.5473770682025214,
+ "him": 0.4924664455586234,
+ "his": 0.5479209229372095,
+ "son": 0.1770764765027373,
+ },
+ "negative_sentiment_distribution": {
+ "female": 0.10176190651838826,
+ "woman": 0.056483371593163176,
+ "girl": 0.035656498409823205,
+ "sister": 0.030189767202716926,
+ "she": 0.07462033949087124,
+ "her": 0.07165876247453706,
+ "hers": 0.05490241858857015,
+ "daughter": 0.024013643264435475,
+ "male": 0.08254675451251275,
+ "man": 0.07624850551663455,
+ "boy": 0.03810835568595322,
+ "brother": 0.034190895761652934,
+ "he": 0.09913187640146649,
+ "him": 0.08918737310881396,
+ "his": 0.09923037037111067,
+ "son": 0.03206916109934998,
+ },
+ }
+
+ If you want the embeddings to be normalized before calculating the metrics
+ pass ``normalize=True`` when executing the query.
+
+ >>> RNSB().run_query(query, model, normalize=True)
+ {
+ "query_name": "Female terms and Male Terms wrt Family and Careers",
+ "result": 0.00957187793390364,
+ "rnsb": 0.00957187793390364,
+ "negative_sentiment_probabilities": {
+ "female": 0.5078372178028085,
+ "woman": 0.4334357574118245,
+ "girl": 0.3764103216054252,
+ "sister": 0.35256834229924383,
+ "she": 0.4454357087596428,
+ "her": 0.4390986149718311,
+ "hers": 0.41329577968574494,
+ "daughter": 0.33427930165282493,
+ "male": 0.470250420503012,
+ "man": 0.4577545228416623,
+ "boy": 0.3698438702135818,
+ "brother": 0.35380575403374315,
+ "he": 0.49962008627445753,
+ "him": 0.47052126448152776,
+ "his": 0.49505591114138436,
+ "son": 0.34192683607526553,
+ },
+ "negative_sentiment_distribution": {
+ "female": 0.07511118533317328,
+ "woman": 0.06410690741777263,
+ "girl": 0.05567261405091151,
+ "sister": 0.05214628855999085,
+ "she": 0.06588174891831232,
+ "her": 0.0649444670309599,
+ "hers": 0.061128122983393235,
+ "daughter": 0.04944126523085684,
+ "male": 0.0695519454840733,
+ "man": 0.06770375151119586,
+ "boy": 0.054701409243182085,
+ "brother": 0.05232930677698083,
+ "he": 0.07389583823474007,
+ "him": 0.06959200440758956,
+ "his": 0.07322077821098569,
+ "son": 0.050572366605882095,
+ },
+ }
+
+ RNSB accepts more than 2 sets of target words.
+ This example shows how to measure words representing different nationalities
+ with respect to positive and negative words.
+
+ Note this is one of the tests that was proposed in the RNSB paper. You can
+ see the full paper replication in Previous Studies Replication.
+
+ >>> import gensim.downloader as api
+ >>>
+ >>> from wefe.word_embedding_model import WordEmbeddingModel
+ >>> from wefe.query import Query
+ >>> from wefe.datasets import load_bingliu
+ >>> from wefe.metrics import RNSB
+ >>>
+ >>> # Load the model
+ >>> model = WordEmbeddingModel(
+ ... api.load('glove-wiki-gigaword-300'), 'Glove wiki'
+ ... )
+ >>>
+ >>> RNSB_words = [
+ ... ["swedish"],
+ ... ["irish"],
+ ... ["mexican"],
+ ... ["chinese"],
+ ... ["filipino"],
+ ... ["german"],
+ ... ["english"],
+ ... ["french"],
+ ... ["norwegian"],
+ ... ["american"],
+ ... ["indian"],
+ ... ["dutch"],
+ ... ["russian"],
+ ... ["scottish"],
+ ... ["italian"],
+ ... ]
+ >>>
+ >>> bing_liu = load_bingliu()
+ >>>
+ >>> # Create the query
+ >>> query = Query(
+ ... RNSB_words,
+ ... [bing_liu["positive_words"], bing_liu["negative_words"]],
+ ... attribute_sets_names=["Positive words", "Negative words"],
+ ... )
+ >>>
+ >>> results = RNSB().run_query(
+ ... query,
+ ... model,
+ ... preprocessors=[{"lowercase": True}],
+ ... holdout=True,
+ ... print_model_evaluation=True,
+ ... n_iterations=500,
+ ... )
+ >>> results
+ {
+ "query_name": "Target set 0, Target set 1, Target set 2, Target set 3, Target set 4, Target set 5, Target set 6, Target set 7, Target set 8, Target set 9, Target set 10, Target set 11, Target set 12, Target set 13 and Target set 14 wrt Positive words and Negative words",
+ "result": 0.6313118439654091,
+ "rnsb": 0.6313118439654091,
+ "negative_sentiment_probabilities": {
+ "swedish": 0.03865446713798508,
+ "irish": 0.12266387930214015,
+ "mexican": 0.5038405165657709,
+ "chinese": 0.01913990969357335,
+ "filipino": 0.08074140612507152,
+ "german": 0.0498762435972975,
+ "english": 0.058042779461913364,
+ "french": 0.08030917713203162,
+ "norwegian": 0.12177903128690087,
+ "american": 0.22908203952254658,
+ "indian": 0.7836948288757486,
+ "dutch": 0.22748838866881654,
+ "russian": 0.4877408793080844,
+ "scottish": 0.027805085889223837,
+ "italian": 0.007885923500742055,
+ },
+ "negative_sentiment_distribution": {
+ "swedish": 0.01361674725376778,
+ "irish": 0.04321060837966024,
+ "mexican": 0.17748709213331876,
+ "chinese": 0.006742385345191273,
+ "filipino": 0.02844264587050847,
+ "german": 0.017569824481278706,
+ "english": 0.020446636995867174,
+ "french": 0.028290385255119174,
+ "norwegian": 0.04289890438595362,
+ "american": 0.08069836330742804,
+ "indian": 0.27607092269031136,
+ "dutch": 0.08013697047258364,
+ "russian": 0.17181569869170976,
+ "scottish": 0.009794853090881381,
+ "italian": 0.0027779616464207076,
+ },
+ }
+
+
"""
# check the types of the provided arguments (only the defaults).
self._check_input(query, model, locals())
+ if n_iterations > 1 and random_state is not None:
+ raise ValueError(
+ "It is not possible to specify random_state together with n_iterations"
+ " > 1 since all iterations would produce the same results."
+ )
+
# transform query word sets into embeddings
embeddings = get_embeddings_from_query(
model=model,
diff --git a/wefe/metrics/WEAT.py b/wefe/metrics/WEAT.py
index ebbad53..e1f5153 100644
--- a/wefe/metrics/WEAT.py
+++ b/wefe/metrics/WEAT.py
@@ -5,7 +5,6 @@
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
-
from wefe.metrics.base_metric import BaseMetric
from wefe.preprocessing import get_embeddings_from_query
from wefe.query import Query
@@ -13,7 +12,7 @@
class WEAT(BaseMetric):
- """Word Embedding Association Test (WEAT).
+ r"""Word Embedding Association Test (WEAT).
The following description of the metric is WEFE's adaptation of what was presented
in the original WEAT work "Semantics derived automatically from language corpora
@@ -27,47 +26,47 @@ class WEAT(BaseMetric):
In formal terms, let :math:`T_1` and :math:`T_2` be two sets of target words of
equal size, and :math:`A_1`, :math:`A_2` the two sets of attribute words.
- Let :math:`\\cos(\\vec{a}, \\vec{b})` denote the cosine of the angle between the
- vectors :math:`\\vec{a}` and :math:`\\vec{b}`. The test statistic is:
+ Let :math:`\cos(\vec{a}, \vec{b})` denote the cosine of the angle between the
+ vectors :math:`\vec{a}` and :math:`\vec{b}`. The test statistic is:
.. math::
- \\text{WEAT}(T_1,T_2,A_1,A_2) = \\sum_{x \\in T_1} s(x, A_1, A_2) -
- \\sum_{y \\in T_2} s(y, A_1, A_2)
+ \text{WEAT}(T_1,T_2,A_1,A_2) = \sum_{x \in T_1} s(x, A_1, A_2) -
+ \sum_{y \in T_2} s(y, A_1, A_2)
where
.. math::
- s(w, A, B)=\\text{mean}_{a \\in A} \\cos(\\vec{w}, \\vec{a}) -
- \\text{mean}_{b \\in B} \\cos(\\vec{w},\\vec{b})
+ s(w, A, B)=\text{mean}_{a \in A} \cos(\vec{w}, \vec{a}) -
+ \text{mean}_{b \in B} \cos(\vec{w},\vec{b})
:math:`s(w,A,B)` measures the association of :math:`w` with the
- attributes, and :math:`\\text{WEAT}(T_1,T_2,A_1,A_2)` measures the differential
+ attributes, and :math:`\text{WEAT}(T_1,T_2,A_1,A_2)` measures the differential
association of the two sets of target words with the attribute.
This metric also contains a variant: WEAT Effect Size (WEAT-ES). This variant
represents a normalized measure that quantifies how far apart the two distributions
- of association between targets and attributes are. Iin practical terms, WEAT
+ of association between targets and attributes are. In practical terms, WEAT
Effect Size makes the metric not dependent on the number of words used in each set.
.. math::
- \\text{WEAT-ES}(T_1,T_2,A_1,A_2) = \\frac{\\text{mean}_{x \\in T_1}\\,
- s(x, A_1, A_2) - \\text{mean}_{y \\in T_2}\\, s(y, A_1, A_2) }
- {\\text{std-dev}_{w \\in T_1 \\cup T_2}\\, s(w, A_1, A_2)}
+ \text{WEAT-ES}(T_1,T_2,A_1,A_2) = \frac{\text{mean}_{x \in T_1}\,
+ s(x, A_1, A_2) - \text{mean}_{y \in T_2}\, s(y, A_1, A_2) }
+ {\text{std-dev}_{w \in T_1 \cup T_2}\, s(w, A_1, A_2)}
The permutation test measures the (un)likelihood of the null hypothesis by
computing the probability that a random permutation of the attribute words would
produce the observed (or greater) difference in sample mean.
Let :math:`{(T_{1_i},T_{2_i})}_{i}` denote all the partitions of
- :math:`T_1 \\cup T_2` into two sets of equal size. The one-sided p-value of the
+ :math:`T_1 \cup T_2` into two sets of equal size. The one-sided p-value of the
permutation test is:
.. math::
- \\text{Pr}_{i}[s(T_{1_i}, T_{2_i}, A_1, A_2) > s(T_1, T_2, A_1, A_2)]
+ \text{Pr}_{i}[s(T_{1_i}, T_{2_i}, A_1, A_2) > s(T_1, T_2, A_1, A_2)]
References
----------
@@ -214,8 +213,8 @@ def test_function(calculated, original):
if verbose:
logging.info(
- f"Number of runs: {runs}, Permutations that pass the test function type:"
- f"{count_pass_function}, p-value: {p_value}"
+ f"Number of runs: {runs}, Permutations that pass the test function "
+ f"type: {count_pass_function}, p-value: {p_value}"
)
return p_value
@@ -257,12 +256,12 @@ def run_query(
p_value_test_type : {'left-sided', 'right-sided', 'two-sided}, optional
When calculating the p-value, specify the type of test to be performed.
- The options are 'left-sided', 'right-sided' and 'two-sided
+ The options are 'left-sided', 'right-sided' and 'two-sided'
, by default 'right-sided'
p_value_method : {'exact', 'approximate'}, optional
When calculating the p-value, specify the method for calculating the
- p-value. This can be 'exact 'and 'approximate'.
+ p-value. This can be 'exact' or 'approximate'.
by default 'approximate'.
p_value_iterations : int, optional
@@ -272,7 +271,7 @@ def run_query(
p_value_verbose : bool, optional
In case of calculating the p-value, specify if notification messages
- will be logged during its calculation., by default False.
+ will be logged during its calculation, by default False.
lost_vocabulary_threshold : float, optional
Specifies the proportional limit of words that any set of the query is
@@ -301,7 +300,7 @@ def run_query(
titlecase.
* ``strip_accents``: ``bool``, ``{'ascii', 'unicode'}``: Specifies that
the accents of the words are eliminated. The stripping type can be
- specified. True uses ‘unicode’ by default.
+ specified. True uses 'unicode' by default.
* ``preprocessor``: ``Callable``. It receives a function that operates
on each word. In the case of specifying a function, it overrides the
default preprocessor (i.e., the previous options stop working).
@@ -309,14 +308,14 @@ def run_query(
A list of preprocessor options allows you to search for several
variants of the words into the model. For example, the preprocessors
``[{}, {"lowercase": True, "strip_accents": True}]``
- ``{}`` allows first to search for the original words in the vocabulary of
+ ``{}`` allows searching first for the original words in the vocabulary of
the model. In case some of them are not found,
``{"lowercase": True, "strip_accents": True}`` is executed on these words
and then they are searched in the model vocabulary.
strategy : str, optional
The strategy indicates how it will use the preprocessed words: 'first' will
- include only the first transformed word found. all' will include all
+ include only the first transformed word found. 'all' will include all
transformed words found, by default "first".
normalize : bool, optional
@@ -364,19 +363,17 @@ def run_query(
>>> model = load_test_model()
>>>
>>> # instance the metric and run the query
- >>> WEAT().run_query(query, model) # doctest: +SKIP
+ >>> WEAT().run_query(query, model)
{'query_name': 'Female terms and Male Terms wrt Family and Career',
'result': 0.4634388245467562,
'weat': 0.4634388245467562,
'effect_size': 0.45076532408312986,
'p_value': nan}
- >>>
- >>>
If you want to return the effect size as result value, use
`return_effect_size` parameter as `True` while running the query.
- >>> WEAT().run_query(query, model, return_effect_size=True) # doctest: +SKIP
+ >>> WEAT().run_query(query, model, return_effect_size=True)
{'query_name': 'Female terms and Male Terms wrt Family and Career',
'result': 0.45076532408312986,
'weat': 0.4634388245467562,
@@ -386,7 +383,7 @@ def run_query(
If you want the embeddings to be normalized before calculating the metrics
use the `normalize` parameter as `True` before executing the query.
- >>> WEAT().run_query(query, model, normalize=True) # doctest: +SKIP
+ >>> WEAT().run_query(query, model, normalize=True)
{'query_name': 'Female terms and Male Terms wrt Family and Career',
'result': 0.4634388248814503,
'weat': 0.4634388248814503,
@@ -397,8 +394,8 @@ def run_query(
the permutation test and return its p-value. The argument
`p_value_method='approximate'` indicates that the calculation of the
permutation test will be approximate, i.e., not all possible permutations
- will be generated. Instead, random permutations of the attributes to test
will be generated.
+ Instead, random permutations of the attributes to test will be generated.
On the other hand, the argument `p_value_iterations`
indicates the number of permutations that will be generated and tested.
@@ -408,7 +405,7 @@ def run_query(
... calculate_p_value=True,
... p_value_method="approximate",
... p_value_iterations=10000,
- ... ) # doctest: +SKIP
+ ... )
{
'query_name': 'Female terms and Male Terms wrt Family and Career',
'result': 0.46343879750929773,
diff --git a/wefe/metrics/__init__.py b/wefe/metrics/__init__.py
index 5e88161..5f03686 100644
--- a/wefe/metrics/__init__.py
+++ b/wefe/metrics/__init__.py
@@ -1,3 +1,4 @@
+# flake8: noqa
from wefe.metrics.ECT import ECT
from wefe.metrics.MAC import MAC
from wefe.metrics.RIPA import RIPA
diff --git a/wefe/metrics/example_metric.py b/wefe/metrics/example_metric.py
index dd469ab..a74f1f7 100644
--- a/wefe/metrics/example_metric.py
+++ b/wefe/metrics/example_metric.py
@@ -4,7 +4,6 @@
import numpy as np
from scipy.spatial import distance
-
from wefe.metrics.base_metric import BaseMetric
from wefe.preprocessing import get_embeddings_from_query
from wefe.query import Query
@@ -33,10 +32,12 @@ def _calc_metric(
----------
target_embeddings : List[EmbeddingDict]
An array with EmbeddingDict. Each dictionary represents an target set.
- A dict is composed with a word and its embedding as key, value respectively.
+ A dict is composed of a word and its embedding as key and value,
+ respectively.
attribute_embeddings : List[EmbeddingDict]
- [An array with dicts. Each dictionary represents an attribute set.
- A dict is composed with a word and its embedding as key, value respectively.
+ An array with dicts. Each dictionary represents an attribute set.
+ A dict is composed of a word and its embedding as key and value,
+ respectively.
Returns
-------
@@ -86,7 +87,8 @@ def run_query(
Parameters
----------
query : Query
- A Query object that contains the target and attribute word sets to be tested.
+ A Query object that contains the target and attribute word sets to be
+ tested.
model : WordEmbeddingModel
An object containing a word embeddings model.
@@ -111,8 +113,8 @@ def run_query(
A list of these preprocessor options will allow you to search for several
variants of the words (depending on the search strategy) into the model.
For example `[{}, {'lowecase': True, 'strip_accents': True}]` will allow you
- to search for each word first without any transformation and then transformed
- to lowercase and without accents.
+ to search for each word first without any transformation and then
+ transformed to lowercase and without accents.
The available word preprocessing options are as follows (it is not necessary
to put them all):
@@ -120,9 +122,9 @@ def run_query(
- `lowercase`: `bool`. Indicates if the words are transformed to lowercase.
- `uppercase`: `bool`. Indicates if the words are transformed to uppercase.
- `titlecase`: `bool`. Indicates if the words are transformed to titlecase.
- - `strip_accents`: `bool`, `{'ascii', 'unicode'}`: Specifies if the accents of
- the words are eliminated. The stripping type can be
- specified. True uses 'unicode' by default.
+ - `strip_accents`: `bool`, `{'ascii', 'unicode'}`: Specifies if the accents
+ of the words are eliminated. The stripping type can be
+ specified. True uses 'unicode' by default.
- `preprocessor`: `Callable`. It receives a function that operates on each
word. In the case of specifying a function, it overrides
the default preprocessor (i.e., the previous options
@@ -131,7 +133,7 @@ def run_query(
strategy : str, optional
The strategy indicates how it will use the preprocessed words: 'first' will
- include only the first transformed word found. all' will include all
+ include only the first transformed word found. 'all' will include all
transformed words found., by default "first"
normalize : bool, optional
diff --git a/wefe/preprocessing.py b/wefe/preprocessing.py
index 0fbeceb..82a2a10 100644
--- a/wefe/preprocessing.py
+++ b/wefe/preprocessing.py
@@ -3,8 +3,7 @@
from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union
import numpy as np
-from sklearn.feature_extraction.text import (strip_accents_ascii,
- strip_accents_unicode)
+from sklearn.feature_extraction.text import strip_accents_ascii, strip_accents_unicode
from wefe.query import Query
from wefe.word_embedding_model import WordEmbeddingModel
@@ -34,8 +33,8 @@ def preprocess_word(
- ```titlecase```: bool. Indicates if the words are transformed to titlecase.
- - ```strip_accents```: `bool`, `{'ascii', 'unicode'}`: Specifies if the accents of
- the words are eliminated. The stripping type can be
+ - ```strip_accents```: `bool`, `{'ascii', 'unicode'}`: Specifies if the
+ accents of the words are eliminated. The stripping type can be
specified. True uses 'unicode' by default.
- ```preprocessor```: Callable. It receives a function that operates on each
@@ -125,7 +124,7 @@ def get_embeddings_from_set(
titlecase.
* ``strip_accents``: ``bool``, ``{'ascii', 'unicode'}``: Specifies that
the accents of the words are eliminated. The stripping type can be
- specified. True uses ‘unicode’ by default.
+ specified. True uses 'unicode' by default.
* ``preprocessor``: ``Callable``. It receives a function that operates
on each word. In the case of specifying a function, it overrides the
default preprocessor (i.e., the previous options stop working).
@@ -133,7 +132,7 @@ def get_embeddings_from_set(
A list of preprocessor options allows you to search for several
variants of the words into the model. For example, the preprocessors
``[{}, {"lowercase": True, "strip_accents": True}]``
- ``{}`` allows first to search for the original words in the vocabulary of
+ ``{}`` allows searching first for the original words in the vocabulary of
the model. In case some of them are not found,
``{"lowercase": True, "strip_accents": True}`` is executed on these words
and then they are searched in the model vocabulary.
@@ -141,7 +140,7 @@ def get_embeddings_from_set(
strategy : str, optional
The strategy indicates how it will use the preprocessed words: 'first' will
- include only the first transformed word found. all' will include all
+ include only the first transformed word found. 'all' will include all
transformed words found, by default "first".
normalize : bool, optional
@@ -171,8 +170,8 @@ def get_embeddings_from_set(
if not isinstance(preprocessors, list):
raise TypeError(
- "preprocessors should be a list of dicts which contains preprocessor options"
- f", got {preprocessors}."
+ "preprocessors should be a list of dicts which contains preprocessor "
+ f"options, got {preprocessors}."
)
if len(preprocessors) == 0:
raise TypeError(
@@ -333,7 +332,7 @@ def get_embeddings_from_tuples(
titlecase.
* ``strip_accents``: ``bool``, ``{'ascii', 'unicode'}``: Specifies that
the accents of the words are eliminated. The stripping type can be
- specified. True uses ‘unicode’ by default.
+ specified. True uses 'unicode' by default.
* ``preprocessor``: ``Callable``. It receives a function that operates
on each word. In the case of specifying a function, it overrides the
default preprocessor (i.e., the previous options stop working).
@@ -341,7 +340,7 @@ def get_embeddings_from_tuples(
A list of preprocessor options allows you to search for several
variants of the words into the model. For example, the preprocessors
``[{}, {"lowercase": True, "strip_accents": True}]``
- ``{}`` allows first to search for the original words in the vocabulary of
+ ``{}`` allows searching first for the original words in the vocabulary of
the model. In case some of them are not found,
``{"lowercase": True, "strip_accents": True}`` is executed on these words
and then they are searched in the model vocabulary.
@@ -349,7 +348,7 @@ def get_embeddings_from_tuples(
strategy : str, optional
The strategy indicates how it will use the preprocessed words: 'first' will
- include only the first transformed word found. all' will include all
+ include only the first transformed word found. 'all' will include all
transformed words found, by default "first".
normalize : bool, optional
@@ -501,7 +500,7 @@ def get_embeddings_from_query(
titlecase.
* ``strip_accents``: ``bool``, ``{'ascii', 'unicode'}``: Specifies that
the accents of the words are eliminated. The stripping type can be
- specified. True uses ‘unicode’ by default.
+ specified. True uses 'unicode' by default.
* ``preprocessor``: ``Callable``. It receives a function that operates
on each word. In the case of specifying a function, it overrides the
default preprocessor (i.e., the previous options stop working).
@@ -509,7 +508,7 @@ def get_embeddings_from_query(
A list of preprocessor options allows you to search for several
variants of the words into the model. For example, the preprocessors
``[{}, {"lowercase": True, "strip_accents": True}]``
- ``{}`` allows first to search for the original words in the vocabulary of
+ ``{}`` allows searching first for the original words in the vocabulary of
the model. In case some of them are not found,
``{"lowercase": True, "strip_accents": True}`` is executed on these words
and then they are searched in the model vocabulary.
@@ -517,7 +516,7 @@ def get_embeddings_from_query(
strategy : str, optional
The strategy indicates how it will use the preprocessed words: 'first' will
- include only the first transformed word found. all' will include all
+ include only the first transformed word found. 'all' will include all
transformed words found, by default "first".
normalize : bool, optional
diff --git a/wefe/query.py b/wefe/query.py
index 0a65045..ecfdb2c 100644
--- a/wefe/query.py
+++ b/wefe/query.py
@@ -1,6 +1,5 @@
-import logging
from itertools import combinations
-from typing import Any, List, Union
+from typing import Any, Dict, List, Union
import numpy as np
@@ -201,6 +200,13 @@ def __eq__(self, other):
return True
def __repr__(self) -> str:
+ """Generates a repr that shows the name, target and attributes of the query.
+
+ Returns
+ -------
+ str
+ The generated representation.
+ """
try:
repr_ = (
" str:
+ ">"
)
return repr_
- except:
- return ""
+ except AttributeError:
+ # This can happen if any of the attributes (query_name, target_sets
+ # or attribute_sets) is not defined.
+ return ""
+
+ def dict(self) -> Dict[str, Any]:
+ """Generates a dictionary from the Query data.
+
+ This includes the target and attribute sets, as well as their names,
+ the query name generated from them and the query template.
+
+ Returns
+ -------
+ Dict[str, Any]
+ The dictionary generated with the query data.
+ """
+ return {
+ "target_sets": self.target_sets,
+ "attribute_sets": self.attribute_sets,
+ "target_sets_names": self.target_sets_names,
+ "attribute_sets_names": self.attribute_sets_names,
+ "query_name": self.query_name,
+ "template": self.template,
+ }
def get_subqueries(self, new_template: tuple) -> list:
"""Generate the subqueries from this query using the given template"""
diff --git a/wefe/utils.py b/wefe/utils.py
index 494d341..8f1531e 100644
--- a/wefe/utils.py
+++ b/wefe/utils.py
@@ -7,13 +7,14 @@
import copy
import logging
-from typing import Callable, Dict, List, Type, Union
+from typing import Callable, List, Type, Union
import numpy as np
import pandas as pd
import pkg_resources
import plotly.express as px
import plotly.graph_objects as go
+from gensim.models.keyedvectors import KeyedVectors
from sklearn.utils.validation import check_is_fitted as _check_is_fitted
from wefe.metrics.base_metric import BaseMetric
@@ -50,13 +51,13 @@ def check_is_fitted(estimator, attributes):
def generate_subqueries_from_queries_list(
- metric: Type[BaseMetric], queries: List[Query]
+ metric: Type[BaseMetric], queries: List[Query]
) -> List[Query]:
"""Generate a list of subqueries from queries.
Parameters
----------
- metric : Type[BaseMetric]
+ metric : Type[BaseMetric]
Some metric.
queries : List[Query]
A list with queries.
@@ -67,20 +68,20 @@ def generate_subqueries_from_queries_list(
A list with all the generated subqueries.
"""
# instance metric
- metric = metric()
+ metric_ = metric()
subqueries = []
for query_idx, query in enumerate(queries):
try:
- subqueries += query.get_subqueries(metric.metric_template)
+ subqueries += query.get_subqueries(metric_.metric_template)
except Exception as e:
logging.warning(
"Query in index {} ({}) can not be splitted in subqueries "
"with the {} metric template = {}. Exception: \n{}".format(
query_idx,
query.query_name,
- metric.metric_name,
- metric.metric_template,
+ metric_.metric_name,
+ metric_.metric_template,
e,
)
)
@@ -282,7 +283,8 @@ def run_queries(
index="model_name", columns="query_name", values="result"
)
pivoted_results = pivoted_results.reindex(
- index=[model.name for model in models], columns=query_names,
+ index=[model.name for model in models],
+ columns=query_names,
)
if aggregate_results:
@@ -386,7 +388,8 @@ def plot_queries_results(results: pd.DataFrame, by: str = "query"):
barmode="group",
)
fig.update_layout(
- xaxis_title=xaxis_title, yaxis_title="Bias measure",
+ xaxis_title=xaxis_title,
+ yaxis_title="Bias measure",
)
fig.for_each_trace(
lambda t: t.update(x=["wrt ".join(label.split("wrt")) for label in t.x])
@@ -426,7 +429,8 @@ def create_ranking(
- dense: like ‘min’, but rank always increases by 1 between groups.
ascending : bool, optional
- Whether or not the elements should be ranked in ascending order, by default True.
+ Whether or not the elements should be ranked in ascending order,
+ by default True.
Returns
-------
@@ -611,9 +615,6 @@ def save_doc_image(fig, name):
fig.write_image(f"./doc/images/{name}.png", width=1200, height=600, scale=3)
-from gensim.models.keyedvectors import KeyedVectors
-
-
def flair_to_gensim(flair_embedding):
# load model from flair
diff --git a/wefe/word_embedding_model.py b/wefe/word_embedding_model.py
index 2612cc8..2fd1a4e 100644
--- a/wefe/word_embedding_model.py
+++ b/wefe/word_embedding_model.py
@@ -168,6 +168,40 @@ def __contains__(self, key):
"""
return key in self.vocab
+ def __repr__(self) -> str:
+ try:
+ if self.name == "Unnamed model" and self.vocab_prefix is not None:
+ return (
+ ""
+ )
+
+ if self.name == "Unnamed model":
+ return (
+ ""
+ )
+
+ if self.vocab_prefix is not None:
+ return (
+ f""
+ )
+
+ return (
+ f""
+ )
+ except AttributeError:
+ # This can happen if any of the attributes (name or vocab_prefix) is not
+ # defined.
+ return ""
+
def normalize(self):
"""Normalize word embeddings in the model by using the L2 norm.
@@ -245,7 +279,9 @@ def update(self, word: str, embedding: np.ndarray):
self.wv.vectors[word_index] = embedding
def batch_update(
- self, words: Sequence[str], embeddings: Union[Sequence[np.ndarray], np.ndarray],
+ self,
+ words: Sequence[str],
+ embeddings: Union[Sequence[np.ndarray], np.ndarray],
):
"""Update a batch of embeddings.