Prepare v2.3.0 (#21)

* fix handling of NaN values in compress_memberships
* make black
* add publish workflow
Valires · Nov 29, 2023 · 282bb8b · 282bb8b
1 parent 691ee7e
commit 282bb8b
Show file tree

Hide file tree

Showing 29 changed files with 178 additions and 219 deletions.
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
@@ -0,0 +1,27 @@
+name: Publish Python package to PyPI
+
+on:
+  push:
+    tags:
+      - 'v*'  # Trigger the workflow on push tags like v1.0.0
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: '3.12'  # Use the version appropriate for your project
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install build twine
+    - name: Build and publish
+      env:
+        TWINE_USERNAME: __token__
+        TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
+      run: |
+        python -m build
+        twine upload dist/*
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -2,6 +2,11 @@
 Changelog
 =========
 
+2.3.0 (November 29, 2023)
+-------------------------
+
+* Fix handling of NaN values in `compress_memberships()`
+
 2.2.1 (November 8, 2023)
 ------------------------
 * Small fixes to paper and documentation.

diff --git a/docs/conf.py b/docs/conf.py
@@ -19,6 +19,7 @@
 #
 import os
 import sys
+
 import er_evaluation
 
 sys.path.insert(0, os.path.abspath(".."))

diff --git a/er_evaluation/__init__.py b/er_evaluation/__init__.py
@@ -1,21 +1,21 @@
-__version__ = "2.2.1"
+__version__ = "2.3.0"
 
 import er_evaluation.data_structures
 import er_evaluation.datasets
 import er_evaluation.error_analysis
 import er_evaluation.estimators
 import er_evaluation.metrics
 import er_evaluation.plots
-import er_evaluation.utils
 import er_evaluation.summary
+import er_evaluation.utils
 from er_evaluation.data_structures import *
 from er_evaluation.datasets import *
 from er_evaluation.error_analysis import *
 from er_evaluation.estimators import *
 from er_evaluation.metrics import *
 from er_evaluation.plots import *
-from er_evaluation.utils import *
 from er_evaluation.summary import *
+from er_evaluation.utils import *
 
 __all__ = (
     er_evaluation.data_structures.__all__

diff --git a/er_evaluation/data_structures/__init__.py b/er_evaluation/data_structures/__init__.py
@@ -56,25 +56,11 @@
         └─3─┘       5
 """
 from er_evaluation.data_structures._data_structures import (
-    MembershipVector,
-    compress_memberships,
-    clusters_to_graph,
-    clusters_to_membership,
-    clusters_to_pairs,
-    graph_to_clusters,
-    graph_to_membership,
-    graph_to_pairs,
-    isclusters,
-    isgraph,
-    ismembership,
-    ispairs,
-    membership_to_clusters,
-    membership_to_graph,
-    membership_to_pairs,
-    pairs_to_clusters,
-    pairs_to_graph,
-    pairs_to_membership,
-)
+    MembershipVector, clusters_to_graph, clusters_to_membership,
+    clusters_to_pairs, compress_memberships, graph_to_clusters,
+    graph_to_membership, graph_to_pairs, isclusters, isgraph, ismembership,
+    ispairs, membership_to_clusters, membership_to_graph, membership_to_pairs,
+    pairs_to_clusters, pairs_to_graph, pairs_to_membership)
 
 __all__ = [
     "compress_memberships",

diff --git a/er_evaluation/data_structures/_data_structures.py b/er_evaluation/data_structures/_data_structures.py
@@ -16,21 +16,21 @@ def compress_memberships(*memberships):
         List of Series with int codes for index and values. Index are compatible across the Series.
 
     Examples:
-        >>> membership = pd.Series(["c1", "c1", "c1", "c2", "c2", "c3"], index=[0,1,2,3,4,5])
+        >>> membership = pd.Series([None, "c1", "c1", "c2", "c2", "c3"], index=[0,1,2,3,4,5])
         >>> compressed, = compress_memberships(membership)
         >>> compressed
-        0    0
-        1    0
-        2    0
-        3    1
-        4    1
-        5    2
-        Name: 0, dtype: int8
+        0    NaN
+        1    0.0
+        2    0.0
+        3    1.0
+        4    1.0
+        5    2.0
+        Name: 0, dtype: float64
     """
     compressed = pd.concat(memberships, axis=1)
-    compressed.index = pd.Categorical(compressed.index).codes
     for col in compressed.columns:
-        compressed[col] = pd.Categorical(compressed[col]).codes
+        codes = pd.Categorical(compressed[col]).codes
+        compressed[col] = np.where(compressed[col].isna(), np.nan, codes)
 
     return [compressed[col] for col in compressed.columns]
 

diff --git a/er_evaluation/datasets/__init__.py b/er_evaluation/datasets/__init__.py
@@ -12,13 +12,12 @@
 The :py:meth:`load_rldata10000_disambiguations` and :py:meth:`load_rldata10000` return ground truth disambiguation, toy predicted disambiguations, and the full RLdata10000 dataframe.
 """
 
-from er_evaluation.datasets.patentsview import load_pv_data, load_pv_disambiguations
-from er_evaluation.datasets.rldata import (
-    load_rldata500,
-    load_rldata500_disambiguations,
-    load_rldata10000,
-    load_rldata10000_disambiguations,
-)
+from er_evaluation.datasets.patentsview import (load_pv_data,
+                                                load_pv_disambiguations)
+from er_evaluation.datasets.rldata import (load_rldata500,
+                                           load_rldata500_disambiguations,
+                                           load_rldata10000,
+                                           load_rldata10000_disambiguations)
 
 __all__ = [
     "load_pv_data",

diff --git a/er_evaluation/error_analysis/__init__.py b/er_evaluation/error_analysis/__init__.py
@@ -85,29 +85,15 @@
 The key advantage of working with the record error table is that it allows sensitivity analyses to be performed. Since all cluster error metrics and representative performance estimators can be computed directly from the record error table, uncertainty regarding error rates can be propagated from the record error table into cluster error metrics and into performance estimates.
 """
 from er_evaluation.error_analysis._cluster_error import (
-    count_extra,
-    count_missing,
-    error_indicator,
-    error_metrics,
-    expected_extra,
-    expected_missing,
-    expected_relative_extra,
-    expected_relative_missing,
-    expected_size_difference,
-    splitting_entropy,
-)
+    count_extra, count_missing, error_indicator, error_metrics, expected_extra,
+    expected_missing, expected_relative_extra, expected_relative_missing,
+    expected_size_difference, splitting_entropy)
 from er_evaluation.error_analysis._record_error import (
-    cluster_sizes_from_table,
-    error_indicator_from_table,
-    error_metrics_from_table,
-    expected_extra_from_table,
-    expected_missing_from_table,
-    expected_relative_extra_from_table,
-    expected_relative_missing_from_table,
-    expected_size_difference_from_table,
-    pred_cluster_sizes_from_table,
-    record_error_table,
-)
+    cluster_sizes_from_table, error_indicator_from_table,
+    error_metrics_from_table, expected_extra_from_table,
+    expected_missing_from_table, expected_relative_extra_from_table,
+    expected_relative_missing_from_table, expected_size_difference_from_table,
+    pred_cluster_sizes_from_table, record_error_table)
 from er_evaluation.error_analysis._subgroup_discovery import fit_dt_regressor
 
 __all__ = [

diff --git a/er_evaluation/error_analysis/_cluster_error.py b/er_evaluation/error_analysis/_cluster_error.py
@@ -4,12 +4,9 @@
 
 from er_evaluation.data_structures import MembershipVector
 from er_evaluation.error_analysis._record_error import (
-    error_metrics_from_table,
-    expected_relative_missing_from_table,
-    expected_size_difference_from_table,
-    record_error_table,
-    error_indicator_from_table,
-)
+    error_indicator_from_table, error_metrics_from_table,
+    expected_relative_missing_from_table, expected_size_difference_from_table,
+    record_error_table)
 from er_evaluation.utils import relevant_prediction_subset
 
 

diff --git a/er_evaluation/error_analysis/_subgroup_discovery.py b/er_evaluation/error_analysis/_subgroup_discovery.py
@@ -1,8 +1,8 @@
-from sklearn.tree import DecisionTreeRegressor
 from sklearn.compose import ColumnTransformer
-from sklearn.preprocessing import OneHotEncoder
-from sklearn.pipeline import Pipeline
 from sklearn.impute import SimpleImputer
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import OneHotEncoder
+from sklearn.tree import DecisionTreeRegressor
 
 
 def fit_dt_regressor(

diff --git a/er_evaluation/estimators/__init__.py b/er_evaluation/estimators/__init__.py
@@ -14,24 +14,18 @@
 
 **Note:** In order to obtain representative performance estimators, the set of predicted clusters given as an argument to estimator functions should cover the entire population of interest. Typically, this set of predicted clusters will be much larger than the set of sampled clusters.
 """
-from er_evaluation.estimators._estimators import (
-    b_cubed_precision_estimator,
-    b_cubed_recall_estimator,
-    cluster_f_estimator,
-    cluster_precision_estimator,
-    cluster_recall_estimator,
-    estimates_table,
-    pairwise_f_estimator,
-    pairwise_precision_estimator,
-    pairwise_recall_estimator,
-)
+from er_evaluation.estimators._estimators import (b_cubed_precision_estimator,
+                                                  b_cubed_recall_estimator,
+                                                  cluster_f_estimator,
+                                                  cluster_precision_estimator,
+                                                  cluster_recall_estimator,
+                                                  estimates_table,
+                                                  pairwise_f_estimator,
+                                                  pairwise_precision_estimator,
+                                                  pairwise_recall_estimator)
 from er_evaluation.estimators._summary_estimators import (
-    avg_cluster_size_estimator,
-    homonymy_rate_estimator,
-    matching_rate_estimator,
-    name_variation_estimator,
-    summary_estimates_table,
-)
+    avg_cluster_size_estimator, homonymy_rate_estimator,
+    matching_rate_estimator, name_variation_estimator, summary_estimates_table)
 
 __all__ = [
     "b_cubed_precision_estimator",

diff --git a/er_evaluation/estimators/_estimators.py b/er_evaluation/estimators/_estimators.py
@@ -3,23 +3,16 @@
 from scipy.special import comb
 
 from er_evaluation.data_structures import MembershipVector
-from er_evaluation.error_analysis import (
-    record_error_table,
-)
+from er_evaluation.error_analysis import record_error_table
+from er_evaluation.estimators._utils import (_parse_weights,
+                                             ratio_of_means_estimator,
+                                             validate_prediction_sample,
+                                             validate_weights)
 from er_evaluation.estimators.from_table import (
-    pairwise_f_estimator_from_table,
-    cluster_precision_estimator_from_table,
-    cluster_recall_estimator_from_table,
-    cluster_f_estimator_from_table,
     b_cubed_precision_estimator_from_table,
-    b_cubed_recall_estimator_from_table,
-)
-from er_evaluation.estimators._utils import (
-    validate_prediction_sample,
-    _parse_weights,
-    validate_weights,
-    ratio_of_means_estimator,
-)
+    b_cubed_recall_estimator_from_table, cluster_f_estimator_from_table,
+    cluster_precision_estimator_from_table,
+    cluster_recall_estimator_from_table, pairwise_f_estimator_from_table)
 from er_evaluation.utils import expand_grid
 
 

diff --git a/er_evaluation/estimators/_summary_estimators.py b/er_evaluation/estimators/_summary_estimators.py
@@ -1,13 +1,11 @@
 import pandas as pd
 
-from er_evaluation.estimators._utils import (
-    ratio_of_means_estimator,
-    validate_prediction_sample,
-    _parse_weights,
-    validate_weights,
-)
-from er_evaluation.summary import cluster_sizes
 from er_evaluation.data_structures import MembershipVector
+from er_evaluation.estimators._utils import (_parse_weights,
+                                             ratio_of_means_estimator,
+                                             validate_prediction_sample,
+                                             validate_weights)
+from er_evaluation.summary import cluster_sizes
 from er_evaluation.utils import expand_grid
 
 

diff --git a/er_evaluation/estimators/_utils.py b/er_evaluation/estimators/_utils.py
@@ -1,5 +1,6 @@
-import logging
 import functools
+import logging
+
 import numpy as np
 import pandas as pd
 

diff --git a/er_evaluation/estimators/from_table.py b/er_evaluation/estimators/from_table.py
@@ -1,11 +1,9 @@
-from er_evaluation.error_analysis import (
-    cluster_sizes_from_table,
-    error_indicator_from_table,
-    expected_missing_from_table,
-    expected_relative_extra_from_table,
-    expected_relative_missing_from_table,
-    expected_size_difference_from_table,
-)
+from er_evaluation.error_analysis import (cluster_sizes_from_table,
+                                          error_indicator_from_table,
+                                          expected_missing_from_table,
+                                          expected_relative_extra_from_table,
+                                          expected_relative_missing_from_table,
+                                          expected_size_difference_from_table)
 from er_evaluation.estimators._utils import ratio_of_means_estimator
 
 

diff --git a/er_evaluation/metrics/__init__.py b/er_evaluation/metrics/__init__.py
@@ -18,23 +18,14 @@
 - Records with NA cluster identifier in the reference or predicted clusterings are dropped.
 - The metrics in this module do not provide representative performance estimates. They are only useful for comparing two clusterings, such as a predicted clustering and a reference clustering. For representative performance estimates, see the :mod:`er_evaluation.estimators` module.
 """
-from er_evaluation.metrics._metrics import (
-    adjusted_rand_score,
-    b_cubed_f,
-    b_cubed_precision,
-    b_cubed_recall,
-    cluster_completeness,
-    cluster_f,
-    cluster_homogeneity,
-    cluster_precision,
-    cluster_recall,
-    cluster_v_measure,
-    metrics_table,
-    pairwise_f,
-    pairwise_precision,
-    pairwise_recall,
-    rand_score,
-)
+from er_evaluation.metrics._metrics import (adjusted_rand_score, b_cubed_f,
+                                            b_cubed_precision, b_cubed_recall,
+                                            cluster_completeness, cluster_f,
+                                            cluster_homogeneity,
+                                            cluster_precision, cluster_recall,
+                                            cluster_v_measure, metrics_table,
+                                            pairwise_f, pairwise_precision,
+                                            pairwise_recall, rand_score)
 
 __all__ = [
     "adjusted_rand_score",

diff --git a/er_evaluation/metrics/_metrics.py b/er_evaluation/metrics/_metrics.py
@@ -6,12 +6,10 @@
 from scipy.special import comb
 
 from er_evaluation.data_structures import MembershipVector
-from er_evaluation.error_analysis import (
-    error_indicator,
-    expected_relative_extra_from_table,
-    expected_relative_missing_from_table,
-    record_error_table,
-)
+from er_evaluation.error_analysis import (error_indicator,
+                                          expected_relative_extra_from_table,
+                                          expected_relative_missing_from_table,
+                                          record_error_table)
 from er_evaluation.summary import number_of_links
 from er_evaluation.utils import expand_grid