Merge pull request #220 from matthewwardrop/improve_contrast_prefixes

matthewwardrop · web-flow · commit 8230d6e47cd9 · 2024-11-28T22:30:00.000-08:00
diff --git a/formulaic/materializers/base.py b/formulaic/materializers/base.py
@@ -758,6 +758,7 @@ def wrapped(
             encoded = FactorValues(
                 encoded.copy(),
                 metadata=encoded.__formulaic_metadata__,  # type: ignore
+                reduced=True,
             )
             del encoded[encoded.__formulaic_metadata__.drop_field]
 
@@ -782,7 +783,7 @@ def _flatten_encoded_evaled_factor(
         # Some nested dictionaries may not be a `FactorValues[dict]` instance,
         # in which case we impute the default formatter in `FactorValues.format`.
         if hasattr(values, "__formulaic_metadata__"):
-            name_format = values.__formulaic_metadata__.format
+            name_format = values.__formulaic_metadata__.get_format()
         else:
             name_format = FactorValuesMetadata.format
 
diff --git a/formulaic/materializers/types/factor_values.py b/formulaic/materializers/types/factor_values.py
@@ -43,9 +43,6 @@ class FactorValuesMetadata:
 
     Attributes:
         kind: The kind of the evaluated values.
-        spans_intercept: Whether the values span the intercept or not.
-        drop_field: If the values do span the intercept, and we want to reduce
-            the rank, which field should be dropped.
         format: The format to use when exploding factors into multiple columns
             (e.g. when encoding categories via dummy-encoding).
         encoded: Whether the values should be treated as pre-encoded.
@@ -55,18 +52,37 @@ class FactorValuesMetadata:
             materializer. Note that this should only be used in cases where
             direct evaluation would yield different results in reduced vs.
             non-reduced rank scenarios.
+
+        Rank-Reduction Attributes:
+            spans_intercept: Whether the values span the intercept or not.
+            drop_field: If the values do span the intercept, and we want to reduce
+                the rank, which field should be dropped.
+            reduced: Whether the rank has already been reduced by dropping the
+                `drop_field` above.
+            format_reduced: The format to use when exploding factors (as above), but
+                in the case where the rank has been reduced by dropping a field.
+                (This defaults to `format`.)
     """
 
     kind: Factor.Kind = Factor.Kind.UNKNOWN
     column_names: Optional[Tuple[str]] = None
-    spans_intercept: bool = False
-    drop_field: Optional[str] = None
     format: str = "{name}[{field}]"
     encoded: bool = False
     encoder: Optional[
         Callable[[Any, bool, List[int], Dict[str, Any], ModelSpec], Any]
     ] = None
 
+    # Rank-Reduction Attributes
+    spans_intercept: bool = False
+    drop_field: Optional[str] = None
+    reduced: bool = False
+    format_reduced: Optional[str] = None
+
+    def get_format(self) -> str:
+        return (
+            self.format_reduced if self.reduced and self.format_reduced else self.format
+        )
+
     def replace(self, **kwargs: Any) -> FactorValuesMetadata:
         """
         Return a copy of this `FactorValuesMetadata` instance with the nominated
@@ -91,25 +107,29 @@ def __init__(
         *,
         kind: Union[str, Factor.Kind, _MissingType] = MISSING,
         column_names: Union[Tuple[Hashable, ...], _MissingType] = MISSING,
-        spans_intercept: Union[bool, _MissingType] = MISSING,
-        drop_field: Union[None, Hashable, _MissingType] = MISSING,
         format: Union[str, _MissingType] = MISSING,  # pylint: disable=redefined-builtin
         encoded: Union[bool, _MissingType] = MISSING,
         encoder: Union[
             None,
             Callable[[Any, bool, List[int], Dict[str, Any], ModelSpec], Any],
             _MissingType,
         ] = MISSING,
+        spans_intercept: Union[bool, _MissingType] = MISSING,
+        drop_field: Union[None, Hashable, _MissingType] = MISSING,
+        reduced: Union[bool, _MissingType] = MISSING,
+        format_reduced: Union[str, _MissingType] = MISSING,
     ):
         metadata_constructor: Callable = FactorValuesMetadata
         metadata_kwargs = dict(
             kind=Factor.Kind(kind) if kind is not MISSING else kind,
             column_names=column_names,
-            spans_intercept=spans_intercept,
-            drop_field=drop_field,
             format=format,
             encoded=encoded,
             encoder=encoder,
+            spans_intercept=spans_intercept,
+            drop_field=drop_field,
+            reduced=reduced,
+            format_reduced=format_reduced,
         )
         for key in set(metadata_kwargs):
             if metadata_kwargs[key] is MISSING:
diff --git a/formulaic/transforms/contrasts.py b/formulaic/transforms/contrasts.py
@@ -208,6 +208,7 @@ class Contrasts(metaclass=InterfaceMeta):
     INTERFACE_RAISE_ON_VIOLATION = True
 
     FACTOR_FORMAT = "{name}[{field}]"
+    FACTOR_FORMAT_REDUCED = "{name}[{field}]"
 
     def apply(
         self,
@@ -251,9 +252,11 @@ def apply(
         if not levels or len(levels) == 1 and reduced_rank:
             if output == "pandas":
                 encoded = pandas.DataFrame(
-                    index=dummies.index
-                    if isinstance(dummies, pandas.DataFrame)
-                    else range(dummies.shape[0])
+                    index=(
+                        dummies.index
+                        if isinstance(dummies, pandas.DataFrame)
+                        else range(dummies.shape[0])
+                    )
                 )
             elif output == "numpy":
                 encoded = numpy.ones((dummies.shape[0], 0))
@@ -269,6 +272,7 @@ def apply(
                 column_names=cast(Tuple[Hashable], ()),
                 spans_intercept=False,
                 format=self.get_factor_format(levels, reduced_rank=reduced_rank),
+                format_reduced=self.get_factor_format(levels, reduced_rank=True),
                 encoded=True,
             )
 
@@ -295,6 +299,7 @@ def apply(
             spans_intercept=self.get_spans_intercept(levels, reduced_rank=reduced_rank),
             drop_field=self.get_drop_field(levels, reduced_rank=reduced_rank),
             format=self.get_factor_format(levels, reduced_rank=reduced_rank),
+            format_reduced=self.get_factor_format(levels, reduced_rank=True),
             encoded=True,
         )
 
@@ -480,7 +485,7 @@ def get_factor_format(
             levels: The names of the levels/categories in the data.
             reduced_rank: Whether the contrast encoding used had reduced rank.
         """
-        return self.FACTOR_FORMAT
+        return self.FACTOR_FORMAT_REDUCED if reduced_rank else self.FACTOR_FORMAT
 
 
 @dataclass
@@ -493,7 +498,7 @@ class TreatmentContrasts(Contrasts):
     is taken to be the first level.
     """
 
-    FACTOR_FORMAT = "{name}[T.{field}]"
+    FACTOR_FORMAT_REDUCED = "{name}[T.{field}]"
 
     base: Hashable = UNSET
 
@@ -609,7 +614,7 @@ class SumContrasts(Contrasts):
     (except the last, which is redundant) to the global average of all levels.
     """
 
-    FACTOR_FORMAT = "{name}[S.{field}]"
+    FACTOR_FORMAT_REDUCED = "{name}[S.{field}]"
 
     @Contrasts.override
     def _get_coding_matrix(
@@ -659,7 +664,7 @@ class HelmertContrasts(Contrasts):
             integer one).
     """
 
-    FACTOR_FORMAT = "{name}[H.{field}]"
+    FACTOR_FORMAT_REDUCED = "{name}[H.{field}]"
 
     reverse: bool = True
     scale: bool = False
@@ -729,7 +734,7 @@ class DiffContrasts(Contrasts):
             Level 1 cf. Level 1 - Level 2).
     """
 
-    FACTOR_FORMAT = "{name}[D.{field}]"
+    FACTOR_FORMAT_REDUCED = "{name}[D.{field}]"
 
     backward: bool = True
 
@@ -792,7 +797,6 @@ class PolyContrasts(Contrasts):
             have the same cardinality as the categories being coded.
     """
 
-    FACTOR_FORMAT = "{name}{field}"
     NAME_ALIASES = {
         1: ".L",
         2: ".Q",
diff --git a/tests/materializers/test_arrow.py b/tests/materializers/test_arrow.py
@@ -19,15 +19,15 @@ def check_for_pyarrow():
     "a": (["Intercept", "a"], ["Intercept", "a"]),
     "A": (
         ["Intercept", "A[T.b]", "A[T.c]"],
-        ["Intercept", "A[T.a]", "A[T.b]", "A[T.c]"],
+        ["Intercept", "A[a]", "A[b]", "A[c]"],
     ),
     "C(A)": (
         ["Intercept", "C(A)[T.b]", "C(A)[T.c]"],
-        ["Intercept", "C(A)[T.a]", "C(A)[T.b]", "C(A)[T.c]"],
+        ["Intercept", "C(A)[a]", "C(A)[b]", "C(A)[c]"],
     ),
     "a:A": (
-        ["Intercept", "a:A[T.a]", "a:A[T.b]", "a:A[T.c]"],
-        ["Intercept", "a:A[T.a]", "a:A[T.b]", "a:A[T.c]"],
+        ["Intercept", "a:A[a]", "a:A[b]", "a:A[c]"],
+        ["Intercept", "a:A[a]", "a:A[b]", "a:A[c]"],
     ),
 }
 
diff --git a/tests/materializers/test_pandas.py b/tests/materializers/test_pandas.py
@@ -24,45 +24,45 @@
     "a": (["Intercept", "a"], ["Intercept", "a"], ["Intercept", "a"], 2),
     "A": (
         ["Intercept", "A[T.b]", "A[T.c]"],
-        ["Intercept", "A[T.a]", "A[T.b]", "A[T.c]"],
+        ["Intercept", "A[a]", "A[b]", "A[c]"],
         ["Intercept", "A[T.c]"],
         2,
     ),
     "C(A)": (
         ["Intercept", "C(A)[T.b]", "C(A)[T.c]"],
-        ["Intercept", "C(A)[T.a]", "C(A)[T.b]", "C(A)[T.c]"],
+        ["Intercept", "C(A)[a]", "C(A)[b]", "C(A)[c]"],
         ["Intercept", "C(A)[T.c]"],
         2,
     ),
     "A:a": (
-        ["Intercept", "A[T.a]:a", "A[T.b]:a", "A[T.c]:a"],
-        ["Intercept", "A[T.a]:a", "A[T.b]:a", "A[T.c]:a"],
-        ["Intercept", "A[T.a]:a"],
+        ["Intercept", "A[a]:a", "A[b]:a", "A[c]:a"],
+        ["Intercept", "A[a]:a", "A[b]:a", "A[c]:a"],
+        ["Intercept", "A[a]:a"],
         1,
     ),
     "A:B": (
         [
             "Intercept",
             "B[T.b]",
             "B[T.c]",
-            "A[T.b]:B[T.a]",
-            "A[T.c]:B[T.a]",
-            "A[T.b]:B[T.b]",
-            "A[T.c]:B[T.b]",
-            "A[T.b]:B[T.c]",
-            "A[T.c]:B[T.c]",
+            "A[T.b]:B[a]",
+            "A[T.c]:B[a]",
+            "A[T.b]:B[b]",
+            "A[T.c]:B[b]",
+            "A[T.b]:B[c]",
+            "A[T.c]:B[c]",
         ],
         [
             "Intercept",
-            "A[T.a]:B[T.a]",
-            "A[T.b]:B[T.a]",
-            "A[T.c]:B[T.a]",
-            "A[T.a]:B[T.b]",
-            "A[T.b]:B[T.b]",
-            "A[T.c]:B[T.b]",
-            "A[T.a]:B[T.c]",
-            "A[T.b]:B[T.c]",
-            "A[T.c]:B[T.c]",
+            "A[a]:B[a]",
+            "A[b]:B[a]",
+            "A[c]:B[a]",
+            "A[a]:B[b]",
+            "A[b]:B[b]",
+            "A[c]:B[b]",
+            "A[a]:B[c]",
+            "A[b]:B[c]",
+            "A[c]:B[c]",
         ],
         ["Intercept"],
         1,
@@ -324,7 +324,7 @@ def test_encoding_edge_cases(self, materializer):
                 spec=ModelSpec(formula=[]),
                 drop_rows=[],
             )
-        ) == ["B[a][T.a]", "B[a][T.b]", "B[a][T.c]"]
+        ) == ["B[a][a]", "B[a][b]", "B[a][c]"]
 
     def test_empty(self, materializer):
         mm = materializer.get_model_matrix("0", ensure_full_rank=True)
@@ -366,27 +366,27 @@ def test_category_reordering(self):
         )
 
         m = PandasMaterializer(data).get_model_matrix("A + 0", ensure_full_rank=False)
-        assert list(m.columns) == ["A[T.a]", "A[T.b]", "A[T.c]"]
+        assert list(m.columns) == ["A[a]", "A[b]", "A[c]"]
         assert list(m.model_spec.get_model_matrix(data3).columns) == [
-            "A[T.a]",
-            "A[T.b]",
-            "A[T.c]",
+            "A[a]",
+            "A[b]",
+            "A[c]",
         ]
 
         m2 = PandasMaterializer(data2).get_model_matrix("A + 0", ensure_full_rank=False)
-        assert list(m2.columns) == ["A[T.a]", "A[T.b]", "A[T.c]"]
+        assert list(m2.columns) == ["A[a]", "A[b]", "A[c]"]
         assert list(m2.model_spec.get_model_matrix(data3).columns) == [
-            "A[T.a]",
-            "A[T.b]",
-            "A[T.c]",
+            "A[a]",
+            "A[b]",
+            "A[c]",
         ]
 
         m3 = PandasMaterializer(data3).get_model_matrix("A + 0", ensure_full_rank=False)
-        assert list(m3.columns) == ["A[T.c]", "A[T.b]", "A[T.a]"]
+        assert list(m3.columns) == ["A[c]", "A[b]", "A[a]"]
         assert list(m3.model_spec.get_model_matrix(data).columns) == [
-            "A[T.c]",
-            "A[T.b]",
-            "A[T.a]",
+            "A[c]",
+            "A[b]",
+            "A[a]",
         ]
 
     def test_term_clustering(self, materializer):
diff --git a/tests/transforms/test_contrasts.py b/tests/transforms/test_contrasts.py