matthewwardrop · matthewwardrop · Dec 2, 2024 · Nov 27, 2024 · Nov 27, 2024 · Dec 2, 2024
diff --git a/docsite/docs/guides/grammar.md b/docsite/docs/guides/grammar.md
@@ -26,10 +26,12 @@ unless otherwise indicated.
 | `{...}`[^1] | 1 | Quotes python operations, as a more convenient way to do Python operations than `I(...)`, e.g. `` {`my|col`**2} `` | ✓ | ✗ | ✗ |
 | `<function>(...)`[^1] | 1 | Python transform on column, e.g. `my_func(x)` which is equivalent to `{my_func(x)}` | ✓[^2] | ✓ | ✗ |
 |-----|
-| `(...)` | 1 | Groups operations, overriding normal precedence rules. All operations with the parentheses are performed before the result of these operations is permitted to be operated upon by its peers. | ✓ | ✓ | ✓ |
+| `(...)` | 1 | Groups operations, overriding normal precedence rules. All operations within the parentheses are performed before the result of these operations is permitted to be operated upon by its peers. | ✓ | ✓ | ✓ |
 |-----|
-| ** | 2 | Includes all n-th order interactions of the terms in the left operand, where n is the (integral) value of the right operand, e.g. `(a+b+c)**2` is equivalent to `a + b + c + a:b + a:c + b:c`. | ✓ | ✓ | ✓ |
-| ^ | 2 | Alias for `**`. | ✓ | ✗[^3] | ✓ |
+| `.`[^9] | 0 | Stands in as a wild-card for the sum of variables in the data not used on the left-hand side of a formula. | ✓ | ✗ | ✓ |
+|-----|
+| `**` | 2 | Includes all n-th order interactions of the terms in the left operand, where n is the (integral) value of the right operand, e.g. `(a+b+c)**2` is equivalent to `a + b + c + a:b + a:c + b:c`. | ✓ | ✓ | ✓ |
+| `^` | 2 | Alias for `**`. | ✓ | ✗[^3] | ✓ |
 |-----|
 | `:` | 2 | Adds a new term that corresponds to the interaction of its operands (i.e. their elementwise product). | ✓[^4] | ✓ | ✓ |
 |-----|
@@ -123,4 +125,5 @@ and conventions of which you should be aware.
 [^5]: This somewhat confusing operator is useful when you want to include hierachical features in your data, and where certain interaction terms do not make sense (particularly in ANOVA contexts). For example, if `a` represents countries, and `b` represents cities, then the full product of terms from `a * b === a + b + a:b` does not make sense, because any value of `b` is guaranteed to coincide with a value in `a`, and does not independently add value. Thus, the operation `a / b === a + a:b` results in more sensible dataset. As a result, the `/` operator is right-distributive, since if `b` and `c` were both nested in `a`, you would want `a/(b+c) === a + a:b + a:c`. Likewise, the operator is not left-distributive, since if `c` is nested under both `a` and `b` separately, then you want `(a + b)/c === a + b + a:b:c`. Lastly, if `c` is nested in `b`, and `b` is nested in `a`, then you would want `a/b/c === a + a:(b/c) === a + a:b + a:b:c`.
 [^6]: Implemented by an R package called [Formula](https://cran.r-project.org/web/packages/Formula/index.html) that extends the default formula syntax.
 [^7]: Patsy uses the `rescale` keyword rather than `scale`, but provides the same functionality.
-[^8]: For increased compatibility with patsy, we use patsy's signature for `standardize`.
+[^8]: For increased compatibility with patsy, we use patsy's signature for `standardize`.
+[^9]: Requires additional context to be passed in when directly using the `Formula` constructor, e.g. `Formula("y ~ .", _context={"__formulaic_variables_available__": ["x", "y", "z"]})`; or you can use `model_matrix`, `ModelSpec.get_model_matrix()`, or `FormulaMaterializer.get_model_matrix()` without further specification.
diff --git a/formulaic/formula.py b/formulaic/formula.py
@@ -74,6 +74,7 @@ def __call__(
         _ordering: Union[OrderingMethod, str] = OrderingMethod.DEGREE,
         _parser: Optional[FormulaParser] = None,
         _nested_parser: Optional[FormulaParser] = None,
+        _context: Optional[Mapping[str, Any]] = None,
         **structure: FormulaSpec,
     ) -> Formula:
         """
@@ -82,7 +83,7 @@ def __call__(
         `SimpleFormula` instance will be returned; otherwise, a
         `StructuredFormula`.
 
-        Some arguments a prefixed with underscores to prevent collision with
+        Some arguments are prefixed with underscores to prevent collision with
         formula structure.
 
         Args:
@@ -108,6 +109,7 @@ def __call__(
                 _ordering=_ordering,
                 _parser=_parser,
                 _nested_parser=_nested_parser,
+                _context=_context,
                 **structure,
             )
             return self
@@ -120,13 +122,15 @@ def __call__(
                 _parser=_parser,
                 _nested_parser=_nested_parser,
                 _ordering=_ordering,
-                **structure,
-            )
+                _context=_context,
+                **structure,  # type: ignore[arg-type]
+            )._simplify()
         return cls.from_spec(
             cast(FormulaSpec, root),
             ordering=_ordering,
             parser=_parser,
             nested_parser=_nested_parser,
+            context=_context,
         )
 
     def from_spec(
@@ -136,6 +140,7 @@ def from_spec(
         ordering: Union[OrderingMethod, str] = OrderingMethod.DEGREE,
         parser: Optional[FormulaParser] = None,
         nested_parser: Optional[FormulaParser] = None,
+        context: Optional[Mapping[str, Any]] = None,
     ) -> Union[SimpleFormula, StructuredFormula]:
         """
         Construct a `SimpleFormula` or `StructuredFormula` instance from a
@@ -164,18 +169,25 @@ def from_spec(
         if isinstance(spec, str):
             spec = cast(
                 FormulaSpec,
-                (parser or DefaultFormulaParser()).get_terms(spec)._simplify(),
+                (parser or DefaultFormulaParser())
+                .get_terms(spec, context=context)
+                ._simplify(),
             )
 
         if isinstance(spec, dict):
             return StructuredFormula(
-                _parser=parser, _nested_parser=nested_parser, _ordering=ordering, **spec
+                _parser=parser,
+                _nested_parser=nested_parser,
+                _ordering=ordering,
+                _context=context,
+                **spec,  # type: ignore[arg-type]
             )
         if isinstance(spec, Structured):
             return StructuredFormula(
                 _ordering=ordering,
                 _parser=nested_parser,
                 _nested_parser=nested_parser,
+                _context=context,
                 **spec._structure,
             )._simplify()
         if isinstance(spec, tuple):
@@ -184,13 +196,14 @@ def from_spec(
                 _ordering=ordering,
                 _parser=parser,
                 _nested_parser=nested_parser,
+                _context=context,
             )._simplify()
         if isinstance(spec, (list, set, OrderedSet)):
             terms = [
                 term
                 for value in spec
                 for term in (
-                    nested_parser.get_terms(value)  # type: ignore[attr-defined]
+                    nested_parser.get_terms(value, context=context)  # type: ignore[attr-defined]
                     if isinstance(value, str)
                     else [value]
                 )
@@ -248,9 +261,11 @@ class Formula(metaclass=_FormulaMeta):
     def __init__(
         self,
         root: Union[FormulaSpec, _MissingType] = MISSING,
+        *,
         _parser: Optional[FormulaParser] = None,
         _nested_parser: Optional[FormulaParser] = None,
         _ordering: Union[OrderingMethod, str] = OrderingMethod.DEGREE,
+        _context: Optional[Mapping[str, Any]] = None,
         **structure: FormulaSpec,
     ):
         """
@@ -288,7 +303,7 @@ def get_model_matrix(
     @abstractmethod
     def required_variables(self) -> Set[Variable]:
         """
-        The set of variables required in the data order to materialize this
+        The set of variables required to be in the data to materialize this
         formula.
 
         Attempts are made to restrict these variables only to those expected in
@@ -354,6 +369,7 @@ def __init__(
         _ordering: Union[OrderingMethod, str] = OrderingMethod.DEGREE,
         _parser: Optional[FormulaParser] = None,
         _nested_parser: Optional[FormulaParser] = None,
+        _context: Optional[Mapping[str, Any]] = None,
         **structure: FormulaSpec,
     ):
         if root is MISSING:
@@ -667,19 +683,22 @@ class StructuredFormula(Structured[SimpleFormula], Formula):
             formula specifications. Can be: "none", "degree" (default), or "sort".
     """
 
-    __slots__ = ("_parser", "_nested_parser", "_ordering")
+    __slots__ = ("_parser", "_nested_parser", "_ordering", "_context")
 
     def __init__(
         self,
         root: Union[FormulaSpec, _MissingType] = MISSING,
+        *,
+        _ordering: Union[OrderingMethod, str] = OrderingMethod.DEGREE,
         _parser: Optional[FormulaParser] = None,
         _nested_parser: Optional[FormulaParser] = None,
-        _ordering: Union[OrderingMethod, str] = OrderingMethod.DEGREE,
+        _context: Optional[Mapping[str, Any]] = None,
         **structure: FormulaSpec,
     ):
+        self._ordering = OrderingMethod(_ordering)
         self._parser = _parser or DEFAULT_PARSER
         self._nested_parser = _nested_parser or _parser or DEFAULT_NESTED_PARSER
-        self._ordering = OrderingMethod(_ordering)
+        self._context = _context
         super().__init__(root, **structure)  # type: ignore
         self._simplify(unwrap=False, inplace=True)
 
@@ -704,6 +723,7 @@ def _prepare_item(  # type: ignore[override]
             ordering=self._ordering,
             parser=(self._parser if key == "root" else self._nested_parser),
             nested_parser=self._nested_parser,
+            context=self._context,
         )
 
     def get_model_matrix(
@@ -782,3 +802,14 @@ def differentiate(  # pylint: disable=redefined-builtin
             SimpleFormula,
             self._map(lambda formula: formula.differentiate(*wrt, use_sympy=use_sympy)),
         )
+
+    # Ensure pickling never includes context
+    def __getstate__(self) -> Tuple[None, Dict[str, Any]]:
+        slots = self.__slots__ + Structured.__slots__
+        return (
+            None,
+            {
+                slot: getattr(self, slot) if slot != "_context" else None
+                for slot in slots
+            },
+        )
diff --git a/formulaic/materializers/base.py b/formulaic/materializers/base.py
@@ -163,7 +163,9 @@ def get_model_matrix(
         from formulaic import ModelSpec
 
         # Prepare ModelSpec(s)
-        spec: Union[ModelSpec, ModelSpecs] = ModelSpec.from_spec(spec, **spec_overrides)
+        spec: Union[ModelSpec, ModelSpecs] = ModelSpec.from_spec(
+            spec, context=self.layered_context, **spec_overrides
+        )
         should_simplify = isinstance(spec, ModelSpec)
         model_specs: ModelSpecs = self._prepare_model_specs(spec)
 

diff --git a/formulaic/model_spec.py b/formulaic/model_spec.py
@@ -78,6 +78,8 @@ class ModelSpec:
     def from_spec(
         cls,
         spec: Union[FormulaSpec, ModelMatrix, ModelMatrices, ModelSpec, ModelSpecs],
+        *,
+        context: Optional[Mapping[str, Any]] = None,
         **attrs: Any,
     ) -> Union[ModelSpec, ModelSpecs]:
         """
@@ -90,6 +92,11 @@ def from_spec(
                 instance or structured set of `ModelSpec` instances.
             attrs: Any `ModelSpec` attributes to set and/or override on all
                 generated `ModelSpec` instances.
+            context: Optional additional context to pass through to the formula
+                parsing algorithms. This is not normally required. If the
+                operators involved place additional constraints on the type
+                and/or structure of this context, they will raise exceptions
+                (with instructions for how to fix it) when these are not met.
         """
         from .model_matrix import ModelMatrix
 
@@ -98,7 +105,7 @@ def prepare_model_spec(obj: Any) -> Union[ModelSpec, ModelSpecs]:
                 obj = obj.model_spec
             if isinstance(obj, ModelSpec):
                 return obj.update(**attrs)
-            formula = Formula.from_spec(obj)
+            formula = Formula.from_spec(obj, context=context)
             if isinstance(formula, StructuredFormula):
                 return cast(
                     ModelSpecs, formula._map(prepare_model_spec, as_type=ModelSpecs)
@@ -417,6 +424,21 @@ def variables_by_source(self) -> Dict[Optional[str], Set[Variable]]:
             variables_by_source[variable.source].add(variable)
         return dict(variables_by_source)
 
+    @property
+    def required_variables(self) -> Set[Variable]:
+        """
+        The set of variables required to be in the data to materialize this
+        model specification.
+
+        If `.structure` has not been populated (which contains metadata about
+        which columns were ultimately drawn from the data during
+        materialization), then this will fall back to the variables inferred to
+        be required by `.formula`.
+        """
+        if self.structure is None:
+            return self.formula.required_variables
+        return self.variables_by_source.get("data", set())
+
     def get_slice(self, columns_identifier: Union[int, str, Term, slice]) -> slice:
         """
         Generate a `slice` instance corresponding to the columns associated with
@@ -459,6 +481,24 @@ def get_slice(self, columns_identifier: Union[int, str, Term, slice]) -> slice:
 
     # Utility methods
 
+    def get_materializer(
+        self, data: Any, context: Optional[Mapping[str, Any]] = None
+    ) -> FormulaMaterializer:
+        """
+        Construct a `FormulaMaterializer` instance for `data` that can be used
+        to generate model matrices consistent with this model specification.
+
+        Args:
+            data: The data for which to build the materializer.
+            context: An additional mapping object of names to make available
+                when evaluating formula term factors.
+        """
+        if self.materializer is None:
+            materializer = FormulaMaterializer.for_data(data)
+        else:
+            materializer = FormulaMaterializer.for_materializer(self.materializer)
+        return materializer(data, context=context, **(self.materializer_params or {}))
+
     def get_model_matrix(
         self,
         data: Any,
@@ -484,13 +524,12 @@ def get_model_matrix(
         """
         if attr_overrides:
             return self.update(**attr_overrides).get_model_matrix(data, context=context)
-        if self.materializer is None:
-            materializer = FormulaMaterializer.for_data(data)
-        else:
-            materializer = FormulaMaterializer.for_materializer(self.materializer)
-        return materializer(
-            data, context=context, **(self.materializer_params or {})
-        ).get_model_matrix(self, drop_rows=drop_rows)
+        return cast(
+            "ModelMatrix",
+            self.get_materializer(data, context=context).get_model_matrix(
+                self, drop_rows=drop_rows
+            ),
+        )
 
     def get_linear_constraints(self, spec: LinearConstraintSpec) -> LinearConstraints:
         """
@@ -632,6 +671,16 @@ def _prepare_item(self, key: str, item: Any) -> Any:
             )
         return item
 
+    @property
+    def required_variables(self) -> Set[Variable]:
+        """
+        The set of variables required to be in the data to materialize all of
+        the model specifications in this `ModelSpecs` instance.
+        """
+        variables: Set[Variable] = set()
+        self._map(lambda ms: variables.update(ms.required_variables))
+        return variables
+
     def get_model_matrix(
         self,
         data: Any,

diff --git a/formulaic/parser/algos/sanitize_tokens.py b/formulaic/parser/algos/sanitize_tokens.py
@@ -15,6 +15,8 @@ def sanitize_tokens(tokens: Iterable[Token]) -> Iterable[Token]:
         - possible more in the future
     """
     for token in tokens:
+        if token.token == ".":  # noqa: S105
+            token.kind = Token.Kind.OPERATOR
         if token.kind is Token.Kind.PYTHON:
             token.token = sanitize_python_code(token.token)
         yield token