Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for the . operator (and related work) #216

Merged
merged 6 commits into from
Dec 2, 2024
11 changes: 7 additions & 4 deletions docsite/docs/guides/grammar.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,12 @@ unless otherwise indicated.
| `{...}`[^1] | 1 | Quotes python operations, as a more convenient way to do Python operations than `I(...)`, e.g. `` {`my|col`**2} `` ||||
| `<function>(...)`[^1] | 1 | Python transform on column, e.g. `my_func(x)` which is equivalent to `{my_func(x)}` |[^2] |||
|-----|
| `(...)` | 1 | Groups operations, overriding normal precedence rules. All operations with the parentheses are performed before the result of these operations is permitted to be operated upon by its peers. || ||
| `(...)` | 1 | Groups operations, overriding normal precedence rules. All operations within the parentheses are performed before the result of these operations is permitted to be operated upon by its peers. || ||
|-----|
| ** | 2 | Includes all n-th order interactions of the terms in the left operand, where n is the (integral) value of the right operand, e.g. `(a+b+c)**2` is equivalent to `a + b + c + a:b + a:c + b:c`. ||||
| ^ | 2 | Alias for `**`. ||[^3] ||
| `.`[^9] | 0 | Stands in as a wild-card for the sum of variables in the data not used on the left-hand side of a formula. ||||
|-----|
| `**` | 2 | Includes all n-th order interactions of the terms in the left operand, where n is the (integral) value of the right operand, e.g. `(a+b+c)**2` is equivalent to `a + b + c + a:b + a:c + b:c`. ||||
| `^` | 2 | Alias for `**`. ||[^3] ||
|-----|
| `:` | 2 | Adds a new term that corresponds to the interaction of its operands (i.e. their elementwise product). |[^4] |||
|-----|
Expand Down Expand Up @@ -123,4 +125,5 @@ and conventions of which you should be aware.
[^5]: This somewhat confusing operator is useful when you want to include hierarchical features in your data, and where certain interaction terms do not make sense (particularly in ANOVA contexts). For example, if `a` represents countries, and `b` represents cities, then the full product of terms from `a * b === a + b + a:b` does not make sense, because any value of `b` is guaranteed to coincide with a value in `a`, and does not independently add value. Thus, the operation `a / b === a + a:b` results in a more sensible dataset. As a result, the `/` operator is right-distributive, since if `b` and `c` were both nested in `a`, you would want `a/(b+c) === a + a:b + a:c`. Likewise, the operator is not left-distributive, since if `c` is nested under both `a` and `b` separately, then you want `(a + b)/c === a + b + a:b:c`. Lastly, if `c` is nested in `b`, and `b` is nested in `a`, then you would want `a/b/c === a + a:(b/c) === a + a:b + a:b:c`.
[^6]: Implemented by an R package called [Formula](https://cran.r-project.org/web/packages/Formula/index.html) that extends the default formula syntax.
[^7]: Patsy uses the `rescale` keyword rather than `scale`, but provides the same functionality.
[^8]: For increased compatibility with patsy, we use patsy's signature for `standardize`.
[^8]: For increased compatibility with patsy, we use patsy's signature for `standardize`.
[^9]: Requires additional context to be passed in when directly using the `Formula` constructor, e.g. `Formula("y ~ .", _context={"__formulaic_variables_available__": ["x", "y", "z"]})`. Alternatively, you can use `model_matrix`, `ModelSpec.get_model_matrix()`, or `FormulaMaterializer.get_model_matrix()` without further specification.
51 changes: 41 additions & 10 deletions formulaic/formula.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ def __call__(
_ordering: Union[OrderingMethod, str] = OrderingMethod.DEGREE,
_parser: Optional[FormulaParser] = None,
_nested_parser: Optional[FormulaParser] = None,
_context: Optional[Mapping[str, Any]] = None,
**structure: FormulaSpec,
) -> Formula:
"""
Expand All @@ -82,7 +83,7 @@ def __call__(
`SimpleFormula` instance will be returned; otherwise, a
`StructuredFormula`.

Some arguments a prefixed with underscores to prevent collision with
Some arguments are prefixed with underscores to prevent collision with
formula structure.

Args:
Expand All @@ -108,6 +109,7 @@ def __call__(
_ordering=_ordering,
_parser=_parser,
_nested_parser=_nested_parser,
_context=_context,
**structure,
)
return self
Expand All @@ -120,13 +122,15 @@ def __call__(
_parser=_parser,
_nested_parser=_nested_parser,
_ordering=_ordering,
**structure,
)
_context=_context,
**structure, # type: ignore[arg-type]
)._simplify()
return cls.from_spec(
cast(FormulaSpec, root),
ordering=_ordering,
parser=_parser,
nested_parser=_nested_parser,
context=_context,
)

def from_spec(
Expand All @@ -136,6 +140,7 @@ def from_spec(
ordering: Union[OrderingMethod, str] = OrderingMethod.DEGREE,
parser: Optional[FormulaParser] = None,
nested_parser: Optional[FormulaParser] = None,
context: Optional[Mapping[str, Any]] = None,
) -> Union[SimpleFormula, StructuredFormula]:
"""
Construct a `SimpleFormula` or `StructuredFormula` instance from a
Expand Down Expand Up @@ -164,18 +169,25 @@ def from_spec(
if isinstance(spec, str):
spec = cast(
FormulaSpec,
(parser or DefaultFormulaParser()).get_terms(spec)._simplify(),
(parser or DefaultFormulaParser())
.get_terms(spec, context=context)
._simplify(),
)

if isinstance(spec, dict):
return StructuredFormula(
_parser=parser, _nested_parser=nested_parser, _ordering=ordering, **spec
_parser=parser,
_nested_parser=nested_parser,
_ordering=ordering,
_context=context,
**spec, # type: ignore[arg-type]
)
if isinstance(spec, Structured):
return StructuredFormula(
_ordering=ordering,
_parser=nested_parser,
_nested_parser=nested_parser,
_context=context,
**spec._structure,
)._simplify()
if isinstance(spec, tuple):
Expand All @@ -184,13 +196,14 @@ def from_spec(
_ordering=ordering,
_parser=parser,
_nested_parser=nested_parser,
_context=context,
)._simplify()
if isinstance(spec, (list, set, OrderedSet)):
terms = [
term
for value in spec
for term in (
nested_parser.get_terms(value) # type: ignore[attr-defined]
nested_parser.get_terms(value, context=context) # type: ignore[attr-defined]
if isinstance(value, str)
else [value]
)
Expand Down Expand Up @@ -248,9 +261,11 @@ class Formula(metaclass=_FormulaMeta):
def __init__(
self,
root: Union[FormulaSpec, _MissingType] = MISSING,
*,
_parser: Optional[FormulaParser] = None,
_nested_parser: Optional[FormulaParser] = None,
_ordering: Union[OrderingMethod, str] = OrderingMethod.DEGREE,
_context: Optional[Mapping[str, Any]] = None,
**structure: FormulaSpec,
):
"""
Expand Down Expand Up @@ -288,7 +303,7 @@ def get_model_matrix(
@abstractmethod
def required_variables(self) -> Set[Variable]:
"""
The set of variables required in the data order to materialize this
The set of variables required to be in the data to materialize this
formula.

Attempts are made to restrict these variables only to those expected in
Expand Down Expand Up @@ -354,6 +369,7 @@ def __init__(
_ordering: Union[OrderingMethod, str] = OrderingMethod.DEGREE,
_parser: Optional[FormulaParser] = None,
_nested_parser: Optional[FormulaParser] = None,
_context: Optional[Mapping[str, Any]] = None,
**structure: FormulaSpec,
):
if root is MISSING:
Expand Down Expand Up @@ -667,19 +683,22 @@ class StructuredFormula(Structured[SimpleFormula], Formula):
formula specifications. Can be: "none", "degree" (default), or "sort".
"""

__slots__ = ("_parser", "_nested_parser", "_ordering")
__slots__ = ("_parser", "_nested_parser", "_ordering", "_context")

def __init__(
self,
root: Union[FormulaSpec, _MissingType] = MISSING,
*,
_ordering: Union[OrderingMethod, str] = OrderingMethod.DEGREE,
_parser: Optional[FormulaParser] = None,
_nested_parser: Optional[FormulaParser] = None,
_ordering: Union[OrderingMethod, str] = OrderingMethod.DEGREE,
_context: Optional[Mapping[str, Any]] = None,
**structure: FormulaSpec,
):
self._ordering = OrderingMethod(_ordering)
self._parser = _parser or DEFAULT_PARSER
self._nested_parser = _nested_parser or _parser or DEFAULT_NESTED_PARSER
self._ordering = OrderingMethod(_ordering)
self._context = _context
super().__init__(root, **structure) # type: ignore
self._simplify(unwrap=False, inplace=True)

Expand All @@ -704,6 +723,7 @@ def _prepare_item( # type: ignore[override]
ordering=self._ordering,
parser=(self._parser if key == "root" else self._nested_parser),
nested_parser=self._nested_parser,
context=self._context,
)

def get_model_matrix(
Expand Down Expand Up @@ -782,3 +802,14 @@ def differentiate( # pylint: disable=redefined-builtin
SimpleFormula,
self._map(lambda formula: formula.differentiate(*wrt, use_sympy=use_sympy)),
)

# Ensure pickling never includes context
def __getstate__(self) -> Tuple[None, Dict[str, Any]]:
    """
    Build the pickle state for this instance, blanking out `_context`.

    Returns:
        A `(None, slot_state)` tuple, where `slot_state` maps every slot
        name declared on this class and on `Structured` to its current
        value, except that `_context` is always stored as `None`.
    """
    slot_state: Dict[str, Any] = {}
    for slot in self.__slots__ + Structured.__slots__:
        # The context is deliberately never serialised.
        slot_state[slot] = None if slot == "_context" else getattr(self, slot)
    return (None, slot_state)
4 changes: 3 additions & 1 deletion formulaic/materializers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,9 @@ def get_model_matrix(
from formulaic import ModelSpec

# Prepare ModelSpec(s)
spec: Union[ModelSpec, ModelSpecs] = ModelSpec.from_spec(spec, **spec_overrides)
spec: Union[ModelSpec, ModelSpecs] = ModelSpec.from_spec(
spec, context=self.layered_context, **spec_overrides
)
should_simplify = isinstance(spec, ModelSpec)
model_specs: ModelSpecs = self._prepare_model_specs(spec)

Expand Down
65 changes: 57 additions & 8 deletions formulaic/model_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ class ModelSpec:
def from_spec(
cls,
spec: Union[FormulaSpec, ModelMatrix, ModelMatrices, ModelSpec, ModelSpecs],
*,
context: Optional[Mapping[str, Any]] = None,
**attrs: Any,
) -> Union[ModelSpec, ModelSpecs]:
"""
Expand All @@ -90,6 +92,11 @@ def from_spec(
instance or structured set of `ModelSpec` instances.
attrs: Any `ModelSpec` attributes to set and/or override on all
generated `ModelSpec` instances.
context: Optional additional context to pass through to the formula
parsing algorithms. This is not normally required, and if
involved operators place additional constraints on the type
and/or structure of this context, they will raise exceptions
when they are not satisfied with instructions for how to fix it.
"""
from .model_matrix import ModelMatrix

Expand All @@ -98,7 +105,7 @@ def prepare_model_spec(obj: Any) -> Union[ModelSpec, ModelSpecs]:
obj = obj.model_spec
if isinstance(obj, ModelSpec):
return obj.update(**attrs)
formula = Formula.from_spec(obj)
formula = Formula.from_spec(obj, context=context)
if isinstance(formula, StructuredFormula):
return cast(
ModelSpecs, formula._map(prepare_model_spec, as_type=ModelSpecs)
Expand Down Expand Up @@ -417,6 +424,21 @@ def variables_by_source(self) -> Dict[Optional[str], Set[Variable]]:
variables_by_source[variable.source].add(variable)
return dict(variables_by_source)

@property
def required_variables(self) -> Set[Variable]:
    """
    The set of variables that must be present in the data in order to
    materialize this model specification.

    When `.structure` has been populated (it contains metadata about which
    columns were ultimately drawn from the data during materialization),
    the variables recorded under the "data" source are returned (an empty
    set if there are none). Otherwise this falls back to the variables
    inferred to be required by `.formula`.
    """
    if self.structure is not None:
        return self.variables_by_source.get("data", set())
    return self.formula.required_variables

def get_slice(self, columns_identifier: Union[int, str, Term, slice]) -> slice:
"""
Generate a `slice` instance corresponding to the columns associated with
Expand Down Expand Up @@ -459,6 +481,24 @@ def get_slice(self, columns_identifier: Union[int, str, Term, slice]) -> slice:

# Utility methods

def get_materializer(
    self, data: Any, context: Optional[Mapping[str, Any]] = None
) -> FormulaMaterializer:
    """
    Build a `FormulaMaterializer` instance for `data` that can be used to
    generate model matrices consistent with this model specification.

    The materializer class is looked up from `data` when this spec has no
    `materializer` set, and from `self.materializer` otherwise. It is then
    instantiated with `data`, `context`, and any `materializer_params`.

    Args:
        data: The data for which to build the materializer.
        context: An additional mapping object of names to make available
            when evaluating formula term factors.
    """
    materializer_cls = (
        FormulaMaterializer.for_data(data)
        if self.materializer is None
        else FormulaMaterializer.for_materializer(self.materializer)
    )
    extra_params = self.materializer_params or {}
    return materializer_cls(data, context=context, **extra_params)

def get_model_matrix(
self,
data: Any,
Expand All @@ -484,13 +524,12 @@ def get_model_matrix(
"""
if attr_overrides:
return self.update(**attr_overrides).get_model_matrix(data, context=context)
if self.materializer is None:
materializer = FormulaMaterializer.for_data(data)
else:
materializer = FormulaMaterializer.for_materializer(self.materializer)
return materializer(
data, context=context, **(self.materializer_params or {})
).get_model_matrix(self, drop_rows=drop_rows)
return cast(
"ModelMatrix",
self.get_materializer(data, context=context).get_model_matrix(
self, drop_rows=drop_rows
),
)

def get_linear_constraints(self, spec: LinearConstraintSpec) -> LinearConstraints:
"""
Expand Down Expand Up @@ -632,6 +671,16 @@ def _prepare_item(self, key: str, item: Any) -> Any:
)
return item

@property
def required_variables(self) -> Set[Variable]:
    """
    The union of the variables that must be present in the data in order to
    materialize every model specification held by this `ModelSpecs`
    instance.
    """
    collected: Set[Variable] = set()

    def _collect(model_spec: Any) -> None:
        # Fold each nested spec's requirements into the shared set.
        collected.update(model_spec.required_variables)

    self._map(_collect)
    return collected

def get_model_matrix(
self,
data: Any,
Expand Down
2 changes: 2 additions & 0 deletions formulaic/parser/algos/sanitize_tokens.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ def sanitize_tokens(tokens: Iterable[Token]) -> Iterable[Token]:
- possible more in the future
"""
for token in tokens:
if token.token == ".": # noqa: S105
token.kind = Token.Kind.OPERATOR
if token.kind is Token.Kind.PYTHON:
token.token = sanitize_python_code(token.token)
yield token
Expand Down
Loading