Skip to content

Commit 28fd9ec

Browse files
Merge pull request #216 from matthewwardrop/add_support_for_dot_operator
2 parents fc37e29 + 8183ba8 commit 28fd9ec

28 files changed

+739
-192
lines changed

docsite/docs/guides/grammar.md

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,12 @@ unless otherwise indicated.
2626
| `{...}`[^1] | 1 | Quotes python operations, as a more convenient way to do Python operations than `I(...)`, e.g. `` {`my|col`**2} `` ||||
2727
| `<function>(...)`[^1] | 1 | Python transform on column, e.g. `my_func(x)` which is equivalent to `{my_func(x)}` |[^2] |||
2828
|-----|
29-
| `(...)` | 1 | Groups operations, overriding normal precedence rules. All operations with the parentheses are performed before the result of these operations is permitted to be operated upon by its peers. || ||
29+
| `(...)` | 1 | Groups operations, overriding normal precedence rules. All operations within the parentheses are performed before the result of these operations is permitted to be operated upon by its peers. || ||
3030
|-----|
31-
| ** | 2 | Includes all n-th order interactions of the terms in the left operand, where n is the (integral) value of the right operand, e.g. `(a+b+c)**2` is equivalent to `a + b + c + a:b + a:c + b:c`. ||||
32-
| ^ | 2 | Alias for `**`. ||[^3] ||
31+
| `.`[^9] | 0 | Stands in as a wild-card for the sum of variables in the data not used on the left-hand side of a formula. ||||
32+
|-----|
33+
| `**` | 2 | Includes all n-th order interactions of the terms in the left operand, where n is the (integral) value of the right operand, e.g. `(a+b+c)**2` is equivalent to `a + b + c + a:b + a:c + b:c`. ||||
34+
| `^` | 2 | Alias for `**`. ||[^3] ||
3335
|-----|
3436
| `:` | 2 | Adds a new term that corresponds to the interaction of its operands (i.e. their elementwise product). |[^4] |||
3537
|-----|
@@ -123,4 +125,5 @@ and conventions of which you should be aware.
123125
[^5]: This somewhat confusing operator is useful when you want to include hierarchical features in your data, and where certain interaction terms do not make sense (particularly in ANOVA contexts). For example, if `a` represents countries, and `b` represents cities, then the full product of terms from `a * b === a + b + a:b` does not make sense, because any value of `b` is guaranteed to coincide with a value in `a`, and does not independently add value. Thus, the operation `a / b === a + a:b` results in a more sensible dataset. As a result, the `/` operator is right-distributive, since if `b` and `c` were both nested in `a`, you would want `a/(b+c) === a + a:b + a:c`. Likewise, the operator is not left-distributive, since if `c` is nested under both `a` and `b` separately, then you want `(a + b)/c === a + b + a:b:c`. Lastly, if `c` is nested in `b`, and `b` is nested in `a`, then you would want `a/b/c === a + a:(b/c) === a + a:b + a:b:c`.
124126
[^6]: Implemented by an R package called [Formula](https://cran.r-project.org/web/packages/Formula/index.html) that extends the default formula syntax.
125127
[^7]: Patsy uses the `rescale` keyword rather than `scale`, but provides the same functionality.
126-
[^8]: For increased compatibility with patsy, we use patsy's signature for `standardize`.
128+
[^8]: For increased compatibility with patsy, we use patsy's signature for `standardize`.
129+
[^9]: Requires additional context to be passed in when directly using the `Formula` constructor, e.g. `Formula("y ~ .", _context={"__formulaic_variables_available__": ["x", "y", "z"]})`; or you can use `model_matrix`, `ModelSpec.get_model_matrix()`, or `FormulaMaterializer.get_model_matrix()` without further specification.

formulaic/formula.py

Lines changed: 41 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ def __call__(
7474
_ordering: Union[OrderingMethod, str] = OrderingMethod.DEGREE,
7575
_parser: Optional[FormulaParser] = None,
7676
_nested_parser: Optional[FormulaParser] = None,
77+
_context: Optional[Mapping[str, Any]] = None,
7778
**structure: FormulaSpec,
7879
) -> Formula:
7980
"""
@@ -82,7 +83,7 @@ def __call__(
8283
`SimpleFormula` instance will be returned; otherwise, a
8384
`StructuredFormula`.
8485
85-
Some arguments a prefixed with underscores to prevent collision with
86+
Some arguments are prefixed with underscores to prevent collision with
8687
formula structure.
8788
8889
Args:
@@ -108,6 +109,7 @@ def __call__(
108109
_ordering=_ordering,
109110
_parser=_parser,
110111
_nested_parser=_nested_parser,
112+
_context=_context,
111113
**structure,
112114
)
113115
return self
@@ -120,13 +122,15 @@ def __call__(
120122
_parser=_parser,
121123
_nested_parser=_nested_parser,
122124
_ordering=_ordering,
123-
**structure,
124-
)
125+
_context=_context,
126+
**structure, # type: ignore[arg-type]
127+
)._simplify()
125128
return cls.from_spec(
126129
cast(FormulaSpec, root),
127130
ordering=_ordering,
128131
parser=_parser,
129132
nested_parser=_nested_parser,
133+
context=_context,
130134
)
131135

132136
def from_spec(
@@ -136,6 +140,7 @@ def from_spec(
136140
ordering: Union[OrderingMethod, str] = OrderingMethod.DEGREE,
137141
parser: Optional[FormulaParser] = None,
138142
nested_parser: Optional[FormulaParser] = None,
143+
context: Optional[Mapping[str, Any]] = None,
139144
) -> Union[SimpleFormula, StructuredFormula]:
140145
"""
141146
Construct a `SimpleFormula` or `StructuredFormula` instance from a
@@ -164,18 +169,25 @@ def from_spec(
164169
if isinstance(spec, str):
165170
spec = cast(
166171
FormulaSpec,
167-
(parser or DefaultFormulaParser()).get_terms(spec)._simplify(),
172+
(parser or DefaultFormulaParser())
173+
.get_terms(spec, context=context)
174+
._simplify(),
168175
)
169176

170177
if isinstance(spec, dict):
171178
return StructuredFormula(
172-
_parser=parser, _nested_parser=nested_parser, _ordering=ordering, **spec
179+
_parser=parser,
180+
_nested_parser=nested_parser,
181+
_ordering=ordering,
182+
_context=context,
183+
**spec, # type: ignore[arg-type]
173184
)
174185
if isinstance(spec, Structured):
175186
return StructuredFormula(
176187
_ordering=ordering,
177188
_parser=nested_parser,
178189
_nested_parser=nested_parser,
190+
_context=context,
179191
**spec._structure,
180192
)._simplify()
181193
if isinstance(spec, tuple):
@@ -184,13 +196,14 @@ def from_spec(
184196
_ordering=ordering,
185197
_parser=parser,
186198
_nested_parser=nested_parser,
199+
_context=context,
187200
)._simplify()
188201
if isinstance(spec, (list, set, OrderedSet)):
189202
terms = [
190203
term
191204
for value in spec
192205
for term in (
193-
nested_parser.get_terms(value) # type: ignore[attr-defined]
206+
nested_parser.get_terms(value, context=context) # type: ignore[attr-defined]
194207
if isinstance(value, str)
195208
else [value]
196209
)
@@ -248,9 +261,11 @@ class Formula(metaclass=_FormulaMeta):
248261
def __init__(
249262
self,
250263
root: Union[FormulaSpec, _MissingType] = MISSING,
264+
*,
251265
_parser: Optional[FormulaParser] = None,
252266
_nested_parser: Optional[FormulaParser] = None,
253267
_ordering: Union[OrderingMethod, str] = OrderingMethod.DEGREE,
268+
_context: Optional[Mapping[str, Any]] = None,
254269
**structure: FormulaSpec,
255270
):
256271
"""
@@ -288,7 +303,7 @@ def get_model_matrix(
288303
@abstractmethod
289304
def required_variables(self) -> Set[Variable]:
290305
"""
291-
The set of variables required in the data order to materialize this
306+
The set of variables required to be in the data to materialize this
292307
formula.
293308
294309
Attempts are made to restrict these variables only to those expected in
@@ -354,6 +369,7 @@ def __init__(
354369
_ordering: Union[OrderingMethod, str] = OrderingMethod.DEGREE,
355370
_parser: Optional[FormulaParser] = None,
356371
_nested_parser: Optional[FormulaParser] = None,
372+
_context: Optional[Mapping[str, Any]] = None,
357373
**structure: FormulaSpec,
358374
):
359375
if root is MISSING:
@@ -667,19 +683,22 @@ class StructuredFormula(Structured[SimpleFormula], Formula):
667683
formula specifications. Can be: "none", "degree" (default), or "sort".
668684
"""
669685

670-
__slots__ = ("_parser", "_nested_parser", "_ordering")
686+
__slots__ = ("_parser", "_nested_parser", "_ordering", "_context")
671687

672688
def __init__(
673689
self,
674690
root: Union[FormulaSpec, _MissingType] = MISSING,
691+
*,
692+
_ordering: Union[OrderingMethod, str] = OrderingMethod.DEGREE,
675693
_parser: Optional[FormulaParser] = None,
676694
_nested_parser: Optional[FormulaParser] = None,
677-
_ordering: Union[OrderingMethod, str] = OrderingMethod.DEGREE,
695+
_context: Optional[Mapping[str, Any]] = None,
678696
**structure: FormulaSpec,
679697
):
698+
self._ordering = OrderingMethod(_ordering)
680699
self._parser = _parser or DEFAULT_PARSER
681700
self._nested_parser = _nested_parser or _parser or DEFAULT_NESTED_PARSER
682-
self._ordering = OrderingMethod(_ordering)
701+
self._context = _context
683702
super().__init__(root, **structure) # type: ignore
684703
self._simplify(unwrap=False, inplace=True)
685704

@@ -704,6 +723,7 @@ def _prepare_item( # type: ignore[override]
704723
ordering=self._ordering,
705724
parser=(self._parser if key == "root" else self._nested_parser),
706725
nested_parser=self._nested_parser,
726+
context=self._context,
707727
)
708728

709729
def get_model_matrix(
@@ -782,3 +802,14 @@ def differentiate( # pylint: disable=redefined-builtin
782802
SimpleFormula,
783803
self._map(lambda formula: formula.differentiate(*wrt, use_sympy=use_sympy)),
784804
)
805+
806+
# Ensure pickling never includes context
807+
def __getstate__(self) -> Tuple[None, Dict[str, Any]]:
808+
slots = self.__slots__ + Structured.__slots__
809+
return (
810+
None,
811+
{
812+
slot: getattr(self, slot) if slot != "_context" else None
813+
for slot in slots
814+
},
815+
)

formulaic/materializers/base.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,9 @@ def get_model_matrix(
163163
from formulaic import ModelSpec
164164

165165
# Prepare ModelSpec(s)
166-
spec: Union[ModelSpec, ModelSpecs] = ModelSpec.from_spec(spec, **spec_overrides)
166+
spec: Union[ModelSpec, ModelSpecs] = ModelSpec.from_spec(
167+
spec, context=self.layered_context, **spec_overrides
168+
)
167169
should_simplify = isinstance(spec, ModelSpec)
168170
model_specs: ModelSpecs = self._prepare_model_specs(spec)
169171

formulaic/model_spec.py

Lines changed: 57 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@ class ModelSpec:
7878
def from_spec(
7979
cls,
8080
spec: Union[FormulaSpec, ModelMatrix, ModelMatrices, ModelSpec, ModelSpecs],
81+
*,
82+
context: Optional[Mapping[str, Any]] = None,
8183
**attrs: Any,
8284
) -> Union[ModelSpec, ModelSpecs]:
8385
"""
@@ -90,6 +92,11 @@ def from_spec(
9092
instance or structured set of `ModelSpec` instances.
9193
attrs: Any `ModelSpec` attributes to set and/or override on all
9294
generated `ModelSpec` instances.
95+
context: Optional additional context to pass through to the formula
96+
parsing algorithms. This is not normally required, and if
97+
involved operators place additional constraints on the type
98+
and/or structure of this context, they will raise exceptions
99+
when they are not satisfied, with instructions for how to fix it.
93100
"""
94101
from .model_matrix import ModelMatrix
95102

@@ -98,7 +105,7 @@ def prepare_model_spec(obj: Any) -> Union[ModelSpec, ModelSpecs]:
98105
obj = obj.model_spec
99106
if isinstance(obj, ModelSpec):
100107
return obj.update(**attrs)
101-
formula = Formula.from_spec(obj)
108+
formula = Formula.from_spec(obj, context=context)
102109
if isinstance(formula, StructuredFormula):
103110
return cast(
104111
ModelSpecs, formula._map(prepare_model_spec, as_type=ModelSpecs)
@@ -417,6 +424,21 @@ def variables_by_source(self) -> Dict[Optional[str], Set[Variable]]:
417424
variables_by_source[variable.source].add(variable)
418425
return dict(variables_by_source)
419426

427+
@property
428+
def required_variables(self) -> Set[Variable]:
429+
"""
430+
The set of variables required to be in the data to materialize this
431+
model specification.
432+
433+
If `.structure` has not been populated (which contains metadata about
434+
which columns were ultimately drawn from the data during
435+
materialization), then this will fall back to the variables inferred to
436+
be required by `.formula`.
437+
"""
438+
if self.structure is None:
439+
return self.formula.required_variables
440+
return self.variables_by_source.get("data", set())
441+
420442
def get_slice(self, columns_identifier: Union[int, str, Term, slice]) -> slice:
421443
"""
422444
Generate a `slice` instance corresponding to the columns associated with
@@ -459,6 +481,24 @@ def get_slice(self, columns_identifier: Union[int, str, Term, slice]) -> slice:
459481

460482
# Utility methods
461483

484+
def get_materializer(
485+
self, data: Any, context: Optional[Mapping[str, Any]] = None
486+
) -> FormulaMaterializer:
487+
"""
488+
Construct a `FormulaMaterializer` instance for `data` that can be used
489+
to generate model matrices consistent with this model specification.
490+
491+
Args:
492+
data: The data for which to build the materializer.
493+
context: An additional mapping object of names to make available
494+
when evaluating formula term factors.
495+
"""
496+
if self.materializer is None:
497+
materializer = FormulaMaterializer.for_data(data)
498+
else:
499+
materializer = FormulaMaterializer.for_materializer(self.materializer)
500+
return materializer(data, context=context, **(self.materializer_params or {}))
501+
462502
def get_model_matrix(
463503
self,
464504
data: Any,
@@ -484,13 +524,12 @@ def get_model_matrix(
484524
"""
485525
if attr_overrides:
486526
return self.update(**attr_overrides).get_model_matrix(data, context=context)
487-
if self.materializer is None:
488-
materializer = FormulaMaterializer.for_data(data)
489-
else:
490-
materializer = FormulaMaterializer.for_materializer(self.materializer)
491-
return materializer(
492-
data, context=context, **(self.materializer_params or {})
493-
).get_model_matrix(self, drop_rows=drop_rows)
527+
return cast(
528+
"ModelMatrix",
529+
self.get_materializer(data, context=context).get_model_matrix(
530+
self, drop_rows=drop_rows
531+
),
532+
)
494533

495534
def get_linear_constraints(self, spec: LinearConstraintSpec) -> LinearConstraints:
496535
"""
@@ -632,6 +671,16 @@ def _prepare_item(self, key: str, item: Any) -> Any:
632671
)
633672
return item
634673

674+
@property
675+
def required_variables(self) -> Set[Variable]:
676+
"""
677+
The set of variables required to be in the data to materialize all of
678+
the model specifications in this `ModelSpecs` instance.
679+
"""
680+
variables: Set[Variable] = set()
681+
self._map(lambda ms: variables.update(ms.required_variables))
682+
return variables
683+
635684
def get_model_matrix(
636685
self,
637686
data: Any,

formulaic/parser/algos/sanitize_tokens.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ def sanitize_tokens(tokens: Iterable[Token]) -> Iterable[Token]:
1515
- possibly more in the future
1616
"""
1717
for token in tokens:
18+
if token.token == ".": # noqa: S105
19+
token.kind = Token.Kind.OPERATOR
1820
if token.kind is Token.Kind.PYTHON:
1921
token.token = sanitize_python_code(token.token)
2022
yield token

0 commit comments

Comments
 (0)