Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions doc/changes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,20 @@ Changes

.. currentmodule:: patsy

v0.4.0
------

* Formulas (more precisely, :class:`EvalFactor` objects) now keep
references only to the variables they require from their environment,
instead of to the whole environment in which the formula was defined.

* Incompatible change: :class:`EvalFactor` does not take an
``eval_env`` argument anymore.

* Incompatible change: the :func:`design_matrix_builders` function and
the :meth:`EvalFactor.memorize_passes_needed` method now
require an ``eval_env`` as an additional argument.

v0.3.0
------

Expand Down
8 changes: 5 additions & 3 deletions doc/expert-model-specification.rst
Original file line number Diff line number Diff line change
Expand Up @@ -78,12 +78,11 @@ things are.)
from patsy import (ModelDesc, EvalEnvironment, Term, EvalFactor,
LookupFactor, demo_data, dmatrix)
data = demo_data("a", "x")
env = EvalEnvironment.capture()

# LookupFactor takes a dictionary key:
a_lookup = LookupFactor("a")
# EvalFactor takes arbitrary Python code:
x_transform = EvalFactor("np.log(x ** 2)", env)
x_transform = EvalFactor("np.log(x ** 2)")
# First argument is empty list for dmatrix; we would need to put
# something there if we were calling dmatrices.
desc = ModelDesc([],
Expand Down Expand Up @@ -157,7 +156,7 @@ The full interface looks like this:
:term:`hashable`. These methods will determine which factors
Patsy considers equal for purposes of redundancy elimination.

.. method:: memorize_passes_needed(state)
.. method:: memorize_passes_needed(state, eval_env)

Return the number of passes through the data that this factor
will need in order to set up any :ref:`stateful-transforms`.
Expand All @@ -171,6 +170,9 @@ The full interface looks like this:
will be passed back in to all memorization and evaluation
methods.

`eval_env` is an :class:`EvalEnvironment` object, describing
the Python environment where the factor is being evaluated.

.. method:: memorize_chunk(state, which_pass, data)

Called repeatedly with each 'chunk' of data produced by the
Expand Down
11 changes: 5 additions & 6 deletions doc/formulas.rst
Original file line number Diff line number Diff line change
Expand Up @@ -58,13 +58,12 @@ To make this more concrete, here's how you could manually construct
the same objects that Patsy will construct if given the above
formula::

from patsy import EvalEnvironment, ModelDesc
env = EvalEnvironment.capture()
ModelDesc([Term([EvalFactor("y", env)])],
from patsy import ModelDesc
ModelDesc([Term([EvalFactor("y")])],
[Term([]),
Term([EvalFactor("a", env)]),
Term([EvalFactor("a", env), EvalFactor("b", env)]),
Term([EvalFactor("np.log(x)", env)])])
Term([EvalFactor("a")]),
Term([EvalFactor("a"), EvalFactor("b")]),
Term([EvalFactor("np.log(x)")])])

Compare to what you get from parsing the above formula::

Expand Down
31 changes: 25 additions & 6 deletions patsy/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,14 +325,14 @@ def test__ColumnBuilder():
cb_intercept.build({f1: [1, 2, 3], f2: [1, 2, 3], f3: [1, 2, 3]}, mat3)
assert np.allclose(mat3, 1)

def _factors_memorize(factors, data_iter_maker):
def _factors_memorize(factors, data_iter_maker, eval_env):
# First, start off the memorization process by setting up each factor's
# state and finding out how many passes it will need:
factor_states = {}
passes_needed = {}
for factor in factors:
state = {}
which_pass = factor.memorize_passes_needed(state)
which_pass = factor.memorize_passes_needed(state, eval_env)
factor_states[factor] = state
passes_needed[factor] = which_pass
# Now, cycle through the data until all the factors have finished
Expand Down Expand Up @@ -362,7 +362,7 @@ def __init__(self, requested_passes, token):
self._chunk_in_pass = 0
self._seen_passes = 0

def memorize_passes_needed(self, state):
def memorize_passes_needed(self, state, eval_env):
state["calls"] = []
state["token"] = self._token
return self._requested_passes
Expand All @@ -389,7 +389,7 @@ def __call__(self):
f1 = MockFactor(1, "f1")
f2a = MockFactor(2, "f2a")
f2b = MockFactor(2, "f2b")
factor_states = _factors_memorize(set([f0, f1, f2a, f2b]), data)
factor_states = _factors_memorize(set([f0, f1, f2a, f2b]), data, {})
assert data.calls == 2
mem_chunks0 = [("memorize_chunk", 0)] * data.CHUNKS
mem_chunks1 = [("memorize_chunk", 1)] * data.CHUNKS
Expand Down Expand Up @@ -615,7 +615,7 @@ def _make_term_column_builders(terms,
term_to_column_builders[term] = column_builders
return new_term_order, term_to_column_builders

def design_matrix_builders(termlists, data_iter_maker, NA_action="drop"):
def design_matrix_builders(termlists, data_iter_maker, eval_env, NA_action="drop"):
"""Construct several :class:`DesignMatrixBuilders` from termlists.

This is one of Patsy's fundamental functions. This function and
Expand All @@ -629,6 +629,14 @@ def design_matrix_builders(termlists, data_iter_maker, NA_action="drop"):
simple iterator because sufficiently complex formulas may require
multiple passes over the data (e.g. if there are nested stateful
transforms).
:arg eval_env: Either an :class:`EvalEnvironment` which will be used to
look up any variables referenced in `termlists` that cannot be
found in `data_iter_maker`, or else a depth represented as an
integer which will be passed to :meth:`EvalEnvironment.capture`.
``eval_env=0`` means to use the context of the function calling
:func:`design_matrix_builders` for lookups. If calling this function
from a library, you probably want ``eval_env=1``, which means that
variables should be resolved in *your* caller's namespace.
:arg NA_action: An :class:`NAAction` object or string, used to determine
what values count as 'missing' for purposes of determining the levels of
categorical factors.
Expand All @@ -643,14 +651,25 @@ def design_matrix_builders(termlists, data_iter_maker, NA_action="drop"):

.. versionadded:: 0.2.0
The ``NA_action`` argument.
.. versionadded:: 0.4.0
The ``eval_env`` argument.
"""
# Check type of eval_env to help people migrating to 0.4.0. Third
# argument used to be NA_action (a string). Having the check for
# eval_env's type gives people migrating to 0.4.0 who used NA_action
# not as a keyword argument a nice error message here, instead of a
# more obscure backtrace later on.
if not isinstance(eval_env, six.integer_types + (EvalEnvironment,)):
raise TypeError("Parameter 'eval_env' must be either an integer or an instance "
"of patsy.EvalEnvironment.")
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nitpicks:

  • The "+" is unnecessary here -- the Python lexer (like that in many languages, e.g. C) will concatenate adjacent string tokens at compile time, e.g. "foo" "bar" is the same as "foobar" (and so is "foo" 'bar' for that matter). It's even slightly more efficient, b/c + is evaluated at runtime, while leaving it out gives a single string directly in the .pyc. Not that this efficiency matters at all :-). But as a style/consistency thing I leave the + out.
  • You're missing a space, this string says "instanceof" :-)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah. I'm aware of this feature, but I vaguely remember being bitten by it (although maybe it was in C instead of Python), so I personally tend to prefer being explicit unless necessary. But given that this is the style of the rest of the code base, I'll go with it. Done.

eval_env = EvalEnvironment.capture(eval_env, reference=1)
if isinstance(NA_action, str):
NA_action = NAAction(NA_action)
all_factors = set()
for termlist in termlists:
for term in termlist:
all_factors.update(term.factors)
factor_states = _factors_memorize(all_factors, data_iter_maker)
factor_states = _factors_memorize(all_factors, data_iter_maker, eval_env)
# Now all the factors have working eval methods, so we can evaluate them
# on some data to find out what type of data they return.
(num_column_counts,
Expand Down
18 changes: 7 additions & 11 deletions patsy/desc.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,8 +191,8 @@ def test_ModelDesc_from_formula():
for input in ("y ~ x", parse_formula("y ~ x")):
eval_env = EvalEnvironment.capture(0)
md = ModelDesc.from_formula(input, eval_env)
assert md.lhs_termlist == [Term([EvalFactor("y", eval_env)]),]
assert md.rhs_termlist == [INTERCEPT, Term([EvalFactor("x", eval_env)])]
assert md.lhs_termlist == [Term([EvalFactor("y")]),]
assert md.rhs_termlist == [INTERCEPT, Term([EvalFactor("x")])]

class IntermediateExpr(object):
"This class holds an intermediate result while we're evaluating a tree."
Expand Down Expand Up @@ -356,8 +356,7 @@ def _eval_number(evaluator, tree):
"only allowed with **", tree)

def _eval_python_expr(evaluator, tree):
factor = EvalFactor(tree.token.extra, evaluator._factor_eval_env,
origin=tree.origin)
factor = EvalFactor(tree.token.extra, origin=tree.origin)
return IntermediateExpr(False, None, False, [Term([factor])])

class Evaluator(object):
Expand Down Expand Up @@ -585,16 +584,15 @@ def eval(self, tree, require_evalexpr=True):
"a + <-a**2>",
]

def _assert_terms_match(terms, expected_intercept, expecteds, eval_env): # pragma: no cover
def _assert_terms_match(terms, expected_intercept, expecteds): # pragma: no cover
if expected_intercept:
expecteds = [()] + expecteds
assert len(terms) == len(expecteds)
for term, expected in zip(terms, expecteds):
if isinstance(term, Term):
if isinstance(expected, str):
expected = (expected,)
assert term.factors == tuple([EvalFactor(s, eval_env)
for s in expected])
assert term.factors == tuple([EvalFactor(s) for s in expected])
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think eval_env can probably be removed more thoroughly from this code? (I.e., if there aren't any other users in the function, then the variable can be removed entirely, and so on up the call stack.)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It didn't percolate very far off the call stack, but I removed a couple uses. Good catch.

else:
assert term == expected

Expand All @@ -609,11 +607,9 @@ def _do_eval_formula_tests(tests): # pragma: no cover
print(model_desc)
lhs_intercept, lhs_termlist, rhs_intercept, rhs_termlist = result
_assert_terms_match(model_desc.lhs_termlist,
lhs_intercept, lhs_termlist,
eval_env)
lhs_intercept, lhs_termlist)
_assert_terms_match(model_desc.rhs_termlist,
rhs_intercept, rhs_termlist,
eval_env)
rhs_intercept, rhs_termlist)

def test_eval_formula():
_do_eval_formula_tests(_eval_tests)
Expand Down
Loading