pydata · njsmith · May 20, 2015 · Apr 15, 2015 · Apr 15, 2015 · Apr 16, 2015
diff --git a/doc/changes.rst b/doc/changes.rst
@@ -3,6 +3,20 @@ Changes
 
 .. currentmodule:: patsy
 
+v0.4.0
+------
+
+* Formulas (more precisely, :class:`EvalFactor` objects) now only
+  keep a reference to the variables required from their environment
+  instead of the whole environment in which the formula was defined.
+
+* Incompatible change: :class:`EvalFactor` does not take an
+  ``eval_env`` argument anymore.
+
+* Incompatible change: the :func:`design_matrix_builders` function and
+  the :meth:`EvalFactor.memorize_passes_needed` method now
+  require an ``eval_env`` as an additional argument.
+
 v0.3.0
 ------
 

diff --git a/doc/expert-model-specification.rst b/doc/expert-model-specification.rst
@@ -78,12 +78,11 @@ things are.)
    from patsy import (ModelDesc, EvalEnvironment, Term, EvalFactor,
                       LookupFactor, demo_data, dmatrix)
    data = demo_data("a", "x")
-   env = EvalEnvironment.capture()
 
    # LookupFactor takes a dictionary key:
    a_lookup = LookupFactor("a")
    # EvalFactor takes arbitrary Python code:
-   x_transform = EvalFactor("np.log(x ** 2)", env)
+   x_transform = EvalFactor("np.log(x ** 2)")
    # First argument is empty list for dmatrix; we would need to put
    # something there if we were calling dmatrices.
    desc = ModelDesc([],
@@ -157,7 +156,7 @@ The full interface looks like this:
        :term:`hashable`. These methods will determine which factors
        Patsy considers equal for purposes of redundancy elimination.
 
-    .. method:: memorize_passes_needed(state)
+    .. method:: memorize_passes_needed(state, eval_env)
 
        Return the number of passes through the data that this factor
        will need in order to set up any :ref:`stateful-transforms`.
@@ -171,6 +170,9 @@ The full interface looks like this:
        will be passed back in to all memorization and evaluation
        methods.
 
+       `eval_env` is an :class:`EvalEnvironment` object, describing
+       the Python environment where the factor is being evaluated.
+
     .. method:: memorize_chunk(state, which_pass, data)
 
        Called repeatedly with each 'chunk' of data produced by the

diff --git a/doc/formulas.rst b/doc/formulas.rst
@@ -58,13 +58,12 @@ To make this more concrete, here's how you could manually construct
 the same objects that Patsy will construct if given the above
 formula::
 
-  from patsy import EvalEnvironment, ModelDesc
-  env = EvalEnvironment.capture()
-  ModelDesc([Term([EvalFactor("y", env)])],
+  from patsy import ModelDesc
+  ModelDesc([Term([EvalFactor("y")])],
             [Term([]),
-             Term([EvalFactor("a", env)]),
-             Term([EvalFactor("a", env), EvalFactor("b", env)]),
-             Term([EvalFactor("np.log(x)", env)])])
+             Term([EvalFactor("a")]),
+             Term([EvalFactor("a"), EvalFactor("b")]),
+             Term([EvalFactor("np.log(x)")])])
 
 Compare to what you get from parsing the above formula::
 

diff --git a/patsy/build.py b/patsy/build.py
@@ -325,14 +325,14 @@ def test__ColumnBuilder():
     cb_intercept.build({f1: [1, 2, 3], f2: [1, 2, 3], f3: [1, 2, 3]}, mat3)
     assert np.allclose(mat3, 1)
 
-def _factors_memorize(factors, data_iter_maker):
+def _factors_memorize(factors, data_iter_maker, eval_env):
     # First, start off the memorization process by setting up each factor's
     # state and finding out how many passes it will need:
     factor_states = {}
     passes_needed = {}
     for factor in factors:
         state = {}
-        which_pass = factor.memorize_passes_needed(state)
+        which_pass = factor.memorize_passes_needed(state, eval_env)
         factor_states[factor] = state
         passes_needed[factor] = which_pass
     # Now, cycle through the data until all the factors have finished
@@ -362,7 +362,7 @@ def __init__(self, requested_passes, token):
             self._chunk_in_pass = 0
             self._seen_passes = 0
 
-        def memorize_passes_needed(self, state):
+        def memorize_passes_needed(self, state, eval_env):
             state["calls"] = []
             state["token"] = self._token
             return self._requested_passes
@@ -389,7 +389,7 @@ def __call__(self):
     f1 = MockFactor(1, "f1")
     f2a = MockFactor(2, "f2a")
     f2b = MockFactor(2, "f2b")
-    factor_states = _factors_memorize(set([f0, f1, f2a, f2b]), data)
+    factor_states = _factors_memorize(set([f0, f1, f2a, f2b]), data, {})
     assert data.calls == 2
     mem_chunks0 = [("memorize_chunk", 0)] * data.CHUNKS
     mem_chunks1 = [("memorize_chunk", 1)] * data.CHUNKS
@@ -615,7 +615,7 @@ def _make_term_column_builders(terms,
             term_to_column_builders[term] = column_builders
     return new_term_order, term_to_column_builders
 
-def design_matrix_builders(termlists, data_iter_maker, NA_action="drop"):
+def design_matrix_builders(termlists, data_iter_maker, eval_env, NA_action="drop"):
     """Construct several :class:`DesignMatrixBuilders` from termlists.
 
     This is one of Patsy's fundamental functions. This function and
@@ -629,6 +629,14 @@ def design_matrix_builders(termlists, data_iter_maker, NA_action="drop"):
       simple iterator because sufficiently complex formulas may require
       multiple passes over the data (e.g. if there are nested stateful
       transforms).
+    :arg eval_env: Either an :class:`EvalEnvironment` which will be used to
+      look up any variables referenced in `termlists` that cannot be
+      found in `data_iter_maker`, or else a depth represented as an
+      integer which will be passed to :meth:`EvalEnvironment.capture`.
+      ``eval_env=0`` means to use the context of the function calling
+      :func:`design_matrix_builders` for lookups. If calling this function
+      from a library, you probably want ``eval_env=1``, which means that
+      variables should be resolved in *your* caller's namespace.
     :arg NA_action: An :class:`NAAction` object or string, used to determine
       what values count as 'missing' for purposes of determining the levels of
       categorical factors.
@@ -643,14 +651,25 @@ def design_matrix_builders(termlists, data_iter_maker, NA_action="drop"):
 
     .. versionadded:: 0.2.0
        The ``NA_action`` argument.
+    .. versionadded:: 0.4.0
+       The ``eval_env`` argument.
     """
+    # Check type of eval_env to help people migrating to 0.4.0. Third
+    # argument used to be NA_action (a string). Having the check for
+    # eval_env's type gives people migrating to 0.4.0 who used NA_action
+    # not as a keyword argument a nice error message here, instead of a
+    # more obscure backtrace later on.
+    if not isinstance(eval_env, six.integer_types + (EvalEnvironment,)):
+        raise TypeError("Parameter 'eval_env' must be either an integer or an instance "
+                        "of patsy.EvalEnvironment.")
+    eval_env = EvalEnvironment.capture(eval_env, reference=1)
     if isinstance(NA_action, str):
         NA_action = NAAction(NA_action)
     all_factors = set()
     for termlist in termlists:
         for term in termlist:
             all_factors.update(term.factors)
-    factor_states = _factors_memorize(all_factors, data_iter_maker)
+    factor_states = _factors_memorize(all_factors, data_iter_maker, eval_env)
     # Now all the factors have working eval methods, so we can evaluate them
     # on some data to find out what type of data they return.
     (num_column_counts,

diff --git a/patsy/desc.py b/patsy/desc.py
@@ -191,8 +191,8 @@ def test_ModelDesc_from_formula():
     for input in ("y ~ x", parse_formula("y ~ x")):
         eval_env = EvalEnvironment.capture(0)
         md = ModelDesc.from_formula(input, eval_env)
-        assert md.lhs_termlist == [Term([EvalFactor("y", eval_env)]),]
-        assert md.rhs_termlist == [INTERCEPT, Term([EvalFactor("x", eval_env)])]
+        assert md.lhs_termlist == [Term([EvalFactor("y")]),]
+        assert md.rhs_termlist == [INTERCEPT, Term([EvalFactor("x")])]
 
 class IntermediateExpr(object):
     "This class holds an intermediate result while we're evaluating a tree."
@@ -356,8 +356,7 @@ def _eval_number(evaluator, tree):
                         "only allowed with **", tree)
 
 def _eval_python_expr(evaluator, tree):
-    factor = EvalFactor(tree.token.extra, evaluator._factor_eval_env,
-                        origin=tree.origin)
+    factor = EvalFactor(tree.token.extra, origin=tree.origin)
     return IntermediateExpr(False, None, False, [Term([factor])])
 
 class Evaluator(object):
@@ -585,16 +584,15 @@ def eval(self, tree, require_evalexpr=True):
     "a + <-a**2>",
 ]
 
-def _assert_terms_match(terms, expected_intercept, expecteds, eval_env): # pragma: no cover
+def _assert_terms_match(terms, expected_intercept, expecteds): # pragma: no cover
     if expected_intercept:
         expecteds = [()] + expecteds
     assert len(terms) == len(expecteds)
     for term, expected in zip(terms, expecteds):
         if isinstance(term, Term):
             if isinstance(expected, str):
                 expected = (expected,)
-            assert term.factors == tuple([EvalFactor(s, eval_env)
-                                          for s in expected])
+            assert term.factors == tuple([EvalFactor(s) for s in expected])
         else:
             assert term == expected
 
@@ -609,11 +607,9 @@ def _do_eval_formula_tests(tests): # pragma: no cover
         print(model_desc)
         lhs_intercept, lhs_termlist, rhs_intercept, rhs_termlist = result
         _assert_terms_match(model_desc.lhs_termlist,
-                            lhs_intercept, lhs_termlist,
-                            eval_env)
+                            lhs_intercept, lhs_termlist)
         _assert_terms_match(model_desc.rhs_termlist,
-                            rhs_intercept, rhs_termlist,
-                            eval_env)
+                            rhs_intercept, rhs_termlist)
 
 def test_eval_formula():
     _do_eval_formula_tests(_eval_tests)