diff --git a/.travis.yml b/.travis.yml index c29e0354..9a921766 100644 --- a/.travis.yml +++ b/.travis.yml @@ -25,7 +25,7 @@ before_install: - # Setup Python environment with BLAS libraries - wget -q http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh - chmod +x miniconda.sh - - ./miniconda.sh -b + - ./miniconda.sh -b -p $HOME/miniconda - export PATH=$HOME/miniconda/bin:$PATH - conda update -q --yes conda - export FUEL_DATA_PATH=$TRAVIS_BUILD_DIR/data @@ -54,6 +54,7 @@ script: cd $TRAVIS_BUILD_DIR git clone https://github.com/mila-udem/blocks-examples.git cd blocks-examples + git checkout e0d7a0e5b60e802634161a63602673717c3e3c78 nose2 tests fi after_script: diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index d010396d..075e6ecd 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -1,3 +1,26 @@ +Filing an issue +=============== +If you are having a problem, then *before* filing an issue, please verify +the following: + +* That you are using a **compatible version of Python** -- this means version + 3.4 or newer for mainline Python. Legacy Python support is limited to 2.7 and + will eventually be dropped, and not all features may be available; users are + encouraged to move to Python 3.x as soon as possible. +* That you are using **the latest version of Theano** from the GitHub ``master`` + branch. Blocks is developed concurrently with Theano's bleeding edge development + and many problems with using Blocks can be traced to using the latest stable + version of Theano (or an insufficiently recent GitHub checkout). Please see the + `Blocks installation instructions`_ for more details. +* You are using the latest Blocks (and Fuel_) from the GitHub ``master`` + branch. If you are using ``stable``, then if possible, please check if your + problem persists if you switch to using ``master``. It may still be worth + filing the issue if your problem is fixed in ``master``, if it is a serious + enough problem to warrant backporting a fix to ``stable``. +* That your issue is about the software itself -- either a bug report, feature + request, question on how to accomplish a certain defined operation within + Blocks, etc. -- and not a general machine learning or neural networks question. + Making a pull request ===================== @@ -49,6 +72,8 @@ mailing list and the GitHub issues to make sure the answer isn't out there already. .. _Blocks users mailing list: https://groups.google.com/forum/#!forum/blocks-users +.. _Blocks installation instructions: https://blocks.readthedocs.org/en/latest/setup.html +.. _Fuel: http://fuel.readthedocs.org/ .. _quick reference: https://blocks.readthedocs.org/en/latest/development/pull_request.html .. _the documentation: https://blocks.readthedocs.org/en/latest/development/index.html#formatting-guidelines .. 
_coding guidelines: https://blocks.readthedocs.org/en/latest/development/index.html#code-guidelines diff --git a/blocks/__init__.py b/blocks/__init__.py index 90230562..002a8b97 100644 --- a/blocks/__init__.py +++ b/blocks/__init__.py @@ -1,5 +1,3 @@ """The blocks library for parametrized Theano ops.""" -# Scary warning: Adding code to this file can break namespace packages -# See https://pythonhosted.org/setuptools/setuptools.html#namespace-packages -__import__("pkg_resources").declare_namespace(__name__) -__version__ = '0.1.1' +import blocks.version +__version__ = blocks.version.version diff --git a/blocks/algorithms/__init__.py b/blocks/algorithms/__init__.py index 45e9cae7..d3032b63 100644 --- a/blocks/algorithms/__init__.py +++ b/blocks/algorithms/__init__.py @@ -14,7 +14,8 @@ from blocks.graph import ComputationGraph from blocks.roles import add_role, ALGORITHM_HYPERPARAMETER, ALGORITHM_BUFFER from blocks.theano_expressions import l2_norm -from blocks.utils import dict_subset, pack, shared_floatx +from blocks.utils import (dict_subset, pack, shared_floatx, + shared_floatx_zeros_matching) logger = logging.getLogger(__name__) @@ -420,7 +421,7 @@ def __init__(self, momentum=0.): add_role(self.momentum, ALGORITHM_HYPERPARAMETER) def compute_step(self, parameter, previous_step): - velocity = shared_floatx(parameter.get_value() * 0., "velocity") + velocity = shared_floatx_zeros_matching(parameter, "velocity") add_role(velocity, ALGORITHM_BUFFER) step = self.momentum * velocity + previous_step updates = [(velocity, step)] @@ -487,11 +488,11 @@ def __init__(self, decay_rate=0.95, epsilon=1e-6): add_role(self.epsilon, ALGORITHM_HYPERPARAMETER) def compute_step(self, parameter, previous_step): - mean_square_step_tm1 = shared_floatx(parameter.get_value() * 0., - "mean_square_step_tm1") + mean_square_step_tm1 = shared_floatx_zeros_matching( + parameter, "mean_square_step_tm1") add_role(mean_square_step_tm1, ALGORITHM_BUFFER) - mean_square_delta_x_tm1 = shared_floatx(parameter.get_value() * 0., - "mean_square_delta_x_tm1") + mean_square_delta_x_tm1 = shared_floatx_zeros_matching( + parameter, "mean_square_delta_x_tm1") add_role(mean_square_delta_x_tm1, ALGORITHM_BUFFER) mean_square_step_t = ( @@ -550,8 +551,8 @@ def __init__(self, decay_rate=0.9, max_scaling=1e5): self.epsilon = 1. 
/ max_scaling def compute_step(self, parameter, previous_step): - mean_square_step_tm1 = shared_floatx(parameter.get_value() * 0., - "mean_square_step_tm1") + mean_square_step_tm1 = shared_floatx_zeros_matching( + parameter, "mean_square_step_tm1") add_role(mean_square_step_tm1, ALGORITHM_BUFFER) mean_square_step_t = ( self.decay_rate * mean_square_step_tm1 + @@ -742,15 +743,16 @@ class AdaGrad(StepRule): """ def __init__(self, learning_rate=0.002, epsilon=1e-6): - self.learning_rate = learning_rate - self.epsilon = epsilon + self.learning_rate = shared_floatx(learning_rate, "learning_rate") + self.epsilon = shared_floatx(epsilon, "epsilon") + add_role(self.learning_rate, ALGORITHM_HYPERPARAMETER) + add_role(self.epsilon, ALGORITHM_HYPERPARAMETER) def compute_step(self, parameter, previous_step): name = 'adagrad_sqs' if parameter.name: name += '_' + parameter.name - ssq = shared_floatx(parameter.get_value() * 0., - name=name) + ssq = shared_floatx_zeros_matching(parameter, name=name) add_role(ssq, ALGORITHM_BUFFER) ssq_t = (tensor.sqr(previous_step) + ssq) @@ -789,16 +791,19 @@ class Adam(StepRule): def __init__(self, learning_rate=0.002, beta1=0.1, beta2=0.001, epsilon=1e-8, decay_factor=(1 - 1e-8)): - self.learning_rate = learning_rate - self.beta1 = beta1 - self.beta2 = beta2 - self.epsilon = epsilon - self.decay_factor = decay_factor + self.learning_rate = shared_floatx(learning_rate, "learning_rate") + self.beta1 = shared_floatx(beta1, "beta1") + self.beta2 = shared_floatx(beta2, "beta2") + self.epsilon = shared_floatx(epsilon, "epsilon") + self.decay_factor = shared_floatx(decay_factor, "decay_factor") + for param in [self.learning_rate, self.beta1, self.beta2, self.epsilon, + self.decay_factor]: + add_role(param, ALGORITHM_HYPERPARAMETER) def compute_step(self, parameter, previous_step): - mean = shared_floatx(parameter.get_value() * 0., 'mean') + mean = shared_floatx_zeros_matching(parameter, 'mean') add_role(mean, ALGORITHM_BUFFER) - variance = shared_floatx(parameter.get_value() * 0., 'variance') + variance = shared_floatx_zeros_matching(parameter, 'variance') add_role(variance, ALGORITHM_BUFFER) time = shared_floatx(0., 'time') add_role(time, ALGORITHM_BUFFER) diff --git a/blocks/bricks/__init__.py b/blocks/bricks/__init__.py index f626f725..4482f03d 100644 --- a/blocks/bricks/__init__.py +++ b/blocks/bricks/__init__.py @@ -1,745 +1,18 @@ -"""The interface of bricks and some simple implementations.""" -import logging - -import numpy -from six import add_metaclass -from theano import tensor -from theano.sandbox.rng_mrg import MRG_RandomStreams -from toolz import interleave -from picklable_itertools.extras import equizip - -from blocks.config import config -from blocks.bricks.base import application, _Brick, Brick, lazy -from blocks.bricks.wrappers import WithExtraDims -from blocks.roles import add_role, WEIGHT, BIAS -from blocks.utils import pack, shared_floatx_nans - -logger = logging.getLogger(__name__) - - -class Random(Brick): - """A mixin class for Bricks which need Theano RNGs. - - Parameters - ---------- - theano_seed : int or list, optional - Seed to use for a - :class:`~theano.sandbox.rng_mrg.MRG_RandomStreams` object. 
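The step-rule changes above turn plain Python floats into Theano shared variables tagged with ALGORITHM_HYPERPARAMETER (and use shared_floatx_zeros_matching for the zero-initialised buffers). A minimal sketch of what the shared hyperparameters allow, assuming only the Adam interface shown above::

    import numpy
    import theano
    from blocks.algorithms import Adam

    adam = Adam(learning_rate=0.002)
    # learning_rate is now a shared variable, so it can be read back ...
    current = adam.learning_rate.get_value()
    # ... and updated in place, e.g. to anneal it between epochs,
    # without recompiling the training function.
    adam.learning_rate.set_value(numpy.cast[theano.config.floatX](current / 2))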
- - """ - seed_rng = numpy.random.RandomState(config.default_seed) - - def __init__(self, theano_seed=None, **kwargs): - super(Random, self).__init__(**kwargs) - self.theano_seed = theano_seed - - @property - def theano_seed(self): - if getattr(self, '_theano_seed', None) is not None: - return self._theano_seed - else: - self._theano_seed = self.seed_rng.randint( - numpy.iinfo(numpy.int32).max) - return self._theano_seed - - @theano_seed.setter - def theano_seed(self, value): - if hasattr(self, '_theano_seed'): - raise AttributeError("seed already set") - self._theano_seed = value - - @property - def theano_rng(self): - """Returns Brick's Theano RNG, or a default one. - - The default seed can be set through ``blocks.config``. - - """ - if not hasattr(self, '_theano_rng'): - self._theano_rng = MRG_RandomStreams(self.theano_seed) - return self._theano_rng - - @theano_rng.setter - def theano_rng(self, theano_rng): - self._theano_rng = theano_rng - - -class Initializable(Brick): - """Base class for bricks which push parameter initialization. - - Many bricks will initialize children which perform a linear - transformation, often with biases. This brick allows the weights - and biases initialization to be configured in the parent brick and - pushed down the hierarchy. - - Parameters - ---------- - weights_init : object - A `NdarrayInitialization` instance which will be used by to - initialize the weight matrix. Required by - :meth:`~.Brick.initialize`. - biases_init : :obj:`object`, optional - A `NdarrayInitialization` instance that will be used to initialize - the biases. Required by :meth:`~.Brick.initialize` when `use_bias` - is `True`. Only supported by bricks for which :attr:`has_biases` is - ``True``. - use_bias : :obj:`bool`, optional - Whether to use a bias. Defaults to `True`. Required by - :meth:`~.Brick.initialize`. Only supported by bricks for which - :attr:`has_biases` is ``True``. - rng : :class:`numpy.random.RandomState` - - Attributes - ---------- - has_biases : bool - ``False`` if the brick does not support biases, and only has - :attr:`weights_init`. For an example of this, see - :class:`.Bidirectional`. If this is ``False``, the brick does not - support the arguments ``biases_init`` or ``use_bias``. 
- - """ - has_biases = True - seed_rng = numpy.random.RandomState(config.default_seed) - - @lazy() - def __init__(self, weights_init=None, biases_init=None, use_bias=True, - seed=None, **kwargs): - super(Initializable, self).__init__(**kwargs) - self.weights_init = weights_init - if self.has_biases: - self.biases_init = biases_init - elif biases_init is not None or not use_bias: - raise ValueError("This brick does not support biases config") - self.use_bias = use_bias - self.seed = seed - - @property - def seed(self): - if getattr(self, '_seed', None) is not None: - return self._seed - else: - self._seed = self.seed_rng.randint( - numpy.iinfo(numpy.int32).max) - return self._seed - - @seed.setter - def seed(self, value): - if hasattr(self, '_seed'): - raise AttributeError("seed already set") - self._seed = value - - @property - def rng(self): - if getattr(self, '_rng', None) is not None: - return self._rng - else: - self._rng = numpy.random.RandomState(self.seed) - return self._rng - - @rng.setter - def rng(self, rng): - self._rng = rng - - def _push_initialization_config(self): - for child in self.children: - if isinstance(child, Initializable): - child.rng = self.rng - if self.weights_init: - child.weights_init = self.weights_init - if hasattr(self, 'biases_init') and self.biases_init: - for child in self.children: - if (isinstance(child, Initializable) and - hasattr(child, 'biases_init')): - child.biases_init = self.biases_init - - -class Feedforward(Brick): - """Declares an interface for bricks with one input and one output. - - Many bricks have just one input and just one output (activations, - :class:`Linear`, :class:`MLP`). To make such bricks interchangable - in most contexts they should share an interface for configuring - their input and output dimensions. This brick declares such an - interface. - - Attributes - ---------- - input_dim : int - The input dimension of the brick. - output_dim : int - The output dimension of the brick. - - """ - def __getattr__(self, name): - message = ("'{}' object does not have an attribute '{}'" - .format(self.__class__.__name__, name)) - if name in ('input_dim', 'output_dim'): - message += (" (which is a part of 'Feedforward' interface it" - " claims to support)") - raise AttributeError(message) - - -class Linear(Initializable, Feedforward): - r"""A linear transformation with optional bias. - - Brick which applies a linear (affine) transformation by multiplying - the input with a weight matrix. By default, a bias term is added - (see :class:`Initializable` for information on disabling this). - - Parameters - ---------- - input_dim : int - The dimension of the input. Required by :meth:`~.Brick.allocate`. - output_dim : int - The dimension of the output. Required by :meth:`~.Brick.allocate`. - - Notes - ----- - See :class:`Initializable` for initialization parameters. - - A linear transformation with bias is a matrix multiplication followed - by a vector summation. - - .. 
math:: f(\mathbf{x}) = \mathbf{W}\mathbf{x} + \mathbf{b} - - """ - @lazy(allocation=['input_dim', 'output_dim']) - def __init__(self, input_dim, output_dim, **kwargs): - super(Linear, self).__init__(**kwargs) - self.input_dim = input_dim - self.output_dim = output_dim - - @property - def W(self): - return self.parameters[0] - - @property - def b(self): - return self.parameters[1] - - def _allocate(self): - W = shared_floatx_nans((self.input_dim, self.output_dim), name='W') - add_role(W, WEIGHT) - self.parameters.append(W) - self.add_auxiliary_variable(W.norm(2), name='W_norm') - if self.use_bias: - b = shared_floatx_nans((self.output_dim,), name='b') - add_role(b, BIAS) - self.parameters.append(b) - self.add_auxiliary_variable(b.norm(2), name='b_norm') - - def _initialize(self): - if self.use_bias: - W, b = self.parameters - self.biases_init.initialize(b, self.rng) - else: - W, = self.parameters - self.weights_init.initialize(W, self.rng) - - @application(inputs=['input_'], outputs=['output']) - def apply(self, input_): - """Apply the linear transformation. - - Parameters - ---------- - input_ : :class:`~tensor.TensorVariable` - The input on which to apply the transformation - - Returns - ------- - output : :class:`~tensor.TensorVariable` - The transformed input plus optional bias - - """ - if self.use_bias: - W, b = self.parameters - else: - W, = self.parameters - output = tensor.dot(input_, W) - if self.use_bias: - output += b - return output - - def get_dim(self, name): - if name == 'input_': - return self.input_dim - if name == 'output': - return self.output_dim - super(Linear, self).get_dim(name) - - -class Bias(Feedforward, Initializable): - """Add a bias (i.e. sum with a vector).""" - @lazy(allocation=['dim']) - def __init__(self, dim, **kwargs): - super(Bias, self).__init__(**kwargs) - self.dim = dim - - def _allocate(self): - b = shared_floatx_nans((self.output_dim,), name='b') - add_role(b, BIAS) - self.parameters.append(b) - - def _initialize(self): - b, = self.parameters - self.biases_init.initialize(b, self.rng) - - @application(inputs=['input_'], outputs=['output']) - def apply(self, input_): - """Apply the linear transformation. - - Parameters - ---------- - input_ : :class:`~tensor.TensorVariable` - The input on which to apply the transformation - - Returns - ------- - output : :class:`~tensor.TensorVariable` - The transformed input plus optional bias - - """ - b, = self.parameters - return input_ + b - - def get_dim(self, name): - if name in ['input_', 'output']: - return self.dim - super(Bias, self).get_dim(name) - - def _get_dim(self): - return self.dim - - def _set_dim(self, value): - self.dim = value - - input_dim = output_dim = property(_get_dim, _set_dim) - - -class Maxout(Brick): - """Maxout pooling transformation. - - A brick that does max pooling over groups of input units. If you use - this code in a research project, please cite [GWFM13]_. - - .. [GWFM13] Ian J. Goodfellow, David Warde-Farley, Mehdi Mirza, Aaron - Courville, and Yoshua Bengio, *Maxout networks*, ICML (2013), pp. - 1319-1327. - - Parameters - ---------- - num_pieces : int - The size of the groups the maximum is taken over. - - Notes - ----- - Maxout applies a set of linear transformations to a vector and selects - for each output dimension the result with the highest value. 
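A small usage sketch of the Maxout behaviour described above (groups of ``num_pieces`` consecutive inputs are reduced to their maximum; the data here is illustrative)::

    import numpy
    import theano
    from theano import tensor
    from blocks.bricks import Maxout

    x = tensor.matrix('x')
    maxout = Maxout(num_pieces=2)        # every pair of inputs -> one output
    f = theano.function([x], maxout.apply(x))
    data = numpy.arange(8, dtype=theano.config.floatX).reshape(2, 4)
    print(f(data))                       # [[1. 3.] [5. 7.]]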
- - """ - @lazy(allocation=['num_pieces']) - def __init__(self, num_pieces, **kwargs): - super(Maxout, self).__init__(**kwargs) - self.num_pieces = num_pieces - - @application(inputs=['input_'], outputs=['output']) - def apply(self, input_): - """Apply the maxout transformation. - - Parameters - ---------- - input_ : :class:`~tensor.TensorVariable` - The input on which to apply the transformation - - Returns - ------- - output : :class:`~tensor.TensorVariable` - The transformed input - - """ - last_dim = input_.shape[-1] - output_dim = last_dim // self.num_pieces - new_shape = ([input_.shape[i] for i in range(input_.ndim - 1)] + - [output_dim, self.num_pieces]) - output = tensor.max(input_.reshape(new_shape, ndim=input_.ndim + 1), - axis=input_.ndim) - return output - - -class LinearMaxout(Initializable, Feedforward): - """Maxout pooling following a linear transformation. - - This code combines the :class:`Linear` brick with a :class:`Maxout` - brick. - - Parameters - ---------- - input_dim : int - The dimension of the input. Required by :meth:`~.Brick.allocate`. - output_dim : int - The dimension of the output. Required by :meth:`~.Brick.allocate`. - num_pieces : int - The number of linear functions. Required by - :meth:`~.Brick.allocate`. - - Notes - ----- - See :class:`Initializable` for initialization parameters. - - """ - @lazy(allocation=['input_dim', 'output_dim', 'num_pieces']) - def __init__(self, input_dim, output_dim, num_pieces, **kwargs): - super(LinearMaxout, self).__init__(**kwargs) - self.linear = Linear() - self.maxout = Maxout() - self.children = [self.linear, - self.maxout] - - self.input_dim = input_dim - self.output_dim = output_dim - self.num_pieces = num_pieces - - @property - def input_dim(self): - return self.linear.input_dim - - @input_dim.setter - def input_dim(self, value): - self.linear.input_dim = value - - def _push_allocation_config(self): - self.linear.output_dim = self.output_dim * self.num_pieces - self.maxout.num_pieces = self.num_pieces - - @application(inputs=['input_'], outputs=['output']) - def apply(self, input_): - """Apply the linear transformation followed by maxout. - - Parameters - ---------- - input_ : :class:`~tensor.TensorVariable` - The input on which to apply the transformations - - Returns - ------- - output : :class:`~tensor.TensorVariable` - The transformed input - - """ - pre_activation = self.linear.apply(input_) - output = self.maxout.apply(pre_activation) - return output - - -class ActivationDocumentation(_Brick): - """Dynamically adds documentation to activations. - - Notes - ----- - See http://bugs.python.org/issue12773. - - """ - def __new__(cls, name, bases, classdict): - classdict['__doc__'] = \ - """Elementwise application of {0} function.""".format(name.lower()) - if 'apply' in classdict: - classdict['apply'].__doc__ = \ - """Apply the {0} function element-wise. - - Parameters - ---------- - input_ : :class:`~tensor.TensorVariable` - Theano variable to apply {0} to, element-wise. - - Returns - ------- - output : :class:`~tensor.TensorVariable` - The input with the activation function applied. - - """.format(name.lower()) - return super(ActivationDocumentation, cls).__new__(cls, name, bases, - classdict) - - -@add_metaclass(ActivationDocumentation) -class Activation(Brick): - """A base class for simple, element-wise activation functions. - - This base class ensures that activation functions are automatically - documented using the :class:`ActivationDocumentation` metaclass. 
- - """ - pass - - -class Identity(Activation): - @application(inputs=['input_'], outputs=['output']) - def apply(self, input_): - return input_ - - -class Tanh(Activation): - @application(inputs=['input_'], outputs=['output']) - def apply(self, input_): - return tensor.tanh(input_) - - -class Logistic(Activation): - @application(inputs=['input_'], outputs=['output']) - def apply(self, input_): - return tensor.nnet.sigmoid(input_) - - -class Softplus(Activation): - r""" Softplus brick. - - The softplus is defined as :math:`\zeta(x) = \log(1+e^x)`. - - .. Dugas, C., Bengio, Y., Belisle, F., Nadeau, C., and Garcia, - R. (2001). Incorporating second-order functional knowledge - for better option pricing. In NIPS 13 . MIT Press. - - """ - @application(inputs=['input_'], outputs=['output']) - def apply(self, input_): - return tensor.nnet.softplus(input_) - - -class Rectifier(Activation): - @application(inputs=['input_'], outputs=['output']) - def apply(self, input_): - return tensor.switch(input_ > 0, input_, 0) - - -class Softmax(Brick): - """A softmax brick. - - Works with 2-dimensional inputs only. If you need more, - see :class:`NDimensionalSoftmax`. - - """ - @application(inputs=['input_'], outputs=['output']) - def apply(self, input_): - """Standard softmax. - - Parameters - ---------- - input_ : :class:`~theano.Variable` - A matrix, each row contains unnormalized log-probabilities of a - distribution. - - Returns - ------- - output_ : :class:`~theano.Variable` - A matrix with probabilities in each row for each distribution - from `input_`. - - """ - return tensor.nnet.softmax(input_) - - @application(inputs=['input_'], outputs=['output']) - def log_probabilities(self, input_): - """Normalize log-probabilities. - - Converts unnormalized log-probabilities (exponents of which do not - sum to one) into actual log-probabilities (exponents of which sum - to one). - - Parameters - ---------- - input_ : :class:`~theano.Variable` - A matrix, each row contains unnormalized log-probabilities of a - distribution. - - Returns - ------- - output : :class:`~theano.Variable` - A matrix with normalized log-probabilities in each row for each - distribution from `input_`. - - """ - shifted = input_ - input_.max(axis=1, keepdims=True) - return shifted - tensor.log( - tensor.exp(shifted).sum(axis=1, keepdims=True)) - - @application(inputs=['y', 'x'], outputs=['output']) - def categorical_cross_entropy(self, application_call, y, x): - """Computationally stable cross-entropy for pre-softmax values. - - Parameters - ---------- - y : :class:`~tensor.TensorVariable` - In the case of a matrix argument, each row represents a - probabilility distribution. In the vector case, each element - represents a distribution by specifying the position of 1 in a - 1-hot vector. - x : :class:`~tensor.TensorVariable` - A matrix, each row contains unnormalized probabilities of a - distribution. - - Returns - ------- - cost : :class:`~tensor.TensorVariable` - A vector of cross-entropies between respective distributions - from y and x. - - """ - x = self.log_probabilities(x) - application_call.add_auxiliary_variable( - x.copy(name='log_probabilities')) - if y.ndim == x.ndim - 1: - indices = tensor.arange(y.shape[0]) * x.shape[1] + y - cost = -x.flatten()[indices] - elif y.ndim == x.ndim: - cost = -(x * y).sum(axis=1) - else: - raise TypeError('rank mismatch between x and y') - return cost - - -class NDimensionalSoftmax(Softmax): - decorators = [WithExtraDims()] - - -class Sequence(Brick): - """A sequence of bricks. 
- - This brick applies a sequence of bricks, assuming that their in- and - outputs are compatible. - - Parameters - ---------- - application_methods : list - List of :class:`.BoundApplication` to apply - - """ - def __init__(self, application_methods, **kwargs): - super(Sequence, self).__init__(**kwargs) - self.application_methods = application_methods - - seen = set() - self.children = [app.brick for app in application_methods - if not (app.brick in seen or seen.add(app.brick))] - - @application - def apply(self, *args): - child_input = args - for application_method in self.application_methods: - output = application_method(*pack(child_input)) - child_input = output - return output - - @apply.property('inputs') - def apply_inputs(self): - return self.application_methods[0].inputs - - @apply.property('outputs') - def apply_outputs(self): - return self.application_methods[-1].outputs - - -class FeedforwardSequence(Sequence, Feedforward): - """A sequence where the first and last bricks are feedforward. - - Parameters - ---------- - application_methods : list - List of :class:`.BoundApplication` to apply. The first and last - application method should belong to a :class:`Feedforward` brick. - - """ - @property - def input_dim(self): - return self.children[0].input_dim - - @input_dim.setter - def input_dim(self, value): - self.children[0].input_dim = value - - @property - def output_dim(self): - return self.children[-1].output_dim - - @output_dim.setter - def output_dim(self, value): - self.children[-1].output_dim = value - - -class MLP(Sequence, Initializable, Feedforward): - """A simple multi-layer perceptron. - - Parameters - ---------- - activations : list of :class:`.Brick`, :class:`.BoundApplication`, - or ``None`` - A list of activations to apply after each linear transformation. - Give ``None`` to not apply any activation. It is assumed that the - application method to use is ``apply``. Required for - :meth:`__init__`. - dims : list of ints - A list of input dimensions, as well as the output dimension of the - last layer. Required for :meth:`~.Brick.allocate`. - - Notes - ----- - See :class:`Initializable` for initialization parameters. - - Note that the ``weights_init``, ``biases_init`` and ``use_bias`` - configurations will overwrite those of the layers each time the - :class:`MLP` is re-initialized. For more fine-grained control, push the - configuration to the child layers manually before initialization. - - >>> from blocks.initialization import IsotropicGaussian, Constant - >>> mlp = MLP(activations=[Tanh(), None], dims=[30, 20, 10], - ... weights_init=IsotropicGaussian(), - ... 
biases_init=Constant(1)) - >>> mlp.push_initialization_config() # Configure children - >>> mlp.children[0].weights_init = IsotropicGaussian(0.1) - >>> mlp.initialize() - - """ - @lazy(allocation=['dims']) - def __init__(self, activations, dims, **kwargs): - self.activations = activations - - self.linear_transformations = [Linear(name='linear_{}'.format(i)) - for i in range(len(activations))] - # Interleave the transformations and activations - application_methods = [] - for entity in interleave([self.linear_transformations, activations]): - if entity is None: - continue - if isinstance(entity, Brick): - application_methods.append(entity.apply) - else: - application_methods.append(entity) - if not dims: - dims = [None] * (len(activations) + 1) - self.dims = dims - super(MLP, self).__init__(application_methods, **kwargs) - - @property - def input_dim(self): - return self.dims[0] - - @input_dim.setter - def input_dim(self, value): - self.dims[0] = value - - @property - def output_dim(self): - return self.dims[-1] - - @output_dim.setter - def output_dim(self, value): - self.dims[-1] = value - - def _push_allocation_config(self): - if not len(self.dims) - 1 == len(self.linear_transformations): - raise ValueError - for input_dim, output_dim, layer in \ - equizip(self.dims[:-1], self.dims[1:], - self.linear_transformations): - layer.input_dim = input_dim - layer.output_dim = output_dim - layer.use_bias = self.use_bias +"""Bricks are parameterized Theano operations.""" +from .base import application, Brick, lazy +from .bn import (BatchNormalization, SpatialBatchNormalization, + BatchNormalizedMLP) +from .interfaces import Activation, Feedforward, Initializable, Random +from .simple import (Linear, Bias, Maxout, LinearMaxout, Identity, Tanh, + Logistic, Softplus, Rectifier, Softmax, + NDimensionalSoftmax) +from .sequences import Sequence, FeedforwardSequence, MLP +from .wrappers import WithExtraDims + +__all__ = ('application', 'Brick', 'lazy', 'BatchNormalization', + 'SpatialBatchNormalization', 'BatchNormalizedMLP', + 'Activation', 'Feedforward', 'Initializable', 'Random', + 'Linear', 'Bias', 'Maxout', 'LinearMaxout', 'Identity', + 'Tanh', 'Logistic', 'Softplus', 'Rectifier', 'Softmax', + 'NDimensionalSoftmax', 'Sequence', 'FeedforwardSequence', + 'MLP', 'WithExtraDims') diff --git a/blocks/bricks/attention.py b/blocks/bricks/attention.py index c1aeb82e..8946ea0a 100644 --- a/blocks/bricks/attention.py +++ b/blocks/bricks/attention.py @@ -415,11 +415,20 @@ def get_dim(self, name): class ShallowEnergyComputer(Sequence, Initializable, Feedforward): - """A simple energy computer: first tanh, then weighted sum.""" + """A simple energy computer: first tanh, then weighted sum. + + Parameters + ---------- + use_bias : bool, optional + Whether a bias should be added to the energies. Does not change + anything if softmax normalization is used to produce the attention + weights, but might be useful when e.g. spherical softmax is used. 
+ + """ @lazy() - def __init__(self, **kwargs): + def __init__(self, use_bias=False, **kwargs): super(ShallowEnergyComputer, self).__init__( - [Tanh().apply, Linear(use_bias=False).apply], **kwargs) + [Tanh().apply, Linear(use_bias=use_bias).apply], **kwargs) @property def input_dim(self): diff --git a/blocks/bricks/base.py b/blocks/bricks/base.py index 46e54430..6bd0d29d 100644 --- a/blocks/bricks/base.py +++ b/blocks/bricks/base.py @@ -326,6 +326,13 @@ def copy_and_tag(variable, role, name): return OrderedDict(zip(bound_application.outputs, outputs)) return unpack(outputs) + # Application instances are used instead of usual methods in bricks. + # The usual methods are not pickled per-se, similarly to classes + # and modules. Instead, a reference to the method is put into the pickle. + # Here, we ensure the same behaviour for Application instances. + def __reduce__(self): + return (getattr, (self.brick, self.application_name)) + class BoundApplication(object): """An application method bound to a :class:`Brick` instance.""" @@ -873,6 +880,7 @@ class ApplicationCall(Annotation): """ def __init__(self, application): self.application = application + self.metadata = {} super(ApplicationCall, self).__init__() def add_auxiliary_variable(self, variable, roles=None, name=None): diff --git a/blocks/bricks/bn.py b/blocks/bricks/bn.py new file mode 100644 index 00000000..4aefff33 --- /dev/null +++ b/blocks/bricks/bn.py @@ -0,0 +1,348 @@ +import collections + +import numpy +from picklable_itertools.extras import equizip +import theano +from theano import tensor +from theano.tensor.nnet import bn + +from ..graph import add_annotation +from ..initialization import Constant +from ..roles import (BATCH_NORM_POPULATION_MEAN, + BATCH_NORM_POPULATION_STDEV, BATCH_NORM_OFFSET, + BATCH_NORM_DIVISOR, BATCH_NORM_MINIBATCH_ESTIMATE, + BATCH_NORM_SHIFT_PARAMETER, BATCH_NORM_SCALE_PARAMETER, + add_role) +from ..utils import (shared_floatx_zeros, shared_floatx, + shared_floatx_nans) +from .base import lazy, application +from .sequences import Sequence, Feedforward, MLP +from .interfaces import RNGMixin + + +def _add_batch_axis(var): + """Prepend a singleton axis to a TensorVariable and name it.""" + new_var = new_var = tensor.shape_padleft(var) + new_var.name = 'shape_padleft({})'.format(var.name) + return new_var + + +def _add_role_and_annotate(var, role, annotations=()): + """Add a role and zero or more annotations to a variable.""" + add_role(var, role) + for annotation in annotations: + add_annotation(var, annotation) + + +class BatchNormalization(RNGMixin, Feedforward): + r"""Normalizes activations, parameterizes a scale and shift. + + Parameters + ---------- + input_dim : int or tuple + Shape of a single input example. It is assumed that a batch axis + will be prepended to this. + broadcastable : tuple, optional + Tuple the same length as `input_dim` which specifies which of the + per-example axes should be averaged over to compute means and + standard deviations. For example, in order to normalize over all + spatial locations in a `(batch_index, channels, height, width)` + image, pass `(False, True, True)`. + conserve_memory : bool, optional + Use an implementation that stores less intermediate state and + therefore uses less memory, at the expense of 5-10% speed. Default + is `True`. + epsilon : float, optional + The stabilizing constant for the minibatch standard deviation + computation (when the brick is run in training mode). 
+ Added to the variance inside the square root, as in the + batch normalization paper. + scale_init : object, optional + Initialization object to use for the learned scaling parameter + ($\\gamma$ in [BN]_). By default, uses constant initialization + of 1. + shift_init : object, optional + Initialization object to use for the learned shift parameter + ($\\beta$ in [BN]_). By default, uses constant initialization of 0. + + Notes + ----- + In order for trained models to behave sensibly immediately upon + upon deserialization, by default, this brick runs in *inference* mode, + using a population mean and population standard deviation (initialized + to zeros and ones respectively) to normalize activations. It is + expected that the user will adapt these during training in some + fashion, independently of the training objective, e.g. by taking a + moving average of minibatch-wise statistics. + + In order to *train* with batch normalization, one must obtain a + training graph by transforming the original inference graph. See + :func:`~blocks.graph.apply_batch_normalization` for a routine to + transform graphs, and :func:`~blocks.graph.batch_normalization` + for a context manager that may enable shorter compile times + (every instance of :class:`BatchNormalization` is itself a context + manager, entry into which causes applications to be in minibatch + "training" mode, however it is usually more convenient to use + :func:`~blocks.graph.batch_normalization` to enable this behaviour + for all of your graph's :class:`BatchNormalization` bricks at once). + + Note that training in inference mode should be avoided, as this + brick introduces scales and shift parameters (tagged with the + `PARAMETER` role) that, in the absence of batch normalization, + usually makes things unstable. If you must do this, filter for and + remove `BATCH_NORM_SHIFT_PARAMETER` and `BATCH_NORM_SCALE_PARAMETER` + from the list of parameters you are training, and this brick should + behave as a (somewhat expensive) no-op. + + This Brick accepts `scale_init` and `shift_init` arguments but is + *not* an instance of :class:`~blocks.bricks.Initializable`, and will + therefore not receive pushed initialization config from any parent + brick. In almost all cases, you will probably want to stick with the + defaults (unit scale and zero offset), but you can explicitly pass one + or both initializers to override this. + + This has the necessary properties to be inserted into a + :class:`blocks.bricks.conv.ConvolutionalSequence` as-is, in which case + the `input_dim` should be omitted at construction, to be inferred from + the layer below. + + """ + @lazy(allocation=['input_dim']) + def __init__(self, input_dim, broadcastable=None, + conserve_memory=True, epsilon=1e-4, scale_init=None, + shift_init=None, **kwargs): + self.input_dim = input_dim + self.broadcastable = broadcastable + self.conserve_memory = conserve_memory + self.epsilon = epsilon + self.scale_init = (Constant(1) if scale_init is None + else scale_init) + self.shift_init = (Constant(0) if shift_init is None + else shift_init) + self._training_mode = [] + super(BatchNormalization, self).__init__(**kwargs) + + @application(inputs=['input_'], outputs=['output']) + def apply(self, input_, application_call): + if self._training_mode: + mean, stdev = self._compute_training_statistics(input_) + else: + mean, stdev = self._prepare_population_statistics() + # Useful for filtration of calls that were already made in + # training mode when doing graph transformations. 
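A minimal sketch of the two modes described in the notes above: the default inference behaviour, and the per-brick context manager that switches applications into minibatch ("training") mode::

    from theano import tensor
    from blocks.bricks import BatchNormalization

    x = tensor.matrix('features')
    brick = BatchNormalization(input_dim=10)
    brick.initialize()

    # Default: normalize with the population mean and standard deviation.
    y_inference = brick.apply(x)

    # Inside the context manager, applications use minibatch statistics;
    # blocks.graph.batch_normalization does the same for many bricks at once.
    with brick:
        y_training = brick.apply(x)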
+ # Very important to cast to bool, as self._training_mode is + # normally a list (to support nested context managers), which would + # otherwise get passed by reference and be remotely mutated. + application_call.metadata['training_mode'] = bool(self._training_mode) + # Useful for retrieving a list of updates for population + # statistics. Ditch the broadcastable first axis, though, to + # make it the same dimensions as the population mean and stdev + # shared variables. + application_call.metadata['offset'] = mean[0] + application_call.metadata['divisor'] = stdev[0] + # Give these quantities roles in the graph. + _add_role_and_annotate(mean, BATCH_NORM_OFFSET, + [self, application_call]) + _add_role_and_annotate(stdev, BATCH_NORM_DIVISOR, + [self, application_call]) + scale = _add_batch_axis(self.scale) + shift = _add_batch_axis(self.shift) + # Heavy lifting is done by the Theano utility function. + normalized = bn.batch_normalization(input_, scale, shift, mean, stdev, + mode=('low_mem' + if self.conserve_memory + else 'high_mem')) + return normalized + + def __enter__(self): + self._training_mode.append(True) + + def __exit__(self, *exc_info): + self._training_mode.pop() + + def _compute_training_statistics(self, input_): + axes = (0,) + tuple((i + 1) for i, b in + enumerate(self.population_mean.broadcastable) + if b) + mean = input_.mean(axis=axes, keepdims=True) + assert mean.broadcastable[1:] == self.population_mean.broadcastable + stdev = tensor.sqrt(tensor.var(input_, axis=axes, keepdims=True) + + numpy.cast[theano.config.floatX](self.epsilon)) + assert stdev.broadcastable[1:] == self.population_stdev.broadcastable + add_role(mean, BATCH_NORM_MINIBATCH_ESTIMATE) + add_role(stdev, BATCH_NORM_MINIBATCH_ESTIMATE) + return mean, stdev + + def _prepare_population_statistics(self): + mean = _add_batch_axis(self.population_mean) + stdev = _add_batch_axis(self.population_stdev) + return mean, stdev + + def _allocate(self): + input_dim = ((self.input_dim,) + if not isinstance(self.input_dim, collections.Sequence) + else self.input_dim) + broadcastable = (tuple(False for _ in input_dim) + if self.broadcastable is None else self.broadcastable) + if len(input_dim) != len(broadcastable): + raise ValueError("input_dim and broadcastable must be same length") + var_dim = tuple(1 if broadcast else dim for dim, broadcast in + equizip(input_dim, broadcastable)) + broadcastable = broadcastable + + # "gamma", from the Ioffe & Szegedy manuscript. + self.scale = shared_floatx_nans(var_dim, name='batch_norm_scale', + broadcastable=broadcastable) + + # "beta", from the Ioffe & Szegedy manuscript. + self.shift = shared_floatx_nans(var_dim, name='batch_norm_shift', + broadcastable=broadcastable) + add_role(self.scale, BATCH_NORM_SCALE_PARAMETER) + add_role(self.shift, BATCH_NORM_SHIFT_PARAMETER) + self.parameters.append(self.scale) + self.parameters.append(self.shift) + + # These aren't technically parameters, in that they should not be + # learned using the same cost function as other model parameters. + self.population_mean = shared_floatx_zeros(var_dim, + name='population_mean', + broadcastable=broadcastable) + self.population_stdev = shared_floatx(numpy.ones(var_dim), + name='population_stdev', + broadcastable=broadcastable) + add_role(self.population_mean, BATCH_NORM_POPULATION_MEAN) + add_role(self.population_stdev, BATCH_NORM_POPULATION_STDEV) + + # Normally these would get annotated by an AnnotatingList, but they + # aren't in self.parameters. 
+ add_annotation(self.population_mean, self) + add_annotation(self.population_stdev, self) + + def _initialize(self): + self.shift_init.initialize(self.shift, self.rng) + self.scale_init.initialize(self.scale, self.rng) + + # Needed for the Feedforward interface. + @property + def output_dim(self): + return self.input_dim + + # The following properties allow for BatchNormalization bricks + # to be used directly inside of a ConvolutionalSequence. + @property + def image_size(self): + return self.input_dim[-2:] + + @image_size.setter + def image_size(self, value): + if not isinstance(self.input_dim, collections.Sequence): + self.input_dim = (None,) + tuple(value) + else: + self.input_dim = (self.input_dim[0],) + tuple(value) + + @property + def num_channels(self): + return self.input_dim[0] + + @num_channels.setter + def num_channels(self, value): + if not isinstance(self.input_dim, collections.Sequence): + self.input_dim = (value,) + (None, None) + else: + self.input_dim = (value,) + self.input_dim[-2:] + + def get_dim(self, name): + if name in ('input', 'output'): + return self.input_dim + else: + raise KeyError + + @property + def num_output_channels(self): + return self.num_channels + + +class SpatialBatchNormalization(BatchNormalization): + """Convenient subclass for batch normalization across spatial inputs. + + Parameters + ---------- + input_dim : int or tuple + The input size of a single example. Must be length at least 2. + It's assumed that the first axis of this tuple is a "channels" + axis, which should not be summed over, and all remaining + dimensions are spatial dimensions. + + Notes + ----- + See :class:`BatchNormalization` for more details (and additional + keyword arguments). + + """ + def _allocate(self): + if not isinstance(self.input_dim, + collections.Sequence) or len(self.input_dim) < 2: + raise ValueError('expected input_dim to be length >= 2 ' + 'e.g. (channels, height, width)') + self.broadcastable = (False,) + ((True,) * (len(self.input_dim) - 1)) + super(SpatialBatchNormalization, self)._allocate() + + +class BatchNormalizedMLP(MLP): + """Convenient subclass for building an MLP with batch normalization. + + Parameters + ---------- + conserve_memory : bool, optional + See :class:`BatchNormalization`. + + Notes + ----- + All other parameters are the same as :class:`~blocks.bricks.MLP`. Each + activation brick is wrapped in a :class:`~blocks.bricks.Sequence` + containing an appropriate :class:`BatchNormalization` brick and + the activation that follows it. + + By default, the contained :class:`~blocks.bricks.Linear` bricks will + not contain any biases, as they could be canceled out by the biases + in the :class:`BatchNormalization` bricks being added. Pass + `use_bias` with a value of `True` if you really want this for some + reason. + + """ + @lazy(allocation=['dims']) + def __init__(self, activations, dims, *args, **kwargs): + conserve_memory = kwargs.pop('conserve_memory', True) + activations = [ + Sequence([ + BatchNormalization(conserve_memory=conserve_memory).apply, + act.apply + ], name='batch_norm_activation_{}'.format(i)) + for i, act in enumerate(activations) + ] + # Batch normalization bricks incorporate a bias, so there's no + # need for our Linear bricks to have them. 
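For instance, under the wrapping scheme described above (a sketch; the dimensions are arbitrary)::

    from blocks.bricks import BatchNormalizedMLP, Tanh

    mlp = BatchNormalizedMLP([Tanh(), Tanh()], [30, 20, 10])
    mlp.push_allocation_config()
    # Each activation is a Sequence of BatchNormalization followed by the
    # original activation, and the Linear bricks default to use_bias=False.
    print(mlp.linear_transformations[0].use_bias)    # False
    print(mlp.activations[0].children[0].input_dim)  # 20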
+ kwargs.setdefault('use_bias', False) + super(BatchNormalizedMLP, self).__init__(activations, dims, *args, + **kwargs) + + @property + def conserve_memory(self): + return self._conserve_memory + + @conserve_memory.setter + def conserve_memory(self, value): + self._conserve_memory = value + for act in self.activations: + assert isinstance(act.children[0], BatchNormalization) + act.children[0].conserve_memory = value + + def _push_allocation_config(self): + super(BatchNormalizedMLP, self)._push_allocation_config() + # Do the extra allocation pushing for the BatchNormalization + # bricks. They need as their input dimension the output dimension + # of each linear transformation. Exclude the first dimension, + # which is the input dimension. + for act, dim in equizip(self.activations, self.dims[1:]): + assert isinstance(act.children[0], BatchNormalization) + act.children[0].input_dim = dim diff --git a/blocks/bricks/conv.py b/blocks/bricks/conv.py index 33a9728d..f9340c75 100644 --- a/blocks/bricks/conv.py +++ b/blocks/bricks/conv.py @@ -1,7 +1,9 @@ -from theano.tensor.nnet.conv import conv2d, ConvOp -from theano.tensor.signal.downsample import max_pool_2d, DownsampleFactorMax +from theano.tensor.nnet import conv2d +from theano.tensor.nnet.abstract_conv import (AbstractConv2d_gradInputs, + get_conv_output_shape) +from theano.tensor.signal.pool import pool_2d, Pool -from blocks.bricks import Initializable, Feedforward, Sequence +from blocks.bricks import Initializable, Feedforward, Sequence, Activation from blocks.bricks.base import application, Brick, lazy from blocks.roles import add_role, FILTER, BIAS from blocks.utils import shared_floatx_nans @@ -49,15 +51,21 @@ class Convolutional(Initializable): # to leverage features not yet available in Theano's standard conv2d. # The function you override with here should accept at least the # input and the kernels as positionals, and the keyword arguments - # image_shape, subsample, border_mode, and filter_shape. If some of + # input_shape, subsample, border_mode, and filter_shape. If some of # these are unsupported they should still be accepted and ignored, # e.g. with a wrapper function that swallows **kwargs. conv2d_impl = staticmethod(conv2d) # Used to override the output shape computation for a given value of - # conv2d_impl. Should accept 4 positional arguments: the image size, - # the filter size, the step (strides), and the border mode. - get_output_shape = staticmethod(ConvOp.getOutputShape) + # conv2d_impl. Should accept 4 positional arguments: the shape of an + # image minibatch (with 4 elements: batch size, number of channels, + # height, and width), the shape of the filter bank (number of filters, + # number of output channels, filter height, filter width), the border + # mode, and the step (vertical and horizontal strides). It is expected + # to return a 4-tuple of (batch size, number of channels, output + # height, output width). The first element of this tuple is not used + # for anything by this brick. 
+ get_output_shape = staticmethod(get_conv_output_shape) @lazy(allocation=['filter_size', 'num_filters', 'num_channels']) def __init__(self, filter_size, num_filters, num_channels, batch_size=None, @@ -134,14 +142,14 @@ def apply(self, input_): W, = self.parameters if self.image_size == (None, None): - image_shape = None + input_shape = None else: - image_shape = (self.batch_size, self.num_channels) - image_shape += self.image_size + input_shape = (self.batch_size, self.num_channels) + input_shape += self.image_size output = self.conv2d_impl( input_, W, - image_shape=image_shape, + input_shape=input_shape, subsample=self.step, border_mode=self.border_mode, filter_shape=((self.num_filters, self.num_channels) + @@ -157,36 +165,108 @@ def get_dim(self, name): if name == 'input_': return (self.num_channels,) + self.image_size if name == 'output': - return ((self.num_filters,) + - self.get_output_shape(self.image_size, self.filter_size, - self.step, self.border_mode)) + input_shape = (None, self.num_channels) + self.image_size + kernel_shape = ((self.num_filters, self.num_channels) + + self.filter_size) + out_shape = self.get_output_shape(input_shape, kernel_shape, + self.border_mode, self.step) + assert len(out_shape) == 4 + return out_shape[1:] return super(Convolutional, self).get_dim(name) + @property + def num_output_channels(self): + return self.num_filters -class MaxPooling(Initializable, Feedforward): - """Max pooling layer. + +class ConvolutionalTranspose(Convolutional): + """Performs the transpose of a 2D convolution. Parameters ---------- - pooling_size : tuple - The height and width of the pooling region i.e. this is the factor - by which your input's last two dimensions will be downscaled. + original_image_size : tuple + The height and width of the image that forms the output of + the transpose operation, which is the input of the original + (non-transposed) convolution. + num_filters : int + Number of filters at the *output* of the transposed convolution, + i.e. the number of channels in the corresponding convolution. + num_channels : int + Number of channels at the *input* of the transposed convolution, + i.e. the number of output filters in the corresponding + convolution. step : tuple, optional - The vertical and horizontal shift (stride) between pooling regions. - By default this is equal to `pooling_size`. Setting this to a lower - number results in overlapping pooling regions. - input_dim : tuple, optional - A tuple of integers representing the shape of the input. The last - two dimensions will be used to calculate the output dimension. + The step (or stride) of the corresponding *convolution*. + Defaults to (1, 1). + image_size : tuple, optional + Image size of the input to the *transposed* convolution, i.e. + the output of the corresponding convolution. Required for tied + biases. Defaults to ``None``. + + See Also + -------- + :class:`Convolutional` : For the documentation of other parameters. 
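A small sketch of the shape bookkeeping for the transposed convolution described above (the sizes are illustrative)::

    from blocks.bricks.conv import ConvolutionalTranspose

    conv_transpose = ConvolutionalTranspose(
        original_image_size=(16, 16), filter_size=(4, 4),
        num_filters=8, num_channels=32, step=(2, 2))
    # The output shape is fixed by num_filters and original_image_size,
    # i.e. the input shape of the corresponding (non-transposed) convolution.
    print(conv_transpose.get_dim('output'))   # (8, 16, 16)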
""" - @lazy(allocation=['pooling_size']) - def __init__(self, pooling_size, step=None, input_dim=None, **kwargs): - super(MaxPooling, self).__init__(**kwargs) + @lazy(allocation=['original_image_size', 'filter_size', 'num_filters', + 'num_channels']) + def __init__(self, original_image_size, filter_size, num_filters, + num_channels, **kwargs): + super(ConvolutionalTranspose, self).__init__( + filter_size, num_filters, num_channels, **kwargs) + self.original_image_size = original_image_size + + def conv2d_impl(self, input_, W, input_shape, subsample, border_mode, + filter_shape): + # The AbstractConv2d_gradInputs op takes a kernel that was used for the + # **convolution**. We therefore have to invert num_channels and + # num_filters for W. + W = W.transpose(1, 0, 2, 3) + imshp = (None,) + self.get_dim('output') + kshp = (filter_shape[1], filter_shape[0]) + filter_shape[2:] + return AbstractConv2d_gradInputs( + imshp=imshp, kshp=kshp, border_mode=border_mode, + subsample=subsample)(W, input_, self.get_dim('output')[1:]) + + def get_dim(self, name): + if name == 'output': + return (self.num_filters,) + self.original_image_size + return super(ConvolutionalTranspose, self).get_dim(name) + + +class Pooling(Initializable, Feedforward): + """Base Brick for pooling operations. - self.input_dim = input_dim + This should generally not be instantiated directly; see + :class:`MaxPooling`. + + """ + @lazy(allocation=['mode', 'pooling_size']) + def __init__(self, mode, pooling_size, step, input_dim, ignore_border, + padding, **kwargs): + super(Pooling, self).__init__(**kwargs) self.pooling_size = pooling_size + self.mode = mode self.step = step + self.input_dim = input_dim if input_dim is not None else (None,) * 3 + self.ignore_border = ignore_border + self.padding = padding + + @property + def image_size(self): + return self.input_dim[-2:] + + @image_size.setter + def image_size(self, value): + self.input_dim = self.input_dim[:-2] + value + + @property + def num_channels(self): + return self.input_dim[0] + + @num_channels.setter + def num_channels(self, value): + self.input_dim = (value,) + self.input_dim[1:] @application(inputs=['input_'], outputs=['output']) def apply(self, input_): @@ -207,145 +287,124 @@ def apply(self, input_): with the last two dimensions downsampled. """ - output = max_pool_2d(input_, self.pooling_size, st=self.step) + output = pool_2d(input_, self.pooling_size, st=self.step, + mode=self.mode, padding=self.padding, + ignore_border=self.ignore_border) return output def get_dim(self, name): if name == 'input_': return self.input_dim if name == 'output': - return tuple(DownsampleFactorMax.out_shape(self.input_dim, - self.pooling_size, - st=self.step)) + return tuple(Pool.out_shape( + self.input_dim, self.pooling_size, st=self.step, + ignore_border=self.ignore_border, padding=self.padding)) - -class _AllocationMixin(object): - def _push_allocation_config(self): - for attr in ['filter_size', 'num_filters', 'border_mode', - 'batch_size', 'num_channels', 'image_size', - 'tied_biases', 'use_bias']: - setattr(self.convolution, attr, getattr(self, attr)) + @property + def num_output_channels(self): + return self.input_dim[0] -class ConvolutionalActivation(_AllocationMixin, Sequence, Initializable): - """A convolution followed by an activation function. +class MaxPooling(Pooling): + """Max pooling layer. Parameters ---------- - activation : :class:`.BoundApplication` - The application method to apply after convolution (i.e. 
- the nonlinear activation function) + pooling_size : tuple + The height and width of the pooling region i.e. this is the factor + by which your input's last two dimensions will be downscaled. + step : tuple, optional + The vertical and horizontal shift (stride) between pooling regions. + By default this is equal to `pooling_size`. Setting this to a lower + number results in overlapping pooling regions. + input_dim : tuple, optional + A tuple of integers representing the shape of the input. The last + two dimensions will be used to calculate the output dimension. + padding : tuple, optional + A tuple of integers representing the vertical and horizontal + zero-padding to be applied to each of the top and bottom + (vertical) and left and right (horizontal) edges. For example, + an argument of (4, 3) will apply 4 pixels of padding to the + top edge, 4 pixels of padding to the bottom edge, and 3 pixels + each for the left and right edge. By default, no padding is + performed. + ignore_border : bool, optional + Whether or not to do partial downsampling based on borders where + the extent of the pooling region reaches beyond the edge of the + image. If `True`, a (5, 5) image with (2, 2) pooling regions + and (2, 2) step will be downsampled to shape (2, 2), otherwise + it will be downsampled to (3, 3). `True` by default. - See Also - -------- - :class:`Convolutional` : For the documentation of other parameters. + Notes + ----- + .. warning:: + As of this writing, setting `ignore_border` to `False` with a step + not equal to the pooling size will force Theano to perform pooling + computations on CPU rather than GPU, even if you have specified + a GPU as your computation device. Additionally, Theano will only + use [cuDNN]_ (if available) for pooling computations with + `ignure_border` set to `True`. You can ensure that the entire + input is captured by at least one pool by using the `padding` + argument to add zero padding prior to pooling being performed. + + .. [cuDNN]: `NVIDIA cuDNN `_. """ - @lazy(allocation=['filter_size', 'num_filters', 'num_channels']) - def __init__(self, activation, filter_size, num_filters, num_channels, - batch_size=None, image_size=None, step=(1, 1), - border_mode='valid', tied_biases=False, **kwargs): - self.convolution = Convolutional() - - self.filter_size = filter_size - self.num_filters = num_filters - self.num_channels = num_channels - self.batch_size = batch_size - self.image_size = image_size - self.step = step - self.border_mode = border_mode - self.tied_biases = tied_biases - - super(ConvolutionalActivation, self).__init__( - application_methods=[self.convolution.apply, activation], - **kwargs) - - def get_dim(self, name): - # TODO The name of the activation output doesn't need to be `output` - return self.convolution.get_dim(name) - - def _push_allocation_config(self): - super(ConvolutionalActivation, self)._push_allocation_config() - self.convolution.step = self.step - + @lazy(allocation=['pooling_size']) + def __init__(self, pooling_size, step=None, input_dim=None, + ignore_border=True, padding=(0, 0), + **kwargs): + super(MaxPooling, self).__init__('max', pooling_size, + step=step, input_dim=input_dim, + ignore_border=ignore_border, + padding=padding, **kwargs) -class ConvolutionalLayer(_AllocationMixin, Sequence, Initializable): - """A complete convolutional layer: Convolution, nonlinearity, pooling. + def __setstate__(self, state): + self.__dict__.update(state) + # Fix objects created before pull request #899. 
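The (5, 5) example from the docstring above can be checked directly against the brick's shape inference (a sketch; the channel count is arbitrary)::

    from blocks.bricks.conv import MaxPooling

    pool = MaxPooling((2, 2), input_dim=(3, 5, 5))   # ignore_border=True
    print(pool.get_dim('output'))                    # (3, 2, 2)

    pool = MaxPooling((2, 2), input_dim=(3, 5, 5), ignore_border=False)
    print(pool.get_dim('output'))                    # (3, 3, 3)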
+ self.mode = getattr(self, 'mode', 'max') + self.padding = getattr(self, 'padding', (0, 0)) + self.ignore_border = getattr(self, 'ignore_border', False) - .. todo:: - Mean pooling. +class AveragePooling(Pooling): + """Average pooling layer. Parameters ---------- - activation : :class:`.BoundApplication` - The application method to apply in the detector stage (i.e. the - nonlinearity before pooling. Needed for ``__init__``. - - See Also - -------- - :class:`Convolutional` : Documentation of convolution arguments. - :class:`MaxPooling` : Documentation of pooling arguments. + include_padding : bool, optional + When calculating an average, include zeros that are the + result of zero padding added by the `padding` argument. + A value of `True` is only accepted if `ignore_border` + is also `True`. `False` by default. Notes ----- - Uses max pooling. + For documentation on the remainder of the arguments to this + class, see :class:`MaxPooling`. """ - @lazy(allocation=['filter_size', 'num_filters', 'pooling_size', - 'num_channels']) - def __init__(self, activation, filter_size, num_filters, pooling_size, - num_channels, conv_step=(1, 1), pooling_step=None, - batch_size=None, image_size=None, border_mode='valid', - tied_biases=False, **kwargs): - self.convolution = ConvolutionalActivation(activation) - self.pooling = MaxPooling() - super(ConvolutionalLayer, self).__init__( - application_methods=[self.convolution.apply, - self.pooling.apply], **kwargs) - self.convolution.name = self.name + '_convolution' - self.pooling.name = self.name + '_pooling' - - self.filter_size = filter_size - self.num_filters = num_filters - self.num_channels = num_channels - self.pooling_size = pooling_size - self.conv_step = conv_step - self.pooling_step = pooling_step - self.batch_size = batch_size - self.border_mode = border_mode - self.image_size = image_size - self.tied_biases = tied_biases - - def _push_allocation_config(self): - super(ConvolutionalLayer, self)._push_allocation_config() - self.convolution.step = self.conv_step - self.convolution._push_allocation_config() - if self.image_size is not None: - pooling_input_dim = self.convolution.get_dim('output') - else: - pooling_input_dim = None - self.pooling.input_dim = pooling_input_dim - self.pooling.pooling_size = self.pooling_size - self.pooling.step = self.pooling_step - self.pooling.batch_size = self.batch_size - - def get_dim(self, name): - if name == 'input_': - return self.convolution.get_dim('input_') - if name == 'output': - return self.pooling.get_dim('output') - return super(ConvolutionalLayer, self).get_dim(name) + @lazy(allocation=['pooling_size']) + def __init__(self, pooling_size, step=None, input_dim=None, + ignore_border=True, padding=(0, 0), + include_padding=False, **kwargs): + mode = 'average_inc_pad' if include_padding else 'average_exc_pad' + super(AveragePooling, self).__init__(mode, pooling_size, + step=step, input_dim=input_dim, + ignore_border=ignore_border, + padding=padding, **kwargs) class ConvolutionalSequence(Sequence, Initializable, Feedforward): - """A sequence of convolutional operations. + """A sequence of convolutional (or pooling) operations. Parameters ---------- layers : list - List of convolutional bricks (i.e. :class:`ConvolutionalActivation` - or :class:`ConvolutionalLayer`) + List of convolutional bricks (i.e. :class:`Convolutional`, + :class:`ConvolutionalActivation`, or :class:`Pooling` bricks). + :class:`Activation` bricks that operate elementwise can also + be included. 
num_channels : int Number of input channels in the image. For the first layer this is normally 1 for grayscale images and 3 for color (RGB) images. For @@ -403,13 +462,25 @@ def get_dim(self, name): if name == 'input_': return ((self.num_channels,) + self.image_size) if name == 'output': - return self.layers[-1].get_dim(name) + last = len(self.layers) - 1 + while last >= 0: + try: + return self.layers[last].get_dim(name) + except ValueError: + last -= 1 + # The output shape of an empty ConvolutionalSequence or one + # consisting only of Activations is the input shape. + return self.get_dim('input_') return super(ConvolutionalSequence, self).get_dim(name) def _push_allocation_config(self): num_channels = self.num_channels image_size = self.image_size for layer in self.layers: + if isinstance(layer, Activation): + # Activations operate elementwise; nothing to set. + layer.push_allocation_config() + continue if self.border_mode is not None: layer.border_mode = self.border_mode layer.tied_biases = self.tied_biases @@ -419,14 +490,14 @@ def _push_allocation_config(self): layer.use_bias = self.use_bias # Push input dimensions to children - layer._push_allocation_config() + layer.push_allocation_config() # Retrieve output dimensions # and set it for next layer if layer.image_size is not None: output_shape = layer.get_dim('output') image_size = output_shape[1:] - num_channels = layer.num_filters + num_channels = layer.num_output_channels class Flattener(Brick): diff --git a/blocks/bricks/interfaces.py b/blocks/bricks/interfaces.py new file mode 100644 index 00000000..aef73a43 --- /dev/null +++ b/blocks/bricks/interfaces.py @@ -0,0 +1,214 @@ +"""Bricks that are interfaces and/or mixins.""" +import numpy +from six import add_metaclass +from theano.sandbox.rng_mrg import MRG_RandomStreams + +from ..config import config +from .base import _Brick, Brick, lazy + + +class ActivationDocumentation(_Brick): + """Dynamically adds documentation to activations. + + Notes + ----- + See http://bugs.python.org/issue12773. + + """ + def __new__(cls, name, bases, classdict): + classdict['__doc__'] = \ + """Elementwise application of {0} function.""".format(name.lower()) + if 'apply' in classdict: + classdict['apply'].__doc__ = \ + """Apply the {0} function element-wise. + + Parameters + ---------- + input_ : :class:`~tensor.TensorVariable` + Theano variable to apply {0} to, element-wise. + + Returns + ------- + output : :class:`~tensor.TensorVariable` + The input with the activation function applied. + + """.format(name.lower()) + return super(ActivationDocumentation, cls).__new__(cls, name, bases, + classdict) + + +@add_metaclass(ActivationDocumentation) +class Activation(Brick): + """A base class for simple, element-wise activation functions. + + This base class ensures that activation functions are automatically + documented using the :class:`ActivationDocumentation` metaclass. + + """ + pass + + +class Feedforward(Brick): + """Declares an interface for bricks with one input and one output. + + Many bricks have just one input and just one output (activations, + :class:`Linear`, :class:`MLP`). To make such bricks interchangable + in most contexts they should share an interface for configuring + their input and output dimensions. This brick declares such an + interface. + + Attributes + ---------- + input_dim : int + The input dimension of the brick. + output_dim : int + The output dimension of the brick. 
+ + """ + def __getattr__(self, name): + message = ("'{}' object does not have an attribute '{}'" + .format(self.__class__.__name__, name)) + if name in ('input_dim', 'output_dim'): + message += (" (which is a part of 'Feedforward' interface it" + " claims to support)") + raise AttributeError(message) + + +class RNGMixin(object): + """Mixin for initialization random number generators.""" + seed_rng = numpy.random.RandomState(config.default_seed) + + @property + def seed(self): + if getattr(self, '_seed', None) is not None: + return self._seed + else: + self._seed = self.seed_rng.randint( + numpy.iinfo(numpy.int32).max) + return self._seed + + @seed.setter + def seed(self, value): + if hasattr(self, '_seed'): + raise AttributeError("seed already set") + self._seed = value + + @property + def rng(self): + if getattr(self, '_rng', None) is not None: + return self._rng + else: + self._rng = numpy.random.RandomState(self.seed) + return self._rng + + @rng.setter + def rng(self, rng): + self._rng = rng + + +class Initializable(RNGMixin, Brick): + """Base class for bricks which push parameter initialization. + + Many bricks will initialize children which perform a linear + transformation, often with biases. This brick allows the weights + and biases initialization to be configured in the parent brick and + pushed down the hierarchy. + + Parameters + ---------- + weights_init : object + A `NdarrayInitialization` instance which will be used by to + initialize the weight matrix. Required by + :meth:`~.Brick.initialize`. + biases_init : :obj:`object`, optional + A `NdarrayInitialization` instance that will be used to initialize + the biases. Required by :meth:`~.Brick.initialize` when `use_bias` + is `True`. Only supported by bricks for which :attr:`has_biases` is + ``True``. + use_bias : :obj:`bool`, optional + Whether to use a bias. Defaults to `True`. Required by + :meth:`~.Brick.initialize`. Only supported by bricks for which + :attr:`has_biases` is ``True``. + rng : :class:`numpy.random.RandomState` + + Attributes + ---------- + has_biases : bool + ``False`` if the brick does not support biases, and only has + :attr:`weights_init`. For an example of this, see + :class:`.Bidirectional`. If this is ``False``, the brick does not + support the arguments ``biases_init`` or ``use_bias``. + + """ + has_biases = True + + @lazy() + def __init__(self, weights_init=None, biases_init=None, use_bias=True, + seed=None, **kwargs): + super(Initializable, self).__init__(**kwargs) + self.weights_init = weights_init + if self.has_biases: + self.biases_init = biases_init + elif biases_init is not None or not use_bias: + raise ValueError("This brick does not support biases config") + self.use_bias = use_bias + self.seed = seed + + def _push_initialization_config(self): + for child in self.children: + if isinstance(child, Initializable): + child.rng = self.rng + if self.weights_init: + child.weights_init = self.weights_init + if hasattr(self, 'biases_init') and self.biases_init: + for child in self.children: + if (isinstance(child, Initializable) and + hasattr(child, 'biases_init')): + child.biases_init = self.biases_init + + +class Random(Brick): + """A mixin class for Bricks which need Theano RNGs. + + Parameters + ---------- + theano_seed : int or list, optional + Seed to use for a + :class:`~theano.sandbox.rng_mrg.MRG_RandomStreams` object. 
+ + """ + seed_rng = numpy.random.RandomState(config.default_seed) + + def __init__(self, theano_seed=None, **kwargs): + super(Random, self).__init__(**kwargs) + self.theano_seed = theano_seed + + @property + def theano_seed(self): + if getattr(self, '_theano_seed', None) is not None: + return self._theano_seed + else: + self._theano_seed = self.seed_rng.randint( + numpy.iinfo(numpy.int32).max) + return self._theano_seed + + @theano_seed.setter + def theano_seed(self, value): + if hasattr(self, '_theano_seed'): + raise AttributeError("seed already set") + self._theano_seed = value + + @property + def theano_rng(self): + """Returns Brick's Theano RNG, or a default one. + + The default seed can be set through ``blocks.config``. + + """ + if not hasattr(self, '_theano_rng'): + self._theano_rng = MRG_RandomStreams(self.theano_seed) + return self._theano_rng + + @theano_rng.setter + def theano_rng(self, theano_rng): + self._theano_rng = theano_rng diff --git a/blocks/bricks/lookup.py b/blocks/bricks/lookup.py index 175fc0d8..2fd20ba4 100644 --- a/blocks/bricks/lookup.py +++ b/blocks/bricks/lookup.py @@ -1,12 +1,16 @@ """Introduces Lookup brick.""" -from blocks.bricks import Initializable +from blocks.bricks import Initializable, Feedforward from blocks.bricks.base import application, lazy +from blocks.roles import WEIGHT, add_role from blocks.utils import check_theano_variable, shared_floatx_nans -class LookupTable(Initializable): +class LookupTable(Initializable, Feedforward): """Encapsulates representations of a range of integers. + This brick can be used to embed integers, e.g. word indices, + into a vector space. + Parameters ---------- length : int @@ -35,11 +39,12 @@ def W(self): def _allocate(self): self.parameters.append(shared_floatx_nans((self.length, self.dim), name='W')) + add_role(self.parameters[-1], WEIGHT) def _initialize(self): self.weights_init.initialize(self.W, self.rng) - @application + @application(inputs=['indices'], outputs=['output']) def apply(self, indices): """Perform lookup. @@ -61,3 +66,27 @@ def apply(self, indices): output_shape = [indices.shape[i] for i in range(indices.ndim)] + [self.dim] return self.W[indices.flatten()].reshape(output_shape) + + def get_dim(self, name): + if name == 'output': + return self.dim + if name == 'indices': + return 0 + return super(LookupTable, self).get_dim(name) + + @property + def input_dim(self): + return 0 + + @input_dim.setter + def input_dim(self, dim): + if dim != 0: + raise ValueError("LookupTable input must be integer") + + @property + def output_dim(self): + return self.dim + + @output_dim.setter + def output_dim(self, dim): + self.dim = dim diff --git a/blocks/bricks/recurrent.py b/blocks/bricks/recurrent.py index d49df50f..8417afe0 100644 --- a/blocks/bricks/recurrent.py +++ b/blocks/bricks/recurrent.py @@ -434,7 +434,9 @@ def apply(self, inputs, states, cells, mask=None): features * 4). The `inputs` needs to be four times the dimension of the LSTM brick to insure each four gates receive different transformations of the input. See [Grav13]_ - equations 7 to 10 for more details. + equations 7 to 10 for more details. The `inputs` are then split + in this order: Input gates, forget gates, cells and output + gates. mask : :class:`~tensor.TensorVariable` A 1D binary array in the shape (batch,) which is 1 if there is data available, 0 if not. Assumed to be 1-s only if not given. 
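The gate ordering documented above matters when wiring an LSTM into a model: the brick does not project its input itself, so the caller normally adds a transformation to 4 * dim dimensions first. The sketch below is not part of the diff; the names, dimensions and initialization choices are illustrative assumptions, but it shows one conventional way to feed the brick using a Linear transformation:

    from theano import tensor
    from blocks.bricks import Linear, Tanh
    from blocks.bricks.recurrent import LSTM
    from blocks.initialization import Constant, IsotropicGaussian

    feature_dim, lstm_dim = 50, 100
    x = tensor.tensor3('x')  # (time, batch, feature_dim)

    # The LSTM consumes inputs of size 4 * lstm_dim; the four slices are
    # used, in order, for the input gates, forget gates, cells and
    # output gates.
    x_to_lstm = Linear(input_dim=feature_dim, output_dim=4 * lstm_dim,
                       name='x_to_lstm',
                       weights_init=IsotropicGaussian(0.01),
                       biases_init=Constant(0))
    lstm = LSTM(dim=lstm_dim, activation=Tanh(), name='lstm',
                weights_init=IsotropicGaussian(0.01))
    x_to_lstm.initialize()
    lstm.initialize()
    states, cells = lstm.apply(x_to_lstm.apply(x))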
diff --git a/blocks/bricks/sequence_generators.py b/blocks/bricks/sequence_generators.py index d43807d6..0ab1f6c7 100644 --- a/blocks/bricks/sequence_generators.py +++ b/blocks/bricks/sequence_generators.py @@ -280,6 +280,13 @@ def cost_matrix(self, application_call, outputs, mask=None, **kwargs): for name, variable in list(glimpses.items()) + list(states.items()): application_call.add_auxiliary_variable( variable.copy(), name=name) + + # This variables can be used to initialize the initial states of the + # next batch using the last states of the current batch. + for name in self._state_names + self._glimpse_names: + application_call.add_auxiliary_variable( + results[name][-1].copy(), name=name+"_final_value") + return costs @recurrent diff --git a/blocks/bricks/sequences.py b/blocks/bricks/sequences.py new file mode 100644 index 00000000..9218c54b --- /dev/null +++ b/blocks/bricks/sequences.py @@ -0,0 +1,153 @@ +"""Bricks that compose together other bricks in linear sequences.""" +from toolz import interleave +from picklable_itertools.extras import equizip + +from ..utils import pack +from .base import Brick, application, lazy +from .interfaces import Feedforward, Initializable +from .simple import Linear + + +class Sequence(Brick): + """A sequence of bricks. + + This brick applies a sequence of bricks, assuming that their in- and + outputs are compatible. + + Parameters + ---------- + application_methods : list + List of :class:`.BoundApplication` to apply + + """ + def __init__(self, application_methods, **kwargs): + super(Sequence, self).__init__(**kwargs) + self.application_methods = application_methods + + seen = set() + self.children = [app.brick for app in application_methods + if not (app.brick in seen or seen.add(app.brick))] + + @application + def apply(self, *args): + child_input = args + for application_method in self.application_methods: + output = application_method(*pack(child_input)) + child_input = output + return output + + @apply.property('inputs') + def apply_inputs(self): + return self.application_methods[0].inputs + + @apply.property('outputs') + def apply_outputs(self): + return self.application_methods[-1].outputs + + +class FeedforwardSequence(Sequence, Feedforward): + """A sequence where the first and last bricks are feedforward. + + Parameters + ---------- + application_methods : list + List of :class:`.BoundApplication` to apply. The first and last + application method should belong to a :class:`Feedforward` brick. + + """ + @property + def input_dim(self): + return self.children[0].input_dim + + @input_dim.setter + def input_dim(self, value): + self.children[0].input_dim = value + + @property + def output_dim(self): + return self.children[-1].output_dim + + @output_dim.setter + def output_dim(self, value): + self.children[-1].output_dim = value + + +class MLP(Sequence, Initializable, Feedforward): + """A simple multi-layer perceptron. + + Parameters + ---------- + activations : list of :class:`.Brick`, :class:`.BoundApplication`, + or ``None`` + A list of activations to apply after each linear transformation. + Give ``None`` to not apply any activation. It is assumed that the + application method to use is ``apply``. Required for + :meth:`__init__`. + dims : list of ints + A list of input dimensions, as well as the output dimension of the + last layer. Required for :meth:`~.Brick.allocate`. + + Notes + ----- + See :class:`Initializable` for initialization parameters. 
+ + Note that the ``weights_init``, ``biases_init`` and ``use_bias`` + configurations will overwrite those of the layers each time the + :class:`MLP` is re-initialized. For more fine-grained control, push the + configuration to the child layers manually before initialization. + + >>> from blocks.bricks import Tanh + >>> from blocks.initialization import IsotropicGaussian, Constant + >>> mlp = MLP(activations=[Tanh(), None], dims=[30, 20, 10], + ... weights_init=IsotropicGaussian(), + ... biases_init=Constant(1)) + >>> mlp.push_initialization_config() # Configure children + >>> mlp.children[0].weights_init = IsotropicGaussian(0.1) + >>> mlp.initialize() + + """ + @lazy(allocation=['dims']) + def __init__(self, activations, dims, **kwargs): + self.activations = activations + + self.linear_transformations = [Linear(name='linear_{}'.format(i)) + for i in range(len(activations))] + # Interleave the transformations and activations + application_methods = [] + for entity in interleave([self.linear_transformations, activations]): + if entity is None: + continue + if isinstance(entity, Brick): + application_methods.append(entity.apply) + else: + application_methods.append(entity) + if not dims: + dims = [None] * (len(activations) + 1) + self.dims = dims + super(MLP, self).__init__(application_methods, **kwargs) + + @property + def input_dim(self): + return self.dims[0] + + @input_dim.setter + def input_dim(self, value): + self.dims[0] = value + + @property + def output_dim(self): + return self.dims[-1] + + @output_dim.setter + def output_dim(self, value): + self.dims[-1] = value + + def _push_allocation_config(self): + if not len(self.dims) - 1 == len(self.linear_transformations): + raise ValueError + for input_dim, output_dim, layer in \ + equizip(self.dims[:-1], self.dims[1:], + self.linear_transformations): + layer.input_dim = input_dim + layer.output_dim = output_dim + layer.use_bias = self.use_bias diff --git a/blocks/bricks/simple.py b/blocks/bricks/simple.py new file mode 100644 index 00000000..ed9215da --- /dev/null +++ b/blocks/bricks/simple.py @@ -0,0 +1,395 @@ +"""Some of the simplest individual bricks.""" +import logging + +from theano import tensor + +from blocks.bricks.base import application, Brick, lazy +from blocks.bricks.interfaces import Activation, Feedforward, Initializable +from blocks.bricks.interfaces import Random # noqa + +from blocks.bricks.wrappers import WithExtraDims +from blocks.roles import add_role, WEIGHT, BIAS +from blocks.utils import shared_floatx_nans + +logger = logging.getLogger(__name__) + + +class Linear(Initializable, Feedforward): + r"""A linear transformation with optional bias. + + Brick which applies a linear (affine) transformation by multiplying + the input with a weight matrix. By default, a bias term is added + (see :class:`Initializable` for information on disabling this). + + Parameters + ---------- + input_dim : int + The dimension of the input. Required by :meth:`~.Brick.allocate`. + output_dim : int + The dimension of the output. Required by :meth:`~.Brick.allocate`. + + Notes + ----- + See :class:`Initializable` for initialization parameters. + + A linear transformation with bias is a matrix multiplication followed + by a vector summation. + + .. 
math:: f(\mathbf{x}) = \mathbf{W}\mathbf{x} + \mathbf{b} + + """ + @lazy(allocation=['input_dim', 'output_dim']) + def __init__(self, input_dim, output_dim, **kwargs): + super(Linear, self).__init__(**kwargs) + self.input_dim = input_dim + self.output_dim = output_dim + + @property + def W(self): + return self.parameters[0] + + @property + def b(self): + return self.parameters[1] + + def _allocate(self): + W = shared_floatx_nans((self.input_dim, self.output_dim), name='W') + add_role(W, WEIGHT) + self.parameters.append(W) + self.add_auxiliary_variable(W.norm(2), name='W_norm') + if self.use_bias: + b = shared_floatx_nans((self.output_dim,), name='b') + add_role(b, BIAS) + self.parameters.append(b) + self.add_auxiliary_variable(b.norm(2), name='b_norm') + + def _initialize(self): + if self.use_bias: + W, b = self.parameters + self.biases_init.initialize(b, self.rng) + else: + W, = self.parameters + self.weights_init.initialize(W, self.rng) + + @application(inputs=['input_'], outputs=['output']) + def apply(self, input_): + """Apply the linear transformation. + + Parameters + ---------- + input_ : :class:`~tensor.TensorVariable` + The input on which to apply the transformation + + Returns + ------- + output : :class:`~tensor.TensorVariable` + The transformed input plus optional bias + + """ + if self.use_bias: + W, b = self.parameters + else: + W, = self.parameters + output = tensor.dot(input_, W) + if self.use_bias: + output += b + return output + + def get_dim(self, name): + if name == 'input_': + return self.input_dim + if name == 'output': + return self.output_dim + super(Linear, self).get_dim(name) + + +class Bias(Feedforward, Initializable): + """Add a bias (i.e. sum with a vector).""" + @lazy(allocation=['dim']) + def __init__(self, dim, **kwargs): + super(Bias, self).__init__(**kwargs) + self.dim = dim + + def _allocate(self): + b = shared_floatx_nans((self.output_dim,), name='b') + add_role(b, BIAS) + self.parameters.append(b) + + def _initialize(self): + b, = self.parameters + self.biases_init.initialize(b, self.rng) + + @application(inputs=['input_'], outputs=['output']) + def apply(self, input_): + """Apply the linear transformation. + + Parameters + ---------- + input_ : :class:`~tensor.TensorVariable` + The input on which to apply the transformation + + Returns + ------- + output : :class:`~tensor.TensorVariable` + The transformed input plus optional bias + + """ + b, = self.parameters + return input_ + b + + def get_dim(self, name): + if name in ['input_', 'output']: + return self.dim + super(Bias, self).get_dim(name) + + def _get_dim(self): + return self.dim + + def _set_dim(self, value): + self.dim = value + + input_dim = output_dim = property(_get_dim, _set_dim) + + +class Maxout(Brick): + """Maxout pooling transformation. + + A brick that does max pooling over groups of input units. If you use + this code in a research project, please cite [GWFM13]_. + + .. [GWFM13] Ian J. Goodfellow, David Warde-Farley, Mehdi Mirza, Aaron + Courville, and Yoshua Bengio, *Maxout networks*, ICML (2013), pp. + 1319-1327. + + Parameters + ---------- + num_pieces : int + The size of the groups the maximum is taken over. + + Notes + ----- + Maxout applies a set of linear transformations to a vector and selects + for each output dimension the result with the highest value. 
+ + """ + @lazy(allocation=['num_pieces']) + def __init__(self, num_pieces, **kwargs): + super(Maxout, self).__init__(**kwargs) + self.num_pieces = num_pieces + + @application(inputs=['input_'], outputs=['output']) + def apply(self, input_): + """Apply the maxout transformation. + + Parameters + ---------- + input_ : :class:`~tensor.TensorVariable` + The input on which to apply the transformation + + Returns + ------- + output : :class:`~tensor.TensorVariable` + The transformed input + + """ + last_dim = input_.shape[-1] + output_dim = last_dim // self.num_pieces + new_shape = ([input_.shape[i] for i in range(input_.ndim - 1)] + + [output_dim, self.num_pieces]) + output = tensor.max(input_.reshape(new_shape, ndim=input_.ndim + 1), + axis=input_.ndim) + return output + + +class LinearMaxout(Initializable, Feedforward): + """Maxout pooling following a linear transformation. + + This code combines the :class:`Linear` brick with a :class:`Maxout` + brick. + + Parameters + ---------- + input_dim : int + The dimension of the input. Required by :meth:`~.Brick.allocate`. + output_dim : int + The dimension of the output. Required by :meth:`~.Brick.allocate`. + num_pieces : int + The number of linear functions. Required by + :meth:`~.Brick.allocate`. + + Notes + ----- + See :class:`Initializable` for initialization parameters. + + """ + @lazy(allocation=['input_dim', 'output_dim', 'num_pieces']) + def __init__(self, input_dim, output_dim, num_pieces, **kwargs): + super(LinearMaxout, self).__init__(**kwargs) + self.linear = Linear() + self.maxout = Maxout() + self.children = [self.linear, + self.maxout] + + self.input_dim = input_dim + self.output_dim = output_dim + self.num_pieces = num_pieces + + @property + def input_dim(self): + return self.linear.input_dim + + @input_dim.setter + def input_dim(self, value): + self.linear.input_dim = value + + def _push_allocation_config(self): + self.linear.output_dim = self.output_dim * self.num_pieces + self.maxout.num_pieces = self.num_pieces + + @application(inputs=['input_'], outputs=['output']) + def apply(self, input_): + """Apply the linear transformation followed by maxout. + + Parameters + ---------- + input_ : :class:`~tensor.TensorVariable` + The input on which to apply the transformations + + Returns + ------- + output : :class:`~tensor.TensorVariable` + The transformed input + + """ + pre_activation = self.linear.apply(input_) + output = self.maxout.apply(pre_activation) + return output + + +class Identity(Activation): + @application(inputs=['input_'], outputs=['output']) + def apply(self, input_): + return input_ + + +class Tanh(Activation): + @application(inputs=['input_'], outputs=['output']) + def apply(self, input_): + return tensor.tanh(input_) + + +class Logistic(Activation): + @application(inputs=['input_'], outputs=['output']) + def apply(self, input_): + return tensor.nnet.sigmoid(input_) + + +class Softplus(Activation): + r""" Softplus brick. + + The softplus is defined as :math:`\zeta(x) = \log(1+e^x)`. + + .. Dugas, C., Bengio, Y., Belisle, F., Nadeau, C., and Garcia, + R. (2001). Incorporating second-order functional knowledge + for better option pricing. In NIPS 13 . MIT Press. + + """ + @application(inputs=['input_'], outputs=['output']) + def apply(self, input_): + return tensor.nnet.softplus(input_) + + +class Rectifier(Activation): + @application(inputs=['input_'], outputs=['output']) + def apply(self, input_): + return tensor.switch(input_ > 0, input_, 0) + + +class Softmax(Brick): + """A softmax brick. 
+
+    Works with 2-dimensional inputs only. If you need more,
+    see :class:`NDimensionalSoftmax`.
+
+    """
+    @application(inputs=['input_'], outputs=['output'])
+    def apply(self, input_):
+        """Standard softmax.
+
+        Parameters
+        ----------
+        input_ : :class:`~theano.Variable`
+            A matrix, each row contains unnormalized log-probabilities of a
+            distribution.
+
+        Returns
+        -------
+        output_ : :class:`~theano.Variable`
+            A matrix with probabilities in each row for each distribution
+            from `input_`.
+
+        """
+        return tensor.nnet.softmax(input_)
+
+    @application(inputs=['input_'], outputs=['output'])
+    def log_probabilities(self, input_):
+        """Normalize log-probabilities.
+
+        Converts unnormalized log-probabilities (exponents of which do not
+        sum to one) into actual log-probabilities (exponents of which sum
+        to one).
+
+        Parameters
+        ----------
+        input_ : :class:`~theano.Variable`
+            A matrix, each row contains unnormalized log-probabilities of a
+            distribution.
+
+        Returns
+        -------
+        output : :class:`~theano.Variable`
+            A matrix with normalized log-probabilities in each row for each
+            distribution from `input_`.
+
+        """
+        shifted = input_ - input_.max(axis=1, keepdims=True)
+        return shifted - tensor.log(
+            tensor.exp(shifted).sum(axis=1, keepdims=True))
+
+    @application(inputs=['y', 'x'], outputs=['output'])
+    def categorical_cross_entropy(self, application_call, y, x):
+        """Computationally stable cross-entropy for pre-softmax values.
+
+        Parameters
+        ----------
+        y : :class:`~tensor.TensorVariable`
+            In the case of a matrix argument, each row represents a
+            probability distribution. In the vector case, each element
+            represents a distribution by specifying the position of 1 in a
+            1-hot vector.
+        x : :class:`~tensor.TensorVariable`
+            A matrix, each row contains unnormalized probabilities of a
+            distribution.
+
+        Returns
+        -------
+        cost : :class:`~tensor.TensorVariable`
+            A vector of cross-entropies between respective distributions
+            from y and x.
+ + """ + x = self.log_probabilities(x) + application_call.add_auxiliary_variable( + x.copy(name='log_probabilities')) + if y.ndim == x.ndim - 1: + indices = tensor.arange(y.shape[0]) * x.shape[1] + y + cost = -x.flatten()[indices] + elif y.ndim == x.ndim: + cost = -(x * y).sum(axis=1) + else: + raise TypeError('rank mismatch between x and y') + return cost + + +class NDimensionalSoftmax(Softmax): + decorators = [WithExtraDims()] diff --git a/blocks/extensions/__init__.py b/blocks/extensions/__init__.py index fcaadcae..445499e3 100644 --- a/blocks/extensions/__init__.py +++ b/blocks/extensions/__init__.py @@ -484,7 +484,7 @@ def create_bar(self): progressbar.Timer(), ' ', progressbar.ETA()] return progressbar.ProgressBar(widgets=widgets, - maxval=iter_per_epoch) + max_value=iter_per_epoch) def before_epoch(self): self.iter_count = 0 diff --git a/blocks/extensions/saveload.py b/blocks/extensions/saveload.py index becccf95..a2a5cfc6 100644 --- a/blocks/extensions/saveload.py +++ b/blocks/extensions/saveload.py @@ -6,7 +6,8 @@ from blocks.extensions import SimpleExtension, TrainingExtension from blocks.utils import reraise_as -from blocks.serialization import secure_dump, load, load_parameter_values +from blocks.serialization import ( + secure_dump, load, load_parameter_values, DEFAULT_PROTOCOL) logger = logging.getLogger(__name__) @@ -97,7 +98,8 @@ def do(self, callback_name, *args): filenames = self.save_separately_filenames(path) for attribute in self.save_separately: secure_dump(getattr(self.main_loop, attribute), - filenames[attribute], cPickle.dump) + filenames[attribute], cPickle.dump, + protocol=DEFAULT_PROTOCOL) except Exception: path = None raise diff --git a/blocks/filter.py b/blocks/filter.py index b5c41a69..8248ff34 100644 --- a/blocks/filter.py +++ b/blocks/filter.py @@ -84,7 +84,8 @@ class VariableFilter(object): Examples -------- - >>> from blocks.bricks import MLP, Linear, Logistic, Identity, BIAS + >>> from blocks.bricks import MLP, Linear, Logistic, Identity + >>> from blocks.roles import BIAS >>> mlp = MLP(activations=[Identity(), Logistic()], dims=[20, 10, 20]) >>> from theano import tensor >>> x = tensor.matrix() @@ -109,7 +110,7 @@ def __init__(self, roles=None, bricks=None, each_role=False, name=None, isinstance(application, BoundApplication) for application in applications): raise ValueError('`applications` should be a list of ' - 'Applications') + 'BoundApplications') self.roles = roles self.bricks = bricks self.each_role = each_role diff --git a/blocks/graph.py b/blocks/graph/__init__.py similarity index 79% rename from blocks/graph.py rename to blocks/graph/__init__.py index 1989ad5e..5d7d1be5 100644 --- a/blocks/graph.py +++ b/blocks/graph/__init__.py @@ -2,6 +2,7 @@ import logging from collections import OrderedDict from itertools import chain +import warnings import numpy import theano @@ -12,12 +13,14 @@ from theano.scan_module.scan_op import Scan from toolz import unique -from blocks.config import config -from blocks.roles import (add_role, has_roles, AUXILIARY, PARAMETER, DROPOUT, - COLLECTED, COLLECTOR) -from blocks.utils import (is_graph_input, is_shared_variable, dict_union, - shared_floatx_zeros, shared_like) -import warnings +from ..config import config +from ..roles import (add_role, has_roles, AUXILIARY, PARAMETER, DROPOUT, + COLLECTED, COLLECTOR) +from ..utils import (is_graph_input, is_shared_variable, dict_union, + shared_floatx_zeros, shared_like) +from .annotations import add_annotation, Annotation # noqa +from .bn import batch_normalization, 
apply_batch_normalization # noqa +from .bn import get_batch_normalization_updates # noqa logger = logging.getLogger(__name__) @@ -306,119 +309,6 @@ def has_inputs(self, variable): return self._has_inputs[variable] -def add_annotation(var, annotation): - annotations = getattr(var.tag, 'annotations', []) - if any(old_annotation.__class__ == annotation.__class__ - for old_annotation in annotations): - raise ValueError - else: - var.tag.annotations = annotations + [annotation] - - -class Annotation(object): - """Annotations on Theano variables in a graph. - - In Blocks annotations are automatically attached to variables created - using bricks. One form of annotation is that many variables are - assigned a role (see :class:`.VariableRole`). A second form of - annotation comes in the form of attaching a :class:`Annotation` - instance to the variable's ``tag`` attribute, with auxiliary variables - and/or updates. - - For example, we might be interested in the mean activation of certain - application of a :class:`.Linear` brick. The variable representing the - mean activation is attached as an auxiliary variable to the annotations - of the input and output variables of this brick. Using the - :class:`ComputationGraph` class (the - :attr:`~ComputationGraph.variables`, - :attr:`~ComputationGraph.auxiliary_variables`, etc. attributes in - particular) we can retrieve these Theano variables to pass on to the - monitor, use as a regularizer, etc. - - In most cases, annotations are added on a brick level (e.g. each brick - will assign the weight norm of its weights as an auxiliary value) or on - an application level (e.g. each time a brick is applied, its mean - activation will become an auxiliary variable). However, you can also - add annotations manually, by setting the ``annotation`` value of a - variable's ``tag`` field. - - Examples - -------- - >>> from theano import tensor - >>> x = tensor.vector() - >>> annotation = Annotation() - >>> annotation.add_auxiliary_variable(x + 1, name='x_plus_1') - >>> add_annotation(x, annotation) - >>> y = x ** 2 - >>> from blocks.graph import ComputationGraph - >>> cg = ComputationGraph([y]) - >>> cg.auxiliary_variables - [x_plus_1] - - """ - def __init__(self): - self.auxiliary_variables = [] - self.updates = OrderedDict() - - def add_auxiliary_variable(self, variable, roles=None, name=None): - """Attach an auxiliary variable to the graph. - - Auxiliary variables are Theano variables that are not part of a - brick's output, but can be useful nonetheless e.g. as a regularizer - or to monitor during training progress. - - Parameters - ---------- - variable : :class:`~tensor.TensorVariable` - The variable you want to add. - roles : list of :class:`.VariableRole` instances, optional - The roles of this variable. The :const:`.AUXILIARY` - role will automatically be added. Other options are - :const:`.COST`, :const:`.WEIGHT`, etc. - name : str, optional - Name to give to the variable. If the variable already has a - name it will be overwritten. - - Examples - -------- - >>> from blocks.bricks.base import application, Brick - >>> from blocks.roles import COST - >>> from blocks.utils import shared_floatx_nans - >>> class Foo(Brick): - ... def _allocate(self): - ... W = shared_floatx_nans((10, 10)) - ... self.add_auxiliary_variable(W.mean(), name='mean_W') - ... @application - ... def apply(self, x, application_call): - ... application_call.add_auxiliary_variable( - ... x - 1, name='x_minus_1') - ... application_call.add_auxiliary_variable( - ... 
x.mean(), roles=[COST], name='mean_x') - ... return x + 1 - >>> from theano import tensor - >>> x = tensor.vector() - >>> y = Foo().apply(x) - >>> from blocks.filter import VariableFilter - >>> cg = ComputationGraph([y]) - >>> var_filter = VariableFilter(roles=[AUXILIARY]) - >>> var_filter(cg.variables) # doctest: +SKIP - {x_minus_1, mean_W, mean_x} - >>> var_filter = VariableFilter(roles=[COST]) - >>> var_filter(cg.variables) # doctest: +SKIP - {mean_x} - - """ - add_annotation(variable, self) - if name is not None: - variable.name = name - variable.tag.name = name - add_role(variable, AUXILIARY) - if roles is not None: - for role in roles: - add_role(variable, role) - self.auxiliary_variables.append(variable) - - def apply_noise(computation_graph, variables, level, seed=None): """Add Gaussian noise to certain variable of a computation graph. @@ -533,8 +423,8 @@ def collect_parameters(computation_graph, parameters): def apply_dropout(computation_graph, variables, drop_prob, rng=None, - seed=None): - """Returns a graph to variables in a computational graph. + seed=None, custom_divisor=None): + """Apply dropout to specified variables in a graph. Parameters ---------- @@ -550,6 +440,19 @@ def apply_dropout(computation_graph, variables, drop_prob, rng=None, Random number generator. seed : int Random seed to be used if `rng` was not specified. + custom_divisor : float or None, optional + Divide dropped variables by a given scalar value. If `None`, + (default) dropped variables will be divided by `(1 - drop_prob)` + which is equivalent to scaling by `(1 - drop_prob)` at test + time as recommended in [DROPOUT]_. + + Returns + ------- + dropped_computation_graph : instance of :class:`ComputationGraph` + A new computation graph with dropout applied to the specified + variables. In order to train with, or monitor, the outputs + of the original computation graph with dropout applies, use + the variables contained in `dropped_computation_graph.outputs`. Notes ----- @@ -622,11 +525,14 @@ def apply_dropout(computation_graph, variables, drop_prob, rng=None, seed = config.default_seed if not rng: rng = MRG_RandomStreams(seed) - + if custom_divisor is None: + divisor = (1 - drop_prob) + else: + divisor = custom_divisor replacements = [(var, var * rng.binomial(var.shape, p=1 - drop_prob, dtype=theano.config.floatX) / - (1 - drop_prob)) + divisor) for var in variables] for variable, replacement in replacements: add_role(replacement, DROPOUT) diff --git a/blocks/graph/annotations.py b/blocks/graph/annotations.py new file mode 100644 index 00000000..b684c018 --- /dev/null +++ b/blocks/graph/annotations.py @@ -0,0 +1,116 @@ +from collections import OrderedDict +from ..roles import add_role, AUXILIARY + + +def add_annotation(var, annotation): + annotations = getattr(var.tag, 'annotations', []) + if any(old_annotation.__class__ == annotation.__class__ + for old_annotation in annotations): + raise ValueError + else: + var.tag.annotations = annotations + [annotation] + + +class Annotation(object): + """Annotations on Theano variables in a graph. + + In Blocks annotations are automatically attached to variables created + using bricks. One form of annotation is that many variables are + assigned a role (see :class:`.VariableRole`). A second form of + annotation comes in the form of attaching a :class:`Annotation` + instance to the variable's ``tag`` attribute, with auxiliary variables + and/or updates. 
+ + For example, we might be interested in the mean activation of certain + application of a :class:`.Linear` brick. The variable representing the + mean activation is attached as an auxiliary variable to the annotations + of the input and output variables of this brick. Using the + :class:`ComputationGraph` class (the + :attr:`~ComputationGraph.variables`, + :attr:`~ComputationGraph.auxiliary_variables`, etc. attributes in + particular) we can retrieve these Theano variables to pass on to the + monitor, use as a regularizer, etc. + + In most cases, annotations are added on a brick level (e.g. each brick + will assign the weight norm of its weights as an auxiliary value) or on + an application level (e.g. each time a brick is applied, its mean + activation will become an auxiliary variable). However, you can also + add annotations manually, by setting the ``annotation`` value of a + variable's ``tag`` field. + + Examples + -------- + >>> from theano import tensor + >>> x = tensor.vector() + >>> annotation = Annotation() + >>> annotation.add_auxiliary_variable(x + 1, name='x_plus_1') + >>> add_annotation(x, annotation) + >>> y = x ** 2 + >>> from blocks.graph import ComputationGraph + >>> cg = ComputationGraph([y]) + >>> cg.auxiliary_variables + [x_plus_1] + + """ + def __init__(self): + self.auxiliary_variables = [] + self.updates = OrderedDict() + + def add_auxiliary_variable(self, variable, roles=None, name=None): + """Attach an auxiliary variable to the graph. + + Auxiliary variables are Theano variables that are not part of a + brick's output, but can be useful nonetheless e.g. as a regularizer + or to monitor during training progress. + + Parameters + ---------- + variable : :class:`~tensor.TensorVariable` + The variable you want to add. + roles : list of :class:`.VariableRole` instances, optional + The roles of this variable. The :const:`.AUXILIARY` + role will automatically be added. Other options are + :const:`.COST`, :const:`.WEIGHT`, etc. + name : str, optional + Name to give to the variable. If the variable already has a + name it will be overwritten. + + Examples + -------- + >>> from blocks.bricks.base import application, Brick + >>> from blocks.roles import COST + >>> from blocks.utils import shared_floatx_nans + >>> class Foo(Brick): + ... def _allocate(self): + ... W = shared_floatx_nans((10, 10)) + ... self.add_auxiliary_variable(W.mean(), name='mean_W') + ... @application + ... def apply(self, x, application_call): + ... application_call.add_auxiliary_variable( + ... x - 1, name='x_minus_1') + ... application_call.add_auxiliary_variable( + ... x.mean(), roles=[COST], name='mean_x') + ... 
return x + 1 + >>> from theano import tensor + >>> x = tensor.vector() + >>> y = Foo().apply(x) + >>> from blocks.graph import ComputationGraph + >>> cg = ComputationGraph([y]) + >>> from blocks.filter import VariableFilter + >>> var_filter = VariableFilter(roles=[AUXILIARY]) + >>> var_filter(cg.variables) # doctest: +SKIP + {x_minus_1, mean_W, mean_x} + >>> var_filter = VariableFilter(roles=[COST]) + >>> var_filter(cg.variables) # doctest: +SKIP + {mean_x} + + """ + add_annotation(variable, self) + if name is not None: + variable.name = name + variable.tag.name = name + add_role(variable, AUXILIARY) + if roles is not None: + for role in roles: + add_role(variable, role) + self.auxiliary_variables.append(variable) diff --git a/blocks/graph/bn.py b/blocks/graph/bn.py new file mode 100644 index 00000000..212f143d --- /dev/null +++ b/blocks/graph/bn.py @@ -0,0 +1,269 @@ +"""Implements the batch normalization training graph transform. + +Specifically, this module contains the implementation for the +transformation of a batch-normalized inference graph into training graph, +which uses minibatch statistics in place of population statistics. + +""" +import collections +import contextlib +from functools import partial + +import theano +from toolz import isdistinct + +from ..roles import BATCH_NORM_OFFSET, BATCH_NORM_DIVISOR, INPUT, OUTPUT +from ..utils import find_bricks + + +def _training_mode_application_calls(application_calls): + """Filter for application calls made in 'training mode'.""" + from ..bricks import BatchNormalization + out = [] + for app_call in application_calls: + assert isinstance(app_call.application.brick, BatchNormalization) + assert app_call.application.application == BatchNormalization.apply + if app_call.metadata.get('training_mode', False): + out.append(app_call) + return out + + +@contextlib.contextmanager +def batch_normalization(*bricks): + r"""Context manager to run batch normalization in "training mode". + + Parameters + ---------- + \*bricks + One or more bricks which will be inspected for descendant + instances of :class:`~blocks.bricks.BatchNormalization`. + + Notes + ----- + Graph replacement using :func:`apply_batch_normalization`, while + elegant, can lead to Theano graphs that are quite large and result + in very slow compiles. This provides an alternative mechanism for + building the batch normalized training graph. It can be somewhat + less convenient as it requires building the graph twice if one + wishes to monitor the output of the inference graph during training. + + Examples + -------- + First, we'll create a :class:`~blocks.bricks.BatchNormalizedMLP`. + + >>> import theano + >>> from blocks.bricks import BatchNormalizedMLP, Tanh + >>> from blocks.initialization import Constant, IsotropicGaussian + >>> mlp = BatchNormalizedMLP([Tanh(), Tanh()], [4, 5, 6], + ... weights_init=IsotropicGaussian(0.1), + ... biases_init=Constant(0)) + >>> mlp.initialize() + + Now, we'll construct an output variable as we would normally. This + is getting normalized by the *population* statistics, which by + default are initialized to 0 (mean) and 1 (standard deviation), + respectively. + + >>> x = theano.tensor.matrix() + >>> y = mlp.apply(x) + + And now, to construct an output with batch normalization enabled, + i.e. normalizing pre-activations using per-minibatch statistics, we + simply make a similar call inside of a `with` statement: + + >>> with batch_normalization(mlp): + ... 
y_bn = mlp.apply(x) + + Let's verify that these two graphs behave differently on the + same data: + + >>> import numpy + >>> data = numpy.arange(12, dtype=theano.config.floatX).reshape(3, 4) + >>> inf_y = y.eval({x: data}) + >>> trn_y = y_bn.eval({x: data}) + >>> numpy.allclose(inf_y, trn_y) + False + + """ + # Avoid circular imports. + from blocks.bricks import BatchNormalization + + bn = find_bricks(bricks, lambda b: isinstance(b, BatchNormalization)) + # Can't use either nested() (deprecated) nor ExitStack (not available + # on Python 2.7). Well, that sucks. + try: + for brick in bn: + brick.__enter__() + yield + finally: + for brick in bn[::-1]: + brick.__exit__() + + +def apply_batch_normalization(computation_graph): + """Transform a graph into a batch-normalized training graph. + + Parameters + ---------- + computation_graph : :class:`~blocks.graph.ComputationGraph` + The computation graph containing :class:`BatchNormalization` + brick applications. + + Returns + ------- + batch_normed_graph : :class:`~blocks.graph.ComputationGraph` + The computation graph, with :class:`BatchNormalization` + applications transformed to use minibatch statistics instead + of accumulated population statistics. + + See Also + -------- + :func:`batch_normalization`, for an alternative method to produce + batch normalized graphs. + + Examples + -------- + First, we'll create a :class:`~blocks.bricks.BatchNormalizedMLP`. + + >>> import theano + >>> from blocks.bricks import BatchNormalizedMLP, Tanh + >>> from blocks.initialization import Constant, IsotropicGaussian + >>> mlp = BatchNormalizedMLP([Tanh(), Tanh()], [4, 5, 6], + ... weights_init=IsotropicGaussian(0.1), + ... biases_init=Constant(0)) + >>> mlp.initialize() + + Now, we'll construct an output variable as we would normally. This + is getting normalized by the *population* statistics, which by + default are initialized to 0 (mean) and 1 (standard deviation), + respectively. + + >>> x = theano.tensor.matrix() + >>> y = mlp.apply(x) + + Finally, we'll create a :class:`~blocks.graph.ComputationGraph` + and transform it to switch to minibatch standardization: + + >>> from blocks.graph import ComputationGraph + >>> cg = apply_batch_normalization(ComputationGraph([y])) + >>> y_bn = cg.outputs[0] + + Let's verify that these two graphs behave differently on the + same data: + + >>> import numpy + >>> data = numpy.arange(12, dtype=theano.config.floatX).reshape(3, 4) + >>> inf_y = y.eval({x: data}) + >>> trn_y = y_bn.eval({x: data}) + >>> numpy.allclose(inf_y, trn_y) + False + + """ + # Avoid circular imports. + from blocks.bricks import BatchNormalization + from ..filter import VariableFilter, get_application_call + + # Create filters for variables involved in a batch normalization brick + # application. + def make_variable_filter(role): + return VariableFilter(bricks=[BatchNormalization], roles=[role]) + + # Group inputs and outputs into dicts indexed by application call. + def get_app_call_dict(variable_filter): + return collections.OrderedDict((get_application_call(v), v) for v in + variable_filter(computation_graph)) + + # Compose these two so that we get 4 dicts, grouped by application + # call, of different variable roles involved in BatchNormalization. 
+ inputs, outputs, means, stdevs = map(get_app_call_dict, + map(make_variable_filter, + [INPUT, OUTPUT, BATCH_NORM_OFFSET, + BATCH_NORM_DIVISOR])) + + assert len(set([len(inputs), len(outputs), len(means), len(stdevs)])) == 1 + + # Remove any ApplicationCalls that were not generated by apply(), or + # were generated by an apply() while already in training mode. + app_calls = inputs.keys() + remove = _training_mode_application_calls(app_calls) + for app_call in app_calls: + if app_call in remove: + for mapping in (inputs, outputs, means, stdevs): + del mapping[app_call] + + replacements = [] + for app_call in inputs: + old_output = outputs[app_call] + # Get rid of the copy made on the way into the original apply. + op = inputs[app_call].owner.op + assert (isinstance(op, theano.tensor.Elemwise) and + isinstance(op.scalar_op, theano.scalar.basic.Identity)) + unpacked = inputs[app_call].owner.inputs[0] + with app_call.application.brick: + new_output = app_call.application.brick.apply(unpacked) + new_app_call = get_application_call(new_output) + assert new_app_call.metadata['training_mode'] + replacements.append((old_output, new_output)) + return computation_graph.replace(replacements) + + +def get_batch_normalization_updates(training_graph, allow_duplicates=False): + """Extract correspondences for learning BN population statistics. + + Parameters + ---------- + training_graph : :class:`~blocks.graph.ComputationGraph` + A graph of expressions wherein "training mode" batch normalization + is taking place. + allow_duplicates : bool, optional + If `True`, allow multiple training-mode application calls from the + same :class:`~blocks.bricks.BatchNormalization` instance, and + return pairs corresponding to all of them. It's then the user's + responsibility to do something sensible to resolve the duplicates. + + Returns + ------- + update_pairs : list of tuples + A list of 2-tuples where the first element of each tuple is the + shared variable containing a "population" mean or standard + deviation, and the second is a Theano variable for the + corresponding statistics on a minibatch. Note that multiple + applications of a single :class:`blocks.bricks.BatchNormalization` + may appear in the graph, and therefore (if `allow_duplicates` is + True) a single population variable may map to several different + minibatch variables, and appear multiple times in this mapping. + This can happen in recurrent models, siamese networks or other + models that reuse pathways. + + Notes + ----- + Used in their raw form, these updates will simply overwrite the + population statistics with the minibatch statistics at every gradient + step. You will probably want to transform these pairs into something + more sensible, such as keeping a moving average of minibatch values, + or accumulating an average over the entire training set once every few + epochs. 
+ + """ + from ..bricks import BatchNormalization + from ..filter import VariableFilter, get_application_call + var_filter = VariableFilter(bricks=[BatchNormalization], roles=[OUTPUT]) + all_app_calls = map(get_application_call, var_filter(training_graph)) + train_app_calls = _training_mode_application_calls(all_app_calls) + if len(train_app_calls) == 0: + raise ValueError("no training mode BatchNormalization " + "applications found in graph") + bricks = [c.application.brick for c in train_app_calls] + + if not allow_duplicates and not isdistinct(bricks): + raise ValueError('multiple applications of the same ' + 'BatchNormalization brick; pass allow_duplicates ' + '= True to override this check') + + def extract_pair(brick_attribute, metadata_key, app_call): + return (getattr(app_call.application.brick, brick_attribute), + app_call.metadata[metadata_key]) + + mean_pair = partial(extract_pair, 'population_mean', 'offset') + stdev_pair = partial(extract_pair, 'population_stdev', 'divisor') + return sum([[mean_pair(a), stdev_pair(a)] for a in train_app_calls], []) diff --git a/blocks/model.py b/blocks/model.py index af0aec36..93c5f11c 100644 --- a/blocks/model.py +++ b/blocks/model.py @@ -7,7 +7,7 @@ created by Blocks typically has, such as bricks and application calls. The :class:`Model` adds this functionality. Using :class:`Model` you can do things like query all the bricks used to build the computation graph, -request "hierarhical names" of the parameters (a hierarchical name is a +request "hierarchical names" of the parameters (a hierarchical name is a path-like string which in addition to the parameter's name contains names of the bricks on the path from a root brick to the brick that owns the parameters, e.g. ``/mlp/linear/W``). @@ -48,7 +48,7 @@ class Model(ComputationGraph): it: >>> model.get_top_bricks() #doctest: +ELLIPSIS - [ 1: + duplicates.append(vname) + raise ValueError("variables should have different names!" 
+ " Duplicates: {}".format(', '.join(duplicates))) self._computation_graph = ComputationGraph(self.variables) self.inputs = self._computation_graph.inputs diff --git a/blocks/roles.py b/blocks/roles.py index 4b6c4c84..d672189c 100644 --- a/blocks/roles.py +++ b/blocks/roles.py @@ -93,7 +93,14 @@ class CostRole(VariableRole): COST = CostRole() -class ParameterRole(VariableRole): +class PersistentRole(VariableRole): + pass + +# Any persistent quantity that should be saved as part of the model +PERSISTENT = PersistentRole() + + +class ParameterRole(PersistentRole): pass #: A parameter of the model @@ -175,3 +182,74 @@ class AlgorithmBufferRole(AlgorithmStateRole): #: buffers accociated with algorithms ALGORITHM_BUFFER = AlgorithmBufferRole() + + +class BatchNormPopulationStatisticsRole(PersistentRole): + pass + +#: base role for batch normalization population statistics +BATCH_NORM_POPULATION_STATISTICS = BatchNormPopulationStatisticsRole() + + +class BatchNormPopulationMeanRole(BatchNormPopulationStatisticsRole): + pass + +#: mean activations accumulated over the dataset +BATCH_NORM_POPULATION_MEAN = BatchNormPopulationMeanRole() + + +class BatchNormPopulationStdevRole(BatchNormPopulationStatisticsRole): + pass + +#: standard deviations of activations accumulated over the dataset +BATCH_NORM_POPULATION_STDEV = BatchNormPopulationStdevRole() + + +class BatchNormGraphVariableRole(VariableRole): + pass + +#: base for roles used for within-graph batch normalization replacement +BATCH_NORM_GRAPH_VARIABLE = BatchNormGraphVariableRole() + + +class BatchNormOffsetRole(BatchNormGraphVariableRole): + pass + +#: offset applied in a BatchNormalization application (or its +# batch-normalized replacement) +BATCH_NORM_OFFSET = BatchNormOffsetRole() + + +class BatchNormDivisorRole(BatchNormGraphVariableRole): + pass + +#: divisor applied in a BatchNormalization application (or its +# batch-normalized replacement) +BATCH_NORM_DIVISOR = BatchNormDivisorRole() + + +class BatchNormMinibatchEstimateRole(BatchNormGraphVariableRole): + pass + +#: role added to variables that are the result of a batch normalization +# replacement, rather than the original population statistics variables. +BATCH_NORM_MINIBATCH_ESTIMATE = BatchNormMinibatchEstimateRole() + + +class BatchNormScaleParameterRole(ParameterRole): + pass + +#: role given to the scale parameter, referred to as "scale" in the +# batch normalization manuscript, applied after normalizing. +BATCH_NORM_SCALE_PARAMETER = BatchNormScaleParameterRole() + + +class BatchNormShiftParameterRole(BiasRole): + pass + +#: role given to the shift parameter, referred to as "beta" in the +# batch normalization manuscript, applied after normalizing and scaling. +# Inherits from BIAS, because there really is no functional difference +# with a normal bias, and indeed these are the only biases present +# inside a BatchNormalizedMLP. +BATCH_NORM_SHIFT_PARAMETER = BatchNormShiftParameterRole() diff --git a/blocks/serialization.py b/blocks/serialization.py index 30c5b1de..cc18ee45 100644 --- a/blocks/serialization.py +++ b/blocks/serialization.py @@ -141,7 +141,7 @@ def dump(obj, file_handler, protocol=DEFAULT_PROTOCOL, >>> with open('model.zip', 'rb') as f: ... 
mlp2 = load(f) >>> mlp2 # doctest: +ELLIPSIS - + """ with closing(zipfile.ZipFile(file_handler, 'w', zipfile.ZIP_DEFLATED, diff --git a/blocks/utils/__init__.py b/blocks/utils/__init__.py index dae6da36..4ddbdc41 100644 --- a/blocks/utils/__init__.py +++ b/blocks/utils/__init__.py @@ -1,7 +1,7 @@ from __future__ import print_function import sys import contextlib -from collections import OrderedDict +from collections import OrderedDict, deque import numpy import six @@ -70,6 +70,36 @@ def unpack(arg, singleton=False): return arg +def shared_floatx_zeros_matching(shared_variable, name=None, **kwargs): + r"""Create another shared variable with matching shape and broadcast. + + Parameters + ---------- + shared_variable : :class:'tensor.TensorSharedVariable' + A Theano shared variable with the desired shape and broadcastable + flags. + name : :obj:`str`, optional + The name for the shared variable. Defaults to `None`. + \*\*kwargs + Keyword arguments to pass to the :func:`shared_floatx_zeros` + function. + + Returns + ------- + :class:'tensor.TensorSharedVariable' + A new shared variable, initialized to all zeros, with the same + shape and broadcastable flags as `shared_variable`. + + + """ + if not is_shared_variable(shared_variable): + raise ValueError('argument must be a shared variable') + return shared_floatx_zeros(shared_variable.get_value().shape, + name=name, + broadcastable=shared_variable.broadcastable, + **kwargs) + + def shared_floatx_zeros(shape, **kwargs): r"""Creates a shared variable array filled with zeros. @@ -108,8 +138,8 @@ def shared_floatx_nans(shape, **kwargs): return shared_floatx(numpy.nan * numpy.zeros(shape), **kwargs) -def shared_floatx(value, name=None, borrow=False, dtype=None): - """Transform a value into a shared variable of type floatX. +def shared_floatx(value, name=None, borrow=False, dtype=None, **kwargs): + r"""Transform a value into a shared variable of type floatX. Parameters ---------- @@ -123,6 +153,8 @@ def shared_floatx(value, name=None, borrow=False, dtype=None): dtype : :obj:`str`, optional The `dtype` of the shared variable. Default value is :attr:`config.floatX`. + \*\*kwargs + Keyword arguments to pass to the :func:`~theano.shared` function. Returns ------- @@ -133,12 +165,11 @@ def shared_floatx(value, name=None, borrow=False, dtype=None): if dtype is None: dtype = theano.config.floatX return theano.shared(theano._asarray(value, dtype=dtype), - name=name, - borrow=borrow) + name=name, borrow=borrow, **kwargs) -def shared_like(variable, name=None): - """Construct a shared variable to hold the value of a tensor variable. +def shared_like(variable, name=None, **kwargs): + r"""Construct a shared variable to hold the value of a tensor variable. Parameters ---------- @@ -148,6 +179,8 @@ def shared_like(variable, name=None): name : :obj:`str` or :obj:`None` The name of the shared variable. If None, the name is determined based on variable's name. + \*\*kwargs + Keyword arguments to pass to the :func:`~theano.shared` function. 
""" variable = tensor.as_tensor_variable(variable) @@ -155,7 +188,7 @@ def shared_like(variable, name=None): name = "shared_{}".format(variable.name) return theano.shared(numpy.zeros((0,) * variable.ndim, dtype=variable.dtype), - name=name) + name=name, **kwargs) def reraise_as(new_exc): @@ -450,7 +483,8 @@ def print_shape(x, header=None): def change_recursion_limit(limit): """Temporarily changes the recursion limit.""" old_limit = sys.getrecursionlimit() - sys.setrecursionlimit(limit) + if old_limit < limit: + sys.setrecursionlimit(limit) yield sys.setrecursionlimit(old_limit) @@ -513,3 +547,34 @@ def extract_args(expected, *args, **kwargs): [name for name in expected if name not in routed_args])) return OrderedDict((key, routed_args[key]) for key in expected) + + +def find_bricks(top_bricks, predicate): + """Walk the brick hierarchy, return bricks that satisfy a predicate. + + Parameters + ---------- + top_bricks : list + A list of root bricks to search downward from. + predicate : callable + A callable that returns `True` for bricks that meet the + desired criteria or `False` for those that don't. + + Returns + ------- + found : list + A list of all bricks that are descendants of any element of + `top_bricks` that satisfy `predicate`. + + """ + found = [] + visited = set() + to_visit = deque(top_bricks) + while len(to_visit) > 0: + current = to_visit.popleft() + if current not in visited: + visited.add(current) + if predicate(current): + found.append(current) + to_visit.extend(current.children) + return found diff --git a/blocks/utils/testing.py b/blocks/utils/testing.py index b100bc44..a097eddd 100644 --- a/blocks/utils/testing.py +++ b/blocks/utils/testing.py @@ -76,6 +76,27 @@ def skip_if_not_available(modules=None, datasets=None, configurations=None): raise SkipTest +def skip_if_configuration_set(configuration, value, message=None): + """Raise SkipTest if a configuration option has a certain value. + + Parameters + ---------- + configuration : str + Configuration option to check. + value : str + Value of `blocks.config.` which should cause + a `SkipTest` to be raised. + message : str, optional + Reason for skipping the test. + + """ + if getattr(config, configuration) == value: + if message is not None: + raise SkipTest(message) + else: + raise SkipTest + + class MockAlgorithm(TrainingAlgorithm): """An algorithm that only saves data. 
diff --git a/blocks/version.py b/blocks/version.py new file mode 100644 index 00000000..60fb6a57 --- /dev/null +++ b/blocks/version.py @@ -0,0 +1 @@ +version = '0.2.0' diff --git a/docs/_static/sequence_generator_scheme.png b/docs/_static/sequence_generator_scheme.png index f0d0caca..956717d8 100644 Binary files a/docs/_static/sequence_generator_scheme.png and b/docs/_static/sequence_generator_scheme.png differ diff --git a/docs/_static/sequence_generator_scheme.svg b/docs/_static/sequence_generator_scheme.svg new file mode 100644 index 00000000..3d7e6f66 --- /dev/null +++ b/docs/_static/sequence_generator_scheme.svg @@ -0,0 +1,698 @@ [698 added lines of SVG/XML source omitted: a vector version of the sequence generator scheme. Its text labels name the participating applications (readout.readout, readout.emit, readout.feedback, readout.cost, transition.take_glimpses, transition.compute_states, transition.apply, fork.apply) and a legend: s - states, g - glimpses, r - readouts, y - outputs, f - feedback, c - costs; a dashed rectangle marks outputs that can be provided by the user (see the BaseSequenceGenerator.cost method).] diff --git a/docs/api/log.rst b/docs/api/log.rst index ba1d4e62..916f46bd 100644 --- a/docs/api/log.rst +++ b/docs/api/log.rst @@ -3,7 +3,26 @@ Logging ======= +Log has two different backends configurable in ``.blocksrc``, +see :doc:`../configuration`. + .. automodule:: blocks.log :members: :undoc-members: :show-inheritance: + +Dictionary backend +------------------ + +.. automodule:: blocks.log.log + :members: + :undoc-members: + :show-inheritance: + +Sqlite backend +-------------- + +.. automodule:: blocks.log.sqlite + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/bricks_overview.rst b/docs/bricks_overview.rst index d1949923..1f3801d9 100644 --- a/docs/bricks_overview.rst +++ b/docs/bricks_overview.rst @@ -174,7 +174,7 @@ automatically pushed the weight matrix and biases initialization configuration to its children. >>> mlp.initialize() - >>> mlp.children[1].parameters[0].get_value() # doctest: +SKIP + >>> mlp.children[0].parameters[0].get_value() # doctest: +SKIP array([[-0.38312393, -1.7718271 , 0.78074479, -0.74750996], ... [ 1.32390416, -0.56375355, -0.24268186, -2.06008577]]) diff --git a/docs/conf.py b/docs/conf.py index a726e04c..edae1956 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -94,9 +94,10 @@ def __getattr__(cls, name): # built documents. # # The short X.Y version. -version = '0.1' +import blocks +version = '.'.join(blocks.__version__.split('.')[:2]) # The full version, including alpha/beta/rc tags. -release = '0.1.1' +release = blocks.__version__ # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/create_your_own_brick.rst b/docs/create_your_own_brick.rst index d67b0ce5..275d6232 100644 --- a/docs/create_your_own_brick.rst +++ b/docs/create_your_own_brick.rst @@ -40,7 +40,7 @@ advocated whenever it makes sense. Here are examples of possible bricks to inherit from: * :class:`.Sequence`: a sequence of bricks. -* :class:`.Initializable`: a brick that defines a same initialiation scheme +* :class:`.Initializable`: a brick that defines a same initialization scheme (weights and biases) for all its children. * :class:`.Feedforward`: declares an interface for bricks with one input and one output.
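As a hedged illustration of the inheritance options listed above, a minimal standalone brick might look like the sketch below (the ``Scale3`` name and the fixed factor are invented for this example; the ``Brick`` base class and the ``application`` decorator are the ones this tutorial describes):

>>> from theano import tensor
>>> from blocks.bricks.base import Brick, application
>>> class Scale3(Brick):
...     """A toy brick that simply triples its input."""
...     @application
...     def apply(self, input_):
...         return 3 * input_
>>> x = tensor.matrix('x')
>>> y = Scale3().apply(x)
>>> y.tag.name  # doctest: +SKIP
'scale3_apply_output'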
@@ -58,7 +58,7 @@ of :class:`.Brick` for a precise description of the life-cycle of a brick): * :meth:`.Brick.__init__`: you should pass by argument the attributes of your brick. It is also in this method that you should create the potential "children bricks" that belong to your brick (in that case, you have to put - the children bricks into ``self.children``). The initialiazation of the + the children bricks into ``self.children``). The initialization of the attributes can be lazy as described later in the tutorial. * :meth:`apply`: you need to implement a method that actually implements the operation of the brick, taking as arguments the inputs @@ -119,12 +119,12 @@ to the tag attributes of the variables, as shown below: >>> i2 = tensor.matrix('i2') >>> y = foo.apply(i1, i2) >>> theano.printing.debugprint(y) - Elemwise{identity} [@A] 'foo_apply_output' - |Elemwise{add,no_inplace} [@B] '' - |Elemwise{identity} [@C] 'foo_apply_input1' - | |i1 [@D] - |Elemwise{identity} [@E] 'foo_apply_input2' - |i2 [@F] + Elemwise{identity} [id A] 'foo_apply_output' + |Elemwise{add,no_inplace} [id B] '' + |Elemwise{identity} [id C] 'foo_apply_input1' + | |i1 [id D] + |Elemwise{identity} [id E] 'foo_apply_input2' + |i2 [id F] >>> print(y.name) foo_apply_output >>> print(y.tag.name) diff --git a/docs/development/docs.rst b/docs/development/docs.rst index 0d4efd48..83bc74e9 100644 --- a/docs/development/docs.rst +++ b/docs/development/docs.rst @@ -4,15 +4,25 @@ Building documentation If you've made significant changes to the documentation, you can build a local copy to see how your changes are rendered. You will need to install Sphinx_, the Napoleon_ extension (to enable NumPy docstring support), and the `Read the Docs -theme`_. You can do this by installing the optional ``docs`` requirements: +theme`_. You can do this by installing the optional ``docs`` requirements. + +For Blocks: .. code-block:: bash $ pip install --upgrade git+git://github.com/user/blocks.git#egg=blocks[docs] + +For Fuel: + +.. code-block:: bash + + $ pip install --upgrade git+git://github.com/user/fuel.git#egg=fuel[docs] + + After the requirements have been installed, you can build a copy of the documentation by running the following command from the root ``blocks`` -directory. +(or ``fuel``) directory. .. code-block:: bash @@ -24,9 +34,9 @@ directory. Docstrings ---------- -Blocks follows the `NumPy docstring standards`_. For a quick introduction, have -a look at the NumPy_ or Napoleon_ examples of compliant docstrings. A few common -mistakes to avoid: +Blocks and Fuel follow the `NumPy docstring standards`_. For a quick +introduction, have a look at the NumPy_ or Napoleon_ examples of +compliant docstrings. A few common mistakes to avoid: * There is no line break after the opening quotes (``"""``). * There is an empty line before the closing quotes (``"""``). @@ -36,8 +46,9 @@ The docstrings are formatted using reStructuredText_, and can make use of all the formatting capabilities this provides. They are rendered into HTML documentation using the `Read the Docs`_ service. After code has been merged, please ensure that documentation was built successfully and that your docstrings -rendered as you intended by looking at the `online documentation`_, which is -automatically updated. +rendered as you intended by looking at the online documentation (for +`Blocks <Blocks online documentation_>`_ or `Fuel <Fuel online documentation_>`_), +which is automatically updated. Writing doctests_ is encouraged, and they are run as part of the test suite. They should use Python 3 syntax.
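As an illustration of the docstring conventions above, a compliant docstring might look like the following sketch (the function itself is invented for this example):

.. code-block:: python

    def scale(value, factor=2.0):
        """Multiply a value by a constant factor.

        Note that the summary starts right after the opening quotes and
        that an empty line precedes the closing quotes.

        Parameters
        ----------
        value : float
            The value to scale.
        factor : float, optional
            The scaling factor. Defaults to 2.

        Returns
        -------
        float
            The scaled value.

        """
        return value * factor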
@@ -48,7 +59,8 @@ They should use Python 3 syntax. .. _reStructuredText: http://docutils.sourceforge.net/rst.html .. _doctests: https://docs.python.org/2/library/doctest.html .. _Read the Docs: https://readthedocs.org/ -.. _online documentation: http://blocks.readthedocs.org/ +.. _Blocks online documentation: http://blocks.readthedocs.org/ +.. _Fuel online documentation: http://fuel.readthedocs.org/ .. _a bug in Napoleon: https://bitbucket.org/birkenfeld/sphinx-contrib/issue/82/napoleon-return-type-containing-colons-is .. _references_and_intersphinx: diff --git a/docs/development/index.rst b/docs/development/index.rst index 125a654a..9b401a2f 100644 --- a/docs/development/index.rst +++ b/docs/development/index.rst @@ -1,10 +1,10 @@ Development =========== -We want to encourage everyone to contribute to the development of Blocks. To -ensure the codebase is of high quality, we ask all new developers to have a -quick read through these rules to make sure that any code you contribute will be -easy to merge! +We want to encourage everyone to contribute to the development of Blocks +and Fuel. To ensure the codebase is of high quality, we ask all new +developers to have a quick read through these rules to make sure that +any code you contribute will be easy to merge! .. image:: /_static/code_quality.png @@ -13,7 +13,9 @@ easy to merge! Formatting guidelines --------------------- Blocks follows the `PEP8 style guide`_ closely, so please make sure you are -familiar with it. Our `Travis CI buildbot`_ runs flake8_ as part of every build, +familiar with it. Our Travis CI buildbots (for `Blocks `_, +`Fuel `_, and `Blocks-extras `_) +run flake8_ as part of every build, which checks for PEP8 compliance (using the pep8_ tool) and for some common coding errors using pyflakes_. You might want to install and run flake8_ on your code before submitting a PR to make sure that your build doesn't fail because of @@ -27,7 +29,7 @@ compliant! Some guidelines which aren't checked by flake8_: * Variable names should be explanatory and unambiguous. There are also some style guideline decisions that were made specifically for -Blocks: +Blocks and Fuel: * Do not rename imports i.e. do not use ``import theano.tensor as T`` or ``import numpy as np``. @@ -53,7 +55,9 @@ Blocks: self.baz = baz .. _PEP8 style guide: https://www.python.org/dev/peps/pep-0008/ -.. _Travis CI buildbot: https://travis-ci.org/mila-udem/blocks +.. _Blocks buildbot: https://travis-ci.org/mila-udem/blocks +.. _Blocks-extras buildbot: https://travis-ci.org/mila-udem/blocks-extras +.. _Fuel buildbot: https://travis-ci.org/mila-udem/fuel .. _flake8: https://pypi.python.org/pypi/flake8 .. _pep8: https://pypi.python.org/pypi/pep8 .. _pyflakes: https://pypi.python.org/pypi/pyflakes @@ -61,9 +65,10 @@ Blocks: Code guidelines --------------- -Some guidelines to keep in mind when coding for Blocks. Some of these are simply -preferences, others stem from particular requirements we have e.g. in order to -serialize training progress, support Python 2 and 3 simultaneously, etc. +Some guidelines to keep in mind when coding for Blocks or Fuel. Some of +these are simply preferences, others stem from particular requirements +we have, e.g., in order to serialize training progress, support Python 2 +and 3 simultaneously, etc. Validating function arguments ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -119,10 +124,11 @@ building documentation. 
To prevent this, make sure to always use the Python 2 and 3 ~~~~~~~~~~~~~~ -Blocks aims to be both Python 2 and Python 3 compliant using a single code-base, -without using 2to3_. There are many online resources which discuss the writing -of compatible code. For a quick overview see `the cheatsheet from Python -Charmers`_. For non-trivial cases, we use the six_ compatibility library. +Blocks and Fuel aim to be both Python 2 and Python 3 compliant using a +single code-base, without using 2to3_. There are many online resources +which discuss the writing of compatible code. For a quick overview see +`the cheatsheet from Python Charmers`_. For non-trivial cases, we use +the six_ compatibility library. Documentation should be written to be Python 3 compliant. @@ -143,16 +149,18 @@ Not doing so `clobbers the original traceback`_, making it impossible to use Serialization ~~~~~~~~~~~~~ -To ensure the reproducibility of scientific experiments Blocks tries to make -sure that stopping and resuming training doesn't affect the final results. In -order to do so it takes a radical approach, serializing the entire training -state using pickle_. Some things cannot be pickled, so their use should be -avoided when the object will be pickled as part of the main loop: +To ensure the reproducibility of scientific experiments, Blocks and Fuel +try to make sure that stopping and resuming training doesn't affect +the final results. In order to do so they take a radical approach, +serializing the entire training state using pickle_. Some things cannot +be pickled, so their use should be avoided when the object will be +pickled as part of the main loop: * Lambda functions * Iterators and generators (use picklable_itertools_; see the short example further below) * References to methods as attributes -* Any variable that lies outside of the global namespace e.g. nested functions +* Any variable that lies outside of the global namespace, e.g., + nested functions * Dynamically generated classes (possible_ but complicated) .. _pickle: https://docs.python.org/3/library/pickle.html @@ -210,9 +218,11 @@ the error is being raised for. Unit testing ------------ -Blocks uses unit testing to ensure that individual parts of the library behave -as intended. It's also essential in ensuring that parts of the library are not -broken by proposed changes. +Blocks and Fuel use unit testing to ensure that individual parts of +the libraries behave as intended. It's also essential in ensuring that +parts of the libraries are not broken by proposed changes. Since Blocks +and Fuel were designed to be used together, it is important to make sure +changes in Fuel do not break Blocks. All new code should be accompanied by extensive unit tests. Whenever a pull request is made, the full test suite is run on `Travis CI`_, and pull requests @@ -236,8 +246,8 @@ The test suite can be executed locally using nose2_ [#]_. Writing and building documentation ---------------------------------- The :doc:`documentation guidelines ` outline how to write documentation -for Blocks, and how to build a local copy of the documentation for testing -purposes. +for Blocks and Fuel, and how to build a local copy of the documentation for +testing purposes. Internal API ------------ @@ -253,7 +263,65 @@ See the instructions at the bottom of the :doc:`installation instructions Sending a pull request ---------------------- See our :doc:`pull request workflow ` for a refresher on the -general recipe for sending a pull request to Blocks. +general recipe for sending a pull request to Blocks or Fuel.
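Returning to the serialization guideline above: a plain generator expression cannot be pickled, whereas its picklable_itertools_ counterpart can. A minimal sketch (the values are invented for illustration):

>>> import pickle
>>> from picklable_itertools import imap
>>> iterator = imap(abs, [-1, 2, -3])  # picklable replacement for a generator
>>> restored = pickle.loads(pickle.dumps(iterator))
>>> list(restored)
[1, 2, 3]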
+ +Making a new release +-------------------- +.. note:: + This section is targeted at Blocks and Fuel administrators. + +Create an initial pull request and copy the following piece of Markdown code +into its description. This pull request should only change the version number. +Then, create a pull request to Fuel which refers to the first PR. Follow the +instructions carefully and check the boxes as you go. +``` +- **Stage 1**: Make changes in `master`: + - [ ] Freeze other PRs. + + After we have agreed to initiate the process of releasing a new version, + other PRs shouldn't be merged. + - [ ] Increase the version number counter of Blocks. + + Change the version number in `blocks/__init__.py`. + - [ ] Increase the version number counter of Fuel. + + Change the version number in `fuel/version.py`. +- **Stage 2**: After the two PRs are merged to Blocks and Fuel: + - [ ] Create a pull request to merge `master` into `stable`. + + Add a link to the initial PR so that it does not get lost among the numerous + pull requests. + - [ ] Create a pull request to Fuel. + + This will be a corresponding PR to Fuel which merges its `master` into + `stable`. Add a link to the initial PR. + - [ ] Check the Travis CI build log *on both of the pull requests merging + `master` into `stable`*. + + Read the Travis CI messages carefully and check that they test the + right version. + - [ ] Check the Theano version. + + The `req*.txt` files should refer to the latest development Theano version + which is known not to have bugs. + - [ ] Check the Fuel version in `req*.txt` files. + + We should reference the stable version of Fuel. It can be seen + in the Travis CI output. + - [ ] Merge Fuel pull request. + - [ ] Merge this pull request. +- **Stage 3**: After the PRs are merged: + - [ ] Wait for the build to pass. + - [ ] Check the documentation build at ReadTheDocs. + - [ ] Double-check that the version corresponds to `__version__`. + - [ ] Create a release of Fuel by going to the + [releases page](https://github.com/mila-udem/fuel/releases) and + clicking "Draft new release". + - [ ] Create a release of Blocks by going to the + [releases page](https://github.com/mila-udem/blocks/releases) and + clicking "Draft new release". + +``` .. toctree:: :hidden: diff --git a/docs/rnn.rst b/docs/rnn.rst index 508a43fe..5bebc6a8 100644 --- a/docs/rnn.rst +++ b/docs/rnn.rst @@ -63,8 +63,8 @@ receives (figure below). x_1; x_2; x_3; node [shape=plaintext]; - h_0 [label="(0, 0, 0)"]; h_1 [label="(1, 1, 1)"]; - h_2 [label="(2, 2, 2)"]; h_3 [label="(3, 3, 3)"]; + h_0 [label="(0, 0, 0)"]; h_1 [label="(2, 2, 2)"]; + h_2 [label="(4, 4, 4)"]; h_3 [label="(6, 6, 6)"]; node [shape=diamond,regular=1,label="+"]; plus_1; plus_2; plus_3; @@ -186,21 +186,23 @@ figure below).
node [shape=plaintext]; h1_0 [label="(0, 0, 0)"]; h1_1 [label="(1, 1, 1)"]; - h1_2 [label="(4, 4, 4)"]; h1_3 [label="(12, 12, 12)"]; + h1_2 [label="(3, 3, 3)"]; h1_3 [label="(8, 8, 8)"]; h2_0 [label="(0, 0, 0)"]; h2_1 [label="(1, 1, 1)"]; - h2_2 [label="(3, 3, 3)"]; h2_3 [label="(8, 8, 8)"]; + h2_2 [label="(4, 4, 4)"]; h2_3 [label="(12, 12, 12)"]; node [shape=diamond,regular=1,label="+"]; plus_1_1; plus_1_2; plus_1_3; plus_2_1; plus_2_2; plus_2_3; x_1 -> plus_1_1; x_2 -> plus_1_2; x_3 -> plus_1_3; h1_0 -> plus_1_1 -> h1_1 -> plus_1_2 -> h1_2 -> plus_1_3 -> h1_3; + plus_1_1 -> plus_2_1; plus_1_2 -> plus_2_2; plus_1_3 -> plus_2_3; h2_0 -> plus_2_1 -> h2_1 -> plus_2_2 -> h2_2 -> plus_2_3 -> h2_3; h2_0 -> plus_1_1; h2_1 -> plus_1_2; h2_2 -> plus_1_3; + + edge [style=invis]; - h2_0 -> h1_0; h2_1 -> h1_1; h2_2 -> h1_2; h2_3 -> h1_3; - plus_2_1 -> plus_1_1; plus_2_2 -> plus_1_2; plus_2_3 -> plus_1_3; + h2_0 -> h1_0; h2_1 -> h1_1; h2_2 -> h1_2; { rank=source; h2_0, h2_1, h2_2, h2_3, plus_2_1, plus_2_2, plus_2_3 } { rank=same; h1_0, h1_1, h1_2, h1_3, plus_1_1, plus_1_2, plus_1_3 } diff --git a/docs/serialization.rst b/docs/serialization.rst index 8e6adbf5..6582b977 100644 --- a/docs/serialization.rst +++ b/docs/serialization.rst @@ -54,7 +54,7 @@ Parameter saving This is why Blocks intercepts the pickling of all Theano shared variables (which includes the parameters), and stores them as separate NPY_ files. The resulting -file is a ZIP arcive that contains the pickled main loop as well as a collection +file is a ZIP archive that contains the pickled main loop as well as a collection of NumPy arrays. The NumPy arrays (and hence parameters) in the ZIP file can be read, across platforms, using the :func:`numpy.load` function, making it possible to inspect and load parameter values, even if the unpickling of the diff --git a/docs/setup.rst b/docs/setup.rst index a062cad1..e0bd7ad9 100644 --- a/docs/setup.rst +++ b/docs/setup.rst @@ -1,9 +1,9 @@ Installation ============ -The easiest way to install Blocks using the Python package manager pip. Blocks -isn't listed yet on the Python Package Index (PyPI), so you will have to grab it -directly from GitHub. +The easiest way to install Blocks is using the Python package manager +``pip``. Blocks isn't listed yet on the Python Package Index (PyPI), so +you will have to grab it directly from GitHub. .. code-block:: bash @@ -60,6 +60,8 @@ Blocks' requirements are Bokeh_ is an optional requirement for if you want to use live plotting of your training progress (part of ``blocks-extras_``). +nose2_ is an optional requirement, used to run the tests. + We develop using the bleeding-edge version of Theano, so be sure to follow the `relevant installation instructions`_ to make sure that your Theano version is up to date if you didn't install it through Blocks. @@ -105,4 +107,3 @@ Documentation If you want to build a local copy of the documentation, follow the instructions at the :doc:`documentation development guidelines `. - diff --git a/docs/tutorial.rst b/docs/tutorial.rst index ff0b6b74..7237bb44 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -104,7 +104,7 @@ the parameters by adding a :math:`L2`-regularization term (also known as To get the weights from our model, we will use Blocks' annotation features (read more about them in the :doc:`cg` tutorial). 
->>> from blocks.bricks import WEIGHT +>>> from blocks.roles import WEIGHT >>> from blocks.graph import ComputationGraph >>> from blocks.filter import VariableFilter >>> cg = ComputationGraph(cost) diff --git a/req-rtd.txt b/req-rtd.txt index 3d8af082..eeec19ea 100644 --- a/req-rtd.txt +++ b/req-rtd.txt @@ -1,5 +1,5 @@ picklable-itertools==0.1.1 -progressbar2==2.7.3 +progressbar2==3.6.0 pyyaml==3.11 six==1.9.0 toolz==0.7.2 diff --git a/req-travis-pip.txt b/req-travis-pip.txt index 0cce6731..f0ed3514 100644 --- a/req-travis-pip.txt +++ b/req-travis-pip.txt @@ -1,7 +1,7 @@ coveralls==1.0 nose2[coverage-plugin]==0.5.0 picklable-itertools==0.1.1 -progressbar2==2.7.3 +progressbar2==3.6.0 --allow-external theano --allow-unverified theano diff --git a/requirements.txt b/requirements.txt index 01b04615..609caa77 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,13 +1,13 @@ numpy==1.9.3 picklable-itertools==0.1.1 -progressbar2==2.7.3 +progressbar2==3.6.0 pyyaml==3.11 six==1.9.0 toolz==0.7.2 --allow-external theano --allow-unverified theano -git+https://github.com/Theano/Theano.git@8d3a67b73fda49350d9944c9a24fc9660131861c#egg=theano +git+https://github.com/Theano/Theano.git@0aa5ff77273f45a1566013872732ea78675b388d#egg=theano --allow-external fuel --allow-unverified fuel diff --git a/setup.py b/setup.py index a8e3638a..065e4666 100644 --- a/setup.py +++ b/setup.py @@ -10,9 +10,14 @@ pass long_description = 'Blocks\n' + f.read().strip() +exec_results = {} +with open(path.join(path.dirname(__file__), 'blocks/version.py')) as file_: + exec(file_.read(), exec_results) +version = exec_results['version'] + setup( name='blocks', - version=blocks.__version__, # PEP 440 compliant + version=version, description='A Theano framework for building and training neural networks', long_description=long_description, url='https://github.com/mila-udem/blocks', diff --git a/tests/algorithms/test_algorithms.py b/tests/algorithms/test_algorithms.py index 07339955..4d26d56d 100644 --- a/tests/algorithms/test_algorithms.py +++ b/tests/algorithms/test_algorithms.py @@ -10,7 +10,27 @@ CompositeRule, Scale, StepRule, BasicMomentum, Momentum, AdaDelta, BasicRMSProp, RMSProp, Adam, AdaGrad, RemoveNotFinite, Restrict) -from blocks.utils import shared_floatx +from blocks.utils import shared_floatx, shared_floatx_zeros + + +def verify_broadcastable_handling(step_rule): + def check(param): + grad = tensor.grad(param.sum(), wrt=param) + step, _ = step_rule.compute_steps(OrderedDict([(param, grad)])) + assert step[param].broadcastable == grad.broadcastable + + check(shared_floatx_zeros((5, 6, 1, 5), + broadcastable=(False, False, True, False))) + check(shared_floatx_zeros((2, 1, 3), + broadcastable=(False, True, False))) + check(shared_floatx_zeros((3, 4, 1), + broadcastable=(False, False, True))) + check(shared_floatx_zeros((1, 9, 6), + broadcastable=(True, False, False))) + check(shared_floatx_zeros((1, 1, 1), + broadcastable=(True, True, True))) + check(shared_floatx_zeros((1, 5, 1), + broadcastable=(True, False, True))) def test_gradient_descent(): @@ -69,6 +89,10 @@ def test_basic_momentum(): assert_allclose(f()[0], [10.5, 14.]) +def test_basic_momentum_broadcastable(): + verify_broadcastable_handling(BasicMomentum(0.5)) + + def test_momentum(): a = shared_floatx([3, 4]) cost = (a ** 2).sum() @@ -80,6 +104,10 @@ def test_momentum(): assert_allclose(f()[0], [1.05, 1.4]) +def test_momentum_broadcastable(): + verify_broadcastable_handling(Momentum(0.5)) + + def test_adadelta(): a = shared_floatx([3, 4]) cost = (a ** 2).sum() 
@@ -110,6 +138,10 @@ def test_basicrmsprop(): assert_allclose(f()[0], [0.6172134, 0.64699664]) +def test_basicrmsprop_broadcastable(): + verify_broadcastable_handling(BasicRMSProp(0.5, 1e5)) + + def test_basicrmsprop_max_scaling(): a = shared_floatx([1e-6, 1e-6]) cost = (a ** 2).sum() @@ -143,6 +175,10 @@ def test_rmsprop(): assert_allclose(f()[0], [0.06172134, 0.064699664]) +def test_rmsprop_broadcastable(): + verify_broadcastable_handling(RMSProp(0.1, 0.5, 1e5)) + + def test_step_clipping(): rule1 = StepClipping(4) rule2 = StepClipping(5) @@ -156,6 +192,10 @@ def test_step_clipping(): assert_allclose(clipped2[1].eval(), 4.0) +def test_step_clipping_broadcastable(): + verify_broadcastable_handling(StepClipping(0.4)) + + def test_variable_clipping(): # Test simple variable clipping with no axis. rule1 = VariableClipping(5) @@ -208,6 +248,10 @@ def test_variable_clipping(): assert_raises(ValueError, VariableClipping, 50, axis=(0, 0)) +def test_variable_clipping_broadcastable(): + verify_broadcastable_handling(VariableClipping(1)) + + def test_composite_rule(): rule = CompositeRule([StepClipping(4), Scale(0.1)]) gradients = {0: shared_floatx(3.0), 1: shared_floatx(4.0)} @@ -242,6 +286,10 @@ def test_adam(): assert_allclose(f()[0], [0.00178724, 0.0018223], rtol=rtol) +def test_adam_broadcastable(): + verify_broadcastable_handling(Adam()) + + def test_adagrad(): a = shared_floatx([3, 4]) cost = (a ** 2).sum() @@ -257,6 +305,10 @@ def test_adagrad(): assert_allclose(f()[0], [0.00053452, 0.0005747], rtol=rtol) +def test_adagrad_broadcastable(): + verify_broadcastable_handling(AdaGrad()) + + def test_remove_not_finite(): rule1 = RemoveNotFinite(0.1) rule2 = RemoveNotFinite() @@ -272,6 +324,11 @@ def test_remove_not_finite(): assert_allclose(rval2[2].eval(), 0.0) +def test_remove_not_finite_broadcastable(): + verify_broadcastable_handling(RemoveNotFinite()) + verify_broadcastable_handling(RemoveNotFinite(0.1)) + + class DummyUpdatesStepRule(StepRule): def compute_step(self, parameter, previous_step): return previous_step + 2, [(parameter * 10, parameter * 100)] diff --git a/tests/bricks/test_bn.py b/tests/bricks/test_bn.py new file mode 100644 index 00000000..dc853b84 --- /dev/null +++ b/tests/bricks/test_bn.py @@ -0,0 +1,306 @@ +import collections +import theano +from theano import tensor +import numpy +from numpy.testing import assert_raises, assert_allclose, assert_equal +from blocks.bricks import Tanh, Sequence +from blocks.bricks import (BatchNormalization, SpatialBatchNormalization, + BatchNormalizedMLP) +from blocks.bricks.conv import (Convolutional, ConvolutionalSequence, + MaxPooling, AveragePooling) +from blocks.initialization import Constant +from blocks.graph import (ComputationGraph, batch_normalization, + get_batch_normalization_updates) + + +def random_unif(rng, dim, low=1, high=10): + """Generate some floatX uniform random numbers.""" + return (rng.uniform(low, high, size=dim) + .astype(theano.config.floatX)) + + +def test_batch_normalization_allocation_initialization(): + """Sanity check allocation & initialization of BN bricks.""" + def check(input_dim, expected_shape, broadcastable=None, + conserve_memory=True): + bn = BatchNormalization(input_dim=input_dim, + broadcastable=broadcastable, + conserve_memory=conserve_memory) + if broadcastable is None: + if not isinstance(input_dim, collections.Sequence): + b_input_dim = (input_dim,) + else: + b_input_dim = input_dim + input_broadcastable = tuple(False for _ in range(len(b_input_dim))) + else: + input_broadcastable = broadcastable 
+ bn.allocate() + assert conserve_memory == bn.conserve_memory + assert input_dim == bn.input_dim + assert bn.broadcastable == broadcastable + assert bn.scale.broadcastable == input_broadcastable + assert bn.shift.broadcastable == input_broadcastable + assert bn.population_mean.broadcastable == input_broadcastable + assert bn.population_stdev.broadcastable == input_broadcastable + assert_allclose(bn.population_mean.get_value(borrow=True), 0.) + assert_allclose(bn.population_stdev.get_value(borrow=True), 1.) + assert_equal(bn.scale.get_value(borrow=True).shape, expected_shape) + assert_equal(bn.shift.get_value(borrow=True).shape, expected_shape) + assert_equal(bn.population_mean.get_value(borrow=True).shape, + expected_shape) + assert_equal(bn.population_stdev.get_value(borrow=True).shape, + expected_shape) + assert numpy.isnan(bn.shift.get_value(borrow=True)).all() + assert numpy.isnan(bn.scale.get_value(borrow=True)).all() + bn.initialize() + assert_allclose(bn.shift.get_value(borrow=True), 0.) + assert_allclose(bn.scale.get_value(borrow=True), 1.) + + yield check, 5, (5,) + yield check, (6, 7, 9), (6, 7, 9), (False, False, False) + yield check, (7, 4, 3), (1, 4, 3), (True, False, False) + yield check, (9, 3, 6), (9, 1, 1), (False, True, True) + yield check, (7, 4, 5), (7, 1, 5), (False, True, False), False + + +def apply_setup(input_dim, broadcastable, conserve_memory): + """Common setup code.""" + bn = BatchNormalization(input_dim, broadcastable, conserve_memory, + epsilon=1e-4) + bn.initialize() + b_len = (len(input_dim) if isinstance(input_dim, collections.Sequence) + else 1) + x = tensor.TensorType(theano.config.floatX, + [False] * (b_len + 1))() + return bn, x + + +def test_batch_normalization_inference_apply(): + """Test that BatchNormalization.apply works in inference mode.""" + def check(input_dim, variable_dim, broadcastable=None, + conserve_memory=True): + bn, x = apply_setup(input_dim, broadcastable, conserve_memory) + y = bn.apply(x) + rng = numpy.random.RandomState((2015, 12, 16)) + input_ = random_unif(rng, + (9,) + + (input_dim + if isinstance(input_dim, collections.Sequence) + else (input_dim,))) + + # Upon initialization, should be just the identity function. + assert_allclose(y.eval({x: input_}), input_, rtol=1e-4) + + # Test population mean gets subtracted. + pop_mean = random_unif(rng, variable_dim) + bn.population_mean.set_value(pop_mean) + assert_allclose(y.eval({x: input_}), input_ - pop_mean, rtol=1e-4) + + # Test population stdev is divided out. + pop_stdev = random_unif(rng, variable_dim) + bn.population_stdev.set_value(pop_stdev) + assert_allclose(y.eval({x: input_}), (input_ - pop_mean) / pop_stdev, + rtol=1e-4) + + # Test learned scale is applied. + gamma = random_unif(rng, variable_dim) + bn.scale.set_value(gamma) + assert_allclose(y.eval({x: input_}), + (input_ - pop_mean) * (gamma / pop_stdev), + rtol=1e-4) + + # Test learned offset is applied. + beta = random_unif(rng, variable_dim) + bn.shift.set_value(beta) + assert_allclose(y.eval({x: input_}), + (input_ - pop_mean) * (gamma / pop_stdev) + beta, + rtol=1e-4) + + yield check, 9, (9,) + yield check, (5, 4), (5, 4), None, False + yield check, (2, 9, 7), (2, 1, 1), (False, True, True) + + +def test_batch_normalization_train_apply(): + def check(input_dim, variable_dim, broadcastable=None, + conserve_memory=True): + # Default epsilon value. 
+ epsilon = numpy.cast[theano.config.floatX](1e-4) + bn, x = apply_setup(input_dim, broadcastable, conserve_memory) + with bn: + y_hat = bn.apply(x) + + rng = numpy.random.RandomState((2015, 12, 16)) + input_ = random_unif(rng, (9,) + + (input_dim + if isinstance(input_dim, collections.Sequence) + else (input_dim,))) + # i + 1 because the axes are all shifted one over when the batch + # axis is added. + axes = (0,) + tuple((i + 1) for i, b in + enumerate(bn.population_mean.broadcastable) if b) + + # NumPy implementation of the batch-normalization transform. + def normalize(x): + return ((x - x.mean(axis=axes, keepdims=True, + dtype=theano.config.floatX)) / + numpy.sqrt(numpy.var(x, axis=axes, keepdims=True, + dtype=theano.config.floatX) + + epsilon)) + + # Check that batch norm is doing what it should be. + assert_allclose(y_hat.eval({x: input_}), normalize(input_), + atol=(1e-3 if theano.config.floatX == 'float32' + else 1e-7)) + + # Check that the scale parameters are still getting applied. + gamma = random_unif(rng, variable_dim) + bn.scale.set_value(gamma) + assert_allclose(y_hat.eval({x: input_}), normalize(input_) * gamma, + atol=(1e-3 if theano.config.floatX == 'float32' + else 1e-7)) + + beta = random_unif(rng, variable_dim) + bn.shift.set_value(beta) + # Check that the shift parameters are still getting applied. + assert_allclose(y_hat.eval({x: input_}), + normalize(input_) * gamma + beta, + atol=(1e-3 if theano.config.floatX == 'float32' + else 1e-7)) + + # Double check that setting the population parameters doesn't + # affect anything. + bn.population_mean.set_value(numpy.nan * + bn.population_mean.get_value()) + bn.population_stdev.set_value(numpy.nan * + bn.population_mean.get_value()) + assert_allclose(y_hat.eval({x: input_}), + normalize(input_) * gamma + beta, + atol=(1e-3 if theano.config.floatX == 'float32' + else 1e-7)) + + yield check, 9, (9,) + yield check, (5, 4), (5, 4), None, False + yield check, (2, 9, 7), (2, 1, 1), (False, True, True) + + +def test_batch_normalization_image_size_setter(): + """Test that setting image_size on a BatchNormalization works.""" + bn = BatchNormalization() + bn.image_size = (5, 4) + assert bn.input_dim == (None, 5, 4) + bn.image_size = (4, 5) + assert bn.input_dim == (None, 4, 5) + + +def test_spatial_batch_normalization(): + """Smoke test for SpatialBatchNormalization.""" + def check(*input_dim): + sbn = SpatialBatchNormalization(input_dim) + sbn.initialize() + x = theano.tensor.TensorType(theano.config.floatX, + [False] * (len(input_dim) + 1))() + y = sbn.apply(x) + rng = numpy.random.RandomState((2015, 12, 17)) + input_ = random_unif(rng, (11,) + input_dim) + assert_equal(y.eval({x: input_}), input_) + + # Work around a stupid bug in nose2 by passing as *args. + yield check, 2, 3, 5 + yield check, 5, 3, 2, 3 + yield check, 1, 11 + + +def test_raise_exception_spatial(): + """Test that SpatialBatchNormalization raises an expected exception.""" + # Work around a stupid bug in nose2 that unpacks the tuple into + # separate arguments. + sbn1 = SpatialBatchNormalization((5,)) + yield assert_raises, (ValueError, sbn1.allocate) + sbn2 = SpatialBatchNormalization(3) + yield assert_raises, (ValueError, sbn2.allocate) + + def do_not_fail(*input_dim): + try: + sbn = SpatialBatchNormalization(input_dim) + sbn.allocate() + except ValueError: + assert False + + # Work around a stupid bug in nose2 by passing as *args. 
+ yield do_not_fail, 5, 4, 3 + yield do_not_fail, 7, 6 + yield do_not_fail, 3, 9, 2, 3 + + +def test_batch_normalization_inside_convolutional_sequence(): + """Test that BN bricks work in ConvolutionalSequences.""" + conv_seq = ConvolutionalSequence( + [Convolutional(filter_size=(3, 3), num_filters=4), + BatchNormalization(broadcastable=(False, True, True)), + AveragePooling(pooling_size=(2, 2)), + BatchNormalization(broadcastable=(False, False, False)), + MaxPooling(pooling_size=(2, 2), step=(1, 1))], + weights_init=Constant(1.), + biases_init=Constant(2.), + image_size=(10, 8), num_channels=9) + + conv_seq_no_bn = ConvolutionalSequence( + [Convolutional(filter_size=(3, 3), num_filters=4), + AveragePooling(pooling_size=(2, 2)), + MaxPooling(pooling_size=(2, 2), step=(1, 1))], + weights_init=Constant(1.), + biases_init=Constant(2.), + image_size=(10, 8), num_channels=9) + + conv_seq.initialize() + conv_seq_no_bn.initialize() + rng = numpy.random.RandomState((2015, 12, 17)) + input_ = random_unif(rng, (2, 9, 10, 8)) + + x = theano.tensor.tensor4() + ybn = conv_seq.apply(x) + y = conv_seq_no_bn.apply(x) + yield (assert_equal, ybn.eval({x: input_}), y.eval({x: input_})) + + std = conv_seq.children[-2].population_stdev + std.set_value(3 * std.get_value(borrow=True)) + yield (assert_equal, ybn.eval({x: input_}), y.eval({x: input_}) / 3.) + + +def test_batch_normalized_mlp_construction(): + """Test that BatchNormalizedMLP performs construction correctly.""" + mlp = BatchNormalizedMLP([Tanh(), Tanh()], [5, 7, 9]) + assert all(isinstance(a, Sequence) for a in mlp.activations) + assert all(isinstance(a.children[0], BatchNormalization) + for a in mlp.activations) + assert all(isinstance(a.children[1], Tanh) + for a in mlp.activations) + + +def test_batch_normalized_mlp_allocation(): + """Test that BatchNormalizedMLP performs allocation correctly.""" + mlp = BatchNormalizedMLP([Tanh(), Tanh()], [5, 7, 9]) + mlp.allocate() + assert mlp.activations[0].children[0].input_dim == 7 + assert mlp.activations[1].children[0].input_dim == 9 + assert not any(l.use_bias for l in mlp.linear_transformations) + + +def test_batch_normalized_mlp_transformed(): + """Smoke test that a graph involving a BatchNormalizedMLP transforms.""" + x = tensor.matrix('x') + mlp = BatchNormalizedMLP([Tanh(), Tanh()], [5, 7, 9]) + with batch_normalization(mlp): + y = mlp.apply(x) + assert len(get_batch_normalization_updates(ComputationGraph([y]))) == 4 + + +def test_batch_normalized_mlp_conserve_memory_propagated(): + """Test that setting conserve_memory on a BatchNormalizedMLP works.""" + mlp = BatchNormalizedMLP([Tanh(), Tanh()], [5, 7, 9], + conserve_memory=False) + assert not any(act.children[0].conserve_memory for act in mlp.activations) + mlp.conserve_memory = True + assert mlp.conserve_memory + assert all(act.children[0].conserve_memory for act in mlp.activations) diff --git a/tests/bricks/test_bricks.py b/tests/bricks/test_bricks.py index 6287f1d7..86d93acb 100644 --- a/tests/bricks/test_bricks.py +++ b/tests/bricks/test_bricks.py @@ -3,6 +3,7 @@ import theano from numpy.testing import assert_allclose, assert_raises from theano import tensor +from six.moves import cPickle from blocks.bricks import (Identity, Linear, Maxout, LinearMaxout, MLP, Tanh, Sequence, Random, Logistic, Softplus, Softmax) @@ -249,6 +250,10 @@ def test_application(): assert TestBrick.delegated_apply.inputs == ['w'] +def test_application_serialization(): + assert id(cPickle.loads(cPickle.dumps(Linear.apply))) == id(Linear.apply) + + def test_apply(): brick 
= TestBrick(0) assert TestBrick.apply(brick, [0]) == [0, 1] diff --git a/tests/bricks/test_conv.py b/tests/bricks/test_conv.py index 554bed01..8565ad5e 100644 --- a/tests/bricks/test_conv.py +++ b/tests/bricks/test_conv.py @@ -1,14 +1,16 @@ +import pickle import numpy from nose.tools import assert_raises_regexp import theano -from numpy.testing import assert_allclose, assert_equal +from numpy.testing import assert_allclose from theano import tensor from theano import function -from blocks.bricks import Rectifier -from blocks.bricks.conv import (Convolutional, ConvolutionalLayer, MaxPooling, - ConvolutionalActivation, ConvolutionalSequence) +from blocks.bricks import Rectifier, Tanh +from blocks.bricks.conv import (Convolutional, ConvolutionalTranspose, + MaxPooling, AveragePooling, + ConvolutionalSequence) from blocks.initialization import Constant from blocks.graph import ComputationGraph @@ -36,23 +38,53 @@ def test_convolutional(): assert conv.get_dim('output') == (num_filters, 15, 11) +def test_convolutional_transpose(): + x = tensor.tensor4('x') + num_channels = 4 + num_filters = 3 + image_size = (8, 6) + original_image_size = (17, 13) + batch_size = 5 + filter_size = (3, 3) + step = (2, 2) + conv = ConvolutionalTranspose( + original_image_size, filter_size, num_filters, num_channels, step=step, + image_size=image_size, weights_init=Constant(1.), + biases_init=Constant(5.)) + conv.initialize() + y = conv.apply(x) + func = function([x], y) + + x_val = numpy.ones((batch_size, num_channels) + image_size, + dtype=theano.config.floatX) + expected_value = num_channels * numpy.ones( + (batch_size, num_filters) + original_image_size) + expected_value[:, :, 2:-2:2, :] += num_channels + expected_value[:, :, :, 2:-2:2] += num_channels + expected_value[:, :, 2:-2:2, 2:-2:2] += num_channels + assert_allclose(func(x_val), expected_value + 5) + + def test_border_mode_not_pushed(): layers = [Convolutional(border_mode='full'), - ConvolutionalActivation(Rectifier().apply), - ConvolutionalActivation(Rectifier().apply, border_mode='valid'), - ConvolutionalLayer(Rectifier().apply, border_mode='full')] + Convolutional(), + Rectifier(), + Convolutional(border_mode='valid'), + Rectifier(), + Convolutional(border_mode='full'), + Rectifier()] stack = ConvolutionalSequence(layers) stack.push_allocation_config() assert stack.children[0].border_mode == 'full' assert stack.children[1].border_mode == 'valid' - assert stack.children[2].border_mode == 'valid' - assert stack.children[3].border_mode == 'full' + assert stack.children[3].border_mode == 'valid' + assert stack.children[5].border_mode == 'full' stack2 = ConvolutionalSequence(layers, border_mode='full') stack2.push_allocation_config() assert stack2.children[0].border_mode == 'full' assert stack2.children[1].border_mode == 'full' - assert stack2.children[2].border_mode == 'full' assert stack2.children[3].border_mode == 'full' + assert stack2.children[5].border_mode == 'full' def test_no_input_size(): @@ -117,40 +149,108 @@ def test_max_pooling(): dtype=theano.config.floatX) assert_allclose(func(x_val), numpy.ones((batch_size, num_channels, - x_size / pool_size + 1, - y_size / pool_size + 1))) + x_size / pool_size, + y_size / pool_size))) pool.input_dim = (x_size, y_size) pool.get_dim('output') == (num_channels, x_size / pool_size + 1, y_size / pool_size + 1) -def test_convolutional_layer(): +def test_max_pooling_ignore_border_true(): x = tensor.tensor4('x') - num_channels = 4 - batch_size = 5 - pooling_size = 3 - num_filters = 3 - filter_size = (3, 3) - 
activation = Rectifier().apply - - conv = ConvolutionalLayer(activation, filter_size, num_filters, - (pooling_size, pooling_size), - num_channels, image_size=(17, 13), - batch_size=batch_size, - weights_init=Constant(1.), - biases_init=Constant(5.)) - conv.initialize() + brick = MaxPooling((3, 4), ignore_border=True) + y = brick.apply(x) + out = y.eval({x: numpy.zeros((8, 3, 10, 13), dtype=theano.config.floatX)}) + assert out.shape == (8, 3, 3, 3) - y = conv.apply(x) - func = function([x], y) - x_val = numpy.ones((batch_size, num_channels, 17, 13), - dtype=theano.config.floatX) - assert_allclose(func(x_val), numpy.prod(filter_size) * num_channels * - numpy.ones((batch_size, num_filters, 5, 4)) + 5) +def test_max_pooling_ignore_border_false(): + x = tensor.tensor4('x') + brick = MaxPooling((5, 7), ignore_border=False) + y = brick.apply(x) + out = y.eval({x: numpy.zeros((4, 6, 12, 15), dtype=theano.config.floatX)}) + assert out.shape == (4, 6, 3, 3) + - assert_equal(conv.convolution.batch_size, batch_size) - assert_equal(conv.pooling.batch_size, batch_size) +def test_max_pooling_padding(): + x = tensor.tensor4('x') + brick = MaxPooling((6, 2), padding=(3, 1), ignore_border=True) + y = brick.apply(x) + out = y.eval({x: numpy.zeros((2, 3, 6, 10), dtype=theano.config.floatX)}) + assert out.shape == (2, 3, 2, 6) + + +def test_max_pooling_old_pickle(): + brick = MaxPooling((3, 4)) + brick.allocate() + # Simulate old pickle, before #899. + del brick.ignore_border + del brick.mode + del brick.padding + # Pickle in this broken state and re-load. + broken_pickled = pickle.dumps(brick) + loaded = pickle.loads(broken_pickled) + # Same shape, same step. + assert brick.pooling_size == loaded.pooling_size + assert brick.step == loaded.step + # Check that the new attributes were indeed added. + assert hasattr(loaded, 'padding') and loaded.padding == (0, 0) + assert hasattr(loaded, 'mode') and loaded.mode == 'max' + assert hasattr(loaded, 'ignore_border') and not loaded.ignore_border + try: + loaded.apply(tensor.tensor4()) + except Exception: + raise AssertionError("failed to apply on unpickled MaxPooling") + # Make sure we're not overriding these attributes wrongly. 
+ new_brick = MaxPooling((4, 3), padding=(2, 1)) + new_brick_unpickled = pickle.loads(pickle.dumps(new_brick)) + assert new_brick_unpickled.padding == (2, 1) + assert new_brick_unpickled.ignore_border + + +def test_average_pooling(): + x = tensor.tensor4('x') + brick = AveragePooling((2, 2)) + y = brick.apply(x) + tmp = numpy.arange(16, dtype=theano.config.floatX).reshape(1, 1, 4, 4) + x_ = numpy.tile(tmp, [2, 3, 1, 1]) + out = y.eval({x: x_}) + assert_allclose( + out - numpy.array([[10 / 4., 18 / 4.], [42 / 4., 50 / 4.]]), + numpy.zeros_like(out)) + + +def test_average_pooling_inc_padding(): + x = tensor.tensor4('x') + brick = AveragePooling((2, 2), ignore_border=True, padding=(1, 1), + include_padding=True) + y = brick.apply(x) + output = y.eval({x: 3 * numpy.ones((1, 1, 2, 2), + dtype=theano.config.floatX)}) + expected_out = numpy.array([0.75, 0.75, 0.75, 0.75]).reshape(1, 1, 2, 2) + assert_allclose(expected_out, output) + + +def test_average_pooling_exc_padding(): + x = tensor.tensor4('x') + brick = AveragePooling((2, 2), ignore_border=True, padding=(1, 1), + include_padding=False) + y = brick.apply(x) + x_ = 3 * numpy.ones((1, 1, 2, 2), dtype=theano.config.floatX) + output = y.eval({x: x_}) + assert_allclose(x_, output) + + +def test_pooling_works_in_convolutional_sequence(): + x = tensor.tensor4('x') + brick = ConvolutionalSequence([AveragePooling((2, 2), step=(2, 2)), + MaxPooling((4, 4), step=(2, 2), + ignore_border=True)], + image_size=(16, 32), num_channels=3) + brick.allocate() + y = brick.apply(x) + out = y.eval({x: numpy.empty((2, 3, 16, 32), dtype=theano.config.floatX)}) + assert out.shape == (2, 3, 3, 7) def test_convolutional_sequence(): @@ -158,52 +258,72 @@ def test_convolutional_sequence(): num_channels = 4 pooling_size = 3 batch_size = 5 - activation = Rectifier().apply + act = Rectifier() - conv = ConvolutionalLayer(activation, (3, 3), 5, - (pooling_size, pooling_size), - weights_init=Constant(1.), - biases_init=Constant(5.)) - conv2 = ConvolutionalActivation(activation, (2, 2), 4, - weights_init=Constant(1.)) + conv = Convolutional((3, 3), 5, weights_init=Constant(1.), + biases_init=Constant(5.)) + pooling = MaxPooling(pooling_size=(pooling_size, pooling_size)) + conv2 = Convolutional((2, 2), 4, weights_init=Constant(1.)) - seq = ConvolutionalSequence([conv, conv2], num_channels, + seq = ConvolutionalSequence([conv, act, pooling, conv2, act], num_channels, image_size=(17, 13)) seq.push_allocation_config() assert conv.num_channels == 4 assert conv2.num_channels == 5 - conv2.convolution.use_bias = False + conv2.use_bias = False y = seq.apply(x) seq.initialize() func = function([x], y) x_val = numpy.ones((batch_size, 4, 17, 13), dtype=theano.config.floatX) - y_val = (numpy.ones((batch_size, 4, 4, 3)) * + y_val = (numpy.ones((batch_size, 4, 4, 2)) * (9 * 4 + 5) * 4 * 5) assert_allclose(func(x_val), y_val) -def test_convolutional_activation_use_bias(): - act = ConvolutionalActivation(Rectifier().apply, (3, 3), 5, 4, - image_size=(9, 9), use_bias=False) - act.allocate() - assert not act.convolution.use_bias - assert len(ComputationGraph([act.apply(tensor.tensor4())]).parameters) == 1 - - -def test_convolutional_layer_use_bias(): - act = ConvolutionalLayer(Rectifier().apply, (3, 3), 5, (2, 2), 6, - image_size=(9, 9), use_bias=False) - act.allocate() - assert not act.convolution.use_bias - assert len(ComputationGraph([act.apply(tensor.tensor4())]).parameters) == 1 +def test_convolutional_sequence_with_raw_activation(): + seq = ConvolutionalSequence([Rectifier()], 
num_channels=4, + image_size=(20, 14)) + input_ = (((numpy.arange(2 * 4 * 20 * 14) + .reshape((2, 4, 20, 14)) % 2) * 2 - 1) + .astype(theano.config.floatX)) + expected_ = input_ * (input_ > 0) + x = theano.tensor.tensor4() + assert_allclose(seq.apply(x).eval({x: input_}), expected_) + + +def test_convolutional_sequence_with_convolutions_raw_activation(): + seq = ConvolutionalSequence( + [Convolutional(filter_size=(3, 3), num_filters=4), + Rectifier(), + Convolutional(filter_size=(5, 5), num_filters=3, step=(2, 2)), + Tanh()], + num_channels=2, + image_size=(21, 39)) + seq.allocate() + x = theano.tensor.tensor4() + out = seq.apply(x).eval({x: numpy.ones((10, 2, 21, 39), + dtype=theano.config.floatX)}) + assert out.shape == (10, 3, 8, 17) + + +def test_convolutional_sequence_activation_get_dim(): + seq = ConvolutionalSequence([Tanh()], num_channels=9, image_size=(4, 6)) + seq.allocate() + assert seq.get_dim('output') == (9, 4, 6) + + seq = ConvolutionalSequence([Convolutional(filter_size=(7, 7), + num_filters=5, + border_mode=(1, 1)), + Tanh()], num_channels=8, image_size=(8, 11)) + seq.allocate() + assert seq.get_dim('output') == (5, 4, 7) def test_convolutional_sequence_use_bias(): cnn = ConvolutionalSequence( - [ConvolutionalActivation(activation=Rectifier().apply, - filter_size=(1, 1), num_filters=1) - for _ in range(3)], + sum([[Convolutional(filter_size=(1, 1), num_filters=1), Rectifier()] + for _ in range(3)], []), num_channels=1, image_size=(1, 1), use_bias=False) cnn.allocate() diff --git a/tests/bricks/test_lookup.py b/tests/bricks/test_lookup.py index 6ed0a6df..d4418ea3 100644 --- a/tests/bricks/test_lookup.py +++ b/tests/bricks/test_lookup.py @@ -1,5 +1,5 @@ import numpy -from numpy.testing import assert_equal +from numpy.testing import assert_equal, assert_raises import theano from theano import tensor @@ -21,3 +21,19 @@ def test_lookup_table(): desired = numpy.array([[[3, 4, 5], [6, 7, 8]], [[0, 1, 2], [9, 10, 11]]], dtype=theano.config.floatX) assert_equal(f(x_val)[0], desired) + + # Test get_dim + assert_equal(lt.get_dim(lt.apply.inputs[0]), 0) + assert_equal(lt.get_dim(lt.apply.outputs[0]), lt.dim) + assert_raises(ValueError, lt.get_dim, 'random_name') + + # Test feedforward interface + assert lt.input_dim == 0 + assert lt.output_dim == 3 + lt.output_dim = 4 + assert lt.output_dim == 4 + + def assign_input_dim(): + lt.input_dim = 11 + assert_raises(ValueError, assign_input_dim) + lt.input_dim = 0 diff --git a/tests/bricks/test_recurrent.py b/tests/bricks/test_recurrent.py index bc7801fd..f7fc45cb 100644 --- a/tests/bricks/test_recurrent.py +++ b/tests/bricks/test_recurrent.py @@ -14,7 +14,8 @@ recurrent, BaseRecurrent, GatedRecurrent, SimpleRecurrent, Bidirectional, LSTM, RecurrentStack, RECURRENTSTACK_SEPARATOR) -from blocks.initialization import Constant, IsotropicGaussian, Orthogonal +from blocks.initialization import ( + Constant, IsotropicGaussian, Orthogonal, Identity) from blocks.filter import get_application_call, VariableFilter from blocks.graph import ComputationGraph from blocks.roles import INITIAL_STATE @@ -533,6 +534,50 @@ def test(self): assert_allclose(h_simple_rev, h_bidir[::-1, ..., 3:], rtol=1e-04) +class TestBidirectionalStack(unittest.TestCase): + def setUp(self): + prototype = SimpleRecurrent(dim=3, activation=Tanh()) + self.layers = [ + Bidirectional(weights_init=Orthogonal(), prototype=prototype) + for _ in range(3)] + self.stack = RecurrentStack(self.layers) + for fork in self.stack.forks: + fork.weights_init = Identity(1) + fork.biases_init = 
Constant(0) + self.stack.initialize() + + self.x_val = 0.1 * numpy.asarray( + list(itertools.permutations(range(4))), + dtype=theano.config.floatX) + self.x_val = (numpy.ones((24, 4, 3), dtype=theano.config.floatX) * + self.x_val[..., None]) + self.mask_val = numpy.ones((24, 4), dtype=theano.config.floatX) + self.mask_val[12:24, 3] = 0 + + def test_steps(self): + x = tensor.tensor3('x') + mask = tensor.matrix('mask') + + calc_stack_layers = [ + theano.function([x, mask], self.stack.apply(x, mask=mask)[i]) + for i in range(len(self.layers))] + stack_layers = [ + f(self.x_val, self.mask_val) for f in calc_stack_layers] + + h_val = self.x_val + for stack_layer_value, bidir_net in zip(stack_layers, self.layers): + calc = theano.function([x, mask], bidir_net.apply(x, mask=mask)) + simple_layer_value = calc(h_val, self.mask_val) + assert_allclose(stack_layer_value, simple_layer_value, rtol=1e-04) + h_val = simple_layer_value[..., :3] + + def test_dims(self): + self.assertEqual(self.stack.get_dim("inputs"), 3) + for i in range(len(self.layers)): + state_name = self.stack.suffix("states", i) + self.assertEqual(self.stack.get_dim(state_name), 6) + + def test_saved_inner_graph(): """Make sure that the original inner graph is saved.""" x = tensor.tensor3() diff --git a/tests/extensions/test_progressbar.py b/tests/extensions/test_progressbar.py index 4402d9db..e31f5e66 100644 --- a/tests/extensions/test_progressbar.py +++ b/tests/extensions/test_progressbar.py @@ -16,8 +16,12 @@ def setup_mainloop(extension): DataStream and a minimal model/cost to optimize. """ + # Since progressbar2 3.6.0, the `maxval` kwarg has been replaced by + # `max_value`, which has a default value of 100. If we're still using + # `maxval` by accident, this test should fail complaining that + # the progress bar has received a value out of range. 
features = [numpy.array(f, dtype=theano.config.floatX) - for f in [[1, 2], [3, 4], [5, 6]]] + for f in [[1, 2]] * 101] dataset = IterableDataset(dict(features=features)) W = shared_floatx([0, 0], name='W') diff --git a/tests/extensions/test_training.py b/tests/extensions/test_training.py index d874e40b..607aac8f 100644 --- a/tests/extensions/test_training.py +++ b/tests/extensions/test_training.py @@ -18,7 +18,7 @@ from blocks.extensions.predicates import OnLogRecord from blocks.main_loop import MainLoop from blocks.utils import shared_floatx -from blocks.utils.testing import MockMainLoop +from blocks.utils.testing import MockMainLoop, skip_if_configuration_set def test_shared_variable_modifier(): @@ -129,6 +129,8 @@ def after_batch(self, batch): def test_save_the_best(): + skip_if_configuration_set('log_backend', 'sqlite', + "Known to be flaky with SQLite log backend.") with NamedTemporaryFile(dir=config.temp_dir) as dst,\ NamedTemporaryFile(dir=config.temp_dir) as dst_best: track_cost = TrackTheBest("cost", after_epoch=False, after_batch=True) diff --git a/tests/graph/test_bn.py b/tests/graph/test_bn.py new file mode 100644 index 00000000..f6b1c8b3 --- /dev/null +++ b/tests/graph/test_bn.py @@ -0,0 +1,133 @@ +import numpy +from numpy.testing import assert_allclose +import theano +from theano import tensor + +from blocks.bricks import (BatchNormalization, Sequence, Tanh, MLP, + BatchNormalizedMLP) +from blocks.filter import get_brick +from blocks.graph import (ComputationGraph, batch_normalization, + apply_batch_normalization, + get_batch_normalization_updates) +from blocks.initialization import Constant +from blocks.roles import (has_roles, BATCH_NORM_POPULATION_MEAN, + BATCH_NORM_POPULATION_STDEV) +from blocks.utils import is_shared_variable + + +def test_batch_normalization_simple(): + x = tensor.matrix() + eps = 1e-4 + bn = BatchNormalization(input_dim=4, epsilon=eps) + bn.initialize() + with batch_normalization(bn): + y = bn.apply(x) + rng = numpy.random.RandomState((2016, 1, 18)) + x_ = rng.uniform(size=(5, 4)).astype(theano.config.floatX) + y_ = y.eval({x: x_}) + y_expected = (x_ - x_.mean(axis=0)) / numpy.sqrt(x_.var(axis=0) + eps) + assert_allclose(y_, y_expected, rtol=1e-4) + + +def test_batch_normalization_nested(): + x = tensor.tensor4() + eps = 1e-4 + r_dims = (0, 2, 3) + batch_dims = (5, 4, 3, 2) + bn = BatchNormalization(input_dim=batch_dims[1:], + broadcastable=(False, True, True), + epsilon=eps) + seq = Sequence([bn.apply, Tanh().apply]) + seq.initialize() + with batch_normalization(seq): + y = seq.apply(x) + rng = numpy.random.RandomState((2016, 1, 18)) + x_ = rng.uniform(size=batch_dims).astype(theano.config.floatX) + y_ = y.eval({x: x_}) + y_expected = numpy.tanh((x_ - x_.mean(axis=r_dims, keepdims=True)) / + numpy.sqrt(x_.var(axis=r_dims, keepdims=True) + + eps)) + assert_allclose(y_, y_expected, rtol=1e-4) + + +def test_apply_batch_normalization_nested(): + x = tensor.matrix() + eps = 1e-8 + batch_dims = (3, 9) + bn = BatchNormalization(input_dim=5, epsilon=eps) + mlp = MLP([Sequence([bn.apply, Tanh().apply])], [9, 5], + weights_init=Constant(0.4), biases_init=Constant(1)) + mlp.initialize() + y = mlp.apply(x) + cg = apply_batch_normalization(ComputationGraph([y])) + y_bn = cg.outputs[0] + rng = numpy.random.RandomState((2016, 1, 18)) + x_ = rng.uniform(size=batch_dims).astype(theano.config.floatX) + y_ = y_bn.eval({x: x_}) + W_, b_ = map(lambda s: (getattr(mlp.linear_transformations[0], s) + .get_value(borrow=True)), ['W', 'b']) + z_ = numpy.dot(x_, W_) + b_ + 
y_expected = numpy.tanh((z_ - z_.mean(axis=0)) / + numpy.sqrt(z_.var(axis=0) + eps)) + assert_allclose(y_, y_expected, rtol=1e-3) + + +class TestSimpleGetBatchNormalizationUpdates(object): + def setUp(self): + self.mlp = BatchNormalizedMLP([Tanh(), Tanh()], [5, 7, 9]) + self.x = tensor.matrix() + + def simple_assertions(self, updates, num_bricks=2, num_updates=4): + """Shared assertions for simple tests.""" + assert len(updates) == num_updates + assert all(is_shared_variable(u[0]) for u in updates) + # This order is somewhat arbitrary and implementation_dependent + means = set(u[0] for u in updates + if has_roles(u[0], [BATCH_NORM_POPULATION_MEAN])) + stdevs = set(u[0] for u in updates + if has_roles(u[0], [BATCH_NORM_POPULATION_STDEV])) + assert means.isdisjoint(stdevs) + assert len(set(get_brick(v) for v in means)) == num_bricks + assert len(set(get_brick(v) for v in stdevs)) == num_bricks + + def test_get_batch_normalization_updates(self): + """Test that get_batch_normalization_updates works as expected.""" + with batch_normalization(self.mlp): + y_bn = self.mlp.apply(self.x) + graph = ComputationGraph([y_bn]) + updates = get_batch_normalization_updates(graph) + self.simple_assertions(updates) + + def test_get_batch_normalization_updates_non_training_applications(self): + """Test updates extracton in graph with non-training apply.""" + y = self.mlp.apply(self.x) + with batch_normalization(self.mlp): + y_bn = self.mlp.apply(self.x) + graph = ComputationGraph([y_bn, y]) + updates = get_batch_normalization_updates(graph) + self.simple_assertions(updates) + + def test_get_batch_normalization_updates_no_training(self): + """Test for exception if there are no training-mode nodes.""" + y = self.mlp.apply(self.x) + graph = ComputationGraph([y]) + numpy.testing.assert_raises(ValueError, + get_batch_normalization_updates, graph) + + def test_get_batch_normalization_updates_duplicates_error(self): + """Test that we get an error by default on multiple apply.""" + with batch_normalization(self.mlp): + y = self.mlp.apply(self.x) + y2 = self.mlp.apply(self.x) + graph = ComputationGraph([y, y2]) + numpy.testing.assert_raises(ValueError, + get_batch_normalization_updates, graph) + + def test_get_batch_normalization_updates_allow_duplicates(self): + """Test get_batch_normalization_updates(allow_duplicates=True).""" + with batch_normalization(self.mlp): + y = self.mlp.apply(self.x) + y2 = self.mlp.apply(self.x) + graph = ComputationGraph([y, y2]) + updates = get_batch_normalization_updates(graph, allow_duplicates=True) + self.simple_assertions(updates, num_bricks=2, num_updates=8) diff --git a/tests/monitoring/test_aggregation.py b/tests/monitoring/test_aggregation.py index ba16e936..a4298ae3 100644 --- a/tests/monitoring/test_aggregation.py +++ b/tests/monitoring/test_aggregation.py @@ -1,6 +1,6 @@ import numpy import theano -from numpy.testing import assert_allclose +from numpy.testing import assert_allclose, assert_raises from theano import tensor from blocks import bricks @@ -14,7 +14,7 @@ from fuel.streams import DataStream from fuel.schemes import SequentialScheme -from blocks.monitoring.evaluators import DatasetEvaluator +from blocks.monitoring.evaluators import DatasetEvaluator, AggregationBuffer class TestBrick(bricks.Brick): @@ -89,3 +89,9 @@ def test_mean_aggregator(): numpy.array([8.25, 26.75], dtype=theano.config.floatX)) assert_allclose(DatasetEvaluator([z]).evaluate(data_stream)['z'], numpy.array([35], dtype=theano.config.floatX)) + + +def test_aggregation_buffer(): + x1 = 
+    x1 = tensor.matrix('x')
+    x2 = tensor.matrix('x')
+    assert_raises(ValueError, AggregationBuffer, [x1, x2])
diff --git a/tests/test_graph.py b/tests/test_graph.py
index 027640d5..0c8eb70c 100644
--- a/tests/test_graph.py
+++ b/tests/test_graph.py
@@ -8,7 +8,8 @@
 from blocks.bricks import MLP, Identity, Logistic
 from blocks.bricks.cost import SquaredError
 from blocks.filter import VariableFilter
-from blocks.graph import apply_noise, collect_parameters, ComputationGraph
+from blocks.graph import (apply_dropout, apply_noise, collect_parameters,
+                          ComputationGraph)
 from blocks.initialization import Constant
 from blocks.roles import add_role, COLLECTED, PARAMETER, AUXILIARY
 from tests.bricks.test_bricks import TestBrick
@@ -140,6 +141,36 @@ def test_apply_noise():
                     2 + MRG_RandomStreams(1).normal(tuple()).eval())
 
 
+def test_apply_dropout():
+    x = tensor.vector()
+    y = tensor.vector()
+    z = x * y
+    cg = ComputationGraph([z])
+    dropped_cg = apply_dropout(cg, [x], 0.4, seed=1)
+
+    x_ = numpy.array([5., 6., 7.], dtype=theano.config.floatX)
+    y_ = numpy.array([1., 2., 3.], dtype=theano.config.floatX)
+
+    assert_allclose(
+        dropped_cg.outputs[0].eval({x: x_, y: y_}),
+        x_ * y_ * MRG_RandomStreams(1).binomial((3,), p=0.6).eval() / 0.6)
+
+
+def test_apply_dropout_custom_divisor():
+    x = tensor.vector()
+    y = tensor.vector()
+    z = x - y
+    cg = ComputationGraph([z])
+    scaled_dropped_cg = apply_dropout(cg, [y], 0.8, seed=2, custom_divisor=2.5)
+
+    x_ = numpy.array([9., 8., 9.], dtype=theano.config.floatX)
+    y_ = numpy.array([4., 5., 6.], dtype=theano.config.floatX)
+
+    assert_allclose(
+        scaled_dropped_cg.outputs[0].eval({x: x_, y: y_}),
+        x_ - (y_ * MRG_RandomStreams(2).binomial((3,), p=0.2).eval() / 2.5))
+
+
 def test_snapshot():
     x = tensor.matrix('x')
     linear = MLP([Identity(), Identity()], [10, 10, 10],
diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py
index 418fce67..f96341b2 100644
--- a/tests/utils/test_utils.py
+++ b/tests/utils/test_utils.py
@@ -1,7 +1,8 @@
 from numpy.testing import assert_raises
 from theano import tensor
 
-from blocks.utils import check_theano_variable, unpack
+from blocks.utils import check_theano_variable, unpack, find_bricks
+from blocks.bricks import MLP, Sequence, Tanh, Identity, Logistic
 
 
 def test_unpack():
@@ -20,3 +21,60 @@ def test_check_theano_variable():
                   tensor.vector(), 2, 'float')
     assert_raises(ValueError, check_theano_variable,
                   tensor.vector(), 1, 'int')
+
+
+class TestFindBricks(object):
+    def setUp(self):
+        self.mlp = MLP([Sequence([Identity(name='id1').apply,
+                                  Tanh(name='tanh1').apply],
+                                 name='sequence1'),
+                        Sequence([Logistic(name='logistic1').apply,
+                                  Identity(name='id2').apply,
+                                  Tanh(name='tanh2').apply],
+                                 name='sequence2'),
+                        Logistic(name='logistic2'),
+                        Sequence([Sequence([Logistic(name='logistic3').apply],
+                                           name='sequence4').apply],
+                                 name='sequence3')],
+                       [10, 5, 9, 5, 9])
+
+    def test_find_zeroth_level(self):
+        found = find_bricks([self.mlp], lambda x: isinstance(x, MLP))
+        assert len(found) == 1
+        assert found[0] == self.mlp
+
+    def test_find_zeroth_level_repeated(self):
+        found = find_bricks([self.mlp, self.mlp], lambda x: isinstance(x, MLP))
+        assert len(found) == 1
+        assert found[0] == self.mlp
+
+    def test_find_all_unique(self):
+        found = find_bricks([self.mlp, self.mlp] + list(self.mlp.children),
+                            lambda _: True)
+        assert len(found) == 16  # 12 activations plus 4 linear transformations
+
+    def test_find_none(self):
+        found = find_bricks([self.mlp], lambda _: False)
+        assert len(found) == 0
+
+    def test_find_first_level(self):
+        found = set(find_bricks([self.mlp], lambda x: isinstance(x, Sequence)))
+        assert len(found) == 5
+        assert self.mlp in found
+        found.remove(self.mlp)
+        sequences = set(self.mlp.activations[0:2] +
+                        [self.mlp.activations[3],
+                         self.mlp.activations[3].children[0]])
+        assert sequences == found
+
+    def test_find_second_and_third_level(self):
+        found = set(find_bricks([self.mlp], lambda x: isinstance(x, Identity)))
+        assert len(found) == 2
+        assert self.mlp.activations[0].children[0] in found
+        assert self.mlp.activations[1].children[1] in found
+
+    def test_find_first_and_second_and_third_level(self):
+        found = set(find_bricks([self.mlp], lambda x: isinstance(x, Logistic)))
+        assert self.mlp.activations[2] in found
+        assert self.mlp.activations[1].children[0] in found
+        assert self.mlp.activations[3].children[0].children[0] in found
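Reviewer note, not part of the patch: the tests in tests/graph/test_bn.py also document how the new graph-level batch normalization API is meant to be used. The sketch below is assembled only from calls the tests themselves make (batch_normalization, ComputationGraph, get_batch_normalization_updates); the variable names, the layer sizes, and the remark about wiring the extra updates into a training algorithm are illustrative assumptions rather than anything introduced by this changeset.

# Illustrative sketch only -- assumes the API exercised by tests/graph/test_bn.py.
from theano import tensor

from blocks.bricks import Tanh, BatchNormalizedMLP
from blocks.graph import (ComputationGraph, batch_normalization,
                          get_batch_normalization_updates)

x = tensor.matrix('features')
mlp = BatchNormalizedMLP([Tanh(), Tanh()], [100, 50, 10])

# Training-mode graph: inside the context manager, the population
# statistics of every BatchNormalization brick are replaced by
# minibatch estimates.
with batch_normalization(mlp):
    y_train = mlp.apply(x)
train_graph = ComputationGraph([y_train])

# Pairs of (population-statistic shared variable, value computed from the
# training graph); these would typically be applied as additional updates
# alongside the gradient updates during training.
pop_updates = get_batch_normalization_updates(train_graph)

# Outside the context manager the same brick builds an inference-mode
# graph that reads the stored population statistics instead.
y_inference = mlp.apply(x)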