diff --git a/ctc_cost.py b/ctc_cost.py index 979ed93..b5d6160 100644 --- a/ctc_cost.py +++ b/ctc_cost.py @@ -1,7 +1,7 @@ """ CTC-Connectionist Temporal Classification -Code provided by Mohammad Pezeshki - May. 2015 - +Code provided by "Mohammad Pezeshki" and "Philemon Brakel"- May. 2015 - Montreal Institute for Learning Algorithms Referece: Graves, Alex, et al. "Connectionist temporal classification: @@ -13,194 +13,464 @@ This code is distributed without any warranty, express or implied. """ - +"""Connectionist Temporal Classification +y_hat : T x B x C+1 +y : L x B +y_hat_mask : T x B +y_mask : L x B +""" import theano +import numpy from theano import tensor +from theano import tensor as T + floatX = theano.config.floatX -# T: INPUT_SEQUENCE_LENGTH -# B: BATCH_SIZE -# L: OUTPUT_SEQUENCE_LENGTH -# C: NUM_CLASSES -class CTC(object): - """Connectionist Temporal Classification - y_hat : T x B x C+1 - y : L x B - y_hat_mask : T x B - y_mask : L x B - """ - @staticmethod - def add_blanks(y, blank_symbol, y_mask=None): - """Add blanks to a matrix and updates mask - - Input shape: L x B - Output shape: 2L+1 x B - - """ - # for y - y_extended = y.T.dimshuffle(0, 1, 'x') - blanks = tensor.zeros_like(y_extended) + blank_symbol - concat = tensor.concatenate([y_extended, blanks], axis=2) - res = concat.reshape((concat.shape[0], - concat.shape[1] * concat.shape[2])).T - begining_blanks = tensor.zeros((1, res.shape[1])) + blank_symbol - blanked_y = tensor.concatenate([begining_blanks, res], axis=0) - # for y_mask - if y_mask is not None: - y_mask_extended = y_mask.T.dimshuffle(0, 1, 'x') - concat = tensor.concatenate([y_mask_extended, - y_mask_extended], axis=2) - res = concat.reshape((concat.shape[0], - concat.shape[1] * concat.shape[2])).T - begining_blanks = tensor.ones((1, res.shape[1]), dtype=floatX) - blanked_y_mask = tensor.concatenate([begining_blanks, res], axis=0) - else: - blanked_y_mask = None - return blanked_y, blanked_y_mask - - @staticmethod - def class_batch_to_labeling_batch(y, y_hat, y_hat_mask=None): - y_hat = y_hat * y_hat_mask.dimshuffle(0, 'x', 1) - batch_size = y_hat.shape[2] - res = y_hat[:, y.astype('int32'), tensor.arange(batch_size)] - return res - - @staticmethod - def recurrence_relation(y, y_mask, blank_symbol): - n_y = y.shape[0] - blanks = tensor.zeros((2, y.shape[1])) + blank_symbol - ybb = tensor.concatenate((y, blanks), axis=0).T - sec_diag = (tensor.neq(ybb[:, :-2], ybb[:, 2:]) * - tensor.eq(ybb[:, 1:-1], blank_symbol) * - y_mask.T) - - # r1: LxL - # r2: LxL - # r3: LxLxB - r2 = tensor.eye(n_y, k=1) - r3 = (tensor.eye(n_y, k=2).dimshuffle(0, 1, 'x') * - sec_diag.dimshuffle(1, 'x', 0)) - - return r2, r3 - - @classmethod - def path_probabs(cls, y, y_hat, y_mask, y_hat_mask, blank_symbol): - pred_y = cls.class_batch_to_labeling_batch(y, y_hat, y_hat_mask) - - r2, r3 = cls.recurrence_relation(y, y_mask, blank_symbol) - - def step(p_curr, p_prev): - # instead of dot product, we * first - # and then sum oven one dimension. 
- # objective: T.dot((p_prev)BxL, LxLxB) - # solusion: Lx1xB * LxLxB --> LxLxB --> (sumover)xLxB - dotproduct = (p_prev + tensor.dot(p_prev, r2) + - (p_prev.dimshuffle(1, 'x', 0) * r3).sum(axis=0).T) - return p_curr.T * dotproduct * y_mask.T # B x L - - probabilities, _ = theano.scan( - step, - sequences=[pred_y], - outputs_info=[tensor.eye(y.shape[0])[0] * tensor.ones(y.T.shape)]) - return probabilities, probabilities.shape - - @classmethod - def cost(cls, y, y_hat, y_mask, y_hat_mask, blank_symbol): - y_hat_mask_len = tensor.sum(y_hat_mask, axis=0, dtype='int32') - y_mask_len = tensor.sum(y_mask, axis=0, dtype='int32') - probabilities, sth = cls.path_probabs(y, y_hat, - y_mask, y_hat_mask, - blank_symbol) - batch_size = probabilities.shape[1] - labels_probab = (probabilities[y_hat_mask_len - 1, - tensor.arange(batch_size), - y_mask_len - 1] + - probabilities[y_hat_mask_len - 1, - tensor.arange(batch_size), - y_mask_len - 2]) - avg_cost = tensor.mean(-tensor.log(labels_probab)) - return avg_cost, sth - - @staticmethod - def _epslog(x): - return tensor.cast(tensor.log(tensor.clip(x, 1E-12, 1E12)), - theano.config.floatX) - - @staticmethod - def log_add(a, b): - max_ = tensor.maximum(a, b) - return (max_ + tensor.log1p(tensor.exp(a + b - 2 * max_))) - - @staticmethod - def log_dot_matrix(x, z): - inf = 1E12 - log_dot = tensor.dot(x, z) - zeros_to_minus_inf = (z.max(axis=0) - 1) * inf - return log_dot + zeros_to_minus_inf - - @staticmethod - def log_dot_tensor(x, z): - inf = 1E12 - log_dot = (x.dimshuffle(1, 'x', 0) * z).sum(axis=0).T - zeros_to_minus_inf = (z.max(axis=0) - 1) * inf - return log_dot + zeros_to_minus_inf.T - - @classmethod - def log_path_probabs(cls, y, y_hat, y_mask, y_hat_mask, blank_symbol): - pred_y = cls.class_batch_to_labeling_batch(y, y_hat, y_hat_mask) - r2, r3 = cls.recurrence_relation(y, y_mask, blank_symbol) - - def step(log_p_curr, log_p_prev): - p1 = log_p_prev - p2 = cls.log_dot_matrix(p1, r2) - p3 = cls.log_dot_tensor(p1, r3) - p123 = cls.log_add(p3, cls.log_add(p1, p2)) - - return (log_p_curr.T + - p123 + - cls._epslog(y_mask.T)) - - log_probabilities, _ = theano.scan( - step, - sequences=[cls._epslog(pred_y)], - outputs_info=[cls._epslog(tensor.eye(y.shape[0])[0] * - tensor.ones(y.T.shape))]) - return log_probabilities - - @classmethod - def log_cost(cls, y, y_hat, y_mask, y_hat_mask, blank_symbol): - y_hat_mask_len = tensor.sum(y_hat_mask, axis=0, dtype='int32') - y_mask_len = tensor.sum(y_mask, axis=0, dtype='int32') - log_probabs = cls.log_path_probabs(y, y_hat, - y_mask, y_hat_mask, - blank_symbol) +def get_targets(y, log_y_hat, y_mask, y_hat_mask): + """ + Returns the target values according to the CTC cost with respect to y_hat. + Note that this is part of the gradient with respect to the softmax output + and not with respect to the input of the original softmax function. 
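# An illustrative NumPy check (toy values, separate from the patch) of the
# identity this note relies on: for fixed targets t summing to one,
# d/dz of -sum(t * log(softmax(z))) equals softmax(z) - t, so only the
# marginal targets computed here are needed to form the gradient w.r.t.
# the pre-softmax scores.
import numpy as np

def _softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()

z = np.array([0.2, -1.0, 0.5])       # hypothetical pre-softmax scores
t = np.array([0.1, 0.7, 0.2])        # hypothetical CTC marginal targets
analytic = _softmax(z) - t
numeric = np.zeros_like(z)
eps = 1e-6
for i in range(z.size):
    zp, zm = z.copy(), z.copy()
    zp[i] += eps
    zm[i] -= eps
    numeric[i] = (-np.sum(t * np.log(_softmax(zp))) +
                  np.sum(t * np.log(_softmax(zm)))) / (2 * eps)
assert np.allclose(analytic, numeric, atol=1e-5)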
+ All computations are done in log scale + """ + num_classes = log_y_hat.shape[2] - 1 + blanked_y, blanked_y_mask = _add_blanks( + y=y, + blank_symbol=num_classes, + y_mask=y_mask) + + log_alpha, log_beta = _log_forward_backward(blanked_y, + log_y_hat, blanked_y_mask, + y_hat_mask, num_classes) + # explicitly not using a mask to prevent inf - inf + y_prob = _class_batch_to_labeling_batch(blanked_y, log_y_hat, + y_hat_mask=None) + marginals = log_alpha + log_beta - y_prob + max_marg = marginals.max(2) + max_marg = T.switch(T.le(max_marg, -numpy.inf), 0, max_marg) + log_Z = T.log(T.exp(marginals - max_marg[:, :, None]).sum(2)) + log_Z = log_Z + max_marg + log_Z = T.switch(T.le(log_Z, -numpy.inf), 0, log_Z) + targets = _labeling_batch_to_class_batch(blanked_y, + T.exp(marginals - + log_Z[:, :, None]), + num_classes + 1) + return targets + + +def pseudo_cost(y, y_hat, y_mask, y_hat_mask, skip_softmax=False): + """ + Training objective. + Computes the marginal label probabilities and returns the + cross entropy between this distribution and y_hat, ignoring the + dependence of the two. + This cost should have the same gradient but it should be more + numerically stable. + + Here's how it works: + + Say delta_y is the gradient we want theano to return with respect to + the input y and let's assume both variables are vectors. By simply + computing dot(delta_y, y), we obtain a cost with gradient delta_y. + + Parameters + ---------- + y : matrix (L, B) + the target label sequences + y_hat : tensor3 (T, B, C) + class probabily distribution sequences, potentially in log domain + y_mask : matrix (L, B) + indicates which values of y to use + y_hat_mask : matrix (T, B) + indicates the lenghts of the sequences in y_hat + skip_softmax : bool + whether to interpret y_hat as probabilities or unnormalized energy + values. The latter might be more numerically stable and efficient + because it avoids the computation of the explicit cost and softmax + gradients. You probably want to use this. + """ + if skip_softmax: + y_hat_softmax = (T.exp(y_hat - y_hat.max(2)[:, :, None]) / + T.exp(y_hat - + y_hat.max(2)[:, :, None]).sum(2)[:, :, None]) + y_hat_safe = y_hat - y_hat.max(2)[:, :, None] + log_y_hat_softmax = (y_hat_safe - + T.log(T.exp(y_hat_safe).sum(2))[:, :, None]) + targets = get_targets(y, log_y_hat_softmax, y_mask, y_hat_mask) + else: + y_hat_softmax = y_hat + targets = get_targets(y, (T.log(y_hat) - + T.log(y_hat.sum(2)[:, :, None])), + y_mask, y_hat_mask) + + mask = y_hat_mask[:, :, None] + if skip_softmax: + y_hat_grad = y_hat_softmax - targets + return (y_hat * mask * + theano.gradient.disconnected_grad(y_hat_grad)).sum(0).sum(1) + return -T.sum(theano.gradient.disconnected_grad(targets) * + T.log(y_hat**mask), axis=0).sum(1) + + +def sequence_log_likelihood(y, y_hat, y_mask, y_hat_mask, blank_symbol, + log_scale=True): + """ + Based on code from Shawn Tan. + Credits to Kyle Kastner as well. + + This function computes the CTC log likelihood for a sequence that has + been augmented with blank labels. + + + """ + y_hat_mask_len = tensor.sum(y_hat_mask, axis=0, dtype='int32') + y_mask_len = tensor.sum(y_mask, axis=0, dtype='int32') + + if log_scale: + log_probabs = _log_path_probabs(y, T.log(y_hat), + y_mask, y_hat_mask, + blank_symbol) batch_size = log_probabs.shape[1] - labels_probab = cls.log_add( + + # Add the probabilities of the final time steps to get the total + # sequence likelihood. 
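# An illustrative NumPy sketch (toy values, separate from the patch) of this
# step for one sequence: the sequence likelihood adds the forward (alpha)
# probabilities of finishing on the trailing blank and on the last label,
# combined in log space.
import numpy as np

log_alpha_last_frame = np.log([0.02, 0.15, 0.30])  # made-up alphas over the blanked labels
a = log_alpha_last_frame[-1]                       # path ends on the trailing blank
b = log_alpha_last_frame[-2]                       # path ends on the last label
log_likelihood = max(a, b) + np.log1p(np.exp(min(a, b) - max(a, b)))
assert np.isclose(log_likelihood, np.log(0.30 + 0.15))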
+ log_labels_probab = _log_add( log_probabs[y_hat_mask_len - 1, tensor.arange(batch_size), y_mask_len - 1], log_probabs[y_hat_mask_len - 1, tensor.arange(batch_size), y_mask_len - 2]) - avg_cost = tensor.mean(-labels_probab) - return avg_cost - - @classmethod - def apply(cls, y, y_hat, y_mask, y_hat_mask, scale='log_scale'): - y_hat = y_hat.dimshuffle(0, 2, 1) - num_classes = y_hat.shape[1] - 1 - blanked_y, blanked_y_mask = cls.add_blanks( - y=y, - blank_symbol=num_classes.astype(floatX), - y_mask=y_mask) - if scale == 'log_scale': - final_cost = cls.log_cost(blanked_y, y_hat, - blanked_y_mask, y_hat_mask, - num_classes) - else: - final_cost, sth = cls.cost(blanked_y, y_hat, - blanked_y_mask, y_hat_mask, - num_classes) - return final_cost + else: + probabilities = _path_probabs(y, y_hat, + y_mask, y_hat_mask, + blank_symbol) + batch_size = probabilities.shape[1] + labels_probab = (probabilities[y_hat_mask_len - 1, + tensor.arange(batch_size), + y_mask_len - 1] + + probabilities[y_hat_mask_len - 1, + tensor.arange(batch_size), + y_mask_len - 2]) + log_labels_probab = tensor.log(labels_probab) + return log_labels_probab + + +def cost(y, y_hat, y_mask, y_hat_mask, log_scale=True): + """ + Training objective. + Computes the CTC cost using just the forward computations. + The difference between this function and the vanilla 'cost' function + is that this function adds blanks first. + + Note: don't try to compute the gradient of this version of the cost! + + ---- + Parameters + ---------- + y : matrix (L, B) + the target label sequences + y_hat : tensor3 (T, B, C) + class probabily distribution sequences + y_mask : matrix (L, B) + indicates which values of y to use + y_hat_mask : matrix (T, B) + indicates the lenghts of the sequences in y_hat + log_scale : bool + uses log domain computations if True + + """ + num_classes = y_hat.shape[2] - 1 + blanked_y, blanked_y_mask = _add_blanks( + y=y, + blank_symbol=num_classes, + y_mask=y_mask) + final_cost = -sequence_log_likelihood(blanked_y, y_hat, + blanked_y_mask, y_hat_mask, + num_classes, + log_scale=log_scale) + return final_cost + + +def _add_blanks(y, blank_symbol, y_mask=None): + """Add blanks to a matrix and updates mask + Input shape: L x B + Output shape: 2L+1 x B + """ + # for y + y_extended = y.T.dimshuffle(0, 1, 'x') + blanks = tensor.zeros_like(y_extended) + blank_symbol + concat = tensor.concatenate([y_extended, blanks], axis=2) + res = concat.reshape((concat.shape[0], + concat.shape[1] * concat.shape[2])).T + begining_blanks = tensor.zeros((1, res.shape[1])) + blank_symbol + blanked_y = tensor.concatenate([begining_blanks, res], axis=0) + # for y_mask + if y_mask is not None: + y_mask_extended = y_mask.T.dimshuffle(0, 1, 'x') + concat = tensor.concatenate([y_mask_extended, + y_mask_extended], axis=2) + res = concat.reshape((concat.shape[0], + concat.shape[1] * concat.shape[2])).T + begining_blanks = tensor.ones((1, res.shape[1]), dtype=floatX) + blanked_y_mask = tensor.concatenate([begining_blanks, res], axis=0) + else: + blanked_y_mask = None + return blanked_y.astype('int32'), blanked_y_mask + + +def _class_batch_to_labeling_batch(y, y_hat, y_hat_mask=None): + """ + Convert (T, B, C) tensor into (T, B, L) tensor. + + In other words, convert lattice of class probabilities into a lattice + of label probabilities costrained by the sequence y. 
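# A rough NumPy picture (toy shapes, separate from the patch) of the
# conversion described above: out[t, b, l] = y_hat[t, b, y[l, b]], i.e. the
# class lattice is indexed by the target sequence of each batch element.
import numpy as np

T_, B, C, L = 6, 2, 4, 3
y_hat = np.random.rand(T_, B, C)            # class probabilities per time step
y = np.random.randint(0, C, size=(L, B))    # target label sequences
out = y_hat[:, np.arange(B)[:, None], y.T]  # shape (T_, B, L)
assert out.shape == (T_, B, L)
assert out[2, 1, 0] == y_hat[2, 1, y[0, 1]]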
+ + Notes + ----- + T: number of time steps + B: batch size + L: length of label sequence + C: number of classes + Parameters + ---------- + y : matrix (L, B) + the target label sequences + y_hat : tensor3 (T, B, C) + class probabily distribution sequences + y_hat_mask : matrix (T, B) + indicates the lenghts of the sequences in y_hat + Returns + ------- + tensor3 (T, B, L): + A tensor that contains the probabilities per time step of the + labels that occur in the target sequence. + """ + if y_hat_mask is not None: + y_hat = y_hat * y_hat_mask[:, :, None] + batch_size = y_hat.shape[1] + y_hat = y_hat.dimshuffle(0, 2, 1) + res = y_hat[:, y.astype('int32'), T.arange(batch_size)] + return res.dimshuffle(0, 2, 1) + + +def _recurrence_relation(y, y_mask, blank_symbol): + """ + Construct a permutation matrix and tensor for computing CTC transitions. + + This matrix is represented as an actual matrix that contains the + permutations that are common to all transitions in the batch and a tensor + with permutations that are uniqe for each individual sequence. + + This 'matrix' is used to take the transition costraints into account + using just matrix algebra operations. + + Parameters + ---------- + y : matrix (L, B) + the target label sequences + y_mask : matrix (L, B) + indicates which values of y to use + blank_symbol: integer + indicates the symbol that signifies a blank label. + Returns + ------- + matrix (L, L) + tensor3 (L, L, B) + """ + n_y = y.shape[0] + blanks = tensor.zeros((2, y.shape[1])) + blank_symbol + ybb = tensor.concatenate((y, blanks), axis=0).T + sec_diag = (tensor.neq(ybb[:, :-2], ybb[:, 2:]) * + tensor.eq(ybb[:, 1:-1], blank_symbol) * + y_mask.T) + + # r1: LxL + # r2: LxL + # r3: LxLxB + eye2 = tensor.eye(n_y + 2) + r2 = eye2[2:, 1:-1] # tensor.eye(n_y, k=1) + r3 = (eye2[2:, :-2].dimshuffle(0, 1, 'x') * + sec_diag.dimshuffle(1, 'x', 0)) + + return r2, r3 + + +def _path_probabs(y, y_hat, y_mask, y_hat_mask, blank_symbol): + """Compute the probabilities of the paths that are compatible with the + sequence y. + + This function uses scan to get the forward probabilities (often denoted + with the symbol alpha in the literature). + + See _log_path_probabs for a version that works in log domain. + """ + + + + + pred_y = _class_batch_to_labeling_batch(y, y_hat, y_hat_mask) + pred_y = pred_y.dimshuffle(0, 2, 1) + n_labels = y.shape[0] + + r2, r3 = _recurrence_relation(y, y_mask, blank_symbol) + + def step(p_curr, p_prev): + # instead of dot product, we * first + # and then sum oven one dimension. 
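# For example (toy NumPy sketch, separate from the patch): multiplying with
# the per-sequence tensor r3 and summing over one axis is equivalent to a
# per-batch dot product of p_prev (B, L) with r3[:, :, b] (L, L).
import numpy as np

B, L = 2, 5
p_prev = np.random.rand(B, L)
r3 = np.random.rand(L, L, B)
batched = (p_prev.T[:, None, :] * r3).sum(axis=0).T              # (B, L)
looped = np.stack([p_prev[b].dot(r3[:, :, b]) for b in range(B)])
assert np.allclose(batched, looped)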
+ # objective: T.dot((p_prev)BxL, LxLxB) + # solusion: Lx1xB * LxLxB --> LxLxB --> (sumover)xLxB + dotproduct = (p_prev + tensor.dot(p_prev, r2) + + (p_prev.dimshuffle(1, 'x', 0) * r3).sum(axis=0).T) + return p_curr.T * dotproduct * y_mask.T # B x L + + probabilities, _ = theano.scan( + step, + sequences=[pred_y], + outputs_info=[tensor.eye(n_labels)[0] * tensor.ones(y.T.shape)]) + return probabilities + + +def _log_add(a, b): + # TODO: move functions like this to utils + max_ = tensor.maximum(a, b) + result = (max_ + tensor.log1p(tensor.exp(a + b - 2 * max_))) + return T.switch(T.isnan(result), max_, result) + + +def _log_dot_matrix(x, z): + y = x[:, :, None] + z[None, :, :] + y_max = y.max(axis=1) + out = T.log(T.sum(T.exp(y - y_max[:, None, :]), axis=1)) + y_max + return T.switch(T.isnan(out), -numpy.inf, out) + + +def _log_dot_tensor(x, z): + log_dot = x.dimshuffle(1, 'x', 0) + z + max_ = log_dot.max(axis=0) + out = (T.log(T.sum(T.exp(log_dot - max_[None, :, :]), axis=0)) + max_) + out = out.T + return T.switch(T.isnan(out), -numpy.inf, out) + + +def _log_path_probabs(y, log_y_hat, y_mask, y_hat_mask, blank_symbol, + reverse=False): + """ + Uses dynamic programming to compute the path probabilities. + + This function uses scan to get the forward probabilities (often denoted + with the symbol alpha in the literature). + + This function computes the probabilities in log domain and can be used + both the forward and backward passes of the CTC algorithm. + + Notes + ----- + T: number of time steps + B: batch size + L: length of label sequence + C: number of classes + Parameters + ---------- + y : matrix (L, B) + the target label sequences + log_y_hat : tensor3 (T, B, C) + log class probabily distribution sequences + y_mask : matrix (L, B) + indicates which values of y to use + y_hat_mask : matrix (T, B) + indicates the lenghts of the sequences in log_y_hat + blank_symbol: integer + indicates the symbol that signifies a blank label. + Returns + ------- + tensor3 (T, B, L): + the log forward probabilities for each label at every time step. + masked values should be -inf + """ + + n_labels, batch_size = y.shape + + if reverse: + y = y[::-1] + log_y_hat = log_y_hat[::-1] + y_hat_mask = y_hat_mask[::-1] + y_mask = y_mask[::-1] + # going backwards, the first non-zero alpha value should be the + # first non-masked label. + start_positions = T.cast(n_labels - y_mask.sum(0), 'int64') + else: + start_positions = T.zeros((batch_size,), dtype='int64') + + log_pred_y = _class_batch_to_labeling_batch(y, log_y_hat, y_hat_mask) + log_pred_y = log_pred_y.dimshuffle(0, 2, 1) + r2, r3 = _recurrence_relation(y, y_mask, blank_symbol) + r2, r3 = T.log(r2), T.log(r3) + + def step(log_p_curr, y_hat_mask_t, log_p_prev): + # applies the transitions matrices to take the sequence constraints of y + # into account. 
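# An illustrative NumPy aside (toy values, separate from the patch) on why
# the combination below stays in log space with the max-shifted form: the
# naive log(exp(a) + exp(b)) underflows for very negative inputs, while the
# shifted version does not.
import numpy as np

a, b = -1000.0, -1001.0
with np.errstate(divide='ignore'):
    naive = np.log(np.exp(a) + np.exp(b))                        # -> -inf
stable = max(a, b) + np.log1p(np.exp(min(a, b) - max(a, b)))
assert np.isinf(naive)
assert np.isclose(stable, -1000.0 + np.log1p(np.exp(-1.0)))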
+ p1 = log_p_prev + p2 = _log_dot_matrix(p1, r2) + p3 = _log_dot_tensor(p1, r3) + p12 = _log_add(p1, p2) + p123 = _log_add(p3, p12) + + y_hat_mask_t = y_hat_mask_t[:, None] + out = log_p_curr.T + p123 + T.log(y_mask.T) + return _log_add(T.log(y_hat_mask_t) + out, + T.log(1 - y_hat_mask_t) + log_p_prev) + + log_probabilities, _ = theano.scan( + step, + sequences=[log_pred_y, y_hat_mask], + outputs_info=[T.log(tensor.eye(n_labels)[start_positions])]) + + return log_probabilities + T.log(y_hat_mask[:, :, None]) + + +def _log_forward_backward(y, log_y_hat, y_mask, y_hat_mask, blank_symbol): + """Simply calls _log_path_probabs in both directions.""" + + log_probabs_forward = _log_path_probabs(y, + log_y_hat, + y_mask, + y_hat_mask, + blank_symbol) + log_probabs_backward = _log_path_probabs(y, + log_y_hat, + y_mask, + y_hat_mask, + blank_symbol, + reverse=True) + return log_probabs_forward, log_probabs_backward[::-1][:, :, ::-1] + + +def _labeling_batch_to_class_batch(y, y_labeling, num_classes): + """Coverts a sequence label lattice into a lattice of scores/probabilities + for each class per input time step. + """ + + batch_size = y.shape[1] + N = y_labeling.shape[0] + n_labels = y.shape[0] + # sum over all repeated labels + # from (T, B, L) to (T, C, B) + out = T.zeros((num_classes, batch_size, N)) + y_labeling = y_labeling.dimshuffle((2, 1, 0)) # L, B, T + y_ = y + + def scan_step(index, prev_res, y_labeling, y_): + res_t = T.inc_subtensor(prev_res[y_[index, T.arange(batch_size)], + T.arange(batch_size)], + y_labeling[index, T.arange(batch_size)]) + return res_t + + result, updates = theano.scan(scan_step, + sequences=[T.arange(n_labels)], + non_sequences=[y_labeling, y_], + outputs_info=[out]) + # result will be (C, B, T) so we make it (T, B, C) + return result[-1].dimshuffle(2, 1, 0) diff --git a/test_ctc.py b/test_ctc.py index ae60cd8..b0c556a 100644 --- a/test_ctc.py +++ b/test_ctc.py @@ -1,122 +1,446 @@ +import numpy as np import theano -import numpy -from theano import tensor -from blocks.model import Model -from blocks.bricks import Linear, Tanh -from ctc_cost import CTC -from blocks.initialization import IsotropicGaussian, Constant -from fuel.datasets import IterableDataset -from fuel.streams import DataStream -from blocks.algorithms import (GradientDescent, Scale, - StepClipping, CompositeRule) -from blocks.extensions.monitoring import TrainingDataMonitoring -from blocks.main_loop import MainLoop -from blocks.extensions import FinishAfter, Printing -from blocks.bricks.recurrent import SimpleRecurrent -from blocks.graph import ComputationGraph -import cPickle as pickle +import ctc_cost +import theano.tensor as T +from numpy import testing +from itertools import izip, islice + floatX = theano.config.floatX -@theano.compile.ops.as_op(itypes=[tensor.lvector], - otypes=[tensor.lvector]) -def print_pred(y_hat): - blank_symbol = 4 - res = [] - for i, s in enumerate(y_hat): - if (s != blank_symbol) and (i == 0 or s != y_hat[i - 1]): - res += [s] - return numpy.asarray(res) - -n_epochs = 200 -x_dim = 4 -h_dim = 9 -num_classes = 4 - -with open("ctc_test_data.pkl", "rb") as pkl_file: - data = pickle.load(pkl_file) - inputs = data['inputs'] - labels = data['labels'] - # from S x T x B x D to S x T x B - inputs_mask = numpy.max(data['mask_inputs'], axis=-1) - labels_mask = data['mask_labels'] - -print 'Building model ...' 
-# T x B x F -x = tensor.tensor3('x', dtype=floatX) -# T x B -x_mask = tensor.matrix('x_mask', dtype=floatX) -# L x B -y = tensor.matrix('y', dtype=floatX) -# L x B -y_mask = tensor.matrix('y_mask', dtype=floatX) - -x_to_h = Linear(name='x_to_h', - input_dim=x_dim, - output_dim=h_dim) -x_transform = x_to_h.apply(x) -rnn = SimpleRecurrent(activation=Tanh(), - dim=h_dim, name="rnn") -h = rnn.apply(x_transform) -h_to_o = Linear(name='h_to_o', - input_dim=h_dim, - output_dim=num_classes + 1) -h_transform = h_to_o.apply(h) -# T x B x C+1 -y_hat = tensor.nnet.softmax( - h_transform.reshape((-1, num_classes + 1)) -).reshape((h.shape[0], h.shape[1], -1)) -y_hat.name = 'y_hat' - -y_hat_mask = x_mask -cost = CTC().apply(y, y_hat, y_mask, y_hat_mask, 'normal_scale') -cost.name = 'CTC' -# Initialization -for brick in (rnn, x_to_h, h_to_o): - brick.weights_init = IsotropicGaussian(0.01) - brick.biases_init = Constant(0) - brick.initialize() - -print 'Bulding DataStream ...' -dataset = IterableDataset({'x': inputs, - 'x_mask': inputs_mask, - 'y': labels, - 'y_mask': labels_mask}) -stream = DataStream(dataset) - -print 'Bulding training process...' -algorithm = GradientDescent(cost=cost, - params=ComputationGraph(cost).parameters, - step_rule=CompositeRule([StepClipping(10.0), - Scale(0.02)])) -monitor_cost = TrainingDataMonitoring([cost], - prefix="train", - after_epoch=True) - -# sample number to monitor -sample = 8 - -y_hat_max_path = print_pred(tensor.argmax(y_hat[:, sample, :], axis=1)) -y_hat_max_path.name = 'Viterbi' -monitor_output = TrainingDataMonitoring([y_hat_max_path], - prefix="y_hat", - every_n_epochs=1) - -length = tensor.sum(y_mask[:, sample]).astype('int32') -tar = y[:length, sample].astype('int32') -tar.name = '_Target_Seq' -monitor_target = TrainingDataMonitoring([tar], - prefix="y", - every_n_epochs=1) - -model = Model(cost) -main_loop = MainLoop(data_stream=stream, algorithm=algorithm, - extensions=[monitor_cost, monitor_output, - monitor_target, - FinishAfter(after_n_epochs=n_epochs), - Printing()], - model=model) - -print 'Starting training ...' 
-main_loop.run() +def test_log_add(): + x = T.scalar() + y = T.scalar() + z = ctc_cost._log_add(x, y) + X = -3.0 + Y = -np.inf + value = z.eval({x: X, y: Y}) + assert value == -3.0 + + +def test_log_dot_matrix(): + x = T.matrix() + y = T.matrix() + z = ctc_cost._log_dot_matrix(y, x) + X = np.asarray(np.random.normal(0, 1, (5, 4)), dtype=floatX) + Y = np.asarray(np.random.normal(0, 1, (3, 5)), dtype=floatX) + #Y = np.ones((3, 5), dtype=floatX) * 3 + value = z.eval({x: X, y: Y}) + np_value = np.log(np.dot(np.exp(Y), np.exp(X))) + assert np.mean((value - np_value)**2) < 1e5 + + +def test_log_dot_matrix_zeros(): + x = T.matrix() + y = T.matrix() + z = ctc_cost._log_dot_matrix(y, x) + X = np.log(np.asarray(np.eye(5), dtype=floatX)) + Y = np.asarray(np.random.normal(0, 1, (3, 5)), dtype=floatX) + #Y = np.ones((3, 5), dtype=floatX) * 3 + value = z.eval({x: X, y: Y}) + np_value = np.log(np.dot(np.exp(Y), np.exp(X))) + assert np.mean((value - np_value)**2) < 1e5 + + +def test_ctc_add_blanks(): + BATCHES = 3 + N_LABELS = 3 + y = T.lmatrix('phonemes') + y_mask = T.matrix('phonemes_mask') + blanked_y, blanked_y_mask = ctc_cost._add_blanks( + y=y, + blank_symbol=1, + y_mask=y_mask) + Y = np.zeros((N_LABELS, BATCHES), dtype='int64') + Y_mask = np.asarray(np.ones_like(Y), dtype=floatX) + Y_mask[-1, 0] = 0 + Blanked_y_mask = blanked_y_mask.eval({y_mask: Y_mask}) + Blanked_y = blanked_y.eval({y: Y}) + assert (Blanked_y == np.array([[1, 1, 1], + [0, 0, 0], + [1, 1, 1], + [0, 0, 0], + [1, 1, 1], + [0, 0, 0], + [1, 1, 1]], dtype='int32')).all() + assert (Blanked_y_mask == np.array([[1., 1., 1.], + [1., 1., 1.], + [1., 1., 1.], + [1., 1., 1.], + [1., 1., 1.], + [0., 1., 1.], + [0., 1., 1.]], dtype=floatX)).all() + + +def test_ctc_symmetry_logscale(): + LENGTH = 5000 + BATCHES = 3 + CLASSES = 4 + y_hat = T.tensor3('features') + input_mask = T.matrix('features_mask') + y_hat_mask = input_mask + y = T.lmatrix('phonemes') + y_mask = T.matrix('phonemes_mask') + ctc_cost_t = ctc_cost.cost(y, y_hat, y_mask, y_hat_mask) + + Y_hat = np.zeros((LENGTH, BATCHES, CLASSES), dtype=floatX) + Y_hat[:, :, 0] = .3 + Y_hat[:, :, 1] = .2 + Y_hat[:, :, 2] = .4 + Y_hat[:, :, 3] = .1 + Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX) + # default blank symbol is the highest class index (3 in this case) + Y = np.repeat(np.array([0, 1, 2, 1, 2, 0, 2, 2, 2]), + BATCHES).reshape((9, BATCHES)) + # the masks for this test should be all ones. 
+ Y_mask = np.asarray(np.ones_like(Y), dtype=floatX) + forward_cost = ctc_cost_t.eval({y_hat: Y_hat, y: Y, + y_hat_mask: Y_hat_mask, y_mask: Y_mask}) + backward_cost = ctc_cost_t.eval({y_hat: Y_hat, y: Y[::-1], + y_hat_mask: Y_hat_mask, y_mask: Y_mask}) + testing.assert_almost_equal(forward_cost[0], backward_cost[0]) + assert not np.isnan(forward_cost[0]) + assert not np.isnan(backward_cost[0]) + assert not np.isinf(np.abs(forward_cost[0])) + assert not np.isinf(np.abs(backward_cost[0])) + + +def test_ctc_symmetry(): + LENGTH = 20 + BATCHES = 3 + CLASSES = 4 + y_hat = T.tensor3('features') + input_mask = T.matrix('features_mask') + y_hat_mask = input_mask + y = T.lmatrix('phonemes') + y_mask = T.matrix('phonemes_mask') + ctc_cost_t = ctc_cost.cost(y, y_hat, y_mask, y_hat_mask, log_scale=False) + + Y_hat = np.zeros((LENGTH, BATCHES, CLASSES), dtype=floatX) + Y_hat[:, :, 0] = .3 + Y_hat[:, :, 1] = .2 + Y_hat[:, :, 2] = .4 + Y_hat[:, :, 3] = .1 + Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX) + # default blank symbol is the highest class index (3 in this case) + Y = np.repeat(np.array([0, 1, 2, 1, 2, 0, 2, 2, 2]), + BATCHES).reshape((9, BATCHES)) + # the masks for this test should be all ones. + Y_mask = np.asarray(np.ones_like(Y), dtype=floatX) + forward_cost = ctc_cost_t.eval({y_hat: Y_hat, y: Y, + y_hat_mask: Y_hat_mask, y_mask: Y_mask}) + backward_cost = ctc_cost_t.eval({y_hat: Y_hat, y: Y[::-1], + y_hat_mask: Y_hat_mask, y_mask: Y_mask}) + testing.assert_almost_equal(forward_cost[0], backward_cost[0]) + assert not np.isnan(forward_cost[0]) + assert not np.isnan(backward_cost[0]) + assert not np.isinf(np.abs(forward_cost[0])) + assert not np.isinf(np.abs(backward_cost[0])) + + +def test_ctc_exact_log_scale(): + LENGTH = 4 + BATCHES = 1 + CLASSES = 2 + y_hat = T.tensor3('features') + input_mask = T.matrix('features_mask') + y_hat_mask = input_mask + y = T.lmatrix('phonemes') + y_mask = T.matrix('phonemes_mask') + ctc_cost_t = ctc_cost.cost(y, y_hat, y_mask, y_hat_mask, log_scale=True) + + Y_hat = np.zeros((LENGTH, BATCHES, CLASSES), dtype=floatX) + Y_hat[:, :, 0] = .7 + Y_hat[:, :, 1] = .3 + Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX) + # default blank symbol is the highest class index (3 in this case) + Y = np.zeros((2, 1), dtype='int64') + # -0-0 + # 0-0- + # 0--0 + # 0-00 + # 00-0 + answer = np.log(3 * (.3 * .7)**2 + 2 * .3 * .7**3) + Y_mask = np.asarray(np.ones_like(Y), dtype=floatX) + forward_cost = ctc_cost_t.eval({y_hat: Y_hat, y: Y, + y_hat_mask: Y_hat_mask, y_mask: Y_mask}) + backward_cost = ctc_cost_t.eval({y_hat: Y_hat, y: Y[::-1], + y_hat_mask: Y_hat_mask, y_mask: Y_mask}) + assert not np.isnan(forward_cost[0]) + assert not np.isnan(backward_cost[0]) + assert not np.isinf(np.abs(forward_cost[0])) + assert not np.isinf(np.abs(backward_cost[0])) + testing.assert_almost_equal(-forward_cost[0], answer) + testing.assert_almost_equal(-backward_cost[0], answer) + + +def test_ctc_exact(): + LENGTH = 4 + BATCHES = 1 + CLASSES = 2 + y_hat = T.tensor3('features') + input_mask = T.matrix('features_mask') + y_hat_mask = input_mask + y = T.lmatrix('phonemes') + y_mask = T.matrix('phonemes_mask') + ctc_cost_t = ctc_cost.cost(y, y_hat, y_mask, y_hat_mask, log_scale=False) + + Y_hat = np.zeros((LENGTH, BATCHES, CLASSES), dtype=floatX) + Y_hat[:, :, 0] = .7 + Y_hat[:, :, 1] = .3 + Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX) + # default blank symbol is the highest class index (3 in this case) + Y = np.zeros((2, 1), dtype='int64') + # -0-0 + # 0-0- + # 0--0 + # 0-00 + # 
00-0 + answer = np.log(3 * (.3 * .7)**2 + 2 * .3 * .7**3) + Y_mask = np.asarray(np.ones_like(Y), dtype=floatX) + forward_cost = ctc_cost_t.eval({y_hat: Y_hat, y: Y, + y_hat_mask: Y_hat_mask, y_mask: Y_mask}) + backward_cost = ctc_cost_t.eval({y_hat: Y_hat, y: Y[::-1], + y_hat_mask: Y_hat_mask, y_mask: Y_mask}) + assert not np.isnan(forward_cost[0]) + assert not np.isnan(backward_cost[0]) + assert not np.isinf(np.abs(forward_cost[0])) + assert not np.isinf(np.abs(backward_cost[0])) + testing.assert_almost_equal(-forward_cost[0], answer) + testing.assert_almost_equal(-backward_cost[0], answer) + + +def test_ctc_log_path_probabs(): + LENGTH = 10 + BATCHES = 3 + CLASSES = 2 + N_LABELS = 3 + y_hat = T.tensor3('features') + input_mask = T.matrix('features_mask') + y_hat_mask = input_mask + y = T.lmatrix('phonemes') + y_mask = T.matrix('phonemes_mask') + blanked_y, blanked_y_mask = ctc_cost._add_blanks( + y=y, + blank_symbol=1, + y_mask=y_mask) + p = ctc_cost._log_path_probabs(blanked_y, y_hat, blanked_y_mask, y_hat_mask, 1) + Y_hat = np.zeros((LENGTH, BATCHES, CLASSES + 1), dtype=floatX) + Y_hat[:, :, 0] = .7 + Y_hat[:, :, 1] = .2 + Y_hat[:, :, 2] = .1 + Y = np.zeros((N_LABELS, BATCHES), dtype='int64') + Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX) + Y_hat_mask[-2:, 0] = 0 + Y_mask = np.asarray(np.ones_like(Y), dtype=floatX) + forward_probs = p.eval({y_hat: Y_hat, y: Y, + y_hat_mask: Y_hat_mask, y_mask: Y_mask}) + assert forward_probs[-2, 0, 0] == -np.inf + Y_mask[-1] = 0 + forward_probs_y_mask = p.eval({y_hat: Y_hat, y: Y, + y_hat_mask: Y_hat_mask, y_mask: Y_mask}) + assert forward_probs_y_mask[-1, 1, -2] == -np.inf + assert not np.isnan(forward_probs).any() + + +def test_ctc_log_forward_backward(): + LENGTH = 8 + BATCHES = 4 + CLASSES = 2 + N_LABELS = 3 + y_hat = T.tensor3('features') + input_mask = T.matrix('features_mask') + y_hat_mask = input_mask + y = T.lmatrix('phonemes') + y_mask = T.matrix('phonemes_mask') + blanked_y, blanked_y_mask = ctc_cost._add_blanks( + y=y, + blank_symbol=1, + y_mask=y_mask) + f, b = ctc_cost._log_forward_backward(blanked_y, y_hat, + blanked_y_mask, y_hat_mask, CLASSES) + Y_hat = np.zeros((LENGTH, BATCHES, CLASSES + 1), dtype=floatX) + Y_hat[:, :, 0] = .7 + Y_hat[:, :, 1] = .2 + Y_hat[:, :, 2] = .1 + Y_hat[3, :, 0] = .3 + Y_hat[3, :, 1] = .4 + Y_hat[3, :, 2] = .3 + Y = np.zeros((N_LABELS, BATCHES), dtype='int64') + Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX) + Y_hat_mask[-2:] = 0 + Y_mask = np.asarray(np.ones_like(Y), dtype=floatX) + Y_mask[-2:, 0] = 0 + y_prob = ctc_cost._class_batch_to_labeling_batch(blanked_y, + y_hat, + y_hat_mask) + forward_probs = f.eval({y_hat: Y_hat, y: Y, + y_hat_mask: Y_hat_mask, y_mask: Y_mask}) + backward_probs = b.eval({y_hat: Y_hat, y: Y, + y_hat_mask: Y_hat_mask, y_mask: Y_mask}) + y_probs = y_prob.eval({y_hat: Y_hat, y: Y, y_hat_mask: Y_hat_mask}) + assert not ((forward_probs + backward_probs)[:, 0, :] == -np.inf).all() + marg = forward_probs + backward_probs - np.log(y_probs) + forward_probs = np.exp(forward_probs) + backward_probs = np.exp(backward_probs) + L = (forward_probs * backward_probs[::-1][:, :, ::-1] / y_probs).sum(2) + assert not np.isnan(forward_probs).any() + + +def finite_diff(Y, Y_hat, Y_mask, Y_hat_mask, eps=1e-2, n_steps=None): + y_hat = T.tensor3('features') + y_hat_mask = T.matrix('features_mask') + y = T.lmatrix('phonemes') + y_mask = T.matrix('phonemes_mask') + ctc_cost_t = ctc_cost.cost(y, y_hat, y_mask, y_hat_mask) + get_cost = theano.function([y, y_hat, y_mask, y_hat_mask], + 
ctc_cost_t.sum()) + diff_grad = np.zeros_like(Y_hat) + + for grad, val in islice(izip(np.nditer(diff_grad, op_flags=['readwrite']), + np.nditer(Y_hat, op_flags=['readwrite'])), + 0, n_steps): + val += eps + error_inc = get_cost(Y, Y_hat, Y_mask, Y_hat_mask) + val -= 2.0 * eps + error_dec = get_cost(Y, Y_hat, Y_mask, Y_hat_mask) + grad[...] = .5 * (error_inc - error_dec) / eps + val += eps + + return diff_grad + + +def test_ctc_class_batch_to_labeling_batch(): + LENGTH = 20 + BATCHES = 4 + CLASSES = 2 + LABELS = 2 + y_hat = T.tensor3() + y_hat_mask = T.matrix('features_mask') + y = T.lmatrix('phonemes') + y_labeling = ctc_cost._class_batch_to_labeling_batch(y, y_hat, y_hat_mask) + Y_hat = np.zeros((LENGTH, BATCHES, CLASSES + 1), dtype=floatX) + Y = np.zeros((2, BATCHES), dtype='int64') + Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX) + Y_hat_mask[-5:] = 0 + Y_labeling = y_labeling.eval({y_hat: Y_hat, y: Y, y_hat_mask: Y_hat_mask}) + assert Y_labeling.shape == (LENGTH, BATCHES, LABELS) + + +def test_ctc_labeling_batch_to_class_batch(): + LENGTH = 20 + BATCHES = 4 + CLASSES = 2 + LABELS = 2 + y_labeling = T.tensor3() + y = T.lmatrix('phonemes') + y_hat = ctc_cost._labeling_batch_to_class_batch(y, y_labeling, CLASSES + 1) + Y_labeling = np.zeros((LENGTH, BATCHES, LABELS), dtype=floatX) + Y = np.zeros((2, BATCHES), dtype='int64') + Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX) + Y_hat_mask[-5:] = 0 + Y_hat = y_hat.eval({y_labeling: Y_labeling, y: Y}) + assert Y_hat.shape == (LENGTH, BATCHES, CLASSES + 1) + + +def test_ctc_targets(): + LENGTH = 20 + BATCHES = 4 + CLASSES = 2 + y_hat = T.tensor3('features') + input_mask = T.matrix('features_mask') + y_hat_mask = input_mask + y = T.lmatrix('phonemes') + y_mask = T.matrix('phonemes_mask') + ctc_target = ctc_cost.get_targets(y, T.log(y_hat), y_mask, y_hat_mask) + Y_hat = np.zeros((LENGTH, BATCHES, CLASSES + 1), dtype=floatX) + Y_hat[:, :, 0] = .7 + Y_hat[:, :, 1] = .2 + Y_hat[:, :, 2] = .1 + Y_hat[3, :, 0] = .3 + Y_hat[3, :, 1] = .4 + Y_hat[3, :, 2] = .3 + Y = np.zeros((2, BATCHES), dtype='int64') + Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX) + Y_hat_mask[-5:] = 0 + # default blank symbol is the highest class index (3 in this case) + Y_mask = np.asarray(np.ones_like(Y), dtype=floatX) + target = ctc_target.eval({y_hat: Y_hat, y: Y, + y_hat_mask: Y_hat_mask, y_mask: Y_mask}) + # Note that this part is the same as the cross entropy gradient + grad = -target / Y_hat + test_grad = finite_diff(Y, Y_hat, Y_mask, Y_hat_mask, eps=1e-2, n_steps=5) + testing.assert_almost_equal(grad.flatten()[:5], + test_grad.flatten()[:5], decimal=3) + + +def test_ctc_pseudo_cost(): + LENGTH = 500 + BATCHES = 40 + CLASSES = 2 + N_LABELS = 45 + y_hat = T.tensor3('features') + input_mask = T.matrix('features_mask') + y_hat_mask = input_mask + y = T.lmatrix('phonemes') + y_mask = T.matrix('phonemes_mask') + pseudo_cost = ctc_cost.pseudo_cost(y, y_hat, y_mask, y_hat_mask) + + Y_hat = np.zeros((LENGTH, BATCHES, CLASSES + 1), dtype=floatX) + Y_hat[:, :, 0] = .75 + Y_hat[:, :, 1] = .2 + Y_hat[:, :, 2] = .05 + Y_hat[3, 0, 0] = .3 + Y_hat[3, 0, 1] = .4 + Y_hat[3, 0, 2] = .3 + Y = np.zeros((N_LABELS, BATCHES), dtype='int64') + Y[25:, :] = 1 + Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX) + Y_hat_mask[-5:] = 0 + # default blank symbol is the highest class index (3 in this case) + Y_mask = np.asarray(np.ones_like(Y), dtype=floatX) + Y_mask[30:] = 0 + cost = pseudo_cost.eval({y_hat: Y_hat, y: Y, + y_hat_mask: Y_hat_mask, y_mask: Y_mask}) + pseudo_grad = 
T.grad(ctc_cost.pseudo_cost(y, y_hat, + y_mask, y_hat_mask).sum(), + y_hat) + #test_grad2 = pseudo_grad.eval({y_hat: Y_hat, y: Y, + # y_hat_mask: Y_hat_mask, y_mask: Y_mask}) + # TODO: write some more meaningful asserts here + assert cost.sum() > 0 + + +def test_ctc_pseudo_cost_skip_softmax_stability(): + LENGTH = 500 + BATCHES = 40 + CLASSES = 2 + N_LABELS = 45 + y_hat = T.tensor3('features') + input_mask = T.matrix('features_mask') + y_hat_mask = input_mask + y = T.lmatrix('phonemes') + y_mask = T.matrix('phonemes_mask') + pseudo_cost = ctc_cost.pseudo_cost(y, y_hat, y_mask, y_hat_mask, + skip_softmax=True) + + Y_hat = np.asarray(np.random.normal(0, 1, (LENGTH, BATCHES, CLASSES + 1)), + dtype=floatX) + Y = np.zeros((N_LABELS, BATCHES), dtype='int64') + Y[25:, :] = 1 + Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX) + Y_hat_mask[-5:] = 0 + # default blank symbol is the highest class index (3 in this case) + Y_mask = np.asarray(np.ones_like(Y), dtype=floatX) + Y_mask[30:] = 0 + pseudo_grad = T.grad(pseudo_cost.sum(), y_hat) + test_grad = pseudo_grad.eval({y_hat: Y_hat, y: Y, + y_hat_mask: Y_hat_mask, y_mask: Y_mask}) + y_hat_softmax = T.exp(y_hat) / T.exp(y_hat).sum(2)[:, :, None] + pseudo_cost2 = ctc_cost.pseudo_cost(y, y_hat_softmax, y_mask, y_hat_mask, + skip_softmax=False) + pseudo_grad2 = T.grad(pseudo_cost2.sum(), y_hat) + test_grad2 = pseudo_grad2.eval({y_hat: Y_hat, y: Y, + y_hat_mask: Y_hat_mask, y_mask: Y_mask}) + testing.assert_almost_equal(test_grad, test_grad2, decimal=4)
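# A usage sketch (toy data, separate from the test suite) showing how
# pseudo_cost with skip_softmax=True, as recommended in its docstring, can be
# wired into a Theano graph together with its gradient.
import numpy as np
import theano
import theano.tensor as T
import ctc_cost

floatX = theano.config.floatX

y_hat = T.tensor3('y_hat')           # (T, B, C+1) unnormalized scores
y_hat_mask = T.matrix('y_hat_mask')  # (T, B)
y = T.lmatrix('y')                   # (L, B) target labels
y_mask = T.matrix('y_mask')          # (L, B)

per_example_cost = ctc_cost.pseudo_cost(y, y_hat, y_mask, y_hat_mask,
                                        skip_softmax=True)
grad = T.grad(per_example_cost.sum(), y_hat)
f = theano.function([y, y_hat, y_mask, y_hat_mask],
                    [per_example_cost.sum(), grad])

LENGTH, BATCHES, CLASSES, LABELS = 50, 2, 3, 5
scores = np.random.randn(LENGTH, BATCHES, CLASSES + 1).astype(floatX)
labels = np.random.randint(0, CLASSES, size=(LABELS, BATCHES)).astype('int64')
cost_value, grad_value = f(labels, scores,
                           np.ones((LABELS, BATCHES), dtype=floatX),
                           np.ones((LENGTH, BATCHES), dtype=floatX))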