diff --git a/ctc_cost.py b/ctc_cost.py index 979ed93..b5d6160 100644 --- a/ctc_cost.py +++ b/ctc_cost.py @@ -1,7 +1,7 @@ """ CTC-Connectionist Temporal Classification -Code provided by Mohammad Pezeshki - May. 2015 - +Code provided by "Mohammad Pezeshki" and "Philemon Brakel"- May. 2015 - Montreal Institute for Learning Algorithms Referece: Graves, Alex, et al. "Connectionist temporal classification: @@ -13,194 +13,464 @@ This code is distributed without any warranty, express or implied. """ - +"""Connectionist Temporal Classification +y_hat : T x B x C+1 +y : L x B +y_hat_mask : T x B +y_mask : L x B +""" import theano +import numpy from theano import tensor +from theano import tensor as T + floatX = theano.config.floatX -# T: INPUT_SEQUENCE_LENGTH -# B: BATCH_SIZE -# L: OUTPUT_SEQUENCE_LENGTH -# C: NUM_CLASSES -class CTC(object): - """Connectionist Temporal Classification - y_hat : T x B x C+1 - y : L x B - y_hat_mask : T x B - y_mask : L x B - """ - @staticmethod - def add_blanks(y, blank_symbol, y_mask=None): - """Add blanks to a matrix and updates mask - - Input shape: L x B - Output shape: 2L+1 x B - - """ - # for y - y_extended = y.T.dimshuffle(0, 1, 'x') - blanks = tensor.zeros_like(y_extended) + blank_symbol - concat = tensor.concatenate([y_extended, blanks], axis=2) - res = concat.reshape((concat.shape[0], - concat.shape[1] * concat.shape[2])).T - begining_blanks = tensor.zeros((1, res.shape[1])) + blank_symbol - blanked_y = tensor.concatenate([begining_blanks, res], axis=0) - # for y_mask - if y_mask is not None: - y_mask_extended = y_mask.T.dimshuffle(0, 1, 'x') - concat = tensor.concatenate([y_mask_extended, - y_mask_extended], axis=2) - res = concat.reshape((concat.shape[0], - concat.shape[1] * concat.shape[2])).T - begining_blanks = tensor.ones((1, res.shape[1]), dtype=floatX) - blanked_y_mask = tensor.concatenate([begining_blanks, res], axis=0) - else: - blanked_y_mask = None - return blanked_y, blanked_y_mask - - @staticmethod - def class_batch_to_labeling_batch(y, y_hat, y_hat_mask=None): - y_hat = y_hat * y_hat_mask.dimshuffle(0, 'x', 1) - batch_size = y_hat.shape[2] - res = y_hat[:, y.astype('int32'), tensor.arange(batch_size)] - return res - - @staticmethod - def recurrence_relation(y, y_mask, blank_symbol): - n_y = y.shape[0] - blanks = tensor.zeros((2, y.shape[1])) + blank_symbol - ybb = tensor.concatenate((y, blanks), axis=0).T - sec_diag = (tensor.neq(ybb[:, :-2], ybb[:, 2:]) * - tensor.eq(ybb[:, 1:-1], blank_symbol) * - y_mask.T) - - # r1: LxL - # r2: LxL - # r3: LxLxB - r2 = tensor.eye(n_y, k=1) - r3 = (tensor.eye(n_y, k=2).dimshuffle(0, 1, 'x') * - sec_diag.dimshuffle(1, 'x', 0)) - - return r2, r3 - - @classmethod - def path_probabs(cls, y, y_hat, y_mask, y_hat_mask, blank_symbol): - pred_y = cls.class_batch_to_labeling_batch(y, y_hat, y_hat_mask) - - r2, r3 = cls.recurrence_relation(y, y_mask, blank_symbol) - - def step(p_curr, p_prev): - # instead of dot product, we * first - # and then sum oven one dimension. 
- # objective: T.dot((p_prev)BxL, LxLxB) - # solusion: Lx1xB * LxLxB --> LxLxB --> (sumover)xLxB - dotproduct = (p_prev + tensor.dot(p_prev, r2) + - (p_prev.dimshuffle(1, 'x', 0) * r3).sum(axis=0).T) - return p_curr.T * dotproduct * y_mask.T # B x L - - probabilities, _ = theano.scan( - step, - sequences=[pred_y], - outputs_info=[tensor.eye(y.shape[0])[0] * tensor.ones(y.T.shape)]) - return probabilities, probabilities.shape - - @classmethod - def cost(cls, y, y_hat, y_mask, y_hat_mask, blank_symbol): - y_hat_mask_len = tensor.sum(y_hat_mask, axis=0, dtype='int32') - y_mask_len = tensor.sum(y_mask, axis=0, dtype='int32') - probabilities, sth = cls.path_probabs(y, y_hat, - y_mask, y_hat_mask, - blank_symbol) - batch_size = probabilities.shape[1] - labels_probab = (probabilities[y_hat_mask_len - 1, - tensor.arange(batch_size), - y_mask_len - 1] + - probabilities[y_hat_mask_len - 1, - tensor.arange(batch_size), - y_mask_len - 2]) - avg_cost = tensor.mean(-tensor.log(labels_probab)) - return avg_cost, sth - - @staticmethod - def _epslog(x): - return tensor.cast(tensor.log(tensor.clip(x, 1E-12, 1E12)), - theano.config.floatX) - - @staticmethod - def log_add(a, b): - max_ = tensor.maximum(a, b) - return (max_ + tensor.log1p(tensor.exp(a + b - 2 * max_))) - - @staticmethod - def log_dot_matrix(x, z): - inf = 1E12 - log_dot = tensor.dot(x, z) - zeros_to_minus_inf = (z.max(axis=0) - 1) * inf - return log_dot + zeros_to_minus_inf - - @staticmethod - def log_dot_tensor(x, z): - inf = 1E12 - log_dot = (x.dimshuffle(1, 'x', 0) * z).sum(axis=0).T - zeros_to_minus_inf = (z.max(axis=0) - 1) * inf - return log_dot + zeros_to_minus_inf.T - - @classmethod - def log_path_probabs(cls, y, y_hat, y_mask, y_hat_mask, blank_symbol): - pred_y = cls.class_batch_to_labeling_batch(y, y_hat, y_hat_mask) - r2, r3 = cls.recurrence_relation(y, y_mask, blank_symbol) - - def step(log_p_curr, log_p_prev): - p1 = log_p_prev - p2 = cls.log_dot_matrix(p1, r2) - p3 = cls.log_dot_tensor(p1, r3) - p123 = cls.log_add(p3, cls.log_add(p1, p2)) - - return (log_p_curr.T + - p123 + - cls._epslog(y_mask.T)) - - log_probabilities, _ = theano.scan( - step, - sequences=[cls._epslog(pred_y)], - outputs_info=[cls._epslog(tensor.eye(y.shape[0])[0] * - tensor.ones(y.T.shape))]) - return log_probabilities - - @classmethod - def log_cost(cls, y, y_hat, y_mask, y_hat_mask, blank_symbol): - y_hat_mask_len = tensor.sum(y_hat_mask, axis=0, dtype='int32') - y_mask_len = tensor.sum(y_mask, axis=0, dtype='int32') - log_probabs = cls.log_path_probabs(y, y_hat, - y_mask, y_hat_mask, - blank_symbol) +def get_targets(y, log_y_hat, y_mask, y_hat_mask): + """ + Returns the target values according to the CTC cost with respect to y_hat. + Note that this is part of the gradient with respect to the softmax output + and not with respect to the input of the original softmax function. 
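# An illustrative NumPy check (toy values, separate from the patch) of the
# identity this note relies on: for fixed targets t summing to one,
# d/dz of -sum(t * log(softmax(z))) equals softmax(z) - t, so only the
# marginal targets computed here are needed to form the gradient w.r.t.
# the pre-softmax scores.
import numpy as np

def _softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()

z = np.array([0.2, -1.0, 0.5])       # hypothetical pre-softmax scores
t = np.array([0.1, 0.7, 0.2])        # hypothetical CTC marginal targets
analytic = _softmax(z) - t
numeric = np.zeros_like(z)
eps = 1e-6
for i in range(z.size):
    zp, zm = z.copy(), z.copy()
    zp[i] += eps
    zm[i] -= eps
    numeric[i] = (-np.sum(t * np.log(_softmax(zp))) +
                  np.sum(t * np.log(_softmax(zm)))) / (2 * eps)
assert np.allclose(analytic, numeric, atol=1e-5)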
+ All computations are done in log scale + """ + num_classes = log_y_hat.shape[2] - 1 + blanked_y, blanked_y_mask = _add_blanks( + y=y, + blank_symbol=num_classes, + y_mask=y_mask) + + log_alpha, log_beta = _log_forward_backward(blanked_y, + log_y_hat, blanked_y_mask, + y_hat_mask, num_classes) + # explicitly not using a mask to prevent inf - inf + y_prob = _class_batch_to_labeling_batch(blanked_y, log_y_hat, + y_hat_mask=None) + marginals = log_alpha + log_beta - y_prob + max_marg = marginals.max(2) + max_marg = T.switch(T.le(max_marg, -numpy.inf), 0, max_marg) + log_Z = T.log(T.exp(marginals - max_marg[:, :, None]).sum(2)) + log_Z = log_Z + max_marg + log_Z = T.switch(T.le(log_Z, -numpy.inf), 0, log_Z) + targets = _labeling_batch_to_class_batch(blanked_y, + T.exp(marginals - + log_Z[:, :, None]), + num_classes + 1) + return targets + + +def pseudo_cost(y, y_hat, y_mask, y_hat_mask, skip_softmax=False): + """ + Training objective. + Computes the marginal label probabilities and returns the + cross entropy between this distribution and y_hat, ignoring the + dependence of the two. + This cost should have the same gradient but it should be more + numerically stable. + + Here's how it works: + + Say delta_y is the gradient we want theano to return with respect to + the input y and let's assume both variables are vectors. By simply + computing dot(delta_y, y), we obtain a cost with gradient delta_y. + + Parameters + ---------- + y : matrix (L, B) + the target label sequences + y_hat : tensor3 (T, B, C) + class probabily distribution sequences, potentially in log domain + y_mask : matrix (L, B) + indicates which values of y to use + y_hat_mask : matrix (T, B) + indicates the lenghts of the sequences in y_hat + skip_softmax : bool + whether to interpret y_hat as probabilities or unnormalized energy + values. The latter might be more numerically stable and efficient + because it avoids the computation of the explicit cost and softmax + gradients. You probably want to use this. + """ + if skip_softmax: + y_hat_softmax = (T.exp(y_hat - y_hat.max(2)[:, :, None]) / + T.exp(y_hat - + y_hat.max(2)[:, :, None]).sum(2)[:, :, None]) + y_hat_safe = y_hat - y_hat.max(2)[:, :, None] + log_y_hat_softmax = (y_hat_safe - + T.log(T.exp(y_hat_safe).sum(2))[:, :, None]) + targets = get_targets(y, log_y_hat_softmax, y_mask, y_hat_mask) + else: + y_hat_softmax = y_hat + targets = get_targets(y, (T.log(y_hat) - + T.log(y_hat.sum(2)[:, :, None])), + y_mask, y_hat_mask) + + mask = y_hat_mask[:, :, None] + if skip_softmax: + y_hat_grad = y_hat_softmax - targets + return (y_hat * mask * + theano.gradient.disconnected_grad(y_hat_grad)).sum(0).sum(1) + return -T.sum(theano.gradient.disconnected_grad(targets) * + T.log(y_hat**mask), axis=0).sum(1) + + +def sequence_log_likelihood(y, y_hat, y_mask, y_hat_mask, blank_symbol, + log_scale=True): + """ + Based on code from Shawn Tan. + Credits to Kyle Kastner as well. + + This function computes the CTC log likelihood for a sequence that has + been augmented with blank labels. + + + """ + y_hat_mask_len = tensor.sum(y_hat_mask, axis=0, dtype='int32') + y_mask_len = tensor.sum(y_mask, axis=0, dtype='int32') + + if log_scale: + log_probabs = _log_path_probabs(y, T.log(y_hat), + y_mask, y_hat_mask, + blank_symbol) batch_size = log_probabs.shape[1] - labels_probab = cls.log_add( + + # Add the probabilities of the final time steps to get the total + # sequence likelihood. 
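# An illustrative NumPy sketch (toy values, separate from the patch) of this
# step for one sequence: the sequence likelihood adds the forward (alpha)
# probabilities of finishing on the trailing blank and on the last label,
# combined in log space.
import numpy as np

log_alpha_last_frame = np.log([0.02, 0.15, 0.30])  # made-up alphas over the blanked labels
a = log_alpha_last_frame[-1]                       # path ends on the trailing blank
b = log_alpha_last_frame[-2]                       # path ends on the last label
log_likelihood = max(a, b) + np.log1p(np.exp(min(a, b) - max(a, b)))
assert np.isclose(log_likelihood, np.log(0.30 + 0.15))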
+ log_labels_probab = _log_add( log_probabs[y_hat_mask_len - 1, tensor.arange(batch_size), y_mask_len - 1], log_probabs[y_hat_mask_len - 1, tensor.arange(batch_size), y_mask_len - 2]) - avg_cost = tensor.mean(-labels_probab) - return avg_cost - - @classmethod - def apply(cls, y, y_hat, y_mask, y_hat_mask, scale='log_scale'): - y_hat = y_hat.dimshuffle(0, 2, 1) - num_classes = y_hat.shape[1] - 1 - blanked_y, blanked_y_mask = cls.add_blanks( - y=y, - blank_symbol=num_classes.astype(floatX), - y_mask=y_mask) - if scale == 'log_scale': - final_cost = cls.log_cost(blanked_y, y_hat, - blanked_y_mask, y_hat_mask, - num_classes) - else: - final_cost, sth = cls.cost(blanked_y, y_hat, - blanked_y_mask, y_hat_mask, - num_classes) - return final_cost + else: + probabilities = _path_probabs(y, y_hat, + y_mask, y_hat_mask, + blank_symbol) + batch_size = probabilities.shape[1] + labels_probab = (probabilities[y_hat_mask_len - 1, + tensor.arange(batch_size), + y_mask_len - 1] + + probabilities[y_hat_mask_len - 1, + tensor.arange(batch_size), + y_mask_len - 2]) + log_labels_probab = tensor.log(labels_probab) + return log_labels_probab + + +def cost(y, y_hat, y_mask, y_hat_mask, log_scale=True): + """ + Training objective. + Computes the CTC cost using just the forward computations. + The difference between this function and the vanilla 'cost' function + is that this function adds blanks first. + + Note: don't try to compute the gradient of this version of the cost! + + ---- + Parameters + ---------- + y : matrix (L, B) + the target label sequences + y_hat : tensor3 (T, B, C) + class probabily distribution sequences + y_mask : matrix (L, B) + indicates which values of y to use + y_hat_mask : matrix (T, B) + indicates the lenghts of the sequences in y_hat + log_scale : bool + uses log domain computations if True + + """ + num_classes = y_hat.shape[2] - 1 + blanked_y, blanked_y_mask = _add_blanks( + y=y, + blank_symbol=num_classes, + y_mask=y_mask) + final_cost = -sequence_log_likelihood(blanked_y, y_hat, + blanked_y_mask, y_hat_mask, + num_classes, + log_scale=log_scale) + return final_cost + + +def _add_blanks(y, blank_symbol, y_mask=None): + """Add blanks to a matrix and updates mask + Input shape: L x B + Output shape: 2L+1 x B + """ + # for y + y_extended = y.T.dimshuffle(0, 1, 'x') + blanks = tensor.zeros_like(y_extended) + blank_symbol + concat = tensor.concatenate([y_extended, blanks], axis=2) + res = concat.reshape((concat.shape[0], + concat.shape[1] * concat.shape[2])).T + begining_blanks = tensor.zeros((1, res.shape[1])) + blank_symbol + blanked_y = tensor.concatenate([begining_blanks, res], axis=0) + # for y_mask + if y_mask is not None: + y_mask_extended = y_mask.T.dimshuffle(0, 1, 'x') + concat = tensor.concatenate([y_mask_extended, + y_mask_extended], axis=2) + res = concat.reshape((concat.shape[0], + concat.shape[1] * concat.shape[2])).T + begining_blanks = tensor.ones((1, res.shape[1]), dtype=floatX) + blanked_y_mask = tensor.concatenate([begining_blanks, res], axis=0) + else: + blanked_y_mask = None + return blanked_y.astype('int32'), blanked_y_mask + + +def _class_batch_to_labeling_batch(y, y_hat, y_hat_mask=None): + """ + Convert (T, B, C) tensor into (T, B, L) tensor. + + In other words, convert lattice of class probabilities into a lattice + of label probabilities costrained by the sequence y. 
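# A rough NumPy picture (toy shapes, separate from the patch) of the
# conversion described above: out[t, b, l] = y_hat[t, b, y[l, b]], i.e. the
# class lattice is indexed by the target sequence of each batch element.
import numpy as np

T_, B, C, L = 6, 2, 4, 3
y_hat = np.random.rand(T_, B, C)            # class probabilities per time step
y = np.random.randint(0, C, size=(L, B))    # target label sequences
out = y_hat[:, np.arange(B)[:, None], y.T]  # shape (T_, B, L)
assert out.shape == (T_, B, L)
assert out[2, 1, 0] == y_hat[2, 1, y[0, 1]]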
+ + Notes + ----- + T: number of time steps + B: batch size + L: length of label sequence + C: number of classes + Parameters + ---------- + y : matrix (L, B) + the target label sequences + y_hat : tensor3 (T, B, C) + class probabily distribution sequences + y_hat_mask : matrix (T, B) + indicates the lenghts of the sequences in y_hat + Returns + ------- + tensor3 (T, B, L): + A tensor that contains the probabilities per time step of the + labels that occur in the target sequence. + """ + if y_hat_mask is not None: + y_hat = y_hat * y_hat_mask[:, :, None] + batch_size = y_hat.shape[1] + y_hat = y_hat.dimshuffle(0, 2, 1) + res = y_hat[:, y.astype('int32'), T.arange(batch_size)] + return res.dimshuffle(0, 2, 1) + + +def _recurrence_relation(y, y_mask, blank_symbol): + """ + Construct a permutation matrix and tensor for computing CTC transitions. + + This matrix is represented as an actual matrix that contains the + permutations that are common to all transitions in the batch and a tensor + with permutations that are uniqe for each individual sequence. + + This 'matrix' is used to take the transition costraints into account + using just matrix algebra operations. + + Parameters + ---------- + y : matrix (L, B) + the target label sequences + y_mask : matrix (L, B) + indicates which values of y to use + blank_symbol: integer + indicates the symbol that signifies a blank label. + Returns + ------- + matrix (L, L) + tensor3 (L, L, B) + """ + n_y = y.shape[0] + blanks = tensor.zeros((2, y.shape[1])) + blank_symbol + ybb = tensor.concatenate((y, blanks), axis=0).T + sec_diag = (tensor.neq(ybb[:, :-2], ybb[:, 2:]) * + tensor.eq(ybb[:, 1:-1], blank_symbol) * + y_mask.T) + + # r1: LxL + # r2: LxL + # r3: LxLxB + eye2 = tensor.eye(n_y + 2) + r2 = eye2[2:, 1:-1] # tensor.eye(n_y, k=1) + r3 = (eye2[2:, :-2].dimshuffle(0, 1, 'x') * + sec_diag.dimshuffle(1, 'x', 0)) + + return r2, r3 + + +def _path_probabs(y, y_hat, y_mask, y_hat_mask, blank_symbol): + """Compute the probabilities of the paths that are compatible with the + sequence y. + + This function uses scan to get the forward probabilities (often denoted + with the symbol alpha in the literature). + + See _log_path_probabs for a version that works in log domain. + """ + + + + + pred_y = _class_batch_to_labeling_batch(y, y_hat, y_hat_mask) + pred_y = pred_y.dimshuffle(0, 2, 1) + n_labels = y.shape[0] + + r2, r3 = _recurrence_relation(y, y_mask, blank_symbol) + + def step(p_curr, p_prev): + # instead of dot product, we * first + # and then sum oven one dimension. 
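# For example (toy NumPy sketch, separate from the patch): multiplying with
# the per-sequence tensor r3 and summing over one axis is equivalent to a
# per-batch dot product of p_prev (B, L) with r3[:, :, b] (L, L).
import numpy as np

B, L = 2, 5
p_prev = np.random.rand(B, L)
r3 = np.random.rand(L, L, B)
batched = (p_prev.T[:, None, :] * r3).sum(axis=0).T              # (B, L)
looped = np.stack([p_prev[b].dot(r3[:, :, b]) for b in range(B)])
assert np.allclose(batched, looped)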
+ # objective: T.dot((p_prev)BxL, LxLxB) + # solusion: Lx1xB * LxLxB --> LxLxB --> (sumover)xLxB + dotproduct = (p_prev + tensor.dot(p_prev, r2) + + (p_prev.dimshuffle(1, 'x', 0) * r3).sum(axis=0).T) + return p_curr.T * dotproduct * y_mask.T # B x L + + probabilities, _ = theano.scan( + step, + sequences=[pred_y], + outputs_info=[tensor.eye(n_labels)[0] * tensor.ones(y.T.shape)]) + return probabilities + + +def _log_add(a, b): + # TODO: move functions like this to utils + max_ = tensor.maximum(a, b) + result = (max_ + tensor.log1p(tensor.exp(a + b - 2 * max_))) + return T.switch(T.isnan(result), max_, result) + + +def _log_dot_matrix(x, z): + y = x[:, :, None] + z[None, :, :] + y_max = y.max(axis=1) + out = T.log(T.sum(T.exp(y - y_max[:, None, :]), axis=1)) + y_max + return T.switch(T.isnan(out), -numpy.inf, out) + + +def _log_dot_tensor(x, z): + log_dot = x.dimshuffle(1, 'x', 0) + z + max_ = log_dot.max(axis=0) + out = (T.log(T.sum(T.exp(log_dot - max_[None, :, :]), axis=0)) + max_) + out = out.T + return T.switch(T.isnan(out), -numpy.inf, out) + + +def _log_path_probabs(y, log_y_hat, y_mask, y_hat_mask, blank_symbol, + reverse=False): + """ + Uses dynamic programming to compute the path probabilities. + + This function uses scan to get the forward probabilities (often denoted + with the symbol alpha in the literature). + + This function computes the probabilities in log domain and can be used + both the forward and backward passes of the CTC algorithm. + + Notes + ----- + T: number of time steps + B: batch size + L: length of label sequence + C: number of classes + Parameters + ---------- + y : matrix (L, B) + the target label sequences + log_y_hat : tensor3 (T, B, C) + log class probabily distribution sequences + y_mask : matrix (L, B) + indicates which values of y to use + y_hat_mask : matrix (T, B) + indicates the lenghts of the sequences in log_y_hat + blank_symbol: integer + indicates the symbol that signifies a blank label. + Returns + ------- + tensor3 (T, B, L): + the log forward probabilities for each label at every time step. + masked values should be -inf + """ + + n_labels, batch_size = y.shape + + if reverse: + y = y[::-1] + log_y_hat = log_y_hat[::-1] + y_hat_mask = y_hat_mask[::-1] + y_mask = y_mask[::-1] + # going backwards, the first non-zero alpha value should be the + # first non-masked label. + start_positions = T.cast(n_labels - y_mask.sum(0), 'int64') + else: + start_positions = T.zeros((batch_size,), dtype='int64') + + log_pred_y = _class_batch_to_labeling_batch(y, log_y_hat, y_hat_mask) + log_pred_y = log_pred_y.dimshuffle(0, 2, 1) + r2, r3 = _recurrence_relation(y, y_mask, blank_symbol) + r2, r3 = T.log(r2), T.log(r3) + + def step(log_p_curr, y_hat_mask_t, log_p_prev): + # applies the transitions matrices to take the sequence constraints of y + # into account. 
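# An illustrative NumPy aside (toy values, separate from the patch) on why
# the combination below stays in log space with the max-shifted form: the
# naive log(exp(a) + exp(b)) underflows for very negative inputs, while the
# shifted version does not.
import numpy as np

a, b = -1000.0, -1001.0
with np.errstate(divide='ignore'):
    naive = np.log(np.exp(a) + np.exp(b))                        # -> -inf
stable = max(a, b) + np.log1p(np.exp(min(a, b) - max(a, b)))
assert np.isinf(naive)
assert np.isclose(stable, -1000.0 + np.log1p(np.exp(-1.0)))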
+ p1 = log_p_prev + p2 = _log_dot_matrix(p1, r2) + p3 = _log_dot_tensor(p1, r3) + p12 = _log_add(p1, p2) + p123 = _log_add(p3, p12) + + y_hat_mask_t = y_hat_mask_t[:, None] + out = log_p_curr.T + p123 + T.log(y_mask.T) + return _log_add(T.log(y_hat_mask_t) + out, + T.log(1 - y_hat_mask_t) + log_p_prev) + + log_probabilities, _ = theano.scan( + step, + sequences=[log_pred_y, y_hat_mask], + outputs_info=[T.log(tensor.eye(n_labels)[start_positions])]) + + return log_probabilities + T.log(y_hat_mask[:, :, None]) + + +def _log_forward_backward(y, log_y_hat, y_mask, y_hat_mask, blank_symbol): + """Simply calls _log_path_probabs in both directions.""" + + log_probabs_forward = _log_path_probabs(y, + log_y_hat, + y_mask, + y_hat_mask, + blank_symbol) + log_probabs_backward = _log_path_probabs(y, + log_y_hat, + y_mask, + y_hat_mask, + blank_symbol, + reverse=True) + return log_probabs_forward, log_probabs_backward[::-1][:, :, ::-1] + + +def _labeling_batch_to_class_batch(y, y_labeling, num_classes): + """Coverts a sequence label lattice into a lattice of scores/probabilities + for each class per input time step. + """ + + batch_size = y.shape[1] + N = y_labeling.shape[0] + n_labels = y.shape[0] + # sum over all repeated labels + # from (T, B, L) to (T, C, B) + out = T.zeros((num_classes, batch_size, N)) + y_labeling = y_labeling.dimshuffle((2, 1, 0)) # L, B, T + y_ = y + + def scan_step(index, prev_res, y_labeling, y_): + res_t = T.inc_subtensor(prev_res[y_[index, T.arange(batch_size)], + T.arange(batch_size)], + y_labeling[index, T.arange(batch_size)]) + return res_t + + result, updates = theano.scan(scan_step, + sequences=[T.arange(n_labels)], + non_sequences=[y_labeling, y_], + outputs_info=[out]) + # result will be (C, B, T) so we make it (T, B, C) + return result[-1].dimshuffle(2, 1, 0) diff --git a/test_ctc.py b/test_ctc.py index ae60cd8..b0c556a 100644 --- a/test_ctc.py +++ b/test_ctc.py @@ -1,122 +1,446 @@ +import numpy as np import theano -import numpy -from theano import tensor -from blocks.model import Model -from blocks.bricks import Linear, Tanh -from ctc_cost import CTC -from blocks.initialization import IsotropicGaussian, Constant -from fuel.datasets import IterableDataset -from fuel.streams import DataStream -from blocks.algorithms import (GradientDescent, Scale, - StepClipping, CompositeRule) -from blocks.extensions.monitoring import TrainingDataMonitoring -from blocks.main_loop import MainLoop -from blocks.extensions import FinishAfter, Printing -from blocks.bricks.recurrent import SimpleRecurrent -from blocks.graph import ComputationGraph -import cPickle as pickle +import ctc_cost +import theano.tensor as T +from numpy import testing +from itertools import izip, islice + floatX = theano.config.floatX -@theano.compile.ops.as_op(itypes=[tensor.lvector], - otypes=[tensor.lvector]) -def print_pred(y_hat): - blank_symbol = 4 - res = [] - for i, s in enumerate(y_hat): - if (s != blank_symbol) and (i == 0 or s != y_hat[i - 1]): - res += [s] - return numpy.asarray(res) - -n_epochs = 200 -x_dim = 4 -h_dim = 9 -num_classes = 4 - -with open("ctc_test_data.pkl", "rb") as pkl_file: - data = pickle.load(pkl_file) - inputs = data['inputs'] - labels = data['labels'] - # from S x T x B x D to S x T x B - inputs_mask = numpy.max(data['mask_inputs'], axis=-1) - labels_mask = data['mask_labels'] - -print 'Building model ...' 
-# T x B x F -x = tensor.tensor3('x', dtype=floatX) -# T x B -x_mask = tensor.matrix('x_mask', dtype=floatX) -# L x B -y = tensor.matrix('y', dtype=floatX) -# L x B -y_mask = tensor.matrix('y_mask', dtype=floatX) - -x_to_h = Linear(name='x_to_h', - input_dim=x_dim, - output_dim=h_dim) -x_transform = x_to_h.apply(x) -rnn = SimpleRecurrent(activation=Tanh(), - dim=h_dim, name="rnn") -h = rnn.apply(x_transform) -h_to_o = Linear(name='h_to_o', - input_dim=h_dim, - output_dim=num_classes + 1) -h_transform = h_to_o.apply(h) -# T x B x C+1 -y_hat = tensor.nnet.softmax( - h_transform.reshape((-1, num_classes + 1)) -).reshape((h.shape[0], h.shape[1], -1)) -y_hat.name = 'y_hat' - -y_hat_mask = x_mask -cost = CTC().apply(y, y_hat, y_mask, y_hat_mask, 'normal_scale') -cost.name = 'CTC' -# Initialization -for brick in (rnn, x_to_h, h_to_o): - brick.weights_init = IsotropicGaussian(0.01) - brick.biases_init = Constant(0) - brick.initialize() - -print 'Bulding DataStream ...' -dataset = IterableDataset({'x': inputs, - 'x_mask': inputs_mask, - 'y': labels, - 'y_mask': labels_mask}) -stream = DataStream(dataset) - -print 'Bulding training process...' -algorithm = GradientDescent(cost=cost, - params=ComputationGraph(cost).parameters, - step_rule=CompositeRule([StepClipping(10.0), - Scale(0.02)])) -monitor_cost = TrainingDataMonitoring([cost], - prefix="train", - after_epoch=True) - -# sample number to monitor -sample = 8 - -y_hat_max_path = print_pred(tensor.argmax(y_hat[:, sample, :], axis=1)) -y_hat_max_path.name = 'Viterbi' -monitor_output = TrainingDataMonitoring([y_hat_max_path], - prefix="y_hat", - every_n_epochs=1) - -length = tensor.sum(y_mask[:, sample]).astype('int32') -tar = y[:length, sample].astype('int32') -tar.name = '_Target_Seq' -monitor_target = TrainingDataMonitoring([tar], - prefix="y", - every_n_epochs=1) - -model = Model(cost) -main_loop = MainLoop(data_stream=stream, algorithm=algorithm, - extensions=[monitor_cost, monitor_output, - monitor_target, - FinishAfter(after_n_epochs=n_epochs), - Printing()], - model=model) - -print 'Starting training ...' 
-main_loop.run() +def test_log_add(): + x = T.scalar() + y = T.scalar() + z = ctc_cost._log_add(x, y) + X = -3.0 + Y = -np.inf + value = z.eval({x: X, y: Y}) + assert value == -3.0 + + +def test_log_dot_matrix(): + x = T.matrix() + y = T.matrix() + z = ctc_cost._log_dot_matrix(y, x) + X = np.asarray(np.random.normal(0, 1, (5, 4)), dtype=floatX) + Y = np.asarray(np.random.normal(0, 1, (3, 5)), dtype=floatX) + #Y = np.ones((3, 5), dtype=floatX) * 3 + value = z.eval({x: X, y: Y}) + np_value = np.log(np.dot(np.exp(Y), np.exp(X))) + assert np.mean((value - np_value)**2) < 1e5 + + +def test_log_dot_matrix_zeros(): + x = T.matrix() + y = T.matrix() + z = ctc_cost._log_dot_matrix(y, x) + X = np.log(np.asarray(np.eye(5), dtype=floatX)) + Y = np.asarray(np.random.normal(0, 1, (3, 5)), dtype=floatX) + #Y = np.ones((3, 5), dtype=floatX) * 3 + value = z.eval({x: X, y: Y}) + np_value = np.log(np.dot(np.exp(Y), np.exp(X))) + assert np.mean((value - np_value)**2) < 1e5 + + +def test_ctc_add_blanks(): + BATCHES = 3 + N_LABELS = 3 + y = T.lmatrix('phonemes') + y_mask = T.matrix('phonemes_mask') + blanked_y, blanked_y_mask = ctc_cost._add_blanks( + y=y, + blank_symbol=1, + y_mask=y_mask) + Y = np.zeros((N_LABELS, BATCHES), dtype='int64') + Y_mask = np.asarray(np.ones_like(Y), dtype=floatX) + Y_mask[-1, 0] = 0 + Blanked_y_mask = blanked_y_mask.eval({y_mask: Y_mask}) + Blanked_y = blanked_y.eval({y: Y}) + assert (Blanked_y == np.array([[1, 1, 1], + [0, 0, 0], + [1, 1, 1], + [0, 0, 0], + [1, 1, 1], + [0, 0, 0], + [1, 1, 1]], dtype='int32')).all() + assert (Blanked_y_mask == np.array([[1., 1., 1.], + [1., 1., 1.], + [1., 1., 1.], + [1., 1., 1.], + [1., 1., 1.], + [0., 1., 1.], + [0., 1., 1.]], dtype=floatX)).all() + + +def test_ctc_symmetry_logscale(): + LENGTH = 5000 + BATCHES = 3 + CLASSES = 4 + y_hat = T.tensor3('features') + input_mask = T.matrix('features_mask') + y_hat_mask = input_mask + y = T.lmatrix('phonemes') + y_mask = T.matrix('phonemes_mask') + ctc_cost_t = ctc_cost.cost(y, y_hat, y_mask, y_hat_mask) + + Y_hat = np.zeros((LENGTH, BATCHES, CLASSES), dtype=floatX) + Y_hat[:, :, 0] = .3 + Y_hat[:, :, 1] = .2 + Y_hat[:, :, 2] = .4 + Y_hat[:, :, 3] = .1 + Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX) + # default blank symbol is the highest class index (3 in this case) + Y = np.repeat(np.array([0, 1, 2, 1, 2, 0, 2, 2, 2]), + BATCHES).reshape((9, BATCHES)) + # the masks for this test should be all ones. 
+ Y_mask = np.asarray(np.ones_like(Y), dtype=floatX) + forward_cost = ctc_cost_t.eval({y_hat: Y_hat, y: Y, + y_hat_mask: Y_hat_mask, y_mask: Y_mask}) + backward_cost = ctc_cost_t.eval({y_hat: Y_hat, y: Y[::-1], + y_hat_mask: Y_hat_mask, y_mask: Y_mask}) + testing.assert_almost_equal(forward_cost[0], backward_cost[0]) + assert not np.isnan(forward_cost[0]) + assert not np.isnan(backward_cost[0]) + assert not np.isinf(np.abs(forward_cost[0])) + assert not np.isinf(np.abs(backward_cost[0])) + + +def test_ctc_symmetry(): + LENGTH = 20 + BATCHES = 3 + CLASSES = 4 + y_hat = T.tensor3('features') + input_mask = T.matrix('features_mask') + y_hat_mask = input_mask + y = T.lmatrix('phonemes') + y_mask = T.matrix('phonemes_mask') + ctc_cost_t = ctc_cost.cost(y, y_hat, y_mask, y_hat_mask, log_scale=False) + + Y_hat = np.zeros((LENGTH, BATCHES, CLASSES), dtype=floatX) + Y_hat[:, :, 0] = .3 + Y_hat[:, :, 1] = .2 + Y_hat[:, :, 2] = .4 + Y_hat[:, :, 3] = .1 + Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX) + # default blank symbol is the highest class index (3 in this case) + Y = np.repeat(np.array([0, 1, 2, 1, 2, 0, 2, 2, 2]), + BATCHES).reshape((9, BATCHES)) + # the masks for this test should be all ones. + Y_mask = np.asarray(np.ones_like(Y), dtype=floatX) + forward_cost = ctc_cost_t.eval({y_hat: Y_hat, y: Y, + y_hat_mask: Y_hat_mask, y_mask: Y_mask}) + backward_cost = ctc_cost_t.eval({y_hat: Y_hat, y: Y[::-1], + y_hat_mask: Y_hat_mask, y_mask: Y_mask}) + testing.assert_almost_equal(forward_cost[0], backward_cost[0]) + assert not np.isnan(forward_cost[0]) + assert not np.isnan(backward_cost[0]) + assert not np.isinf(np.abs(forward_cost[0])) + assert not np.isinf(np.abs(backward_cost[0])) + + +def test_ctc_exact_log_scale(): + LENGTH = 4 + BATCHES = 1 + CLASSES = 2 + y_hat = T.tensor3('features') + input_mask = T.matrix('features_mask') + y_hat_mask = input_mask + y = T.lmatrix('phonemes') + y_mask = T.matrix('phonemes_mask') + ctc_cost_t = ctc_cost.cost(y, y_hat, y_mask, y_hat_mask, log_scale=True) + + Y_hat = np.zeros((LENGTH, BATCHES, CLASSES), dtype=floatX) + Y_hat[:, :, 0] = .7 + Y_hat[:, :, 1] = .3 + Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX) + # default blank symbol is the highest class index (3 in this case) + Y = np.zeros((2, 1), dtype='int64') + # -0-0 + # 0-0- + # 0--0 + # 0-00 + # 00-0 + answer = np.log(3 * (.3 * .7)**2 + 2 * .3 * .7**3) + Y_mask = np.asarray(np.ones_like(Y), dtype=floatX) + forward_cost = ctc_cost_t.eval({y_hat: Y_hat, y: Y, + y_hat_mask: Y_hat_mask, y_mask: Y_mask}) + backward_cost = ctc_cost_t.eval({y_hat: Y_hat, y: Y[::-1], + y_hat_mask: Y_hat_mask, y_mask: Y_mask}) + assert not np.isnan(forward_cost[0]) + assert not np.isnan(backward_cost[0]) + assert not np.isinf(np.abs(forward_cost[0])) + assert not np.isinf(np.abs(backward_cost[0])) + testing.assert_almost_equal(-forward_cost[0], answer) + testing.assert_almost_equal(-backward_cost[0], answer) + + +def test_ctc_exact(): + LENGTH = 4 + BATCHES = 1 + CLASSES = 2 + y_hat = T.tensor3('features') + input_mask = T.matrix('features_mask') + y_hat_mask = input_mask + y = T.lmatrix('phonemes') + y_mask = T.matrix('phonemes_mask') + ctc_cost_t = ctc_cost.cost(y, y_hat, y_mask, y_hat_mask, log_scale=False) + + Y_hat = np.zeros((LENGTH, BATCHES, CLASSES), dtype=floatX) + Y_hat[:, :, 0] = .7 + Y_hat[:, :, 1] = .3 + Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX) + # default blank symbol is the highest class index (3 in this case) + Y = np.zeros((2, 1), dtype='int64') + # -0-0 + # 0-0- + # 0--0 + # 0-00 + # 
00-0 + answer = np.log(3 * (.3 * .7)**2 + 2 * .3 * .7**3) + Y_mask = np.asarray(np.ones_like(Y), dtype=floatX) + forward_cost = ctc_cost_t.eval({y_hat: Y_hat, y: Y, + y_hat_mask: Y_hat_mask, y_mask: Y_mask}) + backward_cost = ctc_cost_t.eval({y_hat: Y_hat, y: Y[::-1], + y_hat_mask: Y_hat_mask, y_mask: Y_mask}) + assert not np.isnan(forward_cost[0]) + assert not np.isnan(backward_cost[0]) + assert not np.isinf(np.abs(forward_cost[0])) + assert not np.isinf(np.abs(backward_cost[0])) + testing.assert_almost_equal(-forward_cost[0], answer) + testing.assert_almost_equal(-backward_cost[0], answer) + + +def test_ctc_log_path_probabs(): + LENGTH = 10 + BATCHES = 3 + CLASSES = 2 + N_LABELS = 3 + y_hat = T.tensor3('features') + input_mask = T.matrix('features_mask') + y_hat_mask = input_mask + y = T.lmatrix('phonemes') + y_mask = T.matrix('phonemes_mask') + blanked_y, blanked_y_mask = ctc_cost._add_blanks( + y=y, + blank_symbol=1, + y_mask=y_mask) + p = ctc_cost._log_path_probabs(blanked_y, y_hat, blanked_y_mask, y_hat_mask, 1) + Y_hat = np.zeros((LENGTH, BATCHES, CLASSES + 1), dtype=floatX) + Y_hat[:, :, 0] = .7 + Y_hat[:, :, 1] = .2 + Y_hat[:, :, 2] = .1 + Y = np.zeros((N_LABELS, BATCHES), dtype='int64') + Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX) + Y_hat_mask[-2:, 0] = 0 + Y_mask = np.asarray(np.ones_like(Y), dtype=floatX) + forward_probs = p.eval({y_hat: Y_hat, y: Y, + y_hat_mask: Y_hat_mask, y_mask: Y_mask}) + assert forward_probs[-2, 0, 0] == -np.inf + Y_mask[-1] = 0 + forward_probs_y_mask = p.eval({y_hat: Y_hat, y: Y, + y_hat_mask: Y_hat_mask, y_mask: Y_mask}) + assert forward_probs_y_mask[-1, 1, -2] == -np.inf + assert not np.isnan(forward_probs).any() + + +def test_ctc_log_forward_backward(): + LENGTH = 8 + BATCHES = 4 + CLASSES = 2 + N_LABELS = 3 + y_hat = T.tensor3('features') + input_mask = T.matrix('features_mask') + y_hat_mask = input_mask + y = T.lmatrix('phonemes') + y_mask = T.matrix('phonemes_mask') + blanked_y, blanked_y_mask = ctc_cost._add_blanks( + y=y, + blank_symbol=1, + y_mask=y_mask) + f, b = ctc_cost._log_forward_backward(blanked_y, y_hat, + blanked_y_mask, y_hat_mask, CLASSES) + Y_hat = np.zeros((LENGTH, BATCHES, CLASSES + 1), dtype=floatX) + Y_hat[:, :, 0] = .7 + Y_hat[:, :, 1] = .2 + Y_hat[:, :, 2] = .1 + Y_hat[3, :, 0] = .3 + Y_hat[3, :, 1] = .4 + Y_hat[3, :, 2] = .3 + Y = np.zeros((N_LABELS, BATCHES), dtype='int64') + Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX) + Y_hat_mask[-2:] = 0 + Y_mask = np.asarray(np.ones_like(Y), dtype=floatX) + Y_mask[-2:, 0] = 0 + y_prob = ctc_cost._class_batch_to_labeling_batch(blanked_y, + y_hat, + y_hat_mask) + forward_probs = f.eval({y_hat: Y_hat, y: Y, + y_hat_mask: Y_hat_mask, y_mask: Y_mask}) + backward_probs = b.eval({y_hat: Y_hat, y: Y, + y_hat_mask: Y_hat_mask, y_mask: Y_mask}) + y_probs = y_prob.eval({y_hat: Y_hat, y: Y, y_hat_mask: Y_hat_mask}) + assert not ((forward_probs + backward_probs)[:, 0, :] == -np.inf).all() + marg = forward_probs + backward_probs - np.log(y_probs) + forward_probs = np.exp(forward_probs) + backward_probs = np.exp(backward_probs) + L = (forward_probs * backward_probs[::-1][:, :, ::-1] / y_probs).sum(2) + assert not np.isnan(forward_probs).any() + + +def finite_diff(Y, Y_hat, Y_mask, Y_hat_mask, eps=1e-2, n_steps=None): + y_hat = T.tensor3('features') + y_hat_mask = T.matrix('features_mask') + y = T.lmatrix('phonemes') + y_mask = T.matrix('phonemes_mask') + ctc_cost_t = ctc_cost.cost(y, y_hat, y_mask, y_hat_mask) + get_cost = theano.function([y, y_hat, y_mask, y_hat_mask], + 
ctc_cost_t.sum()) + diff_grad = np.zeros_like(Y_hat) + + for grad, val in islice(izip(np.nditer(diff_grad, op_flags=['readwrite']), + np.nditer(Y_hat, op_flags=['readwrite'])), + 0, n_steps): + val += eps + error_inc = get_cost(Y, Y_hat, Y_mask, Y_hat_mask) + val -= 2.0 * eps + error_dec = get_cost(Y, Y_hat, Y_mask, Y_hat_mask) + grad[...] = .5 * (error_inc - error_dec) / eps + val += eps + + return diff_grad + + +def test_ctc_class_batch_to_labeling_batch(): + LENGTH = 20 + BATCHES = 4 + CLASSES = 2 + LABELS = 2 + y_hat = T.tensor3() + y_hat_mask = T.matrix('features_mask') + y = T.lmatrix('phonemes') + y_labeling = ctc_cost._class_batch_to_labeling_batch(y, y_hat, y_hat_mask) + Y_hat = np.zeros((LENGTH, BATCHES, CLASSES + 1), dtype=floatX) + Y = np.zeros((2, BATCHES), dtype='int64') + Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX) + Y_hat_mask[-5:] = 0 + Y_labeling = y_labeling.eval({y_hat: Y_hat, y: Y, y_hat_mask: Y_hat_mask}) + assert Y_labeling.shape == (LENGTH, BATCHES, LABELS) + + +def test_ctc_labeling_batch_to_class_batch(): + LENGTH = 20 + BATCHES = 4 + CLASSES = 2 + LABELS = 2 + y_labeling = T.tensor3() + y = T.lmatrix('phonemes') + y_hat = ctc_cost._labeling_batch_to_class_batch(y, y_labeling, CLASSES + 1) + Y_labeling = np.zeros((LENGTH, BATCHES, LABELS), dtype=floatX) + Y = np.zeros((2, BATCHES), dtype='int64') + Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX) + Y_hat_mask[-5:] = 0 + Y_hat = y_hat.eval({y_labeling: Y_labeling, y: Y}) + assert Y_hat.shape == (LENGTH, BATCHES, CLASSES + 1) + + +def test_ctc_targets(): + LENGTH = 20 + BATCHES = 4 + CLASSES = 2 + y_hat = T.tensor3('features') + input_mask = T.matrix('features_mask') + y_hat_mask = input_mask + y = T.lmatrix('phonemes') + y_mask = T.matrix('phonemes_mask') + ctc_target = ctc_cost.get_targets(y, T.log(y_hat), y_mask, y_hat_mask) + Y_hat = np.zeros((LENGTH, BATCHES, CLASSES + 1), dtype=floatX) + Y_hat[:, :, 0] = .7 + Y_hat[:, :, 1] = .2 + Y_hat[:, :, 2] = .1 + Y_hat[3, :, 0] = .3 + Y_hat[3, :, 1] = .4 + Y_hat[3, :, 2] = .3 + Y = np.zeros((2, BATCHES), dtype='int64') + Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX) + Y_hat_mask[-5:] = 0 + # default blank symbol is the highest class index (3 in this case) + Y_mask = np.asarray(np.ones_like(Y), dtype=floatX) + target = ctc_target.eval({y_hat: Y_hat, y: Y, + y_hat_mask: Y_hat_mask, y_mask: Y_mask}) + # Note that this part is the same as the cross entropy gradient + grad = -target / Y_hat + test_grad = finite_diff(Y, Y_hat, Y_mask, Y_hat_mask, eps=1e-2, n_steps=5) + testing.assert_almost_equal(grad.flatten()[:5], + test_grad.flatten()[:5], decimal=3) + + +def test_ctc_pseudo_cost(): + LENGTH = 500 + BATCHES = 40 + CLASSES = 2 + N_LABELS = 45 + y_hat = T.tensor3('features') + input_mask = T.matrix('features_mask') + y_hat_mask = input_mask + y = T.lmatrix('phonemes') + y_mask = T.matrix('phonemes_mask') + pseudo_cost = ctc_cost.pseudo_cost(y, y_hat, y_mask, y_hat_mask) + + Y_hat = np.zeros((LENGTH, BATCHES, CLASSES + 1), dtype=floatX) + Y_hat[:, :, 0] = .75 + Y_hat[:, :, 1] = .2 + Y_hat[:, :, 2] = .05 + Y_hat[3, 0, 0] = .3 + Y_hat[3, 0, 1] = .4 + Y_hat[3, 0, 2] = .3 + Y = np.zeros((N_LABELS, BATCHES), dtype='int64') + Y[25:, :] = 1 + Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX) + Y_hat_mask[-5:] = 0 + # default blank symbol is the highest class index (3 in this case) + Y_mask = np.asarray(np.ones_like(Y), dtype=floatX) + Y_mask[30:] = 0 + cost = pseudo_cost.eval({y_hat: Y_hat, y: Y, + y_hat_mask: Y_hat_mask, y_mask: Y_mask}) + pseudo_grad = 
T.grad(ctc_cost.pseudo_cost(y, y_hat, + y_mask, y_hat_mask).sum(), + y_hat) + #test_grad2 = pseudo_grad.eval({y_hat: Y_hat, y: Y, + # y_hat_mask: Y_hat_mask, y_mask: Y_mask}) + # TODO: write some more meaningful asserts here + assert cost.sum() > 0 + + +def test_ctc_pseudo_cost_skip_softmax_stability(): + LENGTH = 500 + BATCHES = 40 + CLASSES = 2 + N_LABELS = 45 + y_hat = T.tensor3('features') + input_mask = T.matrix('features_mask') + y_hat_mask = input_mask + y = T.lmatrix('phonemes') + y_mask = T.matrix('phonemes_mask') + pseudo_cost = ctc_cost.pseudo_cost(y, y_hat, y_mask, y_hat_mask, + skip_softmax=True) + + Y_hat = np.asarray(np.random.normal(0, 1, (LENGTH, BATCHES, CLASSES + 1)), + dtype=floatX) + Y = np.zeros((N_LABELS, BATCHES), dtype='int64') + Y[25:, :] = 1 + Y_hat_mask = np.ones((LENGTH, BATCHES), dtype=floatX) + Y_hat_mask[-5:] = 0 + # default blank symbol is the highest class index (3 in this case) + Y_mask = np.asarray(np.ones_like(Y), dtype=floatX) + Y_mask[30:] = 0 + pseudo_grad = T.grad(pseudo_cost.sum(), y_hat) + test_grad = pseudo_grad.eval({y_hat: Y_hat, y: Y, + y_hat_mask: Y_hat_mask, y_mask: Y_mask}) + y_hat_softmax = T.exp(y_hat) / T.exp(y_hat).sum(2)[:, :, None] + pseudo_cost2 = ctc_cost.pseudo_cost(y, y_hat_softmax, y_mask, y_hat_mask, + skip_softmax=False) + pseudo_grad2 = T.grad(pseudo_cost2.sum(), y_hat) + test_grad2 = pseudo_grad2.eval({y_hat: Y_hat, y: Y, + y_hat_mask: Y_hat_mask, y_mask: Y_mask}) + testing.assert_almost_equal(test_grad, test_grad2, decimal=4)
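# A usage sketch (toy data, separate from the test suite) showing how
# pseudo_cost with skip_softmax=True, as recommended in its docstring, can be
# wired into a Theano graph together with its gradient.
import numpy as np
import theano
import theano.tensor as T
import ctc_cost

floatX = theano.config.floatX

y_hat = T.tensor3('y_hat')           # (T, B, C+1) unnormalized scores
y_hat_mask = T.matrix('y_hat_mask')  # (T, B)
y = T.lmatrix('y')                   # (L, B) target labels
y_mask = T.matrix('y_mask')          # (L, B)

per_example_cost = ctc_cost.pseudo_cost(y, y_hat, y_mask, y_hat_mask,
                                        skip_softmax=True)
grad = T.grad(per_example_cost.sum(), y_hat)
f = theano.function([y, y_hat, y_mask, y_hat_mask],
                    [per_example_cost.sum(), grad])

LENGTH, BATCHES, CLASSES, LABELS = 50, 2, 3, 5
scores = np.random.randn(LENGTH, BATCHES, CLASSES + 1).astype(floatX)
labels = np.random.randint(0, CLASSES, size=(LABELS, BATCHES)).astype('int64')
cost_value, grad_value = f(labels, scores,
                           np.ones((LABELS, BATCHES), dtype=floatX),
                           np.ones((LENGTH, BATCHES), dtype=floatX))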