The following 46 code examples, extracted from open-source Python projects, illustrate how to use theano.tensor.unbroadcast().
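Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) showing what unbroadcast does to a variable's broadcastable pattern:

import theano.tensor as T

# Minimal illustration of T.unbroadcast: clearing inferred broadcastable flags.
v = T.fmatrix('v')
summed = v.sum().dimshuffle('x', 'x')   # shape (1, 1); both axes inferred broadcastable
fixed = T.unbroadcast(summed, 0, 1)     # same values, broadcast flags removed

print(summed.broadcastable)             # (True, True)
print(fixed.broadcastable)              # (False, False)

This is the same pattern the test cases further down (test_rebroadcast, test_broadcast) exercise: the (1, 1) result of sum().dimshuffle('x', 'x') would otherwise be typed as broadcastable on both axes, which a shared-variable update or a scan state of a larger runtime shape would reject.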
def train_one(self, x, target):
    x, target = tt.unbroadcast(x, 0), tt.unbroadcast(target, 0)  # F'ing scan
    states = {}
    for layer in self.layers:
        x, layer_state = layer.forward_pass_and_state(x, count_ops=True)
        states[layer] = layer_state
    loss = self.loss(x, target)
    param_grad_pairs = []
    grad = None
    for layer in self.layers[::-1]:
        grad, param_grads = layer.backward_pass(state=states[layer], grad=grad, cost=loss, count_ops=True)
        loss = None
        param_grad_pairs += list(izip_equal(layer.parameters, param_grads))
    all_params, all_param_grads = zip(*param_grad_pairs)
    self.optimizer.update_from_gradients(parameters=all_params, gradients=all_param_grads)
    return create_constant(0.)  # scan demands some return
def get_output(self, train=False):
    X = self.get_input(train)  # shape: (nb_samples, time (padded with zeros), input_dim)
    # new shape: (time, nb_samples, input_dim) -> because theano.scan iterates over main dimension
    padded_mask = self.get_padded_shuffled_mask(train, X, pad=1)
    X = X.dimshuffle((1, 0, 2))
    x = T.dot(X, self.W) + self.b

    # scan = theano symbolic loop.
    # See: http://deeplearning.net/software/theano/library/scan.html
    # Iterate over the first dimension of the x array (=time).
    outputs, updates = theano.scan(
        self._step,  # this will be called with arguments (sequences[i], outputs[i-1], non_sequences[i])
        sequences=[x, dict(input=padded_mask, taps=[-1])],  # tensors to iterate over, inputs to _step
        # initialization of the output. Input to _step with default tap=-1.
        outputs_info=T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1),
        non_sequences=self.U,  # static inputs to _step
        truncate_gradient=self.truncate_gradient
    )
    if self.return_sequences:
        return outputs.dimshuffle((1, 0, 2))
    return outputs[-1]
def get_output(self, train=False):
    X = self.get_input(train)
    padded_mask = self.get_padded_shuffled_mask(train, X, pad=1)
    X = X.dimshuffle((1, 0, 2))

    x_z = T.dot(X, self.W_z) + self.b_z
    x_r = T.dot(X, self.W_r) + self.b_r
    x_h = T.dot(X, self.W_h) + self.b_h
    outputs, updates = theano.scan(
        self._step,
        sequences=[x_z, x_r, x_h, padded_mask],
        outputs_info=T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1),
        non_sequences=[self.U_z, self.U_r, self.U_h],
        truncate_gradient=self.truncate_gradient
    )
    if self.return_sequences:
        return outputs.dimshuffle((1, 0, 2))
    return outputs[-1]
def get_output(self, train=False):
    X = self.get_input(train)
    padded_mask = self.get_padded_shuffled_mask(train, X, pad=1)
    X = X.dimshuffle((1, 0, 2))

    xi = T.dot(X, self.W_i) + self.b_i
    xf = T.dot(X, self.W_f) + self.b_f
    xc = T.dot(X, self.W_c) + self.b_c
    xo = T.dot(X, self.W_o) + self.b_o

    [outputs, memories], updates = theano.scan(
        self._step,
        sequences=[xi, xf, xo, xc, padded_mask],
        outputs_info=[
            T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1),
            T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1)
        ],
        non_sequences=[self.U_i, self.U_f, self.U_o, self.U_c],
        truncate_gradient=self.truncate_gradient
    )
    if self.return_sequences:
        return outputs.dimshuffle((1, 0, 2))
    return outputs[-1]
def get_output(self, train=False):
    X = self.get_input(train)
    padded_mask = self.get_padded_shuffled_mask(train, X, pad=1)
    X = X.dimshuffle((1, 0, 2))

    x_z = T.dot(X, self.W_z) + self.b_z
    x_r = T.dot(X, self.W_r) + self.b_r
    x_h = T.tanh(T.dot(X, self.Pmat)) + self.b_h
    outputs, updates = theano.scan(
        self._step,
        sequences=[x_z, x_r, x_h, padded_mask],
        outputs_info=T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1),
        non_sequences=[self.U_r, self.U_h],
        truncate_gradient=self.truncate_gradient
    )
    if self.return_sequences:
        return outputs.dimshuffle((1, 0, 2))
    return outputs[-1]
def __theano__globalpool(self, inp, poolmode='mean', dim=None, issequence=False):
    # Determine the dimensionality of data (2 or 3?)
    if dim is None:
        dim = 3 if not issequence and inp.ndim == 5 else 2

    # TODO Implement global pooling in 3D and for sequential data
    if dim == 3 or issequence:
        raise NotImplementedError("Global pooling is not yet implemented in 3D and for sequential data.")

    # Parse poolmode
    if not callable(poolmode):
        poolmode = getattr(T, poolmode)

    # Pool
    if dim == 2:
        y = poolmode(inp, axis=(2, 3), keepdims=True)
        # The last two dimensions of y are broadcastable. Fix that.
        y = T.unbroadcast(y, 2, 3)
    elif dim == 3:
        # TODO get this done
        raise NotImplementedError("")

    return y
def apply(self, sentences, init_hid=None):
    """
    Parameters
    ----------
    sentences: (length, batch, featuresdim)

    Returns
    -------
    hs: (n_blocks, batch, hid_size)
    """
    if sentences.ndim == 3:
        batch_size = sentences.shape[1]
        n_steps = sentences.shape[0]
    else:
        raise NotImplementedError

    if init_hid is None:
        init_hid = T.unbroadcast(T.alloc(numpy.float32(0.), batch_size, self.hid_size))

    rval, updates = theano.scan(self._step_forward,
                                sequences=[sentences],
                                outputs_info=[init_hid],
                                n_steps=n_steps
                                )
    self.hs = rval
    return self.hs
def get_output(self, train=False):
    X = self.get_input(train)  # shape: (nb_samples, time (padded with zeros), input_dim)
    # new shape: (time, nb_samples, input_dim) -> because theano.scan iterates over main dimension
    padded_mask = self.get_padded_shuffled_mask(train, X, pad=1)
    X = X.dimshuffle((1, 0, 2))
    x = T.dot(X, self.W) + self.b

    # scan = theano symbolic loop.
    # See: http://deeplearning.net/software/theano/library/scan.html
    # Iterate over the first dimension of the x array (=time).
    outputs, updates = theano.scan(
        self._step,  # this will be called with arguments (sequences[i], outputs[i-1], non_sequences[i])
        sequences=[x, dict(input=padded_mask, taps=[-1])],  # tensors to iterate over, inputs to _step
        # initialization of the output. Input to _step with default tap=-1.
        outputs_info=T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1),
        non_sequences=self.U,  # static inputs to _step
        truncate_gradient=self.truncate_gradient,
        go_backwards=self.go_backwards)

    if self.return_sequences:
        return outputs.dimshuffle((1, 0, 2))
    return outputs[-1]
def get_output(self, train=False):
    X = self.get_input(train)
    padded_mask = self.get_padded_shuffled_mask(train, X, pad=1)
    X = X.dimshuffle((1, 0, 2))

    x_z = T.dot(X, self.W_z) + self.b_z
    x_r = T.dot(X, self.W_r) + self.b_r
    x_h = T.dot(X, self.W_h) + self.b_h
    outputs, updates = theano.scan(
        self._step,
        sequences=[x_z, x_r, x_h, padded_mask],
        outputs_info=T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1),
        non_sequences=[self.U_z, self.U_r, self.U_h],
        truncate_gradient=self.truncate_gradient,
        go_backwards=self.go_backwards)

    if self.return_sequences:
        return outputs.dimshuffle((1, 0, 2))
    return outputs[-1]
def get_output(self, train=False):
    X = self.get_input(train)
    padded_mask = self.get_padded_shuffled_mask(train, X, pad=1)
    X = X.dimshuffle((1, 0, 2))

    xi = T.dot(X, self.W_i) + self.b_i
    xf = T.dot(X, self.W_f) + self.b_f
    xc = T.dot(X, self.W_c) + self.b_c
    xo = T.dot(X, self.W_o) + self.b_o

    [outputs, memories], updates = theano.scan(
        self._step,
        sequences=[xi, xf, xo, xc, padded_mask],
        outputs_info=[
            T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1),
            T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1)
        ],
        non_sequences=[self.U_i, self.U_f, self.U_o, self.U_c],
        truncate_gradient=self.truncate_gradient,
        go_backwards=self.go_backwards)

    if self.return_sequences:
        return outputs.dimshuffle((1, 0, 2))
    return outputs[-1]
def get_output(self, train=False):
    X = self.get_input(train)
    padded_mask = self.get_padded_shuffled_mask(train, X, pad=1)
    X = X.dimshuffle((1, 0, 2))

    x_z = T.dot(X, self.W_z) + self.b_z
    x_r = T.dot(X, self.W_r) + self.b_r
    x_h = T.tanh(T.dot(X, self.Pmat)) + self.b_h
    outputs, updates = theano.scan(
        self._step,
        sequences=[x_z, x_r, x_h, padded_mask],
        outputs_info=T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1),
        non_sequences=[self.U_r, self.U_h],
        truncate_gradient=self.truncate_gradient,
        go_backwards=self.go_backwards)

    if self.return_sequences:
        return outputs.dimshuffle((1, 0, 2))
    return outputs[-1]
def predict_one(self, x):
    x = tt.unbroadcast(x, 0)  # F'ing scan
    for layer in self.layers:
        x = layer.forward_pass(x)
    return x
def build_sampler(self, n_samples, n_steps, T, c):
    states = [TT.zeros(shape=(n_samples,), dtype='int64'),
              TT.zeros(shape=(n_samples,), dtype='float32')]

    init_c = c[0, -self.state['dim']:]
    states += [ReplicateLayer(n_samples)(init(init_c).out).out for init in self.initializers]

    # added by Zhaopeng Tu, 2015-10-30
    # init_coverage
    if self.state['maintain_coverage']:
        # in sampling, init_c is two-dimension (source_length*c_dim), same for init_coverage
        if self.state['use_accumulated_coverage'] and self.state['coverage_accumulated_operation'] == 'subtractive':
            init_coverage = TT.unbroadcast(TT.ones((c.shape[0], self.state['coverage_dim']), dtype='float32'), 1)
        else:
            init_coverage = TT.unbroadcast(TT.zeros((c.shape[0], self.state['coverage_dim']), dtype='float32'), 1)
        states.append(init_coverage)

    if not self.state['search']:
        c = PadLayer(n_steps)(c).out

    # Pad with final states
    non_sequences = [c, T]

    outputs, updates = theano.scan(self.sampling_step,
                                   outputs_info=states,
                                   non_sequences=non_sequences,
                                   sequences=[TT.arange(n_steps, dtype="int64")],
                                   n_steps=n_steps,
                                   name="{}_sampler_scan".format(self.prefix))
    if self.state['maintain_coverage']:
        return (outputs[0], outputs[1], outputs[-1]), updates
    else:
        return (outputs[0], outputs[1]), updates
def build_sampler(self, n_samples, n_steps, T, c):
    states = [TT.zeros(shape=(n_samples,), dtype='int64'),
              TT.zeros(shape=(n_samples,), dtype='float32')]

    init_c = c[0, -self.state['dim']:]
    states += [ReplicateLayer(n_samples)(init(init_c).out).out for init in self.initializers]

    # added by Zhaopeng Tu, 2015-10-30
    # init_coverage
    if self.state['maintain_coverage']:
        # in sampling, init_c is two-dimension (source_length*c_dim), same for init_coverage
        # modified by Zhaopeng Tu, 2015-12-18, big mistake here!!!
        # coverage should always be 3D: the first two dimensions are consistent with the alignment probs,
        # while the last one is the coverage dim
        if self.state['use_linguistic_coverage'] and self.state['coverage_accumulated_operation'] == 'subtractive':
            init_coverage = TT.unbroadcast(TT.ones((c.shape[0], n_samples, self.state['coverage_dim']), dtype='float32'), 2)
        else:
            init_coverage = TT.unbroadcast(TT.zeros((c.shape[0], n_samples, self.state['coverage_dim']), dtype='float32'), 2)
        states.append(init_coverage)

    if not self.state['search']:
        c = PadLayer(n_steps)(c).out

    # Pad with final states
    non_sequences = [c, T]

    if self.state['maintain_coverage'] and self.state['use_linguistic_coverage'] and self.state['use_fertility_model']:
        fertility = self.state['max_fertility'] * self.fertility_inputer(c).out
        non_sequences.append(fertility)

    outputs, updates = theano.scan(self.sampling_step,
                                   outputs_info=states,
                                   non_sequences=non_sequences,
                                   sequences=[TT.arange(n_steps, dtype="int64")],
                                   n_steps=n_steps,
                                   name="{}_sampler_scan".format(self.prefix))
    if self.state['maintain_coverage']:
        if self.state['use_fertility_model'] and self.state['use_linguistic_coverage']:
            return (outputs[0], outputs[1], outputs[-1], fertility), updates
        else:
            return (outputs[0], outputs[1], outputs[-1]), updates
    else:
        return (outputs[0], outputs[1]), updates
def get_output(self, train=False):
    X = self.get_input(train)
    padded_mask = self.get_padded_shuffled_mask(train, X, pad=self.depth)
    X = X.dimshuffle((1, 0, 2))

    x = T.dot(X, self.W) + self.b

    if self.depth == 1:
        initial = T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1)
    else:
        initial = T.unbroadcast(T.unbroadcast(alloc_zeros_matrix(self.depth, X.shape[1], self.output_dim), 0), 2)

    outputs, updates = theano.scan(
        self._step,
        sequences=[x, dict(
            input=padded_mask,
            taps=[(-i) for i in range(self.depth)]
        )],
        outputs_info=[dict(
            initial=initial,
            taps=[(-i-1) for i in range(self.depth)]
        )],
        non_sequences=self.Us,
        truncate_gradient=self.truncate_gradient
    )
    if self.return_sequences:
        return outputs.dimshuffle((1, 0, 2))
    return outputs[-1]
def _forward(self):
    if theano.config.device.startswith('gpu'):
        from theano.tensor.nnet.abstract_conv import bilinear_upsampling
    else:
        raise AssertionError('Bilinear interpolation requires GPU and cuDNN.')

    inpt = T.reshape(self.inpt, (self.inpt_depth, self.n_inpt, self.inpt_height, self.inpt_width))
    pre_res = bilinear_upsampling(input=inpt, ratio=self.up_factor)
    shuffle_res = pre_res.dimshuffle((2, 3, 0, 1))
    res = self._bilinear_upsampling_1D(inpt=shuffle_res, ratio=self.up_factor)
    self.output = res.dimshuffle((2, 3, 0, 1))
    self.output = T.shape_padaxis(self.output, axis=0)
    self.output = T.unbroadcast(self.output, 0)
def forward_batch(self, x, mask):
    """
    :param x: (batch, length, dim)
    :param mask: (batch, length, )
    :return: (batch, length, hidden_dim)
    """
    # conv_after_length = length - kernel + 2 * padding_size + 1
    new_x = x
    if self.padding_size > 0:
        # (padding_size + length + padding_size, dim)
        new_x = temporal_padding_3d(x, (self.padding_size, self.padding_size))
        # (batch, conv_after_length)
        mask = temporal_padding_mask(mask, kernel_size=self.kernel_size, padding_size=self.padding_size)
    elif self.padding_size == 0:
        # (batch, conv_after_length)
        mask = temporal_padding_mask(mask, kernel_size=self.kernel_size, padding_size=0)
    else:
        raise RuntimeError("Dilation Rate >= 0")
    # safe_x = temporal_padding_3d(x, (0, self.kernel_size - x.shape[1]))
    # safe_mask = T.ones((x.shape[0], ), dtype=theano.config.floatX).dimshuffle([0, 'x'])
    # !!! convert safe_mask from col to matrix
    # safe_mask = T.unbroadcast(safe_mask, 1)
    # x, mask = ifelse(T.gt(self.kernel_size - x.shape[1], 0),
    #                  (safe_x, safe_mask),
    #                  (new_x, mask))
    # (batch, conv_after_length, hidden_dim)
    conv_result = self.forward_conv_batch(new_x)
    # new_x = Print(new_x)
    # mask = Print()(mask)
    pooling_result = get_pooling_batch(conv_result, mask, self.pooling)
    dropout_out = dropout_from_layer(pooling_result, self.dropout)
    return self.act.activate(dropout_out + self.b)
def feedforward(self, inp=None):
    # Parse
    if inp is None:
        inp = self.x
    else:
        self.x = inp

    # Input is 5D sequential, i.e. we average over the T axis
    self.y = T.unbroadcast(T.mean(inp, axis=1, keepdims=self.keepdims), 1)
    # Return
    return self.y
def feedforward(self, inp=None):
    # Parse
    if inp is None:
        inp = self.x
    else:
        self.x = inp

    # Add in a new axis and unbroadcast it
    out = T.unbroadcast(inp.dimshuffle(0, 'x', 1, 2, 3), 1)
    # Return
    return out
def __init__(self, merge=True, batch_size=None, *args, **kwargs):
    self.merge = merge
    self.batch_size = batch_size
    if K._BACKEND != "theano":
        raise NotImplementedError("Check the unbroadcast in TensorFlow")
    super(MergeSequences, self).__init__(*args, **kwargs)
def call(self, x, mask=None):
    sh = x.shape
    bs = self.batch_size
    if self.merge:
        sh = (sh[0]*sh[1], ) + tuple(sh[2:])
        return T.reshape(x, sh, ndim=4)
    else:
        sh = (bs, sh[0]/bs, ) + tuple(sh[1:])
        ret = T.reshape(x, sh, ndim=5)
        return T.unbroadcast(ret, 0)  # Works TH and TF
def test_rebroadcast():
    d = numpy.random.rand(10, 10).astype('float32')
    v = theano.tensor.fmatrix()
    up = tensor.unbroadcast(v.sum().dimshuffle('x', 'x'), 0, 1)
    f = theano.function([v], [up], mode=mode_with_gpu)
    f(d)

    topo = f.maker.fgraph.toposort()
    rebrs = [node for node in topo if isinstance(node.op, tensor.Rebroadcast)]
    assert len(rebrs) == 1
    rebr = rebrs[0]

    assert isinstance(rebr.inputs[0].type, GpuArrayType)
    assert isinstance(rebr.outputs[0].type, GpuArrayType)
def test_rebroadcast(self):
    # I need the sum, because the setup expects the output to be a
    # vector
    self.check_rop_lop(tensor.unbroadcast(
        self.x[:4].dimshuffle('x', 0), 0).sum(axis=1),
        (1,))
def test_rebroadcast_rebroadcast(self):
    mode = theano.compile.get_default_mode().including('canonicalize')
    m = T.matrix()
    s = T.addbroadcast(m, 0, 1)
    v = T.unbroadcast(s, 1)
    f = theano.function([m], v, mode=mode)
    f([[76]])
    e = f.maker.fgraph.toposort()
    rebroadcast_nodes = [n for n in e if isinstance(n.op, T.Rebroadcast)]
    assert len(rebroadcast_nodes) == 1
    assert rebroadcast_nodes[0].op.axis == {0: True}
def test_broadcast(self):
    # Test that we can rebroadcast
    data = numpy.random.rand(10, 10).astype('float32')
    output_var = f32sc(name="output", value=data)

    up = tensor.unbroadcast(output_var.sum().dimshuffle('x', 'x'), 0, 1)
    output_func = theano.function(inputs=[], outputs=[], updates=[(output_var, up)])
    output_func()

    up = tensor.patternbroadcast(output_var.sum().dimshuffle('x', 'x'),
                                 output_var.type.broadcastable)
    output_func = theano.function(inputs=[], outputs=[], updates=[(output_var, up)])
    output_func()
def apply(self, state_below, mask_below, init_state=None, context=None):
    if state_below.ndim == 3:
        # e.g. state_below = (n_step 10, batch_size 5, vector_size 30)
        batch_size = state_below.shape[1]
        n_steps = state_below.shape[0]
    else:
        raise NotImplementedError

    if mask_below is None:
        mask_below = T.ones(state_below.shape[:2], dtype='float32')
        # mask_below = T.ones_like(state_below, 'float32')
        # print mask_below

    if self.with_contex:
        if init_state is None:
            init_state = T.tanh(theano.dot(context, self.W_c_init))
        c_z = theano.dot(context, self.W_cz)
        c_r = theano.dot(context, self.W_cr)
        c_h = theano.dot(context, self.W_ch)
        non_sequences = [c_z, c_r, c_h]
        rval, updates = theano.scan(self._step_forward_with_context,
                                    sequences=[state_below, mask_below],
                                    outputs_info=[init_state],
                                    non_sequences=non_sequences,
                                    n_steps=n_steps
                                    )
    else:
        if init_state is None:
            # init_state = T.alloc(numpy.float32(0.), batch_size, self.n_hids)
            # With batch_size == 1 the line above triggers the error:
            # "Inconsistency in the inner graph of scan 'scan_fn': an input and an output are
            #  associated with the same recurrent state and should have the same type but have
            #  type 'TensorType(float32, row)' and 'TensorType(float32, matrix)' respectively."
            # So the line was corrected to the one below, which unbroadcasts axis 0:
            init_state = T.unbroadcast(T.alloc(numpy.float32(0.), batch_size, self.n_hids), 0)
        rval, updates = theano.scan(self._step_forward,
                                    sequences=[state_below, mask_below],
                                    outputs_info=[init_state],
                                    n_steps=n_steps
                                    )
    self.output = rval
    return self.output
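The commented-out line and the quoted error in the example above describe a common scan pitfall: when the initial state's leading dimension can be inferred to be 1, Theano types it as a row, while the step function returns a plain matrix, and scan rejects the mismatch. A minimal sketch of the fix follows, assuming a trivial accumulator step; the names step, batch_size, and n_hids are illustrative and not taken from the project above.

import numpy as np
import theano
import theano.tensor as T

n_hids = 4
batch_size = 1  # with a literal 1, T.alloc infers a broadcastable (row) type

state_below = T.tensor3('state_below')  # (n_steps, batch, n_hids)

def step(x_t, h_tm1):
    # trivial recurrent step: accumulate inputs
    return h_tm1 + x_t

# Without unbroadcast, this init_state would be typed as a row when
# batch_size == 1, and scan would reject it as inconsistent with the
# matrix returned by `step`.
init_state = T.unbroadcast(T.alloc(np.float32(0.), batch_size, n_hids), 0)

hs, _ = theano.scan(step,
                    sequences=[state_below],
                    outputs_info=[init_state])
f = theano.function([state_below], hs)
print(f(np.zeros((3, batch_size, n_hids), dtype='float32')).shape)  # (3, 1, 4)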
def unbroadcast(x, axis):
    return T.unbroadcast(x, axis)
def get_output(self, train=False):
    X = self.get_input(train)
    padded_mask = self.get_padded_shuffled_mask(train, X, pad=self.depth)
    X = X.dimshuffle((1, 0, 2))

    x = T.dot(X, self.W) + self.b

    if self.depth == 1:
        initial = T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1)
    else:
        initial = T.unbroadcast(T.unbroadcast(alloc_zeros_matrix(self.depth, X.shape[1], self.output_dim), 0), 2)

    outputs, updates = theano.scan(
        self._step,
        sequences=[x, dict(
            input=padded_mask,
            taps=[(-i) for i in range(self.depth)]
        )],
        outputs_info=[dict(
            initial=initial,
            taps=[(-i-1) for i in range(self.depth)]
        )],
        non_sequences=self.Us,
        truncate_gradient=self.truncate_gradient,
        go_backwards=self.go_backwards)

    if self.return_sequences:
        return outputs.dimshuffle((1, 0, 2))
    return outputs[-1]
def quick_theano_zero(dim_vec):
    ret = TT.unbroadcast(TT.alloc(numpy_floatX(0), *dim_vec), *range(len(dim_vec)))
    return ret


# TODO If CUDNN is enabled use theano.sandbox.cuda.dnn.dnn_conv to achieve faster speed.
# But the current dnn_conv has some problems! So we use conv2d instead.
def applyBn(numberEpochApplyRolling, inputTrain, inputTest, inputShapeTrain):
    numberOfChannels = inputShapeTrain[1]

    gBn_values = np.ones((numberOfChannels), dtype='float32')
    gBn = theano.shared(value=gBn_values, borrow=True)
    bBn_values = np.zeros((numberOfChannels), dtype='float32')
    bBn = theano.shared(value=bBn_values, borrow=True)

    # For rolling average:
    muArray = theano.shared(np.zeros((numberEpochApplyRolling, numberOfChannels), dtype='float32'), borrow=True)
    varArray = theano.shared(np.ones((numberEpochApplyRolling, numberOfChannels), dtype='float32'), borrow=True)
    sharedNewMu_B = theano.shared(np.zeros((numberOfChannels), dtype='float32'), borrow=True)
    sharedNewVar_B = theano.shared(np.ones((numberOfChannels), dtype='float32'), borrow=True)

    e1 = np.finfo(np.float32).tiny

    mu_B = inputTrain.mean(axis=[0, 2, 3, 4])
    mu_B = T.unbroadcast(mu_B, (0))
    var_B = inputTrain.var(axis=[0, 2, 3, 4])
    var_B = T.unbroadcast(var_B, (0))
    var_B_plusE = var_B + e1

    # --- computing mu and var for inference from rolling average ---
    mu_RollingAverage = muArray.mean(axis=0)
    effectiveSize = inputShapeTrain[0] * inputShapeTrain[2] * inputShapeTrain[3] * inputShapeTrain[4]
    var_RollingAverage = (effectiveSize / (effectiveSize - 1)) * varArray.mean(axis=0)
    var_RollingAverage_plusE = var_RollingAverage + e1

    # training
    normXi_train = (inputTrain - mu_B.dimshuffle('x', 0, 'x', 'x', 'x')) / T.sqrt(var_B_plusE.dimshuffle('x', 0, 'x', 'x', 'x'))
    normYi_train = gBn.dimshuffle('x', 0, 'x', 'x', 'x') * normXi_train + bBn.dimshuffle('x', 0, 'x', 'x', 'x')

    # testing
    normXi_test = (inputTest - mu_RollingAverage.dimshuffle('x', 0, 'x', 'x', 'x')) / T.sqrt(var_RollingAverage_plusE.dimshuffle('x', 0, 'x', 'x', 'x'))
    normYi_test = gBn.dimshuffle('x', 0, 'x', 'x', 'x') * normXi_test + bBn.dimshuffle('x', 0, 'x', 'x', 'x')

    return (normYi_train,
            normYi_test,
            gBn,
            bBn,
            muArray,
            varArray,
            sharedNewMu_B,
            sharedNewVar_B,
            mu_B,
            var_B)


# ----------------- Apply Softmax ---------------#
def __init__(self, layer_param):
    super(LSTMLayer, self).__init__(layer_param)
    assert 3 == self.input.ndim
    # assert ("init_hidden_state" in layer_param or "init_cell_state" in layer_param)
    self.gate_activation = layer_param.get('gate_activation', 'sigmoid')
    self.modular_activation = layer_param.get('modular_activation', 'tanh')
    self.hidden_activation = layer_param.get('hidden_activation', 'tanh')
    self.init_hidden_state = layer_param.get("init_hidden_state",
                                             quick_theano_zero((self.minibatch_size,) + self.dim_out))
    self.init_cell_state = layer_param.get("init_cell_state",
                                           quick_theano_zero((self.minibatch_size,) + self.dim_out))
    self.init_hidden_state = TT.unbroadcast(self.init_hidden_state, *range(self.init_hidden_state.ndim))
    self.init_cell_state = TT.unbroadcast(self.init_cell_state, *range(self.init_cell_state.ndim))
    if 'n_steps' in layer_param:
        self.n_steps = layer_param['n_steps']
    else:
        self.n_steps = layer_param.get('n_steps', self.input.shape[0])
    self.input_mat_size = (self.feature_in, self.feature_out)
    self.transition_mat_size = (self.feature_out, self.feature_out)

    # input to LSTM
    self.W_xi = quick_init_norm(self.rng, self.input_mat_size, self._s("W_xi"), scale=0.1)
    self.W_xf = quick_init_norm(self.rng, self.input_mat_size, self._s("W_xf"), scale=0.1)
    self.W_xo = quick_init_norm(self.rng, self.input_mat_size, self._s("W_xo"), scale=0.1)
    self.W_xc = quick_init_norm(self.rng, self.input_mat_size, self._s("W_xc"), scale=0.1)

    # LSTM to LSTM
    self.W_hi = quick_init_norm(self.rng, self.transition_mat_size, self._s("W_hi"), scale=0.1)
    self.W_hf = quick_init_norm(self.rng, self.transition_mat_size, self._s("W_hf"), scale=0.1)
    self.W_ho = quick_init_norm(self.rng, self.transition_mat_size, self._s("W_ho"), scale=0.1)
    self.W_hc = quick_init_norm(self.rng, self.transition_mat_size, self._s("W_hc"), scale=0.1)

    # bias to LSTM
    self.b_i = quick_zero((self.feature_out, ), self._s("b_i"))
    self.b_f = quick_zero((self.feature_out, ), self._s("b_f"))
    self.b_o = quick_zero((self.feature_out, ), self._s("b_o"))
    self.b_c = quick_zero((self.feature_out, ), self._s("b_c"))

    # collect all parameters
    self.param = [self.W_xi, self.W_hi, self.b_i,
                  self.W_xf, self.W_hf, self.b_f,
                  self.W_xo, self.W_ho, self.b_o,
                  self.W_xc, self.W_hc, self.b_c]
    self.is_recurrent = True
    self.fprop()
def path_probability(self, queryseq_padded, scorematrix, queryseq_mask_padded=None, scorematrix_mask=None,
                     blank_symbol=None):
    """
    Compute p(l|x) using only the forward variable
    :param queryseq_padded: (2L+1, B)
    :param scorematrix: (T, C+1, B)
    :param queryseq_mask_padded: (2L+1, B)
    :param scorematrix_mask: (T, B)
    :param blank_symbol: = C by default
    :return:
    """
    if blank_symbol is None:
        blank_symbol = scorematrix.shape[1] - 1
    if queryseq_mask_padded is None:
        queryseq_mask_padded = tensor.ones_like(queryseq_padded, dtype=floatX)

    pred_y = self._class_batch_to_labeling_batch(queryseq_padded, scorematrix, scorematrix_mask)  # (T, 2L+1, B), reshaped scorematrix
    r2, r3 = self._recurrence_relation(queryseq_padded, queryseq_mask_padded, blank_symbol)  # r2 (2L+1, 2L+1), r3 (2L+1, 2L+1, B)

    def step(p_curr, p_prev, LLForward, countdown, r2, r3, queryseq_mask_padded):
        """
        [DV, 1-14-2016]: A very strange problem was encountered when integrating this CTC implementation into Keras.
        Before this revision, the input parameters (r2, r3, queryseq_mask_padded) were not specified; the step
        function simply referred to the outer-scope variables. That caused the CTC loss to be inaccurate when
        integrated within Keras, even though the same graph compiled as a separate function returned an accurate
        loss. Passing these three values explicitly as inputs made the problem vanish. It took two days to find
        this remedy; it may be a Theano bug.
        :param p_curr: (2L+1, B), one column of scorematrix
        :param p_prev: (B, 2L+1)
        :param LLForward: (B, 1)
        :param countdown: scalar
        :param r2:
        :param r3:
        :param queryseq_mask_padded:
        :return:
        """
        dotproduct = (p_prev + tensor.dot(p_prev, r2) +  # tensor.dot(p_prev, r2) = alpha(t-1, u-1)
                      (p_prev.dimshuffle(1, 'x', 0) * r3).sum(axis=0).T)  # = alpha(t-1, u-2) conditionally
        p_curr = p_curr.T * dotproduct
        if queryseq_mask_padded is not None:
            p_curr *= queryseq_mask_padded.T  # (B, 2L+1) * (B, 2L+1) * (B, 2L+1) = (B, 2L+1)
        start = tensor.max([0, queryseq_padded.shape[0] - 2 * countdown])
        mask = tensor.concatenate([tensor.zeros([queryseq_padded.shape[1], start]),
                                   tensor.ones([queryseq_padded.shape[1], queryseq_padded.shape[0] - start])],
                                  axis=1)
        p_curr *= mask
        c_batch = p_curr.sum(axis=1, keepdims=True)  # (B, 1)
        p_curr /= c_batch
        LLForward += tensor.log(c_batch)
        countdown -= 1
        return p_curr, LLForward, countdown  # (B, 2L+1), (B, 1), scalar

    results, _ = theano.scan(
        step,
        sequences=[pred_y],  # scan only works on the first dimension
        outputs_info=[tensor.eye(queryseq_padded.shape[0])[0] * tensor.ones(queryseq_padded.T.shape),
                      tensor.unbroadcast(tensor.zeros([queryseq_padded.shape[1], 1]), 1),
                      scorematrix.shape[0]],
        non_sequences=[r2, r3, queryseq_mask_padded])
    return results