The following 45 code examples, extracted from open-source Python projects, illustrate how to use theano.tensor.eye().
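Before the project examples, here is a minimal standalone sketch (not taken from any of the projects below) of theano.tensor.eye() itself: it takes the number of rows n, an optional number of columns m, and a diagonal offset k, and the resulting identity matrix is often indexed with an integer vector to build one-hot encodings, as several examples below do.

import numpy as np
import theano
import theano.tensor as T

n = T.iscalar('n')
labels = T.ivector('labels')

# T.eye(n, m=None, k=0, dtype=None) builds an n x m matrix with ones on the
# k-th diagonal; m defaults to n and dtype defaults to theano.config.floatX.
identity = T.eye(n)            # n x n identity matrix
shifted = T.eye(n, m=5, k=1)   # n x 5 matrix with ones on the first superdiagonal

# Indexing the identity matrix by an integer vector yields one-hot rows,
# the trick used by to_one_hot() and the CTC examples below.
one_hot = T.eye(n)[labels]

f = theano.function([n, labels], [identity, shifted, one_hot])
eye3, shift35, onehot = f(3, np.array([0, 2, 1], dtype='int32'))
print(onehot)
# [[ 1.  0.  0.]
#  [ 0.  0.  1.]
#  [ 0.  1.  0.]]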
def op_ortho_loss(s_x_, axes_=(-2, -1), ndim_=None):
    '''
    orthogonal matrix loss, used to regularize parameter to unitary

    Args:
        s_x_: (batch of) matrices
        axes_: tuple of two integers, specify which axes to be for matrix,
            defaults to last two axes
        ndim_: specify args to be (ndim_ x ndim_) matrices
    '''
    if ndim_ is None:
        ax = axes_[0]
        ndim = T.shape(s_x_)[ax]
    else:
        ndim = ndim_

    tpat = list(range(s_x_.ndim))
    bpat = ['x'] * s_x_.ndim
    tpat[axes_[0]], tpat[axes_[1]] = tpat[axes_[1]], tpat[axes_[0]]
    bpat[axes_[0]] = 0
    bpat[axes_[1]] = 1
    s_y = T.dot(s_x_.transpose(*tpat), s_x_)
    return T.sqr(s_y - T.eye(ndim).dimshuffle(*bpat))
def op_covmat(s_x_, l1_normize_=True, eps_=1e-7):
    '''
    Return covariance matrix given a batch of data points

    Args:
        s_x_: batch of row vectors
        l1_normize_: defaults to True; L1-normalize the covariance matrix
            wrt the number of data points
        eps_: adds a small identity matrix I*eps_ to the result; this is
            applied after L1-normalization
    '''
    assert s_x_.ndim == 2
    s_mean = s_x_ - T.mean(s_x_, axis=0, keepdims=True)
    s_shp = T.shape(s_x_)
    s_covmat = T.dot(s_mean.T, s_mean)
    if l1_normize_:
        s_covmat /= s_shp[0]
    return s_covmat + T.eye(s_shp[1]) * eps_
def _ctc_normal(self, predict, labels):
    n = labels.shape[0]

    labels2 = T.concatenate((labels, [self.tpo["CTC_blank"], self.tpo["CTC_blank"]]))
    sec_diag = T.neq(labels2[:-2], labels2[2:]) * \
               T.eq(labels2[1:-1], self.tpo["CTC_blank"])

    recurrence_relation = \
        T.eye(n) + \
        T.eye(n, k=1) + \
        T.eye(n, k=2) * sec_diag.dimshuffle((0, 'x'))

    pred_y = predict[:, labels]

    probabilities, _ = theano.scan(
        lambda curr, accum: curr * T.dot(accum, recurrence_relation),
        sequences=[pred_y],
        outputs_info=[T.eye(n)[0]]
    )

    labels_probab = T.sum(probabilities[-1, -2:])
    return -T.log(labels_probab)
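The CTC examples here and below all build the same banded transition matrix from T.eye. As a worked illustration (a toy NumPy sketch with made-up labels and blank = 0, not part of the project code), the matrix for the padded label sequence [blank, a, blank, b, blank] comes out as follows:

import numpy as np

blank = 0
labels = np.array([blank, 1, blank, 2, blank])   # 'ab' padded with blanks, length 2L+1 = 5
labels2 = np.concatenate([labels, [blank, blank]])
# the skip transition u-2 -> u is legal only when the two labels differ
# and the state in between is a blank
sec_diag = (labels2[:-2] != labels2[2:]) * (labels2[1:-1] == blank)
n = len(labels)
recurrence_relation = (np.eye(n) +                       # stay in state u
                       np.eye(n, k=1) +                  # advance u-1 -> u
                       np.eye(n, k=2) * sec_diag[:, None])  # skip u-2 -> u
print(recurrence_relation.astype(int))
# [[1 1 0 0 0]
#  [0 1 1 1 0]
#  [0 0 1 1 0]
#  [0 0 0 1 1]
#  [0 0 0 0 1]]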
def get_output_for(self, input, init=False, **kwargs):
    if input.ndim > 2:
        # if the input has more than two dimensions, flatten it into a
        # batch of feature vectors.
        input = input.flatten(2)

    activation = T.tensordot(input, self.W, [[1], [0]])
    abs_dif = (T.sum(abs(activation.dimshuffle(0, 1, 2, 'x') - activation.dimshuffle('x', 1, 2, 0)), axis=2)
               + 1e6 * T.eye(input.shape[0]).dimshuffle(0, 'x', 1))

    if init:
        mean_min_abs_dif = 0.5 * T.mean(T.min(abs_dif, axis=2), axis=0)
        abs_dif /= mean_min_abs_dif.dimshuffle('x', 0, 'x')
        self.init_updates = [(self.log_weight_scale,
                              self.log_weight_scale - T.log(mean_min_abs_dif).dimshuffle(0, 'x'))]

    f = T.sum(T.exp(-abs_dif), axis=2)

    if init:
        mf = T.mean(f, axis=0)
        f -= mf.dimshuffle('x', 0)
        self.init_updates.append((self.b, -mf))
    else:
        f += self.b.dimshuffle('x', 0)

    return T.concatenate([input, f], axis=1)
def linear_mmd2_and_hotelling(X, Y, biased=True, reg=0):
    if not biased:
        raise ValueError("linear_mmd2_and_hotelling only works for biased est")
    n = X.shape[0]
    p = X.shape[1]
    Z = X - Y
    Z_bar = Z.mean(axis=0)
    mmd2 = Z_bar.dot(Z_bar)

    Z_cent = Z - Z_bar
    S = Z_cent.T.dot(Z_cent) / (n - 1)
    # z' inv(S) z = z' inv(L L') z = z' inv(L)' inv(L) z = ||inv(L) z||^2
    L = slinalg.cholesky(S + reg * T.eye(p))
    Linv_Z_bar = slinalg.solve_lower_triangular(L, Z_bar)
    lambda_ = n * Linv_Z_bar.dot(Linv_Z_bar)  # happens on the CPU!
    return mmd2, lambda_
def note_to_encoding(self, chosen_note, relative_position, low_bound, high_bound):
    assert chosen_note.ndim == 1
    n_batch = chosen_note.shape[0]

    dont_play_version = T.switch(
        T.shape_padright(T.eq(chosen_note, 0)),
        T.tile(np.array([[1, 0] + [0]*(self.ENCODING_WIDTH-2)], dtype=np.float32), (n_batch, 1)),
        T.tile(np.array([[0, 1] + [0]*(self.ENCODING_WIDTH-2)], dtype=np.float32), (n_batch, 1)))

    rcp = T.tile(np.array([0, 0, 1], dtype=np.float32), (n_batch, 1))
    circle_1 = T.eye(4)[(chosen_note-2) % 4]
    circle_2 = T.eye(3)[(chosen_note-2) % 3]
    octave = T.eye(self.num_octaves)[(chosen_note-2+low_bound-self.octave_start) // 12]

    play_version = T.concatenate([rcp, circle_1, circle_2, octave], 1)

    encoded_form = T.switch(
        T.shape_padright(T.lt(chosen_note, 2)),
        dont_play_version,
        play_version
    )
    return encoded_form
def _recurrence_relation(queryseq_padded, queryseq_mask_padded=None, blank_symbol=None):
    """
    Generate structured matrix r2 & r3 for dynamic programming recurrence
    :param queryseq_padded: (2L+1, B)
    :param queryseq_mask_padded: (2L+1, B)
    :param blank_symbol: = C
    :return: r2 (2L+1, 2L+1), r3 (2L+1, 2L+1, B)
    """
    L2 = queryseq_padded.shape[0]                                        # = 2L+1
    blanks = tensor.zeros((2, queryseq_padded.shape[1])) + blank_symbol  # (2, B)
    ybb = tensor.concatenate((queryseq_padded, blanks), axis=0).T        # (2L+3, B) -> (B, 2L+3)
    sec_diag = tensor.neq(ybb[:, :-2], ybb[:, 2:]) * tensor.eq(ybb[:, 1:-1], blank_symbol)  # (B, 2L+1)
    if queryseq_mask_padded is not None:
        sec_diag *= queryseq_mask_padded.T
    r2 = tensor.eye(L2, k=1)  # upper diagonal matrix (2L+1, 2L+1)
    r3 = tensor.eye(L2, k=2).dimshuffle(0, 1, 'x') * sec_diag.dimshuffle(1, 'x', 0)  # (2L+1, 2L+1, B)
    return r2, r3
def f(self, x, sampling=True, **kwargs):
    x /= np.cast[theano.config.floatX](np.sqrt(self.dim_in))
    indx, indy = self.params[3], self.params[4]
    indx /= np.cast[theano.config.floatX](np.sqrt(self.dim_in))

    if sampling:
        stdx, stdy = self._get_stds()
        noisex, noisey = sample_mult_noise(stdx, indx.shape), sample_mult_noise(stdy, indy.shape)
        indy *= noisey
        indx *= noisex

    Rr, Rc = T.exp(self.params[1]), T.exp(self.params[2])
    U = T.sqr(Rr)
    sigma11 = T.dot(indx * U.dimshuffle('x', 0), indx.T) + eps_ind * T.eye(self.n_inducing)
    sigma22 = T.dot(x * U.dimshuffle('x', 0), x.T)
    sigma12 = T.dot(indx * U.dimshuffle('x', 0), x.T)
    mu_ind = T.dot(indx, self.params[0])
    inv_sigma11 = Tn.matrix_inverse(sigma11)
    mu_x = T.dot(x, self.params[0]) + T.dot(sigma12.T, inv_sigma11).dot(indy - mu_ind)

    if not sampling:
        return mu_x

    sigma_x = Tn.extract_diag(sigma22 - T.dot(sigma12.T, inv_sigma11).dot(sigma12))
    std = T.outer(T.sqrt(sigma_x), Rc)
    out_sample = sample_gauss(mu_x, std)
    return out_sample
def f(self, x, sampling=True, **kwargs):
    x /= np.cast[theano.config.floatX](np.sqrt(self.dim_in))
    indx, indy = self.params[3], self.params[4]
    indx /= np.cast[theano.config.floatX](np.sqrt(self.dim_in))

    if sampling:
        noisex = sample_mult_noise(T.exp(self.params[-2]), indx.shape)
        noisey = sample_mult_noise(T.exp(self.params[-1]), indy.shape)
        indy *= noisey
        indx *= noisex

    Rr, Rc = T.exp(self.params[1]), T.exp(self.params[2])
    U = T.sqr(Rr)
    sigma11 = T.dot(indx * U.dimshuffle('x', 0), indx.T) + eps_ind * T.eye(self.n_inducing)
    sigma22 = T.dot(x * U.dimshuffle('x', 0), x.T)
    sigma12 = T.dot(indx * U.dimshuffle('x', 0), x.T)
    mu_ind = T.dot(indx, self.params[0])
    inv_sigma11 = Tn.matrix_inverse(sigma11)
    mu_x = T.dot(x, self.params[0]) + T.dot(sigma12.T, inv_sigma11).dot(indy - mu_ind)

    if not sampling:
        return mu_x

    sigma_x = Tn.extract_diag(sigma22 - T.dot(sigma12.T, inv_sigma11).dot(sigma12))
    std = T.outer(T.sqrt(sigma_x), Rc)
    out_sample = sample_gauss(mu_x, std)
    return out_sample
def get_output_for(self, input, init=False, **kwargs):
    if input.ndim > 2:
        # if the input has more than two dimensions, flatten it into a
        # batch of feature vectors.
        input = input.flatten(2)

    activation = T.tensordot(input, self.W, [[1], [0]])
    abs_dif = (T.sum(abs(activation.dimshuffle(0, 1, 2, 'x') - activation.dimshuffle('x', 1, 2, 0)), axis=2)
               + 1e6 * T.eye(input.shape[0]).dimshuffle(0, 'x', 1))

    if init:
        mean_min_abs_dif = 0.5 * T.mean(T.min(abs_dif, axis=2), axis=0)
        abs_dif /= mean_min_abs_dif.dimshuffle('x', 0, 'x')
        self.init_updates = [(self.log_weight_scale,
                              self.log_weight_scale - T.log(mean_min_abs_dif).dimshuffle(0, 'x'))]

    f = T.sum(T.exp(-abs_dif), axis=2)

    if init:
        mf = T.mean(f, axis=0)
        f -= mf.dimshuffle('x', 0)
        self.init_updates.append((self.b, -mf))
    else:
        f += self.b.dimshuffle('x', 0)

    return T.concatenate([input, f], axis=1)

# Input Mixture of Gaussian Layer
def op_unitary_loss(s_re_, s_im_, axes_=None, size_=None):
    '''
    unitary matrix loss of real/imag part, used to regularize parameter to unitary

    Args:
        s_re_: real part, square matrix
        s_im_: imag part, square matrix
        size_: specify args to be (size_ x size_) matrices
        axes_: tuple of two integers, specify which axes to be for matrix,
            defaults to last two axes
    '''
    if axes_ is None:
        axes_ = (-2, -1)
    if size_ is None:
        ax = axes_[0]
        size = T.shape(s_re_)[ax]
    else:
        size = size_

    assert s_re_.ndim == s_im_.ndim
    tpat = list(range(s_re_.ndim))
    bpat = ['x'] * s_re_.ndim
    tpat[axes_[0]], tpat[axes_[1]] = tpat[axes_[1]], tpat[axes_[0]]
    bpat[axes_[0]] = 0
    bpat[axes_[1]] = 1

    s_y_re_ = T.dot(s_re_.transpose(*tpat), s_re_) + T.dot(s_im_.transpose(*tpat), s_im_)
    s_tmp = T.dot(s_re_.transpose(*tpat), s_im_)
    s_y_im_ = s_tmp - s_tmp.transpose(*tpat)
    return T.mean(T.sqr(s_y_re_ - T.eye(size).dimshuffle(*bpat)) + T.sqr(s_y_im_))
def get_output_for(self, input, init=False, **kwargs):
    if input.ndim > 2:
        # if the input has more than two dimensions, flatten it into a
        # batch of feature vectors.
        input = input.flatten(2)

    activation = T.tensordot(input, self.W, [[1], [0]])
    abs_dif = (T.sum(abs(activation.dimshuffle(0, 1, 2, 'x') - activation.dimshuffle('x', 1, 2, 0)), axis=2)
               + 1e6 * T.eye(input.shape[0]).dimshuffle(0, 'x', 1))

    if init:
        mean_min_abs_dif = 0.5 * T.mean(T.min(abs_dif, axis=2), axis=0)
        abs_dif /= mean_min_abs_dif.dimshuffle('x', 0, 'x')
        self.init_updates = [(self.log_weight_scale,
                              self.log_weight_scale - T.log(mean_min_abs_dif).dimshuffle(0, 'x'))]

    f = T.sum(T.exp(-abs_dif), axis=2)

    if init:
        mf = T.mean(f, axis=0)
        f -= mf.dimshuffle('x', 0)
        self.init_updates.append((self.b, -mf))
    else:
        f += self.b.dimshuffle('x', 0)

    return T.concatenate([input, f], axis=1)

# Convenience function to define an inception-style block
def _cost_func(self, y):
    # y = T.clip(y, EPSILON, 1.0 - EPSILON)
    # return CrossEntropyCost(y, self.k).get()
    k_onehot = T.eye(y.shape[1])[self.k]
    k_centered = 2.0 * k_onehot - 1.0
    loss = T.mean(T.sqr(T.maximum(0.0, 1.0 - y * k_centered)))
    return loss
def generate_latent_pair(self):
    h0 = T.zeros((self.dim,)).astype(floatX)[None, :]
    h = T.eye(self.dim).astype(floatX)
    return h0, h
def eye(n, m):
    return T.eye(n=n, m=m)
def _recurrence_relation(y, y_mask, blank_symbol):
    """
    Construct a permutation matrix and tensor for computing CTC transitions.

    Parameters
    ----------
    y : matrix (L, B)
        the target label sequences
    y_mask : matrix (L, B)
        indicates which values of y to use
    blank_symbol : integer
        indicates the symbol that signifies a blank label.

    Returns
    -------
    matrix (L, L)
    tensor3 (L, L, B)
    """
    n_y = y.shape[0]
    blanks = tensor.zeros((2, y.shape[1])) + blank_symbol
    ybb = tensor.concatenate((y, blanks), axis=0).T
    sec_diag = (tensor.neq(ybb[:, :-2], ybb[:, 2:]) *
                tensor.eq(ybb[:, 1:-1], blank_symbol) *
                y_mask.T)

    # r1: LxL
    # r2: LxL
    # r3: LxLxB
    eye2 = tensor.eye(n_y + 2)
    r2 = eye2[2:, 1:-1]  # tensor.eye(n_y, k=1)
    r3 = (eye2[2:, :-2].dimshuffle(0, 1, 'x') *
          sec_diag.dimshuffle(1, 'x', 0))
    return r2, r3
def dotcol(x, dim, mat_b):
    return mat_b.dot(T.eye(dim)[:, x])
def fit(self, X, y):
    """
    Fits a Student-t regressor using MCMC.

    Parameters
    ----------
    X: np.ndarray, shape=(nsamples, nfeatures)
        Training instances to fit the GP.
    y: np.ndarray, shape=(nsamples,)
        Corresponding continuous target values to `X`.
    """
    self.X = X
    self.n = self.X.shape[0]
    self.y = y
    self.model = pm.Model()

    with self.model as model:
        l = pm.Uniform('l', 0, 10)

        log_s2_f = pm.Uniform('log_s2_f', lower=-7, upper=5)
        s2_f = pm.Deterministic('sigmaf', tt.exp(log_s2_f))

        log_s2_n = pm.Uniform('log_s2_n', lower=-7, upper=5)
        s2_n = pm.Deterministic('sigman', tt.exp(log_s2_n))

        f_cov = s2_f * covariance_equivalence[type(self.covfunc).__name__](1, l)
        Sigma = f_cov(self.X) + tt.eye(self.n) * s2_n ** 2
        y_obs = pm.MvStudentT('y_obs', nu=self.nu, mu=np.zeros(self.n), Sigma=Sigma, observed=self.y)

    with self.model as model:
        if self.step is not None:
            self.trace = pm.sample(self.niter, step=self.step())[self.burnin:]
        else:
            self.trace = pm.sample(self.niter, init=self.init)[self.burnin:]
def fit(self, X, y):
    """
    Fits a Gaussian Process regressor using MCMC.

    Parameters
    ----------
    X: np.ndarray, shape=(nsamples, nfeatures)
        Training instances to fit the GP.
    y: np.ndarray, shape=(nsamples,)
        Corresponding continuous target values to `X`.
    """
    self.X = X
    self.n = self.X.shape[0]
    self.y = y
    self.model = pm.Model()

    with self.model as model:
        l = pm.Uniform('l', 0, 10)

        log_s2_f = pm.Uniform('log_s2_f', lower=-7, upper=5)
        s2_f = pm.Deterministic('sigmaf', tt.exp(log_s2_f))

        log_s2_n = pm.Uniform('log_s2_n', lower=-7, upper=5)
        s2_n = pm.Deterministic('sigman', tt.exp(log_s2_n))

        f_cov = s2_f * covariance_equivalence[type(self.covfunc).__name__](1, l)
        Sigma = f_cov(self.X) + tt.eye(self.n) * s2_n ** 2
        y_obs = pm.MvNormal('y_obs', mu=np.zeros(self.n), cov=Sigma, observed=self.y)

    with self.model as model:
        if self.step is not None:
            self.trace = pm.sample(self.niter, step=self.step())[self.burnin:]
        else:
            self.trace = pm.sample(self.niter, init=self.init)[self.burnin:]
def l1_regularization_F_I(self):
    return (self.F - T.eye(self.emb_dim)).norm(1)
def l2_regularization_F(self):
    return (self.F - np.eye(self.emb_dim)).norm(2)
def new_attention_step(self, ct, prev_g, mem, q_q, c_vecs):
    cWq = T.dot(T.ones((1, 4), dtype=floatX),
                T.dot(T.dot(ct.T, self.W_b), q_q) * T.eye(n=4, m=4, dtype=floatX))
    cWm = T.dot(T.ones((1, 4), dtype=floatX),
                T.dot(T.dot(ct.T, self.W_b), mem) * T.eye(n=4, m=4, dtype=floatX))
    cWc_vecs = T.dot(T.ones((1, 4), dtype=floatX),
                     T.dot(T.dot(ct.T, self.W_b), c_vecs) * T.eye(n=4, m=4, dtype=floatX))

    z = T.concatenate([ct, mem, q_q, c_vecs,
                       ct * q_q, ct * mem, ct * c_vecs,
                       ct - q_q, ct - mem, ct - c_vecs,
                       cWq, cWm, cWc_vecs], axis=0)

    l_1 = T.dot(self.W_1, z) + self.b_1.dimshuffle(0, 'x')
    l_1 = T.tanh(l_1)
    l_2 = T.dot(self.W_2, l_1) + self.b_2.dimshuffle(0, 'x')
    G = T.nnet.sigmoid(l_2)[0]
    return G
def test_gpueye():
    def check(dtype, N, M_=None):
        # Theano does not accept None as a tensor.
        # So we must use a real value.
        M = M_
        # Currently DebugMode does not support None as inputs even if this is
        # allowed.
        if M is None:
            M = N
        N_symb = T.iscalar()
        M_symb = T.iscalar()
        k_symb = numpy.asarray(0)
        out = T.eye(N_symb, M_symb, k_symb, dtype=dtype)
        f = theano.function([N_symb, M_symb],
                            T.stack(out),
                            mode=mode_with_gpu)
        result = numpy.asarray(f(N, M))
        assert numpy.allclose(result, numpy.eye(N, M_, dtype=dtype))
        assert result.dtype == numpy.dtype(dtype)
        assert any([isinstance(node.op, GpuEye)
                    for node in f.maker.fgraph.toposort()])

    for dtype in ['float32', 'int32', 'float16']:
        yield check, dtype, 3
        # M != N, k = 0
        yield check, dtype, 3, 5
        yield check, dtype, 5, 3
def test_gpueye():
    def check(dtype, N, M_=None, K=0):
        # Theano does not accept None as a tensor.
        # So we must use a real value.
        M = M_
        # Currently DebugMode does not support None as inputs even if this is
        # allowed.
        if M is None:
            M = N
        N_symb = T.iscalar()
        M_symb = T.iscalar()
        k_symb = numpy.asarray(K)
        out = T.eye(N_symb, M_symb, k_symb, dtype=dtype)
        f = theano.function([N_symb, M_symb],
                            B.as_cuda_ndarray_variable(out),
                            mode=mode_with_gpu)
        result = numpy.asarray(f(N, M))
        utt.assert_allclose(result, numpy.eye(N, M_, K, dtype=dtype))
        assert result.dtype == numpy.dtype(dtype)
        if K == 0:
            assert any([isinstance(node.op, B.GpuEye)
                        for node in f.maker.fgraph.toposort()])

    for dtype in ['float32']:
        yield check, dtype, 3
        # M != N, k = 0
        yield check, dtype, 3, 5
        yield check, dtype, 5, 3
        yield check, dtype, 5, 3, 1
def to_one_hot(x, n_y):
    # TODO: Replace this with built-in Theano function in extra_ops
    assert type(n_y) == int
    return T.eye(n_y)[x]
def _symbolic_arrows(self, A):
    """Computes the number of unclosed triangles involving any two nodes.

    (1 - A) A^2 + A (D + D^T - A^2 - 1)
    """
    # Compute and broadcast degree.
    num_nodes = A.shape[0]
    D = T.tile(T.sum(A, axis=1), (num_nodes, 1))
    return (
        (T.eye(num_nodes) - A) * T.dot(A, A) +
        A * (D + D.T - T.dot(A, A) - 2)
    )
def parser_loss(energies, heads, types, masks):
    """
    compute minus log likelihood of parser as parser loss.

    :param energies: Theano 4D tensor
        energies of each edge, in the shape [batch_size, n_steps, n_steps, num_labels],
        where the dummy root is at index 0.
    :param heads: Theano 2D tensor
        heads in the shape [batch_size, n_steps].
    :param types: Theano 2D tensor
        types in the shape [batch_size, n_steps].
    :param masks: Theano 2D tensor
        masks in the shape [batch_size, n_steps].
    :return: Theano 1D tensor
        an expression for minus log likelihood loss.
    """
    input_shape = energies.shape
    batch_size = input_shape[0]
    length = input_shape[1]

    # get the exp of energies, and sum along the label axis.
    # the shape is [batch_size, n, n].
    E = T.exp(energies).sum(axis=3)

    # zero out the elements beyond the length of each sentence.
    if masks is not None:
        masks_shuffled = masks.dimshuffle(0, 1, 'x')
        E = E * masks_shuffled
        masks_shuffled = masks.dimshuffle(0, 'x', 1)
        E = E * masks_shuffled

    # compute the D tensor.
    # the shape is [batch_size, n, n]
    D = E.sum(axis=1)
    D = T.zeros_like(E) + D.dimshuffle(0, 1, 'x')
    # zero out all elements except the diagonal.
    D = D * T.eye(length, length, 0).dimshuffle('x', 0, 1)

    # compute lengths
    lengths = T.cast(masks, dtype='int32').sum(axis=1)

    # compute laplacian matrix
    L = D - E

    # compute partition Z(x)
    partitions, _ = theano.scan(fn=lambda laps, length: nlinalg.logabsdet(laps[1:length, 1:length]),
                                outputs_info=None,
                                sequences=[L, lengths])

    # compute target energies.
    # first create the index matrix
    indices = T.zeros_like(heads) + T.arange(length).dimshuffle('x', 0)
    # compute the loss matrix, shape = [n_steps, batch_size]
    target_energy = energies[T.arange(batch_size), heads.T, indices.T, types.T]
    # shuffle loss to [batch_size, n_steps]
    target_energy = target_energy.dimshuffle(1, 0)
    # remove the first element, [batch, n_steps - 1]
    target_energy = target_energy[:, 1:]
    # sum over n_steps, shape = [batch_size]
    target_energy = target_energy.sum(axis=1)

    return partitions - target_energy  # , E, D, L, partitions, target_energy
def _ctc_log(self, predict, labels):
    def safe_log(x):
        return T.log(T.maximum(x, 1e-20).astype(theano.config.floatX))

    def safe_exp(x):
        return T.exp(T.minimum(x, 1e20).astype(theano.config.floatX))

    def logadd_simple(x, y):
        return x + safe_log(1 + safe_exp(y - x))

    def logadd(x, y, *zs):
        sum = logadd_simple(x, y)
        for z in zs:
            sum = logadd_simple(sum, z)
        return sum

    def logmul(x, y):
        return x + y

    n = labels.shape[0]
    _1000 = T.eye(n)[0]
    prev_mask = 1 - _1000
    prevprev_mask = T.neq(labels[:-2], labels[2:]) * \
                    T.eq(labels[1:-1], self.tpo["CTC_blank"])
    prevprev_mask = T.concatenate(([0, 0], prevprev_mask))
    prev_mask = safe_log(prev_mask)
    prevprev_mask = safe_log(prevprev_mask)
    prev = T.arange(-1, n-1)
    prevprev = T.arange(-2, n-2)
    log_pred_y = T.log(predict[:, labels])

    def step(curr, accum):
        return logmul(curr,
                      logadd(accum,
                             logmul(prev_mask, accum[prev]),
                             logmul(prevprev_mask, accum[prevprev])))

    log_probs, _ = theano.scan(
        step,
        sequences=[log_pred_y],
        outputs_info=[safe_log(_1000)]
    )

    # TO DO: Add -2 if n > 1 and blank at end
    log_labels_probab = log_probs[-1, -1]  # T.sum(log_probs[-1, -2:]) to do
    return -log_labels_probab
def _mb_normal_ctc(self, network_output, labels, mask):
    n_y = labels.shape[1] / 2
    y = labels[:, :n_y]
    y = y.dimshuffle(1, 0)
    y_mask = labels[:, n_y:].astype(theano.config.floatX)

    # y_row = labels.dimshuffle(1,0)
    # n_y = y_row.shape[0] / 2
    # y = y_row[:n_y,:]
    # y_mask = y_row[n_y:,:].astype(theano.config.floatX)

    y_hat = network_output.dimshuffle(0, 2, 1)

    pred_y = y_hat[:, y.astype('int32'), T.arange(self.tpo["batch_size"])]

    ybb = T.concatenate((y, self.blanks), axis=0).T
    sec_diag = (T.neq(ybb[:, :-2], ybb[:, 2:]) *
                T.eq(ybb[:, 1:-1], self.tpo["CTC_blank"]) *
                y_mask)

    # r1: LxL
    # r2: LxL
    # r3: LxLxB
    r2 = T.eye(n_y, k=1)
    r3 = (T.eye(n_y, k=2).dimshuffle(0, 1, 'x') *
          sec_diag.dimshuffle(1, 'x', 0))

    def step(p_curr, p_prev):
        # instead of a dot product, we multiply first
        # and then sum over one dimension.
        # objective: T.dot((p_prev)BxL, LxLxB)
        # solution: Lx1xB * LxLxB --> LxLxB --> (sumover)xLxB
        dotproduct = (p_prev + T.dot(p_prev, r2) +
                      (p_prev.dimshuffle(1, 'x', 0) * r3).sum(axis=0).T)
        return p_curr.T * dotproduct * y_mask  # B x L

    probabilities, _ = theano.scan(
        step,
        sequences=[pred_y],
        outputs_info=[T.eye(n_y)[0] * T.ones([self.tpo["batch_size"], n_y])])

    labels_probab = T.sum(probabilities[-1, :, -2:])
    return T.mean(-T.log(labels_probab))
def _mb_log_ctc(self, network_output, labels, mask):
    # y_row = labels.dimshuffle(1,0)
    n_y = labels.shape[1] / 2
    y = labels[:, :n_y]
    y = y.dimshuffle(1, 0)
    y_mask = labels[:, n_y:].astype(theano.config.floatX)

    y_hat = network_output.dimshuffle(0, 2, 1)

    pred_y = y_hat[:, y.astype('int32'), T.arange(self.tpo["batch_size"])]

    ybb = T.concatenate((y, self.blanks), axis=0).T
    sec_diag = (T.neq(ybb[:, :-2], ybb[:, 2:]) *
                T.eq(ybb[:, 1:-1], self.tpo["CTC_blank"]) *
                y_mask)

    r2 = T.eye(n_y, k=1)
    r3 = (T.eye(n_y, k=2).dimshuffle(0, 1, 'x') *
          sec_diag.dimshuffle(1, 'x', 0))

    def step(log_p_curr, log_p_prev):
        p1 = log_p_prev
        p2 = self.log_dot_matrix(p1, r2)
        p3 = self.log_dot_T(p1, r3)
        p123 = self.log_add(p3, self.log_add(p1, p2))
        return (log_p_curr.T + p123 + self._epslog(y_mask))

    log_probabs, _ = theano.scan(
        step,
        sequences=[self._epslog(pred_y)],
        outputs_info=[self._epslog(T.eye(n_y)[0] *
                                   T.ones([self.tpo["batch_size"], n_y]))])

    labels_probab = T.sum(log_probabs[-1, :, -2:])
    return T.mean(-labels_probab)
def get_output(self):
    if self.dropout_rate != 0:
        seed = np.random.randint(10e6)
        rng = RandomStreams(seed=seed)
        retain_prob = 1. - self.dropout_rate
        self.input *= rng.binomial(self.input.shape, p=retain_prob, dtype=self.input.dtype)
        self.input /= retain_prob

    conv_out = conv2d(self.input, self.Cnn_W)  # (batch size, output channels, output rows, output columns)
    conv_out = conv_out + self.Cnn_B.dimshuffle('x', 0, 'x', 'x')

    # out_put_shape = self.get_output_shape()
    # r_matrix_s = np.eye(out_put_shape[3], out_put_shape[3], 0)
    # r_matrix_x = np.eye(out_put_shape[3], out_put_shape[3], -1)
    # test = [[r_matrix_s for i in range(self.input_shape[1])] for j in range(self.input_shape[0])]
    # print test
    # r_matrix_s = theano.shared(np.array(r_matrix_s).astype(np.float32))
    #
    # r_matrix_x = theano.shared(np.array(r_matrix_x).astype(np.float32))
    #
    # r_matrix = r_matrix_s*self.Rnn_W_s.dimshuffle(0, 'x', 'x') + \
    #            r_matrix_x*(1-self.Rnn_W_s).dimshuffle(0, 'x', 'x')
    # conv_out = conv_out.dimshuffle(1, 0, 2, 3)
    # def step(con, r_m, r_b):
    #     return T.dot(con, r_m) + r_b
    # conv_out, _ = theano.scan(step, sequences=[conv_out, r_matrix, self.Rnn_W_b])
    # conv_out = conv_out.dimshuffle(1, 0, 2, 3)

    # R_conv_out = T.concatenate([T.zeros_like(conv_out[:, :, :, :1]), conv_out], axis=3)
    # R_conv_out = R_conv_out[:, :, :, :conv_out.shape[3]]
    # RNN_Ws = self.Rnn_W_s.dimshuffle('x', 0, 'x', 'x')
    # RNN_b = self.Rnn_W_b
    # R_conv_out = R_conv_out*RNN_Ws + conv_out*(1-RNN_Ws) + RNN_b

    # conv_out = conv_out.dimshuffle(1, 0, 2, 3)
    #
    # def Rnn_add(channel, RNN_b, RNN_Ws, RNN_Wx):
    #     RNN_channel = T.concatenate([T.zeros_like(channel[:, :, :1]), channel], axis=2)
    #     RNN_channel = RNN_channel[:, :, :channel.shape[2]]
    #     res = RNN_channel*RNN_Ws + channel*RNN_Wx + RNN_b
    #     return res
    # self.Rnn_W_s = T.abs_(self.Rnn_W_s)
    # R_conv_out, _ = theano.scan(Rnn_add, sequences=[conv_out, self.Rnn_W_b, self.Rnn_W_s, 1 - self.Rnn_W_s])
    # R_conv_out = R_conv_out.dimshuffle(1, 0, 2, 3)

    # output = self.activition(R_conv_out)
    # return self.input
    return self.activition(conv_out)
    # return output
def CTC_LOSS(self):
    outpts = self.output
    inpts = self.Y

    def each_loss(outpt, inpt):
        # label sequence uses 26 as the blank symbol
        blank = 26
        y_nblank = T.neq(inpt, blank)
        n = T.dot(y_nblank, y_nblank)  # number of non-blank labels
        N = 2 * n + 1  # length of the blank-padded label sequence
        labels = inpt[:N]
        labels2 = T.concatenate((labels, [blank, blank]))
        sec_diag = T.neq(labels2[:-2], labels2[2:]) * T.eq(labels2[1:-1], blank)
        recurrence_relation = \
            T.eye(N) + \
            T.eye(N, k=1) + \
            T.eye(N, k=2) * sec_diag.dimshuffle((0, 'x'))

        pred_y = outpt[:, labels]

        fwd_pbblts, _ = theano.scan(
            lambda curr, accum: T.switch(T.eq(curr*T.dot(accum, recurrence_relation), 0.0),
                                         T.dot(accum, recurrence_relation),
                                         curr*T.dot(accum, recurrence_relation)),
            sequences=[pred_y],
            outputs_info=[T.eye(N)[0]]
        )
        # return fwd_pbblts
        # liklihood = fwd_pbblts[0, 0]
        liklihood = fwd_pbblts[-1, -1] + fwd_pbblts[-1, -2]
        # liklihood = T.switch(T.lt(liklihood, 1e-35), 1e-35, liklihood)
        # loss = -T.log(T.cast(liklihood, "float32"))
        # loss = 10 * (liklihood - 1) * (liklihood - 100)
        loss = (T.le(liklihood, 1.0)*(10*(liklihood-1)*(liklihood-100))) + \
               (T.gt(liklihood, 1.0)*(-T.log(T.cast(liklihood, "float32"))))
        return loss
        # return pred_y

    ctc_losss, _ = theano.scan(each_loss,
                               sequences=[outpts, inpts])
    self.ctc_loss = theano.function([self.X, self.Y], ctc_losss)
    return ctc_losss
def __init__(self, input, feat, emb_dim, batch_size, init_mean, init_range, mean_length, feat_dim=0):
    # Matrices for predicting score
    mean_length = float(mean_length)
    self.emb_dim = emb_dim
    self.M = theano.shared(np.eye(emb_dim).astype(theano.config.floatX), borrow=True)
    self.N = theano.shared(np.eye(emb_dim).astype(theano.config.floatX), borrow=True)

    # Fair projection matrix
    self.F = theano.shared(np.eye(emb_dim).astype(theano.config.floatX), borrow=True)

    # Parameters for auxiliary features
    self.f = theano.shared(np.zeros((feat_dim,)).astype(theano.config.floatX), borrow=True)

    # Set embeddings by slicing tensor
    self.emb_context = input[:, 0, :]
    self.emb_true_response = input[:, 1, :]
    self.emb_response = input[:, 2, :]
    self.feat = feat
    self.x = input

    # Projects embeddings into 'fair' space
    self.emb_response_fair = T.dot(self.emb_response, self.F)

    # Compute score predictions
    self.pred1 = T.sum(self.emb_context * T.dot(self.emb_response_fair, self.M), axis=1)
    self.pred2 = T.sum(self.emb_true_response * T.dot(self.emb_response_fair, self.N), axis=1)
    self.pred3 = T.dot(self.feat, self.f)
    self.pred = self.pred1 + self.pred2 + self.pred3
    # self.pred = T.sum(T.dot(self.emb_response, self.f), axis=1) + 0*T.sum(self.feat)  # If only using response
    self.output = 2.5 + 5 * (self.pred - init_mean) / init_range  # to re-scale dot product values to [0,5] range

    # Feed-forward neural net for predicting length
    # n_hid = emb_dim
    # self.q = theano.shared(np.zeros((emb_dim, n_hid)).astype(theano.config.floatX), borrow=True)
    # self.q2 = theano.shared(np.zeros((n_hid,)).astype(theano.config.floatX), borrow=True)
    # self.b = theano.shared(np.zeros((n_hid,)))
    # self.b2 = theano.shared(1. * mean_length)
    # l1 = T.dot(self.emb_response_fair, self.q) + self.b
    # l1 = T.nnet.relu(l1)
    # l2 = T.dot(l1, self.q2) + self.b2 * np.ones((batch_size,))
    # self.nuis = l2

    # Using Lasagne
    # net = lasagne.layers.InputLayer((None, emb_dim), self.emb_response_fair.T)
    # net = lasagne.layers.DenseLayer(net, num_units=n_hid)
    # net = lasagne.layers.DenseLayer(net, num_units=1, b=lasagne.init.Constant(mean_length),
    #                                 nonlinearity=lasagne.nonlinearities.linear)
    # self.nuis = lasagne.layers.get_output(net)
    # self.net = net

    # Compute length with linear regression
    self.q = theano.shared(np.zeros((emb_dim,)).astype(theano.config.floatX), borrow=True)
    self.b = theano.shared(1. * mean_length)
    self.nuis = T.dot(self.emb_response_fair, self.q) + self.b * np.ones((batch_size,))
def em_ctc_cost(e_pred, pred, pred_len, token, blank):
    '''
    ctc_cost of only one sentence
    :param e_pred: (T, nb, voca_size + 1)
    :param pred: (T, nb, voca_size + 1) (4,1,3)
    :param pred_len: (nb,) pred_len of prediction (1)
    :param token: (nb, U) -1 for NIL (1,2)
    :param blank: (1)
    :return: ctc_cost
    '''
    nb, U = token.shape[0], token.shape[1]
    token_len = T.sum(T.neq(token, -1), axis=-1)

    # token_with_blank
    token = token[:, :, None]  # (nb, U, 1)
    token_with_blank = T.concatenate((T.ones_like(token, dtype='int32')*blank, token), axis=2).reshape((nb, 2*U))
    token_with_blank = T.concatenate((token_with_blank, T.ones((nb, 1), dtype='int32')*blank), axis=1)  # (nb, 2*U+1)
    length = token_with_blank.shape[1]

    # only use these predictions
    pred = pred[:, T.arange(nb, dtype='int32')[:, None], token_with_blank]      # (T, nb, 2U+1)
    e_pred = e_pred[:, T.arange(nb, dtype='int32')[:, None], token_with_blank]  # (T, nb, 2U+1)

    # recurrence relation
    sec_diag = T.concatenate((T.zeros((nb, 2), dtype=intX),
                              T.neq(token_with_blank[:, :-2], token_with_blank[:, 2:])),
                             axis=1) * T.neq(token_with_blank, blank)  # (nb, 2U+1)
    recurrence_relation = T.tile((T.eye(length) + T.eye(length, k=1)), (nb, 1, 1)) + \
                          T.tile(T.eye(length, k=2), (nb, 1, 1)) * sec_diag[:, None, :]  # (nb, 2U+1, 2U+1)
    recurrence_relation = recurrence_relation.astype(floatX)

    # alpha for estimate
    alpha = T.zeros_like(token_with_blank, dtype=floatX)
    alpha = T.set_subtensor(alpha[:, :2], e_pred[0, :, :2])  # (nb, 2U+1), p

    # beta for maximize
    beta = T.zeros_like(token_with_blank, dtype=floatX)
    beta = T.set_subtensor(beta[:, :2], e_pred[0, :, :2]*log_safe(pred[0, :, :2]))  # (nb, 2U+1), e_p * log(p)

    # dynamic programming
    # (T, nb, 2U+1)
    (probability_alpha, probability_beta), _ = theano.scan(compute_one_step,
                                                           sequences=[e_pred[1:], pred[1:]],
                                                           outputs_info=[alpha, beta],
                                                           non_sequences=[recurrence_relation])

    # estimate prob
    labels_e_2 = probability_alpha[pred_len - 2, T.arange(nb, dtype='int32'), 2 * token_len - 1]
    labels_e_1 = probability_alpha[pred_len - 2, T.arange(nb, dtype='int32'), 2 * token_len]
    labels_e_prob = labels_e_2 + labels_e_1

    # maximize prob
    labels_m_2 = probability_beta[pred_len - 2, T.arange(nb, dtype='int32'), 2 * token_len - 1]
    labels_m_1 = probability_beta[pred_len - 2, T.arange(nb, dtype='int32'), 2 * token_len]
    labels_m_prob = labels_m_2 + labels_m_1

    cost = -divide_safe(labels_m_prob, labels_e_prob)
    return cost
def ctc_cost(pred, pred_len, token, blank):
    '''
    ctc_cost of multi sentences
    :param pred: (T, nb, voca_size+1) (4,1,3)
    :param pred_len: (nb,) pred_len of prediction (1)
    :param token: (nb, U) -1 for NIL (1,2)
    :param blank: (1)
    :return: ctc_cost
    '''
    eps = theano.shared(np.float32(1e-35))
    nb, U = token.shape[0], token.shape[1]
    token_len = T.sum(T.neq(token, -1), axis=-1)

    # token_with_blank
    token = token[:, :, None]  # (nb, U, 1)
    token_with_blank = T.concatenate((T.ones_like(token, dtype=intX)*blank, token), axis=2).reshape((nb, 2*U))
    token_with_blank = T.concatenate((token_with_blank, T.ones((nb, 1), dtype=intX)*blank), axis=1)  # (nb, 2*U+1)
    length = token_with_blank.shape[1]

    # only use these predictions
    pred = pred[:, T.tile(T.arange(nb), (length, 1)).T, token_with_blank]  # (T, nb, 2U+1)

    # recurrence relation
    sec_diag = T.concatenate((T.zeros((nb, 2), dtype=intX),
                              T.neq(token_with_blank[:, :-2], token_with_blank[:, 2:])),
                             axis=1) * T.neq(token_with_blank, blank)  # (nb, 2U+1)
    recurrence_relation = T.tile((T.eye(length) + T.eye(length, k=1)), (nb, 1, 1)) + \
                          T.tile(T.eye(length, k=2), (nb, 1, 1)) * sec_diag[:, None, :]  # (nb, 2U+1, 2U+1)
    recurrence_relation = recurrence_relation.astype(floatX)

    # alpha
    alpha = T.zeros_like(token_with_blank, dtype=floatX)
    alpha = T.set_subtensor(alpha[:, :2], pred[0, :, :2])  # (nb, 2U+1)

    # dynamic programming
    # (T, nb, 2U+1)
    probability, _ = theano.scan(lambda curr, accum: T.batched_dot(accum, recurrence_relation) * curr,
                                 sequences=[pred[1:]],
                                 outputs_info=[alpha])

    labels_2 = probability[pred_len-2, T.arange(nb), 2*token_len-1]
    labels_1 = probability[pred_len-2, T.arange(nb), 2*token_len]
    labels_prob = labels_2 + labels_1

    cost = -T.log(labels_prob + eps)
    return cost
def path_probability(self, queryseq_padded, scorematrix, queryseq_mask_padded=None, scorematrix_mask=None,
                     blank_symbol=None):
    """
    Compute p(l|x) using only the forward variable
    :param queryseq_padded: (2L+1, B)
    :param scorematrix: (T, C+1, B)
    :param queryseq_mask_padded: (2L+1, B)
    :param scorematrix_mask: (T, B)
    :param blank_symbol: = C by default
    :return:
    """
    if blank_symbol is None:
        blank_symbol = scorematrix.shape[1] - 1
    if queryseq_mask_padded is None:
        queryseq_mask_padded = tensor.ones_like(queryseq_padded, dtype=floatX)

    pred_y = self._class_batch_to_labeling_batch(queryseq_padded, scorematrix, scorematrix_mask)  # (T, 2L+1, B), reshaped scorematrix
    r2, r3 = self._recurrence_relation(queryseq_padded, queryseq_mask_padded, blank_symbol)  # r2 (2L+1, 2L+1), r3 (2L+1, 2L+1, B)

    def step(p_curr, p_prev, LLForward, countdown, r2, r3, queryseq_mask_padded):
        """
        [DV, 1-14-2016]: A very weird problem was encountered when integrating this CTC implementation into
        Keras. Before this revision the parameters (r2, r3, queryseq_mask_padded) were not passed as inputs;
        the step function just referred to the outer-scope variables. That caused the CTC integrated within
        Keras to produce an inaccurate loss value, even though the same code compiled as a separate function
        returned an accurate CTC loss. With these 3 parameters added as inputs, the problem vanished.
        It took me two days to find this remedy; I suspect it is a bug in Theano.
        :param p_curr: (2L+1, B), one column of scorematrix
        :param p_prev: (B, 2L+1)
        :param LLForward: (B, 1)
        :param countdown: scalar
        :param r2:
        :param r3:
        :param queryseq_mask_padded:
        :return:
        """
        dotproduct = (p_prev + tensor.dot(p_prev, r2) +  # tensor.dot(p_prev, r2) = alpha(t-1, u-1)
                      (p_prev.dimshuffle(1, 'x', 0) * r3).sum(axis=0).T)  # = alpha(t-1, u-2) conditionally
        p_curr = p_curr.T * dotproduct
        if queryseq_mask_padded is not None:
            p_curr *= queryseq_mask_padded.T  # (B, 2L+1) * (B, 2L+1) * (B, 2L+1) = (B, 2L+1)
        start = tensor.max([0, queryseq_padded.shape[0] - 2 * countdown])
        mask = tensor.concatenate([tensor.zeros([queryseq_padded.shape[1], start]),
                                   tensor.ones([queryseq_padded.shape[1], queryseq_padded.shape[0] - start])],
                                  axis=1)
        p_curr *= mask
        c_batch = p_curr.sum(axis=1, keepdims=True)  # (B, 1)
        p_curr /= c_batch
        LLForward += tensor.log(c_batch)
        countdown -= 1
        return p_curr, LLForward, countdown  # (B, 2L+1), (B, 1), scalar

    results, _ = theano.scan(
        step,
        sequences=[pred_y],  # scan only works on the first dimension
        outputs_info=[tensor.eye(queryseq_padded.shape[0])[0] * tensor.ones(queryseq_padded.T.shape),
                      tensor.unbroadcast(tensor.zeros([queryseq_padded.shape[1], 1]), 1),
                      scorematrix.shape[0]],
        non_sequences=[r2, r3, queryseq_mask_padded])
    return results