The following 38 code examples, extracted from open-source Python projects, illustrate how to use theano.tensor.tensordot().
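Before the project examples, here is a minimal, self-contained sketch (not drawn from any of the projects below; the shapes are chosen purely for illustration) of the pattern most of them use: contracting the last axis of a 3-D input against the first axis of a weight matrix.

import numpy as np
import theano
import theano.tensor as T

floatX = theano.config.floatX

# x has shape (batch, length, in_dim); W has shape (in_dim, out_dim).
x = T.tensor3('x')
W = T.matrix('W')

# Contract axis 2 of x with axis 0 of W -> result shape (batch, length, out_dim).
y = T.tensordot(x, W, axes=[[2], [0]])

f = theano.function([x, W], y)
out = f(np.ones((4, 5, 3), dtype=floatX), np.ones((3, 7), dtype=floatX))
print(out.shape)  # (4, 5, 7)
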
def get_output(self, train=False):
    input = self.get_input(train)
    proj_input = self.activation(T.tensordot(input, self.att_proj, axes=(3, 0)))
    if self.context == 'word':
        att_scores = T.tensordot(proj_input, self.att_scorer, axes=(3, 0))
    elif self.context == 'clause':
        def step(a_t, h_tm1, W_in, W, sc):
            h_t = T.tanh(T.tensordot(a_t, W_in, axes=(2, 0)) + T.tensordot(h_tm1, W, axes=(2, 0)))
            s_t = T.tensordot(h_t, sc, axes=(2, 0))
            return h_t, s_t
        [_, scores], _ = theano.scan(step,
                                     sequences=[proj_input.dimshuffle(2, 0, 1, 3)],
                                     outputs_info=[T.zeros((proj_input.shape[0], self.td1, self.rec_hid_dim)), None],
                                     non_sequences=[self.rec_in_weights, self.rec_hid_weights, self.att_scorer])
        att_scores = scores.dimshuffle(1, 2, 0)
    elif self.context == 'para':
        att_scores = T.tensordot(proj_input, self.att_scorer, axes=(3, 2)).sum(axis=(1, 2))

    # Nested scans. For shame!
    def get_sample_att(sample_input, sample_att):
        sample_att_inp, _ = theano.scan(fn=lambda s_att_i, s_input_i: T.dot(s_att_i, s_input_i),
                                        sequences=[T.nnet.softmax(sample_att), sample_input])
        return sample_att_inp

    att_input, _ = theano.scan(fn=get_sample_att, sequences=[input, att_scores])
    return att_input

def get_output_for(self, input, init=False, **kwargs):
    if input.ndim > 2:
        # if the input has more than two dimensions, flatten it into a
        # batch of feature vectors.
        input = input.flatten(2)
    activation = T.tensordot(input, self.W, [[1], [0]])
    abs_dif = (T.sum(abs(activation.dimshuffle(0, 1, 2, 'x') - activation.dimshuffle('x', 1, 2, 0)), axis=2)
               + 1e6 * T.eye(input.shape[0]).dimshuffle(0, 'x', 1))
    if init:
        mean_min_abs_dif = 0.5 * T.mean(T.min(abs_dif, axis=2), axis=0)
        abs_dif /= mean_min_abs_dif.dimshuffle('x', 0, 'x')
        self.init_updates = [(self.log_weight_scale,
                              self.log_weight_scale - T.log(mean_min_abs_dif).dimshuffle(0, 'x'))]
    f = T.sum(T.exp(-abs_dif), axis=2)
    if init:
        mf = T.mean(f, axis=0)
        f -= mf.dimshuffle('x', 0)
        self.init_updates.append((self.b, -mf))
    else:
        f += self.b.dimshuffle('x', 0)
    return T.concatenate([input, f], axis=1)

def get_output_for(self, input, **kwargs):
    # cf * bc01... = fb01...
    out_r = T.tensordot(self.W, input, axes=[[0], [1]])
    # input dims to broadcast over
    remaining_dims = range(2, input.ndim)
    # bf01...
    out = out_r.dimshuffle(1, 0, *remaining_dims)
    if self.b is None:
        activation = out
    else:
        if self.untie_biases:
            # no broadcast
            remaining_dims_biases = range(1, input.ndim - 1)
        else:
            remaining_dims_biases = ['x'] * (input.ndim - 2)  # broadcast
        b_shuffled = self.b.dimshuffle('x', 0, *remaining_dims_biases)
        activation = out + b_shuffled
    return self.nonlinearity(activation)

def create_corr_func():
    import numpy as np
    Xa, Xb = T.tensor4('Xa'), T.tensor4('Xb')

    def correlation(A, B):
        Ap, Bp = A.reshape((-1, 15 * 15)), B.reshape((-1, 15 * 15))
        C = T.tensordot(Ap.T, Bp, axes=1).reshape((-1, 15, 15))
        return C

    result, updates = theano.scan(fn=correlation,
                                  outputs_info=None,
                                  sequences=[Xa, Xb],
                                  non_sequences=None)
    corr_func = theano.function(
        inputs=[Xa, Xb],
        outputs=result,
    )
    X = np.random.random((32, 128, 15, 15)).astype(np.float32)
    Y = np.random.random(X.shape).astype(np.float32)
    output = corr_func(X, Y)
    print output.shape

def call(self, x, mask=None):
    ax = 1 if self.is_q else 2

    def _step(v1, v2):
        cosine_score = T.tensordot(v1 / T.sqrt(T.sum(T.sqr(v1), axis=2, keepdims=True) + 1e-6),
                                   (v2) / T.sqrt(T.sum(T.sqr(v2), axis=ax, keepdims=True) + 1e-6),
                                   [[2], [ax]])
        return cosine_score

    l_s = x[0]  # n_b x n_s x n_w_s x D
    l_a = x[1]  # n_b x 4 x n_w_qa x D
    # w_qa = self.layers[2].get_output(train)  # n_b x 4 x n_w_qa x 1
    # w_qa = T.addbroadcast(w_qa, len(self.layers[2].output_shape) - 1)

    # get cosine similarity for ALL word pairs
    output, _ = theano.scan(_step, sequences=[l_s, l_a], outputs_info=None)
    if not self.is_q:
        output = output.dimshuffle(0, 1, 3, 2, 4)  # n_b x n_s x 4 x n_w_s x n_w_qa
    return output

def get_output_for(self, input, init=False, **kwargs):
    if input.ndim > 2:
        # if the input has more than two dimensions, flatten it into a
        # batch of feature vectors.
        input = input.flatten(2)
    activation = T.tensordot(input, self.W, [[1], [0]])
    abs_dif = (T.sum(abs(activation.dimshuffle(0, 1, 2, 'x') - activation.dimshuffle('x', 1, 2, 0)), axis=2)
               + 1e6 * T.eye(input.shape[0]).dimshuffle(0, 'x', 1))
    if init:
        mean_min_abs_dif = 0.5 * T.mean(T.min(abs_dif, axis=2), axis=0)
        abs_dif /= mean_min_abs_dif.dimshuffle('x', 0, 'x')
        self.init_updates = [(self.log_weight_scale,
                              self.log_weight_scale - T.log(mean_min_abs_dif).dimshuffle(0, 'x'))]
    f = T.sum(T.exp(-abs_dif), axis=2)
    if init:
        mf = T.mean(f, axis=0)
        f -= mf.dimshuffle('x', 0)
        self.init_updates.append((self.b, -mf))
    else:
        f += self.b.dimshuffle('x', 0)
    return T.concatenate([input, f], axis=1)

# Input Mixture of Gaussian Layer

def __call__(self, X, w_temp, m_temp):
    # input dimensions
    # X:      (nb_samples, input_dim)
    # w_temp: (nb_samples, memory_dim)
    # m_temp: (nb_samples, memory_dim, memory_width) ::tensor_memory
    key = dot(X, self.W_key, self.b_key)     # (nb_samples, memory_width)
    lock = dot(m_temp, self.W_lock)          # (nb_samples, memory_dim, memory_width)
    shift = self.softmax(
        dot(X, self.W_shift, self.b_shift))  # (nb_samples, shift_width)
    beta = self.softplus(dot(X, self.W_beta, self.b_beta))[:, None]  # (nb_samples, x)
    gamma = self.softplus(dot(X, self.W_gama, self.b_gama)) + 1.     # (nb_samples,)
    gamma = gamma[:, None]                   # (nb_samples, x)
    g = self.sigmoid(dot(X, self.W_g, self.b_g))[:, None]            # (nb_samples, x)

    signal = [key, shift, beta, gamma, g]

    energy = T.sum(key[:, None, :] * lock, axis=2)
    # energy = T.tensordot(key[:, None, :] + lock, self.v, [2, 0])
    w_c = self.softmax(beta * energy)
    # w_c = self.softmax(
    #     beta * cosine_sim2d(key, m_temp))  # (nb_samples, memory_dim) //content-based addressing
    w_g = g * w_c + (1 - g) * w_temp                      # (nb_samples, memory_dim) //history interpolation
    w_s = shift_convolve2d(w_g, shift, self.shift_conv)   # (nb_samples, memory_dim) //convolutional shift
    w_p = w_s ** gamma                                    # (nb_samples, memory_dim) //sharpening
    w_t = w_p / T.sum(w_p, axis=1)[:, None]               # (nb_samples, memory_dim)
    return w_t

def get_output_for(self, input, init=False, **kwargs):
    if input.ndim > 2:
        # if the input has more than two dimensions, flatten it into a
        # batch of feature vectors.
        input = input.flatten(2)
    activation = T.tensordot(input, self.W, [[1], [0]])
    abs_dif = (T.sum(abs(activation.dimshuffle(0, 1, 2, 'x') - activation.dimshuffle('x', 1, 2, 0)), axis=2)
               + 1e6 * T.eye(input.shape[0]).dimshuffle(0, 'x', 1))
    if init:
        mean_min_abs_dif = 0.5 * T.mean(T.min(abs_dif, axis=2), axis=0)
        abs_dif /= mean_min_abs_dif.dimshuffle('x', 0, 'x')
        self.init_updates = [(self.log_weight_scale,
                              self.log_weight_scale - T.log(mean_min_abs_dif).dimshuffle(0, 'x'))]
    f = T.sum(T.exp(-abs_dif), axis=2)
    if init:
        mf = T.mean(f, axis=0)
        f -= mf.dimshuffle('x', 0)
        self.init_updates.append((self.b, -mf))
    else:
        f += self.b.dimshuffle('x', 0)
    return T.concatenate([input, f], axis=1)

# Convenience function to define an inception-style block

def get_output_for(self, inputs, **kwargs):
    """
    Compute this layer's output function given a symbolic input variable.

    Parameters
    ----------
    :param inputs: list of theano.TensorType
        `inputs[0]` should always be the symbolic input variable. When
        this layer has a mask input (i.e. was instantiated with
        `mask_input != None`, indicating that the lengths of sequences in
        each batch vary), `inputs` should have length 2, where `inputs[1]`
        is the `mask`. The `mask` should be supplied as a Theano variable
        denoting whether each time step in each sequence in the batch is
        part of the sequence or not. `mask` should be a matrix of shape
        ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when
        ``j <= (length of sequence i)`` and ``mask[i, j] = 0`` when
        ``j > (length of sequence i)``.

    :return: theano.TensorType
        Symbolic output variable.
    """
    input = inputs[0]
    mask = None
    if self.mask_incoming_index > 0:
        mask = inputs[self.mask_incoming_index]

    # compute out by tensor dot ([batch, length, input] * [input, num_label, num_label])
    # the shape of out should be [batch, length, num_label, num_label]
    out = T.tensordot(input, self.W, axes=[[2], [0]])

    if self.b is not None:
        b_shuffled = self.b.dimshuffle('x', 'x', 0, 1)
        out = out + b_shuffled

    if mask is not None:
        mask_shuffled = mask.dimshuffle(0, 1, 'x', 'x')
        out = out * mask_shuffled

    return out

def __init__(self, a_n_x, a_n_y):
    """Class constructor.

    Args:
        a_n_x (int): number of underlying classifiers
        a_n_y (int): number of classes to predict

    """
    self.n_x = a_n_x
    self.n_y = a_n_y
    # define the network
    # input matrix
    self.x = TT.dmatrix(name="x")
    # mapping from input to output vector
    self.X2Y = self._init_X2Y()
    self.y_bias = theano.shared(value=HE_UNIFORM((1, self.n_y)), name="y_bias")
    # prediction vector
    self.y_pred = TT.nnet.softmax(
        TT.tensordot(self.x, self.X2Y, ((1, 0), (2, 1))) + self.y_bias)
    # predicted label
    self.y_lbl = TT.argmax(self.y_pred, axis=1)[0]
    self._predict = theano.function([self.x], [self.y_lbl, self.y_pred],
                                    name="predict")
    # define trainable parameters
    self._params = [self.X2Y, self.y_bias]

def get_output_for(self, input, **kwargs):
    res = [(input ** (- i * 2 - 2)).dimshuffle(0, 'x', 1, 2)
           for i in range(self.projection_level)]
    res = T.concatenate(res, axis=1)
    # return T.tensordot(res, self.W, [[1], [0]]).dimshuffle(0, 'x', 1, 2)
    return res

def sym_mask_logdensity_estimator_intermediate(self, x, mask):
    non_linearity_name = self.parameters["nonlinearity"].get_name()
    assert(non_linearity_name == "sigmoid" or non_linearity_name == "RLU")
    x = x.T  # BxD
    mask = mask.T  # BxD
    output_mask = constantX(1) - mask  # BxD
    D = constantX(self.n_visible)
    d = mask.sum(1)  # d is the 1-based index of the dimension whose value to infer (not the size of the context)
    masked_input = x * mask  # BxD
    h = self.nonlinearity(T.dot(masked_input, self.W1) + T.dot(mask, self.Wflags) + self.b1)  # BxH
    for l in xrange(self.n_layers - 1):
        h = self.nonlinearity(T.dot(h, self.Ws[l]) + self.bs[l])  # BxH
    z_alpha = T.tensordot(h, self.V_alpha, [[1], [1]]) + T.shape_padleft(self.b_alpha)
    z_mu = T.tensordot(h, self.V_mu, [[1], [1]]) + T.shape_padleft(self.b_mu)
    z_sigma = T.tensordot(h, self.V_sigma, [[1], [1]]) + T.shape_padleft(self.b_sigma)
    temp = T.exp(z_alpha)  # + 1e-6
    # temp += T.shape_padright(temp.sum(2)/1e-3)
    Alpha = temp / T.shape_padright(temp.sum(2))  # BxDxC
    Mu = z_mu  # BxDxC
    Sigma = T.exp(z_sigma)  # + 1e-6 #BxDxC

    # Alpha = Alpha * T.shape_padright(output_mask) + T.shape_padright(mask)
    # Mu = Mu * T.shape_padright(output_mask)
    # Sigma = Sigma * T.shape_padright(output_mask) + T.shape_padright(mask)
    # Phi = -constantX(0.5) * T.sqr((Mu - T.shape_padright(x*output_mask)) / Sigma) - T.log(Sigma) - constantX(0.5 * np.log(2*np.pi))  # BxDxC

    Phi = -constantX(0.5) * T.sqr((Mu - T.shape_padright(x)) / Sigma) - T.log(Sigma) - constantX(0.5 * np.log(2 * np.pi))  # BxDxC
    logdensity = (log_sum_exp(Phi + T.log(Alpha), axis=2) * output_mask).sum(1) * D / (D - d)
    return (logdensity, z_alpha, z_mu, z_sigma, Alpha, Mu, Sigma, h)

def sym_masked_neg_loglikelihood_gradient(self, x, mask):
    """ x is a matrix of column datapoints (DxB)
    D = n_visible, Bfloat = batch size
    """
    logdensity, z_alpha, z_mu, z_sigma, Alpha, Mu, Sigma, h = self.sym_mask_logdensity_estimator_intermediate(x, mask)

    # nnz = output_mask.sum(0)
    # sparsity_multiplier = T.shape_padright(T.shape_padleft((B+1e-6)/(nnz+1e-6)))
    # wPhi = T.maximum(Phi + T.log(Alpha), constantX(-100.0))  # BxDxC
    # lp_current = log_sum_exp(wPhi, axis=2) * output_mask  # BxD
    # lp_current_sum = (lp_current.sum(1) * D / (D-d)).sum()  # 1

    loglikelihood = logdensity.mean(dtype=floatX)
    loss = -loglikelihood

    dp_dz_alpha = T.grad(loss, z_alpha)  # BxDxC
    gb_alpha = dp_dz_alpha.sum(0)  # DxC
    gV_alpha = T.tensordot(h.T, dp_dz_alpha, [[1], [0]]).dimshuffle((1, 0, 2))  # DxHxC

    dp_dz_mu = T.grad(loss, z_mu)  # BxDxC
    dp_dz_mu = dp_dz_mu * Sigma  # Heuristic
    gb_mu = dp_dz_mu.sum(0)  # DxC
    gV_mu = T.tensordot(h.T, dp_dz_mu, [[1], [0]]).dimshuffle((1, 0, 2))  # DxHxC

    dp_dz_sigma = T.grad(loss, z_sigma)  # BxDxC
    gb_sigma = dp_dz_sigma.sum(0)  # DxC
    gV_sigma = T.tensordot(h.T, dp_dz_sigma, [[1], [0]]).dimshuffle((1, 0, 2))  # DxHxC

    if self.n_layers > 1:
        gWs, gbs, gW1, gWflags, gb1 = T.grad(loss, [self.Ws, self.bs, self.W1, self.Wflags, self.b1])
        gradients = {"V_alpha": gV_alpha, "b_alpha": gb_alpha,
                     "V_mu": gV_mu, "b_mu": gb_mu,
                     "V_sigma": gV_sigma, "b_sigma": gb_sigma,
                     "Ws": gWs, "bs": gbs,
                     "W1": gW1, "b1": gb1, "Wflags": gWflags}
    else:
        gW1, gWflags, gb1 = T.grad(loss, [self.W1, self.Wflags, self.b1])
        gradients = {"V_alpha": gV_alpha, "b_alpha": gb_alpha,
                     "V_mu": gV_mu, "b_mu": gb_mu,
                     "V_sigma": gV_sigma, "b_sigma": gb_sigma,
                     "W1": gW1, "b1": gb1, "Wflags": gWflags}
    # Gradients
    return (loss, gradients)

def construct(self, input_tv):
    '''
    Params
    ------
    input_tv : a matrix of size (n_sentences, n_tokens, vecdim)
    '''
    # N is the linear transformation matrix.
    N = self._declare_mat('N', self.in_dim, self.out_dim)
    N.clip_gradient = self.prm('clip_gradient')
    N.l2_project = self.prm('l2_project')
    N.l2_projection_axis = 0
    if self.prm('do_dropout'):
        N.dropout_retention_freq = self.prm('dropout_retention_freq')
        # Create a dropout mask.
        dropout_mask = dropout_mask_creator(
            self.in_dim, N.dropout_retention_freq)
        # Apply dropout mask to input variable.
        # Note that dropout_mask is a vector and input_tv is a
        # matrix. We are broadcasting this multiplication.
        # Essentially we are dropping entire columns from input_tv.
        dropout_input_tv = (input_tv * dropout_mask)
        dropout_input_tv.name = self.kn('dropout_input_tv')
        # During train time the output is the matrix multiplication of
        # dropped out variables with the matrix.
        transformed_tv = T.tensordot(
            dropout_input_tv, N, axes=[dropout_input_tv.ndim - 1, 0])
    else:
        transformed_tv = T.dot(input_tv, N)
    if self.prm('add_bias'):
        b = self._declare_mat('b', self.out_dim, is_regularizable=False)
        b.l2_project = self.prm('l2_project')
        b.l2_projection_axis = 0
        self.output_tv = transformed_tv + b
        return (N, b)
    else:
        self.output_tv = transformed_tv
        return (N,)

def construct(self, input_tv):
    sod2c = self.prm('shape_of_dim_to_collapse')
    v = self._declare_mat('v', sod2c)
    self.output_tv = T.tensordot(input_tv, v, axes=(input_tv.ndim - 1, 0))
    return (v,)

def construct(self, input_tv):
    '''
    Params
    ------
    input_tv : The input is a 3D tensor representing a batch of
        sentences with embedded tokens.

    Returns
    -------
    The input_tv is a matrix that has the tokens as its 0th dimension
    and usually LSTM features as the first dimension.

    NOTE: We don't need to project the input of this class_chip inside.
    We can just add a Linear class_chip before ConcatenativeMixture.
    '''
    Y_prev = self._declare_mat('A', self.out_dim + 1, self.in_dim)
    Y_next = self._declare_mat('B', self.out_dim, self.in_dim)
    Y_prev.clip_gradient = self.prm('clip_gradient')
    Y_next.clip_gradient = self.prm('clip_gradient')
    prev_next_cross = (Y_prev.dimshuffle(0, 'x', 1)
                       + Y_next.dimshuffle('x', 0, 1))
    Y_nl = self.prm('activation_fn')(prev_next_cross)
    # NOTE: The last dimension corresponds to the hidden layer
    # nodes.
    v = self._declare_mat('v', self.in_dim)
    PairWise_Factor = T.tensordot(Y_nl, v, axes=(Y_nl.ndim - 1, 0))
    self.output_tv = (PairWise_Factor.dimshuffle('x', 'x', 0, 1)
                      + input_tv.dimshuffle(0, 1, 'x', 2))
    return (Y_prev, Y_next, v)

def correlation(self, A, B):
    Af = A.reshape((A.shape[0], A.shape[1] * A.shape[2]))
    Bf = B.reshape((B.shape[0], B.shape[1] * B.shape[2]))
    C = T.tensordot(Af.T, Bf, axes=1)
    return C.reshape((-1, A.shape[1], A.shape[2]))

def grad(self, inputs, cost_grad):
    """
    In defining the gradient, the Finite Fourier Transform is viewed
    as a complex-differentiable function of a complex variable
    """
    a = inputs[0]
    n = inputs[1]
    axis = inputs[2]
    grad = cost_grad[0]
    if not isinstance(axis, tensor.TensorConstant):
        raise NotImplementedError('%s: gradient is currently implemented'
                                  ' only for axis being a Theano constant'
                                  % self.__class__.__name__)
    axis = int(axis.data)
    # notice that the number of actual elements in wrto is independent of
    # possible padding or truncation:
    elem = tensor.arange(0, tensor.shape(a)[axis], 1)
    # accounts for padding:
    freq = tensor.arange(0, n, 1)
    outer = tensor.outer(freq, elem)
    pow_outer = tensor.exp(((-2 * math.pi * 1j) * outer) / (1. * n))
    res = tensor.tensordot(grad, pow_outer, (axis, 0))

    # This would be simpler but not implemented by theano:
    # res = tensor.switch(tensor.lt(n, tensor.shape(a)[axis]),
    #     tensor.set_subtensor(res[...,n::], 0, False, False), res)

    # Instead we resort to that to account for truncation:
    flip_shape = list(numpy.arange(0, a.ndim)[::-1])
    res = res.dimshuffle(flip_shape)
    res = tensor.switch(tensor.lt(n, tensor.shape(a)[axis]),
                        tensor.set_subtensor(res[n::, ], 0, False, False),
                        res)
    res = res.dimshuffle(flip_shape)

    # insures that gradient shape conforms to input shape:
    out_shape = list(numpy.arange(0, axis)) + [a.ndim - 1] +\
        list(numpy.arange(axis, a.ndim - 1))
    res = res.dimshuffle(*out_shape)
    return [res, None, None]

def __init__(self, x, y, metrics='eucdian'):
    if metrics == 'eucdian':
        x = x.dimshuffle(1, 0, 2)
        y = y.dimshuffle(1, 2, 0)
        activation = T.batched_dot(x, y)
        # activation = T.tensordot(x, y, axes=-1)
        self.activation = activation.dimshuffle(0, 'x', 1, 2)

def tensordot(x, y, axes):
    return T.tensordot(x, y, axes=axes)

def get_output(self, train=False):
    [X_w, X_t] = self.get_input(train)
    t_w = self.W_t[X_w[:, :, 0]]  # doc_l, n_tags*n_samples, n_dim
    w_w = self.W_w[X_w[:, :, 1]]
    dot_tw = T.sum(w_w * t_w, axis=2)
    inter_1 = T.tensordot(w_w, self.S, axes=[[2], [2]])
    inter_2 = T.tensordot(t_w, self.P, axes=[[2], [2]])  # doc_l, n_tags*n_samples, 2, 5
    inter = T.sum(inter_1 * inter_2, axis=3)
    sim_tw = T.tensordot(inter + T.shape_padleft(self.B, 2), self.U, axes=[[2], [0]])
    sim_tw = T.reshape(sim_tw, (X_w.shape[0], X_w.shape[1]))
    dot_sum_w = T.sum(dot_tw * T.nnet.sigmoid(sim_tw), axis=0) / (X_w.shape[0])
    dot_w = theano.tensor.reshape(dot_sum_w, (X_w.shape[1], 1))
    return self.activation(dot_w)
    '''
    t_t = self.W_t[X_t[:,:, 0]]  # doc_l, n_tags*n_samples, n_dim
    w_t = self.W_t[X_t[:,:, 1]]
    dot_tt = T.sum(w_t * t_t, axis=2)
    #dot_sum = T.sum(dot_tw, axis = 0)#/(X.shape[0])
    #dot_sum_t = T.sum(dot_tt , axis = 0)#/(X_t.shape[0])
    inter_t_1 = T.tensordot(t_t, self.P, axes = [[2],[2]])
    inter_t_2 = T.tensordot(w_t, self.P, axes = [[2],[2]])  # doc_l, n_tags*n_samples, 2,5
    inter_t = T.sum(inter_t_1 * inter_t_2, axis = 3)
    sim_tt = T.tensordot(inter_t, self.U_t, axes=[[2],[0]])
    sim_tt = T.reshape(sim_tt, (X_t.shape[0], X_t.shape[1]))
    dot_sum_t = T.sum(dot_tt * sim_tt, axis = 0)/(X_t.shape[0])
    dot_twc_t = dot_sum_t#*dot_sum#_p
    dot_t = theano.tensor.reshape(dot_twc_t, (X_t.shape[1], 1))
    return 0.5 * self.activation(dot_w) + 0.5 * self.activation(dot_t)
    '''

def output_func(self, input):
    self.f = T.tensordot(input.dimshuffle(0, 'x', 1),
                         self.W.dimshuffle('x', 0, 1),
                         axes=[[1, 2], [0, 2]])  # cosine sim
    self.y_pred = T.argmax(self.f, axis=0)
    return self.y_pred

def __call__(self, expected, observed, weights):
    obs = T.tensordot(observed, self.W_observed, axes=(2, 0))
    exp = T.dot(expected, self.W_expected)
    weights = weights[:, None] * self.W_weights[None, :]
    return self.nonlinearity(obs + exp + weights)

def call(self, x, mask=None):
    def _step(v1, v2):
        cosine_score = T.tensordot(v1 / T.sqrt(T.sum(T.sqr(v1), axis=2, keepdims=True) + 1e-6),
                                   (v2) / T.sqrt(T.sum(T.sqr(v2), axis=2, keepdims=True) + 1e-6),
                                   [[2], [2]])
        return cosine_score

    l_s = x[0]  # n_b x n_s x n_w_s x D
    l_a = x[1]  # n_b x 4 x n_w_qa x D

    # get cosine similarity for ALL word pairs
    output, _ = theano.scan(_step, sequences=[l_s, l_a], outputs_info=None)  # n_b x n_s x n_w_s x 4 x n_w_qa
    # return T.max(T.max(output, axis=4), axis=2)
    output = output.dimshuffle(2, 1, 0, 3, 4)  # n_w_s x n_s x n_b x 4 x n_w_qa

    def slide_max(i, X):
        size = self.window_size
        M = X[i:i + size]
        W = self.w_gaussian
        return T.max((W * M.T).T, axis=0), theano.scan_module.until(i >= X.shape[0] - size + 1)

    output, _ = theano.scan(slide_max,
                            sequences=[T.arange(0, stop=(output.shape[0] - self.window_size + 1),
                                                step=3, dtype='int32')],
                            non_sequences=output)

    if self.use_qa_idf:
        average = weighted_average(output.dimshuffle(2, 1, 0, 3, 4), x[2], axis=4)
    else:
        average = masked_mean(output.dimshuffle(2, 1, 0, 3, 4), axis=4)
    return T.max(average, axis=2) * self.alpha
    # return T.max(masked_mean(output.dimshuffle(2, 1, 0, 3, 4), axis=4), axis=2) * self.alpha

def call(self, x, mask=None):
    def _step(v1, v2):
        cosine_score = T.tensordot(v1 / T.sqrt(T.sum(T.sqr(v1), axis=1, keepdims=True) + 1e-6),
                                   (v2) / T.sqrt(T.sum(T.sqr(v2), axis=2, keepdims=True) + 1e-6),
                                   [[1], [2]])
        return cosine_score

    l_s = x[0]  # n_b x n_w_st x D
    l_a = x[1]  # n_b x 4 x n_w_qa x D

    # get cosine similarity for ALL word pairs
    output, _ = theano.scan(_step, sequences=[l_s, l_a], outputs_info=None)  # n_b x n_w_st x 4 x n_w_qa
    output = output.dimshuffle(1, 0, 2, 3)

    def slide_max(i, X):
        size = self.window_size
        M = X[i:i + size]
        W = self.w_gaussian
        return T.max((W * M.T).T, axis=0), theano.scan_module.until(i >= X.shape[0] - size + 1)

    output, _ = theano.scan(slide_max,
                            sequences=[T.arange(0, stop=(output.shape[0] - self.window_size + 1),
                                                step=5, dtype='int32')],
                            non_sequences=output)

    if self.use_qa_idf:
        average = weighted_average(output.dimshuffle(1, 0, 2, 3), x[2], axis=3)
    else:
        average = masked_mean(output.dimshuffle(1, 0, 2, 3), axis=3)
    return T.max(average, axis=1) * self.alpha

def get_output_for(self, inputs, **kwargs):
    """
    :param inputs: inputs: list of theano.TensorType
        `inputs[0]` should always be the symbolic input variable. When
        this layer has a mask input (i.e. was instantiated with
        `mask_input != None`, indicating that the lengths of sequences in
        each batch vary), `inputs` should have length 2, where `inputs[1]`
        is the `mask`. The `mask` should be supplied as a Theano variable
        denoting whether each time step in each sequence in the batch is
        part of the sequence or not. `mask` should be a matrix of shape
        ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when
        ``j <= (length of sequence i)`` and ``mask[i, j] = 0`` when
        ``j > (length of sequence i)``.
    :return: theano.TensorType
        Symbolic output variable.
    """
    input = inputs[0]
    mask = None
    if self.mask_incoming_index > 0:
        mask = inputs[self.mask_incoming_index]

    # compute head part by tensor dot ([batch, length, dim] * [dim, num_label])
    # the shape of s_h should be [batch, length, num_label]
    s_h = T.tensordot(input, self.W_h, axes=[[2], [0]])
    if self.b is not None:
        b_shuffled = self.b.dimshuffle('x', 'x', 0)
        s_h = s_h + b_shuffled

    # compute child part by tensor dot ([batch, length, dim] * [dim, num_label])
    # the shape of s_c should be [batch, length, num_label]
    s_c = T.tensordot(input, self.W_c, axes=[[2], [0]])

    # compute out
    input_shape = input.shape
    # output shape = [batch, length, length, num_label]
    out = T.cast(T.alloc(0.0, input_shape[0], input_shape[1], input_shape[1], self.num_labels), 'floatX')
    out = out + s_h.dimshuffle(0, 1, 'x', 2)
    out = out + s_c.dimshuffle(0, 'x', 1, 2)

    if mask is not None:
        mask_shuffled = mask.dimshuffle(0, 1, 'x', 'x')
        out = out * mask_shuffled
        mask_shuffled = mask.dimshuffle(0, 'x', 1, 'x')
        out = out * mask_shuffled
    return out

def get_output_for(self, inputs, **kwargs):
    """
    :param inputs: inputs: list of theano.TensorType
        `inputs[0]` should always be the symbolic input variable. When
        this layer has a mask input (i.e. was instantiated with
        `mask_input != None`, indicating that the lengths of sequences in
        each batch vary), `inputs` should have length 2, where `inputs[1]`
        is the `mask`. The `mask` should be supplied as a Theano variable
        denoting whether each time step in each sequence in the batch is
        part of the sequence or not. `mask` should be a matrix of shape
        ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when
        ``j <= (length of sequence i)`` and ``mask[i, j] = 0`` when
        ``j > (length of sequence i)``.
    :return: theano.TensorType
        Symbolic output variable.
    """
    input = inputs[0]
    mask = None
    if self.mask_incoming_index > 0:
        mask = inputs[self.mask_incoming_index]

    # compute the bi-affine part
    # first via tensor dot ([batch, length, dim] * [dim, dim, num_label])
    # output shape = [batch, length, dim, num_label]
    out = T.tensordot(input, self.U, axes=[[2], [0]])
    # second via tensor dot ([batch, length, dim, num_label] * [batch, dim, length])
    # output shape = [batch, length, length, num_label]
    out = T.batched_tensordot(out, input.dimshuffle(0, 2, 1), axes=([2], [1]))
    out = out.dimshuffle(0, 1, 3, 2)

    # compute head bias part by tensor dot ([batch, length, dim] * [dim, num_label])
    # the shape of s_h should be [batch, length, num_label]
    if self.W_h is not None:
        s_h = T.tensordot(input, self.W_h, axes=[[2], [0]])
        out = out + s_h.dimshuffle(0, 1, 'x', 2)

    # compute child part by tensor dot ([batch, length, dim] * [dim, num_label])
    # the shape of s_c should be [batch, length, num_label]
    if self.W_c is not None:
        s_c = T.tensordot(input, self.W_c, axes=[[2], [0]])
        out = out + s_c.dimshuffle(0, 'x', 1, 2)

    # add bias part.
    if self.b is not None:
        out = out + self.b.dimshuffle('x', 'x', 'x', 0)

    if mask is not None:
        mask_shuffled = mask.dimshuffle(0, 1, 'x', 'x')
        out = out * mask_shuffled
        mask_shuffled = mask.dimshuffle(0, 'x', 1, 'x')
        out = out * mask_shuffled
    return out

def get_output_for(self, inputs, **kwargs):
    """
    :param inputs: inputs: list of theano.TensorType
        `inputs[0]` should always be the symbolic input variable. When
        this layer has a mask input (i.e. was instantiated with
        `mask_input != None`, indicating that the lengths of sequences in
        each batch vary), `inputs` should have length 2, where `inputs[1]`
        is the `mask`. The `mask` should be supplied as a Theano variable
        denoting whether each time step in each sequence in the batch is
        part of the sequence or not. `mask` should be a matrix of shape
        ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when
        ``j <= (length of sequence i)`` and ``mask[i, j] = 0`` when
        ``j > (length of sequence i)``.
    :return: theano.TensorType
        Symbolic output variable.
    """
    input = inputs[0]
    mask = None
    if self.mask_incoming_index > 0:
        mask = inputs[self.mask_incoming_index]

    # compute head part by tensor dot ([batch, length, input] * [input, num_label])
    # the shape of s_h should be [batch, length, num_label]
    s_h = T.tensordot(input, self.W_h, axes=[[2], [0]])
    if self.b is not None:
        b_shuffled = self.b.dimshuffle('x', 'x', 0)
        s_h = s_h + b_shuffled

    # compute child part by tensor dot ([batch, length, input] * [input, num_label])
    # the shape of s_c should be [batch, length, num_label]
    s_c = T.tensordot(input, self.W_c, axes=[[2], [0]])

    # compute out
    input_shape = input.shape
    out = T.cast(T.alloc(0.0, input_shape[0], input_shape[1], input_shape[1], self.num_labels), 'floatX')
    out = out + s_h.dimshuffle(0, 1, 'x', 2)
    out = out + s_c.dimshuffle(0, 'x', 1, 2)

    if mask is not None:
        mask_shuffled = mask.dimshuffle(0, 1, 'x', 'x')
        out = out * mask_shuffled
        mask_shuffled = mask.dimshuffle(0, 'x', 1, 'x')
        out = out * mask_shuffled
    return out