We extracted the following 50 code examples from open source Python projects to illustrate how to use theano.tensor.sum().
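Before the project excerpts, here is a minimal, self-contained sketch of what theano.tensor.sum() does symbolically; the variable names are illustrative and not taken from any of the projects below.

import numpy as np
import theano
import theano.tensor as T

# Build a symbolic graph that sums a matrix along rows and over all elements.
x = T.matrix('x')
row_sums = T.sum(x, axis=1)   # vector of per-row sums
total = T.sum(x)              # scalar sum over every element
f = theano.function([x], [row_sums, total])

print(f(np.arange(6, dtype=theano.config.floatX).reshape(2, 3)))
# expected output along the lines of: [array([ 3., 12.]), array(15.0)]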
def svgd_kernel(self, h=-1):
    sq_dist = pdist(self.theta)
    pairwise_dists = squareform(sq_dist)**2
    if h < 0:  # if h < 0, using median trick
        h = np.median(pairwise_dists)
        h = np.sqrt(0.5 * h / np.log(self.theta.shape[0] + 1))

    # compute the rbf kernel
    Kxy = np.exp(-pairwise_dists / h**2 / 2)

    dxkxy = -np.matmul(Kxy, self.theta)
    sumkxy = np.sum(Kxy, axis=1)
    for i in range(self.theta.shape[1]):
        dxkxy[:, i] = dxkxy[:, i] + np.multiply(self.theta[:, i], sumkxy)
    dxkxy = dxkxy / (h**2)
    return (Kxy, dxkxy)

def l2normalize(layer, train_scale=True):
    W_param = layer.W
    s = W_param.get_value().shape
    if len(s) == 4:
        axes_to_sum = (1, 2, 3)
        dimshuffle_args = [0, 'x', 'x', 'x']
        k = s[0]
    else:
        axes_to_sum = 0
        dimshuffle_args = ['x', 0]
        k = s[1]
    layer.W_scale = layer.add_param(lasagne.init.Constant(1.), (k,),
                                    name="W_scale", trainable=train_scale,
                                    regularizable=False)
    layer.W = W_param * (layer.W_scale / T.sqrt(1e-6 + T.sum(T.square(W_param), axis=axes_to_sum))).dimshuffle(*dimshuffle_args)
    return layer

# fully connected layer with weight normalization
def nll_loss_sharedparams(self, mus, sigmas, corxy, pis, y_true):
    mus_ex = mus[np.newaxis, :, :]
    X = y_true[:, np.newaxis, :]
    diff = X - mus_ex
    diffprod = T.prod(diff, axis=-1)
    corxy2 = corxy ** 2
    diff2 = diff ** 2
    sigmas2 = sigmas ** 2
    sigmainvs = 1.0 / sigmas
    sigmainvprods = sigmainvs[:, 0] * sigmainvs[:, 1]
    diffsigma = diff2 / sigmas2
    diffsigmanorm = T.sum(diffsigma, axis=-1)
    z = diffsigmanorm - 2 * corxy * diffprod * sigmainvprods
    oneminuscorxy2inv = 1.0 / (1.0 - corxy2)
    expterm = -0.5 * z * oneminuscorxy2inv
    new_exponent = T.log(0.5 / np.pi) + T.log(sigmainvprods) + T.log(np.sqrt(oneminuscorxy2inv)) + expterm + T.log(pis)
    max_exponent = T.max(new_exponent, axis=1, keepdims=True)
    mod_exponent = new_exponent - max_exponent
    gauss_mix = T.sum(T.exp(mod_exponent), axis=1)
    log_gauss = max_exponent + T.log(gauss_mix)
    loss = -T.mean(log_gauss)
    return loss

def rbf_kernel(X0):
    XY = T.dot(X0, X0.transpose())
    x2 = T.reshape(T.sum(T.square(X0), axis=1), (X0.shape[0], 1))
    X2e = T.repeat(x2, X0.shape[0], axis=1)
    H = T.sub(T.add(X2e, X2e.transpose()), 2 * XY)

    V = H.flatten()

    # median distance
    h = T.switch(T.eq((V.shape[0] % 2), 0),
                 # if even vector
                 T.mean(T.sort(V)[((V.shape[0] // 2) - 1):((V.shape[0] // 2) + 1)]),
                 # if odd vector
                 T.sort(V)[V.shape[0] // 2])

    h = T.sqrt(0.5 * h / T.log(X0.shape[0].astype('float32') + 1.0)) / 2.

    Kxy = T.exp(-H / h ** 2 / 2.0)
    neighbors = T.argsort(H, axis=1)[:, 1]

    return Kxy, neighbors, h

def __call__(self, X, w_temp, m_temp):
    # input dimensions
    # X:      (nb_samples, input_dim)
    # w_temp: (nb_samples, memory_dim)
    # m_temp: (nb_samples, memory_dim, memory_width) ::tensor_memory

    key   = dot(X, self.W_key, self.b_key)                            # (nb_samples, memory_width)
    shift = self.softmax(dot(X, self.W_shift, self.b_shift))          # (nb_samples, shift_width)
    beta  = self.softplus(dot(X, self.W_beta, self.b_beta))[:, None]  # (nb_samples, x)
    gamma = self.softplus(dot(X, self.W_gama, self.b_gama)) + 1.      # (nb_samples,)
    gamma = gamma[:, None]                                            # (nb_samples, x)
    g     = self.sigmoid(dot(X, self.W_g, self.b_g))[:, None]         # (nb_samples, x)

    signal = [key, shift, beta, gamma, g]

    w_c = self.softmax(beta * cosine_sim2d(key, m_temp))  # (nb_samples, memory_dim) //content-based addressing
    w_g = g * w_c + (1 - g) * w_temp                      # (nb_samples, memory_dim) //history interpolation
    w_s = shift_convolve2d(w_g, shift, self.shift_conv)   # (nb_samples, memory_dim) //convolutional shift
    w_p = w_s ** gamma                                    # (nb_samples, memory_dim) //sharpening
    w_t = w_p / T.sum(w_p, axis=1)[:, None]               # (nb_samples, memory_dim)
    return w_t

def dot(inp, matrix, bias=None):
    """
    Decide the right type of dot product depending on the input arguments
    """
    if 'int' in inp.dtype and inp.ndim == 2:
        return matrix[inp.flatten()]
    elif 'int' in inp.dtype:
        return matrix[inp]
    elif 'float' in inp.dtype and inp.ndim == 3:
        shape0 = inp.shape[0]
        shape1 = inp.shape[1]
        shape2 = inp.shape[2]
        if bias:
            return (T.dot(inp.reshape((shape0 * shape1, shape2)), matrix) + bias).reshape((shape0, shape1, matrix.shape[1]))
        else:
            return T.dot(inp.reshape((shape0 * shape1, shape2)), matrix).reshape((shape0, shape1, matrix.shape[1]))
    else:
        if bias:
            return T.dot(inp, matrix) + bias
        else:
            return T.dot(inp, matrix)

# Numerically stable log(sum(exp(A))). Can also be used in softmax function.
def dot_2d(k, M, b=None, g=None):
    # k: (nb_samples, memory_width)
    # M: (nb_samples, memory_dim, memory_width)

    # norms of keys and memories
    # k_norm = T.sqrt(T.sum(T.sqr(k), 1)) + 1e-5   # (nb_samples,)
    # M_norm = T.sqrt(T.sum(T.sqr(M), 2)) + 1e-5   # (nb_samples, memory_dim,)

    k = k[:, None, :]   # (nb_samples, 1, memory_width)
    value = k * M
    if b is not None:
        b = b[:, None, :]
        value *= b      # (nb_samples, memory_dim,)
    if g is not None:
        g = g[None, None, :]
        value *= g
    sim = T.sum(value, axis=2)
    return sim

def crossentropy(y_pred, y_true, void_labels, one_hot=False):
    # Clip predictions
    y_pred = T.clip(y_pred, _EPSILON, 1.0 - _EPSILON)

    if one_hot:
        y_true = T.argmax(y_true, axis=1)

    # Create mask
    mask = T.ones_like(y_true, dtype=_FLOATX)
    for el in void_labels:
        mask = T.set_subtensor(mask[T.eq(y_true, el).nonzero()], 0.)

    # Modify y_true temporarily
    y_true_tmp = y_true * mask
    y_true_tmp = y_true_tmp.astype('int32')

    # Compute cross-entropy
    loss = T.nnet.categorical_crossentropy(y_pred, y_true_tmp)

    # Compute masked mean loss
    loss *= mask
    loss = T.sum(loss) / T.sum(mask)
    return loss

def dice_loss(y_pred, y_true, void_class, class_for_dice=1):
    '''
    Dice loss -- works for only binary classes.
    y_pred is a softmax output
    y_true is one hot
    '''
    smooth = 1
    y_true_f = T.flatten(y_true[:, class_for_dice, :, :])
    y_true_f = T.cast(y_true_f, 'int32')
    y_pred_f = T.flatten(y_pred[:, class_for_dice, :, :])
    # remove void classes from dice
    if len(void_class):
        for i in range(len(void_class)):
            # get idx of non void classes and remove void classes
            # from y_true and y_pred
            idxs = T.neq(y_true_f, void_class[i]).nonzero()
            y_pred_f = y_pred_f[idxs]
            y_true_f = y_true_f[idxs]

    intersection = T.sum(y_true_f * y_pred_f)
    return -(2. * intersection + smooth) / (T.sum(y_true_f) + T.sum(y_pred_f) + smooth)

def build_objective(model, deterministic=False, epsilon=1e-12):
    predictions = nn.layers.get_output(model.l_out, deterministic=deterministic)
    targets = T.cast(nn.layers.get_output(model.l_target), 'int32')
    enable_targets = nn.layers.get_output(model.l_enable_target)
    predictions = T.clip(predictions, epsilon, 1. - epsilon)

    sum_of_objectives = 0
    unit_ptr = 0
    for obj_idx, obj_name in enumerate(order_objectives):
        n_classes = len(property_bin_borders[obj_name])
        v_obj = objective(obj_idx, (unit_ptr, unit_ptr + n_classes), predictions, targets)
        # take the mean of the objectives where it matters (enabled targets)
        obj_scalar = T.sum(enable_targets[:, obj_idx] * v_obj) / (0.00001 + T.sum(enable_targets[:, obj_idx]))
        if deterministic:
            d_objectives_deterministic[obj_name] = obj_scalar
        else:
            d_objectives[obj_name] = obj_scalar
        sum_of_objectives += T.mean(v_obj)
        unit_ptr = unit_ptr + n_classes

    return sum_of_objectives

def build_objective(model, deterministic=False, epsilon=1e-12):
    predictions = nn.layers.get_output(model.l_out, deterministic=deterministic)
    targets = T.cast(nn.layers.get_output(model.l_target), 'int32')
    enable_targets = nn.layers.get_output(model.l_enable_target)
    predictions = T.clip(predictions, epsilon, 1. - epsilon)

    sum_of_objectives = 0
    unit_ptr = 0
    for obj_idx, obj_name in enumerate(order_objectives):
        n_classes = len(property_bin_borders[obj_name])
        v_obj = objective(obj_idx, (unit_ptr, unit_ptr + n_classes), predictions, targets)
        # take the mean of the objectives where it matters (enabled targets)
        obj_scalar = T.sum(enable_targets[:, obj_idx] * v_obj) / (0.00001 + T.sum(enable_targets[:, obj_idx]))
        if deterministic:
            d_objectives_deterministic[obj_name] = obj_scalar
        else:
            d_objectives[obj_name] = obj_scalar
        sum_of_objectives += obj_scalar
        unit_ptr = unit_ptr + n_classes

    return sum_of_objectives

def op_cosine_c(s_xr_, s_xi_, s_yr_, s_yi_, axis_=-1, keepdims_=True, eps_=1e-7):
    '''
    cosine between two complex vectors, uses standard complex inner product

    Args:
        s_xr_: real part of x
        s_xi_: imag part of x
        s_yr_: real part of y
        s_yi_: imag part of y
        eps_: small number to prevent divide by zero
    '''
    s_nrm = s_xr_ * s_yr_ + s_xi_ * s_yi_
    s_nx = T.sum(T.sqr(s_xr_) + T.sqr(s_xi_), axis=axis_, keepdims=keepdims_)
    s_ny = T.sum(T.sqr(s_yr_) + T.sqr(s_yi_), axis=axis_, keepdims=keepdims_)
    return T.sum(s_nrm, axis=axis_, keepdims=keepdims_) / T.sqrt(s_nx * s_ny + eps_)

def sample(self, x, K):
    if x.ndim == 1:
        x = x.reshape(1, x.shape[0])
    hn = self.encode(x)
    W = self.params[0]
    ww = T.dot(W.T, W)
    samples = []
    for _ in range(K):
        s = hn * (1. - hn)
        jj = ww * s.dimshuffle(0, 'x', 1) * s.dimshuffle(0, 1, 'x')
        alpha = self.srng.normal(size=hn.shape, avg=0., std=self.sigma,
                                 dtype=theano.config.floatX)
        delta = (alpha.dimshuffle(0, 1, 'x') * jj).sum(1)
        zn = self.decode(hn + delta)
        hn = self.encode(zn)
        # zn2 = self.decode(hn)
        samples.append(zn.eval())
    return samples

def get_cost(aes, l, eye=True):
    """Get the sum of all the reconstruction costs of the AEs.

    Input:
        aes: list. List of all the aes.
        l: shared variable or a list of shared variables for the importance
           weights.
    """
    costs = []
    for ae, i in zip(aes, range(len(aes))):
        if isinstance(ae, ConvolutionalAutoencoder):
            costs.append(l[i] * ae.get_train_cost()[0])
        else:
            costs.append(l[i] * ae.get_train_cost(face=eye)[0])
    cost = None
    if costs not in [[], None]:
        cost = reduce(lambda x, y: x + y, costs)

    return cost

def get_eval_fn(model, in3D=False, use_dice=False):
    """Compile the evaluation function of the model."""
    if use_dice:
        insec = T.sum(model.trg * model.output, axis=1)
        tmp = 1 - 2.0 * insec / (T.sum(model.trg, axis=1) + T.sum(model.output, axis=1))
        error = T.mean(tmp)
    else:
        error = T.mean(T.mean(T.power(model.output - model.trg, 2), axis=1))
    if in3D:
        x = T.tensor4('x')
    else:
        x = T.fmatrix("x")
    y = T.fmatrix("y")

    theano_arg_vl = [x, y]
    output_fn_vl = [error, model.output]

    eval_fn = theano.function(
        theano_arg_vl, output_fn_vl,
        givens={model.x: x,
                model.trg: y})

    return eval_fn

def __init__(self, input_size, output_size, hidden_sizes, activation=T.nnet.sigmoid):
    self.hidden_layers = []
    self.params = []
    self.input = T.matrix('x')
    self.target = T.matrix('y')

    for i, layer_size in enumerate(hidden_sizes):
        if i == 0:
            layer_input_size = input_size
            layer_input = self.input
        else:
            layer_input_size = hidden_sizes[i - 1]
            layer_input = self.hidden_layers[-1].output
        layer = Layer(layer_input, layer_input_size, layer_size, activation=activation)
        self.hidden_layers.append(layer)
        self.params.extend(layer.params)

    self.output_layer = Layer(self.hidden_layers[-1].output, hidden_sizes[-1], output_size)
    self.params.extend(self.output_layer.params)

    self.output = self.output_layer.output
    self.cost = T.sum((self.output - self.target)**2)

def get_output_for(self, input, **kwargs):
    """
    Given 2d input find the probability of each input in each of num_units
    Diagonal Gaussians using the formula from
    http://mathworld.wolfram.com/BivariateNormalDistribution.html
    """
    # make sure sigma is positive and nonzero: softplus(x) maps to (0, +inf)
    sigmas = T.nnet.softplus(self.sigmas)
    sigmainvs = 1.0 / sigmas
    sigmainvprods = sigmainvs[:, 0] * sigmainvs[:, 1]
    sigmas2 = sigmas ** 2
    mus = self.mus[np.newaxis, :, :]
    X = input[:, np.newaxis, :]
    diff = (X - mus) ** 2
    diffsigma = diff / sigmas2
    diffsigmanorm = T.sum(diffsigma, axis=-1)
    expterm = T.exp(-0.5 * diffsigmanorm)
    probs = (0.5 / np.pi) * sigmainvprods * expterm
    return probs

def errors(self, y):
    """Return a float representing the number of errors in the minibatch
    over the total number of examples of the minibatch; zero-one loss over
    the size of the minibatch.

    :type y: theano.tensor.TensorType
    :param y: corresponds to a vector that gives for each example the
              correct label
    """
    # check if y has same dimension of y_pred
    if y.ndim != self.y_pred.ndim:
        raise TypeError(
            'y should have the same shape as self.y_pred',
            ('y', y.type, 'y_pred', self.y_pred.type)
        )
    # check if y is of the correct datatype
    if y.dtype.startswith('int'):
        # the T.neq operator returns a vector of 0s and 1s, where 1
        # represents a mistake in prediction
        return T.mean(T.neq(self.y_pred, y))
    else:
        return T.sum((y - self.y_pred) ** 2)

def _generate_train_model_function(self, scores):
    u = T.lvector('u')
    i = T.lvector('i')
    j = T.lvector('j')

    self.W = theano.shared(numpy.zeros((self._dim)).astype('float32'), name='W')
    self.S = theano.shared(scores, name='S')

    x_ui = T.dot(self.W, self.S[u, i, :].T)
    x_uj = T.dot(self.W, self.S[u, j, :].T)
    x_uij = x_ui - x_uj

    obj = T.sum(
        T.log(T.nnet.sigmoid(x_uij)).sum() -
        self._lambda_w * 0.5 * (self.W ** 2).sum()
    )
    cost = -obj

    g_cost_W = T.grad(cost=cost, wrt=self.W)

    updates = [(self.W, self.W - self._learning_rate * g_cost_W)]

    self.train_model = theano.function(inputs=[u, i, j], outputs=cost, updates=updates)

def update_opt(self, f, target, inputs, reg_coeff):
    self.target = target
    self.reg_coeff = reg_coeff
    params = target.get_params(trainable=True)

    constraint_grads = theano.grad(
        f, wrt=params, disconnected_inputs='warn')
    xs = tuple([ext.new_tensor_like("%s x" % p.name, p) for p in params])

    def Hx_plain():
        Hx_plain_splits = TT.grad(
            TT.sum([TT.sum(g * x)
                    for g, x in zip(constraint_grads, xs)]),
            wrt=params,
            disconnected_inputs='warn'
        )
        return TT.concatenate([TT.flatten(s) for s in Hx_plain_splits])

    self.opt_fun = ext.lazydict(
        f_Hx_plain=lambda: ext.compile_function(
            inputs=inputs + xs,
            outputs=Hx_plain(),
            log_name="f_Hx_plain",
        ),
    )

def kl_sym(self, old_dist_info_vars, new_dist_info_vars):
    old_means = old_dist_info_vars["mean"]
    old_log_stds = old_dist_info_vars["log_std"]
    new_means = new_dist_info_vars["mean"]
    new_log_stds = new_dist_info_vars["log_std"]
    """
    Compute the KL divergence of two multivariate Gaussian distributions with
    diagonal covariance matrices
    """
    old_std = TT.exp(old_log_stds)
    new_std = TT.exp(new_log_stds)
    # means: (N*A)
    # std: (N*A)
    # formula:
    # { (\mu_1 - \mu_2)^2 + \sigma_1^2 - \sigma_2^2 } / (2\sigma_2^2) +
    # ln(\sigma_2/\sigma_1)
    numerator = TT.square(old_means - new_means) + \
        TT.square(old_std) - TT.square(new_std)
    denominator = 2 * TT.square(new_std) + 1e-8
    return TT.sum(
        numerator / denominator + new_log_stds - old_log_stds, axis=-1)

def kl(self, old_dist_info, new_dist_info):
    old_means = old_dist_info["mean"]
    old_log_stds = old_dist_info["log_std"]
    new_means = new_dist_info["mean"]
    new_log_stds = new_dist_info["log_std"]
    """
    Compute the KL divergence of two multivariate Gaussian distributions with
    diagonal covariance matrices
    """
    old_std = np.exp(old_log_stds)
    new_std = np.exp(new_log_stds)
    # means: (N*A)
    # std: (N*A)
    # formula:
    # { (\mu_1 - \mu_2)^2 + \sigma_1^2 - \sigma_2^2 } / (2\sigma_2^2) +
    # ln(\sigma_2/\sigma_1)
    numerator = np.square(old_means - new_means) + \
        np.square(old_std) - np.square(new_std)
    denominator = 2 * np.square(new_std) + 1e-8
    return np.sum(
        numerator / denominator + new_log_stds - old_log_stds, axis=-1)

def discrim(X):
    current_input = dropout(X, 0.3)
    ### encoder ###
    cv1 = relu(dnn_conv(current_input, aew1, subsample=(1, 1), border_mode=(1, 1)))
    cv2 = relu(batchnorm(dnn_conv(cv1, aew2, subsample=(4, 4), border_mode=(2, 2)), g=aeg2, b=aeb2))
    cv3 = relu(batchnorm(dnn_conv(cv2, aew3, subsample=(1, 1), border_mode=(1, 1)), g=aeg3, b=aeb3))
    cv4 = relu(batchnorm(dnn_conv(cv3, aew4, subsample=(4, 4), border_mode=(2, 2)), g=aeg4, b=aeb4))
    cv5 = relu(batchnorm(dnn_conv(cv4, aew5, subsample=(1, 1), border_mode=(1, 1)), g=aeg5, b=aeb5))
    cv6 = relu(batchnorm(dnn_conv(cv5, aew6, subsample=(4, 4), border_mode=(0, 0)), g=aeg6, b=aeb6))

    ### decoder ###
    dv6 = relu(batchnorm(deconv(cv6, aew6, subsample=(4, 4), border_mode=(0, 0)), g=aeg6t, b=aeb6t))
    dv5 = relu(batchnorm(deconv(dv6, aew5, subsample=(1, 1), border_mode=(1, 1)), g=aeg5t, b=aeb5t))
    dv4 = relu(batchnorm(deconv(dv5, aew4, subsample=(4, 4), border_mode=(2, 2)), g=aeg4t, b=aeb4t))
    dv3 = relu(batchnorm(deconv(dv4, aew3, subsample=(1, 1), border_mode=(1, 1)), g=aeg3t, b=aeb3t))
    dv2 = relu(batchnorm(deconv(dv3, aew2, subsample=(4, 4), border_mode=(2, 2)), g=aeg2t, b=aeb2t))
    dv1 = tanh(deconv(dv2, aew1, subsample=(1, 1), border_mode=(1, 1)))
    rX = dv1

    mse = T.sqrt(T.sum(T.abs_(T.flatten(X - rX, 2)), axis=1)) + T.sqrt(T.sum(T.flatten((X - rX)**2, 2), axis=1))
    return T.flatten(cv6, 2), rX, mse

def discrim(X):
    current_input = dropout(X, 0.3)
    ### encoder ###
    cv1 = relu(dnn_conv(current_input, aew1, subsample=(1, 1), border_mode=(1, 1)))
    cv2 = relu(batchnorm(dnn_conv(cv1, aew2, subsample=(4, 4), border_mode=(2, 2)), g=aeg2, b=aeb2))
    cv3 = relu(batchnorm(dnn_conv(cv2, aew3, subsample=(1, 1), border_mode=(1, 1)), g=aeg3, b=aeb3))
    cv4 = relu(batchnorm(dnn_conv(cv3, aew4, subsample=(4, 4), border_mode=(2, 2)), g=aeg4, b=aeb4))
    cv5 = relu(batchnorm(dnn_conv(cv4, aew5, subsample=(1, 1), border_mode=(1, 1)), g=aeg5, b=aeb5))
    cv6 = relu(batchnorm(dnn_conv(cv5, aew6, subsample=(4, 4), border_mode=(0, 0)), g=aeg6, b=aeb6))

    ### decoder ###
    dv6 = relu(batchnorm(deconv(cv6, aew6, subsample=(4, 4), border_mode=(0, 0)), g=aeg6t, b=aeb6t))
    dv5 = relu(batchnorm(deconv(dv6, aew5, subsample=(1, 1), border_mode=(1, 1)), g=aeg5t, b=aeb5t))
    dv4 = relu(batchnorm(deconv(dv5, aew4, subsample=(4, 4), border_mode=(2, 2)), g=aeg4t, b=aeb4t))
    dv3 = relu(batchnorm(deconv(dv4, aew3, subsample=(1, 1), border_mode=(1, 1)), g=aeg3t, b=aeb3t))
    dv2 = relu(batchnorm(deconv(dv3, aew2, subsample=(4, 4), border_mode=(2, 2)), g=aeg2t, b=aeb2t))
    dv1 = tanh(deconv(dv2, aew1, subsample=(1, 1), border_mode=(1, 1)))
    rX = dv1

    mse = T.sqrt(T.sum(T.abs_(T.flatten(X - rX, 2)), axis=1)) + T.sqrt(T.sum(T.flatten((X - rX)**2, 2), axis=1))  # L1 and L2 loss
    return T.flatten(cv6, 2), rX, mse

def svgd_gradient(X0):
    hidden, _, mse = discrim(X0)
    grad = -1.0 * T.grad(mse.sum(), X0)

    kxy, neighbors, h = rbf_kernel(hidden)  # TODO

    coff = T.exp(-T.sum((hidden[neighbors] - hidden)**2, axis=1) / h**2 / 2.0)
    v = coff.dimshuffle(0, 'x') * (-hidden[neighbors] + hidden) / h**2

    X1 = X0[neighbors]
    hidden1, _, _ = discrim(X1)
    dxkxy = T.Lop(hidden1, X1, v)

    #svgd_grad = (T.dot(kxy, T.flatten(grad, 2)).reshape(dxkxy.shape) + dxkxy) / T.sum(kxy, axis=1).dimshuffle(0, 'x', 'x', 'x')
    svgd_grad = grad + dxkxy / 2.
    return grad, svgd_grad, dxkxy

def get_recon_loss(self, idxs, sent_output):
    len_sent, len_doc_batch, n_d = sent_output.shape
    recon_layer = self.recon_layer
    padding_id = self.padding_id
    dropout = self.dropout

    # (len(sent)*len(doc)*batch)*n_e
    input_flat = idxs.ravel()
    true_recon = self.embedding_layer.recon_forward(input_flat)
    sent_output = apply_dropout(sent_output, dropout)
    pred_recon = recon_layer.forward(sent_output.reshape((len_sent * len_doc_batch, n_d)))

    # (len(sent)*len(doc)*batch)
    mask = T.cast(T.neq(input_flat, padding_id), theano.config.floatX)
    n = T.sum(mask)
    loss = T.sum((true_recon - pred_recon) ** 2, axis=1) * mask
    loss = T.sum(loss) / n
    return loss

def sample_weights(sizeX, sizeY, sparsity, scale, rng):
    """
    Initialization that fixes the largest singular value.
    """
    sizeX = int(sizeX)
    sizeY = int(sizeY)

    sparsity = numpy.minimum(sizeY, sparsity)
    values = numpy.zeros((sizeX, sizeY), dtype=theano.config.floatX)
    for dx in xrange(sizeX):
        perm = rng.permutation(sizeY)
        new_vals = rng.uniform(low=-scale, high=scale, size=(sparsity,))
        vals_norm = numpy.sqrt((new_vals**2).sum())
        new_vals = scale * new_vals / vals_norm
        values[dx, perm[:sparsity]] = new_vals
    _, v, _ = numpy.linalg.svd(values)
    values = scale * values / v[0]
    return values.astype(theano.config.floatX)

def categorical_crossentropy(logit, y, mask, length_var, need_softmax=False):
    logit_shp = logit.shape
    # (n_samples, n_timesteps_f, n_labels)
    # softmax, predict label prob
    # (n_samples * n_timesteps_f, n_labels)
    if need_softmax:
        probs = T.nnet.softmax(logit.reshape([logit_shp[0] * logit_shp[1], logit_shp[2]]))
    else:
        probs = logit.reshape([logit_shp[0] * logit_shp[1], logit_shp[2]])
    # (n_samples * n_timesteps_f)
    y_flat = y.flatten()
    # clip to avoid nan loss
    probs = T.clip(probs, _EPSILON, 1.0 - _EPSILON)
    loss = lasagne.objectives.categorical_crossentropy(probs, y_flat)
    # (n_samples, n_timesteps_f)
    loss = loss.reshape((logit_shp[0], logit_shp[1]))
    loss = loss * mask
    loss = T.sum(loss, axis=1) / length_var
    probs = probs.reshape(logit_shp)
    return loss, probs

def binary_crossentropy(logit, y, mask, length_var):
    logit_shp = logit.shape
    # logit_shp[2] == 1, i.e. n_labels is 1
    # (n_samples, n_timesteps_f, n_labels)
    # sigmoid, predict label prob
    # (n_samples * n_timesteps_f, n_labels)
    probs = T.nnet.sigmoid(logit.flatten())
    # (n_samples * n_timesteps_f)
    y_flat = y.flatten()
    loss = lasagne.objectives.binary_crossentropy(probs, y_flat)
    # (n_samples, n_timesteps_f)
    loss = loss.reshape((logit_shp[0], logit_shp[1]))
    loss = loss * mask
    loss = T.sum(loss, axis=1) / length_var
    # (n_samples, n_timesteps_f)
    probs = probs.reshape([logit_shp[0], logit_shp[1]])
    return loss, probs

def log_sum_exp(x, axis=1):
    m = T.max(x, axis=axis)
    return m + T.log(T.sum(T.exp(x - m.dimshuffle(0, 'x')), axis=axis))

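A quick check of log_sum_exp above, assuming a 2-D input and axis=1 (which is what the dimshuffle(0, 'x') pattern requires); the numbers are made up to show that the stabilized version stays finite where the naive computation overflows or underflows.

import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')
stable_lse = theano.function([x], log_sum_exp(x, axis=1))

a = np.array([[1000., 1001.],
              [-1000., -999.]], dtype=theano.config.floatX)
print(stable_lse(a))                  # finite: roughly [1001.31, -998.69]
print(np.log(np.exp(a).sum(axis=1)))  # naive version yields inf / -inf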
def softmax_loss(p_true, output_before_softmax):
    output_before_softmax -= T.max(output_before_softmax, axis=1, keepdims=True)
    if p_true.ndim == 2:
        return T.mean(T.log(T.sum(T.exp(output_before_softmax), axis=1)) - T.sum(p_true * output_before_softmax, axis=1))
    else:
        return T.mean(T.log(T.sum(T.exp(output_before_softmax), axis=1)) - output_before_softmax[T.arange(p_true.shape[0]), p_true])

def op_l2norm(s_x_, eps_=1e-6):
    return T.sqrt(eps_ + T.sum(T.sqr(s_x_)))

def op_cosine(s_u_, s_v_, flatten_=True, eps_=1e-6):
    if flatten_:
        s_u = s_u_.flatten()
        s_v = s_v_.flatten()
        return T.dot(s_u, s_v) / T.sqrt(eps_ + T.sum(T.sqr(s_u)) * T.sum(T.sqr(s_v)))
    else:
        s_u = s_u_
        s_v = s_v_
        # per-row cosine similarity along the last axis
        return T.sum(s_u * s_v, axis=-1) / T.sqrt(eps_ + T.sum(T.sqr(s_u), axis=-1) * T.sum(T.sqr(s_v), axis=-1))

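A small sanity check for op_cosine with made-up vectors: the cosine of a vector with itself should be close to 1, up to the eps_ smoothing term.

import numpy as np
import theano
import theano.tensor as T

u = T.vector('u')
cos_fn = theano.function([u], op_cosine(u, u))
print(cos_fn(np.array([3., 4.], dtype=theano.config.floatX)))  # ~1.0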
def compute_loss(output, num_samples, num_entries=6, gamma=500.0):
    """Compute the loss of a dataset, given the output of the DSSM.

    Args:
        output (:class:`lasagne.layers.Layer`): the output of the DSSM
        num_samples (int): the number of samples in the dataset
        num_entries (int): the number of compared papers in the DSSM structure
        gamma (float): the coefficient applied in the softmax of the similarities

    Returns:
        theano.tensor.TensorType: the loss of the dataset
    """
    assert (num_entries > 2)
    assert (num_samples > 0)

    # Post-NN operations to compute the loss
    # First, we extract the first output of each bundle
    mask = np.zeros(num_entries * num_samples)
    mask[::num_entries] = 1
    unmask = np.ones(num_entries * num_samples) - mask
    cited = T.extra_ops.compress(mask, output, axis=0)
    odocs = T.extra_ops.compress(unmask, output, axis=0)

    # We duplicate each row 'x' num_entries-1 times
    cited = T.extra_ops.repeat(cited, num_entries - 1, axis=0)
    # Then we compute element-wise product of x with each y, for each bundle
    sims = T.sum(cited * odocs, axis=1)
    # We reshape the similarities
    sims = T.reshape(sims, (num_samples, num_entries - 1))
    sims = gamma * sims
    # We take the softmax of each row
    probs = T.nnet.softmax(sims)
    # We compute the loss as the sum of element on the first column
    loss_mask = np.zeros(num_entries - 1)
    loss_mask[0] = 1
    loss = T.extra_ops.compress(loss_mask, probs, axis=1)
    return -T.log(T.prod(loss))

def softmax(x):
    e_x = T.exp(x - x.max(axis=0, keepdims=True))
    out = e_x / e_x.sum(axis=0, keepdims=True)
    return out

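Note that this softmax normalizes along axis=0, i.e. each column (not each row) sums to one. A brief sketch with made-up numbers:

import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')
col_softmax = theano.function([x], softmax(x))
out = col_softmax(np.array([[1., 2.], [3., 4.]], dtype=theano.config.floatX))
print(out.sum(axis=0))  # each column sums to 1: [ 1.  1.]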
def get_norm(x):
    x = np.array(x)
    return np.sum(x * x)

def l2_reg(params):
    return T.sum([T.sum(x ** 2) for x in params])

def l1_reg(params):
    return T.sum([T.sum(abs(x)) for x in params])

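A minimal sketch, with hypothetical shared parameters W and b, of how l2_reg and l1_reg above would typically be attached to a training cost as weight penalties:

import numpy as np
import theano
import theano.tensor as T

W = theano.shared(np.random.randn(3, 4).astype(theano.config.floatX), name='W')
b = theano.shared(np.zeros(4, dtype=theano.config.floatX), name='b')

# scalar penalty that would be added to the task loss before taking gradients
penalty = 1e-4 * l2_reg([W, b]) + 1e-5 * l1_reg([W, b])
print(penalty.eval())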
def __init__(self, incoming, b=lasagne.init.Constant(0.), g=lasagne.init.Constant(1.),
             W=lasagne.init.Normal(0.05), train_g=False, init_stdv=1., nonlinearity=relu, **kwargs):
    super(WeightNormLayer, self).__init__(incoming, **kwargs)
    self.nonlinearity = nonlinearity
    self.init_stdv = init_stdv
    k = self.input_shape[1]
    if b is not None:
        self.b = self.add_param(b, (k,), name="b", regularizable=False)
    if g is not None:
        self.g = self.add_param(g, (k,), name="g", regularizable=False, trainable=train_g)
    if len(self.input_shape) == 4:
        self.axes_to_sum = (0, 2, 3)
        self.dimshuffle_args = ['x', 0, 'x', 'x']
    else:
        self.axes_to_sum = 0
        self.dimshuffle_args = ['x', 0]

    # scale weights in layer below
    incoming.W_param = incoming.W
    #incoming.W_param.set_value(W.sample(incoming.W_param.get_value().shape))
    if incoming.W_param.ndim == 4:
        if isinstance(incoming, Deconv2DLayer):
            W_axes_to_sum = (0, 2, 3)
            W_dimshuffle_args = ['x', 0, 'x', 'x']
        else:
            W_axes_to_sum = (1, 2, 3)
            W_dimshuffle_args = [0, 'x', 'x', 'x']
    else:
        W_axes_to_sum = 0
        W_dimshuffle_args = ['x', 0]
    if g is not None:
        incoming.W = incoming.W_param * (self.g / T.sqrt(1e-6 + T.sum(T.square(incoming.W_param), axis=W_axes_to_sum))).dimshuffle(*W_dimshuffle_args)
    else:
        incoming.W = incoming.W_param / T.sqrt(1e-6 + T.sum(T.square(incoming.W_param), axis=W_axes_to_sum, keepdims=True))

def __init__(self, incoming, num_kernels, dim_per_kernel=5, theta=lasagne.init.Normal(0.05),
             log_weight_scale=lasagne.init.Constant(0.), b=lasagne.init.Constant(-1.), **kwargs):
    super(MinibatchLayer, self).__init__(incoming, **kwargs)
    self.num_kernels = num_kernels
    num_inputs = int(np.prod(self.input_shape[1:]))
    self.theta = self.add_param(theta, (num_inputs, num_kernels, dim_per_kernel), name="theta")
    self.log_weight_scale = self.add_param(log_weight_scale, (num_kernels, dim_per_kernel), name="log_weight_scale")
    self.W = self.theta * (T.exp(self.log_weight_scale) / T.sqrt(T.sum(T.square(self.theta), axis=0))).dimshuffle('x', 0, 1)
    self.b = self.add_param(b, (num_kernels,), name="b")

def get_output_for(self, input, init=False, **kwargs):
    if input.ndim > 2:
        # if the input has more than two dimensions, flatten it into a
        # batch of feature vectors.
        input = input.flatten(2)

    activation = T.tensordot(input, self.W, [[1], [0]])
    abs_dif = (T.sum(abs(activation.dimshuffle(0, 1, 2, 'x') - activation.dimshuffle('x', 1, 2, 0)), axis=2)
               + 1e6 * T.eye(input.shape[0]).dimshuffle(0, 'x', 1))

    if init:
        mean_min_abs_dif = 0.5 * T.mean(T.min(abs_dif, axis=2), axis=0)
        abs_dif /= mean_min_abs_dif.dimshuffle('x', 0, 'x')
        self.init_updates = [(self.log_weight_scale, self.log_weight_scale - T.log(mean_min_abs_dif).dimshuffle(0, 'x'))]

    f = T.sum(T.exp(-abs_dif), axis=2)

    if init:
        mf = T.mean(f, axis=0)
        f -= mf.dimshuffle('x', 0)
        self.init_updates.append((self.b, -mf))
    else:
        f += self.b.dimshuffle('x', 0)

    return T.concatenate([input, f], axis=1)

# Input Mixture of Gaussian Layer
def _read(w_read, memory):
    # w_read : (nb_sample, memory_dim)
    # memory : (nb_sample, memory_dim, memory_width)
    # return dot(w_read, memory)
    return T.sum(w_read[:, :, None] * memory, axis=1)

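Finally, a short sketch (arbitrary shapes, made-up data) showing the attention-weighted memory read performed by _read above: the result is one weighted sum over memory rows per sample.

import numpy as np
import theano
import theano.tensor as T

w = T.matrix('w_read')    # (nb_sample, memory_dim)
M = T.tensor3('memory')   # (nb_sample, memory_dim, memory_width)
read_fn = theano.function([w, M], _read(w, M))

out = read_fn(np.full((2, 4), 0.25, dtype=theano.config.floatX),
              np.random.randn(2, 4, 5).astype(theano.config.floatX))
print(out.shape)  # (2, 5): one read vector of width 5 per sample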