We extracted the following 50 code examples from open source Python projects to illustrate how to use theano.tensor.exp().
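Before the project excerpts, here is a minimal, self-contained sketch (written for this page, not taken from any of the projects below) of the pattern they all share: build a symbolic expression with T.exp and compile it with theano.function.

import numpy as np
import theano
import theano.tensor as T

x = T.dvector('x')                   # symbolic 1-D input
y = T.exp(x) / T.exp(x).sum()        # a (numerically naive) softmax built from T.exp
f = theano.function([x], y)          # compile the symbolic graph

print(f(np.array([0.0, 1.0, 2.0])))  # normalized weights that sum to 1
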
def svgd_kernel(self, h=-1):
    sq_dist = pdist(self.theta)
    pairwise_dists = squareform(sq_dist)**2
    if h < 0:  # if h < 0, using median trick
        h = np.median(pairwise_dists)
        h = np.sqrt(0.5 * h / np.log(self.theta.shape[0] + 1))

    # compute the rbf kernel
    Kxy = np.exp(-pairwise_dists / h**2 / 2)

    dxkxy = -np.matmul(Kxy, self.theta)
    sumkxy = np.sum(Kxy, axis=1)
    for i in range(self.theta.shape[1]):
        dxkxy[:, i] = dxkxy[:, i] + np.multiply(self.theta[:, i], sumkxy)
    dxkxy = dxkxy / (h**2)
    return (Kxy, dxkxy)

def nll_loss_sharedparams(self, mus, sigmas, corxy, pis, y_true):
    mus_ex = mus[np.newaxis, :, :]
    X = y_true[:, np.newaxis, :]
    diff = X - mus_ex
    diffprod = T.prod(diff, axis=-1)
    corxy2 = corxy ** 2
    diff2 = diff ** 2
    sigmas2 = sigmas ** 2
    sigmainvs = 1.0 / sigmas
    sigmainvprods = sigmainvs[:, 0] * sigmainvs[:, 1]
    diffsigma = diff2 / sigmas2
    diffsigmanorm = T.sum(diffsigma, axis=-1)
    z = diffsigmanorm - 2 * corxy * diffprod * sigmainvprods
    oneminuscorxy2inv = 1.0 / (1.0 - corxy2)
    expterm = -0.5 * z * oneminuscorxy2inv
    new_exponent = T.log(0.5 / np.pi) + T.log(sigmainvprods) + T.log(np.sqrt(oneminuscorxy2inv)) + expterm + T.log(pis)
    max_exponent = T.max(new_exponent, axis=1, keepdims=True)
    mod_exponent = new_exponent - max_exponent
    gauss_mix = T.sum(T.exp(mod_exponent), axis=1)
    log_gauss = max_exponent + T.log(gauss_mix)
    loss = -T.mean(log_gauss)
    return loss

def rbf_kernel(X0):
    XY = T.dot(X0, X0.transpose())
    x2 = T.reshape(T.sum(T.square(X0), axis=1), (X0.shape[0], 1))
    X2e = T.repeat(x2, X0.shape[0], axis=1)
    H = T.sub(T.add(X2e, X2e.transpose()), 2 * XY)

    V = H.flatten()
    # median distance
    h = T.switch(T.eq((V.shape[0] % 2), 0),
                 # if even vector
                 T.mean(T.sort(V)[((V.shape[0] // 2) - 1):((V.shape[0] // 2) + 1)]),
                 # if odd vector
                 T.sort(V)[V.shape[0] // 2])

    h = T.sqrt(0.5 * h / T.log(X0.shape[0].astype('float32') + 1.0)) / 2.

    Kxy = T.exp(-H / h ** 2 / 2.0)
    neighbors = T.argsort(H, axis=1)[:, 1]
    return Kxy, neighbors, h

def rbf_kernel(X):
    XY = T.dot(X, X.T)
    x2 = T.sum(X**2, axis=1).dimshuffle(0, 'x')
    X2e = T.repeat(x2, X.shape[0], axis=1)
    H = X2e + X2e.T - 2. * XY

    V = H.flatten()
    # median distance
    h = T.switch(T.eq((V.shape[0] % 2), 0),
                 # if even vector
                 T.mean(T.sort(V)[((V.shape[0] // 2) - 1):((V.shape[0] // 2) + 1)]),
                 # if odd vector
                 T.sort(V)[V.shape[0] // 2])

    h = T.sqrt(.5 * h / T.log(H.shape[0].astype('float32') + 1.))

    # compute the rbf kernel
    kxy = T.exp(-H / (h ** 2) / 2.0)

    dxkxy = -T.dot(kxy, X)
    sumkxy = T.sum(kxy, axis=1).dimshuffle(0, 'x')
    dxkxy = T.add(dxkxy, T.mul(X, sumkxy)) / (h ** 2)
    return kxy, dxkxy

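A minimal compile-and-run sketch for the rbf_kernel defined just above (the input matrix X_np and its size are made up for illustration; assumes the function is already in scope):

import numpy as np
import theano
import theano.tensor as T

X = T.matrix('X')
kxy, dxkxy = rbf_kernel(X)                       # symbolic kernel matrix and its gradient term
f = theano.function([X], [kxy, dxkxy], allow_input_downcast=True)

X_np = np.random.randn(5, 3).astype('float32')   # 5 particles in 3 dimensions (made-up data)
K, dK = f(X_np)
print(K.shape, dK.shape)                         # (5, 5) and (5, 3)
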
def evaluation(self, X_test, y_test):
    # normalization
    X_test = self.normalization(X_test)

    # average over the output
    pred_y_test = np.zeros([self.M, len(y_test)])
    prob = np.zeros([self.M, len(y_test)])

    '''
        Since we have M particles, we use a Bayesian view to calculate rmse and log-likelihood
    '''
    for i in range(self.M):
        w1, b1, w2, b2, loggamma, loglambda = self.unpack_weights(self.theta[i, :])
        pred_y_test[i, :] = self.nn_predict(X_test, w1, b1, w2, b2) * self.std_y_train + self.mean_y_train
        prob[i, :] = np.sqrt(np.exp(loggamma)) / np.sqrt(2 * np.pi) * np.exp(-1 * (np.power(pred_y_test[i, :] - y_test, 2) / 2) * np.exp(loggamma))
    pred = np.mean(pred_y_test, axis=0)

    # evaluation
    svgd_rmse = np.sqrt(np.mean((pred - y_test)**2))
    svgd_ll = np.mean(np.log(np.mean(prob, axis=0)))

    return (svgd_rmse, svgd_ll)

def matrix2att(self, matrix):
    '''Input is vector of size (batch_size,5) in theano terms'''
    g_hat_x = matrix[:, 0]
    g_hat_y = matrix[:, 1]
    log_delta = matrix[:, 2]
    log_sigma_sqr = matrix[:, 3]
    log_gamma = matrix[:, 4]

    g_x = (self.A + 1.0) / 2.0 * (g_hat_x + 1.0)
    g_y = (self.B + 1.0) / 2.0 * (g_hat_y + 1.0)
    delta = (max(self.A, self.B) - 1.0) / (self.N - 1) * T.exp(log_delta)
    gamma = T.exp(log_gamma).dimshuffle(0, 'x')
    sigma = T.exp(log_sigma_sqr / 2.0)
    return g_y, g_x, delta, sigma, gamma

def matrix2att_cpu(self, matrix):
    '''Input is vector of size (batch_size,5) in numpy terms'''
    g_hat_x = matrix[:, 0]
    g_hat_y = matrix[:, 1]
    log_delta = matrix[:, 2]
    log_sigma_sqr = matrix[:, 3]
    log_gamma = matrix[:, 4]

    g_x = (self.A + 1.0) / 2.0 * (g_hat_x + 1.0)
    g_y = (self.B + 1.0) / 2.0 * (g_hat_y + 1.0)
    delta = (max(self.A, self.B) - 1.0) / (self.N - 1) * np.exp(log_delta)
    gamma = np.exp(log_gamma)
    sigma = np.exp(log_sigma_sqr / 2.0)
    return g_y, g_x, delta, sigma, gamma

def iou_loss(p, t):
    # print "pass"
    tp, tt = p.reshape((p.shape[0], 2, 2)), t.reshape((t.shape[0], 2, 2))
    overlaps_t0 = T.maximum(tp[:, 0, :], tt[:, 0, :])
    overlaps_t1 = T.minimum(tp[:, 1, :], tt[:, 1, :])
    intersection = overlaps_t1 - overlaps_t0
    bool_overlap = T.min(intersection, axis=1) > 0
    intersection = intersection[:, 0] * intersection[:, 1]
    intersection = T.maximum(intersection, np.float32(0.))
    dims_p = tp[:, 1, :] - tp[:, 0, :]
    areas_p = dims_p[:, 0] * dims_p[:, 1]
    dims_t = tt[:, 1, :] - tt[:, 0, :]
    areas_t = dims_t[:, 0] * dims_t[:, 1]
    union = areas_p + areas_t - intersection
    loss = 1. - T.minimum(
        T.exp(T.log(T.abs_(intersection)) - T.log(T.abs_(union) + np.float32(1e-5))),
        np.float32(1.)
    )
    # return loss
    return T.mean(loss)

def iou_loss_val(p, t):
    tp, tt = p.reshape((p.shape[0], 2, 2)), t.reshape((t.shape[0], 2, 2))
    overlaps = np.zeros_like(tp, dtype=np.float32)
    overlaps[:, 0, :] = np.maximum(tp[:, 0, :], tt[:, 0, :])
    overlaps[:, 1, :] = np.minimum(tp[:, 1, :], tt[:, 1, :])
    intersection = overlaps[:, 1, :] - overlaps[:, 0, :]
    bool_overlap = np.min(intersection, axis=1) > 0
    intersection = intersection[:, 0] * intersection[:, 1]
    intersection = np.maximum(intersection, 0.)
    # print "bool", bool_overlap
    # print "Int", intersection
    dims_p = tp[:, 1, :] - tp[:, 0, :]
    areas_p = dims_p[:, 0] * dims_p[:, 1]
    dims_t = tt[:, 1, :] - tt[:, 0, :]
    areas_t = dims_t[:, 0] * dims_t[:, 1]
    union = areas_p + areas_t - intersection
    # print "un", union
    loss = 1. - np.minimum(
        np.exp(np.log(np.abs(intersection)) - np.log(np.abs(union) + 1e-5)),
        1.
    )
    # print loss
    return np.mean(loss)

def get_output_for(self, input, **kwargs): """ Given 2d input find the probability of each input in each of num_units Diagonal Gaussians using the formula from http://mathworld.wolfram.com/BivariateNormalDistribution.html """ #make sure sigma is positive and nonzero softplus(x) (0, +inf) sigmas = T.nnet.softplus(self.sigmas) sigmainvs = 1.0 / sigmas sigmainvprods = sigmainvs[:, 0] * sigmainvs[:, 1] sigmas2 = sigmas ** 2 mus = self.mus[np.newaxis, :, :] X = input[:, np.newaxis, :] diff = (X - mus) ** 2 diffsigma = diff / sigmas2 diffsigmanorm = T.sum(diffsigma, axis=-1) expterm = T.exp(-0.5 * diffsigmanorm) probs = (0.5 / np.pi) * sigmainvprods * expterm return probs
def log_marginal(self, y, h, py, q):
    '''Computes the approximate log marginal.

    Uses \log \sum p / q - \log N

    Args:
        y: T.tensor, target values.
        h: T.tensor, latent samples.
        py: T.tensor, conditional density p(y | h)
        q: approximate posterior q(h | y)

    Returns:
        approximate log marginal.
    '''
    log_py_h = -self.conditional.neg_log_prob(y, py)
    log_ph = -self.prior.neg_log_prob(h)
    log_qh = -self.posterior.neg_log_prob(h, q)
    assert log_py_h.ndim == log_ph.ndim == log_qh.ndim

    log_p = log_py_h + log_ph - log_qh
    log_p_max = T.max(log_p, axis=0, keepdims=True)
    w = T.exp(log_p - log_p_max)

    return (T.log(w.mean(axis=0, keepdims=True)) + log_p_max).mean()

def step_free_energy(self, x, beta, *params):
    '''Step free energy function.

    Args:
        x (T.tensor): data sample.
        beta (float): beta value for annealing.
        *params: theano shared variables.

    Returns:
        T.tensor: free energy.
    '''
    W, v_params, h_params = self.split_params(*params)

    vis_term = beta * self.v_dist.get_energy_bias(x, *v_params)
    x = self.v_dist.scale_for_energy_model(x, *v_params)
    hid_act = beta * (T.dot(x, W) + self.h_dist.get_center(*h_params))
    fe = -vis_term - T.log(1. + T.exp(hid_act)).sum(axis=1)
    return fe

def step_free_energy_h(self, h, beta, *params):
    '''Step free energy function for hidden states.

    Args:
        h (T.tensor): hidden sample.
        beta (float): beta value for annealing.
        *params: theano shared variables.

    Returns:
        T.tensor: free energy.
    '''
    W, v_params, h_params = self.split_params(*params)

    hid_term = beta * self.h_dist.get_energy_bias(h, *h_params)
    h = self.h_dist.scale_for_energy_model(h, *h_params)
    vis_act = beta * (T.dot(h, W.T) + self.v_dist.get_center(*v_params))
    fe = -hid_term - T.log(1. + T.exp(vis_act)).sum(axis=1)
    return fe

def ctc_update_log_p(skip_idxs, zeros, active, log_p_curr, log_p_prev):
    active_skip_idxs = skip_idxs[(skip_idxs < active).nonzero()]
    active_next = T.cast(T.minimum(
        T.maximum(
            active + 1,
            T.max(T.concatenate([active_skip_idxs, [-1]])) + 2 + 1
        ), log_p_curr.shape[0]), 'int32')

    common_factor = T.max(log_p_prev[:active])
    p_prev = T.exp(log_p_prev[:active] - common_factor)
    _p_prev = zeros[:active_next]
    # copy over
    _p_prev = T.set_subtensor(_p_prev[:active], p_prev)
    # previous transitions
    _p_prev = T.inc_subtensor(_p_prev[1:], _p_prev[:-1])
    # skip transitions
    _p_prev = T.inc_subtensor(_p_prev[active_skip_idxs + 2], p_prev[active_skip_idxs])
    updated_log_p_prev = T.log(_p_prev) + common_factor

    log_p_next = T.set_subtensor(
        zeros[:active_next],
        log_p_curr[:active_next] + updated_log_p_prev
    )
    return active_next, log_p_next

def kl_sym(self, old_dist_info_vars, new_dist_info_vars):
    old_means = old_dist_info_vars["mean"]
    old_log_stds = old_dist_info_vars["log_std"]
    new_means = new_dist_info_vars["mean"]
    new_log_stds = new_dist_info_vars["log_std"]
    """
    Compute the KL divergence of two multivariate Gaussian distributions with
    diagonal covariance matrices
    """
    old_std = TT.exp(old_log_stds)
    new_std = TT.exp(new_log_stds)
    # means: (N*A)
    # std: (N*A)
    # formula:
    # { (\mu_1 - \mu_2)^2 + \sigma_1^2 - \sigma_2^2 } / (2\sigma_2^2) +
    # ln(\sigma_2/\sigma_1)
    numerator = TT.square(old_means - new_means) + \
                TT.square(old_std) - TT.square(new_std)
    denominator = 2 * TT.square(new_std) + 1e-8
    return TT.sum(
        numerator / denominator + new_log_stds - old_log_stds, axis=-1)

def kl(self, old_dist_info, new_dist_info):
    old_means = old_dist_info["mean"]
    old_log_stds = old_dist_info["log_std"]
    new_means = new_dist_info["mean"]
    new_log_stds = new_dist_info["log_std"]
    """
    Compute the KL divergence of two multivariate Gaussian distributions with
    diagonal covariance matrices
    """
    old_std = np.exp(old_log_stds)
    new_std = np.exp(new_log_stds)
    # means: (N*A)
    # std: (N*A)
    # formula:
    # { (\mu_1 - \mu_2)^2 + \sigma_1^2 - \sigma_2^2 } / (2\sigma_2^2) +
    # ln(\sigma_2/\sigma_1)
    numerator = np.square(old_means - new_means) + \
                np.square(old_std) - np.square(new_std)
    denominator = 2 * np.square(new_std) + 1e-8
    return np.sum(
        numerator / denominator + new_log_stds - old_log_stds, axis=-1)

def svgd_gradient(X0):
    hidden, _, mse = discrim(X0)
    grad = -1.0 * T.grad(mse.sum(), X0)

    kxy, neighbors, h = rbf_kernel(hidden)  # TODO

    coff = T.exp(-T.sum((hidden[neighbors] - hidden)**2, axis=1) / h**2 / 2.0)
    v = coff.dimshuffle(0, 'x') * (-hidden[neighbors] + hidden) / h**2

    X1 = X0[neighbors]
    hidden1, _, _ = discrim(X1)
    dxkxy = T.Lop(hidden1, X1, v)

    # svgd_grad = (T.dot(kxy, T.flatten(grad, 2)).reshape(dxkxy.shape) + dxkxy) / T.sum(kxy, axis=1).dimshuffle(0, 'x', 'x', 'x')
    svgd_grad = grad + dxkxy / 2.
    return grad, svgd_grad, dxkxy

def metropolis_hastings_accept(energy_prev, energy_next, s_rng):
    """
    Performs a Metropolis-Hastings accept-reject move.

    Parameters
    ----------
    energy_prev: theano vector
        Symbolic theano tensor which contains the energy associated with the
        configuration at time-step t.
    energy_next: theano vector
        Symbolic theano tensor which contains the energy associated with the
        proposed configuration at time-step t+1.
    s_rng: theano.tensor.shared_randomstreams.RandomStreams
        Theano shared random stream object used to generate the random number
        used in proposal.

    Returns
    -------
    return: boolean
        True if move is accepted, False otherwise
    """
    ediff = energy_prev - energy_next
    return (TT.exp(ediff) - s_rng.uniform(size=energy_prev.shape)) >= 0

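A minimal sketch of how the accept-reject test above might be exercised (the energy values are made up; assumes the alias TT = theano.tensor used in the snippet):

import numpy as np
import theano
import theano.tensor as TT
from theano.tensor.shared_randomstreams import RandomStreams

s_rng = RandomStreams(seed=1234)
e_prev = TT.vector('energy_prev')
e_next = TT.vector('energy_next')
accept = metropolis_hastings_accept(e_prev, e_next, s_rng)
f = theano.function([e_prev, e_next], accept)

# A lower proposed energy is always accepted; a higher one is accepted with probability exp(-delta).
print(f(np.array([1.0, 1.0]), np.array([0.5, 5.0])))
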
def apply_activation(self, lin_output, activation):
    if activation == 'SIGMOID':
        final_output = T.nnet.sigmoid(lin_output)

    elif activation == 'TANH':
        final_output = T.tanh(lin_output)

    elif activation == 'LINEAR':
        final_output = lin_output

    elif activation == 'ReLU':    ## rectifier linear unit
        final_output = T.maximum(0.0, lin_output)

    elif activation == 'ReSU':    ## rectifier smooth unit (softplus)
        final_output = T.log(1.0 + T.exp(lin_output))

    else:
        self.logger.critical('the input activation function: %s is not supported right now. Please modify layers.py to support' % (activation))
        raise ValueError('unsupported activation function: %s' % activation)

    return final_output

def EGD(cost, params, learning_rate=0.33, constraint=1.0):
    updates = OrderedDict()
    grads = T.grad(cost, params)
    U = T.constant(constraint)

    # first half of params
    rw_pos = T.exp(-learning_rate * U * grads[0])
    rb_pos = T.exp(-learning_rate * U * grads[1])
    # second half
    rw_neg = 1 / rw_pos
    rb_neg = 1 / rb_pos

    rs = [rw_pos, rb_pos, rw_neg, rb_neg]
    partition = T.sum(params[0] * rs[0]) + T.sum(params[1] * rs[1]) + T.sum(params[2] * rs[2]) + T.sum(params[3] * rs[3])

    for param, r in zip(params, rs):
        updates[param] = U * param * r / partition

    return updates

def stable_softmax(y_hat):
    """Calculate softmax and log softmax in a numerically stable way

    Parameters
    ----------
    y_hat : tensor3 (input_seq_len, num_batch, num_classes+1)
        class energies

    Returns
    -------
    softmax values in normal and log domain
    """
    y_hat_safe = y_hat - y_hat.max(axis=2, keepdims=True)
    y_hat_safe_exp = T.exp(y_hat_safe)
    y_hat_safe_normalizer = y_hat_safe_exp.sum(axis=2, keepdims=True)
    log_y_hat_safe_normalizer = T.log(y_hat_safe_normalizer)

    y_hat_softmax = y_hat_safe_exp / y_hat_safe_normalizer
    log_y_hat_softmax = y_hat_safe - log_y_hat_safe_normalizer

    return y_hat_softmax, log_y_hat_softmax

def get_output_for(self, input, **kwargs):
    distances = conv_pairwise_distance(input, self.V)
    similarities = T.exp(-distances / T.abs_(self.gamma))
    norm = T.sum(similarities, 1).reshape((similarities.shape[0], 1, similarities.shape[2], similarities.shape[3]))
    membership = similarities / (norm + self.eps)

    histogram = T.mean(membership, axis=(2, 3))
    if self.spatial_level == 1:
        pivot1, pivot2 = membership.shape[2] / 2, membership.shape[3] / 2
        h1 = T.mean(membership[:, :, :pivot1, :pivot2], axis=(2, 3))
        h2 = T.mean(membership[:, :, :pivot1, pivot2:], axis=(2, 3))
        h3 = T.mean(membership[:, :, pivot1:, :pivot2], axis=(2, 3))
        h4 = T.mean(membership[:, :, pivot1:, pivot2:], axis=(2, 3))
        # Pyramid is not used in the paper
        # histogram = T.horizontal_stack(h1, h2, h3, h4)
        histogram = T.horizontal_stack(histogram, h1, h2, h3, h4)
    return histogram

def theano_logsumexp(x, axis=None):
    """
    Compute log(sum(exp(x), axis=axis)) in a numerically stable fashion.

    Parameters
    ----------
    x : tensor_like
        A Theano tensor (any dimension will do).
    axis : int or symbolic integer scalar, or None
        Axis over which to perform the summation. `None`, the
        default, performs over all axes.

    Returns
    -------
    result : ndarray or scalar
        The result of the log(sum(exp(...))) operation.
    """
    xmax = x.max(axis=axis, keepdims=True)
    xmax_ = x.max(axis=axis)
    return xmax_ + T.log(T.exp(x - xmax).sum(axis=axis))

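A quick check of theano_logsumexp against the naive computation (the test values are made up; the large entries show why the max-subtraction trick matters):

import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')
f = theano.function([x], theano_logsumexp(x, axis=1))

x_np = np.array([[1000., 1000.], [0., 0.]], dtype=theano.config.floatX)
print(f(x_np))                                  # approx [1000.693, 0.693]
print(np.log(np.sum(np.exp(x_np), axis=1)))     # naive version overflows to [inf, 0.693]
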
def sequence_iteration(self, output, mask, use_dropout=0, dropout_value=0.5):
    dot_product = T.dot(output, self.t_w_out)
    net_o = T.add(dot_product, self.t_b_out)

    ex_net = T.exp(net_o)
    sum_net = T.sum(ex_net, axis=2, keepdims=True)
    softmax_o = ex_net / sum_net

    mask = T.addbroadcast(mask, 2)  # TODO: necessary?
    output = T.mul(mask, softmax_o) + T.mul((1. - mask), 1e-6)

    return output  # result

def build_vae_loss(input_var, l_z_mu, l_z_ls, l_x_mu_list, l_x_ls_list, l_x_list, l_x,
                   deterministic, binary, L):
    layer_outputs = nn.layers.get_output([l_z_mu, l_z_ls] + l_x_mu_list + l_x_ls_list + l_x_list + [l_x],
                                         deterministic=deterministic)
    z_mu = layer_outputs[0]
    z_ls = layer_outputs[1]
    x_mu = [] if binary else layer_outputs[2:2+L]
    x_ls = [] if binary else layer_outputs[2+L:2+2*L]
    x_list = layer_outputs[2:2+L] if binary else layer_outputs[2+2*L:2+3*L]
    x = layer_outputs[-1]

    kl_div = 0.5 * T.sum(1 + 2*z_ls - T.sqr(z_mu) - T.exp(2 * z_ls))

    if binary:
        logpxz = sum(nn.objectives.binary_crossentropy(x, input_var).sum() for x in x_list) * (-1. / L)
        prediction = x_list[0] if deterministic else x
    else:
        logpxz = sum(log_likelihood(input_var.flatten(2), mu, ls) for mu, ls in zip(x_mu, x_ls)) / L
        prediction = x_mu[0] if deterministic else T.sum(x_mu, axis=0) / L

    loss = -1 * (logpxz + kl_div)
    return loss, prediction

def sym_logdensity(self, x):
    """ x is a matrix of column datapoints (VxB) V = n_visible, B = batch size """
    def density_given_previous_a_and_x(x, w, V_alpha, b_alpha, V_mu, b_mu, V_sigma, b_sigma,
                                       activations_factor, p_prev, a_prev, x_prev):
        a = a_prev + T.dot(T.shape_padright(x_prev, 1), T.shape_padleft(w, 1))
        h = self.nonlinearity(a * activations_factor)  # BxH

        Alpha = T.nnet.softmax(T.dot(h, V_alpha) + T.shape_padleft(b_alpha))  # BxC
        Mu = T.dot(h, V_mu) + T.shape_padleft(b_mu)  # BxC
        Sigma = T.exp((T.dot(h, V_sigma) + T.shape_padleft(b_sigma)))  # BxC
        p = p_prev + log_sum_exp(-constantX(0.5) * T.sqr((Mu - T.shape_padright(x, 1)) / Sigma)
                                 - T.log(Sigma) - constantX(0.5 * np.log(2 * np.pi))
                                 + T.log(Alpha))
        return (p, a, x)

    # First element is different (it is predicted from the bias only)
    a0 = T.zeros_like(T.dot(x.T, self.W))  # BxH
    p0 = T.zeros_like(x[0])
    x0 = T.ones_like(x[0])
    ([ps, _as, _xs], updates) = theano.scan(density_given_previous_a_and_x,
                                            sequences=[x, self.W, self.V_alpha, self.b_alpha, self.V_mu, self.b_mu,
                                                       self.V_sigma, self.b_sigma, self.activation_rescaling],
                                            outputs_info=[p0, a0, x0])
    return (ps[-1], updates)

def sample(self, n):
    W = self.W.get_value()
    V_alpha = self.V_alpha.get_value()
    b_alpha = self.b_alpha.get_value()
    V_mu = self.V_mu.get_value()
    b_mu = self.b_mu.get_value()
    V_sigma = self.V_sigma.get_value()
    b_sigma = self.b_sigma.get_value()
    activation_rescaling = self.activation_rescaling.get_value()
    samples = np.zeros((self.n_visible, n))
    for s in xrange(n):
        a = np.zeros((self.n_hidden,))  # H
        for i in xrange(self.n_visible):
            if i == 0:
                a = W[i, :]
            else:
                a = a + W[i, :] * samples[i - 1, s]
            h = self.parameters["nonlinearity"].get_numpy_f()(a * activation_rescaling[i])
            alpha = Utils.nnet.softmax(np.dot(h, V_alpha[i]) + b_alpha[i])  # C
            Mu = np.dot(h, V_mu[i]) + b_mu[i]  # C
            Sigma = np.minimum(np.exp(np.dot(h, V_sigma[i]) + b_sigma[i]), 1)
            comp = Utils.nnet.random_component(alpha)
            samples[i, s] = np.random.normal(Mu[comp], Sigma[comp])
    return samples

def sample(self, n):
    W = self.W.get_value()
    V_alpha = self.V_alpha.get_value()
    b_alpha = self.b_alpha.get_value()
    V_mu = self.V_mu.get_value()
    b_mu = self.b_mu.get_value()
    V_sigma = self.V_sigma.get_value()
    b_sigma = self.b_sigma.get_value()
    activation_rescaling = self.activation_rescaling.get_value()
    samples = np.zeros((self.n_visible, n))
    for s in xrange(n):
        a = np.zeros((self.n_hidden,))  # H
        for i in xrange(self.n_visible):
            if i == 0:
                a = W[i, :]
            else:
                a = a + W[i, :] * samples[i - 1, s]
            h = self.parameters["nonlinearity"].get_numpy_f()(a * activation_rescaling[i])
            alpha = Utils.nnet.softmax(np.dot(h, V_alpha[i]) + b_alpha[i])  # C
            Mu = np.dot(h, V_mu[i]) + b_mu[i]  # C
            # Sigma = np.minimum(np.exp(np.dot(h, V_sigma[i]) + b_sigma[i]), 1)
            Sigma = np.exp(np.dot(h, V_sigma[i]) + b_sigma[i])
            comp = Utils.nnet.random_component(alpha)
            samples[i, s] = np.random.laplace(Mu[comp], Sigma[comp])
    return samples

def conditional_logdensities(self, x_lt_i, range):
    raise Exception("Not implemented")

    W = self.W.get_value()
    V_alpha = self.V_alpha.get_value()
    b_alpha = self.b_alpha.get_value()
    V_mu = self.V_mu.get_value()
    b_mu = self.b_mu.get_value()
    V_sigma = self.V_sigma.get_value()
    b_sigma = self.b_sigma.get_value()
    activation_rescaling = self.activation_rescaling.get_value()

    # Calculate
    i = len(x_lt_i)
    a = W[0, :] + np.dot(x_lt_i, W[1:len(x_lt_i) + 1, :])
    h = self.parameters["nonlinearity"].get_numpy_f()(a * activation_rescaling[i])
    alpha = Utils.nnet.softmax(np.tanh(np.dot(h, V_alpha[i]) + b_alpha[i]) * 10.0)  # C
    Mu = np.dot(h, V_mu[i]) + b_mu[i]  # C
    Sigma = np.log(1.0 + np.exp((np.dot(h, V_sigma[i]) + b_sigma[i]) * 10)) / 10  # C

    def ld(x):
        lds = np.array([scipy.stats.norm.logpdf(x, Mu[c], Sigma[c]) for c in xrange(self.n_components)])
        return Utils.nnet.logsumexp(lds + np.log(alpha))

    return np.array([ld(x) for x in range])

def log_sum_exp(x, axis=1):
    m = T.max(x, axis=axis)
    return m + T.log(T.sum(T.exp(x - m.dimshuffle(0, 'x')), axis=axis))

def softmax_loss(p_true, output_before_softmax):
    output_before_softmax -= T.max(output_before_softmax, axis=1, keepdims=True)
    if p_true.ndim == 2:
        return T.mean(T.log(T.sum(T.exp(output_before_softmax), axis=1)) - T.sum(p_true * output_before_softmax, axis=1))
    else:
        return T.mean(T.log(T.sum(T.exp(output_before_softmax), axis=1)) - output_before_softmax[T.arange(p_true.shape[0]), p_true])

def GMM_nll(x, mus, sigmas, mix_weights):
    """
    D is dimension of each observation (e.g. frame_size) for each component
    (multivariate Normal with diagonal covariance matrix)
    See `gaussian_nll`

    x : (batch_size, D)
    mus : (batch_size, D, num_gaussians)
    sigmas : (batch_size, D, num_gaussians)
    mix_weights : (batch_size, num_gaussians)
    """
    x = x.dimshuffle(0, 1, 'x')

    # Similar to `gaussian_nll`
    ll_component_wise = lib.floatX(numpy.log(2. * numpy.pi))
    ll_component_wise += 2. * T.log(sigmas)
    ll_component_wise += ((x - mus) / sigmas) ** 2.
    ll_component_wise = ll_component_wise.sum(axis=1)  # on FRAME_SIZE
    ll_component_wise *= lib.floatX(-0.5)  # LL not NLL

    # Now ready to take care of weights of each component
    # Simply applying exp could potentially cause inf/NaN.
    # Look up LogSumExp trick, Softmax in theano, or this:
    # hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/
    weighted_ll = ll_component_wise + T.log(mix_weights)
    ll_max = T.max(weighted_ll, axis=1, keepdims=True)
    nll = T.log(T.sum(T.exp(weighted_ll - ll_max), axis=1, keepdims=True))
    nll += ll_max
    nll = -nll.sum(axis=1)
    return nll

def softmax(x):
    e_x = T.exp(x - x.max(axis=0, keepdims=True))
    out = e_x / e_x.sum(axis=0, keepdims=True)
    return out

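A minimal sketch showing the column-wise (axis=0) normalization of the softmax above; the input values are made up:

import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')
f = theano.function([x], softmax(x))

logits = np.array([[1., 2.], [3., 4.]], dtype=theano.config.floatX)
probs = f(logits)
print(probs.sum(axis=0))   # each column sums to 1, since normalization is over axis=0
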
def __init__(self, incoming, num_kernels, dim_per_kernel=5, theta=lasagne.init.Normal(0.05),
             log_weight_scale=lasagne.init.Constant(0.), b=lasagne.init.Constant(-1.), **kwargs):
    super(MinibatchLayer, self).__init__(incoming, **kwargs)
    self.num_kernels = num_kernels
    num_inputs = int(np.prod(self.input_shape[1:]))
    self.theta = self.add_param(theta, (num_inputs, num_kernels, dim_per_kernel), name="theta")
    self.log_weight_scale = self.add_param(log_weight_scale, (num_kernels, dim_per_kernel), name="log_weight_scale")
    self.W = self.theta * (T.exp(self.log_weight_scale) / T.sqrt(T.sum(T.square(self.theta), axis=0))).dimshuffle('x', 0, 1)
    self.b = self.add_param(b, (num_kernels,), name="b")

def get_output_for(self, input, init=False, **kwargs):
    if input.ndim > 2:
        # if the input has more than two dimensions, flatten it into a
        # batch of feature vectors.
        input = input.flatten(2)

    activation = T.tensordot(input, self.W, [[1], [0]])
    abs_dif = (T.sum(abs(activation.dimshuffle(0, 1, 2, 'x') - activation.dimshuffle('x', 1, 2, 0)), axis=2)
               + 1e6 * T.eye(input.shape[0]).dimshuffle(0, 'x', 1))

    if init:
        mean_min_abs_dif = 0.5 * T.mean(T.min(abs_dif, axis=2), axis=0)
        abs_dif /= mean_min_abs_dif.dimshuffle('x', 0, 'x')
        self.init_updates = [(self.log_weight_scale, self.log_weight_scale - T.log(mean_min_abs_dif).dimshuffle(0, 'x'))]

    f = T.sum(T.exp(-abs_dif), axis=2)

    if init:
        mf = T.mean(f, axis=0)
        f -= mf.dimshuffle('x', 0)
        self.init_updates.append((self.b, -mf))
    else:
        f += self.b.dimshuffle('x', 0)

    return T.concatenate([input, f], axis=1)

def gaussian_kl_divergence(mean, ln_var):
    """Computes the KL-divergence of Gaussian variables from the standard one.

    Given two variables ``mean`` representing :math:`\\mu` and ``ln_var``
    representing :math:`\\log(\\sigma^2)`, this function returns a variable
    representing the KL-divergence between the given multi-dimensional
    Gaussian :math:`N(\\mu, S)` and the standard Gaussian :math:`N(0, I)`

    .. math::

        D_{\\mathbf{KL}}(N(\\mu, S) \\| N(0, I)),

    where :math:`S` is a diagonal matrix such that :math:`S_{ii} = \\sigma_i^2`
    and :math:`I` is an identity matrix.

    Args:
        mean (~chainer.Variable): A variable representing mean of given
            gaussian distribution, :math:`\\mu`.
        ln_var (~chainer.Variable): A variable representing logarithm of
            variance of given gaussian distribution, :math:`\\log(\\sigma^2)`.

    Returns:
        ~chainer.Variable: A variable representing KL-divergence between
            given gaussian distribution and the standard gaussian.
    """
    var = T.exp(ln_var)
    return 0.5 * T.sum(mean * mean + var - ln_var - 1, 1)

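A small sanity check for gaussian_kl_divergence (the values are chosen for illustration): the KL term is zero exactly when mean = 0 and ln_var = 0, i.e. when the distribution equals the standard Gaussian.

import numpy as np
import theano
import theano.tensor as T

mean = T.matrix('mean')
ln_var = T.matrix('ln_var')
kl = gaussian_kl_divergence(mean, ln_var)   # one KL value per row (per sample)
f = theano.function([mean, ln_var], kl)

zeros = np.zeros((2, 3), dtype=theano.config.floatX)
print(f(zeros, zeros))                      # -> [0. 0.]
print(f(zeros + 1.0, zeros))                # -> [1.5 1.5], i.e. 0.5 * sum of squared means
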
def get_output_for(self, input, **kwargs):
    rectified = nonlinearities.softplus(input)
    sum_rect = T.sum(rectified, axis=(1, 2))
    output = 1 - T.exp(-sum_rect)
    return output

def __init__(self, incoming, exp=nn.init.Constant(2.), **kwargs):
    super(AggSoPP, self).__init__(incoming, **kwargs)
    self.exp = self.add_param(exp, (1,), name='exp', regularizable=False)

def get_output_for(self, input, **kwargs):
    ps = nonlinearities.sigmoid(input)
    powd = ps ** self.exp
    tmean = T.mean(powd, axis=(1, 2))
    return tmean

def get_output_for(self, input, **kwargs):
    return T.log(T.mean(T.exp(self.r * input), axis=self.axis) + 1e-7) / self.r