The following 50 code examples, extracted from open-source Python projects, illustrate how to use theano.tensor.log().
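Before the extracted examples, here is a minimal, self-contained sketch of the pattern they all share (not taken from any of the projects below; the variable names are illustrative): theano.tensor.log() builds a symbolic, elementwise natural-logarithm node, and theano.function compiles the resulting graph into a callable.

import numpy as np
import theano
import theano.tensor as T

# Symbolic input vector; T.log is applied elementwise when the compiled graph runs.
x = T.dvector('x')
f = theano.function([x], T.log(x))

print(f(np.array([1.0, np.e, 10.0])))  # -> approximately [0., 1., 2.3026]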
def svgd_kernel(self, h=-1):
    sq_dist = pdist(self.theta)
    pairwise_dists = squareform(sq_dist)**2
    if h < 0:  # if h < 0, using median trick
        h = np.median(pairwise_dists)
        h = np.sqrt(0.5 * h / np.log(self.theta.shape[0] + 1))

    # compute the rbf kernel
    Kxy = np.exp(-pairwise_dists / h**2 / 2)

    dxkxy = -np.matmul(Kxy, self.theta)
    sumkxy = np.sum(Kxy, axis=1)
    for i in range(self.theta.shape[1]):
        dxkxy[:, i] = dxkxy[:, i] + np.multiply(self.theta[:, i], sumkxy)
    dxkxy = dxkxy / (h**2)
    return (Kxy, dxkxy)

def rbf_kernel(X0):
    XY = T.dot(X0, X0.transpose())
    x2 = T.reshape(T.sum(T.square(X0), axis=1), (X0.shape[0], 1))
    X2e = T.repeat(x2, X0.shape[0], axis=1)
    H = T.sub(T.add(X2e, X2e.transpose()), 2 * XY)

    V = H.flatten()
    # median distance
    h = T.switch(T.eq((V.shape[0] % 2), 0),
                 # if even vector
                 T.mean(T.sort(V)[((V.shape[0] // 2) - 1):((V.shape[0] // 2) + 1)]),
                 # if odd vector
                 T.sort(V)[V.shape[0] // 2])

    h = T.sqrt(0.5 * h / T.log(X0.shape[0].astype('float32') + 1.0)) / 2.

    Kxy = T.exp(-H / h ** 2 / 2.0)

    neighbors = T.argsort(H, axis=1)[:, 1]

    return Kxy, neighbors, h

def rbf_kernel(X):
    XY = T.dot(X, X.T)
    x2 = T.sum(X**2, axis=1).dimshuffle(0, 'x')
    X2e = T.repeat(x2, X.shape[0], axis=1)
    H = X2e + X2e.T - 2. * XY

    V = H.flatten()
    # median distance
    h = T.switch(T.eq((V.shape[0] % 2), 0),
                 # if even vector
                 T.mean(T.sort(V)[((V.shape[0] // 2) - 1):((V.shape[0] // 2) + 1)]),
                 # if odd vector
                 T.sort(V)[V.shape[0] // 2])

    h = T.sqrt(.5 * h / T.log(H.shape[0].astype('float32') + 1.))

    # compute the rbf kernel
    kxy = T.exp(-H / (h ** 2) / 2.0)

    dxkxy = -T.dot(kxy, X)
    sumkxy = T.sum(kxy, axis=1).dimshuffle(0, 'x')
    dxkxy = T.add(dxkxy, T.mul(X, sumkxy)) / (h ** 2)

    return kxy, dxkxy

def gaussian_nll(x, mus, sigmas):
    """
    NLL for a multivariate Normal with diagonal covariance matrix.
    See: wikipedia.org/wiki/Multivariate_normal_distribution#Likelihood_function
    where \Sigma = diag(s_1^2, ..., s_n^2).

    x, mus, sigmas should all have the same shape.
    sigmas (s_1, ..., s_n) should be strictly positive.
    The output has the same shape as the inputs, minus the last dimension.
    """
    nll = lib.floatX(numpy.log(2. * numpy.pi))
    nll += 2. * T.log(sigmas)
    nll += ((x - mus) / sigmas) ** 2.
    nll = nll.sum(axis=-1)
    nll *= lib.floatX(0.5)
    return nll

def step(self, mode):
    if mode == "train" and self.mode == "test":
        raise Exception("Cannot train during test mode")

    if mode == "train":
        theano_fn = self.train_fn
        batch_gen = self.train_batch_gen
    elif mode == "test":
        theano_fn = self.test_fn
        batch_gen = self.test_batch_gen
    else:
        raise Exception("Invalid mode")

    data = next(batch_gen)
    ret = theano_fn(*data)
    return {"prediction": np.array(ret[0]),
            "answers": data[-1],
            "current_loss": ret[1],
            "log": ""}

def evaluation(self, X_test, y_test):
    # normalization
    X_test = self.normalization(X_test)

    # average over the output
    pred_y_test = np.zeros([self.M, len(y_test)])
    prob = np.zeros([self.M, len(y_test)])

    '''
        Since we have M particles, we use a Bayesian view to calculate rmse and log-likelihood
    '''
    for i in range(self.M):
        w1, b1, w2, b2, loggamma, loglambda = self.unpack_weights(self.theta[i, :])
        pred_y_test[i, :] = self.nn_predict(X_test, w1, b1, w2, b2) * self.std_y_train + self.mean_y_train
        prob[i, :] = np.sqrt(np.exp(loggamma)) / np.sqrt(2 * np.pi) * np.exp(-1 * (np.power(pred_y_test[i, :] - y_test, 2) / 2) * np.exp(loggamma))
    pred = np.mean(pred_y_test, axis=0)

    # evaluation
    svgd_rmse = np.sqrt(np.mean((pred - y_test)**2))
    svgd_ll = np.mean(np.log(np.mean(prob, axis=0)))

    return (svgd_rmse, svgd_ll)

def parse_arguments(parser):
    parser.add_argument('seq_file', type=str, metavar='<visit_file>', help='The path to the Pickled file containing visit information of patients')
    parser.add_argument('label_file', type=str, metavar='<label_file>', help='The path to the Pickled file containing label information of patients')
    parser.add_argument('tree_file', type=str, metavar='<tree_file>', help='The path to the Pickled files containing the ancestor information of the input medical codes. Only use the prefix and exclude ".level#.pk".')
    parser.add_argument('out_file', metavar='<out_file>', help='The path to the output models. The models will be saved after every epoch')
    parser.add_argument('--embed_file', type=str, default='', help='The path to the Pickled file containing the representation vectors of medical codes. If you are not using medical code representations, do not use this option')
    parser.add_argument('--embed_size', type=int, default=128, help='The dimension size of the visit embedding. If you are providing your own medical code vectors, this value will be automatically decided. (default value: 128)')
    parser.add_argument('--rnn_size', type=int, default=128, help='The dimension size of the hidden layer of the GRU (default value: 128)')
    parser.add_argument('--attention_size', type=int, default=128, help='The dimension size of hidden layer of the MLP that generates the attention weights (default value: 128)')
    parser.add_argument('--batch_size', type=int, default=100, help='The size of a single mini-batch (default value: 100)')
    parser.add_argument('--n_epochs', type=int, default=100, help='The number of training epochs (default value: 100)')
    parser.add_argument('--L2', type=float, default=0.001, help='L2 regularization coefficient for all weights except RNN (default value: 0.001)')
    parser.add_argument('--dropout_rate', type=float, default=0.5, help='Dropout rate used for the hidden layer of RNN (default value: 0.5)')
    parser.add_argument('--log_eps', type=float, default=1e-8, help='A small value to prevent log(0) (default value: 1e-8)')
    parser.add_argument('--verbose', action='store_true', help='Print output after every 100 mini-batches (default false)')
    args = parser.parse_args()

    return args

def iou_loss(p, t):
    # print "pass"
    tp, tt = p.reshape((p.shape[0], 2, 2)), t.reshape((t.shape[0], 2, 2))
    overlaps_t0 = T.maximum(tp[:, 0, :], tt[:, 0, :])
    overlaps_t1 = T.minimum(tp[:, 1, :], tt[:, 1, :])
    intersection = overlaps_t1 - overlaps_t0
    bool_overlap = T.min(intersection, axis=1) > 0
    intersection = intersection[:, 0] * intersection[:, 1]
    intersection = T.maximum(intersection, np.float32(0.))
    dims_p = tp[:, 1, :] - tp[:, 0, :]
    areas_p = dims_p[:, 0] * dims_p[:, 1]
    dims_t = tt[:, 1, :] - tt[:, 0, :]
    areas_t = dims_t[:, 0] * dims_t[:, 1]
    union = areas_p + areas_t - intersection
    loss = 1. - T.minimum(
        T.exp(T.log(T.abs_(intersection)) - T.log(T.abs_(union) + np.float32(1e-5))),
        np.float32(1.)
    )
    # return loss
    return T.mean(loss)

def iou_loss_val(p, t):
    tp, tt = p.reshape((p.shape[0], 2, 2)), t.reshape((t.shape[0], 2, 2))
    overlaps = np.zeros_like(tp, dtype=np.float32)
    overlaps[:, 0, :] = np.maximum(tp[:, 0, :], tt[:, 0, :])
    overlaps[:, 1, :] = np.minimum(tp[:, 1, :], tt[:, 1, :])
    intersection = overlaps[:, 1, :] - overlaps[:, 0, :]
    bool_overlap = np.min(intersection, axis=1) > 0
    intersection = intersection[:, 0] * intersection[:, 1]
    intersection = np.maximum(intersection, 0.)
    # print "bool", bool_overlap
    # print "Int", intersection
    dims_p = tp[:, 1, :] - tp[:, 0, :]
    areas_p = dims_p[:, 0] * dims_p[:, 1]
    dims_t = tt[:, 1, :] - tt[:, 0, :]
    areas_t = dims_t[:, 0] * dims_t[:, 1]
    union = areas_p + areas_t - intersection
    # print "un", union
    loss = 1. - np.minimum(
        np.exp(np.log(np.abs(intersection)) - np.log(np.abs(union) + 1e-5)),
        1.
    )
    # print loss
    return np.mean(loss)

def negativeLogLikelihoodWeighted(self, y, weightPerClass):
    # Weighting the cost of the different classes in the cost-function, in order to counter class imbalance.
    e1 = np.finfo(np.float32).tiny
    addTinyProbMatrix = T.lt(self.p_y_given_x_train, 4 * e1) * e1

    weights = weightPerClass.dimshuffle('x', 0, 'x', 'x', 'x')
    log_p_y_given_x_train = T.log(self.p_y_given_x_train + addTinyProbMatrix)
    weighted_log_probs = log_p_y_given_x_train * weights

    wShape = weighted_log_probs.shape

    # Re-arrange
    idx0 = T.arange(wShape[0]).dimshuffle(0, 'x', 'x', 'x')
    idx2 = T.arange(wShape[2]).dimshuffle('x', 0, 'x', 'x')
    idx3 = T.arange(wShape[3]).dimshuffle('x', 'x', 0, 'x')
    idx4 = T.arange(wShape[4]).dimshuffle('x', 'x', 'x', 0)

    return -T.mean(weighted_log_probs[idx0, y, idx2, idx3, idx4])

def log_marginal(self, y, h, py, q):
    '''Computes the approximate log marginal.

    Uses \log \sum p / q - \log N

    Args:
        y: T.tensor, target values.
        h: T.tensor, latent samples.
        py: T.tensor, conditional density p(y | h)
        q: approximate posterior q(h | y)

    Returns:
        approximate log marginal.

    '''
    log_py_h = -self.conditional.neg_log_prob(y, py)
    log_ph = -self.prior.neg_log_prob(h)
    log_qh = -self.posterior.neg_log_prob(h, q)
    assert log_py_h.ndim == log_ph.ndim == log_qh.ndim

    log_p = log_py_h + log_ph - log_qh
    log_p_max = T.max(log_p, axis=0, keepdims=True)
    w = T.exp(log_p - log_p_max)

    return (T.log(w.mean(axis=0, keepdims=True)) + log_p_max).mean()

def step_free_energy(self, x, beta, *params):
    '''Step free energy function.

    Args:
        x (T.tensor): data sample.
        beta (float): beta value for annealing.
        *params: theano shared variables.

    Returns:
        T.tensor: free energy.

    '''
    W, v_params, h_params = self.split_params(*params)

    vis_term = beta * self.v_dist.get_energy_bias(x, *v_params)
    x = self.v_dist.scale_for_energy_model(x, *v_params)
    hid_act = beta * (T.dot(x, W) + self.h_dist.get_center(*h_params))
    fe = -vis_term - T.log(1. + T.exp(hid_act)).sum(axis=1)
    return fe

def step_free_energy_h(self, h, beta, *params):
    '''Step free energy function for hidden states.

    Args:
        h (T.tensor): hidden sample.
        beta (float): beta value for annealing.
        *params: theano shared variables.

    Returns:
        T.tensor: free energy.

    '''
    W, v_params, h_params = self.split_params(*params)

    hid_term = beta * self.h_dist.get_energy_bias(h, *h_params)
    h = self.h_dist.scale_for_energy_model(h, *h_params)
    vis_act = beta * (T.dot(h, W.T) + self.v_dist.get_center(*v_params))
    fe = -hid_term - T.log(1. + T.exp(vis_act)).sum(axis=1)
    return fe

def _generate_train_model_function(self, scores):
    u = T.lvector('u')
    i = T.lvector('i')
    j = T.lvector('j')

    self.W = theano.shared(numpy.zeros((self._dim)).astype('float32'), name='W')
    self.S = theano.shared(scores, name='S')

    x_ui = T.dot(self.W, self.S[u, i, :].T)
    x_uj = T.dot(self.W, self.S[u, j, :].T)
    x_uij = x_ui - x_uj

    obj = T.sum(
        T.log(T.nnet.sigmoid(x_uij)).sum() -
        self._lambda_w * 0.5 * (self.W ** 2).sum()
    )
    cost = -obj

    g_cost_W = T.grad(cost=cost, wrt=self.W)

    updates = [(self.W, self.W - self._learning_rate * g_cost_W)]

    self.train_model = theano.function(inputs=[u, i, j], outputs=cost, updates=updates)

def ctc_update_log_p(skip_idxs, zeros, active, log_p_curr, log_p_prev):
    active_skip_idxs = skip_idxs[(skip_idxs < active).nonzero()]
    active_next = T.cast(T.minimum(
        T.maximum(
            active + 1,
            T.max(T.concatenate([active_skip_idxs, [-1]])) + 2 + 1
        ), log_p_curr.shape[0]), 'int32')

    common_factor = T.max(log_p_prev[:active])
    p_prev = T.exp(log_p_prev[:active] - common_factor)
    _p_prev = zeros[:active_next]
    # copy over
    _p_prev = T.set_subtensor(_p_prev[:active], p_prev)
    # previous transitions
    _p_prev = T.inc_subtensor(_p_prev[1:], _p_prev[:-1])
    # skip transitions
    _p_prev = T.inc_subtensor(_p_prev[active_skip_idxs + 2], p_prev[active_skip_idxs])
    updated_log_p_prev = T.log(_p_prev) + common_factor

    log_p_next = T.set_subtensor(
        zeros[:active_next],
        log_p_curr[:active_next] + updated_log_p_prev
    )
    return active_next, log_p_next

def ctc_path_probs(predict, Y, alpha=1e-4):
    smoothed_predict = (1 - alpha) * predict[:, Y] + alpha * np.float32(1.) / Y.shape[0]
    L = T.log(smoothed_predict)
    zeros = T.zeros_like(L[0])
    log_first = zeros

    f_skip_idxs = ctc_create_skip_idxs(Y)
    b_skip_idxs = ctc_create_skip_idxs(Y[::-1])  # there should be a shortcut to calculating this

    def step(log_f_curr, log_b_curr, f_active, log_f_prev, b_active, log_b_prev):
        f_active_next, log_f_next = ctc_update_log_p(f_skip_idxs, zeros, f_active, log_f_curr, log_f_prev)
        b_active_next, log_b_next = ctc_update_log_p(b_skip_idxs, zeros, b_active, log_b_curr, log_b_prev)
        return f_active_next, log_f_next, b_active_next, log_b_next

    [f_active, log_f_probs, b_active, log_b_probs], _ = theano.scan(
        step, sequences=[L, L[::-1, ::-1]],
        outputs_info=[np.int32(1), log_first, np.int32(1), log_first])

    idxs = T.arange(L.shape[1]).dimshuffle('x', 0)
    mask = (idxs < f_active.dimshuffle(0, 'x')) & (idxs < b_active.dimshuffle(0, 'x'))[::-1, ::-1]
    log_probs = log_f_probs + log_b_probs[::-1, ::-1] - L
    return log_probs, mask

def theano_logsumexp(x, axis=None):
    """
    Compute log(sum(exp(x), axis=axis)) in a numerically stable fashion.

    Parameters
    ----------
    x : tensor_like
        A Theano tensor (any dimension will do).
    axis : int or symbolic integer scalar, or None
        Axis over which to perform the summation. `None`, the
        default, performs over all axes.

    Returns
    -------
    result : ndarray or scalar
        The result of the log(sum(exp(...))) operation.
    """
    xmax = x.max(axis=axis, keepdims=True)
    xmax_ = x.max(axis=axis)
    return xmax_ + T.log(T.exp(x - xmax).sum(axis=axis))

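A brief usage sketch (illustrative only, not from the original project, and assuming the theano_logsumexp defined above is in scope) of the numerical-stability claim in the docstring: the naive T.log(T.exp(x).sum()) overflows for large inputs, while the max-shifted version stays finite.

import numpy as np
import theano
import theano.tensor as T

x = T.dvector('x')
f_stable = theano.function([x], theano_logsumexp(x, axis=0))
f_naive = theano.function([x], T.log(T.exp(x).sum(axis=0)))

v = np.array([1000.0, 1001.0])
print(f_stable(v))  # ~1001.3133, finite
print(f_naive(v))   # inf, because exp(1000) overflows in float64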
def padMatrixWithTime(seqs, labels, times, options):
    lengths = np.array([len(seq) for seq in seqs]) - 1
    n_samples = len(seqs)
    maxlen = np.max(lengths)
    inputDimSize = options['inputDimSize']
    numClass = options['numClass']

    x = np.zeros((maxlen, n_samples, inputDimSize)).astype(config.floatX)
    y = np.zeros((maxlen, n_samples, numClass)).astype(config.floatX)
    t = np.zeros((maxlen, n_samples)).astype(config.floatX)
    mask = np.zeros((maxlen, n_samples)).astype(config.floatX)
    for idx, (seq, time, label) in enumerate(zip(seqs, times, labels)):
        for xvec, subseq in zip(x[:, idx, :], seq[:-1]):
            xvec[subseq] = 1.
        for yvec, subseq in zip(y[:, idx, :], label[1:]):
            yvec[subseq] = 1.
        mask[:lengths[idx], idx] = 1.
        t[:lengths[idx], idx] = time[:-1]

    lengths = np.array(lengths, dtype=config.floatX)
    if options['useLogTime']:
        t = np.log(t + options['logEps'])

    return x, y, t, mask, lengths

def parse_arguments(parser):
    parser.add_argument('seq_file', type=str, metavar='<visit_file>', help='The path to the Pickled file containing visit information of patients')
    parser.add_argument('n_input_codes', type=int, metavar='<n_input_codes>', help='The number of unique input medical codes')
    parser.add_argument('label_file', type=str, metavar='<label_file>', help='The path to the Pickled file containing label information of patients')
    parser.add_argument('n_output_codes', type=int, metavar='<n_output_codes>', help='The number of unique label medical codes')
    parser.add_argument('out_file', metavar='out_file', help='The path to the output models. The models will be saved after every epoch')
    parser.add_argument('--time_file', type=str, default='', help='The path to the Pickled file containing durations between visits of patients. If you are not using duration information, do not use this option')
    parser.add_argument('--predict_time', type=int, default=0, choices=[0,1], help='Use this option if you want the GRU to also predict the time duration until the next visit (0 for false, 1 for true) (default value: 0)')
    parser.add_argument('--tradeoff', type=float, default=1.0, help='Tradeoff variable for balancing the two loss functions: code prediction function and duration prediction function (default value: 1.0)')
    parser.add_argument('--use_log_time', type=int, default=1, choices=[0,1], help='Use logarithm of time duration to dampen the impact of the outliers (0 for false, 1 for true) (default value: 1)')
    parser.add_argument('--embed_file', type=str, default='', help='The path to the Pickled file containing the representation vectors of medical codes. If you are not using medical code representations, do not use this option')
    parser.add_argument('--embed_size', type=int, default=200, help='The size of the visit embedding before passing it to the GRU layers. If you are not providing your own medical code vectors, you must specify this value (default value: 200)')
    parser.add_argument('--embed_finetune', type=int, default=1, choices=[0,1], help='If you are using randomly initialized code representations, always use this option. If you are using an external medical code representations, and you want to fine-tune them as you train the GRU, use this option as well. (0 for false, 1 for true) (default value: 1)')
    parser.add_argument('--hidden_dim_size', type=str, default='[200,200]', help='The size of the hidden layers of the GRU. This is a string argument. For example, [500,400] means you are using a two-layer GRU where the lower layer uses a 500-dimensional hidden layer, and the upper layer uses a 400-dimensional hidden layer. (default value: [200,200])')
    parser.add_argument('--batch_size', type=int, default=100, help='The size of a single mini-batch (default value: 100)')
    parser.add_argument('--n_epochs', type=int, default=10, help='The number of training epochs (default value: 10)')
    parser.add_argument('--L2_softmax', type=float, default=0.001, help='L2 regularization for the softmax function (default value: 0.001)')
    parser.add_argument('--L2_time', type=float, default=0.001, help='L2 regularization for the linear regression (default value: 0.001)')
    parser.add_argument('--dropout_rate', type=float, default=0.5, help='Dropout rate between GRU hidden layers, and between the final hidden layer and the softmax layer (default value: 0.5)')
    parser.add_argument('--log_eps', type=float, default=1e-8, help='A small value to prevent log(0) (default value: 1e-8)')
    parser.add_argument('--verbose', action='store_true', help='Print output after every 10 mini-batches (default false)')
    args = parser.parse_args()

    return args

def theano_logsumexp(x, axis=None):
    """
    Compute log(sum(exp(x), axis=axis)) in a numerically stable fashion.

    Parameters
    ----------
    x : tensor_like
        A Theano tensor (any dimension will do).
    axis : int or symbolic integer scalar, or None
        Axis over which to perform the summation. `None`, the
        default, performs over all axes.

    Returns
    -------
    result : ndarray or scalar
        The result of the log(sum(exp(...))) operation.
    """
    xmax = T.max(x, axis=axis, keepdims=True)
    xmax_ = T.max(x, axis=axis)
    return xmax_ + T.log(T.exp(x - xmax).sum(axis=axis))

def xent(self, inputs, inputs_mask, chars, chars_mask,
         outputs, outputs_mask, attention):
    pred_outputs, pred_attention = self(
        inputs, inputs_mask, chars, chars_mask, outputs, outputs_mask)
    outputs_xent = batch_sequence_crossentropy(
        pred_outputs, outputs[1:], outputs_mask[1:])
    # Note that pred_attention will contain zero elements for masked-out
    # character positions. To avoid trouble with log(), we add 1 at the
    # masked-out positions (these terms are removed by the multiplication
    # with the mask anyway).
    batch_size = attention.shape[1].astype(theano.config.floatX)
    attention_mask = (inputs_mask.dimshuffle('x', 1, 0) *
                      outputs_mask[1:].dimshuffle(0, 1, 'x')
                      ).astype(theano.config.floatX)
    epsilon = 1e-6
    attention_xent = (
        -attention[1:] *
        T.log(epsilon + pred_attention + (1 - attention_mask)) *
        attention_mask).sum() / batch_size
    return outputs_xent, attention_xent

def multi_label_ACE(outputs, y_labels):
    data_shape = outputs.shape
    loss_buff = 0

    # num=T.iscalar(data_shape[0])  # theano int to get value from tensor
    # for i in range(int(num)):
    #     for j in range(12):
    #         y_exp=outputs[i,j]
    #         y_tru=y_labels[i,0,0,j]
    #         if y_tru==0:
    #             loss_ij=math.log(1-outputs[i,j])
    #             loss_buff-=loss_ij
    #         if y_tru>0:
    #             loss_ij=math.log(outputs[i,j])
    #             loss_buff-=loss_ij

    # wts=[0.24331649, 0.18382575, 0.23082499, 0.44545567, 0.52901483, 0.58482504, \
    #      0.57321465, 0.43411294, 0.15502839, 0.36377019, 0.19050646, 0.16083916]
    # for i in [3,4,5,6,7,9]:
    for i in range(12):
        target = y_labels[:, i]
        output = outputs[:, i]
        loss_au = T.sum(-(target * T.log((output + 0.05) / 1.05) +
                          (1.0 - target) * T.log((1.05 - output) / 1.05)))
        loss_buff += loss_au

    return loss_buff / (12 * BATCH_SIZE)

def sequence_log_likelihood(y, y_hat, y_mask, y_hat_mask, blank_symbol):
    """
    Based on code from Shawn Tan.
    Credits to Kyle Kastner as well.
    """
    y_hat_mask_len = tensor.sum(y_hat_mask, axis=0, dtype='int32')
    y_mask_len = tensor.sum(y_mask, axis=0, dtype='int32')
    log_probabs = _log_path_probabs(
        y, T.log(y_hat), y_mask, y_hat_mask, blank_symbol)
    batch_size = log_probabs.shape[1]
    log_labels_probab = _log_add(
        log_probabs[y_hat_mask_len - 1, tensor.arange(batch_size), y_mask_len - 1],
        log_probabs[y_hat_mask_len - 1, tensor.arange(batch_size), y_mask_len - 2])
    return log_labels_probab

def log_sum_exp(x, axis=1):
    m = T.max(x, axis=axis)
    return m + T.log(T.sum(T.exp(x - m.dimshuffle(0, 'x')), axis=axis))

def softmax_loss(p_true, output_before_softmax):
    output_before_softmax -= T.max(output_before_softmax, axis=1, keepdims=True)
    if p_true.ndim == 2:
        return T.mean(T.log(T.sum(T.exp(output_before_softmax), axis=1)) -
                      T.sum(p_true * output_before_softmax, axis=1))
    else:
        return T.mean(T.log(T.sum(T.exp(output_before_softmax), axis=1)) -
                      output_before_softmax[T.arange(p_true.shape[0]), p_true])

def build_model(model_):
    global fn_predict, fn_record
    global g_ozer, g_mdl

    g_ozer = dict(simple=VanillaSGD, adam=AdamSGD)[OZER]()
    g_ozer.lr = LEARN_RATE

    s_x = T.tensor4('x')
    s_y = T.ivector('y')
    s_pdpo = T.scalar()
    s_out = model_(s_x, s_pdpo)

    s_y_onehot = T.extra_ops.to_one_hot(s_y, len(g_dataset.label_map))
    s_loss = T.mean(-s_y_onehot * T.log(s_out + 1e-3))
    s_accr = T.mean(T.switch(
        T.eq(T.argmax(s_out, axis=1), T.argmax(s_y_onehot, axis=1)), 1, 0))

    no_dropout = [(s_pdpo, T.constant(0., dtype=th.config.floatX))]
    fn_predict = th.function(
        [s_x, s_y],
        {'pred': s_out, 'accr': s_accr, 'loss': s_loss},
        givens=no_dropout, profile=PROFILE)
    rec_fetches = {
        'x': s_x, 'y': s_y,
        'pred': s_out}
    rec_fetches.update(g_mdl.params_di)
    fn_record = th.function(
        [s_x, s_y], rec_fetches, givens=no_dropout, profile=PROFILE)
    g_ozer.compile(
        [s_x, s_y],
        s_loss,
        g_mdl.params_di.values(),
        fetches_={'pred': s_out, 'loss': s_loss, 'accr': s_accr},
        givens_=[(s_pdpo, T.constant(TRAIN_PDPO, dtype=th.config.floatX))],
        profile_=PROFILE)

def compute_loss(output, num_samples, num_entries=6, gamma=500.0):
    """Compute the loss of a dataset, given the output of the DSSM.

    Args:
        output (:class:`lasagne.layers.Layer`): the output of the DSSM
        num_samples (int): the number of samples in the dataset
        num_entries (int): the number of compared papers in the DSSM structure
        gamma (float): the coefficient applied in the softmax of the similarities

    Returns:
        theano.tensor.TensorType: the loss of the dataset
    """
    assert (num_entries > 2)
    assert (num_samples > 0)

    # Post-NN operations to compute the loss
    # First, we extract the first output of each bundle
    mask = np.zeros(num_entries * num_samples)
    mask[::num_entries] = 1
    unmask = np.ones(num_entries * num_samples) - mask
    cited = T.extra_ops.compress(mask, output, axis=0)
    odocs = T.extra_ops.compress(unmask, output, axis=0)

    # We duplicate each row 'x' num_entries-1 times
    cited = T.extra_ops.repeat(cited, num_entries - 1, axis=0)
    # Then we compute element-wise product of x with each y, for each bundle
    sims = T.sum(cited * odocs, axis=1)

    # We reshape the similarities
    sims = T.reshape(sims, (num_samples, num_entries - 1))
    sims = gamma * sims
    # We take the softmax of each row
    probs = T.nnet.softmax(sims)
    # We compute the loss as the sum of element on the first column
    loss_mask = np.zeros(num_entries - 1)
    loss_mask[0] = 1
    loss = T.extra_ops.compress(loss_mask, probs, axis=1)

    return -T.log(T.prod(loss))

def cost(self, probs, y, y_mask):
    y_flat = y.flatten()
    y_flat_idx = tensor.arange(y_flat.shape[0]) * self.vocab_size + y_flat
    cost = -tensor.log(probs.flatten()[y_flat_idx])
    cost = cost.reshape([y.shape[0], y.shape[1]])
    cost = (cost * y_mask).sum(0)
    cost = cost.mean()
    return cost

def f_log_probs(self, probs, x, x_mask, y, y_mask,
                src_selector, trg_selector, cg=None):
    y_flat = y.flatten()
    y_flat_idx = tensor.arange(y_flat.shape[0]) * self.vocab_size + y_flat
    cost = -tensor.log(probs.flatten()[y_flat_idx])
    cost = cost.reshape([y.shape[0], y.shape[1]])
    cost = (cost * y_mask).sum(0)
    func_inps = [x, x_mask, y, y_mask, src_selector, trg_selector]
    return theano.function(
        inputs=func_inps,
        outputs=cost, on_unused_input='warn')

def GMM_nll(x, mus, sigmas, mix_weights):
    """
    D is dimension of each observation (e.g. frame_size) for each component
    (multivariate Normal with diagonal covariance matrix).
    See `gaussian_nll`.

    x : (batch_size, D)
    mus : (batch_size, D, num_gaussians)
    sigmas : (batch_size, D, num_gaussians)
    mix_weights : (batch_size, num_gaussians)
    """
    x = x.dimshuffle(0, 1, 'x')

    # Similar to `gaussian_nll`
    ll_component_wise = lib.floatX(numpy.log(2. * numpy.pi))
    ll_component_wise += 2. * T.log(sigmas)
    ll_component_wise += ((x - mus) / sigmas) ** 2.
    ll_component_wise = ll_component_wise.sum(axis=1)  # on FRAME_SIZE
    ll_component_wise *= lib.floatX(-0.5)  # LL not NLL

    # Now ready to take care of weights of each component
    # Simply applying exp could potentially cause inf/NaN.
    # Look up LogSumExp trick, Softmax in theano, or this:
    # hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/
    weighted_ll = ll_component_wise + T.log(mix_weights)
    ll_max = T.max(weighted_ll, axis=1, keepdims=True)
    nll = T.log(T.sum(T.exp(weighted_ll - ll_max), axis=1, keepdims=True))
    nll += ll_max
    nll = -nll.sum(axis=1)
    return nll

def multilabel_loss(preds, labels):
    eps = 1e-4
    preds = T.clip(preds, eps, 1 - eps)
    return -(labels * T.log(preds) + (1 - labels) * T.log(1 - preds)).mean(axis=1).mean(axis=0)

def multilabel_loss_with_mask(preds, labels, mask):
    eps = 1e-4
    preds = T.clip(preds, eps, 1 - eps)
    return -(mask * (labels * T.log(preds) + (1 - labels) * T.log(1 - preds))).mean(axis=1).mean(axis=0)

def init_weights(self, a0, b0):
    w1 = 1.0 / np.sqrt(self.d + 1) * np.random.randn(self.d, self.n_hidden)
    b1 = np.zeros((self.n_hidden,))
    w2 = 1.0 / np.sqrt(self.n_hidden + 1) * np.random.randn(self.n_hidden)
    b2 = 0.
    loggamma = np.log(np.random.gamma(a0, b0))
    loglambda = np.log(np.random.gamma(a0, b0))
    return (w1, b1, w2, b2, loggamma, loglambda)

def unpack_weights(self, z):
    w = z
    w1 = np.reshape(w[:self.d * self.n_hidden], [self.d, self.n_hidden])
    b1 = w[self.d * self.n_hidden:(self.d + 1) * self.n_hidden]
    w = w[(self.d + 1) * self.n_hidden:]
    w2, b2 = w[:self.n_hidden], w[-3]

    # the last two parameters are log variance
    loggamma, loglambda = w[-2], w[-1]

    return (w1, b1, w2, b2, loggamma, loglambda)

def centered_softplus(x):
    return T.nnet.softplus(x) - np.cast[th.config.floatX](np.log(2.))

def get_output_for(self, input, init=False, **kwargs):
    if input.ndim > 2:
        # if the input has more than two dimensions, flatten it into a
        # batch of feature vectors.
        input = input.flatten(2)

    activation = T.tensordot(input, self.W, [[1], [0]])
    abs_dif = (T.sum(abs(activation.dimshuffle(0, 1, 2, 'x') -
                         activation.dimshuffle('x', 1, 2, 0)), axis=2)
               + 1e6 * T.eye(input.shape[0]).dimshuffle(0, 'x', 1))

    if init:
        mean_min_abs_dif = 0.5 * T.mean(T.min(abs_dif, axis=2), axis=0)
        abs_dif /= mean_min_abs_dif.dimshuffle('x', 0, 'x')
        self.init_updates = [(self.log_weight_scale,
                              self.log_weight_scale - T.log(mean_min_abs_dif).dimshuffle(0, 'x'))]

    f = T.sum(T.exp(-abs_dif), axis=2)

    if init:
        mf = T.mean(f, axis=0)
        f -= mf.dimshuffle('x', 0)
        self.init_updates.append((self.b, -mf))
    else:
        f += self.b.dimshuffle('x', 0)

    return T.concatenate([input, f], axis=1)

# Input Mixture of Gaussian Layer

def connect(self, inputs):
    energy = tensor.dot(inputs, self.W) + self.b
    energy = energy.reshape([energy.shape[0] * energy.shape[1], energy.shape[2]])
    log_scores = tensor.log(tensor.nnet.softmax(energy))
    predictions = tensor.argmax(log_scores, axis=-1)
    return (log_scores, predictions)

def connect(self, inputs, weights, labels):
    """
    - inputs: flattened log scores from the softmax layer.
    """
    y_flat = labels.flatten()
    x_flat_idx = tensor.arange(y_flat.shape[0])
    cross_ent = -inputs[x_flat_idx, y_flat].reshape([labels.shape[0], labels.shape[1]])
    if weights is not None:
        cross_ent = cross_ent * weights
    # Summed over timesteps. Averaged across samples in the batch.
    return cross_ent.sum(axis=0).mean()

def kl_divergence(p, p_hat):
    return p_hat - p + p * T.log(p / p_hat)

def mean_squared_logarithmic_error(y_true, y_pred):
    return T.sqr(T.log(T.clip(y_pred, epsilon, np.inf) + 1.) -
                 T.log(T.clip(y_true, epsilon, np.inf) + 1.)).mean(axis=-1)

def poisson_loss(y_true, y_pred):
    return T.mean(y_pred - y_true * T.log(y_pred + epsilon), axis=-1)


####################################################
# Variational Auto-encoder

def gaussian_kl_divergence(mean, ln_var):
    """Computes the KL-divergence of Gaussian variables from the standard one.

    Given two variable ``mean`` representing :math:`\\mu` and ``ln_var``
    representing :math:`\\log(\\sigma^2)`, this function returns a variable
    representing the KL-divergence between the given multi-dimensional
    Gaussian :math:`N(\\mu, S)` and the standard Gaussian :math:`N(0, I)`

    .. math::

        D_{\\mathbf{KL}}(N(\\mu, S) \\| N(0, I)),

    where :math:`S` is a diagonal matrix such that :math:`S_{ii} = \\sigma_i^2`
    and :math:`I` is an identity matrix.

    Args:
        mean (~chainer.Variable): A variable representing mean of given
            gaussian distribution, :math:`\\mu`.
        ln_var (~chainer.Variable): A variable representing logarithm of
            variance of given gaussian distribution, :math:`\\log(\\sigma^2)`.

    Returns:
        ~chainer.Variable: A variable representing KL-divergence between
            given gaussian distribution and the standard gaussian.

    """
    var = T.exp(ln_var)
    return 0.5 * T.sum(mean * mean + var - ln_var - 1, 1)

# aliases

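A quick sanity-check sketch (illustrative only, not from the source project, assuming the gaussian_kl_divergence above is in scope): for a batch of standard-normal posteriors (zero mean, zero log-variance) the KL term is exactly zero, one value per sample.

import numpy as np
import theano
import theano.tensor as T

mean = T.dmatrix('mean')
ln_var = T.dmatrix('ln_var')
kl = theano.function([mean, ln_var], gaussian_kl_divergence(mean, ln_var))

# KL(N(0, I) || N(0, I)) vanishes: 0.5 * sum(0 + 1 - 0 - 1) = 0 per row.
print(kl(np.zeros((2, 3)), np.zeros((2, 3))))  # -> [0., 0.]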
def entropy(y_pred):
    # Clip predictions to avoid numerical instability
    y_pred = T.clip(y_pred, _EPSILON, 1.0 - _EPSILON)
    ent = -T.sum(y_pred * T.log(y_pred), axis=1)
    return ent.mean()

def __init__(self, input1, input2, log=False, **kwargs):
    super(MultLayer, self).__init__([input1, input2], **kwargs)