def nll_loss_sharedparams(self, mus, sigmas, corxy, pis, y_true):
    """Mean negative log-likelihood of targets under a bivariate Gaussian mixture.

    ``mus``, ``sigmas`` and ``corxy`` are indexed by mixture component only;
    ``pis`` is added (in log space) to the per-sample, per-component exponent.

    Args:
        mus: component means; indexed as a 2-D tensor whose last axis is
            reduced together with ``y_true``'s last axis.
        sigmas: component standard deviations; columns 0 and 1 are used.
        corxy: per-component correlation (assumed to broadcast against the
            ``(sample, component)`` terms -- TODO confirm shape with caller).
        pis: mixture weights, combined with the ``(sample, component)`` exponent.
        y_true: targets, indexed as a 2-D tensor.

    Returns:
        Scalar Theano expression: minus the mean log-likelihood.
    """
    # Difference between every target and every component mean.
    diff = y_true[:, np.newaxis, :] - mus[np.newaxis, :, :]
    diffprod = T.prod(diff, axis=-1)

    sigmainvs = 1.0 / sigmas
    sigmainvprods = sigmainvs[:, 0] * sigmainvs[:, 1]
    diffsigmanorm = T.sum(diff ** 2 / sigmas ** 2, axis=-1)

    z = diffsigmanorm - 2 * corxy * diffprod * sigmainvprods
    oneminuscorxy2inv = 1.0 / (1.0 - corxy ** 2)
    expterm = -0.5 * z * oneminuscorxy2inv

    # Log of (weight * density) for each sample/component pair.
    new_exponent = (T.log(0.5 / np.pi) + T.log(sigmainvprods)
                    + T.log(np.sqrt(oneminuscorxy2inv)) + expterm + T.log(pis))

    # log-sum-exp over components.  The sum keeps its reduced axis so that it
    # lines up with ``max_exponent``; without keepdims the addition below
    # broadcasts a (batch, 1) column against a (batch,) vector and builds a
    # batch x batch matrix (same mean, quadratic memory).
    max_exponent = T.max(new_exponent, axis=1, keepdims=True)
    gauss_mix = T.sum(T.exp(new_exponent - max_exponent), axis=1, keepdims=True)
    log_gauss = max_exponent + T.log(gauss_mix)
    return -T.mean(log_gauss)
def log_marginal(self, y, h, py, q):
    r'''Estimate the log marginal with importance weights.

    Computes \log \sum p / q - \log N along axis 0 in a numerically stable
    way, then averages what remains.

    Args:
        y: T.tensor, target values.
        h: T.tensor, latent samples.
        py: T.tensor, conditional density p(y | h)
        q: approximate posterior q(h | y)

    Returns:
        approximate log marginal.

    '''
    log_likelihood = -self.conditional.neg_log_prob(y, py)
    log_prior = -self.prior.neg_log_prob(h)
    log_posterior = -self.posterior.neg_log_prob(h, q)
    assert log_likelihood.ndim == log_prior.ndim == log_posterior.ndim

    # Unnormalised log importance weights.
    log_weights = log_likelihood + log_prior - log_posterior

    # Subtract the largest weight along axis 0 before exponentiating.
    shift = T.max(log_weights, axis=0, keepdims=True)
    mean_weight = T.exp(log_weights - shift).mean(axis=0, keepdims=True)
    return (T.log(mean_weight) + shift).mean()
def ctc_update_log_p(skip_idxs, zeros, active, log_p_curr, log_p_prev):
    """Advance the CTC log-probabilities by one step.

    Returns the new number of active positions and the log-probabilities
    for those positions, written into a slice of ``zeros``.
    """
    # Skip positions that lie inside the currently active prefix.
    reachable_skips = skip_idxs[(skip_idxs < active).nonzero()]

    # The -1 sentinel keeps the max defined when no skip is reachable.
    furthest_skip = T.max(T.concatenate([reachable_skips, [-1]])) + 2 + 1
    active_next = T.cast(
        T.minimum(T.maximum(active + 1, furthest_skip), log_p_curr.shape[0]),
        'int32')

    # Leave log space relative to the largest active entry.
    scale = T.max(log_p_prev[:active])
    prev_probs = T.exp(log_p_prev[:active] - scale)

    moved = zeros[:active_next]
    # copy over
    moved = T.set_subtensor(moved[:active], prev_probs)
    # previous transitions
    moved = T.inc_subtensor(moved[1:], moved[:-1])
    # skip transitions
    moved = T.inc_subtensor(moved[reachable_skips + 2],
                            prev_probs[reachable_skips])

    updated_log_p_prev = T.log(moved) + scale
    log_p_next = T.set_subtensor(
        zeros[:active_next],
        log_p_curr[:active_next] + updated_log_p_prev)
    return active_next, log_p_next
def multiclass_hinge_loss(self, predictions, targets, delta=1):
    """Mean multi-class hinge loss.

    ``targets`` may have one dimension fewer than ``predictions`` (it is then
    one-hot encoded over ``predictions.shape[1]`` classes) or the same rank.
    Any other rank raises ``TypeError``.
    """
    n_classes = predictions.shape[1]
    rank_gap = predictions.ndim - targets.ndim
    if rank_gap == 1:
        targets = T.extra_ops.to_one_hot(targets, n_classes)
    elif rank_gap != 0:
        raise TypeError('rank mismatch between targets and predictions')

    # Score of the target class for every row.
    correct_scores = predictions[targets.nonzero()]

    # Highest score among the remaining classes for every row.
    other_scores = predictions[(1 - targets).nonzero()]
    other_scores = T.reshape(other_scores, (-1, n_classes - 1))
    best_other = T.max(other_scores, axis=1)

    margin = best_other - correct_scores + delta
    return T.nnet.relu(margin).mean()
def theano_logsumexp(x, axis=None):
    """
    Compute log(sum(exp(x), axis=axis)) in a numerically stable fashion.

    Parameters
    ----------
    x : tensor_like
        A Theano tensor (any dimension will do).
    axis : int or symbolic integer scalar, or None
        Axis over which to perform the summation. `None`, the
        default, performs over all axes.

    Returns
    -------
    result : Theano expression
        The result of the log(sum(exp(...))) operation.
    """
    # The keepdims variant broadcasts against x; the reduced variant is
    # added back to the reduced sum.
    shift = T.max(x, axis=axis)
    shift_kept = T.max(x, axis=axis, keepdims=True)
    shifted_total = T.exp(x - shift_kept).sum(axis=axis)
    return shift + T.log(shifted_total)
def compile_train(self, *args):
    """Compile the training functions described by ``args``.

    Each positional argument is a dictionary of keyword arguments for
    ``theano.function``; the compiled functions are appended to
    ``self.compiled_train_fn_list``.  When ``self.monitor_grad`` is set, a
    function reporting the sum and max of the log10 gradient norms is also
    compiled and stored in ``self.get_norm``.
    """
    if self.verbose:
        print('compiling training function...')

    import theano

    for fn_kwargs in args:
        compiled = theano.function(**fn_kwargs)
        self.compiled_train_fn_list.append(compiled)

    if self.monitor_grad:
        grad_norms = [grad.norm(L=2) for grad in self.grads]

        import theano.tensor as T

        log_norms = T.log10(grad_norms)
        self.get_norm = theano.function(
            [self.subb_ind],
            [T.sum(log_norms), T.max(log_norms)],
            givens=[(self.x, self.shared_x_slice),
                    (self.y, self.shared_y_slice)]
        )
def theano_logsumexp(x, axis=None):
    """
    Numerically stable log(sum(exp(x), axis=axis)).

    Parameters
    ----------
    x : tensor_like
        A Theano tensor (any dimension will do).
    axis : int or symbolic integer scalar, or None
        Axis over which to perform the summation. `None`, the
        default, performs over all axes.

    Returns
    -------
    result : Theano expression
        The result of the log(sum(exp(...))) operation.
    """
    peak = x.max(axis=axis)
    # Same maximum with the reduced axes kept, so it broadcasts against x.
    peak_kept = x.max(axis=axis, keepdims=True)
    exp_total = T.exp(x - peak_kept).sum(axis=axis)
    return peak + T.log(exp_total)
def pool(self, input, window, mode, stride, pad, autopad):
    """Pool a 4-D or 5-D input with the 2-D or 3-D pooling op from ``P``.

    ``mode`` is translated to the pooling op's mode names; anything
    unrecognised falls back to ``'sum'``.  Inputs of any other rank call
    ``basic.defaultreturn()`` and the function returns ``None``.
    """
    mode_names = {
        'max': 'max',
        'sum': 'sum',
        'avg': 'average_exc_pad',
        'avgpad': 'average_inc_pad',
    }
    mode = mode_names.get(mode, 'sum')

    pool_kwargs = dict(input=input, ws=window, ignore_border=not autopad,
                       stride=stride, pad=pad, mode=mode)
    if input.ndim == 4:
        return P.pool_2d(**pool_kwargs)
    if input.ndim == 5:
        return P.pool_3d(**pool_kwargs)
    basic.defaultreturn()
def needed_key(self):
    """Return ``self._needed_key_impl('activation_fn')``."""
    return self._needed_key_impl('activation_fn')

# class MaxPool(Chip):
#     ''' This class_chip collapses the input tensor by max pooling along its last dimension.
#     '''
#     def construct(self, input_tv):
#         pool_size = self.prm('pool_size')
#         y = T.reshape(input_tv,
#                       ([input_tv.shape[i] for i in range(input_tv.ndim - 1)]
#                        + [T.floor_div(input_tv.shape[input_tv.ndim - 1], pool_size).astype('int32'), pool_size]),
#                       ndim=input_tv.ndim + 1)
#         self.output_tv = T.max(y, axis=y.ndim - 1)
#         return tuple()
#     def needed_key(self):
#         return self._needed_key_impl('pool_size')
def get_pooling_padding_and_theano_pool_mode(
        pool_size, border_mode, pool_mode):
    """Translate a border mode and pool mode into padding and a mode string.

    Returns ``(padding, theano_pool_mode)``.  For ``BorderMode.same`` the
    padding is a list with ``size - 2`` for odd sizes and ``size - 1`` for
    even sizes; for ``BorderMode.valid`` it is ``(0, 0)``.  Unknown modes
    raise ``RuntimeError``.
    """
    if border_mode == BorderMode.same:
        padding = [(size - 2) if size % 2 == 1 else (size - 1)
                   for size in pool_size]
    elif border_mode == BorderMode.valid:
        padding = (0, 0)
    else:
        raise RuntimeError("Valid border modes are: " + str(BorderMode.vals)
                           + ", got: " + str(border_mode))

    if pool_mode == PoolMode.max:
        theano_pool_mode = 'max'
    elif pool_mode == PoolMode.avg:
        theano_pool_mode = 'average_exc_pad'
    else:
        raise RuntimeError("Valid pool modes are: " + str(PoolMode.vals)
                           + ", got: " + str(pool_mode))

    return padding, theano_pool_mode
def get_output(self, train=False):
    """Combine the outputs of the first two layers and reduce with a max.

    The second layer's output has its first two axes swapped, is multiplied
    by the stacked pair ``[u, u]`` of the first layer's output, swapped
    back, and reduced with a max over axis 2.  The number of layers is
    printed on every call.
    """
    print(len(self.layers))
    u = self.layers[0].get_output(train)
    t = self.layers[1].get_output(train)

    t_swapped = t.dimshuffle(1, 0, 2)
    product = ([u, u] * t_swapped).dimshuffle(1, 0, 2)
    return T.max(product, 2)
def test_kmax_pool():
    """Smoke test: print k-max and max pooling results on shuffled data.

    Compiles ``k_max_pooling`` (k=3) and ``max_pooling`` on a 4-D input and
    prints the input followed by both outputs.  Nothing is asserted.
    """
    nbatches, nkernels_in, nwords, ndim = 2, 1, 5, 3
    input_shape = (nbatches, nkernels_in, nwords, ndim)

    # Local renamed from ``input`` so the builtin is not shadowed; the
    # Theano variable keeps its name 'input'.
    input_var = T.tensor4('input')

    k = 3
    f_kmax = theano.function([input_var], k_max_pooling(input_var, k))
    f_max = theano.function([input_var], max_pooling(input_var))

    image_data = np.arange(np.prod(input_shape), dtype=np.float64)
    np.random.shuffle(image_data)
    image_data = image_data.reshape(input_shape)

    # Single-argument print calls behave the same on Python 2 and 3.
    print(image_data)
    print('kmax')
    print(f_kmax(image_data))
    print('max')
    print(f_max(image_data))
def inv(self, output):
    """Undo the pooling by expanding ``output`` along a new last axis.

    The input is repeated ``self.pool_shape[0]`` times along a new axis 3,
    then masked or transformed according to ``self.depooler``:

    * ``'random'``: keep only the position holding the largest uniform draw;
    * ``'first'``: keep only the first position;
    * otherwise ``self.depooler`` is called with ``(expanded, axis=3)``.

    The result is reshaped to ``self.input_shape``.
    """
    expanded = output.dimshuffle(0, 1, 2, 'x').repeat(self.pool_shape[0], axis=3)

    if self.depooler == 'random':
        noise = self.theano_rng.uniform(size=expanded.shape,
                                        dtype=theano.config.floatX)
        # Dividing by the max along axis 3 and flooring leaves a 1 only
        # where the max was drawn.
        noise_max = noise.max(axis=3).dimshuffle(0, 1, 2, 'x')
        keep = T.floor(noise / noise_max)
        expanded = keep * expanded
    elif self.depooler == 'first':
        first_only = np.zeros(self.pool_shape, dtype=theano.config.floatX)
        first_only[0] = 1.0
        keep = theano.shared(first_only, borrow=True).dimshuffle('x', 'x', 'x', 0)
        expanded = keep * expanded
    else:
        expanded = self.depooler(expanded, axis=3)

    return expanded.reshape(self.input_shape)
def get_pooling_batch(hs, mask, pooling_method):
    """
    Pool a batch of sequences over the length axis.

    :param hs: (batch, len, dim)
    :param mask: (batch, len)
    :param pooling_method: one of 'max', 'min', 'averaging'/'mean'/'average',
        'sum', 'final'/'last'
    :return: the pooled tensor; the length axis is removed
    """
    if pooling_method == 'max':
        # Push masked-out positions far below any real value.
        penalty = (1 - mask) * -BIG_INT
        return T.max(hs + penalty[:, :, None], axis=1)

    if pooling_method == 'min':
        # Push masked-out positions far above any real value.
        penalty = (1 - mask) * BIG_INT
        return T.min(hs + penalty[:, :, None], axis=1)

    if pooling_method in ['averaging', 'mean', 'average']:
        masked_total = T.sum(hs * mask[:, :, None], axis=1)
        lengths = T.sum(mask, axis=1)[:, None]
        return masked_total / lengths

    if pooling_method == 'sum':
        return T.sum(hs * mask[:, :, None], axis=1)

    if pooling_method in ['final', 'last']:
        # The mask is not consulted here: the last position is always taken.
        return hs[:, -1, :]

    raise NotImplementedError(
        'Not implemented pooling method: {}'.format(pooling_method))
def mlp_layer_softmax(tparams, layer1_input, prefix='mlp_layer'):
    """
    One sigmoid hidden layer followed by a row-wise softmax.

    layer1_input: n_sample * n_feature        e.g. 64*20
    W1:           n_hidden * n_feature        e.g. 200*20
    V1:           n_label  * n_hidden         e.g. 2*200
    returns:      n_sample * n_label          e.g. 64*2 (normalised along axis 1)
    """
    w_hidden = tparams[_p(prefix, 'W1')]
    b_hidden = tparams[_p(prefix, 'b1')]
    hidden = tensor.nnet.sigmoid(tensor.dot(layer1_input, w_hidden.T) + b_hidden)

    w_out = tparams[_p(prefix, 'V1')]
    b_out = tparams[_p(prefix, 'c1')]
    logits = tensor.dot(hidden, w_out.T) + b_out

    # Softmax written out by hand: subtract the row max before exponentiating.
    row_max = tensor.max(logits, axis=1, keepdims=True)
    exp_shifted = tensor.exp(logits - row_max)
    return exp_shifted / tensor.sum(exp_shifted, axis=1, keepdims=True)
def test_local_reduce_broadcast_some_0(self):
    """Reducing over axes [0, 1] of a (bcast, non-bcast, bcast) tensor.

    For each reduction the compiled graph must contain exactly one
    ``CAReduce`` node, applied to a 2-D input over axis 0.
    """
    reductions = [tensor.sum, tensor.all, tensor.any,
                  tensor.prod, tensor.max, tensor.min]
    for fct in reductions:
        x = T.TensorType('int64', (True, False, True))()
        f = theano.function([x], [fct(x, axis=[0, 1])], mode=self.mode)

        order = f.maker.fgraph.toposort()
        n_reduce = sum([isinstance(node.op, T.CAReduce) for node in order])
        assert 1 == n_reduce

        reduce_nodes = [node for node in order
                        if isinstance(node.op, tensor.CAReduce)]
        node = reduce_nodes[0]

        op = node.op
        assert isinstance(op, T.CAReduce)
        # -- the leading broadcastable dimension has been dropped
        #    by the local_reduce_broadcastable optimization
        #    now summation is over the original x's dimension 1.
        assert node.inputs[0].ndim == 2, node
        assert op.axis == (0,), op.axis
def test_optimization(self):
    """Check which op remains after optimizing ``max_and_argmax``.

    When only the max output is used, the graph should reduce to a single
    ``CAReduce`` node; when both outputs are used, a single
    ``MaxAndArgmax`` node should remain.
    """
    mode = theano.compile.mode.get_default_mode().including(
        'canonicalize', 'fast_run')

    for axis in [0, 1, -1]:
        # ``data`` is never fed to the compiled functions; the draw is kept
        # so the global numpy RNG advances as before.
        data = numpy.asarray(numpy.random.rand(2, 3), dtype=config.floatX)
        n = tensor.matrix()

        # Only the max output is requested.
        max_only = function([n], tensor.max_and_argmax(n, axis)[0], mode=mode)
        nodes = max_only.maker.fgraph.toposort()
        assert len(nodes) == 1
        assert isinstance(nodes[0].op, CAReduce)

        # Both outputs are requested.
        both = function([n], tensor.max_and_argmax(n, axis), mode=mode)
        nodes = both.maker.fgraph.toposort()
        assert len(nodes) == 1
        assert isinstance(nodes[0].op, tensor.MaxAndArgmax)
def sparse_tuple_from(sequences, dtype=np.int32):
    """Create a sparse representation of ``sequences``.

    Args:
        sequences: a list of lists of type dtype where each element is a
            sequence. May be empty, and may contain empty sequences.
        dtype: numpy dtype of the returned values array.

    Returns:
        A tuple ``(indices, values, shape)``:
        ``indices`` is an int64 array of shape ``(n_values, 2)`` holding
        ``(sequence_index, position)`` pairs, ``values`` holds the flattened
        elements, and ``shape`` is ``[len(sequences), longest_sequence]``.
    """
    indices = []
    values = []
    max_len = 0
    for n, seq in enumerate(sequences):
        seq_len = len(seq)
        max_len = max(max_len, seq_len)
        indices.extend(zip([n] * seq_len, range(seq_len)))
        values.extend(seq)

    # reshape keeps the (n, 2) layout even when there are no values at all,
    # where asarray alone would give a 1-D empty array.
    indices = np.asarray(indices, dtype=np.int64).reshape(-1, 2)
    values = np.asarray(values, dtype=dtype)

    # The longest length equals "largest position + 1" and, unlike a max
    # over ``indices``, is also defined when every sequence is empty.
    shape = np.asarray([len(sequences), max_len], dtype=np.int64)
    return indices, values, shape
def log_sum_exp(x, axis=1):
    """Numerically stable log(sum(exp(x), axis=axis)).

    Args:
        x: Theano tensor.
        axis: axis to reduce over (default 1).

    Returns:
        Theano expression with ``axis`` reduced away.
    """
    m = T.max(x, axis=axis)
    # Use the keepdims form of the maximum for the subtraction so it
    # broadcasts against ``x`` for any rank and any ``axis``; a fixed
    # ``dimshuffle(0, 'x')`` is only right for 2-D input with axis=1.
    m_kept = T.max(x, axis=axis, keepdims=True)
    return m + T.log(T.sum(T.exp(x - m_kept), axis=axis))
def softmax_loss(p_true, output_before_softmax):
    """Mean softmax cross-entropy computed from pre-softmax activations.

    ``p_true`` is treated as a matrix of target distributions when it is
    2-D, and otherwise as a vector of integer class indices.
    """
    # Shift every row by its maximum before exponentiating.
    row_max = T.max(output_before_softmax, axis=1, keepdims=True)
    shifted = output_before_softmax - row_max
    log_norm = T.log(T.sum(T.exp(shifted), axis=1))

    if p_true.ndim == 2:
        target_term = T.sum(p_true * shifted, axis=1)
    else:
        target_term = shifted[T.arange(p_true.shape[0]), p_true]

    return T.mean(log_norm - target_term)
def GMM_nll(x, mus, sigmas, mix_weights):
    """
    Negative log-likelihood under a mixture of diagonal Gaussians.

    D is dimension of each observation (e.g. frame_size) for each component
    (multivariate Normal with diagonal covariance matrix)
    See `gaussian_nll`

    x : (batch_size, D)
    mus : (batch_size, D, num_gaussians)
    sigmas : (batch_size, D, num_gaussians)
    mix_weights : (batch_size, num_gaussians)
    """
    x_col = x.dimshuffle(0, 1, 'x')

    # Per-dimension Gaussian terms, summed over D and scaled by -0.5 to
    # give one log-likelihood per component (LL, not NLL).
    log_density = lib.floatX(numpy.log(2. * numpy.pi))
    log_density = log_density + 2. * T.log(sigmas)
    log_density = log_density + ((x_col - mus) / sigmas) ** 2.
    log_density = log_density.sum(axis=1)  # on FRAME_SIZE
    log_density = log_density * lib.floatX(-0.5)

    # Combine the components with the LogSumExp trick; exponentiating
    # directly could produce inf/NaN.
    # hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/
    weighted = log_density + T.log(mix_weights)
    peak = T.max(weighted, axis=1, keepdims=True)
    log_mix = T.log(T.sum(T.exp(weighted - peak), axis=1, keepdims=True))
    log_mix = log_mix + peak

    return -log_mix.sum(axis=1)
def theano_logsumexp(x, axis=None):
    """Return log(sum(exp(x), axis=axis)), shifting by the max for stability."""
    largest = x.max(axis=axis)
    # Kept-dims copy of the maximum so the subtraction broadcasts against x.
    largest_kept = x.max(axis=axis, keepdims=True)
    residual = T.log(T.exp(x - largest_kept).sum(axis=axis))
    return largest + residual
def get_output_for(self, input, **kwargs):
    """Decode the highest-scoring label sequence for each instance.

    A forward ``theano.scan`` keeps, per instance and class, the best score
    so far and a back-pointer; a second scan run backwards follows the
    back-pointers from the best final class (a Viterbi-style max/backtrace).
    Steps where the mask is 0 carry the previous score/label forward.

    Returns the concatenation, along axis 1, of the back-traced labels
    (inst * (time - 1)) and the best final label (one column).
    """
    def max_fn(f, mask, prev_score, prev_back, W_sim):
        # Candidate scores indexed (inst, previous class, current class).
        next_score = prev_score.dimshuffle(0, 1, 'x') + f.dimshuffle(0, 'x', 1) + W_sim.dimshuffle('x', 0, 1)
        # Best previous class for every current class, and its score.
        next_back = T.argmax(next_score, axis = 1)
        next_score = T.max(next_score, axis = 1)
        mask = mask.dimshuffle(0, 'x')
        # Where mask is 0, keep the previous score and back-pointer.
        next_score = next_score * mask + prev_score * (1.0 - mask)
        next_back = next_back * mask + prev_back * (1.0 - mask)
        next_back = T.cast(next_back, 'int32')
        return [next_score, next_back]

    def produce_fn(back, mask, prev_py):
        # back: inst * class, prev_py: inst, mask: inst
        # Follow the back-pointer of the label chosen at the later step.
        next_py = back[T.arange(prev_py.shape[0]), prev_py]
        # Where mask is 0, keep the later step's label unchanged.
        next_py = mask * next_py + (1.0 - mask) * prev_py
        next_py = T.cast(next_py, 'int32')
        return next_py

    # Per-step class scores; indexed below as (inst, time, class).
    f = T.dot(input, self.W)

    init_score, init_back = f[:, 0, :], T.zeros_like(f[:, 0, :], dtype = 'int32')
    if CRF_INIT:
        init_score = init_score + self.W_init[0].dimshuffle('x', 0)

    # Forward pass over time steps 1.. (step 0 is the initial state).
    ([scores, backs], _) = theano.scan(fn = max_fn, \
        sequences = [f.dimshuffle(1, 0, 2)[1: ], self.mask_input.dimshuffle(1, 0)[1: ]], \
        outputs_info = [init_score, init_back], non_sequences = [self.W_sim], strict = True)

    # Best class at the last step of the forward pass.
    init_py = T.argmax(scores[-1], axis = 1)
    init_py = T.cast(init_py, 'int32')
    # init_py: inst, backs: time * inst * class
    pys, _ = theano.scan(fn = produce_fn, \
        sequences = [backs, self.mask_input.dimshuffle(1, 0)[1:]], outputs_info = [init_py], go_backwards = True)
    # pys: (rev_time - 1) * inst
    pys = pys.dimshuffle(1, 0)[:, :: -1]
    # pys : inst * (time - 1)
    return T.concatenate([pys, init_py.dimshuffle(0, 'x')], axis = 1)
def train(self, eval_func):
    """Run mini-batch training, printing evaluation results as it goes.

    For every epoch the training arrays are visited in a fresh random
    permutation, one batch of ``self.batch_size`` rows at a time.  The model
    is evaluated whenever ``iter * batch_size`` is a multiple of the current
    period, and once more at the end of each epoch.  Each evaluation prints
    ``epoch iter max_f1 f1 prec recall`` separated by spaces.

    Args:
        eval_func: callable invoked as
            ``eval_func(py, self.ty, self.tm, full=True)`` (plus
            ``ind2word=self.ind2word, x=self.twx`` when ``self.ind2word`` is
            not None); must return ``(acc, f1, prec, recall)``.

    Returns:
        None.
    """
    def evaluate_and_report(epoch, n_iter, best_f1):
        # Predict on the evaluation arrays, print one result row and return
        # the updated best F1.
        py = self.predict(self.tx, self.tm, self.twx, self.tcm,
                          self.tgaze, self.tlemma, self.tpos)
        if self.ind2word is not None:
            acc, f1, prec, recall = eval_func(
                py, self.ty, self.tm, full=True,
                ind2word=self.ind2word, x=self.twx)
        else:
            acc, f1, prec, recall = eval_func(py, self.ty, self.tm, full=True)
        best_f1 = max(best_f1, f1)
        # Space-joined str() values: identical output to the Python 2
        # statement ``print a, b, ...`` and valid on Python 3.
        print(" ".join(str(v) for v in
                       (epoch, n_iter, best_f1, f1, prec, recall)))
        return best_f1

    print("\t".join(['epoch', 'iter', 'max_f1', 'f1', 'prec', 'recall']))
    max_f1 = 0.0
    for epoch in range(self.epoch):
        ind = np.random.permutation(self.x.shape[0])
        i = 0
        n_iter = 0  # renamed from ``iter`` to avoid shadowing the builtin
        while i < self.x.shape[0]:
            n_iter += 1
            j = min(self.x.shape[0], i + self.batch_size)
            batch = ind[i: j]
            s_x, s_y, s_m, s_wx = (self.x[batch], self.y[batch],
                                   self.m[batch], self.wx[batch])
            if NOISE:
                # Replace a random subset of entries of s_x with random
                # values below self.char_cnt.
                noise = np.random.randint(self.char_cnt, size=s_x.shape)
                noise_mask = np.random.binomial(1, NOISE_RATE, s_x.shape)
                s_x = np.array(noise * noise_mask + s_x * (1 - noise_mask),
                               dtype=np.int32)
            s_cm = self.cm[batch]
            s_gaze = self.gaze[batch] if self.use_gaze else None
            s_lemma = self.lemma[batch] if self.use_lemma else None
            s_pos = self.pos[batch] if self.use_pos else None
            self.train_fn(s_x, s_y, s_m, s_wx, s_cm, s_gaze, s_lemma, s_pos)
            i = j

            period = PERIOD if epoch > self.min_epoch else MIN_PERIOD
            if n_iter * self.batch_size % period == 0:
                max_f1 = evaluate_and_report(epoch, n_iter, max_f1)

        # End-of-epoch evaluation.
        max_f1 = evaluate_and_report(epoch, n_iter, max_f1)
def nll_loss_sharedparams(self, mus, sigmas, corxy, pis, y_true):
    r"""
    negative log likelihood loss of a 2d y_true coordinate in
    each of the Gaussians with parameters mus, sigmas, corxy, pis.
    Note that the mus, sigmas and corxy are shared between all samples
    and only pis are different for each sample.

    The formula for negative log likelihood is :
    \mathcal{L}(y \vert x) = - \log\bigg\{\sum_{k=1}^K \pi_k(x) \mathcal{N}\big(y \vert \mu_k(x), \Sigma_k(x)\big)\bigg\}

    The size of pis is n_batch x n_components,
    the size of mus is n_components x 2,
    the size of sigmas is n_components x 2 and
    the size of corxy is n_components x 1.
    The size of y_true is batch_size x 2.

    Returns the mean negative log likelihood over the batch as a scalar
    Theano expression.
    """
    # Difference between every target and every component mean.
    mus_ex = mus[np.newaxis, :, :]
    X = y_true[:, np.newaxis, :]
    diff = X - mus_ex
    diffprod = T.prod(diff, axis=-1)
    corxy2 = corxy ** 2
    diff2 = diff ** 2
    sigmas2 = sigmas ** 2
    sigmainvs = 1.0 / sigmas
    sigmainvprods = sigmainvs[:, 0] * sigmainvs[:, 1]
    diffsigma = diff2 / sigmas2
    diffsigmanorm = T.sum(diffsigma, axis=-1)
    z = diffsigmanorm - 2 * corxy * diffprod * sigmainvprods
    oneminuscorxy2inv = 1.0 / (1.0 - corxy2)
    expterm = -0.5 * z * oneminuscorxy2inv
    # apply logsumExp trick for numerical stability
    # https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/
    new_exponent = (T.log(0.5 / np.pi) + T.log(sigmainvprods)
                    + T.log(np.sqrt(oneminuscorxy2inv)) + expterm + T.log(pis))
    max_exponent = T.max(new_exponent, axis=1, keepdims=True)
    mod_exponent = new_exponent - max_exponent
    # keepdims=True so the sum lines up with ``max_exponent``; a (batch,)
    # vector added to a (batch, 1) column would broadcast to batch x batch
    # (same mean, quadratic memory).
    gauss_mix = T.sum(T.exp(mod_exponent), axis=1, keepdims=True)
    log_gauss = max_exponent + T.log(gauss_mix)
    loss = -T.mean(log_gauss)
    return loss
def log_mean_exp(x, axis=None, as_numpy=False):
    '''Numerically stable log(mean(exp(x))).

    The reduced axes are kept in the result (``keepdims=True``).  With
    ``as_numpy=True`` the computation uses numpy instead of Theano.

    '''
    backend = np if as_numpy else T

    # Shift by the maximum so the largest exponent is exactly zero.
    peak = backend.max(x, axis=axis, keepdims=True)
    mean_shifted = backend.mean(backend.exp(x - peak), axis=axis, keepdims=True)
    return backend.log(mean_shifted) + peak
def log_sum_exp(x, axis=None):
    '''Numerically stable log( sum( exp(A) ) ).

    The max and the sum are taken with ``keepdims=True`` so they broadcast
    against ``x``; a final sum over ``axis`` reduces the kept dimensions.

    '''
    shift = T.max(x, axis=axis, keepdims=True)
    shifted_total = T.sum(T.exp(x - shift), axis=axis, keepdims=True)
    kept = T.log(shifted_total) + shift
    return T.sum(kept, axis=axis)
def max(x, axis=None, keepdims=False):
    """Return ``T.max(x, axis=axis, keepdims=keepdims)``.

    NOTE(review): this module-level name shadows Python's builtin ``max``
    for any code that looks it up in this namespace.
    """
    return T.max(x, axis=axis, keepdims=keepdims)
def ctc_cost(predict, Y):
    """Negative total log-probability of ``Y`` given ``predict``.

    Path log-probabilities come from ``ctc_path_probs`` applied to the
    blank-interleaved labels; entries selected by the returned mask are
    summed in probability space after shifting by the global maximum.
    """
    labels_with_blanks = ctc_interleave_blanks(Y)
    log_probs, mask = ctc_path_probs(predict, labels_with_blanks)

    shift = T.max(log_probs)
    selected = T.exp(log_probs - shift)[mask.nonzero()]
    total_log_prob = T.log(T.sum(selected)) + shift
    return -total_log_prob

# batchifies original CTC code
def naive_relu(x):
    """Rectified linear unit: element-wise ``max(x, 0)``.

    Uses ``TT.maximum`` (element-wise maximum of two inputs).  ``TT.max(x, 0)``
    would instead reduce ``x`` over axis 0, since its second positional
    argument is the axis.
    """
    return TT.maximum(x, 0)
def __call__(self, x):
    """Row-wise softmax of a 2-D tensor (normalised along axis 1)."""
    # Subtract each row's maximum before exponentiating.
    row_max = x.max(axis=1).dimshuffle(0, 'x')
    exp_shifted = T.exp(x - row_max)
    row_total = exp_shifted.sum(axis=1).dimshuffle(0, 'x')
    return exp_shifted / row_total
def __call__(self, x):
    """Softmax along axis 1, shifted by the per-row maximum."""
    row_max = x.max(axis=1, keepdims=True)
    exp_shifted = T.exp(x - row_max)
    row_total = exp_shifted.sum(axis=1, keepdims=True)
    return exp_shifted / row_total
def __call__(self, x):
    """Element-wise max over ``self.n_pool`` strided slices of axis 1.

    Supports 2-D and 4-D inputs; any other rank raises
    ``NotImplementedError``.
    """
    if x.ndim == 2:
        pieces = [x[:, offset::self.n_pool] for offset in range(self.n_pool)]
    elif x.ndim == 4:
        pieces = [x[:, offset::self.n_pool, :, :]
                  for offset in range(self.n_pool)]
    else:
        raise NotImplementedError
    return T.max(pieces, axis=0)
def logp_rbm(X):
    """Per-row log-probability term built from ``gB``, ``gc`` and ``gb``.

    Sums ``log(exp(y) + exp(-y))`` over the columns of ``y = X.gB + gc``
    (shifted by the row-wise max of ``|y|`` for stability), then adds
    ``X.gb`` and subtracts half the squared norm of each row of ``X``.
    """
    y = T.dot(X, gB) + gc

    # Row-wise max of |y|, used as the shift for the log sum trick.
    shift = T.max(T.maximum(y, -y), axis=1).dimshuffle(0, 'x')
    log_two_sided = shift + T.log(T.exp(y - shift) + T.exp(-y - shift))
    hidden_term = T.sum(log_two_sided, axis=1)

    bias_term = T.dot(X, gb.dimshuffle(0, 'x')).flatten()
    quadratic_term = .5 * T.sum(X * X, axis=1)
    return bias_term - quadratic_term + hidden_term
def chooseBestAction(self, state):
    """ Get the best action for a belief state, together with its value

    Arguments
    ---------
    state : one belief state

    Returns
    -------
    (action, value) : index of the largest entry of ``self.qValues(state)``
        and that largest entry
    """
    action_values = self.qValues(state)
    best_action = np.argmax(action_values)
    best_value = np.max(action_values)
    return best_action, best_value
def fprop(self, x):
    """Truncate or pad ``x`` along axis 0 to ``self.required`` entries.

    If ``x`` has fewer than ``self.required`` entries, copies of its last
    entry are appended; otherwise the first ``self.required`` entries are
    kept.  The result is stored in ``self.out`` and returned.
    """
    truncated = x[:self.required]

    # At least one replica is always built so the padded branch is a valid
    # graph even when no padding is needed; ifelse then ignores it.
    n_replicas = TT.max([1, self.required - x.shape[0]])
    filler = ReplicateLayer(n_replicas)(x[-1]).out
    padded = TT.concatenate([x, filler])

    length_gap = x.shape[0] - self.required
    self.out = ifelse(length_gap < 0, padded, truncated)
    return self.out
def __call__(self, x):
    """Maxout: max over consecutive groups of ``self.maxout_part`` units.

    A 1-D input is grouped along axis 0; any other input is grouped along
    axis 1 and reshaped to three dimensions before the max.
    """
    shape = x.shape
    group_size = TT.cast(self.maxout_part, 'int64')

    if x.ndim == 1:
        n_groups = TT.cast(shape[0] / self.maxout_part, 'int64')
        return x.reshape([n_groups, group_size]).max(1)

    n_groups = TT.cast(shape[1] / self.maxout_part, 'int64')
    return x.reshape([shape[0], n_groups, group_size]).max(2)