def gelu(x):
    """Tanh approximation of the Gaussian Error Linear Unit.

    Evaluates ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``
    element-wise on ``x`` and returns the resulting symbolic expression.
    """
    cubic_term = 0.044715 * T.pow(x, 3)
    inner = T.sqrt(2 / np.pi) * (x + cubic_term)
    gate = 1 + T.tanh(inner)
    return 0.5 * x * gate
def create_learning_rate_func(solver_params):
    """Compile a function that maps an iteration number to a learning rate.

    ``solver_params['lr_type']`` selects the schedule:

    * ``'inv'``:      ``base * (1 + gamma * iter) ** -power``
    * ``'fixed'``:    ``base``
    * ``'episodic'``: ``base / (floor(iter / itrvl) + 1)``

    The schedule constants are optional inputs of the compiled function whose
    defaults are read from ``solver_params`` (``'base'``, ``'gamma'``,
    ``'power'``, ``'interval'``); only the keys needed by the selected
    schedule are accessed.

    Returns:
        The compiled function; its first input is the iteration number.

    Raises:
        ValueError: if ``lr_type`` is not one of the schedules above.
            (Previously this fell through to ``return lr`` with ``lr``
            unbound.)
    """
    lr_type = solver_params['lr_type']
    if lr_type not in ('inv', 'fixed', 'episodic'):
        raise ValueError('unsupported lr_type: %r' % (lr_type,))

    base = tt.fscalar('base')
    # The symbolic variable keeps the name 'iter'; only the Python local is
    # renamed so the builtin ``iter`` is not shadowed.
    iteration = tt.scalar('iter')
    base_input = t.Param(base, default=solver_params['base'])

    if lr_type == 'inv':
        gamma = tt.fscalar('gamma')
        power = tt.fscalar('power')
        return t.function(
            inputs=[iteration,
                    base_input,
                    t.Param(gamma, default=solver_params['gamma']),
                    t.Param(power, default=solver_params['power'])],
            outputs=base * tt.pow(1 + gamma * iteration, -power))

    if lr_type == 'fixed':
        # The iteration input is unused by this schedule, hence 'ignore'.
        return t.function(
            inputs=[iteration, base_input],
            outputs=base,
            on_unused_input='ignore')

    # lr_type == 'episodic'
    itrvl = tt.fscalar('itrvl')
    return t.function(
        inputs=[iteration,
                base_input,
                t.Param(itrvl, default=solver_params['interval'])],
        outputs=base / (tt.floor(iteration / itrvl) + 1),
        on_unused_input='ignore')
def get_parent_state(self, children_states, node_type, use_dropout: bool, iteration_number) -> tuple:
    """Compute a parent state from its children plus an auxiliary objective.

    The flattened children are passed through the layer for ``node_type``.
    The children and that layer output are then concatenated, multiplied by
    ``self.__ae_noise``, encoded and decoded with tanh layers, and the
    decoded vector is rescaled to the L2 norm of the flattened children.
    The auxiliary objective is the sum of two dot products: layer output vs.
    the layer output recomputed from the decoded vector, and decoded vector
    vs. the flattened children.

    Returns:
        ``(layer_output[0], weight * auxiliary_objective)`` where
        ``weight = 1 - constrain_intro_rate ** iteration_number`` cast to
        ``floatX``.
    """
    flat_children = T.flatten(children_states)
    parent_out = self.__compute_layer_output(flat_children, node_type, use_dropout, iteration_number)

    # Encode/decode the (children, parent) pair.
    joint = T.concatenate((children_states, parent_out))
    noisy_joint = T.flatten(joint) * self.__ae_noise
    code = T.tanh(T.dot(noisy_joint, self.__encoder_weights[node_type]))
    reconstructed = T.tanh(T.dot(code, self.__decoder_weights))

    # Bring the reconstruction to the same L2 norm as the flattened children.
    norm_ratio = reconstructed.norm(2) / flat_children.norm(2)
    reconstructed = reconstructed / norm_ratio

    parent_from_reconstruction = self.__compute_layer_output(
        reconstructed, node_type, use_dropout, iteration_number)

    parent_agreement = T.dot(parent_out[0], parent_from_reconstruction[0])
    children_agreement = T.dot(reconstructed, flat_children)
    auxiliary = parent_agreement + children_agreement

    intro_rate = self.__hyperparameters['constrain_intro_rate']
    weight = T.cast(1. - T.pow(intro_rate, iteration_number), theano.config.floatX)
    return parent_out[0], weight * auxiliary
def output_func(self, input):
    """Build the binary prediction as a product of two logistic outputs.

    ``input[0]`` holds the features and ``input[1]`` the session info.  Two
    logistic units (weights ``self.W[0]``/``self.b[0]`` and
    ``self.W[1]``/``self.b[1]``) are evaluated on the features and multiplied
    to give the positive-class probability.

    Side effects: sets ``self.y_pred`` (probability > 0.5),
    ``self.p_y_given_x`` (``[1 - p, p]`` stacked horizontally) and
    ``self.rel_model_loss`` (squared difference between the second logistic
    output and the columns of the session info from ``1 + self.dim`` on).

    Returns:
        ``self.y_pred``.
    """
    features, session_info = input[0], input[1]

    def _logistic(unit):
        # Hand-written sigmoid of the affine score for the given unit.
        return 1 / (1 + T.exp(-T.dot(features, self.W[unit]) - self.b[unit]))

    exam = _logistic(0)
    rel = _logistic(1)
    positive_prob = exam * rel

    self.y_pred = positive_prob > 0.5
    self.p_y_given_x = T.horizontal_stack(1 - positive_prob, positive_prob)

    # Additional loss: tie the second unit to the trailing session columns.
    target_rel = session_info[:, 1 + self.dim:]
    self.rel_model_loss = T.pow(rel - target_rel, 2)
    return self.y_pred
def get_adam_updates(f, params, lr=10., b1=0.9, b2=0.999, e=1e-8, dec=5e-3, norm_grads=False):
    """Build Adam update pairs for minimising ``f`` with a decaying step size.

    The base learning rate is scaled by ``1 / (1 + t * dec)`` (``t`` being
    the 1-based step counter) and by the usual Adam bias-correction factor.
    When ``norm_grads`` is true each gradient is first divided by the sum of
    its absolute values (plus ``1e-8``).

    Returns:
        A list of ``(shared_variable, new_value)`` pairs: first-moment
        updates, second-moment updates, parameter updates, then the step
        counter update.
    """
    step = theano.shared(0)

    def _zero_state(param):
        return theano.shared(np.zeros(param.shape.eval(), dtype=floatX), borrow=True)

    first_moments = [_zero_state(param) for param in params]
    second_moments = [_zero_state(param) for param in params]

    grads = T.grad(f, params)
    if norm_grads:
        grads = [g / (T.sum(T.abs_(g)) + 1e-8) for g in grads]

    next_step = step + 1
    first_updates = []
    for m, g in zip(first_moments, grads):
        first_updates.append((m, b1 * m + (1. - b1) * g))
    second_updates = []
    for v, g in zip(second_moments, grads):
        second_updates.append((v, b2 * v + (1. - b2) * T.sqr(g)))

    step_f = T.cast(next_step, floatX)
    lr_hat = (lr / (1. + step_f * dec)) * T.sqrt(1. - T.pow(b2, step_f)) / (1. - T.pow(b1, step_f))

    param_updates = []
    for (_, new_m), (_, new_v), param in zip(first_updates, second_updates, params):
        param_updates.append((param, param - lr_hat * new_m / (T.sqrt(new_v) + e)))

    return first_updates + second_updates + param_updates + [(step, next_step)]
def run(self, params, loss):
    """Minimise ``loss`` with respect to the shared variable ``params`` using Adam.

    Compiles one training function taking the 1-based step number, then
    calls it for at most ``self.max_epoch`` epochs, printing the loss and
    gradient norm each time and stopping early once the norm drops below
    ``self.eps``.
    """
    state_shape = params.shape.eval()
    m = theano.shared(np.zeros(state_shape), borrow=True, name='m')
    v = theano.shared(np.zeros(state_shape), borrow=True, name='v')

    grad = T.grad(loss, params)
    grad_norm_expr = grad.norm(2)

    new_m = self.beta1 * m + (1 - self.beta1) * grad
    new_v = self.beta2 * v + (1 - self.beta2) * T.pow(grad, 2)

    step = T.iscalar(name='step')
    # Bias-corrected moment estimates.
    m_hat = new_m / (1.0 - T.pow(self.beta1, step))
    v_hat = new_v / (1.0 - T.pow(self.beta2, step))
    new_params = params - self.lr * (m_hat / (T.sqrt(v_hat) + self.stable))

    train_epoch = theano.function(
        [step],
        [loss, grad_norm_expr],
        updates=[(params, new_params), (m, new_m), (v, new_v)])

    for epoch_no in xrange(1, self.max_epoch + 1):
        loss_value, grad_value = train_epoch(epoch_no)
        norm_l2 = norm(grad_value)
        print("epoch = %d\t loss = %f\t norm = %f" %(epoch_no, loss_value, norm_l2), end='')
        print()
        if norm_l2 < self.eps:
            break
def get_cost_updates(self, corruption_level, learning_rate, mu):
    """Return the cost and momentum updates for one training step of the dA.

    The cost is the mean over the minibatch of the per-example squared
    reconstruction error.  For every parameter two updates are produced:
    the momentum buffer ``delta <- mu * delta - learning_rate * grad`` and
    the parameter ``param <- param + mu**2 * delta - (1 + mu) *
    learning_rate * grad``.

    Returns:
        ``(cost, updates)`` with ``updates`` a list of
        ``(shared_variable, new_value)`` pairs.
    """
    corrupted = self.get_corrupted_input(self.x, corruption_level)
    hidden = self.get_hidden_values(corrupted)
    reconstruction = self.get_reconstructed_input(hidden)

    # One squared-error value per example in the minibatch.
    per_example = T.sum(T.pow(self.x - reconstruction, 2), axis = 1)
    cost = T.mean(per_example)

    gparams = T.grad(cost, self.params)

    updates = []
    for param, delta, gparam in zip(self.params, self.delta, gparams):
        updates.extend([
            (delta, mu*delta - learning_rate * gparam),
            (param, param + mu*mu*delta - (1+mu)*learning_rate*gparam),
        ])
    return (cost, updates)
def pow(x, a):
    """Return ``x`` raised to the power ``a`` by delegating to ``T.pow``."""
    result = T.pow(x, a)
    return result
def ADAM(lr, params, grads, loss, iteration, beta_1=0.9, beta_2=0.999, epsilon=1e-8):
    """Build ADAM updates, adding weight decay to non-bias gradients.

    Each element of ``params`` must expose ``val`` (the shared variable) and
    ``is_bias``.  When ``cfg.TRAIN.WEIGHT_DECAY`` is non-zero,
    ``weight_decay * val`` is added to the gradient of every non-bias
    parameter.  ``loss`` is accepted but not used.

    Returns:
        A list of ``(shared_variable, new_value)`` pairs: for each parameter
        its first moment, second moment and value, in that order.
    """
    # Step size with bias correction folded in.
    step_size = lr * T.sqrt(1 - T.pow(beta_2, iteration)) / (1 - T.pow(beta_1, iteration))
    weight_decay = cfg.TRAIN.WEIGHT_DECAY

    updates = []
    for param, grad in zip(params, grads):
        # Both moment buffers start as zeros shaped like the parameter.
        moment = theano.shared(param.val.get_value() * 0.)
        velocity = theano.shared(param.val.get_value() * 0.)

        skip_decay = param.is_bias or weight_decay == 0
        effective_grad = grad if skip_decay else grad + weight_decay * param.val

        new_moment = (beta_1 * moment) + (1 - beta_1) * effective_grad
        new_velocity = (beta_2 * velocity) + (1 - beta_2) * T.square(effective_grad)
        new_value = param.val - step_size * new_moment / (T.sqrt(new_velocity) + epsilon)

        updates.extend([
            (moment, new_moment),
            (velocity, new_velocity),
            (param.val, new_value),
        ])
    return updates
def pow(self, x, a):
    """Return ``x`` raised to the power ``a`` by delegating to ``T.pow``."""
    result = T.pow(x, a)
    return result
def get_output_for(self, input, deterministic=False, **kwargs):
    """Apply alpha dropout.

    Returns ``input`` unchanged when ``deterministic`` is true or
    ``self.p == 0``.  Otherwise dropped units are set to ``self.alpha`` and
    the result goes through an affine transform ``scale * (...) + shift``
    computed from the retain probability and ``self.alpha``.
    """
    if deterministic or self.p == 0:
        return input

    # Using theano constant to prevent upcasting
    one = T.constant(1)
    retain_prob = one - self.p
    if self.rescale:
        input = input / retain_prob

    # Prefer the non-symbolic shape for the mask when it is fully known.
    mask_shape = self.input_shape
    if any(dim is None for dim in mask_shape):
        mask_shape = input.shape

    # Axes listed in shared_axes share one mask value (mask size 1 there).
    if self.shared_axes:
        normalized_axes = tuple(
            axis if axis >= 0 else axis + input.ndim
            for axis in self.shared_axes)
        mask_shape = tuple(
            1 if position in normalized_axes else dim
            for position, dim in enumerate(mask_shape))

    mask = self._srng.uniform(mask_shape, dtype=input.dtype) < retain_prob
    if self.shared_axes:
        broadcast_pattern = tuple(bool(dim == 1) for dim in mask_shape)
        mask = T.patternbroadcast(mask, broadcast_pattern)

    scale = T.pow(retain_prob + self.alpha ** 2 * retain_prob * (1 - retain_prob), -0.5)
    shift = -scale * (1 - retain_prob) * self.alpha
    return scale * (input * mask + self.alpha * (1 - mask)) + shift
def output_error(self, input_sequence, true_output, mask):
    """Masked squared error between ``true_output`` and ``input_sequence``.

    The squared difference is summed over axis 2 and divided by the size of
    that axis, multiplied by ``mask`` (after appending a broadcastable
    axis), summed, and finally divided by the sum of ``mask``.
    """
    squared_diff = T.pow(true_output - input_sequence, 2)
    per_position = T.sum(squared_diff, axis=2) / squared_diff.shape[2]
    masked = T.mul(per_position.dimshuffle(0, 1, 'x'), mask)
    return T.sum(masked) / T.sum(mask)


###### 2-class weighted cross entropy ########################################
def log_zero_inflated_negative_binomial(x, pi, p, log_r, eps = 0.0):
    """Element-wise log-likelihood of a zero-inflated negative binomial.

    Args:
        x: observed values.
        pi: zero-inflation probability, clipped to ``[eps, 1 - eps]``.
        p: negative-binomial probability, clipped to ``[eps, 1 - eps]``.
        log_r: log of the negative-binomial ``r`` parameter.
        eps: clipping constant for ``pi``, ``p`` and the lower bound of ``r``.

    Returns:
        ``log(pi + (1 - pi) * (1 - p)**r)`` where ``x == 0``,
        ``log(1 - pi) + log_negative_binomial(x, p, log_r, eps)`` where
        ``x > 0``, and ``0`` elsewhere.
    """
    pi = T.clip(pi, eps, 1.0 - eps)
    p = T.clip(p, eps, 1.0 - eps)
    r = T.exp(log_r)
    # Upper bound is r itself, so this only enforces r >= eps.
    r = T.clip(r, eps, r)

    y_0 = T.log(pi + (1 - pi) * T.pow(1 - p, r))
    y_1 = T.log(1 - pi) + log_negative_binomial(x, p, log_r, eps)

    # Select per element instead of mixing with 0/1 weights: with the
    # weighted sum a -inf in the branch that does not apply (possible with
    # the default eps=0.0, e.g. pi == 1) became 0 * -inf = NaN.
    return T.switch(T.eq(x, 0), y_0, T.switch(T.gt(x, 0), y_1, 0))
def gradients_to_updates(self, params, grads):
    """Turn gradients into an ordered dictionary of Adam updates.

    Creates a float32 step counter (``self.i``) and, per parameter, zero
    initialised first/second moment buffers.  Every created shared variable
    is appended to ``self.params`` and given ``tags =
    ['optimizer_param']``.  ``self.m`` / ``self.v`` end up referring to the
    buffers of the last parameter processed.

    Returns:
        ``OrderedDict`` mapping each shared variable to its new value; the
        step counter is inserted last.
    """
    updates = OrderedDict()

    self.i = theano.shared(np.array(0).astype('float32'), 'adam_i')
    self.params.append(self.i)
    self.i.tags = ['optimizer_param']

    next_i = self.i + 1
    # Bias-correction terms folded into the learning rate.
    correction_1 = 1.0 - T.pow(self.beta1, next_i)
    correction_2 = 1.0 - T.pow(self.beta2, next_i)
    step_size = self.lr * (T.sqrt(correction_2) / correction_1)

    for pp, gg in zip(params, grads):
        shape = pp.get_value(borrow=True).shape
        self.m = theano.shared(np.zeros(shape, dtype=theano.config.floatX), 'adam_m_'+pp.name)
        self.v = theano.shared(np.zeros(shape, dtype=theano.config.floatX), 'adam_v_'+pp.name)
        for state in (self.m, self.v):
            self.params.append(state)
        for state in (self.m, self.v):
            state.tags = ['optimizer_param']

        new_m = (self.beta1 * self.m) + ((1.0 - self.beta1) * gg)
        new_v = (self.beta2 * self.v) + ((1.0 - self.beta2) * T.sqr(gg))
        direction = new_m / (T.sqrt(new_v) + 1e-7)

        updates[self.m] = new_m
        updates[self.v] = new_v
        updates[pp] = pp - (step_size * direction)

    updates[self.i] = next_i
    return updates
def get_output(self, input_, label, mask):
    """
    This function overrides the parents' one.
    Computes a perplexity-style loss from predictions and integer labels.

    Parameters
    ----------
    input_: TensorVariable
        indexed as ``input_[row, label[row]]``, i.e. one row per example
        and one column per class.
    label: TensorVariable
        integer class labels, one per row of ``input_``.
    mask: TensorVariable or None
        weights multiplied element-wise with the per-example log2 values;
        when given, the result is normalised by ``sum(mask)`` instead of
        averaging over all examples.

    Returns
    -------
    TensorVariable
        ``2 ** (-average log2 of the selected entries)``.
    """
    row_index = T.arange(label.shape[0])
    selected_log2 = T.log2(input_[row_index, label])

    if mask is None:
        exponent = -T.mean(selected_log2)
    else:
        exponent = -T.sum(selected_log2 * mask) / T.sum(mask)
    return T.pow(2, exponent)
def __get_siamese_loss(self, use_dropout, scale_similar=1, scale_dissimilar=1):
    """Build the similar/dissimilar objectives between two encoder copies.

    A full copy of the encoder (named ``"siameseEncoder"``) is created and
    the L2 distance between both encodings is computed.  ``use_dropout`` is
    accepted but not used here.

    Returns:
        ``(dissimilar_loss, similar_loss, encoder_copy, encoding_1,
        encoding_2)`` where ``similar_loss = -scale_similar * distance**2``
        and ``dissimilar_loss = -scale_dissimilar *
        relu(margin - distance)**2`` with the margin read from the
        ``'dissimilar_margin'`` hyperparameter.
    """
    twin_encoder = self.__encoder.copy_full(name="siameseEncoder")
    encoding_1 = self.__encoder.get_encoding()
    encoding_2 = twin_encoder.get_encoding()

    distance = (encoding_1 - encoding_2).norm(2)
    similar_loss = -scale_similar * T.pow(distance, 2)

    margin = self.__hyperparameters['dissimilar_margin']
    margin_violation = T.nnet.relu(margin - distance)
    dissimilar_loss = -scale_dissimilar * T.pow(margin_violation, 2)

    return dissimilar_loss, similar_loss, twin_encoder, encoding_1, encoding_2
def adagrad(parameter, parameter_gradient, learning_rate=.05, fudge_factor=1e-10, clip_threshold=1):
    """Build AdaGrad updates for one shared parameter.

    The gradient is clipped element-wise to ``[-clip_threshold,
    clip_threshold]`` and its square is accumulated in a new shared
    variable.  The scaled step is *added* to the parameter.

    Returns:
        ``((accumulator_update, parameter_update), ratio)`` where each
        update is a ``(shared_variable, new_value)`` pair and ``ratio`` is
        the L2 norm of the step divided by the L2 norm of the parameter.
    """
    grad = T.clip(parameter_gradient, -clip_threshold, clip_threshold)

    history = theano.shared(
        np.zeros(parameter.get_value().shape, dtype=parameter.dtype),
        "adagrad_historical")
    new_history = history + T.pow(grad, 2)

    step = learning_rate / T.sqrt(fudge_factor + new_history) * grad
    ratio = step.norm(2) / parameter.norm(2)

    history_update = (history, new_history)
    parameter_update = (parameter, parameter + step)
    return (history_update, parameter_update), ratio
def rmsprop(parameter, parameter_gradient, learning_rate=.05, fudge_factor=1e-10, rho=.9, clip_threshold=1):
    """Build RMSProp updates for one shared parameter.

    The gradient is clipped element-wise to ``[-clip_threshold,
    clip_threshold]``; an exponential moving average of its square (decay
    ``rho``) is kept in a new shared variable.  The scaled step is *added*
    to the parameter.

    Returns:
        ``((average_update, parameter_update), ratio)`` where each update
        is a ``(shared_variable, new_value)`` pair and ``ratio`` is the L2
        norm of the step divided by the L2 norm of the parameter.
    """
    grad = T.clip(parameter_gradient, -clip_threshold, clip_threshold)

    initial_average = np.ones(parameter.get_value().shape, dtype=parameter.dtype) * 0
    moving_average = theano.shared(initial_average, "rmsprop_historical")
    new_average = rho * moving_average + (1. - rho) * T.pow(grad, 2)

    step = learning_rate / T.sqrt(fudge_factor + new_average) * grad
    ratio = step.norm(2) / parameter.norm(2)

    average_update = (moving_average, new_average)
    parameter_update = (parameter, parameter + step)
    return (average_update, parameter_update), ratio
def nesterov_rmsprop(parameter, parameter_gradient, learning_rate: float, momentum: float,
                     fudge_factor: float = 1e-10, rho: float = .9):
    """Build RMSProp updates with Nesterov-style momentum for one parameter.

    Two shared buffers are created: a momentum memory and a moving average
    of the squared gradient (decay ``rho``).  The step applied to the
    parameter is ``-momentum * memory + (1 + momentum) * new_memory`` and it
    is *added* to the parameter.

    Returns:
        ``((memory_update, parameter_update, average_update), ratio)``
        where each update is a ``(shared_variable, new_value)`` pair and
        ``ratio`` is the L2 norm of the step divided by the L2 norm of the
        parameter.
    """
    memory = theano.shared(
        np.zeros_like(parameter.get_value(), dtype=parameter.dtype),
        name="nesterov_momentum")
    moving_average = theano.shared(
        np.zeros(parameter.get_value().shape, dtype=parameter.dtype),
        "rmsprop_historical")

    new_average = rho * moving_average + (1. - rho) * T.pow(parameter_gradient, 2)
    new_memory = momentum * memory + learning_rate / T.sqrt(
        fudge_factor + new_average) * parameter_gradient

    step = - momentum * memory + (1. + momentum) * new_memory
    ratio = step.norm(2) / parameter.norm(2)

    updates = (
        (memory, new_memory),
        (parameter, parameter + step),
        (moving_average, new_average),
    )
    return updates, ratio
def pow(inp, power):
    """Return ``inp`` raised to ``power`` by delegating to ``T.pow``."""
    result = T.pow(inp, power)
    return result
def sigmoid_grad(inp):
    """Derivative of the logistic sigmoid evaluated element-wise at ``inp``.

    Mathematically ``exp(x) / (exp(x) + 1)**2``.  The derivative is an even
    function of ``x``, so it is evaluated as ``e / (e + 1)**2`` with
    ``e = exp(-|x|)``.  ``e`` never exceeds 1, whereas ``exp(x)`` overflows
    for large positive ``x`` and made the original expression ``inf / inf =
    NaN``.
    """
    # T.nnet.sigmoid.grad has been deprecated in 0.9, so this is computed by hand.
    decay = T.exp(-T.abs_(inp))
    return decay / T.pow(decay + 1, 2)
def smooth_L1(x):
    """Element-wise smooth L1: ``0.5 * x**2`` where ``|x| < 1``, else ``|x| - 0.5``."""
    magnitude = abs(x)
    quadratic_part = 0.5*tensor.pow(x, 2)
    linear_part = magnitude - 0.5
    return tensor.switch(magnitude < 1, quadratic_part, linear_part)
def op(self, state):
    """Pool the input layer's output with a learned p-norm over axes 2 and 3.

    The exponent is ``1 + log(1 + exp(self.lpn))`` (so always above 1) and is
    broadcast along axis 1 of the input.  The output is
    ``mean(|X| ** p, axes (2, 3)) ** (1 / p)``.
    """
    features = self.l_in.op(state=state)
    exponent = 1.+T.log(1.+T.exp(self.lpn))
    # Line the exponent up with axis 1 of the 4-D input.
    exponent_4d = exponent.dimshuffle('x', 0, 'x', 'x')
    powered = T.abs_(features)**exponent_4d
    pooled = T.mean(powered, axis=[2, 3])
    return T.pow(pooled, 1/exponent)
def op(self, state):
    """Pool the input layer's output with a learned p-norm over axis 0.

    The exponent is ``1 + log(1 + exp(self.lpn))`` (so always above 1) and is
    broadcast along the last axis of the 3-D input.  The output is
    ``mean(|X| ** p, axis 0) ** (1 / p)``.
    """
    features = self.l_in.op(state=state)
    exponent = 1.+T.log(1.+T.exp(self.lpn))
    # Line the exponent up with axis 2 of the 3-D input.
    exponent_3d = exponent.dimshuffle('x', 'x', 0)
    powered = T.abs_(features)**exponent_3d
    pooled = T.mean(powered, axis=[0])
    return T.pow(pooled, 1/exponent)
def test_elemwise(self):
    """Check that an elemwise op over switches sharing a condition leaves one Switch.

    Covers float binary ops, integer bitwise ops, and ``add``/``mul`` with
    three switch inputs.
    """
    def assert_single_switch(inputs, output):
        g = optimize(FunctionGraph(inputs, [output]))
        assert str(g).count('Switch') == 1

    # float Ops
    float_ops = (T.add, T.sub, T.mul, T.true_div, T.int_div, T.floor_div,
                 T.minimum, T.maximum, T.gt, T.lt, T.ge, T.le, T.eq, T.neq,
                 T.pow)
    mats = theano.tensor.matrices('cabxy')
    c, a, b, x, y = mats
    s1 = T.switch(c, a, b)
    s2 = T.switch(c, x, y)
    for op in float_ops:
        assert_single_switch(mats, op(s1, s2))

    # integer Ops
    int_ops = (T.and_, T.or_, T.xor,
               T.bitwise_and, T.bitwise_or, T.bitwise_xor)
    mats = theano.tensor.imatrices('cabxy')
    c, a, b, x, y = mats
    s1 = T.switch(c, a, b)
    s2 = T.switch(c, x, y)
    for op in int_ops:
        assert_single_switch(mats, op(s1, s2))

    # add/mul with more than two inputs
    u, v = theano.tensor.matrices('uv')
    s3 = T.switch(c, u, v)
    for op in (T.add, T.mul):
        assert_single_switch(mats + [u, v], op(s1, s2, s3))
def is_positive(v):
    """Return True when ``v`` is hinted positive or is an even constant power.

    ``v`` qualifies if its hints contain a truthy ``'positive'`` entry, or
    if it is the output of ``tensor.pow`` whose exponent is a scalar
    constant divisible by 2.  Anything else yields False.
    """
    if hints(v).get('positive', False):
        return True
    # TODO: how to handle this - a registry?
    #      infer_hints on Ops?
    logger.debug('is_positive: %s' % str(v))

    is_pow = v.owner and v.owner.op == tensor.pow
    if not is_pow:
        return False

    try:
        exponent = tensor.get_scalar_constant_value(v.owner.inputs[1])
    except tensor.basic.NotScalarConstantError:
        return False

    if 0 == exponent % 2:
        return True
    return False
def local_log_pow(node):
    """Rewrite ``log(base ** exponent)`` as ``exponent * log(base)``.

    Returns a one-element list with the replacement when ``node`` is a
    ``tensor.log`` applied to the output of ``tensor.pow``; otherwise
    returns None.
    """
    if not (node.op == tensor.log):
        return None

    x, = node.inputs
    producer = x.owner
    if not (producer and producer.op == tensor.pow):
        return None

    base, exponent = producer.inputs
    # TODO: reason to be careful with dtypes?
    return [exponent * tensor.log(base)]
def spectral_radius_bound(X, log2_exponent):
    """
    Returns upper bound on the largest eigenvalue of square symmetric matrix X.

    log2_exponent must be a positive-valued integer. The larger it is, the
    slower and tighter the bound. Values up to 5 should usually suffice. The
    algorithm squares the matrix log2_exponent times (giving X raised to the
    power 2**log2_exponent) and returns the trace of the result raised to
    the power 2**(-log2_exponent).

    From V.Pan, 1990. "Estimating the Extremal Eigenvalues of a Symmetric
    Matrix", Computers Math Applic. Vol 20 n. 2 pp 17-22.
    Rq: an efficient algorithm, not used here, is defined in this paper.

    Raises TypeError if X is not a matrix or log2_exponent is not an
    integer, and ValueError if log2_exponent is not strictly positive.
    """
    if X.type.ndim != 2:
        raise TypeError('spectral_radius_bound requires a matrix argument', X)
    if not isinstance(log2_exponent, integer_types):
        raise TypeError('spectral_radius_bound requires an integer exponent',
                        log2_exponent)
    if log2_exponent <= 0:
        raise ValueError('spectral_radius_bound requires a strictly positive exponent',
                         log2_exponent)

    squared = X
    remaining = log2_exponent
    while remaining > 0:
        squared = tensor.dot(squared, squared)
        remaining -= 1

    root_order = 2 ** (-log2_exponent)
    return tensor.pow(trace(squared), root_order)
def test_int_pow():
    """Check the op sequence of the compiled GPU graphs for ``a * 4`` and ``a ** 4`` sums."""
    def op_names(fn):
        return [node.op.__class__.__name__
                for node in fn.maker.fgraph.toposort()]

    a = CudaNdarrayType([False])()

    scaled_sum = theano.function([a], (a * 4).sum(), mode=mode_with_gpu)
    assert op_names(scaled_sum) == ['GpuCAReduce', 'GpuElemwise', 'HostFromGpu']

    powered_sum = theano.function([a], tensor.pow(a, 4).sum(), mode=mode_with_gpu)
    assert op_names(powered_sum) == ['GpuElemwise', 'GpuCAReduce', 'HostFromGpu']
def return_output(self,Dif):
    """Return Gaussian noise multiplied by a Cholesky factor built from ``Dif``.

    ``Dif`` is a theano matrix.  The matrix
    ``exp(-v0 * (Dif / gamma) ** alpha)`` is Cholesky-factorised and the
    factor is multiplied with normal noise (mean 0, std 0.001) of shape
    ``(self.time, self.lsize)``.
    """
    scaled = Dif/self.gamma
    exponent = self.v0*T.pow(scaled,self.alpha)
    factor = sin.cholesky(T.exp(-exponent))
    noise = self.srng.normal(avg=0,std=0.001,size=(self.time,self.lsize))
    return T.dot(factor,noise)


## This converts the noise signal into the basic matrix required for Covariance calculation
def return_output(self,Dif):
    """Return uniform noise multiplied by a Cholesky factor built from ``Dif``.

    ``Dif`` is a theano matrix.  The matrix
    ``exp(-v0 * (Dif / gamma) ** alpha)`` is Cholesky-factorised and the
    factor is multiplied with uniform noise of shape
    ``(self.time, self.lsize)``.
    """
    scaled = Dif/self.gamma
    exponent = self.v0*T.pow(scaled,self.alpha)
    factor = sin.cholesky(T.exp(-exponent))
    noise = self.srng.uniform((self.time,self.lsize))
    return T.dot(factor,noise)


## This converts the noise signal into the basic matrix required for Covariance calculation
def get_cost_updates(self, corruption_level, learning_rate):
    """Return the cost and plain gradient-descent updates for one dA training step.

    The cost is the mean over the minibatch of the per-example squared
    reconstruction error.

    Returns:
        ``(cost, updates)`` with ``updates`` a list of
        ``(param, param - learning_rate * gradient)`` pairs.
    """
    corrupted = self.get_corrupted_input(self.x, corruption_level)
    hidden = self.get_hidden_values(corrupted)
    reconstruction = self.get_reconstructed_input(hidden)

    # Summing over axis 1 leaves one squared-error value per example.
    per_example = T.sum(T.pow(self.x - reconstruction, 2), axis = 1)
    cost = T.mean(per_example)

    gparams = T.grad(cost, self.params)

    updates = []
    for param, gparam in zip(self.params, gparams):
        updates.append((param, param - learning_rate * gparam))
    return (cost, updates)

# ## class HiddenLayer2
def finetune_cost_updates(self, prototypes_y, prototypes_r, mu, learning_rate):
    """Return fine-tuning costs and momentum updates for a hinge-style objective.

    The objective sums, over the minibatch,
    ``max(0, 1 + <prototypes_r, out> - <prototypes_y, out>)`` where ``out``
    is the network output, and adds ``self.lbd`` times the per-example
    squared reconstruction error before averaging.

    Returns:
        ``((cost1, cost2, cost3, learning_rate), updates)`` where ``cost1``
        is the full objective, ``cost2`` the weighted mean reconstruction
        error, ``cost3 = cost1 - cost2``, and ``updates`` a list of
        ``(shared_variable, new_value)`` pairs (momentum buffer and
        parameter for each parameter).
    """
    network_output = self.get_output()
    score_r = T.sum(prototypes_r * network_output, axis = 1)
    score_y = T.sum(prototypes_y * network_output, axis = 1)
    hinge_total = T.sum(T.maximum(0, 1 + score_r - score_y), axis = 0)

    # Network reconstruction error, one value per example.
    reconstruction = self.get_network_reconst()
    reconst_err = T.sum(T.pow(self.x - reconstruction, 2), axis = 1)

    objective = hinge_total + self.lbd*reconst_err
    cost1 = T.mean(objective)
    cost2 = self.lbd*T.mean(reconst_err)
    cost3 = cost1 - cost2

    gparams = T.grad(cost1, self.params)

    updates = []
    for param, delta, gparam in zip(self.params, self.delta, gparams):
        updates.extend([
            (delta, mu*delta - learning_rate * gparam),
            (param, param + mu*mu*delta - (1+mu)*learning_rate*gparam),
        ])
    return ((cost1, cost2, cost3, learning_rate), updates)
def finetune_cost_updates(self, center, mu, learning_rate):
    """Return fine-tuning costs, norms and momentum updates for a center objective.

    Per example the objective is ``self.beta`` times the squared distance
    between ``center`` and the network output plus ``self.lbd`` times the
    squared reconstruction error; it is averaged over the minibatch.

    Returns:
        ``((cost1, cost2, cost3, grad_, param_), updates)`` where ``cost1``
        is the full objective, ``cost2`` the weighted mean reconstruction
        error, ``cost3 = cost1 - cost2``, ``grad_`` / ``param_`` stack the
        L2 norms of every gradient / parameter, and ``updates`` is a list of
        ``(shared_variable, new_value)`` pairs.
    """
    network_output = self.get_output()
    center_err = T.sum(T.pow(center - network_output, 2), axis=1)

    # Network reconstruction error, one value per example.
    reconstruction = self.get_network_reconst()
    reconst_err = T.sum(T.pow(self.x - reconstruction, 2), axis = 1)

    objective = self.beta*center_err + self.lbd*reconst_err
    cost1 = T.mean(objective)
    cost2 = self.lbd*T.mean(reconst_err)
    cost3 = cost1 - cost2

    gparams = T.grad(cost1, self.params)

    updates = []
    grad_norms = []
    param_norms = []
    for param, delta, gparam in zip(self.params, self.delta, gparams):
        updates.extend([
            (delta, mu*delta - learning_rate * gparam),
            (param, param + mu*mu*delta - (1+mu)*learning_rate*gparam),
        ])
        grad_norms.append(gparam.norm(L=2))
        param_norms.append(param.norm(L=2))

    grad_ = T.stack(*grad_norms)
    param_ = T.stack(*param_norms)
    return ((cost1, cost2, cost3, grad_, param_), updates)
def get_cost_updates(self, corruption_level, learning_rate):
    """Return the cross-entropy cost and gradient-descent updates for one dA step.

    The cost is the mean over the minibatch of the per-example binary
    cross-entropy between ``self.x`` and its reconstruction.

    Returns:
        ``(cost, updates)`` with ``updates`` a list of
        ``(param, param - learning_rate * gradient)`` pairs.
    """
    corrupted = self.get_corrupted_input(self.x, corruption_level)
    hidden = self.get_hidden_values(corrupted)
    reconstruction = self.get_reconstructed_input(hidden)

    # Summing over axis 1 leaves one cross-entropy value per example.
    per_example = - T.sum(self.x * T.log(reconstruction) + (1 - self.x) * T.log(1 - reconstruction), axis=1)
    cost = T.mean(per_example)

    gparams = T.grad(cost, self.params)

    updates = []
    for param, gparam in zip(self.params, gparams):
        updates.append((param, param - learning_rate * gparam))
    return (cost, updates)

# class SdC, main class for deep-clustering
def get_cost_updates(self, center, corruption_level, learning_rate):
    """Return the cost and gradient-descent updates for clustering plus reconstruction.

    Per example the cost is the squared distance between ``center`` and the
    hidden representation plus the squared reconstruction error; the
    minibatch cost is the mean of these values.

    Returns:
        ``(cost, updates)`` with ``updates`` a list of
        ``(param, param - learning_rate * gradient)`` pairs.
    """
    corrupted = self.get_corrupted_input(self.x, corruption_level)
    hidden = self.get_hidden_values(corrupted)
    reconstruction = self.get_reconstructed_input(hidden)

    # Least-squares loss for both the clustering and the reconstruction
    # term; each sum over axis 1 leaves one value per example.
    clustering_err = T.sum(T.pow(center - hidden, 2), axis=1)
    reconstruction_err = T.sum(T.pow(self.x - reconstruction, 2), axis=1)
    per_example = clustering_err + reconstruction_err
    cost = T.mean(per_example)

    gparams = T.grad(cost, self.params)

    updates = []
    for param, gparam in zip(self.params, gparams):
        updates.append((param, param - learning_rate * gparam))
    return (cost, updates)
def costfunction(self,y):
    """Return half the summed squared error divided by ``y.shape[0]``.

    ``y`` is copied and cast to float64, then subtracted from
    ``self.output`` with its two axes swapped.  The squared differences are
    summed and divided by ``2 * y.shape[0]``.

    The stray ``print()`` that emitted an empty line on every call has been
    removed.
    """
    z = y.copy().astype('float64')
    return T.sum(T.pow(self.output.dimshuffle(1,0) - z, 2))/(2 * y.shape[0])
def errors(self,y):
    """Return the summed squared error divided by ``y.shape[0]``.

    ``y`` is copied and cast to float64, then subtracted from
    ``self.output`` with its two axes swapped.
    """
    target = y.copy().astype('float64')
    residual = self.output.dimshuffle(1,0) - target
    squared_total = T.sum(T.pow(residual, 2))
    return squared_total / ( y.shape[0])
def get_reg_ind(self):
    """Return the summed regulariser over the squared standard deviations.

    For a variance ``a`` the per-element term is
    ``0.5 * log(a) + c1 * a + c2 * a**2 + c3 * a**3`` minus the same
    polynomial evaluated at ``self.noise_lvl ** 2``.  The terms for both
    sets of standard deviations returned by ``self._get_stds()`` are summed.
    """
    noise_var = self.noise_lvl**2
    constant = .5 * np.log(noise_var) + c1 * noise_var + c2 * (noise_var**2) + c3 * (noise_var**3)

    def _penalty(variance):
        return .5 * T.log(variance) + c1 * variance + c2 * T.pow(variance, 2) + c3 * T.pow(variance, 3) - constant

    stdx, stdy = self._get_stds()
    var_x = T.pow(stdx, 2)
    var_y = T.pow(stdy, 2)

    return T.sum(_penalty(var_x)) + T.sum(_penalty(var_y))
def get_reg_ind(self):
    """Return the regulariser built from the last two parameters.

    The variances are ``exp(self.params[-2]) ** 2`` and
    ``exp(self.params[-1]) ** 2``.  For a variance ``a`` the term is
    ``0.5 * log(a) + c1 * a + c2 * a**2 + c3 * a**3`` minus the same
    polynomial evaluated at ``self.noise_lvl`` (cast to ``floatX``).  The
    two terms are weighted by the number of elements of ``self.params[3]``
    and ``self.params[4]`` respectively and added.
    """
    var_x = T.pow(T.exp(self.params[-2]), 2)
    var_y = T.pow(T.exp(self.params[-1]), 2)

    noise = self.noise_lvl
    constant = np.cast[theano.config.floatX](
        .5 * np.log(noise) + c1 * noise + c2 * (noise**2) + c3 * (noise**3))

    def _penalty(variance):
        return .5 * T.log(variance) + c1 * variance + c2 * T.pow(variance, 2) + c3 * T.pow(variance, 3) - constant

    def _element_count(param):
        return T.cast(T.prod(param.shape), theano.config.floatX)

    penalty_x = _penalty(var_x)
    penalty_y = _penalty(var_y)
    return _element_count(self.params[3]) * penalty_x + _element_count(self.params[4]) * penalty_y
def jacobian_regularize(hidden, params):
    '''
    Squared-norm penalty on the jacobian of the hidden layer with respect to
    the input.

    ``hidden * (1 - hidden)`` is expanded along axis 1 and ``params`` along
    axis 0 so their element-wise product broadcasts over the right axes
    (presumably ``hidden`` is a sigmoid activation, making that factor its
    derivative -- confirm against the caller).  The squared entries are
    summed and divided by the size of the first axis of ``hidden``.
    '''
    activation_slope = hidden * (1 - hidden)
    jacobian = expand_dims(activation_slope, 1) * expand_dims(params, 0)
    # Sum of squared entries, averaged over the number of samples/minibatch
    penalty = T.sum(T.pow(jacobian, 2)) / activation_slope.shape[0]
    return T.mean(penalty)
def inner_fn(t, stm1, oat, ot, oht, pos, vt):
    """One time step: infer a latent state, score observations, act, and advance the dynamics.

    Arguments (names only; shapes are not fixed by this function):
        t: step index, compared against 20 for the prior override below.
        stm1: previous latent state sample.
        oat, ot, oht: current observations fed to the inference network.
        pos, vt: current position and velocity of the simulated system.

    All weights (``Wq_*``, ``Wl_*``, ``Wa_*``), biases, ``sig_min_*``,
    ``theano_rng`` and the size constants are read from the enclosing scope.

    Returns, in order: the new state sample, the new ``oat``/``ot``/``oht``
    observations, the new position and velocity, the step's ``FEt`` value
    (KL term plus the three observation terms), the KL term, the posterior
    mean and std, the force, and the three observation terms.
    """
    # Inference network: two ReLU layers -> mean (tanh) and std (softplus + floor).
    hst = T.nnet.relu( T.dot(Wq_hst_stm1,stm1) + T.dot(Wq_hst_ot,ot) + T.dot(Wq_hst_oht,oht) + T.dot(Wq_hst_oat,oat) + bq_hst )
    hst2 = T.nnet.relu( T.dot(Wq_hst2_hst,hst) + bq_hst2 )
    stmu = T.tanh( T.dot(Wq_stmu_hst2,hst2) + bq_stmu )
    stsig = T.nnet.softplus( T.dot(Wq_stsig_hst2,hst2) + bq_stsig ) + sig_min_states

    # Rescale representation to fit within linear response of the tanh-nonlinearity
    # (row 0 of the mean is overwritten with 0.1 * row 0 of ot, its std with 0.01).
    stmu = T.set_subtensor(stmu[0,:],0.1*ot[0,:])
    stsig = T.set_subtensor(stsig[0,:],0.01)

    # Sample the state: mean + noise * std.
    st = stmu + theano_rng.normal((n_s,n_proc))*stsig

    # Observation network: three ReLU layers, then a mean/std head per observation.
    ost = T.nnet.relu( T.dot(Wl_ost_st,st) + bl_ost )
    ost2 = T.nnet.relu( T.dot(Wl_ost2_ost,ost) + bl_ost2 )
    ost3 = T.nnet.relu( T.dot(Wl_ost3_ost2,ost2) + bl_ost3 )

    otmu = T.dot(Wl_otmu_st, ost3) + bl_otmu
    otsig = T.nnet.softplus(T.dot(Wl_otsig_st, ost3) + bl_otsig) + sig_min_obs

    ohtmu = T.dot(Wl_ohtmu_st, ost3) + bl_ohtmu
    ohtsig = T.nnet.softplus( T.dot(Wl_ohtsig_st, ost3) + bl_ohtsig ) + sig_min_obs

    oatmu = T.dot(Wl_oatmu_st, ost3) + bl_oatmu
    oatsig = T.nnet.softplus( T.dot(Wl_oatsig_st, ost3) + bl_oatsig ) + sig_min_obs

    # Score each observation under its predicted Gaussian
    # (GaussianNLL is defined elsewhere; presumably a negative log-likelihood).
    p_ot = GaussianNLL(ot, otmu, otsig)
    p_oht = GaussianNLL(oht, ohtmu, ohtsig)
    p_oat = GaussianNLL(oat, oatmu, oatsig)

    # Prior over the state computed from the previous state.
    prior_stmu = T.tanh( T.dot(Wl_stmu_stm1, stm1) + bl_stmu )
    prior_stsig = T.nnet.softplus( T.dot(Wl_stsig_stm1, stm1) + bl_stsig ) + sig_min_states

    # For t < 20 the prior is left as computed; from t >= 20 on, row 0 of the
    # prior mean is pinned to 0.1 and row 0 of the prior std to 0.01.
    prior_stmu = ifelse(T.lt(t,20),prior_stmu, T.set_subtensor(prior_stmu[0,:],0.1))
    prior_stsig = ifelse(T.lt(t,20),prior_stsig, T.set_subtensor(prior_stsig[0,:],0.01))

    KL_st = KLGaussianGaussian(stmu, stsig, prior_stmu, prior_stsig)

    FEt = KL_st + p_ot + p_oht + p_oat

    # Action head: mean/std from the sampled state, then a noisy sample.
    oat_mu = T.dot(Wa_atmu_st, st) + ba_atmu
    oat_sig = T.nnet.softplus( T.dot(Wa_atsig_st, st) + ba_atsig ) + sig_min_action

    # The 0.0*oat term contributes nothing numerically; presumably it keeps
    # oat connected to this output in the graph -- TODO confirm.
    oat_new = 0.0*oat + oat_mu + theano_rng.normal((n_oa,n_proc))*oat_sig

    action_force = T.tanh( oat_new )
    # Position-dependent force (one expression for pos < 0, another otherwise)
    # minus a velocity-proportional term 0.25*vt.
    force = T.switch(T.lt(pos,0.0),-2*pos - 1,-T.pow(1+5*T.sqr(pos),-0.5)-T.sqr(pos)*T.pow(1 + 5*T.sqr(pos),-1.5)-T.pow(pos,4)/16.0) - 0.25*vt
    vt_new = vt + 0.05*force + 0.03*action_force
    pos_new = pos + vt_new

    # New observations derived from the new position.
    # NOTE(review): this noise uses n_samples while the other draws use
    # n_proc -- confirm the two sizes are meant to differ.
    ot_new = pos_new + theano_rng.normal((n_o,n_samples))*0.01
    oht_new = T.exp(-T.sqr(pos_new-1.0)/2.0/0.3/0.3)

    return st, oat_new, ot_new, oht_new, pos_new, vt_new, FEt, KL_st, stmu, stsig, force, p_ot, p_oht, p_oat