The following 50 code examples, extracted from open-source Python projects, illustrate how to use theano.tensor.square().
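Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) showing what theano.tensor.square() computes: the element-wise square of a tensor, equivalent to x ** 2.

import numpy as np
import theano
import theano.tensor as T

# Build a symbolic graph that squares each element of a vector.
x = T.vector('x')
f = theano.function([x], T.square(x))

print(f(np.asarray([1., 2., 3.], dtype=theano.config.floatX)))  # -> [1. 4. 9.]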
def rbf_kernel(X0):
    XY = T.dot(X0, X0.transpose())
    x2 = T.reshape(T.sum(T.square(X0), axis=1), (X0.shape[0], 1))
    X2e = T.repeat(x2, X0.shape[0], axis=1)
    H = T.sub(T.add(X2e, X2e.transpose()), 2 * XY)

    V = H.flatten()

    # median distance
    h = T.switch(T.eq((V.shape[0] % 2), 0),
                 # if even vector
                 T.mean(T.sort(V)[((V.shape[0] // 2) - 1):((V.shape[0] // 2) + 1)]),
                 # if odd vector
                 T.sort(V)[V.shape[0] // 2])

    h = T.sqrt(0.5 * h / T.log(X0.shape[0].astype('float32') + 1.0)) / 2.

    Kxy = T.exp(-H / h ** 2 / 2.0)
    neighbors = T.argsort(H, axis=1)[:, 1]

    return Kxy, neighbors, h
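A hedged sketch of how the symbolic outputs of rbf_kernel() above could be compiled and evaluated; the input matrix X and the random sample data are illustrative only.

import numpy as np
import theano
import theano.tensor as T

X = T.matrix('X')                    # one sample (particle) per row
Kxy, neighbors, h = rbf_kernel(X)    # symbolic graph built by the helper above
f = theano.function([X], [Kxy, neighbors, h])

K, nb, bandwidth = f(np.random.randn(5, 2).astype(theano.config.floatX))
print(K.shape, nb.shape, bandwidth)  # (5, 5) kernel matrix, nearest neighbor per row, scalar bandwidth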
def adam_updates(params, cost, lr=0.001, mom1=0.9, mom2=0.999):
    updates = []
    grads = T.grad(cost, params)
    t = th.shared(np.cast[th.config.floatX](1.))
    for p, g in zip(params, grads):
        v = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
        mg = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
        v_t = mom1*v + (1. - mom1)*g
        mg_t = mom2*mg + (1. - mom2)*T.square(g)
        v_hat = v_t / (1. - mom1 ** t)
        mg_hat = mg_t / (1. - mom2 ** t)
        g_t = v_hat / T.sqrt(mg_hat + 1e-8)
        p_t = p - lr * g_t
        updates.append((v, v_t))
        updates.append((mg, mg_t))
        updates.append((p, p_t))
    updates.append((t, t+1))
    return updates
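The update list returned by adam_updates() above is meant to be handed to theano.function via its updates argument. The following is a hedged sketch of that pattern; the quadratic toy cost and the parameter w are illustrative and not part of the original project.

import numpy as np
import theano
import theano.tensor as T

# Toy problem: minimize (w - 3)^2 with the Adam updates defined above.
w = theano.shared(np.cast[theano.config.floatX](0.), name='w')
cost = T.square(w - 3.)

train = theano.function([], cost, updates=adam_updates([w], cost, lr=0.1))
for _ in range(200):
    train()
print(w.get_value())  # should be close to 3.0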
def get_output_for(self, input, deterministic=False, **kwargs):
    if deterministic:
        norm_features = (input-self.avg_batch_mean.dimshuffle(*self.dimshuffle_args)) / T.sqrt(1e-6 + self.avg_batch_var).dimshuffle(*self.dimshuffle_args)
    else:
        batch_mean = T.mean(input,axis=self.axes_to_sum).flatten()
        centered_input = input-batch_mean.dimshuffle(*self.dimshuffle_args)
        batch_var = T.mean(T.square(centered_input),axis=self.axes_to_sum).flatten()
        batch_stdv = T.sqrt(1e-6 + batch_var)
        norm_features = centered_input / batch_stdv.dimshuffle(*self.dimshuffle_args)

        # BN updates
        new_m = 0.9*self.avg_batch_mean + 0.1*batch_mean
        new_v = 0.9*self.avg_batch_var + T.cast((0.1*input.shape[0])/(input.shape[0]-1),th.config.floatX)*batch_var
        self.bn_updates = [(self.avg_batch_mean, new_m), (self.avg_batch_var, new_v)]

    if hasattr(self, 'g'):
        activation = norm_features*self.g.dimshuffle(*self.dimshuffle_args)
    else:
        activation = norm_features
    if hasattr(self, 'b'):
        activation += self.b.dimshuffle(*self.dimshuffle_args)

    return self.nonlinearity(activation)
def get_output_for(self, input, init=False, deterministic=False, **kwargs):
    if input.ndim > 2:
        # if the input has more than two dimensions, flatten it into a
        # batch of feature vectors.
        input = input.flatten(2)

    activation = T.dot(input, self.W)

    if init:
        ma = T.mean(activation, axis=0)
        activation -= ma.dimshuffle('x',0)
        stdv = T.sqrt(T.mean(T.square(activation),axis=0))
        activation /= stdv.dimshuffle('x',0)
        self.init_updates = [(self.weight_scale, self.weight_scale/stdv), (self.b, -ma/stdv)]
    else:
        activation += self.b.dimshuffle('x', 0)

    return self.nonlinearity(activation)
def l2normalize(layer, train_scale=True):
    W_param = layer.W
    s = W_param.get_value().shape
    if len(s)==4:
        axes_to_sum = (1,2,3)
        dimshuffle_args = [0,'x','x','x']
        k = s[0]
    else:
        axes_to_sum = 0
        dimshuffle_args = ['x',0]
        k = s[1]
    layer.W_scale = layer.add_param(lasagne.init.Constant(1.),
                                    (k,), name="W_scale", trainable=train_scale, regularizable=False)
    layer.W = W_param * (layer.W_scale/T.sqrt(1e-6 + T.sum(T.square(W_param),axis=axes_to_sum))).dimshuffle(*dimshuffle_args)
    return layer

# fully connected layer with weight normalization
def kl_sym(self, old_dist_info_vars, new_dist_info_vars):
    old_means = old_dist_info_vars["mean"]
    old_log_stds = old_dist_info_vars["log_std"]
    new_means = new_dist_info_vars["mean"]
    new_log_stds = new_dist_info_vars["log_std"]
    """
    Compute the KL divergence of two multivariate Gaussian distribution with
    diagonal covariance matrices
    """
    old_std = TT.exp(old_log_stds)
    new_std = TT.exp(new_log_stds)
    # means: (N*A)
    # std: (N*A)
    # formula:
    # { (\mu_1 - \mu_2)^2 + \sigma_1^2 - \sigma_2^2 } / (2\sigma_2^2) +
    # ln(\sigma_2/\sigma_1)
    numerator = TT.square(old_means - new_means) + \
        TT.square(old_std) - TT.square(new_std)
    denominator = 2 * TT.square(new_std) + 1e-8
    return TT.sum(
        numerator / denominator + new_log_stds - old_log_stds, axis=-1)
def kl(self, old_dist_info, new_dist_info):
    old_means = old_dist_info["mean"]
    old_log_stds = old_dist_info["log_std"]
    new_means = new_dist_info["mean"]
    new_log_stds = new_dist_info["log_std"]
    """
    Compute the KL divergence of two multivariate Gaussian distribution with
    diagonal covariance matrices
    """
    old_std = np.exp(old_log_stds)
    new_std = np.exp(new_log_stds)
    # means: (N*A)
    # std: (N*A)
    # formula:
    # { (\mu_1 - \mu_2)^2 + \sigma_1^2 - \sigma_2^2 } / (2\sigma_2^2) +
    # ln(\sigma_2/\sigma_1)
    numerator = np.square(old_means - new_means) + \
        np.square(old_std) - np.square(new_std)
    denominator = 2 * np.square(new_std) + 1e-8
    return np.sum(
        numerator / denominator + new_log_stds - old_log_stds, axis=-1)
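A quick numerical sanity check of the diagonal-Gaussian KL formula used in the two methods above: the KL divergence of a distribution from itself is (up to the 1e-8 stabilizer) zero, and it grows as the means move apart. The free-standing diag_gauss_kl() helper below just restates the same formula outside the class so it can be run directly; the concrete numbers are illustrative.

import numpy as np

def diag_gauss_kl(old, new):
    # Same formula as kl() above: dist_info dicts with per-dimension "mean" and "log_std".
    numerator = np.square(old["mean"] - new["mean"]) + \
        np.exp(2 * old["log_std"]) - np.exp(2 * new["log_std"])
    denominator = 2 * np.exp(2 * new["log_std"]) + 1e-8
    return np.sum(numerator / denominator + new["log_std"] - old["log_std"], axis=-1)

p = dict(mean=np.zeros((1, 3)), log_std=np.zeros((1, 3)))      # N(0, I)
q = dict(mean=np.full((1, 3), 2.), log_std=np.zeros((1, 3)))   # N(2, I)

print(diag_gauss_kl(p, p))  # ~0
print(diag_gauss_kl(p, q))  # ~6: three dimensions, each contributing 2^2 / 2 = 2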
def get_output(self, input_):
    """
    This function overrides the parents' one.
    Creates symbolic function to compute output from an input.

    Parameters
    ----------
    input_: TensorVariable

    Returns
    -------
    TensorVariable
    """
    mu = input_[:, :self.output_shape[0]]
    logvar = input_[:, self.output_shape[0]:]
    return 0.5 * T.mean(T.square(mu) + T.exp(logvar) - logvar - 1)
def get_output(self, input1_, input2_):
    """
    This function overrides the parents' one.
    Creates symbolic function to compute output from an input.
    http://stats.stackexchange.com/questions/66271/kullback-leibler-divergence-of-two-normal-distributions

    Parameters
    ----------
    input_: TensorVariable

    Returns
    -------
    TensorVariable
    """
    mu1 = input1_[:, :self.output_shape[0]]
    logvar1 = input1_[:, self.output_shape[0]:]
    mu2 = input2_[:, :self.output_shape[0]]
    logvar2 = input2_[:, self.output_shape[0]:]
    return 0.5 * T.mean((T.square(mu1 - mu2) + T.exp(logvar1) + T.exp(logvar2)) * (1 / T.exp(logvar1) + 1 / T.exp(logvar2))) - 2

# TODO: Fix L1, L2 to work!
def get_output_for(self, input, deterministic=False, **kwargs):
    if deterministic:
        norm_features = (input-self.avg_batch_mean.dimshuffle(*self.dimshuffle_args)) / T.sqrt(1e-6 + self.avg_batch_var).dimshuffle(*self.dimshuffle_args)
    else:
        batch_mean = T.mean(input,axis=self.axes_to_sum).flatten()
        centered_input = input-batch_mean.dimshuffle(*self.dimshuffle_args)
        batch_var = T.mean(T.square(centered_input),axis=self.axes_to_sum).flatten()
        batch_stdv = T.sqrt(1e-6 + batch_var)
        norm_features = centered_input / batch_stdv.dimshuffle(*self.dimshuffle_args)

        # BN updates
        new_m = 0.9*self.avg_batch_mean + 0.1*batch_mean
        new_v = 0.9*self.avg_batch_var + T.cast((0.1*input.shape[0])/(input.shape[0]-1.), th.config.floatX)*batch_var
        self.bn_updates = [(self.avg_batch_mean, new_m), (self.avg_batch_var, new_v)]

    if hasattr(self, 'g'):
        activation = norm_features*self.g.dimshuffle(*self.dimshuffle_args)
    else:
        activation = norm_features
    if hasattr(self, 'b'):
        activation += self.b.dimshuffle(*self.dimshuffle_args)

    return self.nonlinearity(activation)
def adam_conditional_updates(params, cost, mincost, lr=0.001, mom1=0.9, mom2=0.999):  # if cost is less than mincost, don't do update
    updates = []
    grads = T.grad(cost, params)
    t = th.shared(np.cast[th.config.floatX](1.))
    for p, g in zip(params, grads):
        v = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
        mg = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
        v_t = mom1*v + (1. - mom1)*g
        mg_t = mom2*mg + (1. - mom2)*T.square(g)
        v_hat = v_t / (1. - mom1 ** t)
        mg_hat = mg_t / (1. - mom2 ** t)
        g_t = v_hat / T.sqrt(mg_hat + 1e-8)
        p_t = p - lr * g_t
        updates.append((v, ifelse(cost<mincost,v,v_t)))
        updates.append((mg, ifelse(cost<mincost,mg,mg_t)))
        updates.append((p, ifelse(cost<mincost,p,p_t)))
    updates.append((t, ifelse(cost<mincost,t,t+1)))
    return updates
def get_output_for(self, input, deterministic=False, **kwargs):
    if deterministic:
        norm_features = (input-self.avg_batch_mean.dimshuffle(*self.dimshuffle_args)) / T.sqrt(1e-6 + self.avg_batch_var).dimshuffle(*self.dimshuffle_args)
    else:
        batch_mean = T.mean(input,axis=self.axes_to_sum).flatten()
        centered_input = input-batch_mean.dimshuffle(*self.dimshuffle_args)
        batch_var = T.mean(T.square(centered_input),axis=self.axes_to_sum).flatten()
        batch_stdv = T.sqrt(1e-6 + batch_var)
        norm_features = centered_input / batch_stdv.dimshuffle(*self.dimshuffle_args)

        # BN updates
        new_m = 0.9*self.avg_batch_mean + 0.1*batch_mean
        new_v = 0.9*self.avg_batch_var + T.cast((0.1*input.shape[0])/(input.shape[0]-1),th.config.floatX)*batch_var
        self.bn_updates = [(self.avg_batch_mean, new_m), (self.avg_batch_var, new_v)]

    if hasattr(self, 'g'):
        activation = norm_features*self.g.dimshuffle(*self.dimshuffle_args)
    else:
        activation = norm_features
    if hasattr(self, 'b'):
        activation += self.b.dimshuffle(*self.dimshuffle_args)

    if self.nonlinearity is not None:
        return self.nonlinearity(activation)
    else:
        return activation
def get_output_for(self, input, deterministic=False, set_bn_updates=True, **kwargs):
    if deterministic:
        norm_features = (input-self.avg_batch_mean.dimshuffle(*self.dimshuffle_args)) / T.sqrt(1e-6 + self.avg_batch_var).dimshuffle(*self.dimshuffle_args)
    else:
        batch_mean = T.mean(input,axis=self.axes_to_sum).flatten()
        centered_input = input-batch_mean.dimshuffle(*self.dimshuffle_args)
        batch_var = T.mean(T.square(centered_input),axis=self.axes_to_sum).flatten()
        batch_stdv = T.sqrt(1e-6 + batch_var)
        norm_features = centered_input / batch_stdv.dimshuffle(*self.dimshuffle_args)

        # BN updates
        if set_bn_updates:
            new_m = 0.9*self.avg_batch_mean + 0.1*batch_mean
            new_v = 0.9*self.avg_batch_var + T.cast((0.1*input.shape[0])/(input.shape[0]-1),th.config.floatX)*batch_var
            self.bn_updates = [(self.avg_batch_mean, new_m), (self.avg_batch_var, new_v)]

    if hasattr(self, 'g'):
        activation = norm_features*self.g.dimshuffle(*self.dimshuffle_args)
    else:
        activation = norm_features
    if hasattr(self, 'b'):
        activation += self.b.dimshuffle(*self.dimshuffle_args)

    return self.nonlinearity(activation)
def get_output_for(self, input, init=False, deterministic=False, **kwargs):
    if input.ndim > 2:
        # if the input has more than two dimensions, flatten it into a
        # batch of feature vectors.
        input = input.flatten(2)

    activation = T.dot(input, self.W)

    if init:
        ma = T.mean(activation, axis=0)
        activation -= ma.dimshuffle('x',0)
        stdv = T.sqrt(T.mean(T.square(activation),axis=0))
        activation /= stdv.dimshuffle('x',0)
        self.init_updates = [(self.weight_scale, self.weight_scale/stdv), (self.b, -ma/stdv)]
    else:
        activation += self.b.dimshuffle('x', 0)

    return self.nonlinearity(activation)

# comes from Ishamel code base
def __init__(self, dim, eps=1e-6, init_count=0, init_mean=0., init_meansq=1.):
    '''
    Args:
        dim: dimension of the space of points to be standardized
        eps: small constant to add to denominators to prevent division by 0
        init_count, init_mean, init_meansq: initial values for accumulators

    Note:
        if init_count is 0, then init_mean and init_meansq have no effect beyond
        the first call to update(), which will ignore their values and
        replace them with values from a new batch of data.
    '''
    self._eps = eps
    self._dim = dim
    with variable_scope(type(self).__name__) as self.__varscope:
        self._count = get_variable('count', np.array(float(init_count)), trainable=False)
        self._mean_1_D = get_variable('mean_1_D', np.full((1, self._dim), init_mean), broadcastable=(True,False), trainable=False)
        self._meansq_1_D = get_variable('meansq_1_D', np.full((1, self._dim), init_meansq), broadcastable=(True,False), trainable=False)
        self._stdev_1_D = tensor.sqrt(tensor.nnet.relu(self._meansq_1_D - tensor.square(self._mean_1_D)))
        # Relu ensures inside is nonnegative. maybe the better choice would have been to
        # add self._eps inside the square root, but I'm keeping things this way to preserve
        # backwards compatibility with existing saved models.
    self.get_mean = self._mean_1_D.get_value
    self.get_stdev = theano.function([], self._stdev_1_D[0,:])  # TODO: return with shape (1,D)
def adam(cost, params, lr, beta1=0.9, beta2=0.999, eps=1e-8):
    updates = []
    grads = tensor.grad(cost, params); assert len(params) == len(grads)
    t0 = theano.shared(np.array(0., dtype=theano.config.floatX))
    t = t0 + 1
    corr1 = (1 - beta1**t)
    corr2 = (1 - beta2**t)
    alpha = lr * tensor.sqrt(corr2) / corr1
    for p, g in zip(params, grads):
        m = theano.shared(value=np.zeros(p.get_value().shape, dtype=theano.config.floatX), broadcastable=p.broadcastable)
        v = theano.shared(value=np.zeros(p.get_value().shape, dtype=theano.config.floatX), broadcastable=p.broadcastable)
        m_t = beta1 * m + (1 - beta1) * g
        v_t = beta2 * v + (1 - beta2) * tensor.square(g)
        p_t = p - alpha * m_t/(tensor.sqrt(v_t) + eps)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((t0, t))
    return updates
def adam_updates(params, cost, lr=0.001, mom1=0.9, mom2=0.999):
    updates = []
    grads = T.grad(cost, params)
    t = th.shared(np.cast[th.config.floatX](1.))
    for p, g in zip(params, grads):
        value = p.get_value(borrow=True)
        v = th.shared(np.zeros(value.shape, dtype=value.dtype), broadcastable=p.broadcastable)
        mg = th.shared(np.zeros(value.shape, dtype=value.dtype), broadcastable=p.broadcastable)
        v_t = mom1*v + (1. - mom1)*g
        mg_t = mom2*mg + (1. - mom2)*T.square(g)
        v_hat = v_t / (1. - mom1 ** t)
        mg_hat = mg_t / (1. - mom2 ** t)
        g_t = v_hat / T.sqrt(mg_hat + 1e-8)
        p_t = p - lr * g_t
        updates.append((v, v_t))
        updates.append((mg, mg_t))
        updates.append((p, p_t))
    updates.append((t, t+1))
    return updates
def get_output_for_reparametrization(self, input, **kwargs):
    """Implementation of the local reparametrization trick.

    This essentially leads to a speedup compared to the naive implementation
    case. Furthermore, it leads to gradients with less variance.

    References
    ----------
    Kingma et al., "Variational Dropout and the Local Reparametrization Trick", 2015
    """
    if input.ndim > 2:
        # if the input has more than two dimensions, flatten it into a
        # batch of feature vectors.
        input = input.flatten(2)

    gamma = T.dot(input, self.mu) + self.b_mu.dimshuffle('x', 0)
    delta = T.dot(T.square(input), T.square(self.log_to_std(
        self.rho))) + T.square(self.log_to_std(self.b_rho)).dimshuffle('x', 0)
    epsilon = self._srng.normal(size=(self.num_units,), avg=0., std=1.,
                                dtype=theano.config.floatX)  # @UndefinedVariable

    activation = gamma + T.sqrt(delta) * epsilon

    return self.nonlinearity(activation)
def __init__(self, incoming, b=lasagne.init.Constant(0.), g=lasagne.init.Constant(1.),
             W=lasagne.init.Normal(0.05), train_g=False, init_stdv=1., nonlinearity=relu, **kwargs):
    super(WeightNormLayer, self).__init__(incoming, **kwargs)
    self.nonlinearity = nonlinearity
    self.init_stdv = init_stdv
    k = self.input_shape[1]
    if b is not None:
        self.b = self.add_param(b, (k,), name="b", regularizable=False)
    if g is not None:
        self.g = self.add_param(g, (k,), name="g", regularizable=False, trainable=train_g)
    if len(self.input_shape)==4:
        self.axes_to_sum = (0,2,3)
        self.dimshuffle_args = ['x',0,'x','x']
    else:
        self.axes_to_sum = 0
        self.dimshuffle_args = ['x',0]

    # scale weights in layer below
    incoming.W_param = incoming.W
    #incoming.W_param.set_value(W.sample(incoming.W_param.get_value().shape))
    if incoming.W_param.ndim==4:
        if isinstance(incoming, Deconv2DLayer):
            W_axes_to_sum = (0,2,3)
            W_dimshuffle_args = ['x',0,'x','x']
        else:
            W_axes_to_sum = (1,2,3)
            W_dimshuffle_args = [0,'x','x','x']
    else:
        W_axes_to_sum = 0
        W_dimshuffle_args = ['x',0]
    if g is not None:
        incoming.W = incoming.W_param * (self.g/T.sqrt(1e-6 + T.sum(T.square(incoming.W_param),axis=W_axes_to_sum))).dimshuffle(*W_dimshuffle_args)
    else:
        incoming.W = incoming.W_param / T.sqrt(1e-6 + T.sum(T.square(incoming.W_param),axis=W_axes_to_sum,keepdims=True))
def get_output_for(self, input, init=False, **kwargs):
    if init:
        m = T.mean(input, self.axes_to_sum)
        input -= m.dimshuffle(*self.dimshuffle_args)
        inv_stdv = self.init_stdv/T.sqrt(T.mean(T.square(input), self.axes_to_sum))
        input *= inv_stdv.dimshuffle(*self.dimshuffle_args)
        self.init_updates = [(self.b, -m*inv_stdv), (self.g, self.g*inv_stdv)]
    elif hasattr(self,'b'):
        input += self.b.dimshuffle(*self.dimshuffle_args)

    return self.nonlinearity(input)
def __init__(self, incoming, num_kernels, dim_per_kernel=5, theta=lasagne.init.Normal(0.05),
             log_weight_scale=lasagne.init.Constant(0.), b=lasagne.init.Constant(-1.), **kwargs):
    super(MinibatchLayer, self).__init__(incoming, **kwargs)
    self.num_kernels = num_kernels
    num_inputs = int(np.prod(self.input_shape[1:]))
    self.theta = self.add_param(theta, (num_inputs, num_kernels, dim_per_kernel), name="theta")
    self.log_weight_scale = self.add_param(log_weight_scale, (num_kernels, dim_per_kernel), name="log_weight_scale")
    self.W = self.theta * (T.exp(self.log_weight_scale)/T.sqrt(T.sum(T.square(self.theta),axis=0))).dimshuffle('x',0,1)
    self.b = self.add_param(b, (num_kernels,), name="b")
def __init__(self, incoming, num_units, theta=lasagne.init.Normal(0.1), b=lasagne.init.Constant(0.),
             weight_scale=lasagne.init.Constant(1.), train_scale=False, nonlinearity=relu, **kwargs):
    super(DenseLayer, self).__init__(incoming, **kwargs)
    self.nonlinearity = (lasagne.nonlinearities.identity if nonlinearity is None else nonlinearity)
    self.num_units = num_units
    num_inputs = int(np.prod(self.input_shape[1:]))
    self.theta = self.add_param(theta, (num_inputs, num_units), name="theta")
    self.weight_scale = self.add_param(weight_scale, (num_units,), name="weight_scale", trainable=train_scale)
    self.W = self.theta * (self.weight_scale/T.sqrt(T.sum(T.square(self.theta),axis=0))).dimshuffle('x',0)
    self.b = self.add_param(b, (num_units,), name="b")
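Since this DenseLayer is a drop-in Lasagne layer, it can presumably be stacked like any other layer and its symbolic output obtained with lasagne.layers.get_output(). The sketch below is illustrative only: the shapes and layer names are made up, and it assumes the surrounding class definitions from the same project (including the relu default nonlinearity) are importable.

import lasagne
import theano.tensor as T

x = T.matrix('x')
l_in = lasagne.layers.InputLayer(shape=(None, 784), input_var=x)
l_hid = DenseLayer(l_in, num_units=256)   # weight-normalized dense layer defined above
y = lasagne.layers.get_output(l_hid)      # symbolic output of shape (batch, 256)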