Python theano.tensor 模块,square() 实例源码
我们从Python开源项目中,提取了以下50个代码示例,用于说明如何使用theano.tensor.square()。
def rbf_kernel(X0):
XY = T.dot(X0, X0.transpose())
x2 = T.reshape(T.sum(T.square(X0), axis=1), (X0.shape[0], 1))
X2e = T.repeat(x2, X0.shape[0], axis=1)
H = T.sub(T.add(X2e, X2e.transpose()), 2 * XY)
V = H.flatten()
# median distance
h = T.switch(T.eq((V.shape[0] % 2), 0),
# if even vector
T.mean(T.sort(V)[ ((V.shape[0] // 2) - 1) : ((V.shape[0] // 2) + 1) ]),
# if odd vector
T.sort(V)[V.shape[0] // 2])
h = T.sqrt(0.5 * h / T.log(X0.shape[0].astype('float32') + 1.0)) / 2.
Kxy = T.exp(-H / h ** 2 / 2.0)
neighbors = T.argsort(H, axis=1)[:, 1]
return Kxy, neighbors, h
def adam_updates(params, cost, lr=0.001, mom1=0.9, mom2=0.999):
updates = []
grads = T.grad(cost, params)
t = th.shared(np.cast[th.config.floatX](1.))
for p, g in zip(params, grads):
v = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
mg = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
v_t = mom1*v + (1. - mom1)*g
mg_t = mom2*mg + (1. - mom2)*T.square(g)
v_hat = v_t / (1. - mom1 ** t)
mg_hat = mg_t / (1. - mom2 ** t)
g_t = v_hat / T.sqrt(mg_hat + 1e-8)
p_t = p - lr * g_t
updates.append((v, v_t))
updates.append((mg, mg_t))
updates.append((p, p_t))
updates.append((t, t+1))
return updates
def adam_updates(params, cost, lr=0.001, mom1=0.9, mom2=0.999):
updates = []
grads = T.grad(cost, params)
t = th.shared(np.cast[th.config.floatX](1.))
for p, g in zip(params, grads):
v = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
mg = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
v_t = mom1*v + (1. - mom1)*g
mg_t = mom2*mg + (1. - mom2)*T.square(g)
v_hat = v_t / (1. - mom1 ** t)
mg_hat = mg_t / (1. - mom2 ** t)
g_t = v_hat / T.sqrt(mg_hat + 1e-8)
p_t = p - lr * g_t
updates.append((v, v_t))
updates.append((mg, mg_t))
updates.append((p, p_t))
updates.append((t, t+1))
return updates
def get_output_for(self, input, deterministic=False, **kwargs):
if deterministic:
norm_features = (input-self.avg_batch_mean.dimshuffle(*self.dimshuffle_args)) / T.sqrt(1e-6 + self.avg_batch_var).dimshuffle(*self.dimshuffle_args)
else:
batch_mean = T.mean(input,axis=self.axes_to_sum).flatten()
centered_input = input-batch_mean.dimshuffle(*self.dimshuffle_args)
batch_var = T.mean(T.square(centered_input),axis=self.axes_to_sum).flatten()
batch_stdv = T.sqrt(1e-6 + batch_var)
norm_features = centered_input / batch_stdv.dimshuffle(*self.dimshuffle_args)
# BN updates
new_m = 0.9*self.avg_batch_mean + 0.1*batch_mean
new_v = 0.9*self.avg_batch_var + T.cast((0.1*input.shape[0])/(input.shape[0]-1),th.config.floatX)*batch_var
self.bn_updates = [(self.avg_batch_mean, new_m), (self.avg_batch_var, new_v)]
if hasattr(self, 'g'):
activation = norm_features*self.g.dimshuffle(*self.dimshuffle_args)
else:
activation = norm_features
if hasattr(self, 'b'):
activation += self.b.dimshuffle(*self.dimshuffle_args)
return self.nonlinearity(activation)
def get_output_for(self, input, init=False, deterministic=False, **kwargs):
if input.ndim > 2:
# if the input has more than two dimensions, flatten it into a
# batch of feature vectors.
input = input.flatten(2)
activation = T.dot(input, self.W)
if init:
ma = T.mean(activation, axis=0)
activation -= ma.dimshuffle('x',0)
stdv = T.sqrt(T.mean(T.square(activation),axis=0))
activation /= stdv.dimshuffle('x',0)
self.init_updates = [(self.weight_scale, self.weight_scale/stdv), (self.b, -ma/stdv)]
else:
activation += self.b.dimshuffle('x', 0)
return self.nonlinearity(activation)
def adam_updates(params, cost, lr=0.001, mom1=0.9, mom2=0.999):
updates = []
grads = T.grad(cost, params)
t = th.shared(np.cast[th.config.floatX](1.))
for p, g in zip(params, grads):
v = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
mg = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
v_t = mom1*v + (1. - mom1)*g
mg_t = mom2*mg + (1. - mom2)*T.square(g)
v_hat = v_t / (1. - mom1 ** t)
mg_hat = mg_t / (1. - mom2 ** t)
g_t = v_hat / T.sqrt(mg_hat + 1e-8)
p_t = p - lr * g_t
updates.append((v, v_t))
updates.append((mg, mg_t))
updates.append((p, p_t))
updates.append((t, t+1))
return updates
def get_output_for(self, input, deterministic=False, **kwargs):
if deterministic:
norm_features = (input-self.avg_batch_mean.dimshuffle(*self.dimshuffle_args)) / T.sqrt(1e-6 + self.avg_batch_var).dimshuffle(*self.dimshuffle_args)
else:
batch_mean = T.mean(input,axis=self.axes_to_sum).flatten()
centered_input = input-batch_mean.dimshuffle(*self.dimshuffle_args)
batch_var = T.mean(T.square(centered_input),axis=self.axes_to_sum).flatten()
batch_stdv = T.sqrt(1e-6 + batch_var)
norm_features = centered_input / batch_stdv.dimshuffle(*self.dimshuffle_args)
# BN updates
new_m = 0.9*self.avg_batch_mean + 0.1*batch_mean
new_v = 0.9*self.avg_batch_var + T.cast((0.1*input.shape[0])/(input.shape[0]-1),th.config.floatX)*batch_var
self.bn_updates = [(self.avg_batch_mean, new_m), (self.avg_batch_var, new_v)]
if hasattr(self, 'g'):
activation = norm_features*self.g.dimshuffle(*self.dimshuffle_args)
else:
activation = norm_features
if hasattr(self, 'b'):
activation += self.b.dimshuffle(*self.dimshuffle_args)
return self.nonlinearity(activation)
def l2normalize(layer, train_scale=True):
W_param = layer.W
s = W_param.get_value().shape
if len(s)==4:
axes_to_sum = (1,2,3)
dimshuffle_args = [0,'x','x','x']
k = s[0]
else:
axes_to_sum = 0
dimshuffle_args = ['x',0]
k = s[1]
layer.W_scale = layer.add_param(lasagne.init.Constant(1.),
(k,), name="W_scale", trainable=train_scale, regularizable=False)
layer.W = W_param * (layer.W_scale/T.sqrt(1e-6 + T.sum(T.square(W_param),axis=axes_to_sum))).dimshuffle(*dimshuffle_args)
return layer
# fully connected layer with weight normalization
def get_output_for(self, input, init=False, deterministic=False, **kwargs):
if input.ndim > 2:
# if the input has more than two dimensions, flatten it into a
# batch of feature vectors.
input = input.flatten(2)
activation = T.dot(input, self.W)
if init:
ma = T.mean(activation, axis=0)
activation -= ma.dimshuffle('x',0)
stdv = T.sqrt(T.mean(T.square(activation),axis=0))
activation /= stdv.dimshuffle('x',0)
self.init_updates = [(self.weight_scale, self.weight_scale/stdv), (self.b, -ma/stdv)]
else:
activation += self.b.dimshuffle('x', 0)
return self.nonlinearity(activation)
def kl_sym(self, old_dist_info_vars, new_dist_info_vars):
old_means = old_dist_info_vars["mean"]
old_log_stds = old_dist_info_vars["log_std"]
new_means = new_dist_info_vars["mean"]
new_log_stds = new_dist_info_vars["log_std"]
"""
Compute the KL divergence of two multivariate Gaussian distribution with
diagonal covariance matrices
"""
old_std = TT.exp(old_log_stds)
new_std = TT.exp(new_log_stds)
# means: (N*A)
# std: (N*A)
# formula:
# { (\mu_1 - \mu_2)^2 + \sigma_1^2 - \sigma_2^2 } / (2\sigma_2^2) +
# ln(\sigma_2/\sigma_1)
numerator = TT.square(old_means - new_means) + \
TT.square(old_std) - TT.square(new_std)
denominator = 2 * TT.square(new_std) + 1e-8
return TT.sum(
numerator / denominator + new_log_stds - old_log_stds, axis=-1)
def kl(self, old_dist_info, new_dist_info):
old_means = old_dist_info["mean"]
old_log_stds = old_dist_info["log_std"]
new_means = new_dist_info["mean"]
new_log_stds = new_dist_info["log_std"]
"""
Compute the KL divergence of two multivariate Gaussian distribution with
diagonal covariance matrices
"""
old_std = np.exp(old_log_stds)
new_std = np.exp(new_log_stds)
# means: (N*A)
# std: (N*A)
# formula:
# { (\mu_1 - \mu_2)^2 + \sigma_1^2 - \sigma_2^2 } / (2\sigma_2^2) +
# ln(\sigma_2/\sigma_1)
numerator = np.square(old_means - new_means) + \
np.square(old_std) - np.square(new_std)
denominator = 2 * np.square(new_std) + 1e-8
return np.sum(
numerator / denominator + new_log_stds - old_log_stds, axis=-1)
def rbf_kernel(X0):
XY = T.dot(X0, X0.transpose())
x2 = T.reshape(T.sum(T.square(X0), axis=1), (X0.shape[0], 1))
X2e = T.repeat(x2, X0.shape[0], axis=1)
H = T.sub(T.add(X2e, X2e.transpose()), 2 * XY)
V = H.flatten()
# median distance
h = T.switch(T.eq((V.shape[0] % 2), 0),
# if even vector
T.mean(T.sort(V)[ ((V.shape[0] // 2) - 1) : ((V.shape[0] // 2) + 1) ]),
# if odd vector
T.sort(V)[V.shape[0] // 2])
h = T.sqrt(0.5 * h / T.log(X0.shape[0].astype('float32') + 1.0)) / 2.
Kxy = T.exp(-H / h ** 2 / 2.0)
neighbors = T.argsort(H, axis=1)[:, 1]
return Kxy, neighbors, h
def kl_sym(self, old_dist_info_vars, new_dist_info_vars):
old_means = old_dist_info_vars["mean"]
old_log_stds = old_dist_info_vars["log_std"]
new_means = new_dist_info_vars["mean"]
new_log_stds = new_dist_info_vars["log_std"]
"""
Compute the KL divergence of two multivariate Gaussian distribution with
diagonal covariance matrices
"""
old_std = TT.exp(old_log_stds)
new_std = TT.exp(new_log_stds)
# means: (N*A)
# std: (N*A)
# formula:
# { (\mu_1 - \mu_2)^2 + \sigma_1^2 - \sigma_2^2 } / (2\sigma_2^2) +
# ln(\sigma_2/\sigma_1)
numerator = TT.square(old_means - new_means) + \
TT.square(old_std) - TT.square(new_std)
denominator = 2 * TT.square(new_std) + 1e-8
return TT.sum(
numerator / denominator + new_log_stds - old_log_stds, axis=-1)
def kl(self, old_dist_info, new_dist_info):
old_means = old_dist_info["mean"]
old_log_stds = old_dist_info["log_std"]
new_means = new_dist_info["mean"]
new_log_stds = new_dist_info["log_std"]
"""
Compute the KL divergence of two multivariate Gaussian distribution with
diagonal covariance matrices
"""
old_std = np.exp(old_log_stds)
new_std = np.exp(new_log_stds)
# means: (N*A)
# std: (N*A)
# formula:
# { (\mu_1 - \mu_2)^2 + \sigma_1^2 - \sigma_2^2 } / (2\sigma_2^2) +
# ln(\sigma_2/\sigma_1)
numerator = np.square(old_means - new_means) + \
np.square(old_std) - np.square(new_std)
denominator = 2 * np.square(new_std) + 1e-8
return np.sum(
numerator / denominator + new_log_stds - old_log_stds, axis=-1)
def get_output_for(self, input, deterministic=False, **kwargs):
if deterministic:
norm_features = (input-self.avg_batch_mean.dimshuffle(*self.dimshuffle_args)) / T.sqrt(1e-6 + self.avg_batch_var).dimshuffle(*self.dimshuffle_args)
else:
batch_mean = T.mean(input,axis=self.axes_to_sum).flatten()
centered_input = input-batch_mean.dimshuffle(*self.dimshuffle_args)
batch_var = T.mean(T.square(centered_input),axis=self.axes_to_sum).flatten()
batch_stdv = T.sqrt(1e-6 + batch_var)
norm_features = centered_input / batch_stdv.dimshuffle(*self.dimshuffle_args)
# BN updates
new_m = 0.9*self.avg_batch_mean + 0.1*batch_mean
new_v = 0.9*self.avg_batch_var + T.cast((0.1*input.shape[0])/(input.shape[0]-1),th.config.floatX)*batch_var
self.bn_updates = [(self.avg_batch_mean, new_m), (self.avg_batch_var, new_v)]
if hasattr(self, 'g'):
activation = norm_features*self.g.dimshuffle(*self.dimshuffle_args)
else:
activation = norm_features
if hasattr(self, 'b'):
activation += self.b.dimshuffle(*self.dimshuffle_args)
return self.nonlinearity(activation)
def adam_updates(params, cost, lr=0.001, mom1=0.9, mom2=0.999):
updates = []
grads = T.grad(cost, params)
t = th.shared(np.cast[th.config.floatX](1.))
for p, g in zip(params, grads):
v = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
mg = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
v_t = mom1*v + (1. - mom1)*g
mg_t = mom2*mg + (1. - mom2)*T.square(g)
v_hat = v_t / (1. - mom1 ** t)
mg_hat = mg_t / (1. - mom2 ** t)
g_t = v_hat / T.sqrt(mg_hat + 1e-8)
p_t = p - lr * g_t
updates.append((v, v_t))
updates.append((mg, mg_t))
updates.append((p, p_t))
updates.append((t, t+1))
return updates
def get_output_for(self, input, deterministic=False, **kwargs):
if deterministic:
norm_features = (input-self.avg_batch_mean.dimshuffle(*self.dimshuffle_args)) / T.sqrt(1e-6 + self.avg_batch_var).dimshuffle(*self.dimshuffle_args)
else:
batch_mean = T.mean(input,axis=self.axes_to_sum).flatten()
centered_input = input-batch_mean.dimshuffle(*self.dimshuffle_args)
batch_var = T.mean(T.square(centered_input),axis=self.axes_to_sum).flatten()
batch_stdv = T.sqrt(1e-6 + batch_var)
norm_features = centered_input / batch_stdv.dimshuffle(*self.dimshuffle_args)
# BN updates
new_m = 0.9*self.avg_batch_mean + 0.1*batch_mean
new_v = 0.9*self.avg_batch_var + T.cast((0.1*input.shape[0])/(input.shape[0]-1),th.config.floatX)*batch_var
self.bn_updates = [(self.avg_batch_mean, new_m), (self.avg_batch_var, new_v)]
if hasattr(self, 'g'):
activation = norm_features*self.g.dimshuffle(*self.dimshuffle_args)
else:
activation = norm_features
if hasattr(self, 'b'):
activation += self.b.dimshuffle(*self.dimshuffle_args)
return self.nonlinearity(activation)
def get_output_for(self, input, init=False, deterministic=False, **kwargs):
if input.ndim > 2:
# if the input has more than two dimensions, flatten it into a
# batch of feature vectors.
input = input.flatten(2)
activation = T.dot(input, self.W)
if init:
ma = T.mean(activation, axis=0)
activation -= ma.dimshuffle('x',0)
stdv = T.sqrt(T.mean(T.square(activation),axis=0))
activation /= stdv.dimshuffle('x',0)
self.init_updates = [(self.weight_scale, self.weight_scale/stdv), (self.b, -ma/stdv)]
else:
activation += self.b.dimshuffle('x', 0)
return self.nonlinearity(activation)
def get_output(self, input_):
"""
This function overrides the parents' one.
Creates symbolic function to compute output from an input.
Parameters
----------
input_: TensorVariable
Returns
-------
TensorVariable
"""
mu = input_[:, :self.output_shape[0]]
logvar = input_[:, self.output_shape[0]:]
return 0.5 * T.mean(T.square(mu) + T.exp(logvar) - logvar - 1)
def get_output(self, input1_, input2_):
"""
This function overrides the parents' one.
Creates symbolic function to compute output from an input.
http://stats.stackexchange.com/questions/66271/kullback-leibler-divergence-of-two-normal-distributions
Parameters
----------
input_: TensorVariable
Returns
-------
TensorVariable
"""
mu1 = input1_[:, :self.output_shape[0]]
logvar1 = input1_[:, self.output_shape[0]:]
mu2 = input2_[:, :self.output_shape[0]]
logvar2 = input2_[:, self.output_shape[0]:]
return 0.5 * T.mean((T.square(mu1 - mu2) + T.exp(logvar1) + T.exp(logvar2)) * (1 / T.exp(logvar1) + 1 / T.exp(logvar2))) - 2
# TODO: Fix L1, L2 to work!
def adam_updates(params, cost, lr=0.001, mom1=0.9, mom2=0.999):
updates = []
grads = T.grad(cost, params)
t = th.shared(np.cast[th.config.floatX](1.))
for p, g in zip(params, grads):
v = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
mg = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
v_t = mom1*v + (1. - mom1)*g
mg_t = mom2*mg + (1. - mom2)*T.square(g)
v_hat = v_t / (1. - mom1 ** t)
mg_hat = mg_t / (1. - mom2 ** t)
g_t = v_hat / T.sqrt(mg_hat + 1e-8)
p_t = p - lr * g_t
updates.append((v, v_t))
updates.append((mg, mg_t))
updates.append((p, p_t))
updates.append((t, t+1))
return updates
def get_output_for(self, input, deterministic=False, **kwargs):
if deterministic:
norm_features = (input-self.avg_batch_mean.dimshuffle(*self.dimshuffle_args)) / T.sqrt(1e-6 + self.avg_batch_var).dimshuffle(*self.dimshuffle_args)
else:
batch_mean = T.mean(input,axis=self.axes_to_sum).flatten()
centered_input = input-batch_mean.dimshuffle(*self.dimshuffle_args)
batch_var = T.mean(T.square(centered_input),axis=self.axes_to_sum).flatten()
batch_stdv = T.sqrt(1e-6 + batch_var)
norm_features = centered_input / batch_stdv.dimshuffle(*self.dimshuffle_args)
# BN updates
new_m = 0.9*self.avg_batch_mean + 0.1*batch_mean
new_v = 0.9*self.avg_batch_var + T.cast((0.1*input.shape[0])/(input.shape[0]-1.), th.config.floatX)*batch_var
self.bn_updates = [(self.avg_batch_mean, new_m), (self.avg_batch_var, new_v)]
if hasattr(self, 'g'):
activation = norm_features*self.g.dimshuffle(*self.dimshuffle_args)
else:
activation = norm_features
if hasattr(self, 'b'):
activation += self.b.dimshuffle(*self.dimshuffle_args)
return self.nonlinearity(activation)
def adam_updates(params, cost, lr=0.001, mom1=0.9, mom2=0.999):
updates = []
grads = T.grad(cost, params)
t = th.shared(np.cast[th.config.floatX](1.))
for p, g in zip(params, grads):
v = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
mg = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
v_t = mom1*v + (1. - mom1)*g
mg_t = mom2*mg + (1. - mom2)*T.square(g)
v_hat = v_t / (1. - mom1 ** t)
mg_hat = mg_t / (1. - mom2 ** t)
g_t = v_hat / T.sqrt(mg_hat + 1e-8)
p_t = p - lr * g_t
updates.append((v, v_t))
updates.append((mg, mg_t))
updates.append((p, p_t))
updates.append((t, t+1))
return updates
def adam_conditional_updates(params, cost, mincost, lr=0.001, mom1=0.9, mom2=0.999): # if cost is less than mincost, don't do update
updates = []
grads = T.grad(cost, params)
t = th.shared(np.cast[th.config.floatX](1.))
for p, g in zip(params, grads):
v = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
mg = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
v_t = mom1*v + (1. - mom1)*g
mg_t = mom2*mg + (1. - mom2)*T.square(g)
v_hat = v_t / (1. - mom1 ** t)
mg_hat = mg_t / (1. - mom2 ** t)
g_t = v_hat / T.sqrt(mg_hat + 1e-8)
p_t = p - lr * g_t
updates.append((v, ifelse(cost<mincost,v,v_t)))
updates.append((mg, ifelse(cost<mincost,mg,mg_t)))
updates.append((p, ifelse(cost<mincost,p,p_t)))
updates.append((t, ifelse(cost<mincost,t,t+1)))
return updates
def get_output_for(self, input, deterministic=False, **kwargs):
if deterministic:
norm_features = (input-self.avg_batch_mean.dimshuffle(*self.dimshuffle_args)) / T.sqrt(1e-6 + self.avg_batch_var).dimshuffle(*self.dimshuffle_args)
else:
batch_mean = T.mean(input,axis=self.axes_to_sum).flatten()
centered_input = input-batch_mean.dimshuffle(*self.dimshuffle_args)
batch_var = T.mean(T.square(centered_input),axis=self.axes_to_sum).flatten()
batch_stdv = T.sqrt(1e-6 + batch_var)
norm_features = centered_input / batch_stdv.dimshuffle(*self.dimshuffle_args)
# BN updates
new_m = 0.9*self.avg_batch_mean + 0.1*batch_mean
new_v = 0.9*self.avg_batch_var + T.cast((0.1*input.shape[0])/(input.shape[0]-1),th.config.floatX)*batch_var
self.bn_updates = [(self.avg_batch_mean, new_m), (self.avg_batch_var, new_v)]
if hasattr(self, 'g'):
activation = norm_features*self.g.dimshuffle(*self.dimshuffle_args)
else:
activation = norm_features
if hasattr(self, 'b'):
activation += self.b.dimshuffle(*self.dimshuffle_args)
if self.nonlinearity is not None:
return self.nonlinearity(activation)
else:
return activation
def adam_updates(params, cost, lr=0.001, mom1=0.9, mom2=0.999):
updates = []
grads = T.grad(cost, params)
t = th.shared(np.cast[th.config.floatX](1.))
for p, g in zip(params, grads):
v = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
mg = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
v_t = mom1*v + (1. - mom1)*g
mg_t = mom2*mg + (1. - mom2)*T.square(g)
v_hat = v_t / (1. - mom1 ** t)
mg_hat = mg_t / (1. - mom2 ** t)
g_t = v_hat / T.sqrt(mg_hat + 1e-8)
p_t = p - lr * g_t
updates.append((v, v_t))
updates.append((mg, mg_t))
updates.append((p, p_t))
updates.append((t, t+1))
return updates
def get_output_for(self, input, deterministic=False, set_bn_updates=True, **kwargs):
if deterministic:
norm_features = (input-self.avg_batch_mean.dimshuffle(*self.dimshuffle_args)) / T.sqrt(1e-6 + self.avg_batch_var).dimshuffle(*self.dimshuffle_args)
else:
batch_mean = T.mean(input,axis=self.axes_to_sum).flatten()
centered_input = input-batch_mean.dimshuffle(*self.dimshuffle_args)
batch_var = T.mean(T.square(centered_input),axis=self.axes_to_sum).flatten()
batch_stdv = T.sqrt(1e-6 + batch_var)
norm_features = centered_input / batch_stdv.dimshuffle(*self.dimshuffle_args)
# BN updates
if set_bn_updates:
new_m = 0.9*self.avg_batch_mean + 0.1*batch_mean
new_v = 0.9*self.avg_batch_var + T.cast((0.1*input.shape[0])/(input.shape[0]-1),th.config.floatX)*batch_var
self.bn_updates = [(self.avg_batch_mean, new_m), (self.avg_batch_var, new_v)]
if hasattr(self, 'g'):
activation = norm_features*self.g.dimshuffle(*self.dimshuffle_args)
else:
activation = norm_features
if hasattr(self, 'b'):
activation += self.b.dimshuffle(*self.dimshuffle_args)
return self.nonlinearity(activation)
def get_output_for(self, input, init=False, deterministic=False, **kwargs):
if input.ndim > 2:
# if the input has more than two dimensions, flatten it into a
# batch of feature vectors.
input = input.flatten(2)
activation = T.dot(input, self.W)
if init:
ma = T.mean(activation, axis=0)
activation -= ma.dimshuffle('x',0)
stdv = T.sqrt(T.mean(T.square(activation),axis=0))
activation /= stdv.dimshuffle('x',0)
self.init_updates = [(self.weight_scale, self.weight_scale/stdv), (self.b, -ma/stdv)]
else:
activation += self.b.dimshuffle('x', 0)
return self.nonlinearity(activation)
# comes from Ishamel code base
def __init__(self, dim, eps=1e-6, init_count=0, init_mean=0., init_meansq=1.):
'''
Args:
dim: dimension of the space of points to be standardized
eps: small constant to add to denominators to prevent division by 0
init_count, init_mean, init_meansq: initial values for accumulators
Note:
if init_count is 0, then init_mean and init_meansq have no effect beyond
the first call to update(), which will ignore their values and
replace them with values from a new batch of data.
'''
self._eps = eps
self._dim = dim
with variable_scope(type(self).__name__) as self.__varscope:
self._count = get_variable('count', np.array(float(init_count)), trainable=False)
self._mean_1_D = get_variable('mean_1_D', np.full((1, self._dim), init_mean), broadcastable=(True,False), trainable=False)
self._meansq_1_D = get_variable('meansq_1_D', np.full((1, self._dim), init_meansq), broadcastable=(True,False), trainable=False)
self._stdev_1_D = tensor.sqrt(tensor.nnet.relu(self._meansq_1_D - tensor.square(self._mean_1_D)))
# Relu ensures inside is nonnegative. maybe the better choice would have been to
# add self._eps inside the square root, but I'm keeping things this way to preserve
# backwards compatibility with existing saved models.
self.get_mean = self._mean_1_D.get_value
self.get_stdev = theano.function([], self._stdev_1_D[0,:]) # TODO: return with shape (1,D)
def adam(cost, params, lr, beta1=0.9, beta2=0.999, eps=1e-8):
updates = []
grads = tensor.grad(cost, params); assert len(params) == len(grads)
t0 = theano.shared(np.array(0., dtype=theano.config.floatX))
t = t0 + 1
corr1 = (1 - beta1**t)
corr2 = (1 - beta2**t)
alpha = lr * tensor.sqrt(corr2) / corr1
for p, g in zip(params, grads):
m = theano.shared(value=np.zeros(p.get_value().shape, dtype=theano.config.floatX), broadcastable=p.broadcastable)
v = theano.shared(value=np.zeros(p.get_value().shape, dtype=theano.config.floatX), broadcastable=p.broadcastable)
m_t = beta1 * m + (1 - beta1) * g
v_t = beta2 * v + (1 - beta2) * tensor.square(g)
p_t = p - alpha * m_t/(tensor.sqrt(v_t) + eps)
updates.append((m, m_t))
updates.append((v, v_t))
updates.append((p, p_t))
updates.append((t0, t))
return updates
def kl_sym(self, old_dist_info_vars, new_dist_info_vars):
old_means = old_dist_info_vars["mean"]
old_log_stds = old_dist_info_vars["log_std"]
new_means = new_dist_info_vars["mean"]
new_log_stds = new_dist_info_vars["log_std"]
"""
Compute the KL divergence of two multivariate Gaussian distribution with
diagonal covariance matrices
"""
old_std = TT.exp(old_log_stds)
new_std = TT.exp(new_log_stds)
# means: (N*A)
# std: (N*A)
# formula:
# { (\mu_1 - \mu_2)^2 + \sigma_1^2 - \sigma_2^2 } / (2\sigma_2^2) +
# ln(\sigma_2/\sigma_1)
numerator = TT.square(old_means - new_means) + \
TT.square(old_std) - TT.square(new_std)
denominator = 2 * TT.square(new_std) + 1e-8
return TT.sum(
numerator / denominator + new_log_stds - old_log_stds, axis=-1)
def kl(self, old_dist_info, new_dist_info):
old_means = old_dist_info["mean"]
old_log_stds = old_dist_info["log_std"]
new_means = new_dist_info["mean"]
new_log_stds = new_dist_info["log_std"]
"""
Compute the KL divergence of two multivariate Gaussian distribution with
diagonal covariance matrices
"""
old_std = np.exp(old_log_stds)
new_std = np.exp(new_log_stds)
# means: (N*A)
# std: (N*A)
# formula:
# { (\mu_1 - \mu_2)^2 + \sigma_1^2 - \sigma_2^2 } / (2\sigma_2^2) +
# ln(\sigma_2/\sigma_1)
numerator = np.square(old_means - new_means) + \
np.square(old_std) - np.square(new_std)
denominator = 2 * np.square(new_std) + 1e-8
return np.sum(
numerator / denominator + new_log_stds - old_log_stds, axis=-1)
def adam_updates(params, cost, lr=0.001, mom1=0.9, mom2=0.999):
updates = []
grads = T.grad(cost, params)
t = th.shared(np.cast[th.config.floatX](1.))
for p, g in zip(params, grads):
v = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
mg = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
v_t = mom1*v + (1. - mom1)*g
mg_t = mom2*mg + (1. - mom2)*T.square(g)
v_hat = v_t / (1. - mom1 ** t)
mg_hat = mg_t / (1. - mom2 ** t)
g_t = v_hat / T.sqrt(mg_hat + 1e-8)
p_t = p - lr * g_t
updates.append((v, v_t))
updates.append((mg, mg_t))
updates.append((p, p_t))
updates.append((t, t+1))
return updates
def get_output_for(self, input, deterministic=False, set_bn_updates=True, **kwargs):
if deterministic:
norm_features = (input-self.avg_batch_mean.dimshuffle(*self.dimshuffle_args)) / T.sqrt(1e-6 + self.avg_batch_var).dimshuffle(*self.dimshuffle_args)
else:
batch_mean = T.mean(input,axis=self.axes_to_sum).flatten()
centered_input = input-batch_mean.dimshuffle(*self.dimshuffle_args)
batch_var = T.mean(T.square(centered_input),axis=self.axes_to_sum).flatten()
batch_stdv = T.sqrt(1e-6 + batch_var)
norm_features = centered_input / batch_stdv.dimshuffle(*self.dimshuffle_args)
# BN updates
if set_bn_updates:
new_m = 0.9*self.avg_batch_mean + 0.1*batch_mean
new_v = 0.9*self.avg_batch_var + T.cast((0.1*input.shape[0])/(input.shape[0]-1),th.config.floatX)*batch_var
self.bn_updates = [(self.avg_batch_mean, new_m), (self.avg_batch_var, new_v)]
if hasattr(self, 'g'):
activation = norm_features*self.g.dimshuffle(*self.dimshuffle_args)
else:
activation = norm_features
if hasattr(self, 'b'):
activation += self.b.dimshuffle(*self.dimshuffle_args)
return self.nonlinearity(activation)
def get_output_for(self, input, init=False, deterministic=False, **kwargs):
if input.ndim > 2:
# if the input has more than two dimensions, flatten it into a
# batch of feature vectors.
input = input.flatten(2)
activation = T.dot(input, self.W)
if init:
ma = T.mean(activation, axis=0)
activation -= ma.dimshuffle('x',0)
stdv = T.sqrt(T.mean(T.square(activation),axis=0))
activation /= stdv.dimshuffle('x',0)
self.init_updates = [(self.weight_scale, self.weight_scale/stdv), (self.b, -ma/stdv)]
else:
activation += self.b.dimshuffle('x', 0)
return self.nonlinearity(activation)
# comes from Ishamel code base
def adam_updates(params, cost, lr=0.001, mom1=0.9, mom2=0.999):
updates = []
grads = T.grad(cost, params)
t = th.shared(np.cast[th.config.floatX](1.))
for p, g in zip(params, grads):
value = p.get_value(borrow=True)
v = th.shared(np.zeros(value.shape, dtype=value.dtype),
broadcastable=p.broadcastable)
mg = th.shared(np.zeros(value.shape, dtype=value.dtype),
broadcastable=p.broadcastable)
v_t = mom1*v + (1. - mom1)*g
mg_t = mom2*mg + (1. - mom2)*T.square(g)
v_hat = v_t / (1. - mom1 ** t)
mg_hat = mg_t / (1. - mom2 ** t)
g_t = v_hat / T.sqrt(mg_hat + 1e-8)
p_t = p - lr * g_t
updates.append((v, v_t))
updates.append((mg, mg_t))
updates.append((p, p_t))
updates.append((t, t+1))
return updates
def get_output_for(self, input, deterministic=False, **kwargs):
if deterministic:
norm_features = (input-self.avg_batch_mean.dimshuffle(*self.dimshuffle_args)) / T.sqrt(1e-6 + self.avg_batch_var).dimshuffle(*self.dimshuffle_args)
else:
batch_mean = T.mean(input,axis=self.axes_to_sum).flatten()
centered_input = input-batch_mean.dimshuffle(*self.dimshuffle_args)
batch_var = T.mean(T.square(centered_input),axis=self.axes_to_sum).flatten()
batch_stdv = T.sqrt(1e-6 + batch_var)
norm_features = centered_input / batch_stdv.dimshuffle(*self.dimshuffle_args)
# BN updates
new_m = 0.9*self.avg_batch_mean + 0.1*batch_mean
new_v = 0.9*self.avg_batch_var + T.cast((0.1*input.shape[0])/(input.shape[0]-1),th.config.floatX)*batch_var
self.bn_updates = [(self.avg_batch_mean, new_m), (self.avg_batch_var, new_v)]
if hasattr(self, 'g'):
activation = norm_features*self.g.dimshuffle(*self.dimshuffle_args)
else:
activation = norm_features
if hasattr(self, 'b'):
activation += self.b.dimshuffle(*self.dimshuffle_args)
return self.nonlinearity(activation)
def get_output_for(self, input, init=False, deterministic=False, **kwargs):
if input.ndim > 2:
# if the input has more than two dimensions, flatten it into a
# batch of feature vectors.
input = input.flatten(2)
activation = T.dot(input, self.W)
if init:
ma = T.mean(activation, axis=0)
activation -= ma.dimshuffle('x',0)
stdv = T.sqrt(T.mean(T.square(activation),axis=0))
activation /= stdv.dimshuffle('x',0)
self.init_updates = [(self.weight_scale, self.weight_scale/stdv), (self.b, -ma/stdv)]
else:
activation += self.b.dimshuffle('x', 0)
return self.nonlinearity(activation)
def adam_updates(params, cost, lr=0.001, mom1=0.9, mom2=0.999):
updates = []
grads = T.grad(cost, params)
t = th.shared(np.cast[th.config.floatX](1.))
for p, g in zip(params, grads):
v = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
mg = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
v_t = mom1*v + (1. - mom1)*g
mg_t = mom2*mg + (1. - mom2)*T.square(g)
v_hat = v_t / (1. - mom1 ** t)
mg_hat = mg_t / (1. - mom2 ** t)
g_t = v_hat / T.sqrt(mg_hat + 1e-8)
p_t = p - lr * g_t
updates.append((v, v_t))
updates.append((mg, mg_t))
updates.append((p, p_t))
updates.append((t, t+1))
return updates
def get_output_for(self, input, deterministic=False, **kwargs):
if deterministic:
norm_features = (input-self.avg_batch_mean.dimshuffle(*self.dimshuffle_args)) / T.sqrt(1e-6 + self.avg_batch_var).dimshuffle(*self.dimshuffle_args)
else:
batch_mean = T.mean(input,axis=self.axes_to_sum).flatten()
centered_input = input-batch_mean.dimshuffle(*self.dimshuffle_args)
batch_var = T.mean(T.square(centered_input),axis=self.axes_to_sum).flatten()
batch_stdv = T.sqrt(1e-6 + batch_var)
norm_features = centered_input / batch_stdv.dimshuffle(*self.dimshuffle_args)
# BN updates
new_m = 0.9*self.avg_batch_mean + 0.1*batch_mean
new_v = 0.9*self.avg_batch_var + T.cast((0.1*input.shape[0])/(input.shape[0]-1),th.config.floatX)*batch_var
self.bn_updates = [(self.avg_batch_mean, new_m), (self.avg_batch_var, new_v)]
if hasattr(self, 'g'):
activation = norm_features*self.g.dimshuffle(*self.dimshuffle_args)
else:
activation = norm_features
if hasattr(self, 'b'):
activation += self.b.dimshuffle(*self.dimshuffle_args)
return self.nonlinearity(activation)
def get_output_for(self, input, init=False, deterministic=False, **kwargs):
if input.ndim > 2:
# if the input has more than two dimensions, flatten it into a
# batch of feature vectors.
input = input.flatten(2)
activation = T.dot(input, self.W)
if init:
ma = T.mean(activation, axis=0)
activation -= ma.dimshuffle('x',0)
stdv = T.sqrt(T.mean(T.square(activation),axis=0))
activation /= stdv.dimshuffle('x',0)
self.init_updates = [(self.weight_scale, self.weight_scale/stdv), (self.b, -ma/stdv)]
else:
activation += self.b.dimshuffle('x', 0)
return self.nonlinearity(activation)
def get_output_for_reparametrization(self, input, **kwargs):
"""Implementation of the local reparametrization trick.
This essentially leads to a speedup compared to the naive implementation case.
Furthermore, it leads to gradients with less variance.
References
----------
Kingma et al., "Variational Dropout and the Local Reparametrization Trick", 2015
"""
if input.ndim > 2:
# if the input has more than two dimensions, flatten it into a
# batch of feature vectors.
input = input.flatten(2)
gamma = T.dot(input, self.mu) + self.b_mu.dimshuffle('x', 0)
delta = T.dot(T.square(input), T.square(self.log_to_std(
self.rho))) + T.square(self.log_to_std(self.b_rho)).dimshuffle('x', 0)
epsilon = self._srng.normal(size=(self.num_units,), avg=0., std=1.,
dtype=theano.config.floatX) # @UndefinedVariable
activation = gamma + T.sqrt(delta) * epsilon
return self.nonlinearity(activation)
def kl_sym(self, old_dist_info_vars, new_dist_info_vars):
old_means = old_dist_info_vars["mean"]
old_log_stds = old_dist_info_vars["log_std"]
new_means = new_dist_info_vars["mean"]
new_log_stds = new_dist_info_vars["log_std"]
"""
Compute the KL divergence of two multivariate Gaussian distribution with
diagonal covariance matrices
"""
old_std = TT.exp(old_log_stds)
new_std = TT.exp(new_log_stds)
# means: (N*A)
# std: (N*A)
# formula:
# { (\mu_1 - \mu_2)^2 + \sigma_1^2 - \sigma_2^2 } / (2\sigma_2^2) +
# ln(\sigma_2/\sigma_1)
numerator = TT.square(old_means - new_means) + \
TT.square(old_std) - TT.square(new_std)
denominator = 2 * TT.square(new_std) + 1e-8
return TT.sum(
numerator / denominator + new_log_stds - old_log_stds, axis=-1)
def kl(self, old_dist_info, new_dist_info):
old_means = old_dist_info["mean"]
old_log_stds = old_dist_info["log_std"]
new_means = new_dist_info["mean"]
new_log_stds = new_dist_info["log_std"]
"""
Compute the KL divergence of two multivariate Gaussian distribution with
diagonal covariance matrices
"""
old_std = np.exp(old_log_stds)
new_std = np.exp(new_log_stds)
# means: (N*A)
# std: (N*A)
# formula:
# { (\mu_1 - \mu_2)^2 + \sigma_1^2 - \sigma_2^2 } / (2\sigma_2^2) +
# ln(\sigma_2/\sigma_1)
numerator = np.square(old_means - new_means) + \
np.square(old_std) - np.square(new_std)
denominator = 2 * np.square(new_std) + 1e-8
return np.sum(
numerator / denominator + new_log_stds - old_log_stds, axis=-1)
def kl_sym(self, old_dist_info_vars, new_dist_info_vars):
old_means = old_dist_info_vars["mean"]
old_log_stds = old_dist_info_vars["log_std"]
new_means = new_dist_info_vars["mean"]
new_log_stds = new_dist_info_vars["log_std"]
"""
Compute the KL divergence of two multivariate Gaussian distribution with
diagonal covariance matrices
"""
old_std = TT.exp(old_log_stds)
new_std = TT.exp(new_log_stds)
# means: (N*A)
# std: (N*A)
# formula:
# { (\mu_1 - \mu_2)^2 + \sigma_1^2 - \sigma_2^2 } / (2\sigma_2^2) +
# ln(\sigma_2/\sigma_1)
numerator = TT.square(old_means - new_means) + \
TT.square(old_std) - TT.square(new_std)
denominator = 2 * TT.square(new_std) + 1e-8
return TT.sum(
numerator / denominator + new_log_stds - old_log_stds, axis=-1)
def kl(self, old_dist_info, new_dist_info):
old_means = old_dist_info["mean"]
old_log_stds = old_dist_info["log_std"]
new_means = new_dist_info["mean"]
new_log_stds = new_dist_info["log_std"]
"""
Compute the KL divergence of two multivariate Gaussian distribution with
diagonal covariance matrices
"""
old_std = np.exp(old_log_stds)
new_std = np.exp(new_log_stds)
# means: (N*A)
# std: (N*A)
# formula:
# { (\mu_1 - \mu_2)^2 + \sigma_1^2 - \sigma_2^2 } / (2\sigma_2^2) +
# ln(\sigma_2/\sigma_1)
numerator = np.square(old_means - new_means) + \
np.square(old_std) - np.square(new_std)
denominator = 2 * np.square(new_std) + 1e-8
return np.sum(
numerator / denominator + new_log_stds - old_log_stds, axis=-1)
def __init__(self, incoming, b=lasagne.init.Constant(0.), g=lasagne.init.Constant(1.),
W=lasagne.init.Normal(0.05), train_g=False, init_stdv=1., nonlinearity=relu, **kwargs):
super(WeightNormLayer, self).__init__(incoming, **kwargs)
self.nonlinearity = nonlinearity
self.init_stdv = init_stdv
k = self.input_shape[1]
if b is not None:
self.b = self.add_param(b, (k,), name="b", regularizable=False)
if g is not None:
self.g = self.add_param(g, (k,), name="g", regularizable=False, trainable=train_g)
if len(self.input_shape)==4:
self.axes_to_sum = (0,2,3)
self.dimshuffle_args = ['x',0,'x','x']
else:
self.axes_to_sum = 0
self.dimshuffle_args = ['x',0]
# scale weights in layer below
incoming.W_param = incoming.W
#incoming.W_param.set_value(W.sample(incoming.W_param.get_value().shape))
if incoming.W_param.ndim==4:
if isinstance(incoming, Deconv2DLayer):
W_axes_to_sum = (0,2,3)
W_dimshuffle_args = ['x',0,'x','x']
else:
W_axes_to_sum = (1,2,3)
W_dimshuffle_args = [0,'x','x','x']
else:
W_axes_to_sum = 0
W_dimshuffle_args = ['x',0]
if g is not None:
incoming.W = incoming.W_param * (self.g/T.sqrt(1e-6 + T.sum(T.square(incoming.W_param),axis=W_axes_to_sum))).dimshuffle(*W_dimshuffle_args)
else:
incoming.W = incoming.W_param / T.sqrt(1e-6 + T.sum(T.square(incoming.W_param),axis=W_axes_to_sum,keepdims=True))
def get_output_for(self, input, init=False, **kwargs):
if init:
m = T.mean(input, self.axes_to_sum)
input -= m.dimshuffle(*self.dimshuffle_args)
inv_stdv = self.init_stdv/T.sqrt(T.mean(T.square(input), self.axes_to_sum))
input *= inv_stdv.dimshuffle(*self.dimshuffle_args)
self.init_updates = [(self.b, -m*inv_stdv), (self.g, self.g*inv_stdv)]
elif hasattr(self,'b'):
input += self.b.dimshuffle(*self.dimshuffle_args)
return self.nonlinearity(input)
def __init__(self, incoming, num_kernels, dim_per_kernel=5, theta=lasagne.init.Normal(0.05),
log_weight_scale=lasagne.init.Constant(0.), b=lasagne.init.Constant(-1.), **kwargs):
super(MinibatchLayer, self).__init__(incoming, **kwargs)
self.num_kernels = num_kernels
num_inputs = int(np.prod(self.input_shape[1:]))
self.theta = self.add_param(theta, (num_inputs, num_kernels, dim_per_kernel), name="theta")
self.log_weight_scale = self.add_param(log_weight_scale, (num_kernels, dim_per_kernel), name="log_weight_scale")
self.W = self.theta * (T.exp(self.log_weight_scale)/T.sqrt(T.sum(T.square(self.theta),axis=0))).dimshuffle('x',0,1)
self.b = self.add_param(b, (num_kernels,), name="b")
def __init__(self, incoming, num_units, theta=lasagne.init.Normal(0.1), b=lasagne.init.Constant(0.),
weight_scale=lasagne.init.Constant(1.), train_scale=False, nonlinearity=relu, **kwargs):
super(DenseLayer, self).__init__(incoming, **kwargs)
self.nonlinearity = (lasagne.nonlinearities.identity if nonlinearity is None else nonlinearity)
self.num_units = num_units
num_inputs = int(np.prod(self.input_shape[1:]))
self.theta = self.add_param(theta, (num_inputs, num_units), name="theta")
self.weight_scale = self.add_param(weight_scale, (num_units,), name="weight_scale", trainable=train_scale)
self.W = self.theta * (self.weight_scale/T.sqrt(T.sum(T.square(self.theta),axis=0))).dimshuffle('x',0)
self.b = self.add_param(b, (num_units,), name="b")