The following are 50 code examples extracted from open-source Python projects, illustrating how to use theano.tensor.sqrt().
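Before the project examples, here is a minimal sketch of the basic pattern they all rely on: build a symbolic expression with T.sqrt and compile it with theano.function. This snippet is illustrative only and is not taken from any of the projects below; the names x, y, f, and data are assumptions.

import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')            # symbolic batch of values
y = T.sqrt(x)                # elementwise square root, built symbolically
f = theano.function([x], y)  # compile the graph into a callable

data = np.asarray([[1., 4.], [9., 16.]], dtype=theano.config.floatX)
print(f(data))               # -> [[1. 2.] [3. 4.]]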
def gelu(x):
    return 0.5 * x * (1 + T.tanh(T.sqrt(2 / np.pi) * (x + 0.044715 * T.pow(x, 3))))
def nll_loss_sharedparams(self, mus, sigmas, corxy, pis, y_true):
    mus_ex = mus[np.newaxis, :, :]
    X = y_true[:, np.newaxis, :]
    diff = X - mus_ex
    diffprod = T.prod(diff, axis=-1)
    corxy2 = corxy ** 2
    diff2 = diff ** 2
    sigmas2 = sigmas ** 2
    sigmainvs = 1.0 / sigmas
    sigmainvprods = sigmainvs[:, 0] * sigmainvs[:, 1]
    diffsigma = diff2 / sigmas2
    diffsigmanorm = T.sum(diffsigma, axis=-1)
    z = diffsigmanorm - 2 * corxy * diffprod * sigmainvprods
    oneminuscorxy2inv = 1.0 / (1.0 - corxy2)
    expterm = -0.5 * z * oneminuscorxy2inv
    new_exponent = T.log(0.5/np.pi) + T.log(sigmainvprods) + \
        T.log(np.sqrt(oneminuscorxy2inv)) + expterm + T.log(pis)
    max_exponent = T.max(new_exponent, axis=1, keepdims=True)
    mod_exponent = new_exponent - max_exponent
    gauss_mix = T.sum(T.exp(mod_exponent), axis=1)
    log_gauss = max_exponent + T.log(gauss_mix)
    loss = -T.mean(log_gauss)
    return loss
def rbf_kernel(X0):
    XY = T.dot(X0, X0.transpose())
    x2 = T.reshape(T.sum(T.square(X0), axis=1), (X0.shape[0], 1))
    X2e = T.repeat(x2, X0.shape[0], axis=1)
    H = T.sub(T.add(X2e, X2e.transpose()), 2 * XY)

    V = H.flatten()
    # median distance
    h = T.switch(T.eq((V.shape[0] % 2), 0),
                 # if even vector
                 T.mean(T.sort(V)[((V.shape[0] // 2) - 1):((V.shape[0] // 2) + 1)]),
                 # if odd vector
                 T.sort(V)[V.shape[0] // 2])
    h = T.sqrt(0.5 * h / T.log(X0.shape[0].astype('float32') + 1.0)) / 2.

    Kxy = T.exp(-H / h ** 2 / 2.0)
    neighbors = T.argsort(H, axis=1)[:, 1]
    return Kxy, neighbors, h
def rbf_kernel(X):
    XY = T.dot(X, X.T)
    x2 = T.sum(X**2, axis=1).dimshuffle(0, 'x')
    X2e = T.repeat(x2, X.shape[0], axis=1)
    H = X2e + X2e.T - 2. * XY

    V = H.flatten()
    # median distance
    h = T.switch(T.eq((V.shape[0] % 2), 0),
                 # if even vector
                 T.mean(T.sort(V)[((V.shape[0] // 2) - 1):((V.shape[0] // 2) + 1)]),
                 # if odd vector
                 T.sort(V)[V.shape[0] // 2])
    h = T.sqrt(.5 * h / T.log(H.shape[0].astype('float32') + 1.))

    # compute the rbf kernel
    kxy = T.exp(-H / (h ** 2) / 2.0)

    dxkxy = -T.dot(kxy, X)
    sumkxy = T.sum(kxy, axis=1).dimshuffle(0, 'x')
    dxkxy = T.add(dxkxy, T.mul(X, sumkxy)) / (h ** 2)
    return kxy, dxkxy
def adam_updates(params, cost, lr=0.001, mom1=0.9, mom2=0.999):
    updates = []
    grads = T.grad(cost, params)
    t = th.shared(np.cast[th.config.floatX](1.))
    for p, g in zip(params, grads):
        v = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
        mg = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
        v_t = mom1*v + (1. - mom1)*g
        mg_t = mom2*mg + (1. - mom2)*T.square(g)
        v_hat = v_t / (1. - mom1 ** t)
        mg_hat = mg_t / (1. - mom2 ** t)
        g_t = v_hat / T.sqrt(mg_hat + 1e-8)
        p_t = p - lr * g_t
        updates.append((v, v_t))
        updates.append((mg, mg_t))
        updates.append((p, p_t))
    updates.append((t, t+1))
    return updates
def lyr_linear(
        self, name_, s_x_,
        idim_, odim_,
        init_=None, bias_=0.,
        params_di_='params'):
    '''
    dense matrix multiplication, optionally adding a bias vector
    '''
    name_W = name_ + '_w'
    name_B = name_ + '_b'
    self.set_vars(params_di_)
    if init_ is None:
        init_ = dict(init_=[1.4/sqrt(idim_+odim_)])
    v_W = self.get_variable(name_W, (idim_, odim_), **init_)
    if bias_ is None:
        s_ret = T.dot(s_x_, v_W)
    else:
        v_B = self.get_variable(name_B, (odim_,), bias_)
        s_ret = T.dot(s_x_, v_W) + v_B
    return s_ret
def get_output_for(self, input, deterministic=False, **kwargs):
    if deterministic:
        norm_features = (input - self.avg_batch_mean.dimshuffle(*self.dimshuffle_args)) \
            / T.sqrt(1e-6 + self.avg_batch_var).dimshuffle(*self.dimshuffle_args)
    else:
        batch_mean = T.mean(input, axis=self.axes_to_sum).flatten()
        centered_input = input - batch_mean.dimshuffle(*self.dimshuffle_args)
        batch_var = T.mean(T.square(centered_input), axis=self.axes_to_sum).flatten()
        batch_stdv = T.sqrt(1e-6 + batch_var)
        norm_features = centered_input / batch_stdv.dimshuffle(*self.dimshuffle_args)

        # BN updates
        new_m = 0.9*self.avg_batch_mean + 0.1*batch_mean
        new_v = 0.9*self.avg_batch_var + \
            T.cast((0.1*input.shape[0])/(input.shape[0]-1), th.config.floatX)*batch_var
        self.bn_updates = [(self.avg_batch_mean, new_m), (self.avg_batch_var, new_v)]

    if hasattr(self, 'g'):
        activation = norm_features * self.g.dimshuffle(*self.dimshuffle_args)
    else:
        activation = norm_features
    if hasattr(self, 'b'):
        activation += self.b.dimshuffle(*self.dimshuffle_args)
    return self.nonlinearity(activation)
def get_output_for(self, input, init=False, deterministic=False, **kwargs):
    if input.ndim > 2:
        # if the input has more than two dimensions, flatten it into a
        # batch of feature vectors.
        input = input.flatten(2)

    activation = T.dot(input, self.W)

    if init:
        ma = T.mean(activation, axis=0)
        activation -= ma.dimshuffle('x', 0)
        stdv = T.sqrt(T.mean(T.square(activation), axis=0))
        activation /= stdv.dimshuffle('x', 0)
        self.init_updates = [(self.weight_scale, self.weight_scale/stdv),
                             (self.b, -ma/stdv)]
    else:
        activation += self.b.dimshuffle('x', 0)

    return self.nonlinearity(activation)
def l2normalize(layer, train_scale=True):
    W_param = layer.W
    s = W_param.get_value().shape
    if len(s) == 4:
        axes_to_sum = (1, 2, 3)
        dimshuffle_args = [0, 'x', 'x', 'x']
        k = s[0]
    else:
        axes_to_sum = 0
        dimshuffle_args = ['x', 0]
        k = s[1]
    layer.W_scale = layer.add_param(lasagne.init.Constant(1.), (k,),
                                    name="W_scale", trainable=train_scale,
                                    regularizable=False)
    layer.W = W_param * (layer.W_scale /
                         T.sqrt(1e-6 + T.sum(T.square(W_param), axis=axes_to_sum))).dimshuffle(*dimshuffle_args)
    return layer

# fully connected layer with weight normalization
def adadelta(parameters, gradients, rho=0.95, eps=1e-6):
    """
    Reference: ADADELTA: An Adaptive Learning Rate Method, Zeiler 2012.
    https://arxiv.org/abs/1212.5701
    Adapted from the Adadelta implementation from Tensorflow.
    """
    accum = [theano.shared(numpy.zeros(p.get_value().shape, floatX)) for p in parameters]
    accum_updates = [theano.shared(numpy.zeros(p.get_value().shape, floatX)) for p in parameters]

    new_accum = [rho * g0 + (1.0 - rho) * (g**2) for g0, g in izip(accum, gradients)]
    updates = [tensor.sqrt(d0 + eps) / tensor.sqrt(g0 + eps) * g
               for d0, g0, g in izip(accum_updates, new_accum, gradients)]
    new_accum_updates = [rho * d0 + (1.0 - rho) * (d**2)
                         for d0, d in izip(accum_updates, updates)]

    accum_ = zip(accum, new_accum)
    accum_updates_ = zip(accum_updates, new_accum_updates)
    parameters_ = [(p, (p - d)) for p, d in izip(parameters, updates)]
    return accum_ + accum_updates_ + parameters_
def get_updates(self, params, loss):
    grads = self.get_gradients(loss, params)
    accumulators = [shared_zeros(p.get_value().shape) for p in params]
    delta_accumulators = [shared_zeros(p.get_value().shape) for p in params]
    # self.updates = []
    self.updates = [(self.iterations, self.iterations + 1.)]

    for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators):
        new_a = self.rho * a + (1 - self.rho) * g ** 2  # update accumulator
        self.updates.append((a, new_a))

        # use the new accumulator and the *old* delta_accumulator
        update = g * T.sqrt(d_a + self.epsilon) / T.sqrt(new_a + self.epsilon)

        new_p = p - self.lr * update
        self.updates.append((p, new_p))

        # update delta_accumulator
        new_d_a = self.rho * d_a + (1 - self.rho) * update ** 2
        self.updates.append((d_a, new_d_a))
    return self.updates
def dot_2d(k, M, b=None, g=None):
    # k: (nb_samples, memory_width)
    # M: (nb_samples, memory_dim, memory_width)

    # norms of keys and memories
    # k_norm = T.sqrt(T.sum(T.sqr(k), 1)) + 1e-5   # (nb_samples,)
    # M_norm = T.sqrt(T.sum(T.sqr(M), 2)) + 1e-5   # (nb_samples, memory_dim,)

    k = k[:, None, :]  # (nb_samples, 1, memory_width)
    value = k * M
    if b is not None:
        b = b[:, None, :]
        value *= b  # (nb_samples, memory_dim,)
    if g is not None:
        g = g[None, None, :]
        value *= g
    sim = T.sum(value, axis=2)
    return sim
def op_cosine_c(
        s_xr_, s_xi_,
        s_yr_, s_yi_,
        axis_=-1,
        keepdims_=True,
        eps_=1e-7):
    '''
    cosine between two complex vectors, uses standard complex inner product

    Args:
        s_xr_: real part of x
        s_xi_: imag part of x
        s_yr_: real part of y
        s_yi_: imag part of y
        eps_: small number to prevent divide by zero
    '''
    s_nrm = s_xr_*s_yr_ + s_xi_*s_yi_
    s_nx = T.sum(T.sqr(s_xr_) + T.sqr(s_xi_), axis=axis_, keepdims=keepdims_)
    s_ny = T.sum(T.sqr(s_yr_) + T.sqr(s_yi_), axis=axis_, keepdims=keepdims_)
    return T.sum(s_nrm, axis=axis_, keepdims=keepdims_) / T.sqrt(s_nx * s_ny + eps_)
def apply_grad(
        self, mdl_, v_params_, s_grads_,
        params_group_='params',
        oparams_group_='oparams',
        lr_=1e-3, beta1_=.9, beta2_=.999, eps_=1e-9):
    def get_shared_shape(v_):
        return v_.get_value(return_internal_type=True).shape
    with mdl_.get_group(oparams_group_):
        v_lr = mdl_.get_variable(name_=self.name + '_lr',
                                 init_=np.asarray(lr_, dtype=th.config.floatX))
        v_beta = mdl_.get_variable(name_=self.name + '_beta',
                                   init_=np.asarray([beta1_, beta2_], dtype=th.config.floatX))
        v_eps = mdl_.get_variable(name_=self.name + '_eps',
                                  init_=np.asarray(eps_, dtype=th.config.floatX))
        v_timestep = mdl_.get_variable(name_=self.name + '_timestep',
                                       init_=np.asarray(eps_, dtype=th.config.floatX))
        v_m_li = [mdl_.get_variable(name_='adam_m_'+p.name,
                                    shape_=get_shared_shape(p), init_=0.) for p in v_params_]
        v_v_li = [mdl_.get_variable(name_='adam_v_'+p.name,
                                    shape_=get_shared_shape(p), init_=0.) for p in v_params_]

    s_bs = 1. / (1. - v_beta * v_timestep)
    s_b1, s_b2 = v_beta[0], v_beta[1]
    s_b1s, s_b2s = s_bs[0], s_bs[1]
    r_m = [(m, (m*s_b1 + (1.-s_b1)*g)) for m, g in zip(v_m_li, s_grads_)]
    r_v = [(v, (v*s_b2 + (1.-s_b2)*g*g)) for v, g in zip(v_v_li, s_grads_)]
    r_grad = [(p, p - (s_b1s*m*v_lr)/(T.sqrt(s_b2s*v) + v_eps))
              for p, m, v in zip(v_params_, v_m_li, v_v_li)]
    return r_grad + r_m + r_v + [(v_timestep, v_timestep+1)]
def adadelta(tparams, grads, x, y, mask, lengths, cost):
    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.), name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.), name='%s_rup2' % k)
                   for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.), name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function([x, y, mask, lengths], cost, updates=zgup + rg2up,
                                    name='adadelta_f_grad_shared')

    updir = [-T.sqrt(ru2 + 1e-6) / T.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]

    f_update = theano.function([], [], updates=ru2up + param_up,
                               on_unused_input='ignore', name='adadelta_f_update')

    return f_grad_shared, f_update
def adadelta(tparams, grads, weightVector, iVector, jVector, cost):
    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.), name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.), name='%s_rup2' % k)
                   for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.), name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function([weightVector, iVector, jVector], cost, updates=zgup + rg2up,
                                    name='adadelta_f_grad_shared')

    updir = [-T.sqrt(ru2 + 1e-6) / T.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]

    f_update = theano.function([], [], updates=ru2up + param_up,
                               on_unused_input='ignore', name='adadelta_f_update')

    return f_grad_shared, f_update
def get_output_for(self, inputs, **kwargs):
    vals, ref = inputs

    def filt(V, R):
        if self.norm_type is not None:
            o = tt.ones((1, V.shape[1], V.shape[2]), np.float32)
            norm = gaussian_filter(R, o, self.kern_std, self.ref_dim)
            norm = tt.sqrt(norm) if self.norm_type == "sym" else norm
            norm += 1e-8

        V = V / norm if self.norm_type in ["pre", "sym"] else V
        F = gaussian_filter(R, V, self.kern_std)
        return F / norm if self.norm_type in ["post", "sym"] else F

    filtered = theano.scan(fn=filt, sequences=[vals, ref], outputs_info=None)[0]
    return filtered
def __call__(self, input_):
    m = input_.mean()
    v = input_.std()

    new_m = T.switch(T.eq(self.m, 0.),
                     m,
                     (np.float32(1.) - self.rate) * self.m + self.rate * m)
    new_var = T.switch(T.eq(self.var, 0.),
                       v,
                       (np.float32(1.) - self.rate) * self.var + self.rate * v)

    updates = [(self.m, new_m), (self.var, new_var)]

    input_centered = (
        (input_ - new_m) / T.maximum(1., T.sqrt(new_var)))

    input_ = T.zeros_like(input_) + input_

    outs = OrderedDict(
        x=input_,
        x_centered=input_centered,
        m=new_m,
        var=new_var
    )

    return outs, updates
def __call__(self, params, cost):
    updates = []
    grads = T.grad(cost, params)
    grads = clip_norms(grads, self.clipnorm)
    t = theano.shared(floatX(1.))
    b1_t = self.b1*self.l**(t-1)

    for p, g in zip(params, grads):
        g = self.regularizer.gradient_regularize(p, g)
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)

        m_t = b1_t*m + (1 - b1_t)*g
        v_t = self.b2*v + (1 - self.b2)*g**2
        m_c = m_t / (1 - self.b1**t)
        v_c = v_t / (1 - self.b2**t)
        p_t = p - (self.lr * m_c) / (T.sqrt(v_c) + self.e)
        p_t = self.regularizer.weight_regularize(p_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((t, t + 1.))
    return updates
def __call__(self, params, cost):
    updates = []
    grads = T.grad(cost, params)
    grads = clip_norms(grads, self.clipnorm)
    for p, g in zip(params, grads):
        g = self.regularizer.gradient_regularize(p, g)

        acc = theano.shared(p.get_value() * 0.)
        acc_delta = theano.shared(p.get_value() * 0.)
        acc_new = self.rho * acc + (1 - self.rho) * g ** 2
        updates.append((acc, acc_new))

        update = g * T.sqrt(acc_delta + self.epsilon) / T.sqrt(acc_new + self.epsilon)
        updated_p = p - self.lr * update
        updated_p = self.regularizer.weight_regularize(updated_p)
        updates.append((p, updated_p))

        acc_delta_new = self.rho * acc_delta + (1 - self.rho) * update ** 2
        updates.append((acc_delta, acc_delta_new))
    return updates
def discrim(X):
    current_input = dropout(X, 0.3)
    ### encoder ###
    cv1 = relu(dnn_conv(current_input, aew1, subsample=(1,1), border_mode=(1,1)))
    cv2 = relu(batchnorm(dnn_conv(cv1, aew2, subsample=(4,4), border_mode=(2,2)), g=aeg2, b=aeb2))
    cv3 = relu(batchnorm(dnn_conv(cv2, aew3, subsample=(1,1), border_mode=(1,1)), g=aeg3, b=aeb3))
    cv4 = relu(batchnorm(dnn_conv(cv3, aew4, subsample=(4,4), border_mode=(2,2)), g=aeg4, b=aeb4))
    cv5 = relu(batchnorm(dnn_conv(cv4, aew5, subsample=(1,1), border_mode=(1,1)), g=aeg5, b=aeb5))
    cv6 = relu(batchnorm(dnn_conv(cv5, aew6, subsample=(4,4), border_mode=(0,0)), g=aeg6, b=aeb6))

    ### decoder ###
    dv6 = relu(batchnorm(deconv(cv6, aew6, subsample=(4,4), border_mode=(0,0)), g=aeg6t, b=aeb6t))
    dv5 = relu(batchnorm(deconv(dv6, aew5, subsample=(1,1), border_mode=(1,1)), g=aeg5t, b=aeb5t))
    dv4 = relu(batchnorm(deconv(dv5, aew4, subsample=(4,4), border_mode=(2,2)), g=aeg4t, b=aeb4t))
    dv3 = relu(batchnorm(deconv(dv4, aew3, subsample=(1,1), border_mode=(1,1)), g=aeg3t, b=aeb3t))
    dv2 = relu(batchnorm(deconv(dv3, aew2, subsample=(4,4), border_mode=(2,2)), g=aeg2t, b=aeb2t))
    dv1 = tanh(deconv(dv2, aew1, subsample=(1,1), border_mode=(1,1)))
    rX = dv1

    mse = T.sqrt(T.sum(T.abs_(T.flatten(X-rX, 2)), axis=1)) + \
        T.sqrt(T.sum(T.flatten((X-rX)**2, 2), axis=1))
    return T.flatten(cv6, 2), rX, mse
def discrim(X):
    current_input = dropout(X, 0.3)
    ### encoder ###
    cv1 = relu(dnn_conv(current_input, aew1, subsample=(1,1), border_mode=(1,1)))
    cv2 = relu(batchnorm(dnn_conv(cv1, aew2, subsample=(4,4), border_mode=(2,2)), g=aeg2, b=aeb2))
    cv3 = relu(batchnorm(dnn_conv(cv2, aew3, subsample=(1,1), border_mode=(1,1)), g=aeg3, b=aeb3))
    cv4 = relu(batchnorm(dnn_conv(cv3, aew4, subsample=(4,4), border_mode=(2,2)), g=aeg4, b=aeb4))
    cv5 = relu(batchnorm(dnn_conv(cv4, aew5, subsample=(1,1), border_mode=(1,1)), g=aeg5, b=aeb5))
    cv6 = relu(batchnorm(dnn_conv(cv5, aew6, subsample=(4,4), border_mode=(0,0)), g=aeg6, b=aeb6))

    ### decoder ###
    dv6 = relu(batchnorm(deconv(cv6, aew6, subsample=(4,4), border_mode=(0,0)), g=aeg6t, b=aeb6t))
    dv5 = relu(batchnorm(deconv(dv6, aew5, subsample=(1,1), border_mode=(1,1)), g=aeg5t, b=aeb5t))
    dv4 = relu(batchnorm(deconv(dv5, aew4, subsample=(4,4), border_mode=(2,2)), g=aeg4t, b=aeb4t))
    dv3 = relu(batchnorm(deconv(dv4, aew3, subsample=(1,1), border_mode=(1,1)), g=aeg3t, b=aeb3t))
    dv2 = relu(batchnorm(deconv(dv3, aew2, subsample=(4,4), border_mode=(2,2)), g=aeg2t, b=aeb2t))
    dv1 = tanh(deconv(dv2, aew1, subsample=(1,1), border_mode=(1,1)))
    rX = dv1

    mse = T.sqrt(T.sum(T.abs_(T.flatten(X-rX, 2)), axis=1)) + \
        T.sqrt(T.sum(T.flatten((X-rX)**2, 2), axis=1))  # L1 and L2 loss
    return T.flatten(cv6, 2), rX, mse
def __call__(self, params, cost):
    updates = []
    grads = T.grad(cost, params)
    grads = clip_norms(grads, self.clipnorm)
    t = theano.shared(floatX(1.))
    b1_t = self.b1*self.l**(t-1)

    for p, g in zip(params, grads):
        g = self.regularizer.gradient_regularize(p, g)
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)

        m_t = b1_t*m + (1 - b1_t)*g
        v_t = self.b2*v + (1 - self.b2)*g**2
        m_c = m_t / (1 - self.b1**t)
        v_c = v_t / (1 - self.b2**t)
        p_t = p - (self.lr * m_c) / (T.sqrt(v_c) + self.e)
        p_t = self.regularizer.weight_regularize(p_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((t, t + 1.))
    return updates
def create_adadelta_updates(updates, params, gparams, gsums, xsums,
                            lr, eps, rho):
    for p, g, gacc, xacc in zip(params, gparams, gsums, xsums):
        if is_subtensor_op(p):
            origin, indexes = get_subtensor_op_inputs(p)
            gacc_slices = gacc[indexes]
            xacc_slices = xacc[indexes]
            new_gacc = rho * gacc_slices + (1.0-rho) * g**2
            d = -T.sqrt((xacc_slices + eps)/(new_gacc + eps)) * g
            new_xacc = rho * xacc_slices + (1.0-rho) * d**2
            updates[gacc] = T.set_subtensor(gacc_slices, new_gacc)
            updates[xacc] = T.set_subtensor(xacc_slices, new_xacc)
            updates[origin] = T.inc_subtensor(p, d)
        else:
            new_gacc = rho * gacc + (1.0-rho) * g**2
            d = -T.sqrt((xacc + eps)/(new_gacc + eps)) * g
            new_xacc = rho * xacc + (1.0-rho) * d**2
            updates[gacc] = new_gacc
            updates[xacc] = new_xacc
            updates[p] = p + d
def get_result(self, input, create_updates):
    if create_updates:
        self.create_updates(input)

    # returns BN result for given input.
    epsilon = np.float64(1e-06).astype(theano.config.floatX)

    if self.mode == 0:
        now_mean = T.mean(input, axis=0)
        now_var = T.var(input, axis=0)
    else:
        now_mean = T.mean(input, axis=(0, 2, 3))
        now_var = T.var(input, axis=(0, 2, 3))

    now_mean = self.run_mode * self.mean + (1.0-self.run_mode) * now_mean
    now_var = self.run_mode * self.var + (1.0-self.run_mode) * now_var

    if self.mode == 0:
        output = self.gamma * (input - now_mean) / (T.sqrt(now_var+epsilon)) + self.beta
    else:
        output = self.gamma.dimshuffle(('x', 0, 'x', 'x')) * (input - now_mean.dimshuffle(('x', 0, 'x', 'x'))) \
            / (T.sqrt(now_var+epsilon).dimshuffle(('x', 0, 'x', 'x'))) + self.beta.dimshuffle(('x', 0, 'x', 'x'))

    return output
def create_adagrad_updates(updates, params, gparams, gsums, lr, eps):
    for p, g, acc in zip(params, gparams, gsums):
        if is_subtensor_op(p):
            origin, indexes = get_subtensor_op_inputs(p)
            #acc_slices = acc[indexes]
            acc_slices = get_similar_subtensor(acc, indexes, p)
            new_acc = acc_slices + g**2
            updates[acc] = T.set_subtensor(acc_slices, new_acc)
            updates[origin] = T.inc_subtensor(p,
                    - lr * (g / T.sqrt(new_acc + eps)))
        else:
            new_acc = acc + g**2
            updates[acc] = new_acc
            updates[p] = p - lr * (g / T.sqrt(new_acc + eps))
            #updates[p] = p - lr * (g / (T.sqrt(new_acc) + eps))
            # which one to use?
def create_esgd_updates(updates, params, gparams, gsums, xsums, lr, eps, gamma, momentum):
    has_momentum = momentum.get_value() > 0.0
    samples = [default_mrng.normal(size=p.shape, avg=0, std=1,
                                   dtype=theano.config.floatX) for p in params]
    HVs = T.Lop(gparams, params, samples)

    i = theano.shared(np.float64(0.0).astype(theano.config.floatX))
    i_t = i + 1.0
    omg_t = 1.0 - gamma**i_t
    for p, g, m, D, Hv in zip(params, gparams, gsums, xsums, HVs):
        if is_subtensor_op(p):
            raise Exception("ESGD subtensor update not implemented!")
        else:
            D_t = D * gamma + T.sqr(Hv) * (1.0-gamma)
            if has_momentum:
                m_t = m*momentum + g
                updates[m] = m_t
            else:
                m_t = g
            g_t = m_t / (T.sqrt(D_t/omg_t + eps))
            #g_t = m_t / ( T.sqrt(D_t + eps) )
            updates[D] = D_t
            updates[p] = p - lr*g_t
    updates[i] = i_t
def adadelta(tparams, grads, x, y, mask, lengths, cost, options, t=None, t_label=None):
    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.), name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.), name='%s_rup2' % k)
                   for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.), name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) for rg2, g in zip(running_grads2, grads)]

    if options['predictTime']:
        f_grad_shared = theano.function([x, y, t, t_label, mask, lengths], cost,
                                        updates=zgup + rg2up, name='adadelta_f_grad_shared')
    elif len(options['timeFile']) > 0:
        f_grad_shared = theano.function([x, y, t, mask, lengths], cost,
                                        updates=zgup + rg2up, name='adadelta_f_grad_shared')
    else:
        f_grad_shared = theano.function([x, y, mask, lengths], cost,
                                        updates=zgup + rg2up, name='adadelta_f_grad_shared')

    updir = [-T.sqrt(ru2 + 1e-6) / T.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]

    f_update = theano.function([], [], updates=ru2up + param_up,
                               on_unused_input='ignore', name='adadelta_f_update')

    return f_grad_shared, f_update
def __call__(self, cost, params):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    exp = theano.shared(np.float32(1.0), name='exp', borrow=True)
    updates.append((exp, exp+1))
    for p, g in zip(params, grads):
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)

        m_new = self.beta1 * m + (1 - self.beta1) * g
        v_new = self.beta2 * v + (1 - self.beta2) * g**2

        mt = m_new / (1 - self.beta1**exp)
        vt = v_new / (1 - self.beta2**exp)

        updates.append((m, m_new))
        updates.append((v, v_new))
        updates.append((p, p - self.lr * mt / (T.sqrt(vt) + self.epsilon)))
    return updates
def get_gradients(self, loss, params):
    """
    Consider the situation that gradient is weighted.
    """
    if isinstance(loss, list):
        grads = T.grad(loss[0], params, consider_constant=loss[1:])  # gradient of loss
    else:
        grads = T.grad(loss, params)

    if hasattr(self, 'clipnorm') and self.clipnorm > 0:
        print('use gradient clipping!!')
        print('clipnorm = %f' % self.clipnorm)
        norm = T.sqrt(sum([T.sum(g ** 2) for g in grads]))
        grads = [clip_norm(g, self.clipnorm, norm) for g in grads]
    else:
        print('not use gradient clipping!!')

    return grads
def Adam(cost, params, learning_rate=0.0002, b1=0.1, b2=0.001, e=1e-8):
    updates = OrderedDict()
    grads = T.grad(cost, params)
    i = theano.shared(np.asarray(0., dtype=theano.config.floatX))
    i_t = i + 1.
    fix1 = 1. - (1. - b1)**i_t
    fix2 = 1. - (1. - b2)**i_t
    lr_t = learning_rate * (T.sqrt(fix2) / fix1)
    for p, g in zip(params, grads):
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates[m] = m_t
        updates[v] = v_t
        updates[p] = p_t
    updates[i] = i_t
    return updates
def RmsProp(cost, params, learning_rate=1.0, rho=0.9, epsilon=1e-6):
    updates = OrderedDict()
    grads = T.grad(cost, params)

    # Using theano constant to prevent upcasting of float32
    one = T.constant(1)

    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)
        accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                             broadcastable=param.broadcastable)
        accu_new = rho * accu + (one - rho) * grad ** 2
        updates[accu] = accu_new
        updates[param] = param - (learning_rate * grad / T.sqrt(accu_new + epsilon))

    return updates
def adam(cost, params, lr=0.001, b1=0.9, b2=0.999, e=1e-8):
    updates = []
    grads = T.grad(cost, params)
    i = theano.shared(np.dtype(theano.config.floatX).type(1))
    i_t = i + 1.
    fix1 = 1. - (1. - b1)**i_t
    fix2 = 1. - (1. - b2)**i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)
    for p, g in zip(params, grads):
        # grad_clip is a clipping threshold defined elsewhere in the source module
        g = T.clip(g, -grad_clip, grad_clip)
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))
    return updates
def adagrad(cost, params, learning_rate=0.1, epsilon=1e-6, **kwargs):
    """Adaptive Gradient Descent
    Scale learning rates by dividing with the square root of accumulated
    squared gradients

    References
    ----------
    .. [1] http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf
    """
    gparams = T.grad(cost, params)
    updates = OrderedDict()

    for param, gparam in zip(params, gparams):
        accu = shared_variable(np.zeros(param.get_value(borrow=True).shape),
                               broadcastable=param.broadcastable)
        accu_new = accu + gparam ** 2
        updates[accu] = accu_new
        updates[param] = param - learning_rate * gparam / T.sqrt(accu_new + epsilon)

    return updates
def adam(cost, params, learning_rate=0.001, beta1=0.9, beta2=0.999,
         epsilon=1e-6, **kwargs):
    """Adam Gradient Descent
    Scale learning rates by Adaptive moment estimation

    References
    ----------
    .. [1] https://arxiv.org/pdf/1412.6980v8.pdf
    """
    gparams = T.grad(cost, params)
    updates = OrderedDict()

    t = shared_variable(to_float_X(0.))
    t_t = 1. + t
    l_r_t = learning_rate * T.sqrt(1. - beta2 ** t_t) / (1. - beta1 ** t_t)

    for param, gparam in zip(params, gparams):
        m = shared_variable(np.zeros(param.get_value(borrow=True).shape),
                            broadcastable=param.broadcastable)
        v = shared_variable(np.zeros(param.get_value(borrow=True).shape),
                            broadcastable=param.broadcastable)
        m_t = beta1 * m + (1. - beta1) * gparam
        v_t = beta2 * v + (1. - beta2) * T.sqr(gparam)
        updates[m] = m_t
        updates[v] = v_t
        updates[param] = param - l_r_t * m_t / (T.sqrt(v_t) + epsilon)

    updates[t] = t_t
    return updates
def conv_pairwise_distance(feature_maps, codebook):
    """
    Calculates the pairwise distances between the feature maps (n_samples, filters, x, y)
    :param feature_maps:
    :param codebook:
    :return:
    """
    x_square = T.sum(feature_maps ** 2, axis=1)  # n_samples, filters, x, y
    x_square = x_square.reshape((x_square.shape[0], 1, x_square.shape[1], x_square.shape[2]))
    x_square = T.addbroadcast(x_square, 1)

    y_square = T.sum(codebook ** 2, axis=1)
    y_square = y_square.reshape((1, y_square.shape[0], y_square.shape[1], y_square.shape[2]))
    y_square = T.addbroadcast(y_square, 0, 2, 3)

    inner_product = T.nnet.conv2d(feature_maps, codebook)
    dist = x_square + y_square - 2 * inner_product
    dist = T.sqrt(T.maximum(dist, 0))
    return dist
def symbolic_distance_matrix(A, B):
    """
    Defines the symbolic matrix that contains the distances between the vectors of A and B
    :param A:
    :param B:
    :return:
    """
    aa = T.sum(A * A, axis=1)
    bb = T.sum(B * B, axis=1)
    AB = T.dot(A, T.transpose(B))

    AA = T.transpose(T.tile(aa, (bb.shape[0], 1)))
    BB = T.tile(bb, (aa.shape[0], 1))

    D = AA + BB - 2 * AB
    D = T.maximum(D, 0)
    D = T.sqrt(D)
    return D
def fit(self, x):
    s = x.shape
    x = x.copy().reshape((s[0], np.prod(s[1:])))
    m = np.mean(x, axis=0)
    x -= m
    sigma = np.dot(x.T, x) / x.shape[0]
    U, S, V = linalg.svd(sigma)
    tmp = np.dot(U, np.diag(1./np.sqrt(S+self.regularization)))
    tmp2 = np.dot(U, np.diag(np.sqrt(S+self.regularization)))
    self.ZCA_mat = th.shared(np.dot(tmp, U.T).astype(th.config.floatX))
    self.inv_ZCA_mat = th.shared(np.dot(tmp2, U.T).astype(th.config.floatX))
    self.mean = th.shared(m.astype(th.config.floatX))
def lyr_conv(
        self, name_, s_x_,
        idim_, odim_,
        fsize_=3,
        init_scale_=None,
        bias_=0.,
        dilation_=1,
        params_di_='params'):
    '''
    Typical convolution layer used in CNN
    '''
    name_conv_W = '%s_w' % name_
    name_conv_B = '%s_b' % name_
    ir = 2.3/sqrt(idim_*fsize_*fsize_+odim_) if init_scale_ is None else init_scale_
    self.set_vars(params_di_)
    v_conv_W = self.get_variable(
        name_conv_W, (odim_, idim_, fsize_, fsize_), init_=[ir])
    ret = T.nnet.conv2d(
        s_x_, v_conv_W,
        filter_shape=(odim_, idim_, fsize_, fsize_),
        border_mode='half',
        filter_dilation=(dilation_, dilation_))
    if bias_ is not None:
        v_conv_B = self.get_variable(name_conv_B, (odim_,), init_=[bias_])
        ret = ret + v_conv_B.dimshuffle('x', 0, 'x', 'x')
    return ret
def op_l2norm(s_x_, eps_=1e-6):
    return T.sqrt(eps_ + T.sum(T.sqr(s_x_)))
def op_cosine(s_u_, s_v_, flatten_=True, eps_=1e-6):
    if flatten_:
        s_u = s_u_.flatten()
        s_v = s_v_.flatten()
        return T.dot(s_u, s_v) / T.sqrt(eps_ + T.sum(T.sqr(s_u)) * T.sum(T.sqr(s_v)))
    else:
        s_u = s_u_
        s_v = s_v_
        return T.sum(s_u*s_v, axis=-1) / T.sqrt(eps_ + T.sum(T.sqr(s_u), axis=-1) * T.sum(T.sqr(s_v), axis=-1))