The following 50 code examples, extracted from open source Python projects, illustrate how to use theano.tensor.sqr().
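Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below; the variable names are arbitrary) showing that T.sqr() simply squares a tensor elementwise:

import numpy as np
import theano
import theano.tensor as T

# Minimal sketch: T.sqr squares its input elementwise.
x = T.vector('x')
f = theano.function([x], T.sqr(x))
print(f(np.asarray([1., 2., 3.], dtype=theano.config.floatX)))  # -> [1. 4. 9.]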
def dot_2d(k, M, b=None, g=None):
    # k: (nb_samples, memory_width)
    # M: (nb_samples, memory_dim, memory_width)

    # norms of keys and memories
    # k_norm = T.sqrt(T.sum(T.sqr(k), 1)) + 1e-5   # (nb_samples,)
    # M_norm = T.sqrt(T.sum(T.sqr(M), 2)) + 1e-5   # (nb_samples, memory_dim,)

    k = k[:, None, :]  # (nb_samples, 1, memory_width)
    value = k * M
    if b is not None:
        b = b[:, None, :]
        value *= b  # (nb_samples, memory_dim,)
    if g is not None:
        g = g[None, None, :]
        value *= g
    sim = T.sum(value, axis=2)
    return sim

def op_cosine_c(
        s_xr_, s_xi_,
        s_yr_, s_yi_,
        axis_=-1,
        keepdims_=True,
        eps_=1e-7):
    '''
    cosine between two complex vectors, uses standard complex inner product

    Args:
        s_xr_: real part of x
        s_xi_: imag part of x
        s_yr_: real part of y
        s_yi_: imag part of y
        eps_: small number to prevent divide by zero
    '''
    s_nrm = s_xr_*s_yr_ + s_xi_*s_yi_
    s_nx = T.sum(T.sqr(s_xr_) + T.sqr(s_xi_), axis=axis_, keepdims=keepdims_)
    s_ny = T.sum(T.sqr(s_yr_) + T.sqr(s_yi_), axis=axis_, keepdims=keepdims_)
    return T.sum(s_nrm, axis=axis_, keepdims=keepdims_) / T.sqrt(s_nx * s_ny + eps_)

def op_ortho_loss(s_x_, axes_=(-2, -1), ndim_=None):
    '''
    orthogonal matrix loss
    used to regularize parameter to unitary

    Args:
        s_x_: (batch of) matrices
        axes_: tuple of two integers, specify which axes to be for matrix,
            defaults to last two axes
        ndim_: specify args to be (ndim_ x ndim_) matrices
    '''
    if ndim_ is None:
        ax = axes_[0]
        ndim = T.shape(s_x_)[ax]
    else:
        ndim = ndim_

    # transpose/broadcast patterns range over the tensor axes (cf. op_unitary_loss below)
    tpat = list(range(s_x_.ndim))
    bpat = ['x'] * s_x_.ndim
    tpat[axes_[0]], tpat[axes_[1]] = tpat[axes_[1]], tpat[axes_[0]]
    bpat[axes_[0]] = 0
    bpat[axes_[1]] = 1
    s_y = T.dot(s_x_.transpose(*tpat), s_x_)
    return T.sqr(s_y - T.eye(ndim).dimshuffle(*bpat))

def create_esgd_updates(updates, params, gparams, gsums, xsums, lr, eps, gamma, momentum):
    has_momentum = momentum.get_value() > 0.0
    samples = [default_mrng.normal(size=p.shape, avg=0, std=1,
                                   dtype=theano.config.floatX) for p in params]
    HVs = T.Lop(gparams, params, samples)

    i = theano.shared(np.float64(0.0).astype(theano.config.floatX))
    i_t = i + 1.0
    omg_t = 1.0 - gamma**i_t
    for p, g, m, D, Hv in zip(params, gparams, gsums, xsums, HVs):
        if is_subtensor_op(p):
            raise Exception("ESGD subtensor update not implemented!")
        else:
            D_t = D * gamma + T.sqr(Hv) * (1.0-gamma)
            if has_momentum:
                m_t = m*momentum + g
                updates[m] = m_t
            else:
                m_t = g
            g_t = m_t / ( T.sqrt(D_t/omg_t + eps) )
            #g_t = m_t / ( T.sqrt(D_t + eps) )
            updates[D] = D_t
            updates[p] = p - lr*g_t
    updates[i] = i_t

def Adam(cost, params, learning_rate=0.0002, b1=0.1, b2=0.001, e=1e-8):
    updates = OrderedDict()
    grads = T.grad(cost, params)
    i = theano.shared(np.asarray(0., dtype=theano.config.floatX))
    i_t = i + 1.
    fix1 = 1. - (1. - b1)**i_t
    fix2 = 1. - (1. - b2)**i_t
    lr_t = learning_rate * (T.sqrt(fix2) / fix1)
    for p, g in zip(params, grads):
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates[m] = m_t
        updates[v] = v_t
        updates[p] = p_t
    updates[i] = i_t
    return updates

def adam(cost, params, lr=0.001, b1=0.9, b2=0.999, e=1e-8):
    updates = []
    grads = T.grad(cost, params)
    i = theano.shared(np.dtype(theano.config.floatX).type(1))
    i_t = i + 1.
    fix1 = 1. - (1. - b1)**i_t
    fix2 = 1. - (1. - b2)**i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)
    for p, g in zip(params, grads):
        # grad_clip is a module-level constant in the source project
        g = T.clip(g, -grad_clip, grad_clip)
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))
    return updates

def adam(cost, params, learning_rate=0.001, beta1=0.9, beta2=0.999,
         epsilon=1e-6, **kwargs):
    """Adam Gradient Descent

    Scale learning rates by Adaptive moment estimation

    References
    ----------
    .. [1] https://arxiv.org/pdf/1412.6980v8.pdf
    """
    gparams = T.grad(cost, params)
    updates = OrderedDict()

    t = shared_variable(to_float_X(0.))
    t_t = 1. + t
    l_r_t = learning_rate * T.sqrt(1. - beta2 ** t_t) / (1. - beta1 ** t_t)

    for param, gparam in zip(params, gparams):
        m = shared_variable(np.zeros(param.get_value(borrow=True).shape),
                            broadcastable=param.broadcastable)
        v = shared_variable(np.zeros(param.get_value(borrow=True).shape),
                            broadcastable=param.broadcastable)
        m_t = beta1 * m + (1. - beta1) * gparam
        v_t = beta2 * v + (1. - beta2) * T.sqr(gparam)
        updates[m] = m_t
        updates[v] = v_t
        updates[param] = param - l_r_t * m_t / (T.sqrt(v_t) + epsilon)
    updates[t] = t_t
    return updates

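Each of the Adam variants above returns an update mapping (or list of pairs) meant to be passed to theano.function. A minimal, hypothetical sketch of wiring the first Adam above into a training step, assuming the imports from the opening sketch; x, y, W and the cost are placeholders, not from any of the projects:

# Hypothetical training step using the first Adam builder above.
x = T.matrix('x')
y = T.vector('y')
W = theano.shared(np.zeros((5, 1), dtype=theano.config.floatX), name='W')
cost = T.mean(T.sqr(T.dot(x, W).flatten() - y))   # simple squared-error cost
updates = Adam(cost, [W])                         # dict of shared-variable updates
train_step = theano.function([x, y], cost, updates=updates)
# train_step(batch_x, batch_y) then performs one Adam update of W.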
def __call__(self, c01b):
    """
    .. todo::

        WRITEME
    """
    half = self.n // 2
    sq = T.sqr(c01b)
    ch, r, c, b = c01b.shape
    extra_channels = T.alloc(0., ch + 2*half, r, c, b)
    sq = T.set_subtensor(extra_channels[half:half+ch, :, :, :], sq)
    scale = self.k
    for i in xrange(self.n):
        scale += self.alpha * sq[i:i+ch, :, :, :]
    scale = scale ** self.beta
    return c01b / scale

def get_cost(self, X, Y, X_sizes):
    """
    Calculates the cost for each value in the mini batch, regularizes
    all the input parameters, and returns the final cost as a Theano
    variable.
    """
    cost_fn, _ = theano.scan(
        fn=self.get_likelihood,
        sequences=[X, Y, X_sizes]
    )
    cost_fn = cost_fn.mean()

    cost_fn += self.reg_lambda * T.sqr(self.W_c_r).sum() / 2.
    cost_fn += self.reg_lambda * T.sqr(self.W_c_l).sum() / 2.
    cost_fn += self.reg_lambda * T.sqr(self.W_conv).sum() / 2.
    cost_fn += self.reg_lambda * T.sqr(self.W_output).sum() / 2.
    cost_fn += self.reg_lambda * T.sqr(self.b_output).sum() / 2.

    # Regularizing word embedding
    cost_fn += self.reg_lambda * T.sqr(self.vector_dict).sum() / 2
    return cost_fn

def define_loss(self):
    # Inverse since those that have a smaller distance are the most probable.
    self.pred_func = TT.nnet.sigmoid(
        TT.sum(self.e1[self.rows, :] * self.r1[self.cols, :] * self.e1[self.tubes, :], 1)
        + TT.sum(self.e2[self.rows, :] * self.r1[self.cols, :] * self.e2[self.tubes, :], 1)
        + TT.sum(self.e1[self.rows, :] * self.r2[self.cols, :] * self.e2[self.tubes, :], 1)
        - TT.sum(self.e2[self.rows, :] * self.r2[self.cols, :] * self.e1[self.tubes, :], 1))

    self.loss = TT.nnet.softplus(
        - self.ys * (
            TT.sum(self.e1[self.rows, :] * self.r1[self.cols, :] * self.e1[self.tubes, :], 1)
            + TT.sum(self.e2[self.rows, :] * self.r1[self.cols, :] * self.e2[self.tubes, :], 1)
            + TT.sum(self.e1[self.rows, :] * self.r2[self.cols, :] * self.e2[self.tubes, :], 1)
            - TT.sum(self.e2[self.rows, :] * self.r2[self.cols, :] * self.e1[self.tubes, :], 1))).mean()

    self.regul_func = TT.sqr(self.e1[self.rows, :]).mean() \
        + TT.sqr(self.e2[self.rows, :]).mean() \
        + TT.sqr(self.e1[self.tubes, :]).mean() \
        + TT.sqr(self.e2[self.tubes, :]).mean() \
        + TT.sqr(self.r1[self.cols, :]).mean() \
        + TT.sqr(self.r2[self.cols, :]).mean()

def fit(self, weights, o_error, tpo):
    gradients = T.grad(o_error, weights)
    updates = []
    for c, v, w, g in zip(self.t_cache, self.t_velocity, weights, gradients):
        new_velocity = T.sub(T.mul(tpo["momentum_rate"], v),
                             T.mul(tpo["learn_rate"], g))
        new_cache = T.add(T.mul(tpo["decay_rate"], c),
                          T.mul(T.sub(1, tpo["decay_rate"]), T.sqr(g)))
        new_weights = T.sub(T.add(w, new_velocity),
                            T.true_div(T.mul(g, tpo["learn_rate"]),
                                       T.sqrt(T.add(new_cache, 0.1**8))))
        updates.append((w, new_weights))
        updates.append((v, new_velocity))
        updates.append((c, new_cache))
    return updates

###### Nesterov momentum ########################################

def fit(self, weights, o_error, tpo):
    updates = []
    gradients = theano.grad(o_error, weights)
    for c, w, g in zip(self.t_cache, weights, gradients):
        new_cache = tpo["decay_rate"] * c + (1 - tpo["decay_rate"]) * T.sqr(g)
        new_weights = w - (g * tpo["learn_rate"]) / T.sqrt(new_cache + 0.1**8)
        updates.append((w, new_weights))
        updates.append((c, new_cache))
    return updates

###### ADADELTA ########################################

def build_vae_loss(input_var, l_z_mu, l_z_ls, l_x_mu_list, l_x_ls_list,
                   l_x_list, l_x, deterministic, binary, L):
    layer_outputs = nn.layers.get_output(
        [l_z_mu, l_z_ls] + l_x_mu_list + l_x_ls_list + l_x_list + [l_x],
        deterministic=deterministic)
    z_mu = layer_outputs[0]
    z_ls = layer_outputs[1]
    x_mu = [] if binary else layer_outputs[2:2+L]
    x_ls = [] if binary else layer_outputs[2+L:2+2*L]
    x_list = layer_outputs[2:2+L] if binary else layer_outputs[2+2*L:2+3*L]
    x = layer_outputs[-1]

    kl_div = 0.5 * T.sum(1 + 2*z_ls - T.sqr(z_mu) - T.exp(2 * z_ls))
    if binary:
        logpxz = sum(nn.objectives.binary_crossentropy(x, input_var).sum()
                     for x in x_list) * (-1. / L)
        prediction = x_list[0] if deterministic else x
    else:
        logpxz = sum(log_likelihood(input_var.flatten(2), mu, ls)
                     for mu, ls in zip(x_mu, x_ls)) / L
        prediction = x_mu[0] if deterministic else T.sum(x_mu, axis=0) / L
    loss = -1 * (logpxz + kl_div)
    return loss, prediction

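The kl_div term in the example above is the closed-form negative KL divergence between the diagonal Gaussian posterior q(z|x) = N(mu, diag(sigma^2)) and the standard normal prior, where z_ls holds log sigma (so 2*z_ls = log sigma^2 and exp(2*z_ls) = sigma^2):

\[ -D_{\mathrm{KL}}\big(q(z \mid x)\,\|\,\mathcal{N}(0, I)\big) = \frac{1}{2}\sum_{j}\left(1 + \log\sigma_j^{2} - \mu_j^{2} - \sigma_j^{2}\right) \]

which matches 0.5 * T.sum(1 + 2*z_ls - T.sqr(z_mu) - T.exp(2*z_ls)) term for term.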
def sym_logdensity(self, x):
    """ x is a matrix of column datapoints (VxB) V = n_visible, B = batch size """
    def density_given_previous_a_and_x(x, w, V_alpha, b_alpha, V_mu, b_mu,
                                       V_sigma, b_sigma, activations_factor,
                                       p_prev, a_prev, x_prev):
        a = a_prev + T.dot(T.shape_padright(x_prev, 1), T.shape_padleft(w, 1))
        h = self.nonlinearity(a * activations_factor)  # BxH

        Alpha = T.nnet.softmax(T.dot(h, V_alpha) + T.shape_padleft(b_alpha))  # BxC
        Mu = T.dot(h, V_mu) + T.shape_padleft(b_mu)  # BxC
        Sigma = T.exp((T.dot(h, V_sigma) + T.shape_padleft(b_sigma)))  # BxC
        p = p_prev + log_sum_exp(
            -constantX(0.5) * T.sqr((Mu - T.shape_padright(x, 1)) / Sigma)
            - T.log(Sigma) - constantX(0.5 * np.log(2 * np.pi))
            + T.log(Alpha))
        return (p, a, x)

    # First element is different (it is predicted from the bias only)
    a0 = T.zeros_like(T.dot(x.T, self.W))  # BxH
    p0 = T.zeros_like(x[0])
    x0 = T.ones_like(x[0])
    ([ps, _as, _xs], updates) = theano.scan(
        density_given_previous_a_and_x,
        sequences=[x, self.W, self.V_alpha, self.b_alpha, self.V_mu, self.b_mu,
                   self.V_sigma, self.b_sigma, self.activation_rescaling],
        outputs_info=[p0, a0, x0])
    return (ps[-1], updates)

def adam(params, grads, lr=0.001, b1=0.9, b2=0.999, e=1e-8):
    updates = OrderedDict()
    i = theano.shared(np.float32(0))
    i_t = i + 1.
    for p, g in zip(params, grads):
        v = build_shared_zeros(p.get_value(True).shape)
        r = build_shared_zeros(p.get_value(True).shape)
        v_t = (b1 * v) + (1. - b1) * g
        r_t = (b2 * r) + (1. - b2) * T.sqr(g)
        r_hat = lr / (T.sqrt(r_t / (1 - b2 ** i_t)) + e)
        v_hat = v / (1 - b1 ** i_t)
        p_t = p - r_hat * v_hat
        updates[v] = v_t
        updates[r] = r_t
        updates[p] = p_t
    updates[i] = i_t
    return updates

def gradients_to_updates(self, params, grads):
    updates = OrderedDict()
    for pp, gg in zip(params, grads):
        value = pp.get_value(borrow=True)
        self.accu = theano.shared(
            np.zeros(value.shape, dtype=theano.config.floatX),
            'adadelta_accu_' + pp.name)
        self.delta_accu = theano.shared(
            np.zeros(value.shape, dtype=theano.config.floatX),
            'adadelta_delta_accu_' + pp.name)
        self.params.append(self.accu)
        self.params.append(self.delta_accu)
        self.accu.tags = ['optimizer_param']
        self.delta_accu.tags = ['optimizer_param']

        accu_new = self.rho * self.accu + (1 - self.rho) * T.sqr(gg)
        updates[self.accu] = accu_new
        ud = gg * (T.sqrt(self.delta_accu) + 1e-7) / (T.sqrt(accu_new) + 1e-7)
        updates[pp] = pp - self.lr * ud
        delta_accu_new = self.rho * self.delta_accu + (1 - self.rho) * T.sqr(ud)
        updates[self.delta_accu] = delta_accu_new
    return updates

def RMSProp(self, learning_rate=0.01, decay=0.9, epsilon=1.0 / 100.):
    """
    RMSProp of Tieleman et al.
    :param learning_rate: learning rate
    :param decay: decay rate of gradient history
    :param epsilon: lower bound on the RMS denominator
    :return: update
    """
    for param_i, grad_i in zip(self.params, self.grads):
        # Accumulate gradient
        msg = theano.shared(numpy.zeros(param_i.get_value().shape,
                                        dtype=theano.config.floatX))
        self.shared.append(msg)
        new_mean_squared_grad = (decay * msg + (1 - decay) * T.sqr(grad_i))

        # Compute update
        rms_grad_t = T.sqrt(new_mean_squared_grad)
        rms_grad_t = T.maximum(rms_grad_t, epsilon)
        delta_x_t = -learning_rate * grad_i / rms_grad_t

        # Apply update
        self.updates.append((param_i, param_i + delta_x_t))
        self.updates.append((msg, new_mean_squared_grad))

    return self.updates

def gradient_descent(self, loss):
    """Momentum GD with gradient clipping."""
    grad = T.grad(loss, self.params)
    self.momentum_velocity_ = [0.] * len(grad)
    grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), grad)))
    updates = OrderedDict()
    not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
    scaling_den = T.maximum(5.0, grad_norm)
    for n, (param, grad) in enumerate(zip(self.params, grad)):
        grad = T.switch(not_finite, 0.1 * param,
                        grad * (5.0 / scaling_den))
        velocity = self.momentum_velocity_[n]
        update_step = self.momentum * velocity - self.learning_rate * grad
        self.momentum_velocity_[n] = update_step
        updates[param] = param + update_step
    return updates

def Adam(self, params, cost, lr=0.0002, b1=0.1, b2=0.001, e=1e-8):
    updates = []
    grads = T.grad(cost, params)
    i = theano.shared(as_floatX(0.))
    i_t = i + 1.
    fix1 = 1. - (1. - b1)**i_t
    fix2 = 1. - (1. - b2)**i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)
    for p, g in zip(params, grads):
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))
    return updates

def Adam(grads, lr=0.0002, b1=0.1, b2=0.001, e=1e-8):
    updates = []
    varlist = []
    i = sharedX(0.)
    i_t = i + 1.
    fix1 = 1. - (1. - b1)**i_t
    fix2 = 1. - (1. - b2)**i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)
    for p, g in grads.items():
        m = sharedX(p.get_value() * 0., name=p.name + '_adam_optimizer_m')
        v = sharedX(p.get_value() * 0., name=p.name + '_adam_optimizer_v')
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
        varlist.append(m)
        varlist.append(v)
    updates.append((i, i_t))
    return updates, varlist

def Adagrad(grads, lr):
    updates = OrderedDict()
    for param in grads.keys():
        # sum_square_grad := \sum g^2
        sum_square_grad = sharedX(param.get_value() * 0.)
        if param.name is not None:
            sum_square_grad.name = 'sum_square_grad_' + param.name

        # Accumulate gradient
        new_sum_squared_grad = sum_square_grad + T.sqr(grads[param])

        # Compute update
        delta_x_t = (- lr / T.sqrt(numpy.float32(1e-5) + new_sum_squared_grad)) * grads[param]

        # Apply update
        updates[sum_square_grad] = new_sum_squared_grad
        updates[param] = param + delta_x_t

    return updates

def get_adam_updates(f, params, lr=10., b1=0.9, b2=0.999, e=1e-8, dec=5e-3,
                     norm_grads=False):
    """Generate updates to optimize using the Adam optimizer with
    linear learning rate decay."""
    t = theano.shared(0)
    ms = [theano.shared(np.zeros(param.shape.eval(), dtype=floatX), borrow=True)
          for param in params]
    vs = [theano.shared(np.zeros(param.shape.eval(), dtype=floatX), borrow=True)
          for param in params]

    gs = T.grad(f, params)
    if norm_grads:
        gs = [g / (T.sum(T.abs_(g)) + 1e-8) for g in gs]

    t_u = (t, t + 1)
    m_us = [(m, b1 * m + (1. - b1) * g) for m, g in zip(ms, gs)]
    v_us = [(v, b2 * v + (1. - b2) * T.sqr(g)) for v, g in zip(vs, gs)]
    t_u_f = T.cast(t_u[1], floatX)
    lr_hat = (lr / (1. + t_u_f * dec)) * T.sqrt(1. - T.pow(b2, t_u_f)) / (1. - T.pow(b1, t_u_f))
    param_us = [(param, param - lr_hat * m_u[1] / (T.sqrt(v_u[1]) + e))
                for m_u, v_u, param in zip(m_us, v_us, params)]
    return m_us + v_us + param_us + [t_u]

def op_l2norm(s_x_, eps_=1e-6):
    return T.sqrt(eps_ + T.sum(T.sqr(s_x_)))

def op_cosine(s_u_, s_v_, flatten_=True, eps_=1e-6):
    if flatten_:
        s_u = s_u_.flatten()
        s_v = s_v_.flatten()
        return T.dot(s_u, s_v) / T.sqrt(eps_ + T.sum(T.sqr(s_u)) * T.sum(T.sqr(s_v)))
    else:
        s_u = s_u_
        s_v = s_v_
        return T.sum(s_u * s_v, axis=-1) / T.sqrt(
            eps_ + T.sum(T.sqr(s_u), axis=-1) * T.sum(T.sqr(s_v), axis=-1))

def gradient_clipping(gradients, max_norm=5.0):
    global_grad_norm = tensor.sqrt(sum(map(lambda x: tensor.sqr(x).sum(), gradients)))
    multiplier = tensor.switch(global_grad_norm < max_norm,
                               1.0, max_norm / global_grad_norm)
    return [g * multiplier for g in gradients]

def RMSProp(self, learning_rate=0.01, decay=0.9, epsilon=1.0 / 100.):
    """
    RMSProp of Tieleman et al.
    :param learning_rate: learning rate
    :param decay: decay rate of gradient history
    :param epsilon: lower bound on the RMS denominator
    :return: update
    """
    updates = []
    for param_i, grad_i in zip(self.params, self.grads):
        # Accumulate gradient
        msg = theano.shared(numpy.zeros(param_i.get_value().shape,
                                        dtype=theano.config.floatX))
        new_mean_squared_grad = (decay * msg + (1 - decay) * T.sqr(grad_i))

        # Compute update
        rms_grad_t = T.sqrt(new_mean_squared_grad)
        rms_grad_t = T.maximum(rms_grad_t, epsilon)
        delta_x_t = -learning_rate * grad_i / rms_grad_t

        # Apply update
        updates.append((param_i, param_i + delta_x_t))
        updates.append((msg, new_mean_squared_grad))

    return updates

def mean_squared_error(y_true, y_pred):
    return T.sqr(y_pred - y_true).mean(axis=-1)

def mean_squared_logarithmic_error(y_true, y_pred):
    return T.sqr(T.log(T.clip(y_pred, epsilon, np.inf) + 1.) -
                 T.log(T.clip(y_true, epsilon, np.inf) + 1.)).mean(axis=-1)

def squared_hinge(y_true, y_pred):
    return T.sqr(T.maximum(1. - y_true * y_pred, 0.)).mean(axis=-1)

def cosine_sim2d(k, M):
    # k: (nb_samples, memory_width)
    # M: (nb_samples, memory_dim, memory_width)

    # norms of keys and memories
    k_norm = T.sqrt(T.sum(T.sqr(k), 1)) + 1e-5   # (nb_samples,)
    M_norm = T.sqrt(T.sum(T.sqr(M), 2)) + 1e-5   # (nb_samples, memory_dim,)

    k = k[:, None, :]                            # (nb_samples, 1, memory_width)
    k_norm = k_norm[:, None]                     # (nb_samples, 1)

    sim = T.sum(k * M, axis=2)                   # (nb_samples, memory_dim,)
    sim /= k_norm * M_norm                       # (nb_samples, memory_dim,)
    return sim

def op_sqr_c(s_xr_, s_xi_):
    '''
    elemwise complex square
    '''
    return T.sqr(s_xr_) - T.sqr(s_xi_), 2*s_xr_*s_xi_

def op_norm2(s_x_, axis_=-1, use_mean_=False, keepdims_=True):
    '''
    Square of L2 norm

    Args:
        s_x_: input (batch of) vector
        axis_: int or tuple of int
        use_mean_: take the mean of the squares instead of the sum
    '''
    op_sum = T.sum if not use_mean_ else T.mean
    return op_sum(T.sqr(s_x_), axis=axis_, keepdims=keepdims_)

def op_norm2_c(s_xr_, s_xi_, axis_=-1, use_mean_=False, keepdims_=True):
    '''
    Complex squared L2 norm
    '''
    op_sum = T.sum if not use_mean_ else T.mean
    return op_sum(T.sqr(s_xr_) + T.sqr(s_xi_), axis=axis_, keepdims=keepdims_)

def op_cosine(s_x_, s_y_, axis_=-1, keepdims_=True, eps_=1e-7):
    '''
    cosine between two vectors
    '''
    s_prod = s_x_ * s_y_
    s_nx = T.sum(T.sqr(s_x_), axis=axis_, keepdims=keepdims_)
    s_ny = T.sum(T.sqr(s_y_), axis=axis_, keepdims=keepdims_)
    return (T.sum(s_prod, axis=axis_, keepdims=keepdims_) / T.sqrt(s_nx * s_ny + eps_))

def op_sqr_cosine(s_x_, s_y_, axis_=-1, keepdims_=True, eps_=1e-7):
    '''
    squared cosine, for occasions where the sqrt is not needed
    '''
    s_prod = s_x_ * s_y_
    s_nx = T.sum(T.sqr(s_x_), axis=axis_, keepdims=keepdims_)
    s_ny = T.sum(T.sqr(s_y_), axis=axis_, keepdims=keepdims_)
    return (T.sqr(T.sum(s_prod, axis=axis_, keepdims=keepdims_)) / (s_nx * s_ny + eps_))

def op_unitary_loss(s_re_, s_im_, axes_=None, size_=None):
    '''
    unitary matrix loss of real/imag part,
    used to regularize parameter to unitary

    Args:
        s_re_: real part, square matrix
        s_im_: imag part, square matrix
        size_: specify args to be (size_ x size_) matrices
        axes_: tuple of two integers, specify which axes to be for matrix,
            defaults to last two axes
    '''
    if axes_ is None:
        axes_ = (-2, -1)

    if size_ is None:
        ax = axes_[0]
        size = T.shape(s_re_)[ax]
    else:
        size = size_

    assert s_re_.ndim == s_im_.ndim
    tpat = list(range(s_re_.ndim))
    bpat = ['x'] * s_re_.ndim
    tpat[axes_[0]], tpat[axes_[1]] = tpat[axes_[1]], tpat[axes_[0]]
    bpat[axes_[0]] = 0
    bpat[axes_[1]] = 1

    s_y_re_ = T.dot(s_re_.transpose(*tpat), s_re_) + T.dot(s_im_.transpose(*tpat), s_im_)
    s_tmp = T.dot(s_re_.transpose(*tpat), s_im_)
    s_y_im_ = s_tmp - s_tmp.transpose(*tpat)
    return T.mean(T.sqr(s_y_re_ - T.eye(size).dimshuffle(*bpat)) + T.sqr(s_y_im_))

def get_updates(self, learning_rate, params, grads, lr_scalers):
    """Compute the AdaDelta updates of the model's parameters.

    param_t := param_(t-1) + AdaDelta_update_t
    """
    if self._first_time:
        self.sum_square_grad = [
            sharedX_mtx(
                param.get_value() * 0.,
                name='sum_square_grad_' + param.name,
                borrow=True) for param in params]
        self._first_time = False

    updates = []
    for (param, grad, sum_square_grad, lr_sc) in zip(
            params, grads, self.sum_square_grad, lr_scalers):
        # Calculate the running average gradient: E[g^2]_t
        new_sum_square_grad = sum_square_grad + T.sqr(grad)

        # The update: delta_x_t
        lr_scaled = learning_rate * lr_sc
        epsilon = lr_scaled
        sqrt_sum_grad_t = T.sqrt(new_sum_square_grad)
        delta_x_t = - (epsilon / sqrt_sum_grad_t) * grad

        # update the params
        new_param = param + delta_x_t
        # Send for the update
        updates.append((sum_square_grad, new_sum_square_grad))
        if self.max_colm_norm and param.name in ["W", "w"]:
            new_param_final = norm_constraint(tensor_var=new_param,
                                              max_norm=self.max_norm)
        else:
            new_param_final = new_param
        updates.append((param, new_param_final))

    return updates

def get_updates(self, learning_rate, params, grads, lr_scalers):
    """Compute the parameters' updates.
    """
    if self._first_time:
        self.mean_square_grads = [
            sharedX_mtx(
                param.get_value() * 0.,
                name='mean_square_grad_' + param.name,
                borrow=True) for param in params]
        self._first_time = False

    updates = []
    for (param, grad, mean_square_grad, lr_sc) in zip(
            params, grads, self.mean_square_grads, lr_scalers):
        new_mean_square_grad = (
            self.decay * mean_square_grad + (1 - self.decay) * T.sqr(grad))

        # the update
        rms_grad_t = T.sqrt(new_mean_square_grad)
        rms_grad_t = T.maximum(rms_grad_t, self.epsilon)
        lr_scaled = learning_rate * lr_sc
        delta_x_t = - lr_scaled * grad / rms_grad_t
        new_param = param + delta_x_t

        # updates
        if self.max_colm_norm and param.name in ["W", "w"]:
            new_param_final = norm_constraint(tensor_var=new_param,
                                              max_norm=self.max_norm)
        else:
            new_param_final = new_param
        updates.append((param, new_param_final))
        updates.append((mean_square_grad, new_mean_square_grad))

    return updates

def localResponseNormalizationCrossChannel(incoming, alpha=1e-4,
                                           k=2, beta=0.75, n=5):
    """
    Implement the local response normalization across the channels described
    in <ImageNet Classification with Deep Convolutional Neural Networks>,
    A. Krizhevsky et al. sec.3.3.

    Reference of the code:
    https://github.com/Lasagne/Lasagne/blob/master/lasagne/layers/normalization.py
    https://github.com/lisa-lab/pylearn2/blob/master/pylearn2/expr/normalize.py

    Parameters:
        incoming: The feature maps (output of the convolution layer).
        alpha: float scalar
        k: float scalar
        beta: float scalar
        n: integer: number of adjacent channels. Must be odd.
    """
    if n % 2 == 0:
        raise NotImplementedError("Works only with odd n")

    input_shape = incoming.shape
    half_n = n // 2
    input_sqr = T.sqr(incoming)
    b, ch, r, c = input_shape
    extra_channels = T.alloc(0., b, ch + 2*half_n, r, c)
    input_sqr = T.set_subtensor(
        extra_channels[:, half_n:half_n+ch, :, :], input_sqr)
    scale = k
    for i in range(n):
        scale += alpha * input_sqr[:, i:i+ch, :, :]
    scale = scale ** beta
    return incoming / scale

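A brief, hypothetical usage sketch for the LRN helper above, assuming the imports from the opening sketch; the tensor shape and values are made up for illustration:

# Apply the LRN helper to a batch of feature maps laid out as
# (batch, channels, rows, cols); values are arbitrary.
fmap = T.tensor4('fmap')
normalized = localResponseNormalizationCrossChannel(fmap, alpha=1e-4, k=2, beta=0.75, n=5)
lrn = theano.function([fmap], normalized)
out = lrn(np.random.rand(2, 8, 4, 4).astype(theano.config.floatX))  # shape (2, 8, 4, 4)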
def contractive_penality(self, h, linear_hid, contraction_level=0.0,
                         batch_size=-1):
    if batch_size == -1 or batch_size == 0:
        raise Exception("invalid batch size.")

    # Contractive penalty: Frobenius norm of the Jacobian of the hidden
    # representation with respect to the linear pre-activations.
    grad = T.grad(h.sum(), linear_hid)
    jacob = T.dot(T.sqr(grad), T.sqr(self.hidden.W.sum(axis=0)))
    frob_norm_jacob = T.sum(jacob) / batch_size
    contract_pen = contraction_level * frob_norm_jacob
    return contract_pen

def get_net_cost(model, cost_type, eye=True):
    """Get the train cost of the network."""
    cost = None
    if eye:
        d_eyes = (
            (model.trg[:, 37] - model.trg[:, 46])**2 +
            (model.trg[:, 37] - model.trg[:, 46])**2).T
        if cost_type == CostType.MeanSquared:
            cost = T.mean(
                T.sqr(model.output_dropout - model.trg), axis=1) / d_eyes
        elif cost_type == CostType.CrossEntropy:
            cost = T.mean(
                T.nnet.binary_crossentropy(
                    model.output_dropout, model.trg), axis=1)
        else:
            raise ValueError("cost type unknown.")
    else:
        if cost_type == CostType.MeanSquared:
            cost = T.mean(
                T.sqr(model.output_dropout - model.trg), axis=1)
        elif cost_type == CostType.CrossEntropy:
            cost = T.mean(
                T.nnet.binary_crossentropy(
                    model.output_dropout, model.trg), axis=1)
        else:
            raise ValueError("cost type unknown.")

    if model.l1 != 0.:
        cost += model.l1
    if model.l2 != 0.:
        cost += model.l2

    return cost