The following code examples, extracted from open-source Python projects, illustrate how to use theano.tensor.switch().
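Before the project examples, here is a minimal, self-contained sketch of the basic call (the toy vector below is only for illustration): T.switch(cond, a, b) selects element-wise between two expressions and, unlike theano.ifelse.ifelse, evaluates both branches.

import numpy as np
import theano
import theano.tensor as T

# element-wise selection: relu-style gating of a vector
x = T.vector('x')
y = T.switch(T.gt(x, 0), x, 0)
f = theano.function([x], y)

print(f(np.array([-2., -0.5, 0., 3.], dtype=theano.config.floatX)))
# expected output: [0. 0. 0. 3.]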
def rbf_kernel(X0):
    XY = T.dot(X0, X0.transpose())
    x2 = T.reshape(T.sum(T.square(X0), axis=1), (X0.shape[0], 1))
    X2e = T.repeat(x2, X0.shape[0], axis=1)
    H = T.sub(T.add(X2e, X2e.transpose()), 2 * XY)

    V = H.flatten()
    # median distance
    h = T.switch(T.eq((V.shape[0] % 2), 0),
                 # if even vector
                 T.mean(T.sort(V)[((V.shape[0] // 2) - 1):((V.shape[0] // 2) + 1)]),
                 # if odd vector
                 T.sort(V)[V.shape[0] // 2])

    h = T.sqrt(0.5 * h / T.log(X0.shape[0].astype('float32') + 1.0)) / 2.

    Kxy = T.exp(-H / h ** 2 / 2.0)
    neighbors = T.argsort(H, axis=1)[:, 1]

    return Kxy, neighbors, h
def rbf_kernel(X):
    XY = T.dot(X, X.T)
    x2 = T.sum(X**2, axis=1).dimshuffle(0, 'x')
    X2e = T.repeat(x2, X.shape[0], axis=1)
    H = X2e + X2e.T - 2. * XY

    V = H.flatten()
    # median distance
    h = T.switch(T.eq((V.shape[0] % 2), 0),
                 # if even vector
                 T.mean(T.sort(V)[((V.shape[0] // 2) - 1):((V.shape[0] // 2) + 1)]),
                 # if odd vector
                 T.sort(V)[V.shape[0] // 2])

    h = T.sqrt(.5 * h / T.log(H.shape[0].astype('float32') + 1.))

    # compute the rbf kernel
    kxy = T.exp(-H / (h ** 2) / 2.0)

    dxkxy = -T.dot(kxy, X)
    sumkxy = T.sum(kxy, axis=1).dimshuffle(0, 'x')
    dxkxy = T.add(dxkxy, T.mul(X, sumkxy)) / (h ** 2)

    return kxy, dxkxy
def switch(condition, then_tensor, else_tensor):
    """
    Keras' implementation of switch for tensorflow uses tf.switch which accepts only scalar conditions.
    It should use tf.select instead.
    """
    if K.backend() == 'tensorflow':
        import tensorflow as tf
        condition_shape = condition.get_shape()
        input_shape = then_tensor.get_shape()
        if condition_shape[-1] != input_shape[-1] and condition_shape[-1] == 1:
            # This means the last dim is an embedding dim. Keras does not mask this dimension. But tf wants
            # the condition and the then and else tensors to be the same shape.
            condition = K.dot(tf.cast(condition, tf.float32), tf.ones((1, input_shape[-1])))
        return tf.select(tf.cast(condition, dtype=tf.bool), then_tensor, else_tensor)
    else:
        import theano.tensor as T
        return T.switch(condition, then_tensor, else_tensor)
def dropout_layer(state_before, use_noise, trng):
    """
    :todo:
        - Fix according to _param
        - Test!

    From Cho's code here:
        https://github.com/nyu-dl/dl4mt-tutorial/blob/master/session2/nmt.py#L45
    """
    proj = tensor.switch(
        use_noise,
        # for training
        state_before * trng.binomial(state_before.shape, p=0.5, n=1,
                                     dtype=state_before.dtype),
        # for validation/sampling
        state_before * 0.5)
    return proj
def _ternarize(W, H=1):
    '''The weights' ternarization function

    # References:
    - [Recurrent Neural Networks with Limited Numerical Precision](http://arxiv.org/abs/1608.06902)
    - [Ternary Weight Networks](http://arxiv.org/abs/1605.04711)
    '''
    W /= H

    ones = K.ones_like(W)
    zeros = K.zeros_like(W)
    Wt = switch(W > 0.5, ones, switch(W <= -0.5, -ones, zeros))

    Wt *= H

    return Wt
def shared_dropout_layer(shape, use_noise, trng, value, scaled=True):
    # re-scale dropout at training time, so we don't need to at test time
    if scaled:
        proj = tensor.switch(
            use_noise,
            trng.binomial(shape, p=value, n=1, dtype='float32') / value,
            theano.shared(numpy.float32(1.)))
    else:
        proj = tensor.switch(
            use_noise,
            trng.binomial(shape, p=value, n=1, dtype='float32'),
            theano.shared(numpy.float32(value)))
    return proj


# feedforward layer: affine transformation + point-wise nonlinearity
def __call__(self, input_):
    m = input_.mean()
    v = input_.std()

    new_m = T.switch(T.eq(self.m, 0.),
                     m,
                     (np.float32(1.) - self.rate) * self.m + self.rate * m)
    new_var = T.switch(T.eq(self.var, 0.),
                       v,
                       (np.float32(1.) - self.rate) * self.var + self.rate * v)

    updates = [(self.m, new_m), (self.var, new_var)]

    input_centered = (
        (input_ - new_m) / T.maximum(1., T.sqrt(new_var)))

    input_ = T.zeros_like(input_) + input_

    outs = OrderedDict(
        x=input_,
        x_centered=input_centered,
        m=new_m,
        var=new_var
    )

    return outs, updates
def dropout(state_before, is_train, trng):
    """
    dropout with p=0.5

    Parameters
    ----------
    state_before : theano 3d tensor, input data, dimensions: (num of time steps, batch size, dim of vector)
    is_train : theano shared scalar, 0. = test/valid, 1. = train
    trng : random number generator

    Returns
    -------
    proj : theano 3d tensor, output data, dimensions: (num of time steps, batch size, dim of vector)
    """
    proj = tensor.switch(is_train,
                         state_before * trng.binomial(state_before.shape, p=0.5, n=1,
                                                      dtype=state_before.dtype),
                         state_before * 0.5)
    return proj
def dropout_layer(state_before, use_noise, trng):
    """
    tensor.switch is like an if statement that checks the value of the theano shared
    variable (use_noise), before either dropping out the state_before tensor or
    computing the appropriate activation. During training/testing use_noise is
    toggled on and off.
    """
    proj = tensor.switch(
        use_noise,
        state_before * trng.binomial(state_before.shape, p=0.5, n=1,
                                     dtype=state_before.dtype),
        state_before * 0.5)
    return proj


# make prefix-appended name
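As context for the dropout examples above and below, here is a hedged sketch of how use_noise is typically wired up in dl4mt-style code. The MRG_RandomStreams import, the seed, and the set_value toggling are assumptions for illustration, not taken from the snippet above.

import numpy
import theano
import theano.tensor as tensor
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

trng = RandomStreams(1234)                      # assumed seed
use_noise = theano.shared(numpy.float32(0.))    # 1. = training, 0. = validation/sampling

x = tensor.matrix('x')
x_dropped = dropout_layer(x, use_noise, trng)   # dropout_layer as defined above

use_noise.set_value(1.)   # stochastic branch during training
use_noise.set_value(0.)   # deterministic, expectation-scaled branch at test time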
def sequence_iteration(self, in_seq, mask, use_dropout, dropout_value=1):
    in_seq_d = T.switch(use_dropout,
                        (in_seq *
                         self.trng.binomial(in_seq.shape,
                                            p=dropout_value, n=1,
                                            dtype=in_seq.dtype)),
                        in_seq)

    rz_in_seq = T.dot(in_seq_d, self.weights[0])

    out_seq, updates = theano.scan(
        fn=self.t_forward_step,
        sequences=[mask, rz_in_seq],
        outputs_info=[self.t_ol_t00],
        non_sequences=[i for i in self.weights][1:] + [self.t_n_out],
        go_backwards=self.go_backwards,
        truncate_gradient=-1,
        # n_steps=50,
        strict=True,
        allow_gc=False,
    )
    return out_seq
def _generate_conv(self, image_shape=None):
    input = T.tensor4(name='input')
    W = theano.shared(np.asarray(self.weights['input'], dtype=input.dtype), name='W')
    conv_out = T.nnet.conv2d(input, W, border_mode=self.pad, subsample=self.stride,
                             filter_shape=self.filter_shape, input_shape=image_shape)
    if self.bias:
        b = theano.shared(np.asarray(self.weights['bias'], dtype=input.dtype), name='b')
        conv_out = conv_out + b.dimshuffle('x', 0, 'x', 'x')

    if self.activation_fct is None:
        output = conv_out
    elif self.activation_fct == "hardlimit":
        output = conv_out > 0
    elif self.activation_fct == "hardtanh":
        output = T.switch(conv_out > -1, T.switch(conv_out > 1, 1, conv_out), -1)
    else:
        output = self.activation_fct(conv_out)

    self.conv_fct = theano.function([input], output)
def _generate_conv(self, image_shape=None):
    input = T.tensor4(name='input')
    W = theano.shared(np.asarray(self.weights['input'], dtype=input.dtype), name='W')
    conv_out = T.nnet.conv2d(input, W, border_mode=self.pad, subsample=self.stride,
                             filter_shape=self.weights['input'].shape, input_shape=image_shape)

    if self.activation_fct is None:
        output = conv_out
    elif self.activation_fct == "hardlimit":
        output = conv_out > 0
    elif self.activation_fct == "hardtanh":
        output = T.switch(conv_out > -1, T.switch(conv_out > 1, 1, conv_out), -1)
    else:
        output = self.activation_fct(conv_out)

    self.conv_fct = theano.function([input], output)
def get_output(self, input_):
    """
    This function overrides the parent's one.
    Creates a symbolic function to compute output from an input.
    The symbolic function uses the theano switch function conditioned by a flag.

    Math Expression
    ---------------
    y = (x - mean(x)) / std(x)
    mean and std are taken over each data point.

    Parameters
    ----------
    input_: TensorVariable

    Returns
    -------
    TensorVariable
    """
    dim_mean = T.mean(input_, axis=1)
    dim_std = T.std(input_, axis=1)

    return self.gamma * (input_ - dim_mean.dimshuffle(0, 'x')) / (dim_std.dimshuffle(0, 'x') + 1e-7) + self.beta
def get_output(self, input_):
    """
    This function overrides the parent's one.
    Creates a symbolic function to compute output from an input.
    The symbolic function uses the theano switch function conditioned by a flag.

    Math Expression
    ---------------
    y = (x - mean(x)) / std(x)
    mean and std are taken over each data point.

    Parameters
    ----------
    input_: TensorVariable

    Returns
    -------
    TensorVariable
    """
    dim_mean = T.mean(input_, axis=[1, 2, 3])
    dim_std = T.std(input_, axis=[1, 2, 3])

    return self.gamma.dimshuffle('x', 0, 'x', 'x') * (input_ - dim_mean.dimshuffle(0, 'x', 'x', 'x')) \
        / (dim_std.dimshuffle(0, 'x', 'x', 'x') + 1e-7) + self.beta.dimshuffle('x', 0, 'x', 'x')
def get_output(self, input_):
    """
    This function overrides the parent's one.
    ELU is an element-wise operation. If alpha = 0, it is the same as ReLU.

    Math Expression
    ---------------
    y = ifelse(x > 0, x, \alpha * (exp(x) - 1))

    Parameters
    ----------
    input_: TensorVariable

    Returns
    -------
    TensorVariable
    """
    return T.switch(T.gt(input_, 0), input_, self.alpha * (T.exp(input_) - 1))
def _applyNL(self, lin_out):
    if self.params['nonlinearity'] == 'relu':
        if 'leaky_params' in self.params:
            return T.nnet.relu(lin_out, alpha=self.params['leaky_params'])
        else:
            return T.nnet.relu(lin_out)
    elif self.params['nonlinearity'] == 'softplus':
        return T.nnet.softplus(lin_out)
    elif self.params['nonlinearity'] == 'elu':
        return T.switch(lin_out > 0, lin_out, T.exp(lin_out) - 1)
    elif self.params['nonlinearity'] == 'maxout':
        maxout_out = None
        for i in xrange(self.params['maxout_stride']):
            tmp = lin_out[:, i::self.params['maxout_stride']]
            if maxout_out is None:
                maxout_out = tmp
            else:
                maxout_out = T.maximum(maxout_out, tmp)
        return maxout_out
    else:
        return T.tanh(lin_out)
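The maxout branch above takes an element-wise maximum over maxout_stride interleaved column slices of lin_out; a small NumPy sketch with made-up shapes shows the effect:

import numpy as np

lin_out = np.arange(12, dtype='float32').reshape(2, 6)   # (batch=2, units=6)
stride = 2
# lin_out[:, 0::2] and lin_out[:, 1::2] each have shape (2, 3);
# maxout keeps the element-wise maximum, so the number of units is divided by stride
maxout_out = np.maximum(lin_out[:, 0::stride], lin_out[:, 1::stride])
print(maxout_out.shape)   # (2, 3)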
def normalize(grads, grad_norm):
    """
    grads: list of gradients
    grad_norm: None (or positive value)
    returns: gradients rescaled to satisfy norm constraints
    """
    # Check if we're clipping gradients
    if grad_norm is not None:
        assert grad_norm > 0, 'Must specify a positive value to normalize to'
        print '<<<<<< Normalizing Gradients to have norm (', grad_norm, ') >>>>>>'
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(T.switch(g2 > (grad_norm**2), g / T.sqrt(g2) * grad_norm, g))
        return new_grads
    else:
        return grads
def clip_grad_remove_nan(grads, clip_c_shared, mt_tparams):
    g2 = 0.
    for g in grads:
        g2 += (g*g).sum()

    not_finite = tensor.or_(tensor.isnan(g2), tensor.isinf(g2))

    if clip_c_shared.get_value() > 0.:
        new_grads = []
        for g, p in zip(grads, itemlist(mt_tparams)):
            tmpg = tensor.switch(g2 > (clip_c_shared*clip_c_shared),
                                 g / tensor.sqrt(g2) * clip_c_shared,
                                 g)
            new_grads.append(tensor.switch(not_finite, np.float32(.1)*p, tmpg))
        return new_grads, tensor.sqrt(g2)
    else:
        return grads, tensor.sqrt(g2)
def gradient_descent(self, loss):
    """Momentum GD with gradient clipping."""
    grad = T.grad(loss, self.params)
    self.momentum_velocity_ = [0.] * len(grad)
    grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), grad)))
    updates = OrderedDict()
    not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
    scaling_den = T.maximum(5.0, grad_norm)

    for n, (param, grad) in enumerate(zip(self.params, grad)):
        grad = T.switch(not_finite, 0.1 * param, grad * (5.0 / scaling_den))
        velocity = self.momentum_velocity_[n]
        update_step = self.momentum * velocity - self.learning_rate * grad
        self.momentum_velocity_[n] = update_step
        updates[param] = param + update_step

    return updates
def shared_dropout_layer(shape, use_noise, trng, value, scaled=True):
    # re-scale dropout at training time, so we don't need to at test time
    if scaled:
        proj = tensor.switch(
            use_noise,
            trng.binomial(shape, p=value, n=1, dtype=floatX) / value,
            theano.shared(numpy_floatX(1.)))
    else:
        proj = tensor.switch(
            use_noise,
            trng.binomial(shape, p=value, n=1, dtype=floatX),
            theano.shared(numpy_floatX(value)))
    return proj


# layer normalization
# code from https://github.com/ryankiros/layer-norm
def build_model(model_):
    global fn_predict, fn_record
    global g_ozer, g_mdl

    g_ozer = dict(simple=VanillaSGD, adam=AdamSGD)[OZER]()
    g_ozer.lr = LEARN_RATE

    s_x = T.tensor4('x')
    s_y = T.ivector('y')
    s_pdpo = T.scalar()
    s_out = model_(s_x, s_pdpo)

    s_y_onehot = T.extra_ops.to_one_hot(s_y, len(g_dataset.label_map))
    s_loss = T.mean(-s_y_onehot*T.log(s_out + 1e-3))
    s_accr = T.mean(
        T.switch(
            T.eq(T.argmax(s_out, axis=1), T.argmax(s_y_onehot, axis=1)), 1, 0))

    no_dropout = [(s_pdpo, T.constant(0., dtype=th.config.floatX))]
    fn_predict = th.function(
        [s_x, s_y],
        {'pred': s_out, 'accr': s_accr, 'loss': s_loss},
        givens=no_dropout, profile=PROFILE)

    rec_fetches = {
        'x': s_x, 'y': s_y,
        'pred': s_out}
    rec_fetches.update(g_mdl.params_di)
    fn_record = th.function(
        [s_x, s_y], rec_fetches, givens=no_dropout, profile=PROFILE)

    g_ozer.compile(
        [s_x, s_y],
        s_loss,
        g_mdl.params_di.values(),
        fetches_={'pred': s_out, 'loss': s_loss, 'accr': s_accr},
        givens_=[(s_pdpo, T.constant(TRAIN_PDPO, dtype=th.config.floatX))],
        profile_=PROFILE)
def past_weight_grad_calculator(xs, es, kp_x, kd_x, kp_e, kd_e, shapes):
    """
    Do an efficient update of the weights given the two spike-trains.

    This isn't actually implemented as an efficient update, but it will produce the identical result as if it were.

    :param xs: An (n_samples, n_in) array
    :param es: An (n_samples, n_out) array
    :param kp_x: kp for the x units
    :param kd_x: kd for the x units
    :param kp_e: kp for the e units
    :param kd_e: kd for the e units
    :param shapes: (minibatch_size, n_in, n_out)
    :return: An (n_in, n_out) approximate weight gradient.
    """
    # TODO: Make this actually use sparsity, one of these days.
    kp_x, kd_x, kp_e, kd_e = [as_floatx(k) for k in (kp_x, kd_x, kp_e, kd_e)]
    n_samples, n_in, n_out = shapes
    rx = kd_x/(kp_x+kd_x)
    re = kd_e/(kp_e+kd_e)

    tx_last = create_shared_variable(np.zeros((n_samples, n_in))+1)
    te_last = create_shared_variable(np.zeros((n_samples, n_out))+1)
    x_last = create_shared_variable(np.zeros((n_samples, n_in)))
    e_last = create_shared_variable(np.zeros((n_samples, n_out)))

    t_last = tt.minimum(tx_last[:, :, None], te_last[:, None, :])

    x_spikes = tt.neq(xs, 0)
    dw_potentials = x_last[:, :, None] * e_last[:, None, :] * \
        rx**(tx_last[:, :, None]-t_last) \
        * re**(te_last[:, None, :]-t_last) \
        * geoseries_sum(rx*re, t_end=t_last, t_start=1)
    e_spikes = tt.neq(es, 0)
    dws = (x_spikes[:, :, None]+e_spikes[:, None, :]-x_spikes[:, :, None]*e_spikes[:, None, :])*dw_potentials  # (n_samples, n_in, n_out)

    add_update(x_last, tt.switch(x_spikes, x_last*rx**tx_last + xs/as_floatx(kd_x), x_last))
    add_update(e_last, tt.switch(e_spikes, e_last*rx**te_last + es/as_floatx(kd_e), e_last))
    add_update(tx_last, tt.switch(x_spikes, 1, tx_last+1))
    add_update(te_last, tt.switch(e_spikes, 1, te_last+1))

    return dws.sum(axis=0)
def past_weight_grad_calculator_reloaded(xs, es, kp_x, kd_x, kp_e, kd_e, shapes):
    """
    Do an efficient update of the weights given the two spike-trains.

    This isn't actually implemented as an efficient update, but it will produce the identical result as if it were.

    :param xs: An (n_samples, n_in) array
    :param es: An (n_samples, n_out) array
    :param kp_x: kp for the x units
    :param kd_x: kd for the x units
    :param kp_e: kp for the e units
    :param kd_e: kd for the e units
    :param shapes: (minibatch_size, n_in, n_out)
    :return: An (n_in, n_out) approximate weight gradient.
    """
    # TODO: RESOLVE INSTABILITY ISSUE
    kp_x, kd_x, kp_e, kd_e = [as_floatx(k) for k in (kp_x, kd_x, kp_e, kd_e)]
    n_samples, n_in, n_out = shapes
    rx = kd_x/(kp_x+kd_x)
    re = kd_e/(kp_e+kd_e)

    tx_last = create_shared_variable(np.zeros((n_samples, n_in)))
    te_last = create_shared_variable(np.zeros((n_samples, n_out)))
    xr = create_shared_variable(np.zeros((n_samples, n_in)))
    er = create_shared_variable(np.zeros((n_samples, n_out)))

    x_spikes = tt.neq(xs, 0)
    e_spikes = tt.neq(es, 0)

    t_last = tt.maximum(tx_last[:, :, None], te_last[:, None, :])
    sum_to_last = geoseries_sum(rx*re, t_start=t_last, t_end=0)  # Wasteful, since most of this is multiplied by zeros later, but for now it don't matter

    spikes = tt.bitwise_or(x_spikes[:, :, None], e_spikes[:, None, :])
    dw_es = (xr[:, :, None]*er[:, None, :]*spikes)*sum_to_last  # PROBLEM HERE!!!! Can be very small number times very large number
    # dw_es = (xr[:, :, None]*(x_spikes[:, :, None]-x_spikes[:, :, None]*e_spikes[:, None, :]) * er[:, None, :] + xr[:, :, None] * (er*e_spikes)[:, None, :]) * sum_to_last

    add_update(xr, xr*rx + xs/(kp_x+kd_x))
    add_update(er, er*re + es/(kp_e+kd_e))
    add_update(tx_last, tt.switch(x_spikes, 0, tx_last-1))
    add_update(te_last, tt.switch(e_spikes, 0, te_last-1))

    return dw_es.sum(axis=0)
def matrix_weight_grad_calculator(xs, es, kp_x, kd_x, kp_e, kd_e, shapes, epsilon=1e-7):
    """
    :param xs:
    :param es:
    :param kp_x:
    :param kd_x:
    :param kp_e:
    :param kd_e:
    :param shapes:
    :param epsilon:
    :return:
    """
    kp_x, kd_x, kp_e, kd_e = [as_floatx(k) for k in (kp_x, kd_x, kp_e, kd_e)]
    n_samples, n_in, n_out = shapes
    v1 = create_shared_variable(np.zeros((n_samples, n_in, n_out)))
    rx = kd_x/(kp_x+kd_x)
    re = kd_e/(kp_e+kd_e)
    xr = create_shared_variable(np.zeros((n_samples, n_in)))
    er = create_shared_variable(np.zeros((n_samples, n_out)))

    x_spikes = tt.neq(xs, 0)
    e_spikes = tt.neq(es, 0)

    xr_decayed = xr*rx
    er_decayed = er*re
    spikes = tt.bitwise_or(x_spikes[:, :, None], e_spikes[:, None, :])
    v2 = xr_decayed[:, :, None]*er_decayed[:, None, :]
    dws = (spikes*(v2-v1))/(rx*re-1)

    new_xr = xr_decayed + xs/(kp_x+kd_x)
    new_er = er_decayed + es/(kp_e+kd_e)

    add_update(v1, tt.switch(spikes, new_xr[:, :, None]*new_er[:, None, :], v1))
    add_update(xr, new_xr)
    add_update(er, new_er)

    return dws.sum(axis=0)
def gradient_clipping(gradients, max_norm=5.0):
    global_grad_norm = tensor.sqrt(sum(map(lambda x: tensor.sqr(x).sum(), gradients)))
    multiplier = tensor.switch(global_grad_norm < max_norm, 1.0, max_norm / global_grad_norm)
    return [g * multiplier for g in gradients]
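A hedged usage sketch for a helper like gradient_clipping above, wiring the rescaled gradients into plain SGD updates. The toy linear model, data shapes, and learning rate below are my own placeholders, not part of the original project.

import numpy as np
import theano
import theano.tensor as tensor

x = tensor.matrix('x')
y = tensor.vector('y')
w = theano.shared(np.zeros(3, dtype=theano.config.floatX), name='w')
loss = tensor.mean((tensor.dot(x, w) - y) ** 2)

grads = tensor.grad(loss, wrt=[w])
clipped = gradient_clipping(grads, max_norm=5.0)   # helper defined above

lr = 0.1
updates = [(p, p - lr * g) for p, g in zip([w], clipped)]
train_fn = theano.function([x, y], loss, updates=updates)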
def switch(condition, t, e):
    if K.backend() == 'tensorflow':
        import tensorflow as tf
        return tf.where(condition, t, e)
    elif K.backend() == 'theano':
        import theano.tensor as tt
        return tt.switch(condition, t, e)
def InvReLU(x):
    """
    Rectified linear unit applied to the negated input
    :param x: input value
    :return: max(-x, 0)
    """
    import theano.tensor as T
    x *= -1.
    return T.switch(x < 0, 0, x)
def TruncLin(x):
    """
    Truncated linear unit
    :param x: input value
    :return: max(min(x, 1), -1)
    """
    import theano.tensor as T
    return T.switch(x < -1, -1, T.switch(x > 1, 1, x))
def TruncReLU(x):
    """
    Truncated rectified linear unit
    :param x: input value
    :return: max(min(x, 1), 0)
    """
    import theano.tensor as T
    return T.switch(x < 0, 0, T.switch(x > 1, 1, x))
def SlopeLin(slope):
    """
    Linear unit with different slopes
    :param slope: slope of negative quadrant
    :return: x if x > 0 else x/slope
    """
    import theano.tensor as T

    def inner(x):
        return T.switch(T.gt(x, 0), x, T.true_div(x, slope))
    return inner
def SlopeLinInv(slope):
    """
    Linear unit with different slopes (inverse of SlopeLin)
    :param slope: slope of negative quadrant
    :return: x if x > 0 else x*slope
    """
    import theano.tensor as T

    def inner(x):
        return T.switch(T.gt(x, 0), x, T.mul(x, slope))
    return inner
def huber(delta):
    """
    Huber loss, robust at 0
    :param delta: delta parameter
    :return: loss value
    """
    import theano.tensor as T

    def inner(target, output):
        d = target - output
        a = .5 * d**2
        b = delta * (T.abs_(d) - delta / 2.)
        l = T.switch(T.abs_(d) <= delta, a, b)
        return l
    return inner
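A brief usage sketch for the huber factory above; the toy vectors and the final mean reduction are assumptions added for illustration.

import numpy as np
import theano
import theano.tensor as T

target = T.vector('target')
output = T.vector('output')
loss = huber(1.0)(target, output).mean()   # huber as defined above
f = theano.function([target, output], loss)

print(f(np.array([0., 2., 10.], dtype=theano.config.floatX),
        np.array([0., 0., 0.], dtype=theano.config.floatX)))
# |d| <= delta uses the quadratic branch, |d| > delta the linear one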
def clip_norm(g, c, n):
    if c > 0:
        g = T.switch(T.ge(n, c), g * c / n, g)
    return g
def relu(x):
    return T.switch(x > 0, x, 0)
def dropout_layer(state_before, use_noise, trng, prob):
    proj = T.switch(use_noise,
                    (state_before *
                     trng.binomial(state_before.shape, p=prob, n=1,
                                   dtype=state_before.dtype)),
                    state_before * 0.5)
    return proj
def dropout_layer(state_before, use_noise, trng):
    proj = tensor.switch(
        use_noise,
        state_before * trng.binomial(state_before.shape, p=0.5, n=1,
                                     dtype=state_before.dtype),
        state_before * 0.5)
    return proj


# -------------------------------------------------------------------------#
# Feedforward:
# affine transformation + point-wise nonlinearity
def grad_clip(dJ, clip_c=1):
    clip_c = float(clip_c)
    if clip_c > 0.:
        g2 = 0.
        for g in dJ:
            g2 += (g ** 2).sum()
        new_grads = []
        for g in dJ:
            new_grads.append(tensor.switch(g2 > (clip_c ** 2),
                                           g / tensor.sqrt(g2) * clip_c,
                                           g))
        dJ = new_grads
    return dJ
def ReLU(cls):
    return lambda x: T.switch(x < 0, 0, x)
def get_output_for(self, inputs, **kwargs):
    in1, in2 = inputs
    out = T.switch(T.gt(in2, 0.), in1, 0.)
    return out
def get_output_for(self, inputs, **kwargs):
    in1, in2 = inputs
    out = T.switch(T.gt(in2, 0.), 0., in1)
    return out
def smooth_l1_loss(predictions, targets, sigma=1.5):
    cond = np.float32(1. / sigma / sigma)
    point_five = np.float32(0.5)
    sigma_t = np.float32(sigma)
    sub_const = np.float32(0.5 / sigma / sigma)

    diff = T.abs_(predictions - targets)
    out = T.switch(T.lt(diff, cond),
                   point_five * sigma_t * diff * sigma_t * diff,
                   diff - sub_const)
    return T.mean(T.sum(out, axis=1))
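For reference, the switch above follows the Fast R-CNN style smooth L1 loss: 0.5 * (sigma * d)**2 when |d| < 1 / sigma**2, and |d| - 0.5 / sigma**2 otherwise. A quick NumPy check (toy value, my own assumption of the intent) confirms the two branches agree at the threshold:

import numpy as np

sigma = 1.5
thresh = 1. / sigma ** 2
d = thresh                      # evaluate both branches at the breakpoint
quad = 0.5 * (sigma * d) ** 2
lin = d - 0.5 / sigma ** 2
print(np.isclose(quad, lin))    # True: the loss is continuous at the switch point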
def applyActivationFunction_ReLU_v2(inputData):
    return T.switch(inputData < 0., 0., inputData)


# --- Version 3 ---
def set_params(self):
    counter = 0
    self.params = OrderedDict(counter=counter, switch=switch)
def __call__(self, x):
    counter = T.switch(T.ge(self.counter, self.rate), 0, self.counter + 1)
    switch = T.switch(T.ge(self.counter, 0), 1, 0)
    x = T.switch(switch, eval(method)(x), x)
    return OrderedDict(x=x), theano.OrderedUpdates([(self.counter, counter)])
def switch(condition, then_expression, else_expression):
    '''condition: scalar tensor.
    '''
    return T.switch(condition, then_expression, else_expression)
def in_train_phase(x, alt):
    if _LEARNING_PHASE is 1:
        return x
    elif _LEARNING_PHASE is 0:
        return alt
    x = T.switch(_LEARNING_PHASE, x, alt)
    x._uses_learning_phase = True
    return x