我们从Python开源项目中,提取了以下22个代码示例,用于说明如何使用theano.tensor.or_()。
def clip_grad_remove_nan(grads, clip_c_shared, mt_tparams):
    """Clip gradients by global L2 norm and replace them when the norm is NaN/Inf.

    Args:
        grads: iterable of symbolic gradient expressions.
        clip_c_shared: shared variable holding the clipping threshold; its
            current value is read once, while the graph is being built.
        mt_tparams: parameter container handed to ``itemlist``; presumably
            yields the parameters in the same order as ``grads`` -- TODO confirm.

    Returns:
        ``(new_grads, norm)`` where ``norm`` is the symbolic global L2 norm.
        When the threshold is not positive the gradients are returned
        untouched (no clipping and no NaN/Inf replacement).
    """
    # Squared global L2 norm, accumulated from a float zero.
    sq_norm = sum(((g * g).sum() for g in grads), 0.)
    has_bad_value = tensor.or_(tensor.isnan(sq_norm), tensor.isinf(sq_norm))
    norm = tensor.sqrt(sq_norm)

    # The threshold is inspected eagerly, so this choice is baked into the graph.
    if not clip_c_shared.get_value() > 0.:
        return grads, norm

    threshold_sq = clip_c_shared * clip_c_shared
    cleaned = []
    for grad, param in zip(grads, itemlist(mt_tparams)):
        # Rescale to norm == threshold only when the norm exceeds it.
        clipped = tensor.switch(sq_norm > threshold_sq,
                                grad / norm * clip_c_shared,
                                grad)
        # A non-finite norm swaps the gradient for 0.1 * parameter.
        cleaned.append(
            tensor.switch(has_bad_value, np.float32(.1) * param, clipped))
    return cleaned, norm
def gradient_descent(self, loss):
    """Momentum GD with gradient clipping.

    Differentiates ``loss`` with respect to ``self.params``, scales the
    gradients by ``5 / max(5, global_norm)`` and, when the global norm is
    NaN or Inf, substitutes ``0.1 * param`` for each gradient.

    Side effect: ``self.momentum_velocity_`` is reset and then filled with
    the symbolic update step of every parameter.

    Returns:
        OrderedDict mapping each parameter to its updated expression.
    """
    gradients = T.grad(loss, self.params)
    self.momentum_velocity_ = [0.] * len(gradients)

    total_norm = T.sqrt(sum(T.sqr(g).sum() for g in gradients))
    is_bad = T.or_(T.isnan(total_norm), T.isinf(total_norm))
    # Norms up to 5 give a scale of 1; larger norms are rescaled down to 5.
    denom = T.maximum(5.0, total_norm)

    updates = OrderedDict()
    for idx, (param, g) in enumerate(zip(self.params, gradients)):
        safe_g = T.switch(is_bad, 0.1 * param, g * (5.0 / denom))
        # NOTE(review): the velocity read here is always the 0. stored just
        # above, so the momentum term contributes nothing to the step.
        previous = self.momentum_velocity_[idx]
        delta = self.momentum * previous - self.learning_rate * safe_g
        self.momentum_velocity_[idx] = delta
        updates[param] = param + delta
    return updates
def test_or(self):
    """Check how ``T.or_`` with a constant 0 or 1 compiles under 'canonicalize'.

    Both NumPy int8 constants and plain Python ints are tried, with the
    constant on either side of the operator.
    """
    mode = theano.compile.get_default_mode().including('canonicalize')
    x = T.scalar('x', dtype='int8')

    constant_pairs = [(numpy.int8(0), numpy.int8(1)), (0, 1)]
    for zero, one in constant_pairs:
        # OR with one: checked through assert_eqs_const(f, 1).
        for lhs, rhs in ((x, one), (one, x)):
            f = theano.function([x], T.or_(lhs, rhs), mode=mode)
            self.assert_eqs_const(f, 1)

        # OR with zero: checked through assert_identity, but only when the
        # compiled output kept the dtype of x.
        for lhs, rhs in ((x, zero), (zero, x)):
            f = theano.function([x], T.or_(lhs, rhs), mode=mode)
            if f.outputs[0].variable.dtype == x.dtype:
                self.assert_identity(f)
def compute_updates(training_cost, params, config):
    """Build Adam updates from norm-clipped, NaN-guarded gradients.

    Args:
        training_cost: symbolic scalar cost to differentiate.
        params: parameters to differentiate with respect to.
        config: object providing ``learning_rate``.

    Returns:
        Whatever ``Adam(grads, config.learning_rate)`` returns.
    """
    raw_grads = OrderedDict(zip(params, T.grad(training_cost, params)))

    # Clip to a global L2 norm of 1.
    max_norm = np.float32(1.)
    global_norm = T.sqrt(sum(T.sum(g ** 2) for g in raw_grads.values()))
    scale = T.switch(T.ge(global_norm, max_norm),
                     max_norm / global_norm,
                     np.float32(1.))
    is_bad = T.or_(T.isnan(global_norm), T.isinf(global_norm))

    # A non-finite norm swaps each gradient for 0.1 * parameter.
    safe_grads = OrderedDict(
        (p, T.switch(is_bad, np.float32(.1) * p, g * scale))
        for p, g in raw_grads.items()
    )

    # Hand the processed gradients to the Adam helper.
    return Adam(safe_grads, config.learning_rate)
def or_(self, l, r):
    """Return ``T.or_(l, r)`` for the two operands."""
    disjunction = T.or_(l, r)
    return disjunction
def get_updates(self, loss, lr, max_norm=1, beta1=0.9, beta2=0.999,
                epsilon=1e-8, grads=None):
    """Build Adam-style updates with gradient clipping and a numeric safeguard.

    Args:
        loss: symbolic cost; differentiated only when ``grads`` is None.
        lr: base learning rate.
        max_norm: norm limit passed to ``theanotools.clipping_multiplier``.
        beta1, beta2: decay rates of the first and second moment estimates.
        epsilon: added to the square root of the second moment.
        grads: optional precomputed gradients for ``self.trainables``.

    Returns:
        ``(norm, grads, updates)``: the global gradient norm measured before
        clipping, the clipped and safeguarded gradients, and the list of
        ``(shared_variable, new_value)`` pairs.
    """
    if grads is None:
        grads = tensor.grad(loss, self.trainables)

    # Clipping: every gradient is multiplied by the same factor.
    norm = tensor.sqrt(sum([tensor.sqr(g).sum() for g in grads]))
    clip_scale = theanotools.clipping_multiplier(norm, max_norm)
    grads = [clip_scale * g for g in grads]

    # Safeguard: zero all gradients when the norm is NaN/Inf, negative, or
    # larger than 1e10.
    norm_not_finite = tensor.or_(tensor.isnan(norm), tensor.isinf(norm))
    norm_out_of_range = tensor.or_(norm < 0, norm > 1e10)
    unstable = tensor.or_(norm_not_finite, norm_out_of_range)
    grads = [tensor.switch(unstable, np.float32(0), g) for g in grads]

    # Bias-corrected step size for the next time step.
    t = self.time + 1
    lr_t = lr * tensor.sqrt(1. - beta2**t) / (1. - beta1**t)

    means_t = [beta1 * mean + (1. - beta1) * g
               for g, mean in zip(grads, self.means)]
    vars_t = [beta2 * var + (1. - beta2) * tensor.sqr(g)
              for g, var in zip(grads, self.vars)]
    steps = [lr_t * mean_t / (tensor.sqrt(var_t) + epsilon)
             for mean_t, var_t in zip(means_t, vars_t)]

    # Updates: parameters first, then both moment estimates, then the clock.
    updates = [(x, x - step) for x, step in zip(self.trainables, steps)]
    updates.extend(zip(self.means, means_t))
    updates.extend(zip(self.vars, vars_t))
    updates.append((self.time, t))

    return norm, grads, updates
def adam(self, cost, params, learning_rate=0.001, beta1=0.9, beta2=0.999,
         epsilon=1e-8):
    """Build Adam updates with a gradient-norm constraint and a NaN/Inf guard.

    Args:
        cost: symbolic scalar cost.
        params: shared variables to optimise.
        learning_rate, beta1, beta2, epsilon: Adam hyper-parameters.

    Returns:
        OrderedDict of updates for the moment accumulators, the parameters
        and the step counter.
    """
    # total_norm_constraint(..., 10) presumably rescales the gradients so
    # that their joint norm is at most 10 -- TODO confirm against the helper.
    constrained = total_norm_constraint(T.grad(cost=cost, wrt=params), 10)
    norm = T.sqrt(sum(T.sqr(g).sum() for g in constrained))
    is_bad = T.or_(T.isnan(norm), T.isinf(norm))

    step_count = theano.shared(utils.floatX(0.))
    next_count = step_count + 1
    step_size = (learning_rate * T.sqrt(1 - beta2**next_count)
                 / (1 - beta1**next_count))

    updates = OrderedDict()
    for param, grad in zip(params, constrained):
        # A non-finite norm swaps the gradient for 0.1 * parameter.
        grad = T.switch(is_bad, 0.1 * param, grad)

        # Two zero-initialised accumulators shaped and typed like the parameter.
        value = param.get_value(borrow=True)
        first_moment = theano.shared(
            np.zeros(value.shape, dtype=value.dtype),
            broadcastable=param.broadcastable)
        second_moment = theano.shared(
            np.zeros(value.shape, dtype=value.dtype),
            broadcastable=param.broadcastable)

        new_first = beta1 * first_moment + (1 - beta1) * grad
        new_second = beta2 * second_moment + (1 - beta2) * grad**2
        delta = step_size * new_first / (T.sqrt(new_second) + epsilon)

        updates[first_moment] = new_first
        updates[second_moment] = new_second
        updates[param] = param - delta

    updates[step_count] = next_count
    return updates
def gradient_clipping(grads, tparams, clip_c=10):
    """Rescale gradients whose global L2 norm exceeds ``clip_c``.

    Args:
        grads: iterable of symbolic gradient expressions.
        tparams: mapping of parameters; only the number of its values is
            used, as the second leg of a ``zip`` with ``grads``.
        clip_c: norm threshold.

    Returns:
        ``(new_grads, not_finite, clipped)``: the possibly rescaled
        gradients, a symbolic flag that is true when the norm is NaN/Inf,
        and a symbolic flag that is true when ``clip_c < norm``.
        The non-finite flag is only reported; gradients are not replaced.
    """
    sq_total = sum(((g ** 2).sum() for g in grads), 0.)
    norm = tensor.sqrt(sq_total)
    is_bad = tensor.or_(tensor.isnan(norm), tensor.isinf(norm))

    # zip() stops at the shorter input, so the output has at most
    # len(tparams) entries.
    rescaled = [
        tensor.switch(norm > clip_c, g * (clip_c / norm), g)
        for _, g in zip(tparams.values(), grads)
    ]
    return rescaled, is_bad, tensor.lt(clip_c, norm)
def test_elemwise(self):
    """Check that an elemwise op over switches on one condition yields one Switch.

    Each op is applied to switches sharing the condition ``c``; after
    ``optimize`` the printed graph must contain 'Switch' exactly once.
    """
    def assert_single_switch(inputs, ops, *operands):
        # Optimise op(*operands) and count Switch nodes in the printed graph.
        for op in ops:
            g = optimize(FunctionGraph(inputs, [op(*operands)]))
            assert str(g).count('Switch') == 1

    # float Ops
    float_ops = (T.add, T.sub, T.mul, T.true_div, T.int_div, T.floor_div,
                 T.minimum, T.maximum, T.gt, T.lt, T.ge, T.le, T.eq, T.neq,
                 T.pow)
    mats = theano.tensor.matrices('cabxy')
    c, a, b, x, y = mats
    s1 = T.switch(c, a, b)
    s2 = T.switch(c, x, y)
    assert_single_switch(mats, float_ops, s1, s2)

    # integer Ops
    int_ops = (T.and_, T.or_, T.xor,
               T.bitwise_and, T.bitwise_or, T.bitwise_xor)
    mats = theano.tensor.imatrices('cabxy')
    c, a, b, x, y = mats
    s1 = T.switch(c, a, b)
    s2 = T.switch(c, x, y)
    assert_single_switch(mats, int_ops, s1, s2)

    # add/mul with more than two inputs (reuses the integer c, s1 and s2)
    u, v = theano.tensor.matrices('uv')
    s3 = T.switch(c, u, v)
    assert_single_switch(mats + [u, v], (T.add, T.mul), s1, s2, s3)
def get_grad_param(self):
    """Set ``self.grad_norm`` and ``self.grad`` from the model gradients.

    Both the global L2 norm and each gradient are divided by
    ``interface_layer.input.shape[1]`` cast to float32 (presumably the batch
    size -- TODO confirm). When ``self.clip_threshold`` is not None, the
    gradients are rescaled so that a norm at or above the threshold is
    brought down to it. No NaN/Inf replacement is applied.
    """
    divisor = TT.cast(self.model.interface_layer.input.shape[1], 'float32')

    squared_total = sum(TT.sqr(g).sum() for g in self.model.grad)
    self.grad_norm = TT.sqrt(squared_total) / divisor
    self.grad = [g / divisor for g in self.model.grad]

    if self.clip_threshold is None:
        return

    threshold = self.clip_threshold
    norm = self.grad_norm
    self.grad = [
        TT.switch(TT.ge(norm, threshold), g * threshold / norm, g)
        for g in self.grad
    ]
def pseudograd(loss, params, srng=None, temperature=1.0e-1,
               learning_rate=1.0e-2, rho2=0.95):
    """Build gradient-free updates from random perturbations with acceptance.

    A random direction is drawn per parameter via ``make_normal`` and the
    loss is re-evaluated at ``param + learning_rate * delta``. The proposal
    is accepted when a uniform draw ``u`` satisfies
    ``u <= exp((loss - new_loss) / temperature)`` and the new loss is finite.
    An accepted direction is blended into a momentum buffer.

    Args:
        loss: symbolic scalar loss.
        params: parameters to perturb.
        srng: random stream. NOTE(review): the default is None, but
            ``srng.uniform`` is called unconditionally below, so callers
            must pass a stream.
        temperature: divisor of the loss difference in the acceptance rule.
        learning_rate: step size for both the proposal and the parameter update.
        rho2: decay factor of the momentum buffers.

    Returns:
        OrderedDict of updates for the momentum buffers and the parameters.
    """
    one = T.constant(1.0)
    zero = T.constant(0.0)

    deltas = [make_normal(p, srng=srng) for p in params]
    momentum = [make_copy(p) for p in params]

    # Loss evaluated at the perturbed parameters.
    proposal = {p: p + learning_rate * d for p, d in zip(params, deltas)}
    new_loss = theano.clone(loss, replace=proposal)

    accept_prob = T.exp((loss - new_loss) / temperature)
    u = srng.uniform(size=(), dtype=loss.dtype)
    rejected = T.or_(T.or_(u > accept_prob, T.isnan(new_loss)),
                     T.isinf(new_loss))
    # step is 0 for a rejected proposal and 1 for an accepted one.
    step = T.switch(rejected, zero, one)

    updates = OrderedDict(
        (m, m * rho2 + (one - rho2) * d * step)
        for m, d in zip(momentum, deltas)
    )
    # Parameters move along the momentum value from before this update.
    updates.update(
        (p, p + learning_rate * m) for p, m in zip(params, momentum)
    )
    return updates
def rmsprop(cost, params, learning_rate, momentum=0.5, rescale=5.):
    """Build RMSProp-with-momentum updates with rescaling and a NaN/Inf guard.

    For every parameter three zero-initialised shared buffers are kept:
    a running average of squared gradients, a running average of gradients,
    and a momentum memory. With ``rms = max(sqrt(E[g^2] - E[g]^2), 1e-4)``:

        memory <- momentum * memory - learning_rate * g / rms
        param  <- param + momentum^2 * memory
                        - (1 + momentum) * learning_rate * g / rms

    Args:
        cost: symbolic scalar cost.
        params: shared variables to optimise.
        learning_rate: step size.
        momentum: momentum coefficient.
        rescale: numerator (and lower bound of the denominator) of the
            gradient scaling factor.

    Returns:
        List of ``(shared_variable, new_value)`` pairs.
    """
    grads = T.grad(cost=cost, wrt=params)

    def zero_buffers():
        # One zero-filled shared buffer per parameter, same dtype/broadcast.
        return [theano.shared(np.zeros_like(p.get_value(), dtype=p.dtype),
                              broadcastable=p.broadcastable)
                for p in params]

    running_squares = zero_buffers()
    running_avgs = zero_buffers()
    memories = zero_buffers()

    grad_norm = T.sqrt(sum(T.sqr(g).sum() for g in grads))
    not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
    # NOTE(review): grad_norm is already an L2 norm; a second sqrt is applied
    # before it is used as the scaling denominator. Kept as in the original --
    # confirm whether this is intended.
    scaling_den = T.maximum(rescale, T.sqrt(grad_norm))

    # Magic constants
    decay = 0.9
    min_rms = 1E-4

    updates = []
    for param, grad, sq_avg, avg, memory in zip(
            params, grads, running_squares, running_avgs, memories):
        # A non-finite norm swaps the gradient for 0.1 * parameter.
        grad = T.switch(not_finite, 0.1 * param,
                        grad * (rescale / scaling_den))

        new_sq_avg = decay * sq_avg + (1. - decay) * T.sqr(grad)
        new_avg = decay * avg + (1. - decay) * grad
        rms = T.maximum(T.sqrt(new_sq_avg - new_avg ** 2), min_rms)

        new_memory = momentum * memory - learning_rate * grad / rms
        param_step = momentum * momentum * memory - (
            1 + momentum) * learning_rate * grad / rms

        updates.extend([
            (sq_avg, new_sq_avg),
            (avg, new_avg),
            (memory, new_memory),
            (param, param + param_step),
        ])
    return updates
def compute_updates(self, training_cost, params):
    """Build optimizer updates from clipped, NaN-guarded gradients.

    Gradients are rescaled so their global L2 norm does not exceed
    ``self.cutoff``; when the norm is NaN/Inf each gradient is replaced by
    ``0.1 * param``. If ``self.W_emb`` is among the parameters and
    pretrained embeddings are to stay fixed, its gradient is multiplied by
    ``self.W_emb_pretrained_mask``.

    Returns:
        ``(updates, optimizer_variables)``. ``optimizer_variables`` is an
        empty list unless ``self.updater == 'adam'``.

    Raises:
        Exception: for the 'sgd' updater or an unrecognised updater name.
    """
    raw_grads = OrderedDict(zip(params, T.grad(training_cost, params)))

    # Gradient clipping
    cutoff = numpy.float32(self.cutoff)
    global_norm = T.sqrt(sum(T.sum(g ** 2) for g in raw_grads.values()))
    scale = T.switch(T.ge(global_norm, cutoff),
                     cutoff / global_norm,
                     numpy.float32(1.))
    is_bad = T.or_(T.isnan(global_norm), T.isinf(global_norm))
    grads = OrderedDict(
        (p, T.switch(is_bad, numpy.float32(.1) * p, g * scale))
        for p, g in raw_grads.items()
    )

    if self.W_emb in grads:
        keep_pretrained_fixed = (
            self.initialize_from_pretrained_word_embeddings
            and self.fix_pretrained_word_embeddings)
        if keep_pretrained_fixed:
            assert not self.fix_encoder_parameters
            # Keep pretrained word embeddings fixed
            logger.debug("Will use mask to fix pretrained word embeddings")
            grads[self.W_emb] = grads[self.W_emb] * self.W_emb_pretrained_mask
        elif self.fix_encoder_parameters:
            # If 'fix_encoder_parameters' is on, the word embeddings will be
            # excluded from parameter training set
            logger.debug("Will fix word embeddings to initial embeddings or embeddings from resumed model")
        else:
            logger.debug("Will train all word embeddings")

    optimizer_variables = []
    updater = self.updater
    if updater == 'adagrad':
        updates = Adagrad(grads, self.lr)
    elif updater == 'sgd':
        raise Exception("Sgd not implemented!")
    elif updater == 'adadelta':
        updates = Adadelta(grads)
    elif updater == 'rmsprop':
        updates = RMSProp(grads, self.lr)
    elif updater == 'adam':
        # Only Adam also returns its internal optimizer variables.
        updates, optimizer_variables = Adam(grads, self.lr)
    else:
        raise Exception("Updater not understood!")

    return updates, optimizer_variables
# Batch training function.
def compute_updates(self, training_cost, params):
    """Build optimizer updates from clipped, NaN-guarded gradients.

    Gradients are rescaled so their global L2 norm does not exceed
    ``self.cutoff``; when the norm is NaN/Inf each gradient is replaced by
    ``0.1 * param``. When pretrained embeddings are to stay fixed, the
    gradient of ``self.W_emb`` is multiplied by
    ``self.W_emb_pretrained_mask`` (``self.W_emb`` must then be one of
    ``params``).

    Returns:
        The updates produced by the optimizer named in ``self.updater``.

    Raises:
        Exception: for the 'sgd' updater or an unrecognised updater name.
    """
    raw_grads = OrderedDict(zip(params, T.grad(training_cost, params)))

    # Gradient clipping
    cutoff = numpy.float32(self.cutoff)
    global_norm = T.sqrt(sum(T.sum(g ** 2) for g in raw_grads.values()))
    scale = T.switch(T.ge(global_norm, cutoff),
                     cutoff / global_norm,
                     numpy.float32(1.))
    is_bad = T.or_(T.isnan(global_norm), T.isinf(global_norm))
    grads = OrderedDict(
        (p, T.switch(is_bad, numpy.float32(.1) * p, g * scale))
        for p, g in raw_grads.items()
    )

    keep_pretrained_fixed = (
        self.initialize_from_pretrained_word_embeddings
        and self.fix_pretrained_word_embeddings)
    if keep_pretrained_fixed:
        assert not self.fix_encoder_parameters
        # Keep pretrained word embeddings fixed
        logger.debug("Will use mask to fix pretrained word embeddings")
        grads[self.W_emb] = grads[self.W_emb] * self.W_emb_pretrained_mask
    elif self.fix_encoder_parameters:
        # If 'fix_encoder_parameters' is on, the word embeddings will be
        # excluded from parameter training set
        logger.debug("Will fix word embeddings to initial embeddings or embeddings from resumed model")
    else:
        logger.debug("Will train all word embeddings")

    updater = self.updater
    if updater == 'adagrad':
        updates = Adagrad(grads, self.lr)
    elif updater == 'sgd':
        raise Exception("Sgd not implemented!")
    elif updater == 'adadelta':
        updates = Adadelta(grads)
    elif updater == 'rmsprop':
        updates = RMSProp(grads, self.lr)
    elif updater == 'adam':
        updates = Adam(grads, self.lr)
    else:
        raise Exception("Updater not understood!")

    return updates
# Batch training function.
def IoU(n_classes, void_labels):
    """Return a metric function reporting per-class intersection and union.

    Args:
        n_classes: number of classes; class indices ``0 .. n_classes - 1``
            are evaluated.
        void_labels: sequence of label values to exclude from the union and
            from the accuracy denominator. May be empty, in which case every
            pixel counts.

    Returns:
        ``IoU_flatt(y_true, y_pred)``, which returns a dict with keys
        ``'I<i>'`` and ``'U<i>'`` for every class ``i`` plus ``'acc'``.

    The backend branch is selected by ``dim_ordering``, a name resolved from
    the enclosing module rather than passed in.
    """
    def IoU_flatt(y_true, y_pred):
        """Compute the per-class intersections/unions and the accuracy.

        ``y_pred`` holds per-class scores (reduced with argmax over the
        last axis after flattening); ``y_true`` is flattened, cast to int32
        and compared against integer class indices.
        """
        if dim_ordering == 'th':
            # Move the channel axis last before flattening.
            y_pred = K.permute_dimensions(y_pred, (0, 2, 3, 1))
        shp_y_pred = K.shape(y_pred)
        y_pred = K.reshape(y_pred, (shp_y_pred[0]*shp_y_pred[1]*shp_y_pred[2],
                                    shp_y_pred[3]))  # go back to b01,c
        y_true = K.cast(K.flatten(y_true), 'int32')  # b,01 -> b01
        y_pred = K.argmax(y_pred, axis=-1)

        # We use not_void in case the prediction falls in the void class of
        # the groundtruth
        not_void = None
        for void_label in void_labels:
            is_valid = K.not_equal(y_true, void_label)
            not_void = is_valid if not_void is None else not_void * is_valid
        if not_void is None:
            # No void labels: build an all-true mask so the code below does
            # not hit an unbound name.
            not_void = K.equal(y_true, y_true)

        sum_I = K.zeros((1,), dtype='float32')

        out = {}
        for i in range(n_classes):
            y_true_i = K.equal(y_true, i)
            y_pred_i = K.equal(y_pred, i)

            if dim_ordering == 'th':
                I_i = K.sum(y_true_i * y_pred_i)
                U_i = K.sum(T.or_(y_true_i, y_pred_i) * not_void)
                sum_I = sum_I + I_i
            else:
                # The union is taken on the boolean masks before the cast.
                U_i = K.sum(K.cast(tf.logical_and(tf.logical_or(y_true_i, y_pred_i), not_void), 'float32'))
                y_true_i = K.cast(y_true_i, 'float32')
                y_pred_i = K.cast(y_pred_i, 'float32')
                I_i = K.sum(y_true_i * y_pred_i)
                sum_I = sum_I + I_i
            out['I'+str(i)] = I_i
            out['U'+str(i)] = U_i

        # Accuracy: summed intersections over the number of non-void pixels.
        if dim_ordering == 'th':
            accuracy = K.sum(sum_I) / K.sum(not_void)
        else:
            accuracy = K.sum(sum_I) / tf.reduce_sum(tf.cast(not_void, 'float32'))
        out['acc'] = accuracy
        return out
    return IoU_flatt
def ctc_path_probability(scorematrix, queryseq, blank):
    """
    Compute path probability based on CTC algorithm, only forward pass is used.
    Batch not supported, for batch version, refer to the CTC class above
    Speed much slower than the numba & cython version (51.5min vs ~3.9min on word_correction_CTC experiment)

    :param scorematrix: score matrix, indexed below as scorematrix[label, t]
                        with the time length taken from shape[1].
                        NOTE(review): the original docstring gave the shape as
                        (T, C+1), which does not match that indexing -- confirm
                        the layout expected from callers.
    :param queryseq: label sequence; its shape[0] is used as the length L and
                     queryseq[l] is used as a row index into scorematrix
                     (originally documented as (L, 1) -- TODO confirm)
    :param blank: scalar, blank symbol
    :return: (NLL, alphas), NLL > 0 (smaller is better, = -log(p(l|x)); alphas is the forward variable)
    """

    def update_s(s, alphas, scorematrix, queryseq, blank, t):
        # Inner scan step: fill alphas[s, t] for one position s of the
        # blank-padded label sequence. Even s use the blank score; odd s use
        # the score of label queryseq[l].
        l = (s - 1) // 2
        alphas = ifelse(tensor.eq(s % 2, 0),
                        # Even position: s == 0 has no predecessor s - 1.
                        ifelse(tensor.eq(s, 0),
                               tensor.set_subtensor(alphas[s, t], alphas[s, t - 1] * scorematrix[blank, t]),
                               tensor.set_subtensor(alphas[s, t], (alphas[s, t - 1] + alphas[s - 1, t - 1]) * scorematrix[blank, t]),
                               name='for_blank_symbol'),
                        # Odd position: the s - 2 term is left out for s == 1
                        # and when the label equals the previous label.
                        ifelse(tensor.or_(tensor.eq(s, 1), tensor.eq(queryseq[l], queryseq[l - 1])),
                               tensor.set_subtensor(alphas[s, t], (alphas[s, t - 1] + alphas[s - 1, t - 1]) * scorematrix[
                                   queryseq[l], t]),
                               tensor.set_subtensor(alphas[s, t], (alphas[s, t - 1] + alphas[s - 1, t - 1] + alphas[s - 2, t - 1]) *
                                                    scorematrix[queryseq[l], t]),
                               name='for_same_label_twice'))
        return alphas

    def update_t(t, LLForward, alphas, scorematrix, queryseq, blank, T, L2):
        # Outer scan step: update column t of alphas for the positions in
        # [start, end), then rescale that slice and accumulate log(c).
        start = tensor.max([0, L2 - 2 * (T - t)])
        end = tensor.min([2 * t + 2, L2])
        s = tensor.arange(start, end)
        results, _ = theano.scan(fn=update_s, sequences=[s], non_sequences=[scorematrix, queryseq, blank, t],
                                 outputs_info=[alphas], name='scan_along_s')
        alphas = results[-1]
        c = tensor.sum(alphas[start:end, t])
        # Lower-bound the scale factor at 1e-15 (presumably to avoid dividing
        # by zero and taking log(0)).
        c = tensor.max([1e-15, c])
        alphas = tensor.set_subtensor(alphas[start:end, t], alphas[start:end, t] / c)
        LLForward += tensor.log(c)
        return LLForward, alphas

    L = queryseq.shape[0]  # Length of label sequence
    L2 = 2 * L + 1  # Length of label sequence padded with blanks
    T = scorematrix.shape[1]  # time length
    alphas = tensor.zeros((L2, T))
    # Initialize alphas and forward pass
    # At t == 0 only positions 0 (blank) and 1 (first label) are set.
    alphas = tensor.set_subtensor(alphas[[0, 1], 0], scorematrix[[blank, queryseq[0]], 0])
    # NOTE(review): unlike update_t, this first scale factor is not
    # lower-bounded before the division and the log.
    c = tensor.sum(alphas[:, 0])
    alphas = tensor.set_subtensor(alphas[:, 0], alphas[:, 0] / c)
    LLForward = tensor.log(c)
    t = tensor.arange(1, T)
    results, _ = theano.scan(fn=update_t, sequences=[t], non_sequences=[scorematrix, queryseq, blank, T, L2],
                             outputs_info=[LLForward, alphas], name='scan_along_t')
    # With a single time step, fall back to the initial values instead of
    # the scan results.
    NLL, alphas = ifelse(tensor.gt(T, 1), (-results[0][-1], results[1][-1]), (-LLForward, alphas))
    return NLL, alphas