We extracted the following 19 code examples from open-source Python projects to illustrate how theano.tensor.isinf() is used.
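Before the project examples, here is a minimal standalone sketch of what theano.tensor.isinf() computes (the variable names below are illustrative, not taken from any of the projects): it is an elementwise test that returns 1 where the input is positive or negative infinity and 0 elsewhere, which is why the snippets that follow typically combine it with T.isnan() to detect non-finite gradient norms.

import numpy as np
import theano
import theano.tensor as T

x = T.vector('x')
is_inf = T.isinf(x)                          # elementwise: 1 for +/- infinity, 0 otherwise
not_finite = T.or_(T.isnan(x), T.isinf(x))   # the "not finite" pattern used in the examples below

f = theano.function([x], [is_inf, not_finite])
values = np.array([0.0, np.inf, -np.inf, np.nan], dtype=theano.config.floatX)
print(f(values))  # e.g. [0 1 1 0] and [0 1 1 1]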
def clip_grad_remove_nan(grads, clip_c_shared, mt_tparams):
    g2 = 0.
    for g in grads:
        g2 += (g*g).sum()
    not_finite = tensor.or_(tensor.isnan(g2), tensor.isinf(g2))
    if clip_c_shared.get_value() > 0.:
        new_grads = []
        for g, p in zip(grads, itemlist(mt_tparams)):
            tmpg = tensor.switch(g2 > (clip_c_shared*clip_c_shared),
                                 g / tensor.sqrt(g2) * clip_c_shared,
                                 g)
            new_grads.append(tensor.switch(not_finite, np.float32(.1)*p, tmpg))
        return new_grads, tensor.sqrt(g2)
    else:
        return grads, tensor.sqrt(g2)
def gradient_descent(self, loss):
    """Momentum GD with gradient clipping."""
    grad = T.grad(loss, self.params)
    self.momentum_velocity_ = [0.] * len(grad)
    grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), grad)))
    updates = OrderedDict()
    not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
    scaling_den = T.maximum(5.0, grad_norm)
    for n, (param, grad) in enumerate(zip(self.params, grad)):
        grad = T.switch(not_finite, 0.1 * param, grad * (5.0 / scaling_den))
        velocity = self.momentum_velocity_[n]
        update_step = self.momentum * velocity - self.learning_rate * grad
        self.momentum_velocity_[n] = update_step
        updates[param] = param + update_step
    return updates
def compute_updates(training_cost, params, config):
    updates = []
    grads = T.grad(training_cost, params)
    grads = OrderedDict(zip(params, grads))

    # Clip gradients
    c = np.float32(1.)
    clip_grads = []
    norm_gs = T.sqrt(sum(T.sum(g ** 2) for p, g in grads.items()))
    normalization = T.switch(T.ge(norm_gs, c), c / norm_gs, np.float32(1.))
    notfinite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))
    for p, g in grads.items():
        clip_grads.append((p, T.switch(notfinite, np.float32(.1) * p, g * normalization)))
    grads = OrderedDict(clip_grads)

    updates = Adam(grads, config.learning_rate)
    return updates
def remove_nans(x):
    return T.switch(T.isnan(x) + T.isinf(x), 0, x)
def get_updates(self, loss, lr, max_norm=1, beta1=0.9, beta2=0.999,
                epsilon=1e-8, grads=None):
    # Gradients
    if grads is None:
        grads = tensor.grad(loss, self.trainables)

    # Clipping
    norm = tensor.sqrt(sum([tensor.sqr(g).sum() for g in grads]))
    m = theanotools.clipping_multiplier(norm, max_norm)
    grads = [m*g for g in grads]

    # Safeguard against numerical instability
    new_cond = tensor.or_(tensor.or_(tensor.isnan(norm), tensor.isinf(norm)),
                          tensor.or_(norm < 0, norm > 1e10))
    grads = [tensor.switch(new_cond, np.float32(0), g) for g in grads]

    # Safeguard against numerical instability
    #cond = tensor.or_(norm < 0, tensor.or_(tensor.isnan(norm), tensor.isinf(norm)))
    #grads = [tensor.switch(cond, np.float32(0), g) for g in grads]

    # New values
    t = self.time + 1
    lr_t = lr*tensor.sqrt(1. - beta2**t)/(1. - beta1**t)
    means_t = [beta1*m + (1. - beta1)*g for g, m in zip(grads, self.means)]
    vars_t = [beta2*v + (1. - beta2)*tensor.sqr(g) for g, v in zip(grads, self.vars)]
    steps = [lr_t*m_t/(tensor.sqrt(v_t) + epsilon) for m_t, v_t in zip(means_t, vars_t)]

    # Updates
    updates = [(x, x - step) for x, step in zip(self.trainables, steps)]
    updates += [(m, m_t) for m, m_t in zip(self.means, means_t)]
    updates += [(v, v_t) for v, v_t in zip(self.vars, vars_t)]
    updates += [(self.time, t)]

    return norm, grads, updates
def adam(self, cost, params, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
    all_grads = T.grad(cost=cost, wrt=params)
    all_grads = total_norm_constraint(all_grads, 10)
    grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), all_grads)))
    not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))

    t_prev = theano.shared(utils.floatX(0.))
    updates = OrderedDict()

    t = t_prev + 1
    a_t = learning_rate*T.sqrt(1-beta2**t)/(1-beta1**t)

    for param, g_t in zip(params, all_grads):
        g_t = T.switch(not_finite, 0.1 * param, g_t)
        value = param.get_value(borrow=True)
        m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=param.broadcastable)
        v_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=param.broadcastable)
        m_t = beta1*m_prev + (1-beta1)*g_t
        v_t = beta2*v_prev + (1-beta2)*g_t**2
        step = a_t*m_t/(T.sqrt(v_t) + epsilon)

        updates[m_prev] = m_t
        updates[v_prev] = v_t
        updates[param] = param - step

    updates[t_prev] = t
    return updates
def replace_inf_nan(x, v):
    return tensor.switch(tensor.or_(tensor.isnan(x), tensor.isinf(x)), v, x)

# apply r = x + delta if r is not inf / nan, else return x
def update_inf_nan(x, delta, v):
    r = x + delta
    return tensor.switch(tensor.or_(tensor.isnan(r), tensor.isinf(r)), x, r)

# will check if shuffle is needed
def gradient_clipping(grads, tparams, clip_c=10):
    g2 = 0.
    for g in grads:
        g2 += (g**2).sum()
    g2 = tensor.sqrt(g2)
    not_finite = tensor.or_(tensor.isnan(g2), tensor.isinf(g2))
    new_grads = []
    for p, g in zip(tparams.values(), grads):
        new_grads.append(tensor.switch(g2 > clip_c, g * (clip_c / g2), g))
    return new_grads, not_finite, tensor.lt(clip_c, g2)
def get_grad_param(self):
    self.grad_norm = TT.sqrt(sum(TT.sqr(g).sum() for g in self.model.grad)) / TT.cast(
        self.model.interface_layer.input.shape[1], 'float32')
    # self.has_numeric_error = TT.or_(TT.isnan(self.grad_norm), TT.isinf(self.grad_norm))
    # self.grad = [TT.switch(self.has_numeric_error, numpy_floatX(0.1) * p, g)
    #              for g, p in zip(self.model.grad, self.model.param)]
    self.grad = [g / TT.cast(self.model.interface_layer.input.shape[1], 'float32')
                 for g in self.model.grad]
    if self.clip_threshold is not None:
        self.grad = [TT.switch(TT.ge(self.grad_norm, self.clip_threshold),
                               g * self.clip_threshold / self.grad_norm, g)
                     for g in self.grad]
def pseudograd(loss, params, srng=None, temperature=1.0e-1,
               learning_rate=1.0e-2, rho2=0.95):
    one = T.constant(1.0)
    zero = T.constant(0.0)

    deltas = [make_normal(param, srng=srng) for param in params]
    momentum = [make_copy(param) for param in params]

    new_params = [
        param + learning_rate * delta
        for param, delta, m in zip(params, deltas, momentum)
    ]

    new_loss = theano.clone(loss, replace=dict(zip(params, new_params)))

    accepting_p = T.exp((loss - new_loss) / temperature)
    u = srng.uniform(size=(), dtype=loss.dtype)

    cond = T.or_(T.or_(u > accepting_p, T.isnan(new_loss)), T.isinf(new_loss))
    step = T.switch(cond, zero, one)

    updates = OrderedDict()

    for m, delta in zip(momentum, deltas):
        updates[m] = m * rho2 + (one - rho2) * delta * step

    for param, m in zip(params, momentum):
        updates[param] = param + learning_rate * m

    return updates
def rmsprop(cost, params, learning_rate, momentum=0.5, rescale=5.):
    grads = T.grad(cost=cost, wrt=params)
    running_square_ = [theano.shared(np.zeros_like(p.get_value(), dtype=p.dtype),
                                     broadcastable=p.broadcastable) for p in params]
    running_avg_ = [theano.shared(np.zeros_like(p.get_value(), dtype=p.dtype),
                                  broadcastable=p.broadcastable) for p in params]
    memory_ = [theano.shared(np.zeros_like(p.get_value(), dtype=p.dtype),
                             broadcastable=p.broadcastable) for p in params]

    grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), grads)))
    not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
    grad_norm = T.sqrt(grad_norm)
    scaling_num = rescale
    scaling_den = T.maximum(rescale, grad_norm)

    # Magic constants
    combination_coeff = 0.9
    minimum_grad = 1E-4
    updates = []
    for n, (param, grad) in enumerate(zip(params, grads)):
        grad = T.switch(not_finite, 0.1 * param, grad * (scaling_num / scaling_den))
        old_square = running_square_[n]
        new_square = combination_coeff * old_square + (1. - combination_coeff) * T.sqr(grad)
        old_avg = running_avg_[n]
        new_avg = combination_coeff * old_avg + (1. - combination_coeff) * grad
        rms_grad = T.sqrt(new_square - new_avg ** 2)
        rms_grad = T.maximum(rms_grad, minimum_grad)
        memory = memory_[n]
        update = momentum * memory - learning_rate * grad / rms_grad
        update2 = momentum * momentum * memory - (1 + momentum) * learning_rate * grad / rms_grad
        updates.append((old_square, new_square))
        updates.append((old_avg, new_avg))
        updates.append((memory, update))
        updates.append((param, param + update2))
    return updates
def compute_updates(self, training_cost, params):
    updates = []
    grads = T.grad(training_cost, params)
    grads = OrderedDict(zip(params, grads))

    # Gradient clipping
    c = numpy.float32(self.cutoff)
    clip_grads = []
    norm_gs = T.sqrt(sum(T.sum(g ** 2) for p, g in grads.items()))
    normalization = T.switch(T.ge(norm_gs, c), c / norm_gs, np.float32(1.))
    notfinite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))
    for p, g in grads.items():
        clip_grads.append((p, T.switch(notfinite, numpy.float32(.1) * p, g * normalization)))
    grads = OrderedDict(clip_grads)

    if self.W_emb in grads:
        if self.initialize_from_pretrained_word_embeddings and self.fix_pretrained_word_embeddings:
            assert not self.fix_encoder_parameters
            # Keep pretrained word embeddings fixed
            logger.debug("Will use mask to fix pretrained word embeddings")
            grads[self.W_emb] = grads[self.W_emb] * self.W_emb_pretrained_mask
        elif self.fix_encoder_parameters:
            # If 'fix_encoder_parameters' is on, the word embeddings will be excluded from the parameter training set
            logger.debug("Will fix word embeddings to initial embeddings or embeddings from resumed model")
        else:
            logger.debug("Will train all word embeddings")

    optimizer_variables = []
    if self.updater == 'adagrad':
        updates = Adagrad(grads, self.lr)
    elif self.updater == 'sgd':
        raise Exception("Sgd not implemented!")
    elif self.updater == 'adadelta':
        updates = Adadelta(grads)
    elif self.updater == 'rmsprop':
        updates = RMSProp(grads, self.lr)
    elif self.updater == 'adam':
        updates, optimizer_variables = Adam(grads, self.lr)
    else:
        raise Exception("Updater not understood!")

    return updates, optimizer_variables

# Batch training function.
def compute_updates(self, training_cost, params):
    updates = []
    grads = T.grad(training_cost, params)
    grads = OrderedDict(zip(params, grads))

    # Gradient clipping
    c = numpy.float32(self.cutoff)
    clip_grads = []
    norm_gs = T.sqrt(sum(T.sum(g ** 2) for p, g in grads.items()))
    normalization = T.switch(T.ge(norm_gs, c), c / norm_gs, np.float32(1.))
    notfinite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))
    for p, g in grads.items():
        clip_grads.append((p, T.switch(notfinite, numpy.float32(.1) * p, g * normalization)))
    grads = OrderedDict(clip_grads)

    if self.initialize_from_pretrained_word_embeddings and self.fix_pretrained_word_embeddings:
        assert not self.fix_encoder_parameters
        # Keep pretrained word embeddings fixed
        logger.debug("Will use mask to fix pretrained word embeddings")
        grads[self.W_emb] = grads[self.W_emb] * self.W_emb_pretrained_mask
    elif self.fix_encoder_parameters:
        # If 'fix_encoder_parameters' is on, the word embeddings will be excluded from the parameter training set
        logger.debug("Will fix word embeddings to initial embeddings or embeddings from resumed model")
    else:
        logger.debug("Will train all word embeddings")

    if self.updater == 'adagrad':
        updates = Adagrad(grads, self.lr)
    elif self.updater == 'sgd':
        raise Exception("Sgd not implemented!")
    elif self.updater == 'adadelta':
        updates = Adadelta(grads)
    elif self.updater == 'rmsprop':
        updates = RMSProp(grads, self.lr)
    elif self.updater == 'adam':
        updates = Adam(grads, self.lr)
    else:
        raise Exception("Updater not understood!")

    return updates

# Batch training function.