The following 50 code examples, extracted from open-source Python projects, illustrate how to use theano.tensor.grad().
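Before the project code, here is a minimal, self-contained sketch (not taken from any of the projects below; the variable names are illustrative) of the pattern nearly all of these examples share: build a symbolic scalar cost, ask theano.tensor.grad() for its gradient with respect to shared parameters, and turn the result into update rules for a compiled function. Most examples below wrap this pattern in an optimizer (SGD with momentum, Adagrad, Adadelta, RMSProp, Adam/Adamax); a few use the more specialized forms T.grad(None, ..., known_grads=...) and T.Lop(...).

# Illustrative sketch only, not one of the extracted project examples.
import numpy as np
import theano
import theano.tensor as T

x = T.vector('x')
w = theano.shared(np.zeros(3, dtype=theano.config.floatX), name='w')

cost = (T.dot(x, w) - 1.0) ** 2            # a toy scalar cost
grad_w = T.grad(cost, wrt=w)               # symbolic gradient d(cost)/d(w)

lr = np.asarray(0.1, dtype=theano.config.floatX)   # keep the update in floatX
train_step = theano.function([x], cost, updates=[(w, w - lr * grad_w)])
# Each call to train_step(some_vector) performs one gradient-descent update of w.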
def adamax_updates(params, cost, lr=0.001, mom1=0.9, mom2=0.999):
    updates = []
    grads = T.grad(cost, params)
    for p, g in zip(params, grads):
        mg = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
        v = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
        if mom1 > 0:
            v_t = mom1*v + (1. - mom1)*g
            updates.append((v, v_t))
        else:
            v_t = g
        mg_t = T.maximum(mom2*mg, abs(g))
        g_t = v_t / (mg_t + 1e-6)
        p_t = p - lr * g_t
        updates.append((mg, mg_t))
        updates.append((p, p_t))
    return updates
def adam_updates(params, cost, lr=0.001, mom1=0.9, mom2=0.999):
    updates = []
    grads = T.grad(cost, params)
    t = th.shared(np.cast[th.config.floatX](1.))
    for p, g in zip(params, grads):
        v = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
        mg = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
        v_t = mom1*v + (1. - mom1)*g
        mg_t = mom2*mg + (1. - mom2)*T.square(g)
        v_hat = v_t / (1. - mom1 ** t)
        mg_hat = mg_t / (1. - mom2 ** t)
        g_t = v_hat / T.sqrt(mg_hat + 1e-8)
        p_t = p - lr * g_t
        updates.append((v, v_t))
        updates.append((mg, mg_t))
        updates.append((p, p_t))
    updates.append((t, t+1))
    return updates
def train_one(self, x, target):
    x, target = tt.unbroadcast(x, 0), tt.unbroadcast(target, 0)  # F'ing scan
    states = {}
    for layer in self.layers:
        x, layer_state = layer.forward_pass_and_state(x, count_ops=True)
        states[layer] = layer_state
    loss = self.loss(x, target)
    param_grad_pairs = []
    grad = None
    for layer in self.layers[::-1]:
        grad, param_grads = layer.backward_pass(state=states[layer], grad=grad, cost=loss, count_ops=True)
        loss = None
        param_grad_pairs += list(izip_equal(layer.parameters, param_grads))
    all_params, all_param_grads = zip(*param_grad_pairs)
    self.optimizer.update_from_gradients(parameters=all_params, gradients=all_param_grads)
    return create_constant(0.)  # scan demands some return
def sgd_optimizer(model, lr=0.001, momentum=0.9):
    lr = theano.shared(np.array(lr).astype(theano.config.floatX))
    # Make sure momentum is a sane value
    assert momentum < 1 and momentum >= 0
    # the updates of SGD with momentum
    updates = []
    grads = T.grad(model.costs[0], model.params)
    for param, grad in zip(model.params, grads):
        param_update = theano.shared(param.get_value()*0.)
        updates.append((param, param - lr * param_update))
        updates.append((param_update, momentum*param_update + (1. - momentum)*grad))
    train_func = theano.function(model.inputs, model.costs, updates=updates)
    valid_func = theano.function(model.inputs, model.costs)
    return train_func, valid_func
def get_sgd_updates(self, learning_rate, lr_scaler=1.0, batch_size=1,
                    sparsity_level=-1, sparse_reg=-1, x_in=None):
    h = self.encode(x_in)
    x_rec = self.decode(h)
    cost = self.get_rec_cost(x_rec)
    if self.L1_reg != -1 and self.L1_reg is not None:
        cost += self.L1_reg * self.L1
    if self.L2_reg != -1 and self.L2_reg is not None:
        cost += self.L2_reg * self.L2
    if sparsity_level != -1 and sparse_reg != -1:
        sparsity_penal = self.sparsity_penality(h, sparsity_level,
                                                sparse_reg, batch_size)
        cost += sparsity_penal
    self.gparams = T.grad(cost, self.params)
    updates = OrderedDict({})
    for param, gparam in zip(self.params, self.gparams):
        updates[param] = self.momentum * param - lr_scaler * \
            learning_rate * gparam
    return (cost, updates, h, x_rec)
def momentum(loss, params, caches, learning_rate=0.1, rho=0.1,
             clip_at=0.0, scale_norm=0.0, lambda2=0.0):
    updates = OrderedDict()
    grads = T.grad(cost=loss, wrt=params)
    for p, c, g in zip(params, caches, grads):
        if clip_at > 0.0:
            grad = clip(g, clip_at)
        else:
            grad = g
        if scale_norm > 0.0:
            grad = scale(grad, scale_norm)
        delta = rho * grad + (1-rho) * c
        updates[p] = p - learning_rate * (delta + lambda2 * p)
    return updates, grads
def _generate_train_model_function(self, scores):
    u = T.lvector('u')
    i = T.lvector('i')
    j = T.lvector('j')

    self.W = theano.shared(numpy.zeros((self._dim)).astype('float32'), name='W')
    self.S = theano.shared(scores, name='S')

    x_ui = T.dot(self.W, self.S[u, i, :].T)
    x_uj = T.dot(self.W, self.S[u, j, :].T)
    x_uij = x_ui - x_uj

    obj = T.sum(
        T.log(T.nnet.sigmoid(x_uij)).sum() -
        self._lambda_w * 0.5 * (self.W ** 2).sum()
    )
    cost = -obj

    g_cost_W = T.grad(cost=cost, wrt=self.W)
    updates = [(self.W, self.W - self._learning_rate * g_cost_W)]

    self.train_model = theano.function(inputs=[u, i, j], outputs=cost, updates=updates)
def update_opt(self, f, target, inputs, reg_coeff):
    self.target = target
    self.reg_coeff = reg_coeff
    params = target.get_params(trainable=True)

    constraint_grads = theano.grad(f, wrt=params, disconnected_inputs='warn')
    xs = tuple([ext.new_tensor_like("%s x" % p.name, p) for p in params])

    def Hx_plain():
        Hx_plain_splits = TT.grad(
            TT.sum([TT.sum(g * x) for g, x in zip(constraint_grads, xs)]),
            wrt=params,
            disconnected_inputs='warn'
        )
        return TT.concatenate([TT.flatten(s) for s in Hx_plain_splits])

    self.opt_fun = ext.lazydict(
        f_Hx_plain=lambda: ext.compile_function(
            inputs=inputs + xs,
            outputs=Hx_plain(),
            log_name="f_Hx_plain",
        ),
    )
def __call__(self, params, cost):
    updates = []
    grads = T.grad(cost, params)
    grads = clip_norms(grads, self.clipnorm)
    t = theano.shared(floatX(1.))
    b1_t = self.b1*self.l**(t-1)

    for p, g in zip(params, grads):
        g = self.regularizer.gradient_regularize(p, g)
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)

        m_t = b1_t*m + (1 - b1_t)*g
        v_t = self.b2*v + (1 - self.b2)*g**2
        m_c = m_t / (1-self.b1**t)
        v_c = v_t / (1-self.b2**t)
        p_t = p - (self.lr * m_c) / (T.sqrt(v_c) + self.e)
        p_t = self.regularizer.weight_regularize(p_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((t, t + 1.))
    return updates
def __call__(self, params, cost):
    updates = []
    grads = T.grad(cost, params)
    grads = clip_norms(grads, self.clipnorm)
    for p, g in zip(params, grads):
        g = self.regularizer.gradient_regularize(p, g)
        acc = theano.shared(p.get_value() * 0.)
        acc_delta = theano.shared(p.get_value() * 0.)
        acc_new = self.rho * acc + (1 - self.rho) * g ** 2
        updates.append((acc, acc_new))

        update = g * T.sqrt(acc_delta + self.epsilon) / T.sqrt(acc_new + self.epsilon)
        updated_p = p - self.lr * update
        updated_p = self.regularizer.weight_regularize(updated_p)
        updates.append((p, updated_p))

        acc_delta_new = self.rho * acc_delta + (1 - self.rho) * update ** 2
        updates.append((acc_delta, acc_delta_new))
    return updates
def svgd_gradient(X0):
    hidden, _, mse = discrim(X0)
    grad = -1.0 * T.grad(mse.sum(), X0)

    kxy, neighbors, h = rbf_kernel(hidden)  # TODO

    coff = T.exp(-T.sum((hidden[neighbors] - hidden)**2, axis=1) / h**2 / 2.0)
    v = coff.dimshuffle(0, 'x') * (-hidden[neighbors] + hidden) / h**2

    X1 = X0[neighbors]
    hidden1, _, _ = discrim(X1)
    dxkxy = T.Lop(hidden1, X1, v)

    #svgd_grad = (T.dot(kxy, T.flatten(grad, 2)).reshape(dxkxy.shape) + dxkxy) / T.sum(kxy, axis=1).dimshuffle(0, 'x', 'x', 'x')
    svgd_grad = grad + dxkxy / 2.
    return grad, svgd_grad, dxkxy
def __add__(self, other):
    assert hasattr(self, 'out'), 'all layers need a default output'
    new_obj = utils.copy(self)
    other_var = new_obj.tensor_from_layer(other)
    new_obj.out = new_obj.out + other_var
    # Summing cost layers:
    if hasattr(new_obj, 'grads') and hasattr(other, 'grads'):
        for param, grad_param in zip(other.params, other.grads):
            pos = new_obj.params.index(param)
            new_obj.grads[pos] += grad_param
    elif hasattr(new_obj, 'grads') and \
            isinstance(other, theano.gof.Variable) and \
            other.ndim == 0:
        other_grads = TT.grad(other, new_obj.params, disconnected_inputs='ignore')
        new_obj.grads = [x + y for x, y in zip(new_obj.grads, other_grads)]
    elif hasattr(new_obj, 'grads'):
        raise ValueError('I do not know how to compute the gradients'
                         ' of the added term' + str(other) + '. Call'
                         ' train on it if it is an output layer')
    return new_obj
def __sub__(self, other):
    assert hasattr(self, 'out'), 'all layers need a default output'
    new_obj = utils.copy(self)
    other_var = new_obj.tensor_from_layer(other)
    new_obj.out = new_obj.out - other_var
    if hasattr(new_obj, 'grads') and hasattr(other, 'grads'):
        for param, grad_param in zip(other.params, other.grads):
            pos = new_obj.params.index(param)
            new_obj.grads[pos] -= grad_param
    elif hasattr(new_obj, 'grads') and \
            isinstance(other, theano.gof.Variable) and \
            other.ndim == 0:
        other_grads = TT.grad(other, new_obj.params, disconnected_inputs='ignore')
        new_obj.grads = [x - y for x, y in zip(new_obj.grads, other_grads)]
    elif hasattr(new_obj, 'grads'):
        raise ValueError('I do not know how to compute the gradients'
                         ' of the subtracted term' + str(other) + '. Call'
                         ' train on it if it is an output layer')
    return new_obj
def __init__(self):
    super(UpPooling, self).__init__()
    #X = self.get_input(train)
    #if self.dim_ordering == 'th':
    #    output = K.repeat_elements(X, self.size[0], axis=2)
    #    output = K.repeat_elements(output, self.size[1], axis=3)
    #elif self.dim_ordering == 'tf':
    #    output = K.repeat_elements(X, self.size[0], axis=1)
    #    output = K.repeat_elements(output, self.size[1], axis=2)
    #else:
    #    raise Exception('Invalid dim_ordering: ' + self.dim_ordering)
    #
    #f = T.grad(T.sum(self._pool2d_layer.get_output(train)), wrt=self._pool2d_layer.get_input(train)) * output
    #return f
def __call__(self, cost, params):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    exp = theano.shared(np.float32(1.0), name='exp', borrow=True)
    updates.append((exp, exp+1))
    for p, g in zip(params, grads):
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)
        m_new = self.beta1 * m + (1 - self.beta1) * g
        v_new = self.beta2 * v + (1 - self.beta2) * g**2
        mt = m_new / (1 - self.beta1**exp)
        vt = v_new / (1 - self.beta2**exp)
        updates.append((m, m_new))
        updates.append((v, v_new))
        updates.append((p, p - self.lr * mt / (T.sqrt(vt) + self.epsilon)))
    return updates
def get_gradients(self, loss, params):
    """
    Consider the situation where the gradient is weighted.
    """
    if isinstance(loss, list):
        grads = T.grad(loss[0], params, consider_constant=loss[1:])  # gradient of loss
    else:
        grads = T.grad(loss, params)
    if hasattr(self, 'clipnorm') and self.clipnorm > 0:
        print('use gradient clipping!!')
        print('clipnorm = %f' % self.clipnorm)
        norm = T.sqrt(sum([T.sum(g ** 2) for g in grads]))
        grads = [clip_norm(g, self.clipnorm, norm) for g in grads]
    else:
        print('not use gradient clipping!!')
    return grads
def get_cost_updates(self, corruption_level, learning_rate):
    #if corruption_level == 0:
    #    tilde_x = self.x
    #else:
    #    tilde_x = self.get_corrupted_input(self.x, corruption_level)
    tilde_x = self.x
    y = self.get_hidden_values(tilde_x)
    z = self.get_reconstructed_input(y)
    L = T.sum((self.x - z) * (self.x - z), axis=1)
    cost = T.mean(L) / 2
    gparams = T.grad(cost, self.params)
    updates = {}
    for param, gparam in zip(self.params, gparams):
        updates[param] = param - learning_rate * gparam
    return (cost, updates)
def Adam(cost, params, learning_rate=0.0002, b1=0.1, b2=0.001, e=1e-8):
    updates = OrderedDict()
    grads = T.grad(cost, params)
    i = theano.shared(np.asarray(0., dtype=theano.config.floatX))
    i_t = i + 1.
    fix1 = 1. - (1. - b1)**i_t
    fix2 = 1. - (1. - b2)**i_t
    lr_t = learning_rate * (T.sqrt(fix2) / fix1)
    for p, g in zip(params, grads):
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates[m] = m_t
        updates[v] = v_t
        updates[p] = p_t
    updates[i] = i_t
    return updates
def RmsProp(cost, params, learning_rate=1.0, rho=0.9, epsilon=1e-6):
    updates = OrderedDict()
    grads = T.grad(cost, params)
    # Using theano constant to prevent upcasting of float32
    one = T.constant(1)
    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)
        accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                             broadcastable=param.broadcastable)
        accu_new = rho * accu + (one - rho) * grad ** 2
        updates[accu] = accu_new
        updates[param] = param - (learning_rate * grad / T.sqrt(accu_new + epsilon))
    return updates
def EGD(cost, params, learning_rate=0.33, constraint=1.0):
    updates = OrderedDict()
    grads = T.grad(cost, params)
    U = T.constant(constraint)
    # first half of params
    rw_pos = T.exp(-learning_rate * U * grads[0])
    rb_pos = T.exp(-learning_rate * U * grads[1])
    # second half
    rw_neg = 1/rw_pos
    rb_neg = 1/rb_pos
    rs = [rw_pos, rb_pos, rw_neg, rb_neg]
    partition = T.sum(params[0]*rs[0]) + T.sum(params[1]*rs[1]) + \
        T.sum(params[2]*rs[2]) + T.sum(params[3]*rs[3])
    for param, r in zip(params, rs):
        updates[param] = U*param*r/partition
    return updates
def compile_maxpool(output_shape, pool_size):
    X = T.tensor4()
    # compute output with both methods
    out1 = T.signal.pool.pool_2d(X, pool_size, ignore_border=True,
                                 st=None, padding=(0, 0), mode='max')
    out2 = my_pool_2d(X, pool_size, ignore_border=True,
                      st=None, padding=(0, 0), mode='max')
    # compute gradient with random incoming gradient for both cases
    incoming_grad = T.as_tensor_variable(
        np.random.random(size=output_shape).astype(np.float32))
    grad1 = T.grad(None, wrt=X, known_grads={out1: incoming_grad})
    grad2 = T.grad(None, wrt=X, known_grads={out2: incoming_grad})
    return theano.function([X], [out1, out2, grad1, grad2])
def test_maxpool_edge_case(self):
    """
    Test MaxPooling on an edge case: all inputs in a patch have the same value.
    Check that one and only one gradient is back-propagated in each patch.
    """
    X = np.zeros(shape=self.input_shape, dtype=np.float32)
    out1, out2, _, grad = self.maxpool(X)
    assert np.all(np.isclose(out1, out2))
    for i in range(self.output_shape[0]):
        for j in range(self.output_shape[1]):
            for k in range(self.output_shape[2]):
                for l in range(self.output_shape[3]):
                    count = 0
                    for m in range(self.pool_size[0]):
                        for n in range(self.pool_size[1]):
                            kk = self.pool_size[0] * k + m
                            ll = self.pool_size[1] * l + n
                            if grad[i, j, kk, ll] != 0.:
                                count += 1
                    assert count == 1
def adam(cost, params, lr=0.001, b1=0.9, b2=0.999, e=1e-8):
    updates = []
    grads = T.grad(cost, params)
    i = theano.shared(np.dtype(theano.config.floatX).type(1))
    i_t = i + 1.
    fix1 = 1. - (1. - b1)**i_t
    fix2 = 1. - (1. - b2)**i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)
    for p, g in zip(params, grads):
        # grad_clip is not defined in this snippet; it is assumed to be a
        # module-level constant in the original project.
        g = T.clip(g, -grad_clip, grad_clip)
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))
    return updates
def adagrad(cost, params, learning_rate=0.1, epsilon=1e-6, **kwargs):
    """Adaptive Gradient Descent

    Scale learning rates by dividing with the square root of accumulated
    squared gradients.

    References
    ----------
    .. [1] http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf
    """
    gparams = T.grad(cost, params)
    updates = OrderedDict()

    for param, gparam in zip(params, gparams):
        accu = shared_variable(np.zeros(param.get_value(borrow=True).shape),
                               broadcastable=param.broadcastable)
        accu_new = accu + gparam ** 2
        updates[accu] = accu_new
        updates[param] = param - learning_rate * gparam / T.sqrt(accu_new + epsilon)

    return updates
def adam(cost, params, learning_rate=0.001, beta1=0.9, beta2=0.999,
         epsilon=1e-6, **kwargs):
    """Adam Gradient Descent

    Scale learning rates by Adaptive moment estimation.

    References
    ----------
    .. [1] https://arxiv.org/pdf/1412.6980v8.pdf
    """
    gparams = T.grad(cost, params)
    updates = OrderedDict()
    t = shared_variable(to_float_X(0.))
    t_t = 1. + t
    l_r_t = learning_rate * T.sqrt(1. - beta2 ** t_t) / (1. - beta1 ** t_t)

    for param, gparam in zip(params, gparams):
        m = shared_variable(np.zeros(param.get_value(borrow=True).shape),
                            broadcastable=param.broadcastable)
        v = shared_variable(np.zeros(param.get_value(borrow=True).shape),
                            broadcastable=param.broadcastable)
        m_t = beta1 * m + (1. - beta1) * gparam
        v_t = beta2 * v + (1. - beta2) * T.sqr(gparam)
        updates[m] = m_t
        updates[v] = v_t
        updates[param] = param - l_r_t * m_t / (T.sqrt(v_t) + epsilon)

    updates[t] = t_t
    return updates
def compile_train(self, *args):
    # args is a list of dictionaries
    if self.verbose:
        print('compiling training function...')

    import theano

    for arg_list in args:
        self.compiled_train_fn_list.append(theano.function(**arg_list))

    if self.monitor_grad:
        norms = [grad.norm(L=2) for grad in self.grads]
        import theano.tensor as T
        norms = T.log10(norms)
        self.get_norm = theano.function([self.subb_ind],
                                        [T.sum(norms), T.max(norms)],
                                        givens=[(self.x, self.shared_x_slice),
                                                (self.y, self.shared_y_slice)])
def compile_train(self, *args):
    # args is a list of dictionaries
    if self.verbose:
        print('compiling training function...')

    import theano

    for arg_list in args:
        self.compiled_train_fn_list.append(theano.function(**arg_list))

    if self.monitor_grad:
        norms = [grad.norm(L=2) for grad in self.grads]
        self.get_norm = theano.function([self.subb_ind], norms,
                                        givens=[(self.x, self.shared_x_slice),
                                                (self.y, self.shared_y_slice)])
def compile_iter_fns(self, *args, **kwargs):
    import theano
    import time

    start = time.time()

    # f_pred_prob = theano.function([x, mask], pred, name='f_pred_prob')
    self.f_pred = theano.function([self.x, self.mask],
                                  self.pred.argmax(axis=1), name='f_pred')

    # f_cost = theano.function([x, mask, y], cost, name='f_cost')

    import theano.tensor as tensor
    grads = tensor.grad(self.cost, wrt=list(self.tparams.values()))
    # f_grad = theano.function([x, mask, y], grads, name='f_grad')

    lr = tensor.scalar(name='lr')
    from theanompi.models.lstm import adadelta
    self.f_grad_shared, self.f_update = adadelta(lr, self.tparams, grads,
                                                 self.x, self.mask, self.y,
                                                 self.cost)

    if self.rank == 0:
        print('compile time %.3f' % (time.time() - start))
def fit(self, weights, o_error, tpo):
    gradients = T.grad(o_error, weights)
    updates = []
    for c, v, w, g in zip(self.t_cache, self.t_velocity, weights, gradients):
        new_velocity = T.sub(T.mul(tpo["momentum_rate"], v),
                             T.mul(tpo["learn_rate"], g))
        new_cache = T.add(T.mul(tpo["decay_rate"], c),
                          T.mul(T.sub(1, tpo["decay_rate"]), T.sqr(g)))
        new_weights = T.sub(T.add(w, new_velocity),
                            T.true_div(T.mul(g, tpo["learn_rate"]),
                                       T.sqrt(T.add(new_cache, 0.1**8))))
        updates.append((w, new_weights))
        updates.append((v, new_velocity))
        updates.append((c, new_cache))
    return updates

###### Nesterov momentum ########################################
def fit(self, weights, o_error, tpo):
    updates = []
    gradients = theano.grad(o_error, weights)
    for c, w, g in zip(self.t_cache, weights, gradients):
        new_cache = tpo["decay_rate"] * c + (1 - tpo["decay_rate"]) * T.sqr(g)
        new_weights = w - (g * tpo["learn_rate"]) / T.sqrt(new_cache + 0.1**8)
        updates.append((w, new_weights))
        updates.append((c, new_cache))
    return updates

###### ADADELTA ########################################
def fit(self, weights, o_error, tpo):
    gradients = theano.grad(o_error, weights)
    updates = []
    for v, w, g in zip(self.t_velocity, weights, gradients):
        #gradient = T.grad(o_error, w)
        new_velocity = tpo["momentum_rate"] * v - tpo["learn_rate"] * g
        new_weights = w + new_velocity
        updates.append((w, new_weights))
        updates.append((v, new_velocity))
    return updates

###### Vanilla SGD ########################################
def adadelta(loss, params, learning_rate, rho=.95, epsilon=1e-6):
    grads = T.grad(loss, params)
    updates = OrderedDict()

    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)
        accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                             broadcastable=param.broadcastable)
        delta_accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                   broadcastable=param.broadcastable)

        accu_new = rho * accu + (1 - rho) * grad ** 2
        updates[accu] = accu_new

        update = (grad * T.sqrt(delta_accu + epsilon) / T.sqrt(accu_new + epsilon))
        updates[param] = param - learning_rate * update

        delta_accu_new = rho * delta_accu + (1 - rho) * update ** 2
        updates[delta_accu] = delta_accu_new

    return updates
def adam(loss, params, learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8):
    grads = T.grad(loss, params)
    updates = OrderedDict()
    t_prev = theano.shared(np.cast[theano.config.floatX](0))
    t = t_prev + 1
    a_t = learning_rate * T.sqrt(1 - beta2**t) / (1 - beta1**t)

    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)
        m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=param.broadcastable)
        v_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=param.broadcastable)

        m_t = beta1 * m_prev + (1 - beta1) * grad
        v_t = beta2 * v_prev + (1 - beta2) * grad ** 2
        step = a_t * m_t / (T.sqrt(v_t) + epsilon)

        updates[m_prev] = m_t
        updates[v_prev] = v_t
        updates[param] = param - step

    updates[t_prev] = t
    return updates
def __init__(self, model, eta=1e-2, rho=0.9, epsilon=1e-6, minibatch_size=10):
    """
    Initialize RMSPROP.

    Arguments
    ---------
    model : model instance
        Should provide params, grad(), [and updates].
    eta : float
        Learning rate.
    rho : float
    epsilon : float
        Constant for numerical stability.
    minibatch_size : integer
        Minibatch size used to compute the stochastic gradient.
    """
    self.model = model
    self.__eta = eta
    self.__rho = rho
    self.__eps = epsilon
    self.minibatch_size = minibatch_size
    self.__compile()
def __compile(self):
    self.update_funcs = []
    for params, inputs, cost in self.model.get_opt_infos():
        # Shared variables for acc.
        accs = [theano.shared(np.zeros(p.get_value().shape,
                                       dtype=theano.config.floatX))
                for p in params]
        sgrad = tensor.grad(cost, params)
        new_accs = [self.__rho * acc + (1 - self.__rho) * sg ** 2
                    for (acc, sg) in zip(accs, sgrad)]
        updates = OrderedDict()
        updates.update(zip(accs, new_accs))
        updates.update([(p, p - (self.__eta * sg / tensor.sqrt(acc_new + self.__eps)))
                        for (p, sg, acc_new) in zip(params, sgrad, new_accs)])
        self.update_funcs.append(theano.function(inputs=inputs, updates=updates))
def __init__(self, model, eta=1e-3, beta1=0.9, beta2=0.999, epsilon=1e-8,
             minibatch_size=10):
    """
    Initialize ADAM.

    Arguments
    ---------
    model : model instance
        Should provide params, grad(), [and updates].
    eta : float
        Learning rate.
    beta1, beta2 : float
    epsilon : float
        Constant for numerical stability.
    minibatch_size : integer
        Minibatch size used to compute the stochastic gradient.
    """
    self.model = model
    self.__eta = eta
    self.__beta1 = beta1
    self.__beta2 = beta2
    self.__eps = epsilon
    self.minibatch_size = minibatch_size
    self.__compile()