The following 17 code examples, extracted from open-source Python projects, illustrate how to use theano.tensor.Lop().
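Before the extracted examples, here is a minimal orientation sketch (a toy example of ours, not taken from any of the projects below): T.Lop(y, x, v) builds the L-operator, i.e. the vector-Jacobian product v^T J of output y with respect to x evaluated at v, which is equivalent to T.grad((v * y).sum(), x) — the same identity the Theano test cases at the end of this page verify.

import numpy as np
import theano
import theano.tensor as T

# Toy example (names x, v, y are ours): compare Lop against the
# equivalent grad-of-weighted-sum formulation.
x = T.vector('x')
v = T.vector('v')                # evaluation point, same shape as y
y = T.tanh(T.dot(x, x) * x)      # some differentiable vector output of x

vJ = T.Lop(y, x, v)              # vector-Jacobian product v^T (dy/dx)
vJ_ref = T.grad((v * y).sum(), x)

f = theano.function([x, v], [vJ, vJ_ref])
a = np.asarray([0.1, 0.2, 0.3], dtype=theano.config.floatX)
b = np.asarray([1.0, -1.0, 0.5], dtype=theano.config.floatX)
out1, out2 = f(a, b)
assert np.allclose(out1, out2)

The R-operator counterpart, T.Rop(y, x, v), computes the Jacobian-vector product J v instead; several examples below (e.g. the Gauss-Newton product in get_grads) chain Rop and Lop to multiply by curvature matrices without ever materializing the Jacobian.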
def svgd_gradient(X0):

    hidden, _, mse = discrim(X0)
    grad = -1.0 * T.grad( mse.sum(), X0)

    kxy, neighbors, h = rbf_kernel(hidden)  #TODO

    coff = T.exp( - T.sum((hidden[neighbors] - hidden)**2, axis=1) / h**2 / 2.0 )
    v = coff.dimshuffle(0, 'x') * (-hidden[neighbors] + hidden) / h**2

    X1 = X0[neighbors]
    hidden1, _, _ = discrim(X1)
    dxkxy = T.Lop(hidden1, X1, v)

    #svgd_grad = (T.dot(kxy, T.flatten(grad, 2)).reshape(dxkxy.shape) + dxkxy) / T.sum(kxy, axis=1).dimshuffle(0, 'x', 'x', 'x')
    svgd_grad = grad + dxkxy / 2.
    return grad, svgd_grad, dxkxy
def create_esgd_updates(updates, params, gparams, gsums, xsums, lr, eps, gamma, momentum):
    has_momentum = momentum.get_value() > 0.0
    samples = [ default_mrng.normal(size=p.shape, avg=0, std=1,
                    dtype=theano.config.floatX) for p in params ]
    HVs = T.Lop(gparams, params, samples)

    i = theano.shared(np.float64(0.0).astype(theano.config.floatX))
    i_t = i + 1.0
    omg_t = 1.0 - gamma**i_t
    for p, g, m, D, Hv in zip(params, gparams, gsums, xsums, HVs):
        if is_subtensor_op(p):
            raise Exception("ESGD subtensor update not implemented!")
        else:
            D_t = D * gamma + T.sqr(Hv) * (1.0-gamma)
            if has_momentum:
                m_t = m*momentum + g
                updates[m] = m_t
            else:
                m_t = g
            g_t = m_t / ( T.sqrt(D_t/omg_t + eps) )
            #g_t = m_t / ( T.sqrt(D_t + eps) )
            updates[D] = D_t
            updates[p] = p - lr*g_t
    updates[i] = i_t
def Lop(output, wrt, eval_points):
    grads = tf.gradients(output, wrt, grad_ys=eval_points)
    return grads
def replace_gradients_mse(model, opt, batch_size, n_samples = 1):
    ''' Replace the gradients of a Keras model with mean square error loss.
    '''
    # targets has been repeated twice so the below creates two identical columns
    # of the target values - we'll only use the first column.
    targets = K.reshape(model.targets[0], (batch_size, n_samples * 2))
    output = K.mean(K.reshape(model.outputs[0], (batch_size, n_samples, 2)), axis=1)

    # compute d Loss / d output
    dL_dOutput = (output[:,0] - targets[:,0]) * (2.) / batch_size
    # compute (d Loss / d output) (d output / d theta) for each theta
    trainable_weights = model.trainable_weights
    grads = Lop(output[:,1], wrt=trainable_weights, eval_points=dL_dOutput)

    # compute regularizer gradients
    # add loss with respect to regularizers
    reg_loss = model.total_loss * 0.
    for r in model.losses:
        reg_loss += r
    reg_grads = K.gradients(reg_loss, trainable_weights)
    grads = [g+r for g,r in zip(grads, reg_grads)]

    opt = keras.optimizers.get(opt)
    # Patch keras gradient calculation to allow for user defined gradients
    opt.get_gradients = types.MethodType( get_gradients, opt )
    opt.grads = grads
    model.optimizer = opt
    return model
def get_grads(self, state_below, target, mask = None, reg = None,
              scale=None, sum_over_time=True, use_noise=True,
              additional_inputs=None):
    """
    This function implements both the forward and backwards pass of this
    layer. The reason we do this in a single function is because for the
    factorized softmax layer is hard to rely on grad and get an optimized
    graph. For uniformity I've implemented this method for this layer as
    well (though one doesn't need to use it)

    :param state_below: theano variable representing the input to the
        softmax layer
    :param target: theano variable representing the target for this
        layer
    :return: cost, dC_dstate_below, param_grads, new_properties
        dC_dstate_below is a computational graph representing the
        gradient of the cost wrt to state_below
        param_grads is a list containing the gradients wrt to the
        different parameters of the layer
        new_properties is a dictionary containing additional properties
        of the model; properties are theano expression that are
        evaluated and reported by the model
    """
    cost = self.get_cost(state_below, target, mask=mask, reg=reg,
                         scale=scale, sum_over_time=sum_over_time,
                         use_noise=use_noise,
                         additional_inputs=additional_inputs)
    grads = TT.grad(cost, self.params)
    if self.additional_gradients:
        for new_grads, to_replace, properties in self.additional_gradients:
            gparams, params = new_grads
            prop_expr = [x[1] for x in properties]
            replace = [(x[0], TT.grad(cost, x[1])) for x in to_replace]
            rval = theano.clone(gparams + prop_expr, replace=replace)
            gparams = rval[:len(gparams)]
            prop_expr = rval[len(gparams):]
            self.properties += [(x[0], y) for x, y in zip(properties, prop_expr)]
            for gp, p in zip(gparams, params):
                grads[self.params.index(p)] += gp

    self.cost = cost
    self.grads = grads

    def Gvs_fn(*args):
        w = (1 - self.model_output) * self.model_output * state_below.shape[1]
        Gvs = TT.Lop(self.model_output, self.params,
                     TT.Rop(self.model_output, self.params, args) / w)
        return Gvs
    self.Gvs = Gvs_fn
    return cost, grads
def hypergrad(params_ele, params_hyper, dvalid_dtheta, loss_ele, loss_hyper, loss_ele_penalty=0.):
    """
    Function defining the hypergradients: gradients of validation cost
    with respect to various hyperparameters.

    The function is separating penalty hyperparameters (which is assumed to
    depend only on w) from noise and other hyperparameters, due to otherwise
    dependancy errors in the Lop operator.

    Inputs:
        paramsT1, paramsT2 :: T1 and T2 parameters
        c1, c2 :: cross-entropy on training and validation set
        p1, p2 :: penalty terms on training and validation set (p2 assumed 0)
    """
    # initializations
    reg_penalty, reg_noise, grad_penalty, grad_noise, w, dvalid_dw = [], [], [], [], [], []

    # separate different types of parameters
    for regular in params_hyper:
        reg_type, _ = regular.name.split('_')
        if reg_type in penalty_list:
            reg_penalty += [regular]
        elif reg_type in noise_list:
            reg_noise += [regular]
        else:
            print 'Hypergrad not implemented for ', reg_type

    # separate weight parameters and gradients
    for (param, grad) in zip(params_ele, dvalid_dtheta):
        paramType, _ = param.name.split('_')
        if paramType == 'W':
            w += [param]
            dvalid_dw += [grad]

    # hyper-gradients
    if reg_penalty:
        dpenalty_dw = T.grad(loss_ele_penalty, w)
        dpenalty_dw = [-grad for grad in dpenalty_dw]
        grad_penalty = T.Lop(dpenalty_dw, reg_penalty, dvalid_dw)
    if reg_noise:
        dele_dtheta = T.grad(loss_ele, params_ele)
        dele_dtheta = [-grad for grad in dele_dtheta]
        grad_noise = T.Lop(dele_dtheta, reg_noise, dvalid_dtheta)

    # outputs
    params_hyper = reg_penalty + reg_noise
    dvalid_dgamma = grad_penalty + grad_noise

    return params_hyper, dvalid_dgamma
def L_hvp_meta(params_ele, params_hyper, pseudo_params_ele, vec, batchx, batchy):
    """
    :param params_ele: elementary params
    :param params_hyper: hyper params
    :param pseudo_params_ele: the psed
    :param vec: a vector multiple to the hessian, could be learning rate vec or momentum vec
    :param batchx: data x of this iteration
    :param batchy: data y of this iteration
    :return: gradient w.r.t. hyper params
    """
    reg_params_penalty, reg_params_noise, grad_penalty, grad_noise, w, dvalid_dw = [], [], [], [], [], []

    # forward to obtain loss & gradients
    loss_ele, loss_ele_penalty = L_hvp_meta_unsafe(batchx, batchy, 1, 0)

    # separate different types of parameters
    for regular in params_hyper:
        reg_type, _ = regular.name.split('_')
        if reg_type in penalty_list:
            reg_params_penalty += [regular]
        elif reg_type in noise_list:
            reg_params_noise += [regular]
        else:
            print 'Hypergrad not implemented for ', reg_type

    # VJ = T.Lop(y, W, v), to calc v * dy/dW
    if reg_params_penalty:
        dpenalty_dw = T.grad(loss_ele_penalty, w)
        dpenalty_dw = [-grad for grad in dpenalty_dw]
        # dpenalty_dw might be calc through `meta_backward_ele()`,
        # as you like, discuss it later
        grad_penalty = T.Lop(dpenalty_dw, reg_params_penalty, vec)

    # if reg_params_noise:
    #     dele_dtheta = T.grad(loss_ele, params_ele)
    #     dele_dtheta = [-grad for grad in dele_dtheta]
    #     grad_noise = T.Lop(dele_dtheta, reg_params_noise, dL_dtheta)

    # outputs
    params_hyper = reg_params_penalty + reg_params_noise
    dvalid_dgamma = grad_penalty + grad_noise

    return dvalid_dgamma
def L_hvp_meta_unsafe(params_ele, params_hyper, pseudo_params_ele, batchx, batchy, x, y, loss):
    """
    :param params_ele: elementary params
    :param params_hyper: hyper params
    :param pseudo_params_ele: the psed, a dictionary whose keys are elements in params_ele
    :param batchx: data x of this iteration
    :param batchy: data y of this iteration
    :param x: variable x of the model
    :param y: variable y of the model
    :param loss: symbol of loss function expression
    :return: gradient w.r.t. hyper params at pseudo_params_ele

    Attention please! In order to save the memory, the value of params_ele
    would be replaced by the values of pseudo_params_ele.
    SAVE the values of weights before calling me!
    """
    reg_params_penalty, reg_params_noise, grad_penalty, grad_noise, w, dvalid_dw = [], [], [], [], [], []

    # replace the params
    for param in params_ele:
        param.set_value(pseudo_params_ele[param])

    # separate different types of parameters
    for regular in params_hyper:
        reg_type, _ = regular.name.split('_')
        if reg_type in penalty_list:
            reg_params_penalty += [regular]
        elif reg_type in noise_list:
            reg_params_noise += [regular]
        else:
            print 'Hypergrad not implemented for ', reg_type

    # get gradient w.r.t. hyper params
    if reg_params_penalty:
        dloss_dpenalty = T.grad(loss, penalty_list)
        # forward & backward to obtain gradients
        meta_fwbw_ele = theano.function([x, y], dloss_dpenalty)
        grad_penalty = meta_fwbw_ele(batchx, batchy)

    # if reg_params_noise:
    #     dele_dtheta = T.grad(loss_ele, params_ele)
    #     dele_dtheta = [-grad for grad in dele_dtheta]
    #     grad_noise = T.Lop(dele_dtheta, reg_params_noise, dL_dtheta)

    # outputs
    params_hyper = reg_params_penalty + reg_params_noise
    dvalid_dgamma = grad_penalty + grad_noise

    return dloss_dpenalty, dvalid_dgamma
def check_mat_rop_lop(self, y, out_shape):
    """
    Test the Rop/Lop when input is a matrix and the output is a vector

    :param y: the output variable of the op applied to self.mx
    :param out_shape: Used to generate a random tensor corresponding
        to the evaluation point of the Rop (i.e. the tensor with
        which you multiply the Jacobian). It should be a tuple of ints.

    If the Op has more than 1 input, one of them must be mx, while
    others must be shared variables / constants. We will test only
    against the input self.mx, so you must call
    check_mat_rop_lop/check_rop_lop for the other inputs.

    We expect all inputs/outputs have dtype floatX.

    If you want to test an Op with an output matrix, add a sum
    after the Op you want to test.
    """
    vx = numpy.asarray(self.rng.uniform(size=self.mat_in_shape),
                       theano.config.floatX)
    vv = numpy.asarray(self.rng.uniform(size=self.mat_in_shape),
                       theano.config.floatX)
    yv = tensor.Rop(y, self.mx, self.mv)
    rop_f = function([self.mx, self.mv], yv, on_unused_input='ignore')
    sy, _ = theano.scan(lambda i, y, x, v: (tensor.grad(y[i], x) * v).sum(),
                        sequences=tensor.arange(y.shape[0]),
                        non_sequences=[y, self.mx, self.mv])
    scan_f = function([self.mx, self.mv], sy, on_unused_input='ignore')

    v1 = rop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert numpy.allclose(v1, v2), ('ROP mismatch: %s %s' % (v1, v2))

    self.check_nondiff_rop(theano.clone(y,
                                        replace={self.mx: break_op(self.mx)}))

    vv = numpy.asarray(self.rng.uniform(size=out_shape),
                       theano.config.floatX)
    yv = tensor.Lop(y, self.mx, self.v)
    lop_f = function([self.mx, self.v], yv)

    sy = tensor.grad((self.v * y).sum(), self.mx)
    scan_f = function([self.mx, self.v], sy)

    v1 = lop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert numpy.allclose(v1, v2), ('LOP mismatch: %s %s' % (v1, v2))
def check_rop_lop(self, y, out_shape):
    """
    As check_mat_rop_lop, except the input is self.x which is a
    vector. The output is still a vector.
    """
    # TEST ROP
    vx = numpy.asarray(self.rng.uniform(size=self.in_shape),
                       theano.config.floatX)
    vv = numpy.asarray(self.rng.uniform(size=self.in_shape),
                       theano.config.floatX)

    yv = tensor.Rop(y, self.x, self.v)
    rop_f = function([self.x, self.v], yv, on_unused_input='ignore')
    J, _ = theano.scan(lambda i, y, x: tensor.grad(y[i], x),
                       sequences=tensor.arange(y.shape[0]),
                       non_sequences=[y, self.x])
    sy = tensor.dot(J, self.v)

    scan_f = function([self.x, self.v], sy, on_unused_input='ignore')

    v1 = rop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert numpy.allclose(v1, v2), ('ROP mismatch: %s %s' % (v1, v2))

    known_fail = False
    try:
        self.check_nondiff_rop(theano.clone(y,
                                            replace={self.x: break_op(self.x)}))
    except AssertionError:
        known_fail = True

    # TEST LOP
    vx = numpy.asarray(self.rng.uniform(size=self.in_shape),
                       theano.config.floatX)
    vv = numpy.asarray(self.rng.uniform(size=out_shape),
                       theano.config.floatX)

    yv = tensor.Lop(y, self.x, self.v)
    lop_f = function([self.x, self.v], yv, on_unused_input='ignore')
    J, _ = theano.scan(lambda i, y, x: tensor.grad(y[i], x),
                       sequences=tensor.arange(y.shape[0]),
                       non_sequences=[y, self.x])
    sy = tensor.dot(self.v, J)

    scan_f = function([self.x, self.v], sy)

    v1 = lop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert numpy.allclose(v1, v2), ('LOP mismatch: %s %s' % (v1, v2))

    if known_fail:
        raise SkipTest('Rop does not handle non-differentiable inputs '
                       'correctly. Bug exposed by fixing Add.grad method.')
def test_rop_lop():
    mx = tensor.matrix('mx')
    mv = tensor.matrix('mv')
    v = tensor.vector('v')
    y = matrix_inverse(mx).sum(axis=0)

    yv = tensor.Rop(y, mx, mv)
    rop_f = function([mx, mv], yv)
    sy, _ = theano.scan(lambda i, y, x, v: (tensor.grad(y[i], x) * v).sum(),
                        sequences=tensor.arange(y.shape[0]),
                        non_sequences=[y, mx, mv])
    scan_f = function([mx, mv], sy)

    rng = numpy.random.RandomState(utt.fetch_seed())
    vx = numpy.asarray(rng.randn(4, 4), theano.config.floatX)
    vv = numpy.asarray(rng.randn(4, 4), theano.config.floatX)

    v1 = rop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert _allclose(v1, v2), ('ROP mismatch: %s %s' % (v1, v2))

    raised = False
    try:
        tensor.Rop(
            theano.clone(y, replace={mx: break_op(mx)}),
            mx,
            mv)
    except ValueError:
        raised = True
    if not raised:
        raise Exception((
            'Op did not raised an error even though the function'
            ' is not differentiable'))

    vv = numpy.asarray(rng.uniform(size=(4,)), theano.config.floatX)
    yv = tensor.Lop(y, mx, v)
    lop_f = function([mx, v], yv)

    sy = tensor.grad((v * y).sum(), mx)
    scan_f = function([mx, v], sy)

    v1 = lop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert _allclose(v1, v2), ('LOP mismatch: %s %s' % (v1, v2))