The following 28 code examples, extracted from Python open-source projects, illustrate how to use theano.tensor.Rop().
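As background for the examples below: theano.tensor.Rop(f, wrt, eval_points) builds the R-operator, i.e. the Jacobian of f with respect to wrt right-multiplied by eval_points, without ever materializing the Jacobian (its counterpart tensor.Lop left-multiplies). A minimal standalone sketch, with a toy expression and illustrative variable names that are not taken from any of the projects below:

import numpy
import theano
import theano.tensor as T

# Jacobian-vector product J(x) . v via the R-operator.
x = T.vector('x')
v = T.vector('v')
y = T.tanh(T.dot(x, x) * x)          # toy differentiable expression

Jv = T.Rop(y, x, v)                  # symbolic Jacobian of y wrt x, times v
jv_fn = theano.function([x, v], Jv)

xv = numpy.ones(3, dtype=theano.config.floatX)
vv = numpy.arange(3, dtype=theano.config.floatX)
print(jv_fn(xv, vv))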
def gauss_newton_product(cost, p, v, s):
    # this computes the product Gv = J'HJv (G is the Gauss-Newton matrix)
    if not isinstance(s, (list, tuple)):
        s = [s]
    sum_Gv = None
    for si in s:
        Jv = T.Rop(si, p, v)
        HJv = T.grad(T.sum(T.grad(cost, si, disconnected_inputs='ignore') * Jv), si,
                     consider_constant=[Jv], disconnected_inputs='ignore')
        Gv = T.grad(T.sum(HJv * si), p,
                    consider_constant=[HJv, Jv], disconnected_inputs='ignore')
        Gv = list(map(T.as_tensor_variable, Gv))  # for CudaNdarray
        if sum_Gv is None:
            sum_Gv = Gv
        else:
            sum_Gv = [a + b for a, b in zip(Gv, sum_Gv)]
    return sum_Gv
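The three grad/Rop steps above compute Jv, then HJv, then J'HJv. A usage sketch for this helper, assuming it is in scope together with the usual imports (numpy, theano, and theano.tensor as T); the linear model, shapes, and names (W, x, t, v) are illustrative rather than taken from the original project:

import numpy
import theano
import theano.tensor as T

# Toy linear model with squared error.
x = T.matrix('x')
t = T.matrix('t')
v = T.matrix('v')                     # direction, same shape as W
W = theano.shared(numpy.zeros((5, 3), dtype=theano.config.floatX), name='W')

s = T.dot(x, W)                       # output activation, plays the role of `s`
cost = ((s - t) ** 2).mean()

Gv = gauss_newton_product(cost, [W], [v], s)   # list with one entry: G v for W
gv_fn = theano.function([x, t, v], Gv)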
def _get_updates_for(self, param, grad):
    D_tm1 = shared_like(param, 'D_ewma')
    Hv = TT.Rop(grad, param, self.rng.normal(param.shape))
    D_t = self.ewma * D_tm1 + (1 - self.ewma) * Hv * Hv
    den = TT.sqrt(D_t) + self.epsilon
    yield D_tm1, D_t
    yield param, param - grad * self.learning_rate / den
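In this update rule, TT.Rop(grad, param, r) with a random Gaussian vector r is a Hessian-vector product, whose elementwise square is accumulated into a diagonal preconditioner. The R-operator applied to the gradient agrees with the more common gradient-of-inner-product formulation; a small standalone check with a toy loss and illustrative names (not part of the original code):

import numpy
import theano
import theano.tensor as T

# Two equivalent Hessian-vector products for a scalar loss:
# the R-operator applied to the gradient, and the gradient of <g, v>.
w = T.vector('w')
v = T.vector('v')
loss = T.sum(w ** 2) + T.sum(T.tanh(w))      # toy scalar objective

g = T.grad(loss, w)
Hv_rop = T.Rop(g, w, v)
Hv_lop = T.grad(T.sum(g * v), w, consider_constant=[v])

check = theano.function([w, v], [Hv_rop, Hv_lop])
a, b = check(numpy.ones(3, dtype=theano.config.floatX),
             numpy.arange(3, dtype=theano.config.floatX))
assert numpy.allclose(a, b)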
def check_nondiff_rop(self, y):
    """ If your op is not differentiable (so you can't define Rop),
    test that an error is raised."""
    raised = False
    try:
        tensor.Rop(y, self.x, self.v)
    except ValueError:
        raised = True
    if not raised:
        self.fail((
            'Op did not raise an error even though the function'
            ' is not differentiable'))
def test_invalid_input(self):
    success = False
    try:
        tensor.Rop(0., [tensor.matrix()], [tensor.vector()])
        success = True
    except ValueError:
        pass
    assert not success
def test_Rop_dot_bug_18Oct2013_Jeremiah(self):
    # This test refers to a bug reported by Jeremiah Lowin on 18th Oct
    # 2013. The bug occurs when a dot operation has only one
    # differentiable path (i.e. there is no gradient wrt one of the
    # inputs).
    x = tensor.arange(20.0).reshape([1, 20])
    v = theano.shared(numpy.ones([20]))
    d = tensor.dot(x, v).sum()
    tensor.Rop(tensor.grad(d, v), v, v)
def __call__(self, v, cost, parameters, damp):
    # compute Gauss-Newton Matrix right-multiplied by `v`
    Jv = T.Rop(self._s, parameters, v)
    HJv = T.grad(T.sum(T.grad(cost, self._s) * Jv), self._s,
                 consider_constant=[Jv])
    JHJv = T.grad(T.sum(HJv * self._s), parameters,
                  consider_constant=[HJv, Jv])

    # apply Tikhonov damping
    JHJv = [JHJvi + damp * vi for JHJvi, vi in zip(JHJv, v)]
    return JHJv
def gauss_newton_product(cost, p, v, s):
    # this computes the product Gv = J'HJv (G is the Gauss-Newton matrix)
    Jv = T.Rop(s, p, v)
    HJv = T.grad(T.sum(T.grad(cost, s) * Jv), s,
                 consider_constant=[Jv], disconnected_inputs='ignore')
    Gv = T.grad(T.sum(HJv * s), p,
                consider_constant=[HJv, Jv], disconnected_inputs='ignore')
    Gv = map(T.as_tensor_variable, Gv)  # for CudaNdarray
    return Gv
def __init__(self, _p, inputs, s, costs, h=None, ha=None):
    '''Constructs and compiles the necessary Theano functions.

    p : list of Theano shared variables
        Parameters of the model to be optimized.
    inputs : list of Theano variables
        Symbolic variables that are inputs to your graph (they should also
        include your model 'output'). Your training examples must fit these.
    s : Theano variable
        Symbolic variable with respect to which the Hessian of the objective
        is positive-definite, implicitly defining the Gauss-Newton matrix.
        Typically, it is the activation of the output layer.
    costs : list of Theano variables
        Monitoring costs, the first of which will be the optimized objective.
    h : Theano variable or None
        Structural damping is applied to this variable (typically the hidden
        units of an RNN).
    ha : Theano variable or None
        Symbolic variable that implicitly defines the Gauss-Newton matrix for
        the structural damping term (typically the activation of the hidden
        layer). If None, it will be set to `h`.'''

    self.p = _p
    self.shapes = [i.get_value().shape for i in _p]
    self.sizes = list(map(numpy.prod, self.shapes))
    self.positions = numpy.cumsum([0] + self.sizes)[:-1]

    g = T.grad(costs[0], _p)
    g = list(map(T.as_tensor_variable, g))  # for CudaNdarray
    self.f_gc = compile_function(inputs, g + costs)  # during gradient computation
    self.f_cost = compile_function(inputs, costs)  # for quick cost evaluation

    symbolic_types = T.scalar, T.vector, T.matrix, T.tensor3, T.tensor4

    v = [symbolic_types[len(i)]() for i in self.shapes]
    Gv = gauss_newton_product(costs[0], _p, v, s)

    coefficient = T.scalar()  # this is lambda*mu

    if h is not None:  # structural damping with cross-entropy
        h_constant = symbolic_types[h.ndim]()  # T.Rop does not support `consider_constant` yet, so use `givens`
        structural_damping = coefficient * (
            -h_constant * T.log(h + 1e-10) -
            (1 - h_constant) * T.log((1 - h) + 1e-10)).sum() / h.shape[0]
        if ha is None:
            ha = h
        Gv_damping = gauss_newton_product(structural_damping, _p, v, ha)
        Gv = [a + b for a, b in zip(Gv, Gv_damping)]
        givens = {h_constant: h}
    else:
        givens = {}

    self.function_Gv = compile_function(inputs + v + [coefficient], Gv,
                                        givens=givens)
def get_grads(self, state_below, target, mask=None, reg=None,
              scale=None, sum_over_time=True, use_noise=True,
              additional_inputs=None):
    """
    This function implements both the forward and backwards pass of this
    layer. The reason we do this in a single function is that, for the
    factorized softmax layer, it is hard to rely on grad and get an
    optimized graph. For uniformity I've implemented this method for this
    layer as well (though one doesn't need to use it).

    :param state_below: theano variable representing the input to the
        softmax layer
    :param target: theano variable representing the target for this layer
    :return: cost, dC_dstate_below, param_grads, new_properties
        dC_dstate_below is a computational graph representing the gradient
            of the cost wrt to state_below
        param_grads is a list containing the gradients wrt to the different
            parameters of the layer
        new_properties is a dictionary containing additional properties of
            the model; properties are theano expressions that are evaluated
            and reported by the model
    """
    cost = self.get_cost(state_below,
                         target,
                         mask=mask,
                         reg=reg,
                         scale=scale,
                         sum_over_time=sum_over_time,
                         use_noise=use_noise,
                         additional_inputs=additional_inputs)
    grads = TT.grad(cost, self.params)
    if self.additional_gradients:
        for new_grads, to_replace, properties in self.additional_gradients:
            gparams, params = new_grads
            prop_expr = [x[1] for x in properties]
            replace = [(x[0], TT.grad(cost, x[1])) for x in to_replace]
            rval = theano.clone(gparams + prop_expr, replace=replace)
            gparams = rval[:len(gparams)]
            prop_expr = rval[len(gparams):]
            self.properties += [(x[0], y) for x, y in zip(properties, prop_expr)]
            for gp, p in zip(gparams, params):
                grads[self.params.index(p)] += gp

    self.cost = cost
    self.grads = grads

    def Gvs_fn(*args):
        w = (1 - self.model_output) * self.model_output * state_below.shape[1]
        Gvs = TT.Lop(self.model_output, self.params,
                     TT.Rop(self.model_output, self.params, args) / w)
        return Gvs

    self.Gvs = Gvs_fn
    return cost, grads
def check_mat_rop_lop(self, y, out_shape):
    """Test the Rop/Lop when the input is a matrix and the output is a vector.

    :param y: the output variable of the op applied to self.mx
    :param out_shape: Used to generate a random tensor corresponding to the
        evaluation point of the Rop (i.e. the tensor with which you multiply
        the Jacobian). It should be a tuple of ints.

    If the Op has more than 1 input, one of them must be mx, while the others
    must be shared variables / constants. We will test only against the input
    self.mx, so you must call check_mat_rop_lop/check_rop_lop for the other
    inputs.

    We expect all inputs/outputs to have dtype floatX.

    If you want to test an Op with an output matrix, add a sum after the Op
    you want to test.
    """
    vx = numpy.asarray(self.rng.uniform(size=self.mat_in_shape),
                       theano.config.floatX)
    vv = numpy.asarray(self.rng.uniform(size=self.mat_in_shape),
                       theano.config.floatX)
    yv = tensor.Rop(y, self.mx, self.mv)
    rop_f = function([self.mx, self.mv], yv, on_unused_input='ignore')
    sy, _ = theano.scan(lambda i, y, x, v: (tensor.grad(y[i], x) * v).sum(),
                        sequences=tensor.arange(y.shape[0]),
                        non_sequences=[y, self.mx, self.mv])
    scan_f = function([self.mx, self.mv], sy, on_unused_input='ignore')

    v1 = rop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert numpy.allclose(v1, v2), ('ROP mismatch: %s %s' % (v1, v2))

    self.check_nondiff_rop(theano.clone(y,
                                        replace={self.mx: break_op(self.mx)}))

    vv = numpy.asarray(self.rng.uniform(size=out_shape),
                       theano.config.floatX)
    yv = tensor.Lop(y, self.mx, self.v)
    lop_f = function([self.mx, self.v], yv)

    sy = tensor.grad((self.v * y).sum(), self.mx)
    scan_f = function([self.mx, self.v], sy)

    v1 = lop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert numpy.allclose(v1, v2), ('LOP mismatch: %s %s' % (v1, v2))
def check_rop_lop(self, y, out_shape):
    """
    As check_mat_rop_lop, except the input is self.x which is a vector.
    The output is still a vector.
    """
    # TEST ROP
    vx = numpy.asarray(self.rng.uniform(size=self.in_shape),
                       theano.config.floatX)
    vv = numpy.asarray(self.rng.uniform(size=self.in_shape),
                       theano.config.floatX)

    yv = tensor.Rop(y, self.x, self.v)
    rop_f = function([self.x, self.v], yv, on_unused_input='ignore')
    J, _ = theano.scan(lambda i, y, x: tensor.grad(y[i], x),
                       sequences=tensor.arange(y.shape[0]),
                       non_sequences=[y, self.x])
    sy = tensor.dot(J, self.v)
    scan_f = function([self.x, self.v], sy, on_unused_input='ignore')

    v1 = rop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert numpy.allclose(v1, v2), ('ROP mismatch: %s %s' % (v1, v2))

    known_fail = False
    try:
        self.check_nondiff_rop(theano.clone(y,
                                            replace={self.x: break_op(self.x)}))
    except AssertionError:
        known_fail = True

    # TEST LOP
    vx = numpy.asarray(self.rng.uniform(size=self.in_shape),
                       theano.config.floatX)
    vv = numpy.asarray(self.rng.uniform(size=out_shape),
                       theano.config.floatX)

    yv = tensor.Lop(y, self.x, self.v)
    lop_f = function([self.x, self.v], yv, on_unused_input='ignore')
    J, _ = theano.scan(lambda i, y, x: tensor.grad(y[i], x),
                       sequences=tensor.arange(y.shape[0]),
                       non_sequences=[y, self.x])
    sy = tensor.dot(self.v, J)
    scan_f = function([self.x, self.v], sy)

    v1 = lop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert numpy.allclose(v1, v2), ('LOP mismatch: %s %s' % (v1, v2))

    if known_fail:
        raise SkipTest('Rop does not handle non-differentiable inputs '
                       'correctly. Bug exposed by fixing Add.grad method.')
def test_conv(self):
    for conv_op in [conv.conv2d, conv2d]:
        for border_mode in ['valid', 'full']:
            image_shape = (2, 2, 4, 5)
            filter_shape = (2, 2, 2, 3)
            image_dim = len(image_shape)
            filter_dim = len(filter_shape)
            input = tensor.TensorType(
                theano.config.floatX,
                [False] * image_dim)(name='input')
            filters = tensor.TensorType(
                theano.config.floatX,
                [False] * filter_dim)(name='filter')
            ev_input = tensor.TensorType(
                theano.config.floatX,
                [False] * image_dim)(name='ev_input')
            ev_filters = tensor.TensorType(
                theano.config.floatX,
                [False] * filter_dim)(name='ev_filters')

            def sym_conv2d(input, filters):
                return conv_op(input, filters, border_mode=border_mode)

            output = sym_conv2d(input, filters).flatten()
            yv = tensor.Rop(output, [input, filters],
                            [ev_input, ev_filters])
            mode = None
            if theano.config.mode == "FAST_COMPILE":
                mode = "FAST_RUN"
            rop_f = function([input, filters, ev_input, ev_filters],
                             yv, on_unused_input='ignore', mode=mode)
            sy, _ = theano.scan(
                lambda i, y, x1, x2, v1, v2:
                    (tensor.grad(y[i], x1) * v1).sum() +
                    (tensor.grad(y[i], x2) * v2).sum(),
                sequences=tensor.arange(output.shape[0]),
                non_sequences=[output, input, filters, ev_input, ev_filters],
                mode=mode)
            scan_f = function([input, filters, ev_input, ev_filters], sy,
                              on_unused_input='ignore', mode=mode)

            dtype = theano.config.floatX
            image_data = numpy.random.random(image_shape).astype(dtype)
            filter_data = numpy.random.random(filter_shape).astype(dtype)
            ev_image_data = numpy.random.random(image_shape).astype(dtype)
            ev_filter_data = numpy.random.random(filter_shape).astype(dtype)

            v1 = rop_f(image_data, filter_data, ev_image_data,
                       ev_filter_data)
            v2 = scan_f(image_data, filter_data, ev_image_data,
                        ev_filter_data)
            assert numpy.allclose(v1, v2), ("Rop mismatch: %s %s" % (v1, v2))
def test_rop_mitmot(self):
    # this test is a copy paste from the script given by Justin Bayer to
    # reproduce this bug

    # We have 2 parameter groups with the following shapes.
    W1shape = (1, 3)
    W2shape = (3, 3)
    n_pars = 1 * 3 + 3 * 3

    # Allocate big parameter array.
    pars = theano.shared(numpy.empty(n_pars))

    # Assign slices.
    W1 = pars[:3].reshape(W1shape)
    W2 = pars[3:].reshape(W2shape)

    # Define recurrent model. We are using a model where each input is a
    # tensor of shape (T, B, D) where T is the number of timesteps, B is
    # the number of sequences iterated over in parallel and D is the
    # dimensionality of each item at a timestep.
    inpt = tensor.tensor3('inpt')
    target = tensor.tensor3('target')

    # Make these flat in order to be able to use dot products instead of
    # tensordot, which is slower.
    inpt_flat = inpt.reshape((inpt.shape[0] * inpt.shape[1],
                              inpt.shape[2]))
    hidden_flat = tensor.dot(inpt_flat, W1)
    hidden = hidden_flat.reshape((inpt.shape[0], inpt.shape[1], 3))

    transfer = tensor.nnet.sigmoid

    hidden_rec, _ = theano.scan(
        lambda x, h_tm1: transfer(tensor.dot(h_tm1, W2) + x),
        sequences=hidden,
        outputs_info=[tensor.zeros_like(hidden[0])])

    hidden_rec_flat = hidden_rec.reshape(
        (hidden_rec.shape[0] * hidden_rec.shape[1],
         hidden_rec.shape[2]))

    cost = ((hidden_rec - target) ** 2).mean()
    d_cost_wrt_pars = tensor.grad(cost, pars)

    p = tensor.dvector()
    Hp = tensor.Rop(d_cost_wrt_pars, pars, p)
def test_rop_lop():
    mx = tensor.matrix('mx')
    mv = tensor.matrix('mv')
    v = tensor.vector('v')
    y = matrix_inverse(mx).sum(axis=0)

    yv = tensor.Rop(y, mx, mv)
    rop_f = function([mx, mv], yv)

    sy, _ = theano.scan(lambda i, y, x, v: (tensor.grad(y[i], x) * v).sum(),
                        sequences=tensor.arange(y.shape[0]),
                        non_sequences=[y, mx, mv])
    scan_f = function([mx, mv], sy)

    rng = numpy.random.RandomState(utt.fetch_seed())
    vx = numpy.asarray(rng.randn(4, 4), theano.config.floatX)
    vv = numpy.asarray(rng.randn(4, 4), theano.config.floatX)

    v1 = rop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert _allclose(v1, v2), ('ROP mismatch: %s %s' % (v1, v2))

    raised = False
    try:
        tensor.Rop(
            theano.clone(y, replace={mx: break_op(mx)}),
            mx,
            mv)
    except ValueError:
        raised = True
    if not raised:
        raise Exception((
            'Op did not raise an error even though the function'
            ' is not differentiable'))

    vv = numpy.asarray(rng.uniform(size=(4,)), theano.config.floatX)
    yv = tensor.Lop(y, mx, v)
    lop_f = function([mx, v], yv)

    sy = tensor.grad((v * y).sum(), mx)
    scan_f = function([mx, v], sy)

    v1 = lop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert _allclose(v1, v2), ('LOP mismatch: %s %s' % (v1, v2))
def __init__(self, p, inputs, s, costs, h=None, ha=None):
    '''Constructs and compiles the necessary Theano functions.

    p : list of Theano shared variables
        Parameters of the model to be optimized.
    inputs : list of Theano variables
        Symbolic variables that are inputs to your graph (they should also
        include your model 'output'). Your training examples must fit these.
    s : Theano variable
        Symbolic variable with respect to which the Hessian of the objective
        is positive-definite, implicitly defining the Gauss-Newton matrix.
        Typically, it is the activation of the output layer.
    costs : list of Theano variables
        Monitoring costs, the first of which will be the optimized objective.
    h : Theano variable or None
        Structural damping is applied to this variable (typically the hidden
        units of an RNN).
    ha : Theano variable or None
        Symbolic variable that implicitly defines the Gauss-Newton matrix for
        the structural damping term (typically the activation of the hidden
        layer). If None, it will be set to `h`.'''

    self.p = p
    self.shapes = [i.get_value().shape for i in p]
    self.sizes = map(numpy.prod, self.shapes)
    self.positions = numpy.cumsum([0] + self.sizes)[:-1]

    g = T.grad(costs[0], p)
    g = map(T.as_tensor_variable, g)  # for CudaNdarray
    self.f_gc = theano.function(inputs, g + costs,
                                on_unused_input='ignore')  # during gradient computation
    self.f_cost = theano.function(inputs, costs,
                                  on_unused_input='ignore')  # for quick cost evaluation

    symbolic_types = T.scalar, T.vector, T.matrix, T.tensor3, T.tensor4

    v = [symbolic_types[len(i)]() for i in self.shapes]
    Gv = gauss_newton_product(costs[0], p, v, s)

    coefficient = T.scalar()  # this is lambda*mu

    if h is not None:  # structural damping with cross-entropy
        h_constant = symbolic_types[h.ndim]()  # T.Rop does not support `consider_constant` yet, so use `givens`
        structural_damping = coefficient * (
            -h_constant * T.log(h + 1e-10) -
            (1 - h_constant) * T.log((1 - h) + 1e-10)).sum() / h.shape[0]
        if ha is None:
            ha = h
        Gv_damping = gauss_newton_product(structural_damping, p, v, ha)
        Gv = [a + b for a, b in zip(Gv, Gv_damping)]
        givens = {h_constant: h}
    else:
        givens = {}

    self.function_Gv = theano.function(inputs + v + [coefficient], Gv,
                                       givens=givens,
                                       on_unused_input='ignore')