The following 17 code examples, extracted from open-source Python projects, illustrate how to use theano.tensor.Lop().
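Before the extracted examples, here is a minimal orientation sketch (a toy example of ours, not taken from any of the projects below): T.Lop(y, x, v) builds the L-operator, i.e. the vector-Jacobian product v^T J of output y with respect to x evaluated at v, which is equivalent to T.grad((v * y).sum(), x) — the same identity the Theano test cases at the end of this page verify.

import numpy as np
import theano
import theano.tensor as T

# Toy example (names x, v, y are ours): compare Lop against the
# equivalent grad-of-weighted-sum formulation.
x = T.vector('x')
v = T.vector('v')                # evaluation point, same shape as y
y = T.tanh(T.dot(x, x) * x)      # some differentiable vector output of x

vJ = T.Lop(y, x, v)              # vector-Jacobian product v^T (dy/dx)
vJ_ref = T.grad((v * y).sum(), x)

f = theano.function([x, v], [vJ, vJ_ref])
a = np.asarray([0.1, 0.2, 0.3], dtype=theano.config.floatX)
b = np.asarray([1.0, -1.0, 0.5], dtype=theano.config.floatX)
out1, out2 = f(a, b)
assert np.allclose(out1, out2)

The R-operator counterpart, T.Rop(y, x, v), computes the Jacobian-vector product J v instead; several examples below (e.g. the Gauss-Newton product in get_grads) chain Rop and Lop to multiply by curvature matrices without ever materializing the Jacobian.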
def svgd_gradient(X0):

    hidden, _, mse = discrim(X0)
    grad = -1.0 * T.grad( mse.sum(), X0)

    kxy, neighbors, h = rbf_kernel(hidden)  #TODO

    coff = T.exp( - T.sum((hidden[neighbors] - hidden)**2, axis=1) / h**2 / 2.0 )
    v = coff.dimshuffle(0, 'x') * (-hidden[neighbors] + hidden) / h**2

    X1 = X0[neighbors]
    hidden1, _, _ = discrim(X1)
    dxkxy = T.Lop(hidden1, X1, v)

    #svgd_grad = (T.dot(kxy, T.flatten(grad, 2)).reshape(dxkxy.shape) + dxkxy) / T.sum(kxy, axis=1).dimshuffle(0, 'x', 'x', 'x')
    svgd_grad = grad + dxkxy / 2.
    return grad, svgd_grad, dxkxy
def create_esgd_updates(updates, params, gparams, gsums, xsums, lr, eps, gamma, momentum):
    has_momentum = momentum.get_value() > 0.0
    samples = [ default_mrng.normal(size=p.shape, avg=0, std=1,
                    dtype=theano.config.floatX) for p in params ]
    HVs = T.Lop(gparams, params, samples)

    i = theano.shared(np.float64(0.0).astype(theano.config.floatX))
    i_t = i + 1.0
    omg_t = 1.0 - gamma**i_t
    for p, g, m, D, Hv in zip(params, gparams, gsums, xsums, HVs):
        if is_subtensor_op(p):
            raise Exception("ESGD subtensor update not implemented!")
        else:
            D_t = D * gamma + T.sqr(Hv) * (1.0-gamma)
            if has_momentum:
                m_t = m*momentum + g
                updates[m] = m_t
            else:
                m_t = g
            g_t = m_t / ( T.sqrt(D_t/omg_t + eps) )
            #g_t = m_t / ( T.sqrt(D_t + eps) )
            updates[D] = D_t
            updates[p] = p - lr*g_t
    updates[i] = i_t
def Lop(output, wrt, eval_points):
    grads = tf.gradients(output, wrt, grad_ys=eval_points)
    return grads
def replace_gradients_mse(model, opt, batch_size, n_samples = 1):
    ''' Replace the gradients of a Keras model with mean square error loss.
    '''
    # targets has been repeated twice so the below creates two identical columns
    # of the target values - we'll only use the first column.
    targets = K.reshape(model.targets[0], (batch_size, n_samples * 2))
    output = K.mean(K.reshape(model.outputs[0], (batch_size, n_samples, 2)), axis=1)

    # compute d Loss / d output
    dL_dOutput = (output[:,0] - targets[:,0]) * (2.) / batch_size
    # compute (d Loss / d output) (d output / d theta) for each theta
    trainable_weights = model.trainable_weights
    grads = Lop(output[:,1], wrt=trainable_weights, eval_points=dL_dOutput)

    # compute regularizer gradients
    # add loss with respect to regularizers
    reg_loss = model.total_loss * 0.
    for r in model.losses:
        reg_loss += r
    reg_grads = K.gradients(reg_loss, trainable_weights)
    grads = [g+r for g,r in zip(grads, reg_grads)]

    opt = keras.optimizers.get(opt)
    # Patch keras gradient calculation to allow for user defined gradients
    opt.get_gradients = types.MethodType( get_gradients, opt )
    opt.grads = grads
    model.optimizer = opt
    return model
def get_grads(self, state_below, target, mask = None, reg = None,
              scale=None, sum_over_time=True, use_noise=True,
              additional_inputs=None):
    """
    This function implements both the forward and backwards pass of this
    layer. The reason we do this in a single function is because for the
    factorized softmax layer is hard to rely on grad and get an optimized
    graph. For uniformity I've implemented this method for this layer as
    well (though one doesn't need to use it)

    :param state_below: theano variable representing the input to the
        softmax layer
    :param target: theano variable representing the target for this
        layer
    :return: cost, dC_dstate_below, param_grads, new_properties
        dC_dstate_below is a computational graph representing the
        gradient of the cost wrt to state_below
        param_grads is a list containing the gradients wrt to the
        different parameters of the layer
        new_properties is a dictionary containing additional properties
        of the model; properties are theano expression that are
        evaluated and reported by the model
    """
    cost = self.get_cost(state_below, target, mask=mask, reg=reg,
                         scale=scale, sum_over_time=sum_over_time,
                         use_noise=use_noise,
                         additional_inputs=additional_inputs)
    grads = TT.grad(cost, self.params)
    if self.additional_gradients:
        for new_grads, to_replace, properties in self.additional_gradients:
            gparams, params = new_grads
            prop_expr = [x[1] for x in properties]
            replace = [(x[0], TT.grad(cost, x[1])) for x in to_replace]
            rval = theano.clone(gparams + prop_expr, replace=replace)
            gparams = rval[:len(gparams)]
            prop_expr = rval[len(gparams):]
            self.properties += [(x[0], y) for x, y in zip(properties, prop_expr)]
            for gp, p in zip(gparams, params):
                grads[self.params.index(p)] += gp

    self.cost = cost
    self.grads = grads

    def Gvs_fn(*args):
        w = (1 - self.model_output) * self.model_output * state_below.shape[1]
        Gvs = TT.Lop(self.model_output, self.params,
                     TT.Rop(self.model_output, self.params, args) / w)
        return Gvs
    self.Gvs = Gvs_fn
    return cost, grads
def hypergrad(params_ele, params_hyper, dvalid_dtheta, loss_ele, loss_hyper, loss_ele_penalty=0.):
    """
    Function defining the hypergradients: gradients of validation cost
    with respect to various hyperparameters.

    The function is separating penalty hyperparameters (which is assumed to
    depend only on w) from noise and other hyperparameters, due to otherwise
    dependancy errors in the Lop operator.

    Inputs:
        paramsT1, paramsT2 :: T1 and T2 parameters
        c1, c2 :: cross-entropy on training and validation set
        p1, p2 :: penalty terms on training and validation set (p2 assumed 0)
    """
    # initializations
    reg_penalty, reg_noise, grad_penalty, grad_noise, w, dvalid_dw = [], [], [], [], [], []

    # separate different types of parameters
    for regular in params_hyper:
        reg_type, _ = regular.name.split('_')
        if reg_type in penalty_list:
            reg_penalty += [regular]
        elif reg_type in noise_list:
            reg_noise += [regular]
        else:
            print 'Hypergrad not implemented for ', reg_type

    # separate weight parameters and gradients
    for (param, grad) in zip(params_ele, dvalid_dtheta):
        paramType, _ = param.name.split('_')
        if paramType == 'W':
            w += [param]
            dvalid_dw += [grad]

    # hyper-gradients
    if reg_penalty:
        dpenalty_dw = T.grad(loss_ele_penalty, w)
        dpenalty_dw = [-grad for grad in dpenalty_dw]
        grad_penalty = T.Lop(dpenalty_dw, reg_penalty, dvalid_dw)
    if reg_noise:
        dele_dtheta = T.grad(loss_ele, params_ele)
        dele_dtheta = [-grad for grad in dele_dtheta]
        grad_noise = T.Lop(dele_dtheta, reg_noise, dvalid_dtheta)

    # outputs
    params_hyper = reg_penalty + reg_noise
    dvalid_dgamma = grad_penalty + grad_noise

    return params_hyper, dvalid_dgamma
def L_hvp_meta(params_ele, params_hyper, pseudo_params_ele, vec, batchx, batchy):
    """
    :param params_ele: elementary params
    :param params_hyper: hyper params
    :param pseudo_params_ele: the psed
    :param vec: a vector multiple to the hessian, could be learning rate vec or momentum vec
    :param batchx: data x of this iteration
    :param batchy: data y of this iteration
    :return: gradient w.r.t. hyper params
    """
    reg_params_penalty, reg_params_noise, grad_penalty, grad_noise, w, dvalid_dw = [], [], [], [], [], []

    # forward to obtain loss & gradients
    loss_ele, loss_ele_penalty = L_hvp_meta_unsafe(batchx, batchy, 1, 0)

    # separate different types of parameters
    for regular in params_hyper:
        reg_type, _ = regular.name.split('_')
        if reg_type in penalty_list:
            reg_params_penalty += [regular]
        elif reg_type in noise_list:
            reg_params_noise += [regular]
        else:
            print 'Hypergrad not implemented for ', reg_type

    # VJ = T.Lop(y, W, v), to calc v * dy/dW
    if reg_params_penalty:
        dpenalty_dw = T.grad(loss_ele_penalty, w)
        dpenalty_dw = [-grad for grad in dpenalty_dw]
        # dpenalty_dw might be calc through `meta_backward_ele()`,
        # as you like, discuss it later
        grad_penalty = T.Lop(dpenalty_dw, reg_params_penalty, vec)

    # if reg_params_noise:
    #     dele_dtheta = T.grad(loss_ele, params_ele)
    #     dele_dtheta = [-grad for grad in dele_dtheta]
    #     grad_noise = T.Lop(dele_dtheta, reg_params_noise, dL_dtheta)

    # outputs
    params_hyper = reg_params_penalty + reg_params_noise
    dvalid_dgamma = grad_penalty + grad_noise

    return dvalid_dgamma
def L_hvp_meta_unsafe(params_ele, params_hyper, pseudo_params_ele, batchx, batchy, x, y, loss):
    """
    :param params_ele: elementary params
    :param params_hyper: hyper params
    :param pseudo_params_ele: the psed, a dictionary whose keys are elements in params_ele
    :param batchx: data x of this iteration
    :param batchy: data y of this iteration
    :param x: variable x of the model
    :param y: variable y of the model
    :param loss: symbol of loss function expression
    :return: gradient w.r.t. hyper params at pseudo_params_ele

    Attention please! In order to save the memory, the value of params_ele
    would be replaced by the values of pseudo_params_ele.
    SAVE the values of weights before calling me!
    """
    reg_params_penalty, reg_params_noise, grad_penalty, grad_noise, w, dvalid_dw = [], [], [], [], [], []

    # replace the params
    for param in params_ele:
        param.set_value(pseudo_params_ele[param])

    # separate different types of parameters
    for regular in params_hyper:
        reg_type, _ = regular.name.split('_')
        if reg_type in penalty_list:
            reg_params_penalty += [regular]
        elif reg_type in noise_list:
            reg_params_noise += [regular]
        else:
            print 'Hypergrad not implemented for ', reg_type

    # get gradient w.r.t. hyper params
    if reg_params_penalty:
        dloss_dpenalty = T.grad(loss, penalty_list)
        # forward & backward to obtain gradients
        meta_fwbw_ele = theano.function([x, y], dloss_dpenalty)
        grad_penalty = meta_fwbw_ele(batchx, batchy)

    # if reg_params_noise:
    #     dele_dtheta = T.grad(loss_ele, params_ele)
    #     dele_dtheta = [-grad for grad in dele_dtheta]
    #     grad_noise = T.Lop(dele_dtheta, reg_params_noise, dL_dtheta)

    # outputs
    params_hyper = reg_params_penalty + reg_params_noise
    dvalid_dgamma = grad_penalty + grad_noise

    return dloss_dpenalty, dvalid_dgamma
def check_mat_rop_lop(self, y, out_shape):
    """
    Test the Rop/Lop when input is a matrix and the output is a vector

    :param y: the output variable of the op applied to self.mx
    :param out_shape: Used to generate a random tensor corresponding
        to the evaluation point of the Rop (i.e. the tensor with
        which you multiply the Jacobian). It should be a tuple of ints.

    If the Op has more than 1 input, one of them must be mx, while
    others must be shared variables / constants. We will test only
    against the input self.mx, so you must call
    check_mat_rop_lop/check_rop_lop for the other inputs.

    We expect all inputs/outputs have dtype floatX.

    If you want to test an Op with an output matrix, add a sum
    after the Op you want to test.
    """
    vx = numpy.asarray(self.rng.uniform(size=self.mat_in_shape),
                       theano.config.floatX)
    vv = numpy.asarray(self.rng.uniform(size=self.mat_in_shape),
                       theano.config.floatX)
    yv = tensor.Rop(y, self.mx, self.mv)
    rop_f = function([self.mx, self.mv], yv, on_unused_input='ignore')
    sy, _ = theano.scan(lambda i, y, x, v: (tensor.grad(y[i], x) * v).sum(),
                        sequences=tensor.arange(y.shape[0]),
                        non_sequences=[y, self.mx, self.mv])
    scan_f = function([self.mx, self.mv], sy, on_unused_input='ignore')

    v1 = rop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert numpy.allclose(v1, v2), ('ROP mismatch: %s %s' % (v1, v2))

    self.check_nondiff_rop(theano.clone(y,
                                        replace={self.mx: break_op(self.mx)}))

    vv = numpy.asarray(self.rng.uniform(size=out_shape),
                       theano.config.floatX)
    yv = tensor.Lop(y, self.mx, self.v)
    lop_f = function([self.mx, self.v], yv)

    sy = tensor.grad((self.v * y).sum(), self.mx)
    scan_f = function([self.mx, self.v], sy)

    v1 = lop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert numpy.allclose(v1, v2), ('LOP mismatch: %s %s' % (v1, v2))
def check_rop_lop(self, y, out_shape):
    """
    As check_mat_rop_lop, except the input is self.x which is a
    vector. The output is still a vector.
    """
    # TEST ROP
    vx = numpy.asarray(self.rng.uniform(size=self.in_shape),
                       theano.config.floatX)
    vv = numpy.asarray(self.rng.uniform(size=self.in_shape),
                       theano.config.floatX)

    yv = tensor.Rop(y, self.x, self.v)
    rop_f = function([self.x, self.v], yv, on_unused_input='ignore')
    J, _ = theano.scan(lambda i, y, x: tensor.grad(y[i], x),
                       sequences=tensor.arange(y.shape[0]),
                       non_sequences=[y, self.x])
    sy = tensor.dot(J, self.v)

    scan_f = function([self.x, self.v], sy, on_unused_input='ignore')

    v1 = rop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert numpy.allclose(v1, v2), ('ROP mismatch: %s %s' % (v1, v2))

    known_fail = False
    try:
        self.check_nondiff_rop(theano.clone(y,
                                            replace={self.x: break_op(self.x)}))
    except AssertionError:
        known_fail = True

    # TEST LOP
    vx = numpy.asarray(self.rng.uniform(size=self.in_shape),
                       theano.config.floatX)
    vv = numpy.asarray(self.rng.uniform(size=out_shape),
                       theano.config.floatX)

    yv = tensor.Lop(y, self.x, self.v)
    lop_f = function([self.x, self.v], yv, on_unused_input='ignore')
    J, _ = theano.scan(lambda i, y, x: tensor.grad(y[i], x),
                       sequences=tensor.arange(y.shape[0]),
                       non_sequences=[y, self.x])
    sy = tensor.dot(self.v, J)

    scan_f = function([self.x, self.v], sy)

    v1 = lop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert numpy.allclose(v1, v2), ('LOP mismatch: %s %s' % (v1, v2))

    if known_fail:
        raise SkipTest('Rop does not handle non-differentiable inputs '
                       'correctly. Bug exposed by fixing Add.grad method.')
def test_rop_lop():
    mx = tensor.matrix('mx')
    mv = tensor.matrix('mv')
    v = tensor.vector('v')
    y = matrix_inverse(mx).sum(axis=0)

    yv = tensor.Rop(y, mx, mv)
    rop_f = function([mx, mv], yv)
    sy, _ = theano.scan(lambda i, y, x, v: (tensor.grad(y[i], x) * v).sum(),
                        sequences=tensor.arange(y.shape[0]),
                        non_sequences=[y, mx, mv])
    scan_f = function([mx, mv], sy)

    rng = numpy.random.RandomState(utt.fetch_seed())
    vx = numpy.asarray(rng.randn(4, 4), theano.config.floatX)
    vv = numpy.asarray(rng.randn(4, 4), theano.config.floatX)

    v1 = rop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert _allclose(v1, v2), ('ROP mismatch: %s %s' % (v1, v2))

    raised = False
    try:
        tensor.Rop(
            theano.clone(y, replace={mx: break_op(mx)}),
            mx,
            mv)
    except ValueError:
        raised = True
    if not raised:
        raise Exception((
            'Op did not raised an error even though the function'
            ' is not differentiable'))

    vv = numpy.asarray(rng.uniform(size=(4,)), theano.config.floatX)
    yv = tensor.Lop(y, mx, v)
    lop_f = function([mx, v], yv)

    sy = tensor.grad((v * y).sum(), mx)
    scan_f = function([mx, v], sy)

    v1 = lop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert _allclose(v1, v2), ('LOP mismatch: %s %s' % (v1, v2))