The following 44 code examples, extracted from open-source Python projects, illustrate how to use sklearn.utils.extmath.safe_sparse_dot().
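As a quick orientation (a minimal sketch of my own, not one of the extracted examples): safe_sparse_dot(a, b) computes the dot product whether its arguments are dense ndarrays or scipy sparse matrices, and dense_output=True guarantees a dense result.

import numpy as np
import scipy.sparse as sp
from sklearn.utils.extmath import safe_sparse_dot

# a sparse feature matrix (5 samples, 3 features) and a dense weight vector
X = sp.random(5, 3, density=0.5, format='csr', random_state=0)
w = np.random.RandomState(0).randn(3)

# no special-casing needed for sparse @ dense; dense_output=True returns an ndarray
scores = safe_sparse_dot(X, w, dense_output=True)
print(scores.shape)  # (5,)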
def predict(self, X):
    """Predict data using the ``centroids_`` of subclusters.

    Avoid computation of the row norms of X.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Input data.

    Returns
    -------
    labels : ndarray, shape(n_samples)
        Labelled data.
    """
    X = check_array(X, accept_sparse='csr')
    self._check_fit(X)
    reduced_distance = safe_sparse_dot(X, self.subcluster_centers_.T)
    reduced_distance *= -2
    reduced_distance += self._subcluster_norms
    return self.subcluster_labels_[np.argmin(reduced_distance, axis=1)]
def _fit_owl_fista(X, y, w, loss, max_iter=500, max_linesearch=20, eta=2.0,
                   tol=1e-3, verbose=0):

    # least squares loss
    def sfunc(coef, grad=False):
        y_scores = safe_sparse_dot(X, coef)
        if grad:
            obj, lp = loss(y, y_scores, return_derivative=True)
            grad = safe_sparse_dot(X.T, lp)
            return obj, grad
        else:
            return loss(y, y_scores)

    def nsfunc(coef, L):
        return prox_owl(coef, w / L)

    coef = np.zeros(X.shape[1])
    return fista(sfunc, nsfunc, coef, max_iter, max_linesearch, eta, tol,
                 verbose)
def _intercept_dot(w, X, y):
    """Computes y * np.dot(X, w).

    It takes into consideration if the intercept should be fit or not.

    Parameters
    ----------
    w : ndarray, shape (n_features,) or (n_features + 1,)
        Coefficient vector.

    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training data.

    y : ndarray, shape (n_samples,)
        Array of labels.
    """
    c = 0.
    if w.size == X.shape[1] + 1:
        c = w[-1]
        w = w[:-1]

    z = safe_sparse_dot(X, w) + c
    return w, c, y * z
def polynomial_kernel(self, x, y, gamma):
    """
    Custom polynomial kernel function, similarities of vectors over polynomials
    :param x: array of input vectors
    :param y: array of input vectors
    :param gamma: gamma
    :returns:
        - pk: inner product
    """
    c = 1
    degree = 2

    pk = ssd(x, y.T, dense_output=True)
    pk *= gamma
    pk += c
    pk **= degree
    return pk
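For reference (my own sanity check, not part of the source project), the method above computes (gamma * x @ y.T + 1) ** 2, which should match sklearn's built-in polynomial kernel with degree=2 and coef0=1:

import numpy as np
from sklearn.metrics.pairwise import polynomial_kernel as sk_poly

rng = np.random.RandomState(0)
A, B = rng.randn(4, 6), rng.randn(3, 6)
gamma = 0.5

# same Gram matrix the custom kernel above would return for x=A, y=B
expected = sk_poly(A, B, degree=2, gamma=gamma, coef0=1)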
def sigmoid_kernel(self, x, y, gamma):
    """
    Custom sigmoid kernel function, similarities of vectors in a sigmoid kernel matrix
    :param x: array of input vectors
    :param y: array of input vectors
    :param gamma: gamma
    :returns:
        - sk: inner product
    """
    c = 1

    sk = ssd(x, y.T, dense_output=True)
    sk *= gamma
    sk += c
    # apply tanh and return the transformed matrix (the original snippet wrote the
    # tanh result into a throwaway copy and returned the untransformed values)
    sk = np.tanh(np.array(sk, dtype='float64'))
    return sk
def _compute_input_activations(self, X):
    """Compute input activations given X"""

    n_samples = X.shape[0]

    mlp_acts = np.zeros((n_samples, self.n_hidden))
    if self._use_mlp_input:
        b = self.components_['biases']
        w = self.components_['weights']
        mlp_acts = self.alpha * (safe_sparse_dot(X, w) + b)

    rbf_acts = np.zeros((n_samples, self.n_hidden))
    if self._use_rbf_input:
        radii = self.components_['radii']
        centers = self.components_['centers']
        scale = self.rbf_width * (1.0 - self.alpha)
        rbf_acts = scale * cdist(X, centers) / radii

    self.input_activations_ = mlp_acts + rbf_acts
def _decision_function(self, X):
    """Predict using the linear model

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)

    Returns
    -------
    array, shape (n_samples,)
        Predicted target values per element in X.
    """
    check_is_fitted(self, ["t_", "coef_", "intercept_"], all_or_any=all)

    X = check_array(X, accept_sparse='csr')

    scores = safe_sparse_dot(X, self.coef_.T,
                             dense_output=True) + self.intercept_
    return scores.ravel()
def test_sparse_decision_function():
    # Test decision_function

    # Sanity check, test that decision_function implemented in python
    # returns the same as the one in libsvm

    # multi class:
    svc = svm.SVC(kernel='linear', C=0.1, decision_function_shape='ovo')
    clf = svc.fit(iris.data, iris.target)

    dec = safe_sparse_dot(iris.data, clf.coef_.T) + clf.intercept_

    assert_array_almost_equal(dec, clf.decision_function(iris.data))

    # binary:
    clf.fit(X, Y)
    dec = np.dot(X, clf.coef_.T) + clf.intercept_
    prediction = clf.predict(X)
    assert_array_almost_equal(dec.ravel(), clf.decision_function(X))
    assert_array_almost_equal(
        prediction,
        clf.classes_[(clf.decision_function(X) > 0).astype(np.int).ravel()])
    expected = np.array([-1., -0.66, -1., 0.66, 1., 1.])
    assert_array_almost_equal(clf.decision_function(X), expected, 2)
def fgrad(we, X, y, l1, l2):
    nsamples, nfactors = X.shape

    w0 = we[0]
    w = we[1:(nfactors + 1)] - we[(nfactors + 1):]
    yz = y * (safe_sparse_dot(X, w) + w0)

    f = - np.sum(log_logistic(yz)) + l1 * np.sum(we[1:]) + \
        0.5 * l2 * np.dot(w, w)

    e = (expit(yz) - 1) * y
    g = safe_sparse_dot(X.T, e) + l2 * w
    g0 = np.sum(e)

    grad = np.concatenate([g, -g]) + l1
    grad = np.insert(grad, 0, g0)

    return f, grad
def score(self, user, candidates, context):
    # i_mat is (n_item_context, n_item) for all possible items
    # extract only target items
    i_mat = self.i_mat[:, candidates]

    n_target = len(candidates)

    # u_mat will be (n_user_context, n_item) for the target user
    u_vec = np.concatenate((user.feature, context))
    u_vec = np.array([u_vec]).T
    u_mat = sp.csr_matrix(np.repeat(u_vec, n_target, axis=1))

    # stack them into (p, n_item) matrix
    Y = sp.vstack((u_mat, i_mat))
    Y = self.proj.reduce(Y)
    Y = sp.csr_matrix(preprocessing.normalize(Y, norm='l2', axis=0))

    X = np.identity(self.k) - np.dot(self.U_r, self.U_r.T)
    A = safe_sparse_dot(X, Y, dense_output=True)

    return ln.norm(A, axis=0, ord=2)
def _mean_hiddens(self, v, temperature=1.0):
    """Computes the probabilities P(h=1|v).

    v : array-like, shape (n_samples, n_features)
        Values of the visible layer.

    Returns
    -------
    h : array-like, shape (n_samples, n_components)
        Corresponding mean field values for the hidden layer.
    """
    p = safe_sparse_dot(v, self.components_.T / temperature)
    p += self.intercept_hidden_ / (min(1.0, temperature) if BIASED_PRIOR
                                   else temperature)
    return expit(p, out=p)
def _free_energy(self, v):
    """Computes the free energy F(v) = - log sum_h exp(-E(v,h)).

    v : array-like, shape (n_samples, n_features)
        Values of the visible layer.

    Returns
    -------
    free_energy : array-like, shape (n_samples,)
        The value of the free energy.
    """
    return (- safe_sparse_dot(v, self.intercept_visible_)
            - np.logaddexp(0, safe_sparse_dot(v, self.components_.T)
                           + self.intercept_hidden_).sum(axis=1))
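Side note (mine, not from the source project): the return value above is the standard closed form of the binary RBM free energy, which is why logaddexp(0, x) = log(1 + e^x) appears:

F(v) = -v \cdot b - \sum_j \log\bigl(1 + \exp(c_j + (v W^\top)_j)\bigr)

where b is intercept_visible_, c is intercept_hidden_, and W is components_.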
def _fit(self, v_pos):
    """Inner fit for one mini-batch.

    Adjust the parameters to maximize the likelihood of v using
    Stochastic Maximum Likelihood (SML).

    v_pos : array-like, shape (n_samples, n_features)
        The data to use for training.
    """
    h_pos = self._mean_hiddens(v_pos)
    # TODO: Worth trying with visible probabilities rather than binary states.
    # PG: 'it is common to use p_i instead of sampling a binary value'...
    # 'it reduces sampling noise thus allowing faster learning. There is some
    # evidence that it leads to slightly worse density models'

    # I'm confounded by the fact that we seem to get more effective models
    # WITHOUT softmax visible units. The only explanation I can think of is
    # that it's like a pseudo-version of using visible probabilities.
    # Without softmax, v_neg can have multiple 1s per one-hot vector, which
    # maybe somehow accelerates learning? Need to think about this some more.
    v_neg = self._sample_visibles(self.h_samples_)
    h_neg = self._mean_hiddens(v_neg)

    lr = float(self.learning_rate) / v_pos.shape[0]
    update = safe_sparse_dot(v_pos.T, h_pos, dense_output=True).T
    update -= np.dot(h_neg.T, v_neg) / self.fantasy_to_batch
    # L2 weight penalty
    update -= self.components_ * self.weight_cost
    self.components_ += lr * update
    self.intercept_hidden_ += lr * (h_pos.sum(axis=0)
                                    - h_neg.sum(axis=0) / self.fantasy_to_batch)
    self.intercept_visible_ += lr * (np.asarray(v_pos.sum(axis=0)).squeeze()
                                     - v_neg.sum(axis=0) / self.fantasy_to_batch)

    h_neg[self.rng_.uniform(size=h_neg.shape) < h_neg] = 1.0  # sample binomial
    self.h_samples_ = np.floor(h_neg, h_neg)
def norms_svec_resampling(X, orthogonal_basis, rank, num_samples=1000):
    """
    Resampling procedure described in AJIVE paper for Wedin bound

    Parameters
    ---------
    orthogonal_basis: basis vectors for the orthogonal complement
        of the score space

    X: the observed data

    rank: number of columns to resample

    num_samples: how many resamples

    Output
    ------
    an array of the resampled norms
    """
    # TODO: rename some arguments e.g. orthogonal_basis -- this is not really
    # a basis, also resampled_projection

    resampled_norms = [0] * num_samples

    for s in range(num_samples):
        # sample columns from orthogonal basis
        sampled_col_index = np.random.choice(orthogonal_basis.shape[1],
                                             size=rank, replace=True)
        resampled_basis = orthogonal_basis[:, sampled_col_index]
        # ^ this is V* from AJIVE p12

        # project observed data
        # resampled_projection = np.dot(X, resampled_basis)
        resampled_projection = safe_sparse_dot(X, resampled_basis)

        # compute resampled operator L2 norm
        resampled_norms[s] = np.linalg.norm(resampled_projection, ord=2)

    return resampled_norms
def transform(self, X, y=None):
    return safe_sparse_dot(self.vect_.transform(X), self.embeds)
def fit_transform(self, X, y=None, **fit_params):
    self.vect_ = TfidfVectorizer(vocabulary=self.embed_vocab,
                                 analyzer=_lower_words_getter,
                                 norm='l1', use_idf=False)
    return safe_sparse_dot(self.vect_.fit_transform(X), self.embeds)
def joint_feature(self, x, y):
    if isinstance(y, DocLabel):
        Y_prop, Y_link, compat, second_order = self._marg_rounded(x, y)
    else:
        Y_prop, Y_link, compat, second_order = self._marg_fractional(x, y)

    prop_acc = safe_sparse_dot(Y_prop.T, x.X_prop)  # node_cls * node_feats
    link_acc = safe_sparse_dot(Y_link.T, x.X_link)  # link_cls * link_feats

    f_sec_ord = []

    if len(second_order):
        second_order = second_order.reshape(-1, len(x.second_order))
        if self.coparents:
            f_sec_ord.append(safe_sparse_dot(second_order[0], x.X_sec_ord))
            second_order = second_order[1:]
        if self.grandparents:
            f_sec_ord.append(safe_sparse_dot(second_order[0], x.X_sec_ord))
            second_order = second_order[1:]
        if self.siblings:
            f_sec_ord.append(safe_sparse_dot(second_order[0], x.X_sec_ord))

    elif self.n_second_order_factors_:
        # document has no second order factors so the joint feature
        # must be filled with zeros manually
        f_sec_ord = [np.zeros(self.n_second_order_features_)
                     for _ in range(self.n_second_order_factors_)]

    jf = np.concatenate([prop_acc.ravel(), link_acc.ravel(), compat.ravel()]
                        + f_sec_ord)

    return jf

# basically reversing the joint feature
def transform_lsi(self, X):
    """ LSI transform, normalized by the inverse of the eigen values"""
    X = check_array(X, accept_sparse='csr')
    return safe_sparse_dot(X, self.components_.T).dot(
        np.diag(1. / self.singular_values_[:self.n_components]))
def update_X(X, mu, k=6):
    # U, S, VT = svdp(X, k=k)
    U, S, VT = svds(X, k=k, which='LM')
    P = np.c_[np.ones((k, 1)), 1 - S, 1. / 2. / mu - S]
    sigma_star = np.zeros(k)
    for t in range(k):
        p = P[t, :]
        delta = p[1] ** 2 - 4 * p[0] * p[2]
        if delta <= 0:
            sigma_star[t] = 0.
        else:
            solution = np.roots(p)
            solution = solution.tolist()
            solution.sort(key=abs)
            solution = np.array(solution)
            if solution[0] * solution[1] <= 0:
                sigma_star[t] = solution[1]
            elif solution[1] < 0:
                sigma_star[t] = 0.
            else:
                # note: the original snippet referenced an undefined name `s`;
                # the singular values S from svds are clearly what is meant
                f = np.log(1 + solution[1]) + mu * (solution[1] - S[t]) ** 2
                if f > mu * S[t] ** 2:
                    sigma_star[t] = 0.
                else:
                    sigma_star[t] = solution[1]

    sigma_star = sp.csr_matrix(np.diag(sigma_star))
    sigma_star = safe_sparse_dot(safe_sparse_dot(U, sigma_star), VT)
    sigma_star[abs(sigma_star) < 1e-10] = 0
    return sp.lil_matrix(sigma_star)
def _decision_function(self, X):
    return safe_sparse_dot(X, self.coef_)
def _get_output(self, X):
    y_pred = _poly_predict(X, self.P_[0, :, :], self.lams_, kernel='anova',
                           degree=self.degree)

    if self.fit_linear:
        y_pred += safe_sparse_dot(X, self.w_)

    if self.fit_lower == 'explicit' and self.degree == 3:
        # degree cannot currently be > 3
        y_pred += _poly_predict(X, self.P_[1, :, :], self.lams_,
                                kernel='anova', degree=2)

    return y_pred
def _D(X, P, degree=2):
    """The "replacement" part of the homogeneous polynomial kernel.

    D[i, j] = sum_k [(X_ik * P_jk) ** degree]
    """
    return safe_sparse_dot(safe_power(X, degree), P.T ** degree)
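A small check of the docstring formula (my own, assuming safe_power is an elementwise power that also handles sparse input, so that the dense equivalent is X ** degree):

import numpy as np

rng = np.random.RandomState(0)
X, P = rng.randn(4, 3), rng.randn(2, 3)

# dense equivalent of the helper above for degree=2
D = (X ** 2).dot(P.T ** 2)
assert np.allclose(D[1, 0], np.sum((X[1] * P[0]) ** 2))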
def _compute_hidden_activations(self, X):
    """Compute the hidden activations."""

    hidden_activations = safe_sparse_dot(X, self.coef_hidden_)
    hidden_activations += self.intercept_hidden_

    # Apply the activation method
    activation = ACTIVATIONS[self.activation]
    hidden_activations = activation(hidden_activations)

    return hidden_activations
def _decision_scores(self, X):
    """Predict using the ELM model

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        The input data.

    Returns
    -------
    y_pred : array-like, shape (n_samples,) or (n_samples, n_outputs)
        The predicted values.
    """
    X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])

    if self.batch_size is None:
        hidden_activations = self._compute_hidden_activations(X)
        y_pred = safe_sparse_dot(hidden_activations, self.coef_output_)
    else:
        n_samples = X.shape[0]
        batches = gen_batches(n_samples, self.batch_size)

        y_pred = np.zeros((n_samples, self.n_outputs_))
        for batch in batches:
            h_batch = self._compute_hidden_activations(X[batch])
            y_pred[batch] = safe_sparse_dot(h_batch, self.coef_output_)

    return y_pred
def linear_kernel_matrix(self, x, y):
    """
    Custom inner product. The returned matrix is a Linear Kernel Gram Matrix

    :param x: your dataset
    :param y: another dataset (sth like transposed(x))
    """
    return ssd(x, y, dense_output=True)
def tf_map(real, imag):
    sqrt_i = np.sqrt(ssd(imag.T, imag))
    sqrt_r = np.sqrt(ssd(real.T, real))
    stft_k = ssd(real.T, imag)
    return stft_k / (sqrt_i * sqrt_r)
def SGD(a, lab, Q, reg_param):
    iterations = 1
    for i in xrange(7):
        for tau in xrange(len(a)):
            wx = ssd(a, Q[tau, :], dense_output=True)
            a = a * (1 - 1 / iterations)
            if lab[tau] * wx < 1:
                a[tau] = lab[tau] / (reg_param * iterations)
            iterations += 1
    return a
def _forward_pass(self, activations, with_output_activation=True):
    """Perform a forward pass on the network by computing the values
    of the neurons in the hidden layers and the output layer.

    Parameters
    ----------
    activations: list, length = n_layers - 1
        The ith index of the list holds the values of the ith layer.

    with_output_activation : bool, default True
        If True, the output passes through the output activation
        function, which is either the softmax function or the
        logistic function
    """
    # Iterate over the hidden layers
    for i in range(self.n_layers_ - 1):
        activations[i + 1] = safe_sparse_dot(activations[i],
                                             self.layers_coef_[i])
        activations[i + 1] += self.layers_intercept_[i]

        # For the hidden layers
        if i + 1 != self.n_layers_ - 1:
            hidden_activation = ACTIVATIONS[self.activation]
            activations[i + 1] = hidden_activation(activations[i + 1])

    # For the last layer
    if with_output_activation:
        output_activation = ACTIVATIONS[self.out_activation_]
        activations[i + 1] = output_activation(activations[i + 1])

    return activations
def _compute_cost_grad(self, layer, n_samples, activations, deltas,
                       coef_grads, intercept_grads):
    """Compute the cost gradient for the layer."""
    coef_grads[layer] = safe_sparse_dot(activations[layer].T,
                                        deltas[layer]) / n_samples
    coef_grads[layer] += (self.alpha * self.layers_coef_[layer])

    intercept_grads[layer] = np.mean(deltas[layer], 0)

    return coef_grads, intercept_grads
def _fit_regression(self, y):
    """
    fit regression using pseudo-inverse
    or supplied regressor
    """
    if self.regressor is None:
        self.coefs_ = safe_sparse_dot(pinv2(self.hidden_activations_), y)
    else:
        self.regressor.fit(self.hidden_activations_, y)

    self.fitted_ = True
def _get_predictions(self):
    """get predictions using internal least squares/supplied regressor"""
    if self.regressor is None:
        preds = safe_sparse_dot(self.hidden_activations_, self.coefs_)
    else:
        preds = self.regressor.predict(self.hidden_activations_)

    return preds
def getWordTypeCooccurPieces(self, dtype=np.float32):
    """ Calculate building blocks for word-word cooccur calculation

    These pieces can be used for incremental construction.

    Returns
    -------
    Q : 2D matrix, W x W (where W is vocab_size)
    sameWordVec : 1D array, size W
    nDoc : scalar
    """
    sameWordVec = np.zeros(self.vocab_size)
    data = np.zeros(self.word_count.shape, dtype=dtype)

    for docID in xrange(self.nDoc):
        start = self.doc_range[docID]
        stop = self.doc_range[docID + 1]

        N = self.word_count[start:stop].sum()
        NNm1 = N * (N - 1)
        sameWordVec[self.word_id[start:stop]] += \
            self.word_count[start:stop] / NNm1
        data[start:stop] = self.word_count[start:stop] / np.sqrt(NNm1)

    # Now, create a sparse matrix that's D x V
    sparseDocWordMat = scipy.sparse.csr_matrix(
        (data, self.word_id, self.doc_range),
        shape=(self.nDoc, self.vocab_size),
        dtype=dtype)

    # Q : V x V
    from sklearn.utils.extmath import safe_sparse_dot
    Q = safe_sparse_dot(
        sparseDocWordMat.T, sparseDocWordMat, dense_output=1)
    return Q, sameWordVec, self.nDoc
def test_svc_with_custom_kernel():
    kfunc = lambda x, y: safe_sparse_dot(x, y.T)
    clf_lin = svm.SVC(kernel='linear').fit(X_sp, Y)
    clf_mylin = svm.SVC(kernel=kfunc).fit(X_sp, Y)
    assert_array_equal(clf_lin.predict(X_sp), clf_mylin.predict(X_sp))
def predict(self, X):
    # integer division so the slices below receive integer indices
    n = (len(self.we) - 1) // 2
    w0 = self.we[0]
    w = self.we[1:n + 1] - self.we[n + 1:]
    return expit(w0 + safe_sparse_dot(X, w))
def reduce(self, Y):
    return safe_sparse_dot(self.E, Y)
def reduce(self, Y):
    return safe_sparse_dot(self.R, Y)
def reduce(self, Y):
    return safe_sparse_dot(self.W1, Y) * safe_sparse_dot(self.W2, Y) / np.sqrt(self.k)
def score(self, user, candidates, context):
    # i_mat is (n_item_context, n_item) for all possible items
    # extract only target items
    i_mat = self.i_mat[:, candidates]

    n_target = len(candidates)

    u_vec = user.encode(dim=self.n_user, index=self.use_index,
                        feature=True, vertical=True)
    u_vec = np.concatenate((u_vec, np.array([context]).T))
    u_mat = sp.csr_matrix(np.repeat(u_vec, n_target, axis=1))

    mat = sp.vstack((u_mat, i_mat))

    # Matrix A and B should be dense (numpy array; rather than scipy CSR matrix)
    # because V is dense.
    V = sp.csr_matrix(self.V)

    A = safe_sparse_dot(V.T, mat)
    A.data[:] = A.data ** 2

    sq_mat = mat.copy()
    sq_mat.data[:] = sq_mat.data ** 2

    sq_V = V.copy()
    sq_V.data[:] = sq_V.data ** 2

    B = safe_sparse_dot(sq_V.T, sq_mat)

    interaction = (A - B).sum(axis=0)
    interaction /= 2.  # (1, n_item); numpy matrix form

    pred = self.w0 + safe_sparse_dot(self.w, mat, dense_output=True) + interaction

    return np.abs(1. - np.ravel(pred))
def _get_potentials(self, x, w):
    # check sizes?
    n_node_coefs = self.n_prop_states * self.n_prop_features
    n_link_coefs = self.n_link_states * self.n_link_features
    n_compat_coefs = self.n_prop_states ** 2 * self.n_link_states

    if self.compat_features:
        n_compat_coefs *= self.n_compat_features_

    assert w.size == (n_node_coefs + n_link_coefs + n_compat_coefs +
                      self.n_second_order_features_ *
                      self.n_second_order_factors_)

    w_node = w[:n_node_coefs]
    w_node = w_node.reshape(self.n_prop_states, self.n_prop_features)

    w_link = w[n_node_coefs:n_node_coefs + n_link_coefs]
    w_link = w_link.reshape(self.n_link_states, self.n_link_features)

    # for readability, consume w. This is not inplace, don't worry.
    w = w[n_node_coefs + n_link_coefs:]

    w_compat = w[:n_compat_coefs]

    if self.compat_features:
        w_compat = w_compat.reshape((self.n_compat_features_, -1))
        w_compat = np.dot(x.X_compat, w_compat)
        compat_potentials = w_compat.reshape((-1,
                                              self.n_prop_states,
                                              self.n_prop_states,
                                              self.n_link_states))
    else:
        compat_potentials = w_compat.reshape(self.n_prop_states,
                                             self.n_prop_states,
                                             self.n_link_states)

    w = w[n_compat_coefs:]

    coparent_potentials = grandparent_potentials = sibling_potentials = []

    if self.coparents:
        w_coparent = w[:self.n_second_order_features_]
        coparent_potentials = safe_sparse_dot(x.X_sec_ord, w_coparent)
        w = w[self.n_second_order_features_:]

    if self.grandparents:
        w_grandparent = w[:self.n_second_order_features_]
        grandparent_potentials = safe_sparse_dot(x.X_sec_ord, w_grandparent)
        w = w[self.n_second_order_features_:]

    if self.siblings:
        w_sibling = w[:self.n_second_order_features_]
        sibling_potentials = safe_sparse_dot(x.X_sec_ord, w_sibling)

    prop_potentials = safe_sparse_dot(x.X_prop, w_node.T)
    link_potentials = safe_sparse_dot(x.X_link, w_link.T)

    return (prop_potentials, link_potentials, compat_potentials,
            coparent_potentials, grandparent_potentials, sibling_potentials)
def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None):
    """Computes the logistic loss and gradient.

    Parameters
    ----------
    w : ndarray, shape (n_features,) or (n_features + 1,)
        Coefficient vector.

    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training data.

    y : ndarray, shape (n_samples,)
        Array of labels.

    alpha : float
        Regularization parameter. alpha is equal to 1 / C.

    sample_weight : array-like, shape (n_samples,), optional
        Array of weights that are assigned to individual samples.
        If not provided, then each sample is given unit weight.

    Returns
    -------
    out : float
        Logistic loss.

    grad : ndarray, shape (n_features,) or (n_features + 1,)
        Logistic gradient.
    """
    _, n_features = X.shape
    grad = np.empty_like(w)

    w, c, yz = _intercept_dot(w, X, y)

    if sample_weight is None:
        sample_weight = np.ones(y.shape[0])

    # Logistic loss is the negative of the log of the logistic function.
    out = -np.sum(sample_weight * log_logistic(yz)) + .5 * alpha * np.dot(w, w)

    z = expit(yz)
    z0 = sample_weight * (z - 1) * y

    grad[:n_features] = safe_sparse_dot(X.T, z0) + alpha * w

    # Case where we fit the intercept.
    if grad.shape[0] > n_features:
        grad[-1] = z0.sum()
    return out, grad
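A hedged usage sketch (hypothetical data; assumes _logistic_loss_and_grad and the helpers it relies on, such as _intercept_dot and log_logistic, are importable in the current scope):

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(20, 5)
y = np.where(rng.randn(20) > 0, 1., -1.)   # labels encoded as -1/+1, as _intercept_dot expects
w = np.zeros(5 + 1)                        # last entry is the intercept term

loss, grad = _logistic_loss_and_grad(w, X, y, alpha=1.0)
print(loss, grad.shape)                    # scalar loss, gradient of shape (6,)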
def _multinomial_loss(w, X, Y, alpha, sample_weight):
    """Computes multinomial loss and class probabilities.

    Parameters
    ----------
    w : ndarray, shape (n_classes * n_features,) or
        (n_classes * (n_features + 1),)
        Coefficient vector.

    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training data.

    Y : ndarray, shape (n_samples, n_classes)
        Transformed labels according to the output of LabelBinarizer.

    alpha : float
        Regularization parameter. alpha is equal to 1 / C.

    sample_weight : array-like, shape (n_samples,), optional
        Array of weights that are assigned to individual samples.
        If not provided, then each sample is given unit weight.

    Returns
    -------
    loss : float
        Multinomial loss.

    p : ndarray, shape (n_samples, n_classes)
        Estimated class probabilities.

    w : ndarray, shape (n_classes, n_features)
        Reshaped param vector excluding intercept terms.

    Reference
    ---------
    Bishop, C. M. (2006). Pattern recognition and machine learning.
    Springer. (Chapter 4.3.4)
    """
    n_classes = Y.shape[1]
    n_features = X.shape[1]
    fit_intercept = w.size == (n_classes * (n_features + 1))
    w = w.reshape(n_classes, -1)
    alpha = alpha.reshape(n_classes, -1)
    sample_weight = sample_weight[:, np.newaxis]
    if fit_intercept:
        intercept = w[:, -1]
        w = w[:, :-1]
    else:
        intercept = 0
    p = safe_sparse_dot(X, w.T)
    p += intercept
    p -= logsumexp(p, axis=1)[:, np.newaxis]
    loss = -(sample_weight * Y * p).sum()
    loss += 0.5 * ((alpha + L2_REG) * w * w).sum()
    p = np.exp(p, p)
    return loss, p, w
def _multinomial_loss_grad(w, X, Y, alpha, sample_weight):
    """Computes the multinomial loss, gradient and class probabilities.

    Parameters
    ----------
    w : ndarray, shape (n_classes * n_features,) or
        (n_classes * (n_features + 1),)
        Coefficient vector.

    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training data.

    Y : ndarray, shape (n_samples, n_classes)
        Transformed labels according to the output of LabelBinarizer.

    alpha : float
        Regularization parameter. alpha is equal to 1 / C.

    sample_weight : array-like, shape (n_samples,), optional
        Array of weights that are assigned to individual samples.

    Returns
    -------
    loss : float
        Multinomial loss.

    grad : ndarray, shape (n_classes * n_features,) or
        (n_classes * (n_features + 1),)
        Ravelled gradient of the multinomial loss.

    p : ndarray, shape (n_samples, n_classes)
        Estimated class probabilities

    Reference
    ---------
    Bishop, C. M. (2006). Pattern recognition and machine learning.
    Springer. (Chapter 4.3.4)
    """
    n_classes = Y.shape[1]
    n_features = X.shape[1]
    fit_intercept = (w.size == n_classes * (n_features + 1))
    grad = np.zeros((n_classes, n_features + bool(fit_intercept)))
    alpha = alpha.reshape(n_classes, -1)
    loss, p, w = _multinomial_loss(w, X, Y, alpha, sample_weight)
    sample_weight = sample_weight[:, np.newaxis]
    diff = sample_weight * (p - Y)
    grad[:, :n_features] = safe_sparse_dot(diff.T, X)
    grad[:, :n_features] += (alpha + L2_REG) * w
    if fit_intercept:
        grad[:, -1] = diff.sum(axis=0)
    return loss, grad.ravel(), p