小能豆

Speeding up matrix-vector multiplication and exponentiation in Python, possibly by calling C/C++


I am currently working on a machine learning project where, given a data matrix Z and a vector rho, I have to compute the value and slope of the logistic loss function at rho. The computation involves basic matrix-vector multiplication and log/exp operations, with a trick to avoid numerical overflow.

I currently do this in Python using NumPy as shown below (for reference, this code runs in 0.2 seconds). Although it works well, I would like to speed it up, since I call this function many times in my code (it accounts for over 90% of the computation in my project).

I am looking for any way to improve the runtime of this code without parallelization (i.e. only 1 CPU). I am happy to use any publicly available Python package, or to call C or C++ (since I have heard this can improve runtimes by an order of magnitude). Preprocessing the data matrix Z is also fine. Some properties could be exploited for faster computation: the vector rho is usually sparse (around 50% of its entries are 0), and there are usually far more rows than columns (n_cols <= 100 in most cases).


import time
import numpy as np

np.__config__.show() #make sure BLAS/LAPACK is being used
np.random.seed(seed = 0)

#initialize data matrix X and label vector Y
n_rows, n_cols = int(1e6), 100
X = np.random.random(size=(n_rows, n_cols))
Y = np.random.randint(low=0, high=2, size=(n_rows, 1))
Y[Y==0] = -1
Z = X*Y # all operations are carried out on Z

def compute_logistic_loss_value_and_slope(rho, Z):
    #compute the value and slope of the logistic loss function in a way that is numerically stable
    #loss_value: (1 x 1) scalar = 1/n_rows * sum(log(1 .+ exp(-Z*rho)))
    #loss_slope: (n_cols x 1) vector = 1/n_rows * Z' * (phi .- 1), where phi = 1 ./ (1 .+ exp(-Z*rho))
    #see also: https://stackoverflow.com/questions/20085768/

    scores = Z.dot(rho)
    pos_idx = scores > 0
    exp_scores_pos = np.exp(-scores[pos_idx])
    exp_scores_neg = np.exp(scores[~pos_idx])

    #compute loss value
    loss_value = np.empty_like(scores)
    loss_value[pos_idx] = np.log(1.0 + exp_scores_pos)
    loss_value[~pos_idx] = -scores[~pos_idx] + np.log(1.0 + exp_scores_neg)
    loss_value = loss_value.mean()

    #compute loss slope
    phi_slope = np.empty_like(scores)
    phi_slope[pos_idx]  = 1.0 / (1.0 + exp_scores_pos)
    phi_slope[~pos_idx] = exp_scores_neg / (1.0 + exp_scores_neg)
    loss_slope = Z.T.dot(phi_slope - 1.0) / Z.shape[0]

    return loss_value, loss_slope


#initialize a vector of integers where more than half of the entries = 0
rho_test = np.random.randint(low=-10, high=10, size=(n_cols, 1))
set_to_zero = np.random.choice(range(0, n_cols), size=(n_cols // 2, 1), replace=False)
rho_test[set_to_zero] = 0.0

start_time = time.time()
loss_value, loss_slope = compute_logistic_loss_value_and_slope(rho_test, Z)
print "total runtime = %1.5f seconds" % (time.time() - start_time)

2024-11-14

1 Answer

小能豆

To speed up your code, especially given the constraints you described (no parallelization, sparse rho, many more rows than columns), here are some targeted suggestions that focus on optimizing the existing computation. They aim to eliminate unnecessary work and make more efficient use of the matrix operations involving rho.

1. Exploit the sparsity of rho

Since rho is sparse (around 50% of its entries are zero), you can use scipy.sparse to avoid unnecessary work in the products with Z.

You can convert rho to a scipy.sparse matrix:

import numpy as np
import scipy.sparse as sp

# Assuming 'rho_test' is the sparse vector
rho_sparse = sp.csr_matrix(rho_test)

def compute_logistic_loss_value_and_slope(rho, Z):
    # Convert rho to a sparse matrix (CSR format)
    rho_sparse = sp.csr_matrix(rho)

    # Compute the scores (Z.dot(rho))
    scores = Z.dot(rho_sparse.toarray())  # convert sparse back to dense for the multiplication

    # Efficient handling of the positive/negative indices
    pos_idx = scores > 0
    exp_scores_pos = np.exp(-scores[pos_idx])
    exp_scores_neg = np.exp(scores[~pos_idx])

    # Compute loss value
    loss_value = np.empty_like(scores)
    loss_value[pos_idx] = np.log(1.0 + exp_scores_pos)
    loss_value[~pos_idx] = -scores[~pos_idx] + np.log(1.0 + exp_scores_neg)
    loss_value = loss_value.mean()

    # Compute loss slope
    phi_slope = np.empty_like(scores)
    phi_slope[pos_idx]  = 1.0 / (1.0 + exp_scores_pos)
    phi_slope[~pos_idx] = exp_scores_neg / (1.0 + exp_scores_neg)
    loss_slope = Z.T.dot(phi_slope - 1.0) / Z.shape[0]

    return loss_value, loss_slope

2. Efficient matrix operations

  • Avoid np.empty_like(scores): instead of creating the loss_value and phi_slope arrays with np.empty_like(scores), it is more efficient to create them with np.zeros or to compute directly on the relevant subsets of the array; this avoids unnecessary allocation overhead (see the sketch after this list).
  • Matrix operations: you can streamline the matrix-vector products and element-wise operations to avoid redundant computation, especially when working with sparse matrices.
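
As a minimal sketch of the first bullet (assuming the same scores = Z.dot(rho) vector from the question's code; logistic_loss_value_stable is a name introduced here purely for illustration), the per-row loss can be computed without the pre-allocated buffers:

import numpy as np

def logistic_loss_value_stable(scores):
    # log(1 + exp(-s)) arranged so the exp argument is always <= 0:
    #   s >  0:  log(1 + exp(-s))     = log1p(exp(-|s|))
    #   s <= 0: -s + log(1 + exp(s))  = -s + log1p(exp(-|s|))
    stable_log_term = np.log1p(np.exp(-np.abs(scores)))
    loss_per_row = np.where(scores > 0, stable_log_term, -scores + stable_log_term)
    return loss_per_row.mean()

np.where evaluates both branches, but the exponent stays bounded and no intermediate buffers or index masks are needed.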

3. Use np.logaddexp for numerically stable log-exp

To improve numerical stability and avoid overflow when computing logs and exponentials, you can use np.logaddexp. Calling np.logaddexp(0, x) computes log(1 + exp(x)) in a numerically stable way.

Replace the following:

loss_value[pos_idx] = np.log(1.0 + exp_scores_pos)
loss_value[~pos_idx] = -scores[~pos_idx] + np.log(1.0 + exp_scores_neg)

with:

loss_value = np.logaddexp(0.0, -scores)  # stable log(1 + exp(-scores)) for both positive and negative scores

This helps numerical stability, particularly when the argument of the log or the exponential is very large or very small.
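
For instance, a small standalone check (not part of the original code) shows the difference between the naive expression and np.logaddexp:

import numpy as np

s = np.array([1000.0, -1000.0, 0.0])

naive = np.log(1.0 + np.exp(-s))   # exp(1000) overflows: RuntimeWarning, result contains inf
stable = np.logaddexp(0.0, -s)     # no overflow

print(naive)   # -> [0., inf, 0.693...]
print(stable)  # -> [0., 1000., 0.693...]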

4. Avoid the full computation when rho is sparse

Since a large fraction of rho's entries are zero, you can speed up the code significantly by only processing the non-zero entries. Using the indices of rho's non-zero elements, you can restrict the computation to the relevant columns.

Here is one way to implement it:

def compute_logistic_loss_value_and_slope(rho, Z):
    # Only compute for non-zero entries of rho
    non_zero_idx = np.nonzero(rho)[0]  # Get indices where rho is non-zero

    # Create a submatrix of Z for only the relevant columns
    Z_sub = Z[:, non_zero_idx]  # Submatrix of Z corresponding to non-zero rho elements
    rho_sub = rho[non_zero_idx]  # Only non-zero rho values

    # Compute the scores (Z.dot(rho)) only for relevant columns
    scores = Z_sub.dot(rho_sub)
    pos_idx = scores > 0
    exp_scores_pos = np.exp(-scores[pos_idx])
    exp_scores_neg = np.exp(scores[~pos_idx])

    # Compute loss value
    loss_value = np.empty_like(scores)
    loss_value[pos_idx] = np.log(1.0 + exp_scores_pos)
    loss_value[~pos_idx] = -scores[~pos_idx] + np.log(1.0 + exp_scores_neg)
    loss_value = loss_value.mean()

    # Compute loss slope
    phi_slope = np.empty_like(scores)
    phi_slope[pos_idx]  = 1.0 / (1.0 + exp_scores_pos)
    phi_slope[~pos_idx] = exp_scores_neg / (1.0 + exp_scores_neg)

    # For the slope, extend back to the full matrix Z
    loss_slope = np.zeros((Z.shape[1], 1))  # same (n_cols x 1) shape as rho
    loss_slope[non_zero_idx] = Z_sub.T.dot(phi_slope - 1.0) / Z.shape[0]

    return loss_value, loss_slope

5. Consider using Numba or Cython

If you need further performance gains, you may want to consider Numba or Cython to speed up the computation. These tools compile Python code to machine code, which can help substantially for numerical work.

  • Numba is easier to integrate and does not require major changes to the code structure.

Here is a simple Numba-accelerated version of your function:

import numpy as np
import numba
from numba import jit

@jit(nopython=True)
def compute_logistic_loss_value_and_slope(rho, Z):
    scores = Z.dot(rho)
    pos_idx = scores > 0
    exp_scores_pos = np.exp(-scores[pos_idx])
    exp_scores_neg = np.exp(scores[~pos_idx])

    loss_value = np.empty_like(scores)
    loss_value[pos_idx] = np.log(1.0 + exp_scores_pos)
    loss_value[~pos_idx] = -scores[~pos_idx] + np.log(1.0 + exp_scores_neg)
    loss_value = loss_value.mean()

    phi_slope = np.empty_like(scores)
    phi_slope[pos_idx]  = 1.0 / (1.0 + exp_scores_pos)
    phi_slope[~pos_idx] = exp_scores_neg / (1.0 + exp_scores_neg)
    loss_slope = Z.T.dot(phi_slope - 1.0) / Z.shape[0]

    return loss_value, loss_slope

Numba can significantly speed up operations such as matrix-vector multiplication and element-wise functions.
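
In practice, nopython mode compiles explicit loops very reliably, whereas boolean-mask indexing on 2-D arrays can be more fragile. As a sketch (my own loop-based rewrite for illustration, not code from the question), the same value and slope can be computed with plain loops over the rows, passing rho as a 1-D float64 array:

import numpy as np
from numba import njit

@njit(cache=True)
def logistic_loss_value_and_slope_loops(rho, Z):
    # rho: 1-D float64 array of length n_cols, Z: 2-D float64 array (n_rows x n_cols)
    n_rows, n_cols = Z.shape
    loss_value = 0.0
    loss_slope = np.zeros(n_cols)
    for i in range(n_rows):
        # score for row i
        s = 0.0
        for j in range(n_cols):
            s += Z[i, j] * rho[j]
        # numerically stable log(1 + exp(-s)) and phi = 1 / (1 + exp(-s))
        if s > 0.0:
            e = np.exp(-s)
            loss_value += np.log(1.0 + e)
            phi_minus_one = 1.0 / (1.0 + e) - 1.0
        else:
            e = np.exp(s)
            loss_value += -s + np.log(1.0 + e)
            phi_minus_one = e / (1.0 + e) - 1.0
        # accumulate the slope contribution of row i
        for j in range(n_cols):
            loss_slope[j] += Z[i, j] * phi_minus_one
    return loss_value / n_rows, loss_slope / n_rows

The first call includes JIT compilation time, so benchmark from the second call onward, e.g. logistic_loss_value_and_slope_loops(rho_test.ravel().astype(np.float64), Z).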

6. Summary of optimizations:

  1. Use a sparse representation of rho to save computation on zero entries.
  2. Use np.logaddexp for the log-exp computation to get better numerical stability.
  3. Process only the non-zero entries of rho to avoid unnecessary work.
  4. If needed, consider Numba or Cython for further speedups.

Try implementing these changes and you should see a noticeable improvement in the function's performance, especially on large datasets!

2024-11-14