一尘不染

使用itertools.groupby性能进行numpy分组

algorithm

我有很多大的(>
35,000,000)整数列表,其中将包含重复项。我需要获取列表中每个整数的计数。以下代码有效,但似乎很慢。还有人可以使用Python最好是Numpy更好地进行基准测试吗?

def group():
    import numpy as np
    from itertools import groupby
    values = np.array(np.random.randint(0,1<<32,size=35000000),dtype='u4')
    values.sort()
    groups = ((k,len(list(g))) for k,g in groupby(values))
    index = np.fromiter(groups,dtype='u4,u2')

if __name__=='__main__':
    from timeit import Timer
    t = Timer("group()","from __main__ import group")
    print t.timeit(number=1)

返回:

$ python bench.py 
111.377498865

干杯!

*根据回复进行 *编辑

def group_original():
    import numpy as np
    from itertools import groupby
    values = np.array(np.random.randint(0,1<<32,size=35000000),dtype='u4')
    values.sort()
    groups = ((k,len(list(g))) for k,g in groupby(values))
    index = np.fromiter(groups,dtype='u4,u2')

def group_gnibbler():
    import numpy as np
    from itertools import groupby
    values = np.array(np.random.randint(0,1<<32,size=35000000),dtype='u4')
    values.sort()
    groups = ((k,sum(1 for i in g)) for k,g in groupby(values))
    index = np.fromiter(groups,dtype='u4,u2')

def group_christophe():
    import numpy as np
    values = np.array(np.random.randint(0,1<<32,size=35000000),dtype='u4')
    values.sort()
    counts=values.searchsorted(values, side='right') - values.searchsorted(values, side='left')
    index = np.zeros(len(values),dtype='u4,u2')
    index['f0']=values
    index['f1']=counts
    #Erroneous result!

def group_paul():
    import numpy as np
    values = np.array(np.random.randint(0,1<<32,size=35000000),dtype='u4')
    values.sort()
    diff = np.concatenate(([1],np.diff(values)))
    idx = np.concatenate((np.where(diff)[0],[len(values)]))
    index = np.empty(len(idx)-1,dtype='u4,u2')
    index['f0']=values[idx[:-1]]
    index['f1']=np.diff(idx)

if __name__=='__main__':
    from timeit import Timer
    timings=[
                ("group_original","Original"),
                ("group_gnibbler","Gnibbler"),
                ("group_christophe","Christophe"),
                ("group_paul","Paul"),
            ]
    for method,title in timings:
        t = Timer("%s()"%method,"from __main__ import %s"%method)
        print "%s: %s secs"%(title,t.timeit(number=1))

返回:

$ python bench.py 
Original: 113.385262966 secs
Gnibbler: 71.7464978695 secs
Christophe: 27.1690568924 secs
Paul: 9.06268405914 secs

尽管Christophe目前给出的结果不正确


阅读 395

收藏
2020-07-28

共1个答案

一尘不染

我做类似这样的事情得到了3倍的改进:

def group():
    import numpy as np
    values = np.array(np.random.randint(0,3298,size=35000000),dtype='u4')
    values.sort()
    dif = np.ones(values.shape,values.dtype)
    dif[1:] = np.diff(values)
    idx = np.where(dif>0)
    vals = values[idx]
    count = np.diff(idx)
2020-07-28