Python numpy 模块,argsort() 实例源码
我们从Python开源项目中,提取了以下50个代码示例,用于说明如何使用numpy.argsort()。
def PCA(data, num_components=None):
# mean center the data
data -= data.mean(axis=0)
# calculate the covariance matrix
R = np.cov(data, rowvar=False)
# calculate eigenvectors & eigenvalues of the covariance matrix
# use 'eigh' rather than 'eig' since R is symmetric,
# the performance gain is substantial
V, E = np.linalg.eigh(R)
# sort eigenvalue in decreasing order
idx = np.argsort(V)[::-1]
E = E[:,idx]
# sort eigenvectors according to same index
V = V[idx]
# select the first n eigenvectors (n is desired dimension
# of rescaled data array, or dims_rescaled_data)
E = E[:, :num_components]
# carry out the transformation on the data using eigenvectors
# and return the re-scaled data, eigenvalues, and eigenvectors
return np.dot(E.T, data.T).T, V, E
def __SubDoWavelets(self,waveforms):
scales = 4
dimensions = 10
nspk,ls = waveforms.shape
cc = pywt.wavedec(waveforms,"haar",mode="symmetric",level=scales,axis=-1)
cc = np.hstack(cc)
sd = list()
for i in range(ls):
test_data = cc[:,i]
thr_dist = np.std(test_data,ddof=1)*3
thr_dist_min = np.mean(test_data)-thr_dist
thr_dist_max = np.mean(test_data)+thr_dist
aux = test_data[(test_data>thr_dist_min)&(test_data<thr_dist_max)]
if aux.size > 10:
sd.append(self.__test_ks(aux))
else:
sd.append(0)
ind = np.argsort(sd)
ind = ind[::-1]
coeff = ind[:dimensions]
waveletspk = cc[:,coeff]
return waveletspk
def get_feature_importance(list_of_features):
n_estimators=10000
random_state=0
n_jobs=4
x_train=data_frame[list_of_features]
y_train=data_frame.iloc[:,-1]
feat_labels= data_frame.columns[1:]
forest = BaggingRegressor(n_estimators=n_estimators,random_state=random_state,n_jobs=n_jobs)
forest.fit(x_train,y_train)
importances=forest.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(x_train.shape[1]):
print("%2d) %-*s %f" % (f+1,30,feat_labels[indices[f]],
importances[indices[f]]))
plt.title("Feature Importance")
plt.bar(range(x_train.shape[1]),importances[indices],color='lightblue',align='center')
plt.xticks(range(x_train.shape[1]),feat_labels[indices],rotation=90)
plt.xlim([-1,x_train.shape[1]])
plt.tight_layout()
plt.show()
def update_sort_idcs(self):
# The selected points are sorted before all the other points -- an easy
# way to achieve this is to add the maximum score to their score
if self.current_order == 0:
score = self.score_x
elif self.current_order == 1:
score = self.score_y
elif self.current_order == 2:
score = self.score_z
else:
raise AssertionError(self.current_order)
score = score.copy()
if len(self.selected_points):
score[np.array(sorted(self.selected_points))] += score.max()
self.sort_idcs = np.argsort(score)
def removeTopPCs(X, numRemovePCs):
t0 = time.time()
X_mean = X.mean(axis=0)
X -= X_mean
XXT = symmetrize(blas.dsyrk(1.0, X, lower=0))
s,U = la.eigh(XXT)
if (np.min(s) < -1e-4): raise Exception('Negative eigenvalues found')
s[s<0]=0
ind = np.argsort(s)[::-1]
U = U[:, ind]
s = s[ind]
s = np.sqrt(s)
#remove null PCs
ind = (s>1e-6)
U = U[:, ind]
s = s[ind]
V = X.T.dot(U/s)
#print 'max diff:', np.max(((U*s).dot(V.T) - X)**2)
X = (U[:, numRemovePCs:]*s[numRemovePCs:]).dot((V.T)[numRemovePCs:, :])
X += X_mean
return X
def _translate(seq, f_init, f_next, trg_eos_idx, src_sel, trg_sel,
k, cond_init_trg, normalize, n_best, **kwargs):
sample, score = gen_sample(
f_init, f_next, x=numpy.array(seq).reshape([len(seq), 1]),
eos_idx=trg_eos_idx, src_selector=src_sel, trg_selector=trg_sel,
k=k, maxlen=3*len(seq), stochastic=False, argmax=False,
cond_init_trg=cond_init_trg, **kwargs)
if normalize:
lengths = numpy.array([len(s) for s in sample])
score = score / lengths
if n_best == 1:
sidx = numpy.argmin(score)
elif n_best > 1:
sidx = numpy.argsort(score)[:n_best]
else:
raise ValueError('n_best cannot be negative!')
return sample[sidx], score[sidx]
def closestNeighbor(query, embedding_array, normed=False, top_k=1):
'''Gets the index of the closest neighbor of embedding_array
to the query point. Distance metric is cosine.
SLOW. DO NOT USE THIS FOR RAPID COMPUTATION.
'''
embedding_array = numpy.array(embedding_array)
if not normed:
embedding_array = numpy.array([
(embedding_array[i] / numpy.linalg.norm(embedding_array[i]))
for i in range(embedding_array.shape[0])
])
## assuming embeddings are unit-normed by this point;
## norm(query) is a constant factor, so we can ignore it
dists = numpy.array([
numpy.dot(query, embedding_array[i])
for i in range(embedding_array.shape[0])
])
sorted_ixes = numpy.argsort(-1 * dists)
return sorted_ixes[:top_k]
def testMerge(self, dtype=dtype):
testarray1 = range(1,101)
testarray2 = range(5,106)
a = numpy.empty((100,2), dtype=dtype)
b = numpy.empty((100,2), dtype=dtype)
merged = numpy.empty((200,2), dtype=dtype)
incompatible1 = numpy.empty((200,3), dtype=dtype)
incompatible2 = numpy.empty(200, dtype=dtype)
a[:,0] = numpy.arange(1,101)
a[:,1] = numpy.arange(2,102)
b[:,0] = numpy.arange(5,105)
b[:,1] = numpy.arange(6,106)
ref = numpy.concatenate([a,b])
ref = ref[numpy.argsort(ref[:,0])]
self.assertEqual(mapped_struct.index_merge(a, b, merged), 200)
self.assertTrue((merged == ref).all())
self.assertRaises(ValueError, mapped_struct.index_merge, a, b, incompatible1)
self.assertRaises(ValueError, mapped_struct.index_merge, a, incompatible1, merged)
self.assertRaises(ValueError, mapped_struct.index_merge, a, b, incompatible2)
self.assertRaises(ValueError, mapped_struct.index_merge, a, incompatible2, merged)
def __init__(self, pos, color, mode=None):
"""
=============== ==============================================================
**Arguments:**
pos Array of positions where each color is defined
color Array of RGBA colors.
Integer data types are interpreted as 0-255; float data types
are interpreted as 0.0-1.0
mode Array of color modes (ColorMap.RGB, HSV_POS, or HSV_NEG)
indicating the color space that should be used when
interpolating between stops. Note that the last mode value is
ignored. By default, the mode is entirely RGB.
=============== ==============================================================
"""
self.pos = np.array(pos)
order = np.argsort(self.pos)
self.pos = self.pos[order]
self.color = np.array(color)[order]
if mode is None:
mode = np.ones(len(pos))
self.mode = mode
self.stopsCache = {}
def __loadChnTimeWave(self,f,selectChan):
times = list()
waveforms = list()
spk_startswith = "spike_{0}".format(selectChan)
for chn_unit in f["spikes"].keys():
if chn_unit.startswith(spk_startswith):
time = f["spikes"][chn_unit]["times"].value
waveform = f["spikes"][chn_unit]["waveforms"].value
times.append(time)
waveforms.append(waveform)
if times:
times = np.hstack(times)
waveforms = np.vstack(waveforms)
sort_index = np.argsort(times)
waveforms = waveforms[sort_index]
times = times[sort_index]
return times,waveforms
else:
return None,None
def __init__(self, pos, color, mode=None):
"""
=============== ==============================================================
**Arguments:**
pos Array of positions where each color is defined
color Array of RGBA colors.
Integer data types are interpreted as 0-255; float data types
are interpreted as 0.0-1.0
mode Array of color modes (ColorMap.RGB, HSV_POS, or HSV_NEG)
indicating the color space that should be used when
interpolating between stops. Note that the last mode value is
ignored. By default, the mode is entirely RGB.
=============== ==============================================================
"""
self.pos = np.array(pos)
order = np.argsort(self.pos)
self.pos = self.pos[order]
self.color = np.array(color)[order]
if mode is None:
mode = np.ones(len(pos))
self.mode = mode
self.stopsCache = {}
def __loadChnTimeWave(self,f,selectChan):
times = list()
waveforms = list()
spk_startswith = "spike_{0}".format(selectChan)
for chn_unit in f["spikes"].keys():
if chn_unit.startswith(spk_startswith):
time = f["spikes"][chn_unit]["times"].value
waveform = f["spikes"][chn_unit]["waveforms"].value
times.append(time)
waveforms.append(waveform)
if times:
times = np.hstack(times)
waveforms = np.vstack(waveforms)
sort_index = np.argsort(times)
waveforms = waveforms[sort_index]
times = times[sort_index]
return times,waveforms
else:
return None,None
def __SubDoWavelets(self,waveforms):
scales = 4
dimensions = 10
nspk,ls = waveforms.shape
cc = pywt.wavedec(waveforms,"haar",mode="symmetric",level=scales,axis=-1)
cc = np.hstack(cc)
sd = list()
for i in range(ls):
test_data = cc[:,i]
thr_dist = np.std(test_data,ddof=1)*3
thr_dist_min = np.mean(test_data)-thr_dist
thr_dist_max = np.mean(test_data)+thr_dist
aux = test_data[(test_data>thr_dist_min)&(test_data<thr_dist_max)]
if aux.size > 10:
sd.append(self.__test_ks(aux))
else:
sd.append(0)
ind = np.argsort(sd)
ind = ind[::-1]
coeff = ind[:dimensions]
waveletspk = cc[:,coeff]
return waveletspk
def __load_waveforms(self,selectChan,file_name):
spk_startswith = "spike_{0}".format(selectChan)
with hp.File(file_name,"r") as f:
times = list()
waveforms = list()
for chn_unit in f["spikes"].keys():
if chn_unit.startswith(spk_startswith):
tep_time = f["spikes"][chn_unit]["times"].value
waveform = f["spikes"][chn_unit]["waveforms"].value
times.append(tep_time)
waveforms.append(waveform)
if times:
times = np.hstack(times)
waveforms = np.vstack(waveforms)
sort_index = np.argsort(times)
waveforms = waveforms[sort_index]
return waveforms
else:
return None
def process_each_row_get_lable(row,vocabulary_index2word_label,vocabulary_word2index_label,result_list):
"""
:param row: it is a list.length is number of labels. e.g. 2002
:param vocabulary_index2word_label
:param result_list
:return: a lable
"""
label_list=list(np.argsort(row))
label_list.reverse()
#print("label_list:",label_list) # a list,length is number of labels.
for i,index in enumerate(label_list): # if index is not exists, and not _PAD,_END, then it is the label we want.
#print(i,"index:",index)
flag1=vocabulary_index2word_label[index] not in result_list
flag2=index!=vocabulary_word2index_label[_PAD]
flag3=index!=vocabulary_word2index_label[_END]
if flag1 and flag2 and flag3:
#print("going to return ")
return vocabulary_index2word_label[index]
# write question id and labels to file system.
def get_label_using_logits_batch(question_id_sublist, logits_batch, vocabulary_index2word_label, f, top_number=5):
print("get_label_using_logits.shape:", np.array(logits_batch).shape) # (1, 128, 2002))
for i, logits in enumerate(logits_batch):
index_list = np.argsort(logits)[-top_number:]
#print("index_list:",index_list)
index_list = index_list[::-1]
label_list = []
for index in index_list:
#print("index:",index)
label = vocabulary_index2word_label[index]
label_list.append(
label) # ('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876'])
# print("get_label_using_logits.label_list",label_list)
write_question_id_with_labels(question_id_sublist[i], label_list, f)
f.flush()
# get label using logits
def process_each_row_get_lable(row,vocabulary_index2word_label,vocabulary_word2index_label,result_list):
"""
:param row: it is a list.length is number of labels. e.g. 2002
:param vocabulary_index2word_label
:param result_list
:return: a lable
"""
label_list=list(np.argsort(row))
label_list.reverse()
#print("label_list:",label_list) # a list,length is number of labels.
for i,index in enumerate(label_list): # if index is not exists, and not _PAD,_END, then it is the label we want.
#print(i,"index:",index)
flag1=vocabulary_index2word_label[index] not in result_list
flag2=index!=vocabulary_word2index_label[_PAD]
flag3=index!=vocabulary_word2index_label[_END]
if flag1 and flag2 and flag3:
#print("going to return ")
return vocabulary_index2word_label[index]
# write question id and labels to file system.
def process_each_row_get_lable(row,vocabulary_index2word_label,vocabulary_word2index_label,result_list):
"""
:param row: it is a list.length is number of labels. e.g. 2002
:param vocabulary_index2word_label
:param result_list
:return: a lable
"""
label_list=list(np.argsort(row))
label_list.reverse()
#print("label_list:",label_list) # a list,length is number of labels.
for i,index in enumerate(label_list): # if index is not exists, and not _PAD,_END, then it is the label we want.
#print(i,"index:",index)
flag1=vocabulary_index2word_label[index] not in result_list
flag2=index!=vocabulary_word2index_label[_PAD]
flag3=index!=vocabulary_word2index_label[_END]
if flag1 and flag2 and flag3:
#print("going to return ")
return vocabulary_index2word_label[index]
def filter_sort_unique(self, max_objval=float('Inf')):
# filter
if max_objval < float('inf'):
good_idx = self.objvals <= max_objval
self.objvals = self.objvals[good_idx]
self.solutions = self.solutions[good_idx]
if len(self.objvals) > 0:
sort_idx = np.argsort(self.objvals)
self.objvals = self.objvals[sort_idx]
self.solutions = self.solutions[sort_idx]
# unique
b = np.ascontiguousarray(self.solutions).view(
np.dtype((np.void, self.solutions.dtype.itemsize * self.P)))
_, unique_idx = np.unique(b, return_index=True)
self.objvals = self.objvals[unique_idx]
self.solutions = self.solutions[unique_idx]
def round_solution_pool(pool, constraints):
pool.distinct().sort()
P = pool.P
L0_reg_ind = np.isnan(constraints['coef_set'].C_0j)
L0_max = constraints['L0_max']
rounded_pool = SolutionPool(P)
for solution in pool.solutions:
# sort from largest to smallest coefficients
feature_order = np.argsort([-abs(x) for x in solution])
rounded_solution = np.zeros(shape=(1, P))
l0_norm_count = 0
for k in range(0, P):
j = feature_order[k]
if not L0_reg_ind[j]:
rounded_solution[0, j] = np.round(solution[j], 0)
elif l0_norm_count < L0_max:
rounded_solution[0, j] = np.round(solution[j], 0)
l0_norm_count += L0_reg_ind[j]
rounded_pool.add(objvals=np.nan, solutions=rounded_solution)
rounded_pool.distinct().sort()
return rounded_pool
def listen(self, results):
score_out = results['score_out']
y_gt = results['y_gt']
sort_idx = np.argsort(score_out, axis=-1)
idx_gt = np.argmax(y_gt, axis=-1)
correct = 0
count = 0
for kk, ii in enumerate(idx_gt):
sort_idx_ = sort_idx[kk][::-1]
for jj in sort_idx_[:self.top_k]:
if ii == jj:
correct += 1
break
count += 1
# self.log.info('Correct {}/{}'.format(correct, count))
self.correct += correct
self.count += count
self.step = int(results['step'])
# self.log.info('Step {}'.format(self.step))
pass
def nn(model, text, vectors, query, k=5):
"""
Return the nearest neighbour sentences to query
text: list of sentences
vectors: the corresponding representations for text
query: a string to search
"""
qf = encode(model, [query])
qf /= norm(qf)
scores = numpy.dot(qf, vectors.T).flatten()
sorted_args = numpy.argsort(scores)[::-1]
sentences = [text[a] for a in sorted_args[:k]]
print('QUERY: ' + query)
print('NEAREST: ')
for i, s in enumerate(sentences):
print(s, sorted_args[i])
def nn(model, text, vectors, query, k=5):
"""
Return the nearest neighbour sentences to query
text: list of sentences
vectors: the corresponding representations for text
query: a string to search
"""
qf = encode(model, [query])
qf /= norm(qf)
scores = numpy.dot(qf, vectors.T).flatten()
sorted_args = numpy.argsort(scores)[::-1]
sentences = [text[a] for a in sorted_args[:k]]
print 'QUERY: ' + query
print 'NEAREST: '
for i, s in enumerate(sentences):
print s, sorted_args[i]
def _spatial_sort(glyph):
from scipy.spatial.distance import cdist
from numpy import argsort
from numpy import argmin
curr = argmin(glyph[:,0])
visited = set([curr])
order = [curr]
dd = cdist(glyph, glyph)
while len(visited)<len(glyph):
row = dd[curr,:]
for i in argsort(row):
if row[i]<=0.0 or i==curr or i in visited:
continue
order.append(i)
visited.add(i)
break
glyph[:,:] = glyph[order,:]
def label_ranking_reciprocal_rank(label, # [sent_num]
preds): # [sent_num]
""" Calcualting the reciprocal rank according to definition,
"""
rank = np.argsort(preds)[::-1]
#pos_rank = np.take(rank, np.where(label == 1)[0])
#return np.mean(1.0 / pos_rank)
if_find = False
pos = 0
for r in rank:
pos += 1
if label[r] == 1:
first_pos_r = pos
if_find = True
break
assert(if_find)
return 1.0 / first_pos_r
def _matrix_inverse(self, matrix):
"""
Computes inverse of a matrix.
"""
matrix = np.array(matrix)
n_features = matrix.shape[0]
rank = np.linalg.matrix_rank(matrix)
if rank == n_features:
return np.linalg.inv(matrix)
else:
# Matrix is not full rank, so use Hadi's technique to compute inverse
# Reference: Ali S. Hadi (1992) "Identifying Multiple Outliers in Multivariate Data" eg. 2.3, 2.4
eigenValues, eigenVectors = np.linalg.eig(matrix)
eigenValues = np.abs(eigenValues) # to deal with -0 values
idx = eigenValues.argsort()[::-1]
eigenValues = eigenValues[idx]
eigenVectors = eigenVectors[:, idx]
s = eigenValues[eigenValues != 0].min()
w = [1 / max(e, s) for e in eigenValues]
W = w * np.eye(n_features)
return eigenVectors.dot(W).dot(eigenVectors.T)
def get_1000G_snps(sumstats, out_file):
sf = np.loadtxt(sumstats,dtype=str,skiprows=1)
h5f = h5py.File('ref/Misc/1000G_SNP_info.h5','r')
rf = h5f['snp_chr'][:]
h5f.close()
ind1 = np.in1d(sf[:,1],rf[:,2])
ind2 = np.in1d(rf[:,2],sf[:,1])
sf1 = sf[ind1]
rf1 = rf[ind2]
### check order ###
if sum(sf1[:,1]==rf1[:,2])==len(rf1[:,2]):
print 'Good!'
else:
print 'Shit happens, sorting sf1 to have the same order as rf1'
O1 = np.argsort(sf1[:,1])
O2 = np.argsort(rf1[:,2])
O3 = np.argsort(O2)
sf1 = sf1[O1][O3]
out = ['hg19chrc snpid a1 a2 bp or p'+'\n']
for i in range(len(sf1[:,1])):
out.append(sf1[:,0][i]+' '+sf1[:,1][i]+' '+sf1[:,2][i]+' '+sf1[:,3][i]+' '+rf1[:,1][i]+' '+sf1[:,5][i]+' '+sf1[:,6][i]+'\n')
ff = open(out_file,"w")
ff.writelines(out)
ff.close()
def plot_heatmaps(data, mis, column_label, cont, topk=30, prefix=''):
cmap = sns.cubehelix_palette(as_cmap=True, light=.9)
m, nv = mis.shape
for j in range(m):
inds = np.argsort(- mis[j, :])[:topk]
if len(inds) >= 2:
plt.clf()
order = np.argsort(cont[:,j])
subdata = data[:, inds][order].T
subdata -= np.nanmean(subdata, axis=1, keepdims=True)
subdata /= np.nanstd(subdata, axis=1, keepdims=True)
columns = [column_label[i] for i in inds]
sns.heatmap(subdata, vmin=-3, vmax=3, cmap=cmap, yticklabels=columns, xticklabels=False, mask=np.isnan(subdata))
filename = '{}/heatmaps/group_num={}.png'.format(prefix, j)
if not os.path.exists(os.path.dirname(filename)):
os.makedirs(os.path.dirname(filename))
plt.title("Latent factor {}".format(j))
plt.yticks(rotation=0)
plt.savefig(filename, bbox_inches='tight')
plt.close('all')
#plot_rels(data[:, inds], map(lambda q: column_label[q], inds), colors=cont[:, j],
# outfile=prefix + '/relationships/group_num=' + str(j), latent=labels[:, j], alpha=0.1)
def plot_top_relationships(data, corex, labels, column_label, topk=5, prefix=''):
dual = (corex.moments['X_i Y_j'] * corex.moments['X_i Z_j']).T
alpha = dual > 0.04
cy = corex.moments['ry']
m, nv = alpha.shape
for j in range(m):
inds = np.where(alpha[j] > 0)[0]
inds = inds[np.argsort(- dual[j][inds])][:topk]
if len(inds) >= 2:
if dual[j, inds[0]] > 0.1:
factor = labels[:, j]
title = '$Y_{%d}$' % j
else:
k = np.argmax(np.abs(cy[j]))
if k == j:
k = np.argsort(-np.abs(cy[j]))[1]
factor = corex.moments['X_i Z_j'][inds[0], j] * labels[:, j] + corex.moments['X_i Z_j'][inds[0], k] * labels[:, k]
title = '$Y_{%d} + Y_{%d}$' % (j, k)
plot_rels(data[:, inds], map(lambda q: column_label[q], inds), colors=factor,
outfile=prefix + '/relationships/group_num=' + str(j), title=title)
def trim(g, max_parents=False, max_children=False):
for node in g:
if max_parents:
parents = list(g.successors(node))
weights = [g.edge[node][parent]['weight'] for parent in parents]
for weak_parent in np.argsort(weights)[:-max_parents]:
g.remove_edge(node, parents[weak_parent])
if max_children:
children = g.predecessors(node)
weights = [g.edge[child][node]['weight'] for child in children]
for weak_child in np.argsort(weights)[:-max_children]:
g.remove_edge(children[weak_child], node)
return g
# Misc. utilities
def deviation_plot(rp, variable_name, slope_cutoff=1, average_cutoff = 2.):
average_panel = rp.value_panel(variable_name, types=['average'])
average_panel = (average_panel.T - np.median(average_panel, axis=1)).T
average_panel.sort()
average_ranges = np.max(average_panel, axis=1) - np.min(average_panel, axis=1)
average_panel = average_panel[np.argsort(average_ranges)][::-1]
slope_panel = rp.value_panel(variable_name, types=['slope'])
slope_panel = (slope_panel.T - np.median(slope_panel, axis=1)).T
slope_panel.sort()
slope_ranges = np.max(slope_panel, axis=1) - np.min(slope_panel, axis=1)
slope_panel = slope_panel[np.argsort(slope_ranges)][::-1]
return _multiplot(rp.dataset, variable_name, slope_panel, average_panel,
left_vmin = -1.0*slope_cutoff, left_vmax = slope_cutoff,
right_vmin = -1.0*average_cutoff, right_vmax = average_cutoff)
def get_local_words(preds, vocab, NEs=[], k=50):
"""
given the word probabilities over many coordinates,
first normalize the probability of each word in different
locations to get a probability distribution, then compute
the entropy of the word's distribution over all coordinates
and return the words that are low entropy and are not
named entities.
"""
#normalize the probabilites of each vocab using entropy
normalized_preds = normalize(preds, norm='l1', axis=0)
entropies = stats.entropy(normalized_preds)
sorted_indices = np.argsort(entropies)
sorted_local_words = np.array(vocab)[sorted_indices].tolist()
filtered_local_words = []
NEset = set(NEs)
for word in sorted_local_words:
if word in NEset: continue
filtered_local_words.append(word)
return filtered_local_words[0:k]
def cr(self):
# Composite Reliability
composite = pd.DataFrame(0, index=np.arange(1), columns=self.latent)
for i in range(self.lenlatent):
block = self.data_[self.Variables['measurement']
[self.Variables['latent'] == self.latent[i]]]
p = len(block.columns)
if(p != 1):
cor_mat = np.cov(block.T)
evals, evecs = np.linalg.eig(cor_mat)
U, S, V = np.linalg.svd(cor_mat, full_matrices=False)
indices = np.argsort(evals)
indices = indices[::-1]
evecs = evecs[:, indices]
evals = evals[indices]
loadings = V[0, :] * np.sqrt(evals[0])
numerador = np.sum(abs(loadings))**2
denominador = numerador + (p - np.sum(loadings ** 2))
cr = numerador / denominador
composite[self.latent[i]] = cr
else:
composite[self.latent[i]] = 1
composite = composite.T
return(composite)
def _get_sorted_channels_(self, all_keys, pattern):
sub_list = [f for f in all_keys if pattern in f]
all_channels = [int(f.split(pattern)[1]) for f in sub_list]
idx = numpy.argsort(all_channels)
return sub_list, idx
def set_streams(self, stream_mode):
if stream_mode == 'single-file':
sources = []
to_write = []
count = 0
params = self.get_description()
my_file = h5py.File(self.file_name)
all_matches = [re.findall('\d+', u) for u in my_file.keys()]
all_streams = []
for m in all_matches:
if len(m) > 0:
all_streams += [int(m[0])]
idx = numpy.argsort(all_streams)
for i in xrange(len(all_streams)):
params['h5_key'] = my_file.keys()[idx[i]]
new_data = type(self)(self.file_name, params)
sources += [new_data]
to_write += ['We found the datafile %s with t_start %d and duration %d' %(new_data.file_name, new_data.t_start, new_data.duration)]
print_and_log(to_write, 'debug', logger)
return sources
elif stream_mode == 'multi-files':
return H5File.set_streams(stream_mode)
def set_streams(self, stream_mode):
if stream_mode == 'single-file':
sources = []
to_write = []
count = 0
params = self.get_description()
my_file = h5py.File(self.file_name)
all_matches = my_file.get('recordings').keys()
all_streams = []
for m in all_matches:
all_streams += [int(m)]
idx = numpy.argsort(all_streams)
for count in xrange(len(all_streams)):
params['recording_number'] = all_streams[idx[count]]
new_data = type(self)(self.file_name, params)
sources += [new_data]
to_write += ['We found the datafile %s with t_start %d and duration %d' %(new_data.file_name, new_data.t_start, new_data.duration)]
print_and_log(to_write, 'debug', logger)
return sources
elif stream_mode == 'multi-files':
return H5File.set_streams(stream_mode)
def rho_estimation(data, update=None, compute_rho=True, mratio=0.01):
N = len(data)
rho = numpy.zeros(N, dtype=numpy.float32)
if update is None:
dist = distancematrix(data)
didx = lambda i,j: i*N + j - i*(i+1)//2 - i - 1
nb_selec = max(5, int(mratio*N))
sdist = {}
if compute_rho:
for i in xrange(N):
indices = numpy.concatenate((didx(i, numpy.arange(i+1, N)), didx(numpy.arange(0, i-1), i)))
tmp = numpy.argsort(numpy.take(dist, indices))[:nb_selec]
sdist[i] = numpy.take(dist, numpy.take(indices, tmp))
rho[i] = numpy.mean(sdist[i])
else:
M = len(update[0])
nb_selec = max(5, int(mratio*M))
sdist = {}
for i in xrange(N):
dist = distancematrix(data[i].reshape(1, len(data[i])), update[0]).ravel()
all_dist = numpy.concatenate((dist, update[1][i]))
idx = numpy.argsort(all_dist)[:nb_selec]
sdist[i] = numpy.take(all_dist, idx)
rho[i] = numpy.mean(sdist[i])
return rho, dist, sdist, nb_selec
def update_data_plot(self):
reverse_sort = np.argsort(self.sort_idcs)
if len(self.inspect_points):
inspect = reverse_sort[np.array(sorted(self.inspect_points))]
data = numpy.vstack((np.ones(len(inspect))*(2*self.raw_lags[-1]-self.raw_lags[-2]), inspect+0.5)).T
self.inspect_markers.set_offsets(data)
self.inspect_markers.set_color(self.inspect_colors)
else:
self.inspect_markers.set_offsets([])
self.inspect_markers.set_color([])
self.ui.data_overview.draw_idle()
def eigenDecompose(self, X, K, normalize=True):
if (X.shape[1] >= X.shape[0]):
s,U = la.eigh(K)
else:
U, s, _ = la.svd(X, check_finite=False, full_matrices=False)
if (s.shape[0] < U.shape[1]): s = np.concatenate((s, np.zeros(U.shape[1]-s.shape[0]))) #note: can use low-rank formulas here
s=s**2
if normalize: s /= float(X.shape[1])
if (np.min(s) < -1e-10): raise Exception('Negative eigenvalues found')
s[s<0]=0
ind = np.argsort(s)[::-1]
U = U[:, ind]
s = s[ind]
return s,U
def threshold_from_predictions(y, y_pred, false_positive_margin=0, recall=1):
"""Determines a threshold for classifying examples as positive
Args:
y: labels
y_pred: scores from the classifier
recall: Threshold is set to classify at least this fraction of positive
labelled examples as positive
false_positive_margin: Threshold is set to acheive desired recall, and
then is extended to include an additional fraction of negative
labelled examples equal to false_positive_margin (This allows adding
a buffer to the threshold while maintaining a constant "cost")
"""
n_positive = np.count_nonzero(y)
n_negative = len(y) - n_positive
if n_positive == 0:
return np.max(y_pred)
if false_positive_margin == 0 and recall == 1:
return np.min(y_pred[y])
ind = np.argsort(y_pred)
y_pred_sorted = y_pred[ind]
y_sorted = y[ind]
so_far = [0, 0]
j = 0
for i in reversed(range(len(y_sorted))):
so_far[y_sorted[i]] += 1
if so_far[1] >= int(np.floor(recall * n_positive)):
j = i
break
so_far = [0, 0]
if false_positive_margin == 0:
return y_pred_sorted[j]
k = 0
for i in reversed(range(j)):
so_far[y_sorted[i]] += 1
if so_far[0] >= false_positive_margin * n_negative:
k = i
break
return y_pred_sorted[k]
def threshold_from_predictions(y, y_pred, false_positive_margin=0, recall=1):
"""Determines a threshold for classifying examples as positive
Args:
y: labels
y_pred: scores from the classifier
recall: Threshold is set to classify at least this fraction of positive
labelled examples as positive
false_positive_margin: Threshold is set to acheive desired recall, and
then is extended to include an additional fraction of negative
labelled examples equal to false_positive_margin (This allows adding
a buffer to the threshold while maintaining a constant "cost")
"""
n_positive = np.count_nonzero(y)
n_negative = len(y) - n_positive
if n_positive == 0:
return np.max(y_pred)
if false_positive_margin == 0 and recall == 1:
return np.min(y_pred[y])
ind = np.argsort(y_pred)
y_pred_sorted = y_pred[ind]
y_sorted = y[ind]
so_far = [0, 0]
j = 0
for i in reversed(range(len(y_sorted))):
so_far[y_sorted[i]] += 1
if so_far[1] >= int(np.floor(recall * n_positive)):
j = i
break
so_far = [0, 0]
if false_positive_margin == 0:
return y_pred_sorted[j]
k = 0
for i in reversed(range(j)):
so_far[y_sorted[i]] += 1
if so_far[0] >= false_positive_margin * n_negative:
k = i
break
return y_pred_sorted[k]
def threshold_from_predictions(y, y_pred, false_positive_margin=0, recall=1):
"""Determines a threshold for classifying examples as positive
Args:
y: labels
y_pred: scores from the classifier
recall: Threshold is set to classify at least this fraction of positive
labelled examples as positive
false_positive_margin: Threshold is set to acheive desired recall, and
then is extended to include an additional fraction of negative
labelled examples equal to false_positive_margin (This allows adding
a buffer to the threshold while maintaining a constant "cost")
"""
n_positive = np.count_nonzero(y)
n_negative = len(y) - n_positive
if n_positive == 0:
return np.max(y_pred)
if false_positive_margin == 0 and recall == 1:
return np.min(y_pred[y])
ind = np.argsort(y_pred)
y_pred_sorted = y_pred[ind]
y_sorted = y[ind]
so_far = [0, 0]
j = 0
for i in reversed(range(len(y_sorted))):
so_far[y_sorted[i]] += 1
if so_far[1] >= int(np.floor(recall * n_positive)):
j = i
break
so_far = [0, 0]
if false_positive_margin == 0:
return y_pred_sorted[j]
k = 0
for i in reversed(range(j)):
so_far[y_sorted[i]] += 1
if so_far[0] >= false_positive_margin * n_negative:
k = i
break
return y_pred_sorted[k]
def threshold_from_predictions(y, y_pred, false_positive_margin=0, recall=1):
"""Determines a threshold for classifying examples as positive
Args:
y: labels
y_pred: scores from the classifier
recall: Threshold is set to classify at least this fraction of positive
labelled examples as positive
false_positive_margin: Threshold is set to acheive desired recall, and
then is extended to include an additional fraction of negative
labelled examples equal to false_positive_margin (This allows adding
a buffer to the threshold while maintaining a constant "cost")
"""
n_positive = np.count_nonzero(y)
n_negative = len(y) - n_positive
if n_positive == 0:
return np.max(y_pred)
if false_positive_margin == 0 and recall == 1:
return np.min(y_pred[y])
ind = np.argsort(y_pred)
y_pred_sorted = y_pred[ind]
y_sorted = y[ind]
so_far = [0, 0]
j = 0
for i in reversed(range(len(y_sorted))):
so_far[y_sorted[i]] += 1
if so_far[1] >= int(np.floor(recall * n_positive)):
j = i
break
so_far = [0, 0]
if false_positive_margin == 0:
return y_pred_sorted[j]
k = 0
for i in reversed(range(j)):
so_far[y_sorted[i]] += 1
if so_far[0] >= false_positive_margin * n_negative:
k = i
break
return y_pred_sorted[k]
def relabel_by_size(labels):
""" Relabel clusters so they are sorted by number of members, descending.
Args: labels (np.array(int)): 1-based cluster labels """
order = np.argsort(np.argsort(-np.bincount(labels)))
return 1 + order[labels]
def adjust_pvalue_bh(p):
""" Multiple testing correction of p-values using the Benjamini-Hochberg procedure """
descending = np.argsort(p)[::-1]
# q = p * N / k where p = p-value, N = # tests, k = p-value rank
scale = float(len(p)) / np.arange(len(p), 0, -1)
q = np.minimum(1, np.minimum.accumulate(scale * p[descending]))
# Return to original order
return q[np.argsort(descending)]
def compute_readpairs_per_umi_threshold(reads, subsample_rate):
''' Compute a threshold above which the UMIs are unlikely to be PCR off-products.
reads (np.array(int)) - Read pairs for each UMI
subsample_rate (float) - Subsample reads to this fraction.
Returns threshold (int) - The RPPU threshold in the subsampled space '''
if len(np.unique(reads)) < 2:
print 'Skipping RPPU threshold calculation.'
return 1
print 'RPPU subsample rate: %0.4f' % subsample_rate
reads = np.random.binomial(reads, subsample_rate)
reads = reads[reads > 0]
if len(np.unique(reads)) < 2:
print 'Subsampling gave a degenerate distribution of RPPU. Skipping RPPU threshold calculation.'
return 1
new_n50 = tk_stats.NX(reads, 0.5)
print 'New N50: %d:' % new_n50
# Log-transform counts
log_reads = np.log(reads)
# Run K-Means. Reshape necessary because kmeans takes a matrix.
kmeans = sk_cluster.KMeans(2).fit(log_reads.reshape((-1,1)))
kmeans.predict(log_reads.reshape((-1,1)))
# Take the cluster with the smallest mean
min_cluster = np.argsort(np.ravel(kmeans.cluster_centers_))[0]
print 'RPPU component means: ' + str(list(iter(np.exp(kmeans.cluster_centers_))))
print 'RPPU component members: ' + str(np.bincount(kmeans.labels_))
# Take the max element in the min-cluster
threshold = np.max(reads[kmeans.labels_ == min_cluster])
return threshold
def rebalance(self):
"""
Rebalances the binary heap. Takes O(n log n) time to run.
Avoid using, when possible.
"""
# Sort array by priority
sorted_indices_by_priority = np.argsort(-self.pq_array[:,0])
self.pq_array = self.pq_array[sorted_indices_by_priority]
pq_indices = range(self.size)
# Create hash tables
self.pq_hash = dict(zip(pq_indices,self.pq_array[:,1]))
self.exp_hash = dict(zip(self.pq_array[:,1],pq_indices))
def rank_output(self):
self.output_ranks = np.argsort(self.output_raw, axis=1, kind='mergesort').ravel()[::-1].astype(np.int32)
def overlay_emojiface(probs):
if max(probs) > 0.8:
emotion = emotions[np.argmax(probs)]
return 'emoji/{}-{}.png'.format(emotion, emotion)
else:
index1, index2 = np.argsort(probs)[::-1][:2]
emotion1 = emotions[index1]
emotion2 = emotions[index2]
return 'emoji/{}-{}.png'.format(emotion1, emotion2)
def __call__(self, words, weights, vocabulary_max):
if len(words) < vocabulary_max * self.trigger_ratio:
return words, weights
if not isinstance(words, numpy.ndarray):
words = numpy.array(words)
# Tail optimization does not help with very large vocabularies
if len(words) > vocabulary_max * 2:
indices = numpy.argpartition(weights, len(weights) - vocabulary_max)
indices = indices[-vocabulary_max:]
words = words[indices]
weights = weights[indices]
return words, weights
# Vocabulary typically consists of these three parts:
# 1) the core - we found it's border - `core_end` - 15%
# 2) the body - 70%
# 3) the minor tail - 15%
# (1) and (3) are roughly the same size
# (3) can be safely discarded, (2) can be discarded with care,
# (1) shall never be discarded.
sorter = numpy.argsort(weights)[::-1]
weights = weights[sorter]
trend_start = int(len(weights) * 0.2)
trend_finish = int(len(weights) * 0.8)
z = numpy.polyfit(numpy.arange(trend_start, trend_finish),
numpy.log(weights[trend_start:trend_finish]),
1)
exp_z = numpy.exp(z[1] + z[0] * numpy.arange(len(weights)))
avg_error = numpy.abs(weights[trend_start:trend_finish] -
exp_z[trend_start:trend_finish]).mean()
tail_size = numpy.argmax((numpy.abs(weights - exp_z) < avg_error)[::-1])
weights = weights[:-tail_size][:vocabulary_max]
words = words[sorter[:-tail_size]][:vocabulary_max]
return words, weights