我们从Python开源项目中,提取了以下50个代码示例,用于说明如何使用scipy.stats.spearmanr()。
def calc_word_sim(model, eval_file):
    """Score a word-embedding model against a word-similarity dataset.

    ``eval_file`` is read as a comma-separated file with a header row whose
    three columns are unpacked as (word 1, word 2, gold score).  Pairs for
    which either word is missing from ``model.vocab`` are skipped.

    Returns:
        tuple: ``(corr, p_val)`` from ``stats.spearmanr`` between the model
        similarities and the gold scores.
    """
    df = pd.read_csv(eval_file, sep=',', header=0)  # eval dataset
    col1, col2, score = df.columns.values
    # Materialise the vocabulary as a set once so every membership test in
    # the loop is O(1) (``keys()`` is a plain list on Python 2).
    model_vocab = set(model.vocab.keys())
    ground = []
    predicted = []  # was named ``sys``, which shadowed the stdlib module name
    for _, row in df.iterrows():
        word1, word2 = row[col1], row[col2]
        if word1 in model_vocab and word2 in model_vocab:
            ground.append(float(row[score]))
            predicted.append(model.similarity(word1, word2))
    # compute Spearman's rank correlation coefficient
    # (https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient)
    print(predicted)  # single-argument call form works on Python 2 and 3
    corr, p_val = stats.spearmanr(predicted, ground)
    logger.info("# of pairs found: %s / %s" % (len(ground), len(df)))
    logger.info("correlation: %s" % corr)
    return corr, p_val
def word_sim_test(filename, pos_vectors):
    """Correlate inner-product word similarities with gold scores.

    Each line of ``filename`` is split on commas into exactly
    ``word1,word2,score``.  Pairs for which building a vector raises
    ``KeyError`` are counted as missed and left out of the correlation.

    Returns:
        tuple: ``(spearman, pearson, missed)``.
    """
    separator = ','
    gold_scores = []
    model_scores = []
    n_missed = 0
    with open(filename, 'r') as handle:
        for line in handle:
            first, second, gold = line.strip().split(separator)
            try:
                vec_a = create_word_vector(first, pos_vectors)
                vec_b = create_word_vector(second, pos_vectors)
                score = float(np.inner(vec_a, vec_b))
                gold_scores.append(float(gold))
                model_scores.append(score)
            except KeyError:
                n_missed += 1
    spearman = st.spearmanr(gold_scores, model_scores)[0]
    pearson = st.pearsonr(gold_scores, model_scores)[0]
    return spearman, pearson, n_missed
def sim_getCorrelation(We, words, f, weight4ind, scoring_function, params):
    """Correlate predicted sentence-pair scores with gold scores.

    ``f`` is the path of a tab-separated file whose first three fields are
    (sentence 1, sentence 2, gold score).  The sentence pairs are converted
    with ``data_io`` helpers, re-weighted with ``weight4ind`` and scored by
    ``scoring_function(We, x1, x2, m1, m2, params)``.

    Returns:
        tuple: ``(pearson_r, spearman_rho)`` between predictions and golds.
    """
    golds = []
    seq1 = []
    seq2 = []
    # The context manager closes the file; the original leaked the handle.
    with open(f, 'r') as handle:
        for line in handle:
            fields = line.split("\t")
            p1, p2, score = fields[0], fields[1], float(fields[2])
            X1, X2 = data_io.getSeqs(p1, p2, words)
            seq1.append(X1)
            seq2.append(X2)
            golds.append(score)
    x1, m1 = data_io.prepare_data(seq1)
    x2, m2 = data_io.prepare_data(seq2)
    m1 = data_io.seq2weight(x1, m1, weight4ind)
    m2 = data_io.seq2weight(x2, m2, weight4ind)
    scores = scoring_function(We, x1, x2, m1, m2, params)
    preds = np.squeeze(scores)
    return pearsonr(preds, golds)[0], spearmanr(preds, golds)[0]
def getCorrelation(model, words, f, params=None):
    """Correlate ``model.scoring_function`` outputs with gold scores.

    ``f`` is the path of a tab-separated file whose first three fields are
    (sentence 1, sentence 2, gold score).  When ``params`` is truthy and has
    a truthy ``weightfile`` attribute, the masks are re-weighted with
    ``params.weight4ind`` before scoring.

    Returns:
        tuple: ``(pearson_r, spearman_rho)`` between predictions and golds.
    """
    # ``params`` used to default to a shared mutable ``[]``; ``None`` is
    # equally falsy, so callers relying on the default see no difference.
    golds = []
    seq1 = []
    seq2 = []
    # The context manager closes the file; the original leaked the handle.
    with open(f, 'r') as handle:
        for line in handle:
            fields = line.split("\t")
            p1, p2, score = fields[0], fields[1], float(fields[2])
            X1, X2 = data_io.getSeqs(p1, p2, words)
            seq1.append(X1)
            seq2.append(X2)
            golds.append(score)
    x1, m1 = data_io.prepare_data(seq1)
    x2, m2 = data_io.prepare_data(seq2)
    if params and params.weightfile:
        m1 = data_io.seq2weight(x1, m1, params.weight4ind)
        m2 = data_io.seq2weight(x2, m2, params.weight4ind)
    scores = model.scoring_function(x1, x2, m1, m2)
    preds = np.squeeze(scores)
    return pearsonr(preds, golds)[0], spearmanr(preds, golds)[0]
def correlations(A, B, pc_n=100):
    """Compare two equally shaped 2-D arrays with several correlation views.

    Args:
        A, B: 2-D arrays of identical shape.
        pc_n: maximum number of left singular vectors to compare.  It is
            clamped to the number of vectors actually available, so the
            default no longer raises ``IndexError`` on small matrices.

    Returns:
        tuple: ``(p, spear, pg, ps, pc_dist)`` where
            * ``p`` is the Pearson correlation of the flattened arrays,
            * ``spear`` is Spearman's rho of the flattened arrays,
            * ``pg`` is the mean finite row-wise correlation,
            * ``ps`` is the mean finite column-wise correlation,
            * ``pc_dist`` holds ``abs(1 - cosine distance)`` between matching
              left singular vectors (an empty list when ``pc_n <= 0``).
    """
    flat_a, flat_b = A.flatten(), B.flatten()
    p = 1 - distance.correlation(flat_a, flat_b)
    spear = spearmanr(flat_a, flat_b)

    dist_genes = np.zeros(A.shape[0])
    for i in range(A.shape[0]):
        dist_genes[i] = 1 - distance.correlation(A[i], B[i])
    # Constant rows/columns give non-finite correlations; leave them out.
    pg = np.average(dist_genes[np.isfinite(dist_genes)])

    dist_sample = np.zeros(A.shape[1])
    for i in range(A.shape[1]):
        dist_sample[i] = 1 - distance.correlation(A[:, i], B[:, i])
    ps = np.average(dist_sample[np.isfinite(dist_sample)])

    pc_dist = []
    if pc_n > 0:
        u0 = np.linalg.svd(A)[0]
        u = np.linalg.svd(B)[0]
        n_components = min(pc_n, u0.shape[1], u.shape[1])
        pc_dist = np.array([
            abs(1 - distance.cosine(u0[:, i], u[:, i]))
            for i in range(n_components)
        ])
    return p, spear[0], pg, ps, pc_dist
def white4D_functional():
    """Check that two offset 4-D grids of white noise are not strongly correlated.

    Two N**4 grids are sampled through ``combined(white, x/N, y/N)`` at
    random integer origins; the test asserts that the absolute Spearman rho
    of the flattened grids stays below 0.5.
    """
    print("Testing correlation for 4D white noise")
    N = 20
    # Origins are drawn in the order x1, y1, z1, w1, then the four offsets.
    x1, y1, z1, w1 = [randrange(-1000, 1000, 1) for _ in range(4)]
    dx, dy, dz, dw = [randrange(-1000, 1000, 1) for _ in range(4)]
    x2, y2, z2, w2 = x1 + dx, y1 + dy, z1 + dz, w1 + dw

    def sample_grid(x0, y0, z0, w0):
        # z and w only set the grid extent; the sampled coordinates use x and y.
        return [[[[combined(white, x/N, y/N)
                   for x in range(x0, x0 + N)]
                  for y in range(y0, y0 + N)]
                 for z in range(z0, z0 + N)]
                for w in range(w0, w0 + N)]

    values1 = sample_grid(x1, y1, z1, w1)
    values2 = sample_grid(x2, y2, z2, w2)
    rho = spearmanr(values1, values2, axis = None)
    assert abs(rho[0]) < 0.5
    print("rho = %s" % rho[0])
    print("\tNot signifying correlation found")
def run(self):
    """Evaluate every embedding model on every similarity dataset.

    For each ``(data, fn)`` entry in ``self.sim_datasets`` and each
    ``(model, fn)`` entry in ``self.e_models``, the model similarity of each
    word pair in ``data.pairs`` is collected and the Spearman correlation
    with the gold values is logged, together with the number of skipped
    pairs.
    """
    # ``items()`` replaces the Python 2-only ``iteritems()`` and behaves the
    # same for iteration on both major versions.
    for d_type, datasets in self.sim_datasets.items():
        for data, fn in datasets:
            logging.info(
                'testing on data {0} of type {1} ({2} pairs)'.format(
                    fn, d_type, len(data.pairs)))
            for e_type, models in self.e_models.items():
                for model, model_fn in models:
                    logging.info(
                        '\ttesting embedding {0} of type {1}'.format(
                            model_fn, e_type))
                    answers, gold_sims, oovs = [], [], 0
                    for (w1, w2), gold in data.pairs.items():
                        sim = model.get_sim(w1, w2)
                        # NOTE(review): any falsy similarity (including an
                        # exact 0.0) is counted as out-of-vocabulary; confirm
                        # that get_sim signals OOV this way.
                        if sim:
                            answers.append(sim)
                            gold_sims.append(gold)
                        else:
                            oovs += 1
                    corr = spearmanr(answers, gold_sims)
                    logging.info('Spearman correlation: {0}'.format(corr))
                    logging.info('pairs skipped (OOVs): {0}'.format(oovs))
def MA_RIBBON(df, ma_series):
    """Build moving-average ribbon indicators for every row of ``df``.

    One EMA of the ``close`` field is computed per length in ``ma_series``.
    From the first row index ``max(ma_series) - 1`` onwards, the row of EMA
    values is rank-correlated (Spearman) with the descending sequence
    ``len(ma_series) .. 1``, and the spread between the largest and smallest
    EMA is recorded.  Earlier rows stay NaN.

    Returns:
        DataFrame: columns ``MARIBBON_CORR`` and ``MARIBBON_PVAL`` (both
        scaled by 100), ``MARIBBON_DIST``, followed by the EMA series.
    """
    n_rows = len(df)
    n_ma = len(ma_series)
    ma_array = np.zeros([n_rows, n_ma])
    ema_list = []
    for idx, ma_len in enumerate(ma_series):
        ema_i = EMA(df, n = ma_len, field = 'close')
        ma_array[:, idx] = ema_i
        ema_list.append(ema_i)
    # ``np.nan`` replaces the ``np.NAN`` alias, which NumPy 2.0 removed.
    corr = np.full(n_rows, np.nan)
    pval = np.full(n_rows, np.nan)
    dist = np.full(n_rows, np.nan)
    max_n = max(ma_series)
    for idy in range(n_rows):
        if idy < max_n - 1:
            continue
        row = ma_array[idy, :]
        corr[idy], pval[idy] = stats.spearmanr(row, range(n_ma, 0, -1))
        dist[idy] = max(row) - min(row)
    corr_ts = pd.Series(corr*100, index = df.index, name = "MARIBBON_CORR")
    pval_ts = pd.Series(pval*100, index = df.index, name = "MARIBBON_PVAL")
    dist_ts = pd.Series(dist, index = df.index, name = "MARIBBON_DIST")
    return pd.concat([corr_ts, pval_ts, dist_ts] + ema_list, join='outer', axis=1)
def eval_sts(ycat, y, name, quiet=False):
    """Evaluate STS regression-classification predictions and report them.

    One-dimensional inputs are used as-is; anything else is first converted
    with ``loader.sts_categorical2labels``.  Unless ``quiet`` is set, the
    Pearson, Spearman and MSE figures are printed with ``name`` as prefix.

    Returns:
        STSRes: built from ``(pearson, spearman, mse)``.
    """
    ypred = ycat if ycat.ndim == 1 else loader.sts_categorical2labels(ycat)
    ygold = y if y.ndim == 1 else loader.sts_categorical2labels(y)

    pearson = pearsonr(ypred, ygold)[0]
    spearman = spearmanr(ypred, ygold)[0]
    error = mse(ypred, ygold)

    if not quiet:
        print('%s Pearson: %f' % (name, pearson,))
        print('%s Spearman: %f' % (name, spearman,))
        print('%s MSE: %f' % (name, error,))

    return STSRes(pearson, spearman, error)
def getCorrelation(model, words, f):
    """Correlate ``model.scoring_function`` outputs with gold scores.

    ``f`` is the path of a tab-separated file whose fields are
    (gold score, sentence 1, sentence 2).  When the first token of
    sentence 1 splits into exactly two parts on ``'_'``, the pair is
    converted with ``getSeqs2``; otherwise ``getSeqs`` is used.

    Returns:
        tuple: ``(pearson_r, spearman_rho)`` between predictions and golds.
    """
    golds = []
    seq1 = []
    seq2 = []
    # The context manager closes the file; the original leaked the handle.
    with open(f, 'r') as handle:
        for line in handle:
            fields = line.split("\t")
            p1, p2, score = fields[1], fields[2], float(fields[0])
            if len(p1.split()[0].split('_')) == 2:
                # The two extra sequences returned by getSeqs2 are not used here.
                X1, X2, _, _ = getSeqs2(p1, p2, words)
            else:
                X1, X2 = getSeqs(p1, p2, words)
            seq1.append(X1)
            seq2.append(X2)
            golds.append(score)
    x1, m1 = utils.prepare_data(seq1)
    x2, m2 = utils.prepare_data(seq2)
    scores = model.scoring_function(x1, x2, m1, m2)
    preds = np.squeeze(scores)
    return pearsonr(preds, golds)[0], spearmanr(preds, golds)[0]
def getCorrelation2(model, words, f):
    """Correlate ``model.scoring_function2`` outputs with gold scores.

    ``f`` is the path of a tab-separated file whose fields are
    (gold score, sentence 1, sentence 2).  Each pair is converted with
    ``getSeqs2``, which yields two primary and two secondary sequences;
    all four are batched with ``utils.prepare_data2`` before scoring.

    Returns:
        tuple: ``(pearson_r, spearman_rho)`` between predictions and golds.
    """
    golds = []
    seq1 = []
    seq2 = []
    sseq1 = []
    sseq2 = []
    # The context manager closes the file; the original leaked the handle.
    with open(f, 'r') as handle:
        for line in handle:
            fields = line.split("\t")
            p1, p2, score = fields[1], fields[2], float(fields[0])
            X1, X2, SX1, SX2 = getSeqs2(p1, p2, words)
            seq1.append(X1)
            seq2.append(X2)
            sseq1.append(SX1)
            sseq2.append(SX2)
            golds.append(score)
    x1, m1, s1 = utils.prepare_data2(seq1, sseq1)
    x2, m2, s2 = utils.prepare_data2(seq2, sseq2)
    scores = model.scoring_function2(x1, x2, m1, m2, s1, s2)
    preds = np.squeeze(scores)
    return pearsonr(preds, golds)[0], spearmanr(preds, golds)[0]
def spearman(self, dataset):
    """Spearman correlation between gold scores and ``self.sim`` scores.

    Args:
        dataset: non-empty list of ``[str, str, float]`` triples; only the
            first entry is inspected when validating the shape.

    Returns:
        The result of ``spearmanr(gold_scores, system_scores)``.

    Raises:
        TypeError: if ``dataset`` does not look like a list of triples.
    """
    well_formed = (
        isinstance(dataset, list)
        and len(dataset) != 0
        and len(dataset[0]) == 3
        and isinstance(dataset[0][2], float)
    )
    if not well_formed:
        raise TypeError('Dataset is not of correct type, list of [str, str, float] triples expected.')

    gold, system = [], []
    for first, second, gold_score in dataset:
        try:
            system_score = self.sim(first, second)
        except KeyError:
            # Pairs the similarity function cannot resolve are skipped.
            if self.reportMissing:
                print('Warning: Missing pair %s-%s - skipping' % (first, second))
            continue
        gold.append(gold_score)
        system.append(system_score)
    return spearmanr(gold, system)
def get_corr_func(method):
    """Return a two-argument correlation function for ``method``.

    Supported names are ``'pearson'``, ``'kendall'`` and ``'spearman'``;
    scipy is imported only for the latter two.  Any other name raises
    ``KeyError``.
    """
    needs_scipy = method in ['kendall', 'spearman']
    if needs_scipy:
        from scipy.stats import kendalltau, spearmanr

    def _pearson(a, b):
        return np.corrcoef(a, b)[0, 1]

    def _kendall(a, b):
        outcome = kendalltau(a, b)
        # kendalltau may hand back a (tau, p-value) tuple or a bare value.
        return outcome[0] if isinstance(outcome, tuple) else outcome

    def _spearman(a, b):
        rho = spearmanr(a, b)[0]
        return rho

    dispatch = dict(pearson=_pearson, kendall=_kendall, spearman=_spearman)
    return dispatch[method]
def evaluate1Word(wv, reference):
    """Evaluate single-word pairs of ``reference`` against ``wv``.

    ``reference`` is an iterable of ``(words, sim)`` entries where ``words``
    holds the two words of a pair.  Pairs containing a space in either word
    are ignored; pairs with a word missing from ``wv`` are counted and
    skipped.

    Returns:
        tuple: ``(rho, count)`` where rho is Spearman's rho and count is the
        number of reference word pairs that could be evaluated.
    """
    missing = 0
    gold_sims, predicted_sims = [], []
    for words, sim in sorted(reference, key=lambda ws: ws[1]):
        if " " in words[0] or " " in words[1]:
            continue
        try:
            v1, v2 = wv[words[0]], wv[words[1]]
        except KeyError:
            missing += 1
            continue
        gold_sims.append(sim)
        predicted_sims.append(cosine(v1, v2))
    rho, _ = spearmanr(gold_sims, predicted_sims)
    # %-formatting emits the same text as the former Python 2 print statement
    # and is valid on both Python 2 and 3.
    print("Word not found in WordVector %s" % missing)
    return (rho, len(gold_sims))
def _corrfunc(x, y, **kws):
    """Annotate the current axes with a correlation coefficient.

    The statistic is chosen by ``args.c`` (``'spearman'`` or ``'pearson'``);
    the value is also appended to the module-level ``correlations`` list.
    Solution from http://stackoverflow.com/a/30942817
    """
    if args.c == 'spearman':
        corr_type = 'Rho'
        r = stats.spearmanr(x, y)[0]
    elif args.c == 'pearson':
        corr_type = 'r'
        r = stats.pearsonr(x, y)[0]
    else:
        raise Exception('Invalid correlation statistic.')

    correlations.append(r)
    label = "{} = {:.2f}".format(corr_type, r)
    axes = plotter.plt.gca()
    axes.annotate(label, xy=(.1, .9), xycoords=axes.transAxes)
def get_feature_importance(feature):
    """Return Spearman's rho between ``feature`` and the training labels.

    The labels are the ``is_duplicate`` column of ``../data/train.csv``,
    which is re-read on every call.
    """
    import pandas as pd
    import scipy.stats as sps

    labels = pd.read_csv('../data/train.csv')['is_duplicate']
    rho = sps.spearmanr(feature, labels)[0]
    return rho


# import pickle
# pickle.dump(X_train,open("data_train.pkl", 'wb'), protocol=2)
#
# data_file=['test_deptree','test_glove_sim_dist','test_pca_glove',
#            'test_pca_pattern','test_w2w','test_pos','test_pca_char']
#
# path='../test/'
# for it in range(6):
#     tmp=[]
#     flist=[item+str(it) for item in data_file]
#     test=np.empty((400000,0))
#     if it==5:
#         test=np.empty((345796,0))
#     for f in flist:
#         test=np.hstack([test,pd.read_pickle(path+f+'.pkl')])
#     pickle.dump(test,open('data_test{0}.pkl'.format(it),'wb'),protocol=2)
def getCorrelation(model, words, f):
    """Correlate ``model.scoring_function`` outputs with gold scores.

    ``f`` is the path of a tab-separated file whose first three fields are
    (sentence 1, sentence 2, gold score).

    Returns:
        tuple: ``(pearson_r, spearman_rho)`` between predictions and golds.
    """
    golds = []
    seq1 = []
    seq2 = []
    # The context manager closes the file; the original leaked the handle.
    with open(f, 'r') as handle:
        for line in handle:
            fields = line.split("\t")
            p1, p2, score = fields[0], fields[1], float(fields[2])
            X1, X2 = getSeqs(p1, p2, words)
            seq1.append(X1)
            seq2.append(X2)
            golds.append(score)
    x1, m1 = utils.prepare_data(seq1)
    x2, m2 = utils.prepare_data(seq2)
    scores = model.scoring_function(x1, x2, m1, m2)
    preds = np.squeeze(scores)
    return pearsonr(preds, golds)[0], spearmanr(preds, golds)[0]
def validation_check():
    """Run the latest checkpoint on the validation split and log Spearman's rho.

    The model name is taken from the second double-quote-delimited field of
    the ``checkpoint`` file in ``hp.logdir``.  One line ``<model>\\t<score>``
    is appended to ``validation_results.txt`` inside ``hp.results``.  Only
    full batches are evaluated (``len(X) // hp.batch_size`` steps).
    """
    # Load graph
    g = Graph(is_training=False); print("Graph loaded")

    # Load data
    X, Y = load_data(mode="val")

    with g.graph.as_default():
        sv = tf.train.Supervisor()
        with sv.managed_session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
            # Restore parameters
            sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir)); print("Restored!")

            # Get model name; the context manager closes the checkpoint file,
            # which the original left open.
            with open(hp.logdir + '/checkpoint', 'r') as checkpoint:
                mname = checkpoint.read().split('"')[1]  # model name

            # Inference
            if not os.path.exists(hp.results): os.mkdir(hp.results)
            with open(os.path.join(hp.results, "validation_results.txt"), 'a') as fout:
                expected, predicted = [], []
                for step in range(len(X) // hp.batch_size):
                    x = X[step * hp.batch_size: (step + 1) * hp.batch_size]
                    y = Y[step * hp.batch_size: (step + 1) * hp.batch_size]

                    # predict intensities
                    logits = sess.run(g.logits, {g.x: x})

                    expected.extend(list(y))
                    predicted.extend(list(logits))

                # Get spearman coefficients
                score, _ = spearmanr(expected, predicted)
                fout.write("{}\t{}\n".format(mname, score))
def _calculate(self, input):
    """Return Spearman's rho between the first two columns of ``input``.

    Rows containing a NaN in any column are discarded first.
    """
    rows_with_nan = np.isnan(input).any(axis=1)
    complete = input[~rows_with_nan]
    first_column = complete[:, 0]
    second_column = complete[:, 1]
    rho = spearmanr(first_column, second_column)[0]
    return rho
def series_corr(word_year_series_1, word_year_series_2, i_year_words, start_year=1900, end_year=2000, series_1_norms=None, series_2_norms=None):
    """
    Gets the per-year correlation between the two word time series.

    Words are included even if they have values missing for a year, but
    their missing (NaN) values are excluded from the year in question.

    Args:
        word_year_series_1, word_year_series_2: mappings ``word -> year -> value``.
        i_year_words: either a mapping ``year -> words`` or a single word
            collection; when ``start_year`` is not contained in it, the same
            collection is used for every year.
        start_year, end_year: inclusive range of years.
        series_1_norms, series_2_norms: optional ``(offsets, scales)`` pairs
            indexed by year position; default to offset 0 and scale 1.

    Returns:
        tuple: ``(year_corrs, year_ps)``, the Spearman rho and p-value per year.
    """
    year_corrs = []
    year_ps = []
    years = range(start_year, end_year + 1)
    if start_year not in i_year_words:
        i_year_words = {year: i_year_words for year in years}
    # Identity checks: ``== None`` misbehaves for array-like norms.
    if series_1_norms is None:
        series_1_norms = ([0 for year in years], [1 for year in years])
    if series_2_norms is None:
        series_2_norms = ([0 for year in years], [1 for year in years])
    # enumerate() replaces the Python 2-only ``xrange`` index loop.
    for i, year in enumerate(years):
        s1 = []
        s2 = []
        for word in i_year_words[year]:
            if word not in word_year_series_1 or word not in word_year_series_2:
                continue
            value_1 = word_year_series_1[word][year]
            if np.isnan(value_1):
                continue
            value_2 = word_year_series_2[word][year]
            if np.isnan(value_2):
                continue
            s1.append((value_1 - series_1_norms[0][i]) / series_1_norms[1][i])
            s2.append((value_2 - series_2_norms[0][i]) / series_2_norms[1][i])
        corr, p = spearmanr(s1, s2)
        year_corrs.append(corr)
        year_ps.append(p)
    return year_corrs, year_ps
def get_scores(self):
    """Evaluate ``self.model`` on ``self.data_loader``.

    For every batch, the expected class value of both the label
    distribution and the exponentiated model output is computed by weighting
    the class indices ``1..NUM_CLASSES``; the summed KL divergence is
    accumulated alongside.

    Returns:
        tuple: ``([pearson_r, spearman_r, kl_div_loss], [metric names])``.
    """
    self.model.eval()
    num_classes = self.dataset_cls.NUM_CLASSES
    # One row of class indices 1..num_classes per example in a full batch.
    predict_classes = torch.arange(1, num_classes + 1).expand(self.batch_size, num_classes)
    test_kl_div_loss = 0
    predictions = []
    true_labels = []

    for batch in self.data_loader:
        output = self.model(batch.sentence_1, batch.sentence_2, batch.ext_feats)
        # NOTE(review): ``size_average`` and ``.data[0]`` are legacy torch
        # APIs; confirm the torch version this targets before changing them.
        test_kl_div_loss += F.kl_div(output, batch.label, size_average=False).data[0]
        # handle last batch which might have smaller size
        if len(predict_classes) != len(batch.sentence_1):
            predict_classes = torch.arange(1, num_classes + 1).expand(len(batch.sentence_1), num_classes)

        # A device of -1 is treated as "no GPU"; otherwise move the class
        # indices onto the configured CUDA device.
        if self.data_loader.device != -1:
            with torch.cuda.device(self.device):
                predict_classes = predict_classes.cuda()

        # presumably batch.label holds a per-class distribution and output
        # holds log-probabilities (hence ``exp``) — TODO confirm
        true_labels.append((predict_classes * batch.label.data).sum(dim=1))
        predictions.append((predict_classes * output.data.exp()).sum(dim=1))

        del output

    predictions = torch.cat(predictions).cpu().numpy()
    true_labels = torch.cat(true_labels).cpu().numpy()
    # Uses the last batch from the loop to reach the dataset size.
    test_kl_div_loss /= len(batch.dataset.examples)
    pearson_r = pearsonr(predictions, true_labels)[0]
    spearman_r = spearmanr(predictions, true_labels)[0]

    return [pearson_r, spearman_r, test_kl_div_loss], ['pearson_r', 'spearman_r', 'KL-divergence loss']
def spearman(y_true, y_pred):
    """
    Compute Spearman's rank correlation coefficient of ``y_true`` versus
    ``y_pred``.

    :param y_true: The true/actual/gold labels for the data.
    :type y_true: array-like of float
    :param y_pred: The predicted/observed labels for the data.
    :type y_pred: array-like of float

    :returns: Spearman's rank correlation coefficient if well-defined, else 0
    """
    rho = spearmanr(y_true, y_pred)[0]
    if np.isnan(rho):
        # The coefficient is undefined (NaN), e.g. for constant input.
        return 0.0
    return rho
def compare_distances(A, B, random_samples=[], s=200, pvalues=False):
    """Correlate pairwise sample distances computed from ``A`` and ``B``.

    Args:
        A, B: 2-D arrays whose columns are samples.
        random_samples: boolean mask (or index) selecting columns.  When
            empty, ``min(s, n_columns)`` columns are picked at random using
            NumPy's global RNG.  The default list is never mutated.
        s: number of columns to sample when no mask is given.
        pvalues: return the full Pearson/Spearman results instead of only
            the coefficients.

    Returns:
        tuple: ``(pearson, spearman)`` coefficients, or the full result
        objects when ``pvalues`` is true.
    """
    if len(random_samples) == 0:
        # Builtin ``bool`` replaces ``np.bool``, removed in NumPy 1.24.
        random_samples = np.zeros(A.shape[1], dtype=bool)
        random_samples[:min(s, A.shape[1])] = True
        np.random.shuffle(random_samples)
    dist_x = distance.pdist(A[:, random_samples].T, 'euclidean')
    dist_y = distance.pdist(B[:, random_samples].T, 'euclidean')
    pear = pearsonr(dist_x, dist_y)
    spear = spearmanr(dist_x, dist_y)
    if pvalues:
        return pear, spear
    return pear[0], spear[0]
def calAvgSimC(test_score, senseVec1, senseScore1, senseVec2, senseScore2):
    """Spearman's rho between ``test_score`` and weighted average sense similarity.

    For each item ``t`` the similarity is the sum, over every pair of sense
    vectors ``(i, j)``, of ``(1 - cosine(v1_i, v2_j)) * p1_i * p2_j`` where
    ``p1``/``p2`` are the sense scores of the two words.
    """
    assert(len(senseVec1)==len(senseVec2))
    avgCos = []
    # ``range`` replaces the Python 2-only ``xrange``.
    for t in range(len(senseVec1)):
        p1 = senseScore1[t]
        p2 = senseScore2[t]
        thisCos = [
            (1 - cosine(vec_a, vec_b)) * p1[i] * p2[j]
            for i, vec_a in enumerate(senseVec1[t])
            for j, vec_b in enumerate(senseVec2[t])
        ]
        avgCos.append(np.sum(thisCos))
    return spearmanr(test_score, avgCos)[0]
def calMaxSimC(test_score, senseVec1, senseScore1, senseVec2, senseScore2):
    """Spearman's rho between ``test_score`` and top-sense similarity.

    For each item only the highest-scoring sense of each word (``argmax`` of
    its sense scores) is used, and the similarity is ``1 - cosine`` of those
    two sense vectors.
    """
    assert(len(senseVec1)==len(senseVec2))
    avgCos = []
    # ``range`` replaces the Python 2-only ``xrange``.
    for t in range(len(senseVec1)):
        i = np.argmax(senseScore1[t])
        j = np.argmax(senseScore2[t])
        avgCos.append(1 - cosine(senseVec1[t][i], senseVec2[t][j]))
    return spearmanr(test_score, avgCos)[0]
def white2D_functional():
    """Check that two offset 2-D grids of white noise are not strongly correlated.

    Two N x N grids are sampled through ``combined(white, x/N, y/N)`` at
    random integer origins; the test asserts that the absolute Spearman rho
    of the flattened grids stays below 0.5.
    """
    print("Testing correlation for 2D white noise")
    N = 100
    # Origins are drawn in the order x1, y1, then the two offsets.
    x1, y1 = [randrange(-1000, 1000, 1) for _ in range(2)]
    dx, dy = [randrange(-1000, 1000, 1) for _ in range(2)]
    x2, y2 = x1 + dx, y1 + dy

    def sample_grid(x0, y0):
        return [[combined(white, x/N, y/N)
                 for x in range(x0, x0 + N)]
                for y in range(y0, y0 + N)]

    values1 = sample_grid(x1, y1)
    values2 = sample_grid(x2, y2)
    rho = spearmanr(values1, values2, axis = None)
    assert abs(rho[0]) < 0.5
    print("rho = %s" % rho[0])
    print("\tNot signifying correlation found")
def white3D_functional():
    """Check that two offset 3-D grids of white noise are not strongly correlated.

    Two N**3 grids are sampled through ``combined(white, x/N, y/N)`` at
    random integer origins; the test asserts that the absolute Spearman rho
    of the flattened grids stays below 0.5.
    """
    print("Testing correlation for 3D white noise")
    N = 100
    # Origins are drawn in the order x1, y1, z1, then the three offsets.
    x1, y1, z1 = [randrange(-1000, 1000, 1) for _ in range(3)]
    dx, dy, dz = [randrange(-1000, 1000, 1) for _ in range(3)]
    x2, y2, z2 = x1 + dx, y1 + dy, z1 + dz

    def sample_grid(x0, y0, z0):
        # z only sets the grid extent; the sampled coordinates use x and y.
        return [[[combined(white, x/N, y/N)
                  for x in range(x0, x0 + N)]
                 for y in range(y0, y0 + N)]
                for z in range(z0, z0 + N)]

    values1 = sample_grid(x1, y1, z1)
    values2 = sample_grid(x2, y2, z2)
    rho = spearmanr(values1, values2, axis = None)
    assert abs(rho[0]) < 0.5
    print("rho = %s" % rho[0])
    print("\tNot signifying correlation found")
def spearman_scorer(estimator, X, y):
    """Score ``estimator`` by the Spearman correlation of its predictions.

    Args:
        estimator: object exposing ``predict``.
        X: feature data passed to ``estimator.predict``.
        y: gold targets the predictions are compared with.

    Returns:
        The full ``spearmanr`` result (correlation and p-value).
    """
    logging.info('predicting ...')
    # Predict from the features; the original passed the targets ``y`` to
    # ``predict``, so the estimator never saw ``X``.
    predicted = estimator.predict(X)
    # NOTE(review): callers that need a scalar score must take element [0].
    return spearmanr(list(predicted), y)
def test():
    """Train a similarity regressor on GloVe features and report Spearman's rho.

    Command-line arguments (``sys.argv``):
        1: data file with ``|||``-separated ``word1|||word2|||score`` lines
        2: path handed to ``GloveEmbedding``
        3: embedding dimension
        4: number of training epochs
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s : " +
        "%(module)s (%(lineno)s) - %(levelname)s - %(message)s")
    # The context manager closes the data file, which the original leaked.
    with open(sys.argv[1]) as handle:
        data = [((f[0], f[1]), float(f[2])) for f in
                [line.strip().split("|||") for line in handle]]
    # %-formatting reproduces the output of the former Python 2 print
    # statements and is valid on both Python 2 and 3.
    print("sample data: %s" % (data[:3],))
    train_data, devel_data, test_data = cut(data)
    logging.info('loading model...')
    glove_embedding = GloveEmbedding(sys.argv[2])
    logging.info('done!')
    dim = int(sys.argv[3])
    X_train = featurize(train_data, glove_embedding, dim)
    Y_train = np.array([e[1] for e in train_data])
    logging.info("Input shape: {0}".format(X_train.shape))
    print(X_train[:3])
    logging.info("Label shape: {0}".format(Y_train.shape))
    print(Y_train[:3])
    input_dim = X_train.shape[1]
    output_dim = 1
    model = create_model(input_dim, output_dim)
    model.fit(X_train, Y_train, nb_epoch=int(sys.argv[4]), batch_size=32)
    X_devel = featurize(devel_data, glove_embedding, dim)
    Y_devel = np.array([e[1] for e in devel_data])
    pred = model.predict_proba(X_devel, batch_size=32)
    corr = spearmanr(pred, Y_devel)
    print("Spearman's R: {0}".format(corr))
def evaluate(model, dev_data):
    """Print the Spearman correlation of ``model`` predictions on ``dev_data``.

    ``model.predict_proba`` is called on ``dev_data.data`` with a batch size
    of 32 and compared against ``dev_data.labels``.
    """
    pred = model.predict_proba(dev_data.data, batch_size=32)
    corr = spearmanr(pred, dev_data.labels)
    # Call form of print: valid on Python 2 and 3 for a single argument.
    print("Spearman's R: {0}".format(corr))
def ma_ribbon(df, ma_series):
    """Update the moving-average ribbon indicators on the last row of ``df``.

    For each length in ``ma_series`` the EMA of ``close`` is refreshed via
    ``ema`` and the last value of column ``EMA_CLOSE_<len>`` is collected.
    These values are rank-correlated (Spearman) with the descending sequence
    ``len(ma_series) .. 1``; the correlation and p-value (scaled by 100) and
    the max-min spread are written to the last entry of the
    ``MARIBBON_CORR`` / ``MARIBBON_PVAL`` / ``MARIBBON_DIST`` columns.
    """
    # One slot per moving average.  The original sized this by ``len(df)``,
    # which mismatched the reference ranks passed to ``spearmanr`` and padded
    # the spread with zeros whenever ``len(df) != len(ma_series)``.
    ma_array = np.zeros([len(ma_series)])
    for idx, ma_len in enumerate(ma_series):
        key = 'EMA_CLOSE_' + str(ma_len)
        ema(df, ma_len, field = 'close')
        ma_array[idx] = df[key][-1]
    corr, pval = stats.spearmanr(ma_array, range(len(ma_series), 0, -1))
    dist = max(ma_array) - min(ma_array)
    # NOTE(review): these are chained assignments; confirm they still write
    # through under the pandas version in use.
    df["MARIBBON_CORR"][-1] = corr * 100
    df["MARIBBON_PVAL"][-1] = pval * 100
    df["MARIBBON_DIST"][-1] = dist
def _ordinal_ranks(scores):
    """Return the 0-based rank of every score; ties keep input order."""
    by_score = sorted(enumerate(scores), key=lambda pair: pair[1])
    by_position = sorted(
        ((position, rank) for rank, (position, _) in enumerate(by_score)),
        key=lambda pair: pair[0])
    return np.array(by_position)[:, 1]


def getSpearmanr(infile):
    """Spearman correlation between columns 3 and 4 of a tab-separated file.

    Both columns are parsed as floats and converted to ordinal ranks (ties
    broken by line order, not averaged) before ``spearmanr`` is applied.

    Returns:
        tuple: ``(rho, pval)``.
    """
    x_scores = []
    y_scores = []
    # The context manager closes the file; the original leaked the handle.
    with open(infile, 'r') as handle:
        for line in handle:
            words = line.strip('\n').split('\t')
            x_scores.append(float(words[2]))
            y_scores.append(float(words[3]))
    rho, pval = spearmanr(_ordinal_ranks(x_scores), _ordinal_ranks(y_scores))
    return rho, pval
def calc_correl(self, dev_pred, test_pred):
    """Correlate dev/test predictions with the stored original targets.

    Returns:
        tuple: ``(dev_pearson, test_pearson, dev_spearman, test_spearman,
        dev_kendall, test_kendall)``.
    """
    dev_gold = self.dev_y_org
    test_gold = self.test_y_org

    dev_prs = pearsonr(dev_pred, dev_gold)[0]
    test_prs = pearsonr(test_pred, test_gold)[0]
    dev_spr = spearmanr(dev_pred, dev_gold)[0]
    test_spr = spearmanr(test_pred, test_gold)[0]
    dev_tau = kendalltau(dev_pred, dev_gold)[0]
    test_tau = kendalltau(test_pred, test_gold)[0]

    return dev_prs, test_prs, dev_spr, test_spr, dev_tau, test_tau
def check_similarity_match(X_embed, S):
    """
    Measure how well the linear kernel of an embedding reproduces S.

    SimEcs are meant to embed the data so that the target similarities are
    approximated linearly, i.e. X_embed * X_embed^T = S.  Both matrices are
    divided by the absolute maximum of S so that results are comparable
    across similarity measures.

    Inputs:
        - X_embed: Nxd matrix with coordinates in the embedding space
        - S: NxN matrix with target similarities (apply whatever
             transformations were done before using it as input to the
             SimEc, e.g. centering, etc.)
    Returns:
        - msqe, rho, r: mean squared error, Spearman and Pearson correlation
                        coefficient between the linear kernel of the
                        embedding and the target similarities (the mean
                        squared error is more exact, the correlation
                        coefficients are a more relaxed error measure)
    """
    # linear kernel of the embedding = approximated similarities
    S_approx = X_embed.dot(X_embed.T)
    # normalise both matrices by the largest absolute target similarity
    scale = np.max(np.abs(S))
    S_norm = S/scale
    S_approx /= scale
    target_flat = S_norm.flatten()
    approx_flat = S_approx.flatten()
    # mean squared error
    msqe = np.mean((S_norm - S_approx) ** 2)
    # Spearman correlation coefficient
    rho = spearmanr(target_flat, approx_flat)[0]
    # Pearson correlation coefficient
    r = pearsonr(target_flat, approx_flat)[0]
    return msqe, rho, r
def compute_score(self, conf, hy):
    """Store regression metrics for the predictions ``hy`` in ``conf``.

    Writes ``_r2``, ``_spearmanr`` and ``_pearsonr`` (each computed against
    ``self.test_y``) and copies the metric selected by ``self.score`` into
    ``_score``.
    """
    truth = self.test_y
    conf['_r2'] = r2_score(truth, hy)
    rho = spearmanr(truth, hy)[0]
    conf['_spearmanr'] = rho
    pearson = pearsonr(truth, hy)[0]
    conf['_pearsonr'] = pearson
    selected_key = '_' + self.score
    conf['_score'] = conf[selected_key]
# NOTE(review): the default ``report`` opens (and truncates) 'temp.txt' once
# at definition time and is never closed; it is kept unchanged so existing
# callers see the same behaviour.
def profile(filepath, n, exact=True, save=False, verbose=True, use_gpu=False, report=open('temp.txt', 'w')):
    """Time preprocessing and querying of each PPR model and report accuracy.

    Args:
        filepath: graph file handed to the models and to ``solve``.
        n: size parameter forwarded to ``solve`` and to the GPU models.
        exact: use drop tolerance 0 when true, ``None`` otherwise.
        save: persist each model under ``models/<alias>.ppr``.
        verbose: forwarded to ``solve`` and the models.
        use_gpu: profile the TF model classes instead of the CPU ones.
        report: writable text file receiving the measurements.
    """
    tol = 0 if exact else None

    solpath = 'data/{}_sol.dat'.format(filepath2name(filepath))
    if not os.path.isfile(solpath):
        solve(filepath, n, seed=0, verbose=verbose)
    # NOTE: pickle must only be used on trusted files.  The context manager
    # closes the solution file, which the original left open.
    with open(solpath, 'rb') as solution_file:
        q, r, ranks = pickle.load(solution_file)

    if use_gpu:
        model_classes = [PPRIterativeTF, PPRLUDecompositionTF, PPRBearTF]
    else:
        model_classes = [PPRIterative, PPRLUDecomposition, PPRBear]

    for model_class in model_classes:
        with tf.Session() as sess:
            start = time.time()
            if use_gpu:
                model = model_class(sess, n, filepath, drop_tol=tol, verbose=verbose)
            else:
                model = model_class(drop_tol=tol, verbose=verbose)
                model.preprocess(filepath)
            end = time.time()
            if use_gpu:
                sess.run(tf.global_variables_initializer())
            elapsed = end - start

            if save:
                model.save('models/{}.ppr'.format(model.alias))

            print("[{}]({},{},n={})".format(model.alias, 'gpu' if use_gpu else 'cpu', 'exact' if exact else 'apprx', n), file=report)
            print("preprocess\t{}".format(elapsed), file=report)

            start = time.time()
            r_ = model.query(q)
            end = time.time()
            elapsed = end - start
            print("query time\t{}".format(elapsed), file=report)

            ranks_ = pr2ranks(r_)
            spearman = spearmanr(ranks, ranks_)
            # Normalise the scores to sum to 1 before comparing with ``r``.
            r_ = r_ / r_.sum()

            print("diff norm\t{}".format(norm(r - r_)), file=report)
            print("cosine sim\t{}".format(r.dot(r_) / norm(r) / norm(r_)), file=report)
            print("spearman corr\t{}".format(spearman.correlation), file=report)
            print("", file=report)
def test_corr_rank(self):
    """``Series.corr`` must agree with scipy and with reference values from R."""
    tm._skip_if_no_scipy()

    import scipy
    import scipy.stats as stats

    # kendall and spearman
    A = tm.makeTimeSeries()
    B = tm.makeTimeSeries()
    A[-5:] = A[:5]
    for method, reference in (('kendall', stats.kendalltau),
                              ('spearman', stats.spearmanr)):
        result = A.corr(B, method=method)
        expected = reference(A, B)[0]
        self.assertAlmostEqual(result, expected)

    # these methods got rewritten in 0.8
    if scipy.__version__ < LooseVersion('0.9'):
        raise nose.SkipTest("skipping corr rank because of scipy version "
                            "{0}".format(scipy.__version__))

    # results from R
    A = Series(
        [-0.89926396, 0.94209606, -1.03289164, -0.95445587, 0.76910310,
         -0.06430576, -2.09704447, 0.40660407, -0.89926396, 0.94209606])
    B = Series(
        [-1.01270225, -0.62210117, -1.56895827, 0.59592943, -0.01680292,
         1.17258718, -1.06009347, -0.10222060, -0.89076239, 0.89372375])
    kexp = 0.4319297
    sexp = 0.5853767
    for method, expected in (('kendall', kexp), ('spearman', sexp)):
        self.assertAlmostEqual(A.corr(B, method=method), expected)
def test_nancorr_spearman(self):
    """``nanops.nancorr(method='spearman')`` must match scipy on 2-D and 1-D data."""
    tm.skip_if_no_package('scipy.stats')
    from scipy.stats import spearmanr

    left_2d, right_2d = self.arr_float_2d, self.arr_float1_2d
    targ0 = spearmanr(left_2d, right_2d)[0]
    targ1 = spearmanr(left_2d.flat, right_2d.flat)[0]
    self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1,
                                 method='spearman')

    left_1d, right_1d = self.arr_float_1d, self.arr_float1_1d
    targ0 = spearmanr(left_1d, right_1d)[0]
    targ1 = spearmanr(left_1d.flat, right_1d.flat)[0]
    self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1,
                                 method='spearman')
def spearman(x, y):
    """Return only the correlation coefficient from ``stats.spearmanr``."""
    outcome = stats.spearmanr(x, y)
    return outcome[0]


###########################################
# Start
###########################################
def forward(self, bottom, top):
    """Compute the SROCC and LCC and write them to ``top``.

    ``bottom[0]`` holds the predictions and ``bottom[1]`` the labels; each is
    reshaped to one dimension of length ``shape[0]``.  Spearman's rho goes to
    ``top[0]`` and Pearson's r to ``top[1]``.
    """
    def as_vector(blob):
        values = blob.data
        return np.reshape(values, values.shape[0])

    predictions = as_vector(bottom[0])
    labels = as_vector(bottom[1])
    srocc = stats.spearmanr(predictions, labels)[0]
    top[0].data[...] = srocc
    lcc = stats.pearsonr(predictions, labels)[0]
    top[1].data[...] = lcc
def comp_corr(df, ptype):
    """Print coverage and Spearman's rho between a ``p`` column and span length.

    For ``ptype == 'begin'`` the column ``p00`` is used, otherwise ``p11``.
    Only rows whose chosen column exceeds 2 are kept.  The printed line holds
    the fraction of rows kept and the correlation between the chosen column
    and ``span_length``.
    """
    column = 'p00' if ptype == 'begin' else 'p11'
    valid_df = df[df[column] > 2]
    lengths = valid_df['span_length'].values
    plengths = valid_df[column].values
    coverage = float(len(valid_df)) / len(df)
    rho = stats.spearmanr(plengths, lengths)[0]
    # %-formatting keeps the "a \t b" layout of the former Python 2 print
    # statement and is valid on both Python 2 and 3.
    print("%s \t %s" % (coverage, rho))
def _correlation(self, output, score):
    """Return ``[spearmanr result, pearsonr result]`` for ``output`` vs ``score``."""
    rank_result = spearmanr(output, score)
    linear_result = pearsonr(output, score)
    return [rank_result, linear_result]
def do_spearmanr(list1, list2, alpha=0.05):
    """Return Spearman's rho when significant at ``alpha``, else ``'n.s.'``.

    The coefficient is returned only if its p-value is strictly below
    ``alpha``.
    """
    rho, p_value = spearmanr(list1, list2)
    return rho if p_value < alpha else 'n.s.'
def calcroh(file_name):
    """Spearman correlation between columns 3 and 4 of a whitespace-separated file.

    Returns:
        The ``spearmanr`` result for the human scores (third column) against
        the predicted scores (fourth column).
    """
    human_list = []
    pred_list = []
    with open(file_name) as i_f:
        for line in i_f:
            fields = line.strip().split()
            # Parse as numbers: the original passed raw strings, which rank
            # lexicographically ("10.0" before "9.0") instead of numerically.
            human_list.append(float(fields[2]))
            pred_list.append(float(fields[3]))
    return spearmanr(human_list, pred_list)
def cal_spear(text):
    """Spearman correlation between columns 3 and 4 of a whitespace-separated file.

    ``text`` is the path of the file.

    Returns:
        The ``spearmanr`` result for the third column against the fourth.
    """
    list_1 = []
    list_2 = []
    with open(text) as i_f:
        for line in i_f:
            fields = line.strip().split()
            # Parse as numbers: the original passed raw strings, which rank
            # lexicographically ("10.0" before "9.0") instead of numerically.
            list_1.append(float(fields[2]))
            list_2.append(float(fields[3]))
    return spearmanr(list_1, list_2)
def generate_indicator_(gram_q1, gram_q2, N):
    """Build positional overlap indicators for two aligned n-gram collections.

    For each of the first ``N`` rows, position ``j`` of the first matrix is 1
    when the ``j``-th gram of ``gram_q1[i]`` also occurs in ``gram_q2[i]``,
    and symmetrically for the second matrix.  Both matrices are ``N`` rows
    wide enough for the longest gram list on either side.

    Returns:
        tuple: ``(q1_indicator, q2_indicator)``.
    """
    longest_q1 = max(list(map(len, gram_q1)))
    longest_q2 = max(list(map(len, gram_q2)))
    width = max(longest_q1, longest_q2)
    q1_indicator = np.zeros((N, width))
    q2_indicator = np.zeros((N, width))

    def mark_shared(indicator, row, own_grams, other_grams):
        for position, gram in enumerate(own_grams):
            if gram in other_grams:
                indicator[row, position] = 1

    for i in tqdm(np.arange(N)):
        mark_shared(q1_indicator, i, gram_q1[i], gram_q2[i])
        mark_shared(q2_indicator, i, gram_q2[i], gram_q1[i])
    return q1_indicator, q2_indicator
# sps.spearmanr(q1_indicator[:,1],y_train)[0]
def calc_dis_jarccard2(neighs, neighs2):
    """Aggregate Jaccard coefficients between two neighbour sets.

    Every pair ``(i, j)`` with ``i`` from ``neighs`` and ``j`` from
    ``neighs2`` is compared, skipping identical ids and ids missing from
    ``index_q``.  The lower-cased, whitespace-split token sets of the two
    texts are fed to ``dist_utils._jaccard_coef``.

    Returns:
        list: mean, std, max, min and median of the coefficients, in that
        order; ``-1`` for each entry when no pair could be compared or the
        aggregation fails.
    """
    sim_fea = []
    for i in neighs:
        for j in neighs2:
            if i == j:
                continue
            if (j in index_q) and (i in index_q):
                q_str = index_q[i]
                nei_str = index_q[j]
                s1 = set(q_str.lower().split())
                s2 = set(nei_str.lower().split())
                sim_fea.append(dist_utils._jaccard_coef(s1, s2))
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    aggregator = [None if m == "" else getattr(np, m) for m in aggregation_mode]
    score = []
    for agg in aggregator:
        if len(sim_fea) == 0:
            # The original set this sentinel and then overwrote it with
            # agg([]), yielding NaN for mean/std/median.
            s = -1
        else:
            try:
                s = agg(sim_fea)
            except Exception:
                # Best-effort: keep the sentinel when aggregation fails.
                s = -1
        score.append(s)
    return score
# sps.spearmanr(train_fea,train['is_duplicate'])[0]
def drop_feature(data):
    """Find columns that are strongly rank-correlated with an earlier column.

    Args:
        data: 2-D array whose columns are features.

    Returns:
        set: indices ``j`` for which some column ``i < j`` has an absolute
        Spearman correlation above 0.8 with column ``j``.
    """
    n_features = data.shape[1]
    drop_list = set()
    for i in range(n_features):
        # Start at i + 1: comparing a column with itself always gives 1.0,
        # which made the original flag every non-constant feature.
        for j in range(i + 1, n_features):
            s = sps.spearmanr(data[:, i], data[:, j])[0]
            if abs(s) > 0.8:
                drop_list.add(j)
    return drop_list
#select imp feature