We have extracted the following 49 code examples from open-source Python projects to illustrate how to use sklearn.neighbors.NearestNeighbors().
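Before the project examples, here is a minimal sketch of the pattern nearly all of them follow: fit NearestNeighbors on a feature matrix, then call kneighbors() on query points to obtain distances and indices. The toy array below is invented purely for illustration and does not come from any of the projects listed.

# Minimal usage sketch (toy data, for illustration only).
import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [5.0, 5.0]])  # reference points
nn = NearestNeighbors(n_neighbors=2).fit(X)                     # build the index
distances, indices = nn.kneighbors([[0.1, 0.1]])                # query one point
# indices -> rows of X closest to the query; distances -> matching Euclidean distances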
def computeNeighboursScores(self):
    all_instances = self.iteration.datasets.instances
    # Connectivity matrix
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', NearestNeighbors(self.num_neighbours, n_jobs=-1))])
    pipeline.fit(all_instances.getFeatures())
    # Labels
    labels = np.array([generateLabel(x) for x in all_instances.getLabels()])
    # Compute neighbour scores
    scores = []
    all_neighbours = pipeline.named_steps['model'].kneighbors(return_distance=False)
    for i, label in enumerate(labels):
        if label != 0:
            continue
        else:
            neighbours = all_neighbours[i]
            score = sum(labels[neighbours] + 1) / (2.0 * self.num_neighbours)
            scores.append(score)
    return np.array(scores)
def getpossibleedges(datapointwts, seeds):
    # datapointwts = densify(datapointwts);
    X = [(xx[0], xx[1]) for xx in datapointwts]
    S = [(xx[0], xx[1]) for xx in seeds]
    cluster = {}; p2cluster = []; gedges = {}; gedges1 = {}; nedges = {}
    nbrs = NearestNeighbors(n_neighbors=5, algorithm='ball_tree').fit(S)
    distances, indices = nbrs.kneighbors(X)
    for cd in range(len(seeds)):
        cluster[cd] = []
    for ii, ll in enumerate(indices):
        dd = [taxidist(seeds[xx], datapointwts[ii][:-1], theta) for xx in ll]
        cd = ll[dd.index(min(dd))]
        cluster[cd].append(datapointwts[ii])
        p2cluster.append(cd)
    for ii, xx in enumerate(datapointwts):
        if ii > 1:
            if datapointwts[ii-1][-1] < datapointwts[ii][-1] and datapointwts[ii-1][-1] > datapointwts[ii][-1] - 11:
                cd1 = p2cluster[ii-1]; cd2 = p2cluster[ii]
                if not cd1 == cd2:
                    gedges1[(cd1, cd2)] = gedges1.get((cd1, cd2), 0) + 1
    return gedges1
def point2cluster(datapointwts, seeds, theta):
    cluster = {}; p2cluster = []; gedges = {}; gedges1 = {}; nedges = {}
    std = {}; seeds1 = []; seedweight = []
    X = [(lonconst * xx[0], latconst * xx[1], theta / 180 * xx[2]) for xx in datapointwts]
    S = [(lonconst * xx[0], latconst * xx[1], theta / 180 * xx[2]) for xx in seeds]
    Xrot = [(lonconst * xx[0], latconst * xx[1], theta / 180 * (xx[2] % 360)) for xx in datapointwts]
    Srot = [(lonconst * xx[0], latconst * xx[1], theta / 180 * (xx[2] % 360)) for xx in seeds]
    for cd in range(len(seeds)):
        cluster[cd] = []
    nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(S)
    distances, indices = nbrs.kneighbors(X)
    nbrsrot = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(Srot)
    distancesrot, indicesrot = nbrsrot.kneighbors(Xrot)
    for ii, ll in enumerate(indices):
        # print(distances[ii], distancesrot[ii], ll, indices[ii], indicesrot[ii])
        cd = indicesrot[ii][0]
        if distances[ii][0] < distancesrot[ii][0]:
            cd = indices[ii][0]
        # print(cd, distances[ii], distancesrot[ii], ll, indices[ii], indicesrot[ii])
        cluster[cd].append(datapointwts[ii])
        p2cluster.append(cd)
    return (cluster, p2cluster)
def splitclustersparallel(datapointwts, seeds):
    X = [(xx[0], xx[1]) for xx in datapointwts]
    S = [(xx[0], xx[1]) for xx in seeds]
    cluster = {}; p2cluster = []; gedges = {}; gedges1 = {}; nedges = {}
    std = {}; seeds1 = []; seedweight = []; roadwidth = []
    nbrs = NearestNeighbors(n_neighbors=20, algorithm='ball_tree').fit(S)
    distances, indices = nbrs.kneighbors(X)
    for cd in range(len(seeds)):
        cluster[cd] = []
        roadwidth.append(0)
    for ii, ll in enumerate(indices):
        dd = [taxidist(seeds[xx], datapointwts[ii][:-1], theta) for xx in ll]
        cd = ll[dd.index(min(dd))]
        cluster[cd].append(datapointwts[ii])
        p2cluster.append(cd)
    for cl in cluster:
        mang = seeds[cl][-1]
        scl = seeds[cl]
        if len(cluster[cl]) > 10:
            std[cl] = np.percentile([angledist(xx[2], mang) for xx in cluster[cl]], 90)
            roadwidth[cl] = 1 + 5 * np.std([geodist(scl, xx) * np.sin(anglebetweentwopoints(scl, xx) - scl[-1]) for xx in cluster[cl]])
            print(cl, scl, [(anglebetweentwopoints(scl, xx), scl[-1]) for xx in cluster[cl]])
def median_kneighbour_distance(X, k=5):
    """Calculate the median kneighbor distance.

    Find the distance between a set of random datapoints and their kth
    nearest neighbours. This is a heuristic for setting the kernel
    length scale.
    """
    N_all = X.shape[0]
    k = min(k, N_all)
    N_subset = min(N_all, 2000)
    sample_idx_train = np.random.permutation(N_all)[:N_subset]
    nn = neighbors.NearestNeighbors(k)
    nn.fit(X[sample_idx_train, :])
    d, idx = nn.kneighbors(X[sample_idx_train, :])
    return np.median(d[:, -1])
def cosine_knn(corpus_vector, queries_vector, k=10):
    """
    :param corpus_vector: vectorized document text
    :param queries_vector: vectorized query text
    :param k: number of neighbours
    :return: (distances, indices) of knn
    """
    # based on
    # http://scikit-learn.org/stable/modules/neighbors.html
    # http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html
    # since we want to use cosine similarity to account for document length
    # we have to use bruteforce search
    # parallelize to number of cores with n_jobs -1
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='cosine')
    nbrs.fit(corpus_vector)
    distances, indices = nbrs.kneighbors(queries_vector)
    return distances, indices
def index(self, metric='cosine'):
    """
    Build a nearest neighbor retrieval index to perform similarity
    lookups and analogies

    Arguments:
        metric: string, or sklearn compatible callable

    Returns:
        self

    Raises:
        TokenContainerException if no pretrained vectors have been loaded
    """
    if self.W is not None:
        alg = 'brute' if (metric == 'cosine') else 'auto'
        from sklearn.neighbors import NearestNeighbors
        self._nn = NearestNeighbors(metric=metric, algorithm=alg)
        self._nn.fit(self.W)
    else:
        raise TokenContainerException(
            'cannot build similarity on vectorless structure'
        )
    return self
def extract_lab_histogram(mode, clusters):
    nn = neighbors.NearestNeighbors(n_neighbors=1)
    nn.fit(clusters)
    out_filename = mode + '_color'
    try:
        os.remove(out_filename)
    except:
        pass
    out = open(out_filename, 'ab')
    cnt = 0
    with open(mode + '_list') as f:
        for line in f:
            line = line[:-1]
            image = cv2.imread(line)
            image = cv2.resize(image, (100, 100))
            image = cv2.cvtColor(image, cv2.COLOR_BGR2Lab)
            points = image.reshape((-1, 3))
            cn = nn.kneighbors(points)
            hist = np.histogram(cn[1], bins=50, range=(1, 50))[0]
            hist.tofile(out)
            cnt = cnt + 1
            if cnt % 1000 == 0:
                print(cnt)
def buildNNDataStructure(self):
    """Builds a nearest neighbor data structure.  User doesn't need to
    call this unless the self.problems attribute was changed manually."""
    if len(self.problemFeatures) == 0 or len(self.featureNames) == 0:
        return
    try:
        from sklearn.neighbors import NearestNeighbors, BallTree
        from scipy.spatial import KDTree
        with self.lock:
            try:
                farray = self.problemFeatures.array
            except AttributeError:
                farray = np.array(self.problemFeatures.items)
            if self.metricTransform is not None:
                farray = np.dot(farray, self.metricTransform)
            #self.nn = NearestNeighbors(n_neighbors=1,algorithm="auto").fit(farray)
            self.nn = BallTree(farray)
            #self.nn = KDTree(farray)
            self.nnBuildSize = len(self.problemFeatures)
    except ImportError:
        print "IKDatabase: Warning, scikit-learn is not installed, queries will be much slower"
        with self.lock:
            self.nn = None
            self.nnBuildSize = 0
    return
def __init__(self, x, ys):
    import numpy as np
    from sklearn.neighbors import NearestNeighbors

    #print x, ys
    CI = np.array([x.checksum.get_signature_entropy(), x.checksum.get_entropy()])
    #print CI, x.get_info()
    #print

    for i in ys:
        CI = np.vstack((CI, [i.checksum.get_signature_entropy(), i.checksum.get_entropy()]))

    #idx = 0
    #for i in np.array(CI)[1:]:
    #    print idx+1, i, ys[idx].get_info()
    #    idx += 1

    self.neigh = NearestNeighbors(2, 0.4)
    self.neigh.fit(np.array(CI))
    #print self.neigh.kneighbors( CI[0], len(CI) )

    self.CI = CI
    self.ys = ys
def build_search_tree(datadir, featurename='vgg16_block5_conv3-vlad-64.h5'):
    ndim = 64
    features_file = os.path.join(datadir, featurename)
    print(features_file)
    global keys, features
    keys, features = load_features(features_file)
    print('reducing features')
    pca = PCA(n_components=ndim)
    features = pca.fit_transform(features)
    print('ready')
    print('building search tree')
    nn = NearestNeighbors()
    global nneighs
    nneighs = nn.fit(features)
    print('ready')
def test_unsupervised_kneighbors(n_samples=20, n_features=5,
                                 n_query_pts=2, n_neighbors=5):
    # Test unsupervised neighbors methods
    X = rng.rand(n_samples, n_features)
    test = rng.rand(n_query_pts, n_features)

    for p in P:
        results_nodist = []
        results = []

        for algorithm in ALGORITHMS:
            neigh = neighbors.NearestNeighbors(n_neighbors=n_neighbors,
                                               algorithm=algorithm,
                                               p=p)
            neigh.fit(X)

            results_nodist.append(neigh.kneighbors(test, return_distance=False))
            results.append(neigh.kneighbors(test, return_distance=True))

        for i in range(len(results) - 1):
            assert_array_almost_equal(results_nodist[i], results[i][1])
            assert_array_almost_equal(results[i][0], results[i + 1][0])
            assert_array_almost_equal(results[i][1], results[i + 1][1])
def test_unsupervised_inputs():
    # test the types of valid input into NearestNeighbors
    X = rng.random_sample((10, 3))

    nbrs_fid = neighbors.NearestNeighbors(n_neighbors=1)
    nbrs_fid.fit(X)

    dist1, ind1 = nbrs_fid.kneighbors(X)

    nbrs = neighbors.NearestNeighbors(n_neighbors=1)

    for input in (nbrs_fid, neighbors.BallTree(X), neighbors.KDTree(X)):
        nbrs.fit(input)
        dist2, ind2 = nbrs.kneighbors(X)

        assert_array_almost_equal(dist1, dist2)
        assert_array_almost_equal(ind1, ind2)
def test_radius_neighbors_boundary_handling():
    """Test whether points lying on boundary are handled consistently

    Also ensures that even with only one query point, an object array
    is returned rather than a 2d array.
    """
    X = np.array([[1.5], [3.0], [3.01]])
    radius = 3.0

    for algorithm in ALGORITHMS:
        nbrs = neighbors.NearestNeighbors(radius=radius,
                                          algorithm=algorithm).fit(X)
        results = nbrs.radius_neighbors([[0.0]], return_distance=False)
        assert_equal(results.shape, (1,))
        assert_equal(results.dtype, object)
        assert_array_equal(results[0], [0, 1])
def test_callable_metric():
    def custom_metric(x1, x2):
        return np.sqrt(np.sum(x1 ** 2 + x2 ** 2))

    X = np.random.RandomState(42).rand(20, 2)
    nbrs1 = neighbors.NearestNeighbors(3, algorithm='auto', metric=custom_metric)
    nbrs2 = neighbors.NearestNeighbors(3, algorithm='brute', metric=custom_metric)

    nbrs1.fit(X)
    nbrs2.fit(X)

    dist1, ind1 = nbrs1.kneighbors(X)
    dist2, ind2 = nbrs2.kneighbors(X)

    assert_array_almost_equal(dist1, dist2)
def __init__(self, is_multiclass=True, K_CLOSEST_NEIGHBORS=2):
    # Constants
    self.K_RECO = 5.0  # Num of neighbors for weight learning
    self.K_CLOSEST_NEIGHBORS = K_CLOSEST_NEIGHBORS
    self.weights = None
    self.kNN_finder = NearestNeighbors(
        n_neighbors=K_CLOSEST_NEIGHBORS,
        metric=self._calculate_dist,
        metric_params=None,  # Dict otherwise
        n_jobs=-1
    )
def assignClasses(self):
    clusterer = kdtree.KDTreeClustering(bucket_size=self.bucket_size)
    train_locs = self.df_train[['lat', 'lon']].values
    clusterer.fit(train_locs)
    clusters = clusterer.get_clusters()
    cluster_points = dd(list)
    for i, cluster in enumerate(clusters):
        cluster_points[cluster].append(train_locs[i])
    logging.info('#labels: %d' % len(cluster_points))
    self.cluster_median = OrderedDict()
    for cluster in sorted(cluster_points):
        points = cluster_points[cluster]
        median_lat = np.median([p[0] for p in points])
        median_lon = np.median([p[1] for p in points])
        self.cluster_median[cluster] = (median_lat, median_lon)
    dev_locs = self.df_dev[['lat', 'lon']].values
    test_locs = self.df_test[['lat', 'lon']].values
    nnbr = NearestNeighbors(n_neighbors=1, algorithm='brute', leaf_size=1,
                            metric=haversine, n_jobs=4)
    nnbr.fit(np.array(self.cluster_median.values()))
    self.dev_classes = nnbr.kneighbors(dev_locs, n_neighbors=1, return_distance=False)[:, 0]
    self.test_classes = nnbr.kneighbors(test_locs, n_neighbors=1, return_distance=False)[:, 0]
    self.train_classes = clusters
    if self.one_hot_labels:
        num_labels = np.max(self.train_classes) + 1
        y_train = np.zeros((len(self.train_classes), num_labels), dtype=np.float32)
        y_train[np.arange(len(self.train_classes)), self.train_classes] = 1
        y_dev = np.zeros((len(self.dev_classes), num_labels), dtype=np.float32)
        y_dev[np.arange(len(self.dev_classes)), self.dev_classes] = 1
        y_test = np.zeros((len(self.test_classes), num_labels), dtype=np.float32)
        y_test[np.arange(len(self.test_classes)), self.test_classes] = 1
        self.train_classes = y_train
        self.dev_classes = y_dev
        self.test_classes = y_test
def network_layout(matrix, k=30):
    nbrs = NearestNeighbors(k, algorithm='brute', metric='cosine').fit(matrix)
    G = networkx.from_scipy_sparse_matrix(nbrs.kneighbors_graph(matrix))
    node_labels = label_propagation(G, verbose=True)
    communities_labelprop = np.array([node_labels[i] for i in range(matrix.shape[0])])
    pos = graphviz_layout(G, prog="sfdp")
    coords = np.array([pos[i] for i in range(len(pos))])
    print(coords.shape)
    return coords, communities_labelprop
def __init__(self, MMDLayer, MMDTargetTrain, MMDTargetValidation_split=0.1,
             MMDTargetSampleSize=1000, n_neighbors=25, scales=None, weights=None):
    if scales == None:
        print("setting scales using KNN")
        med = np.zeros(20)
        for ii in range(1, 20):
            sample = MMDTargetTrain[np.random.randint(MMDTargetTrain.shape[0], size=MMDTargetSampleSize), :]
            nbrs = NearestNeighbors(n_neighbors=n_neighbors).fit(sample)
            distances, dummy = nbrs.kneighbors(sample)
            # nearest neighbor is the point so we need to exclude it
            med[ii] = np.median(distances[:, 1:n_neighbors])
        med = np.median(med)
        scales = [med / 2, med, med * 2]  # CyTOF
        print(scales)
    scales = K.variable(value=np.asarray(scales))
    if weights == None:
        print("setting all scale weights to 1")
        weights = K.eval(K.shape(scales)[0])
    weights = K.variable(value=np.asarray(weights))
    self.MMDLayer = MMDLayer
    MMDTargetTrain, MMDTargetValidation = train_test_split(MMDTargetTrain,
                                                           test_size=MMDTargetValidation_split,
                                                           random_state=42)
    self.MMDTargetTrain = K.variable(value=MMDTargetTrain)
    self.MMDTargetTrainSize = K.eval(K.shape(self.MMDTargetTrain)[0])
    self.MMDTargetValidation = K.variable(value=MMDTargetValidation)
    self.MMDTargetValidationSize = K.eval(K.shape(self.MMDTargetValidation)[0])
    self.MMDTargetSampleSize = MMDTargetSampleSize
    self.kernel = self.RaphyKernel
    self.scales = scales
    self.weights = weights

# calculate the raphy kernel applied to all entries in a pairwise distance matrix
def get_chunk_nns(self, X, q_centroids, question_details, chunk):
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine', n_neighbors=1000).fit(X)
    dist, nns = nbrs.kneighbors(q_centroids, return_distance=True)
    q_array = []
    for q_point in range(nns.shape[0]):
        doc_nns = []
        for n_point in range(nns.shape[1]):
            doc_nns.append(self.idmap[chunk[0] + nns[q_point, n_point]])
        q = Question(question_details[q_point][0], question_details[q_point][1],
                     doc_nns, list(dist[q_point, :]))
        q_array.append(q)
    return q_array

# Dataset indices are split into N chunks. The nearest top-(N*k) neighbors are extracted from
# each chunk, and then the final top-k neighbors are extracted from those.
def getseeds(datapoint, radius, theta):
    chosen = []
    seeds = []
    # random.shuffle(datapoint)
    periodsampl = 500000
    for p in datapoint:
        chosen.append(p)
    for j, p in enumerate(chosen):
        ok = -1
        if j < periodsampl:
            for q in seeds:
                if taxidist(p, q, theta) < radius:
                    ok = 1
                    break
            if ok < 1:
                seeds.append(p)
        else:
            if j % periodsampl == 0:  # and (is_power2(int(j/1000))):
                # print(j, time.time()-start)
                S = [(lonconst * xx[0], latconst * xx[1], theta / 180 * (xx[2] + 45)) for xx in seeds]
                nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(S)
                X = [(lonconst * xx[0], latconst * xx[1], theta / 180 * (xx[2] + 45)) for xx in chosen[j:min(len(chosen), j + periodsampl)]]
                distances, indices = nbrs.kneighbors(X)
            if distances[j % periodsampl][0] > radius:
                seeds.append(p)
    print('seeds: ', len(seeds))
    return (seeds)
def __init__(self, analyzer=None, matching=None, name=None, verbose=0,
             n_epochs=10, alpha=0.25, min_alpha=0.05, n_jobs=4, **kwargs):
    # self.model = model
    self.alpha = alpha
    self.min_alpha = min_alpha
    self.verbose = verbose
    self.name = "paragraph-vectors" if name is None else name

    if matching is True:
        self._matching = Matching()
    elif matching is False or matching is None:
        self._matching = None
    else:
        self._matching = Matching(**dict(matching))

    self.analyzer = analyzer
    self.model = Doc2Vec(alpha=alpha, min_alpha=alpha,
                         size=500, window=8, min_count=1,
                         sample=1e-5, workers=n_jobs, negative=20, dm=0,
                         dbow_words=1,  # words only with dm!=0?
                         dm_mean=0,  # unused when in concat mode
                         dm_concat=1,
                         dm_tag_count=1)
    self.n_epochs = n_epochs
    self._neighbors = NearestNeighbors(**kwargs)
def query(self, query, k=None):
    model, matching = self.model, self._matching
    nn, analyze = self._neighbors, self.analyzer
    verbose = self.verbose
    if k is None:
        k = len(self._centroids)
    if matching:
        matched = matching.predict(query)
        print("Matched:", matched)
        tags = self._y[matched]
        dvs = np.asarray([model.docvecs[tag] for tag in tags])
        n_ret = min(k, len(matched))
        if n_ret == 0:
            return []
        nn.fit(dvs)
    else:
        tags = self._y
        n_ret = k
        # NearestNeighbors are already fit
    if verbose > 0:
        print(len(tags), "documents matched.")
    q = analyze(query)
    qv = model.infer_vector(q).reshape(1, -1)
    ind = nn.kneighbors(qv, n_neighbors=n_ret, return_distance=False)[0]
    y = tags[ind]
    return y
def query(self, query, k=None, matched_indices=None):
    # matching step
    matching_ind = self._matching(query)
    # print(matching_ind, file=sys.stderr)
    Xm, matched_doc_ids = self._X[matching_ind], self._y[matching_ind]
    # matching_docs, matching_doc_ids = self._matching(query)
    # calculate elements to retrieve
    n_ret = len(matching_ind)
    if n_ret == 0:
        return []
    if self.verbose > 0:
        print("Found {} matches:".format(n_ret))
    # n_ret = min(n_ret, k) if k > 0 else n_ret
    # model dependent transformation
    xq = self._cv.transform([query])
    q = self.tfidf.transform(xq)
    # Xm = self.vectorizer.transform(matching_docs)
    # model dependent nearest neighbor search or scoring or whatever
    nn = NearestNeighbors(metric='cosine', algorithm='brute').fit(Xm)
    # abuse kneighbors in this case
    # AS q only contains one element, we only need its results.
    if k is not None and k < n_ret:
        n_ret = k
    ind = nn.kneighbors(q,  # q contains a single element
                        n_neighbors=n_ret,  # limit to k neighbors
                        return_distance=False)[0]  # so we only need 1 res
    # dont forget to convert the indices to document ids of matching
    labels = matched_doc_ids[ind]
    return labels
def __init__(self, embedding, analyzer, name="WCD", n_jobs=1, normalize=True, verbose=0, oov=None, matching=True, **kwargs): self.name = name self._embedding = embedding self._normalize = normalize self._oov = oov self.verbose = verbose self.n_jobs = n_jobs self._neighbors = NearestNeighbors(**kwargs) self._analyzer = analyzer if matching is True: self._matching = Matching() elif matching is False or matching is None: self._matching = None else: self._matching = Matching(**dict(matching))
def __init__(self, embedding, analyzer='word', matching=None,
             name="FWCD", n_jobs=1, use_idf=True):
    """TODO: to be defined1."""
    self.name = name
    self.matching = Matching(**dict(matching)) if matching else None
    self.vect = EmbeddedVectorizer(embedding, analyzer=analyzer, norm='l2',
                                   use_idf=use_idf)
    self.nn = NearestNeighbors(n_jobs=n_jobs, metric='cosine', algorithm='brute')
def test_nearest_centroid_ranker():
    # in the case where there is a single point by centroid,
    # nearest centroid should reduce to nearest neighbor
    from sklearn.neighbors import NearestNeighbors
    np.random.seed(0)

    n_samples = 100
    n_features = 120
    X = np.random.rand(n_samples, n_features)
    normalize(X, copy=False)
    index = np.arange(n_samples, dtype='int')
    y = np.arange(n_samples, dtype='int')
    index_train, index_test, y_train, y_test = train_test_split(index, y)
    X_train = X[index_train]
    X_test = X[index_test]

    nn = NearestNeighbors(n_neighbors=1, algorithm='brute')
    nn.fit(X_train)
    dist_ref, idx_ref = nn.kneighbors(X_test)

    nc = NearestCentroidRanker()
    nc.fit(X_train, y_train)
    dist_pred = nc.decision_function(X_test)
    y_pred = nc.predict(X_test)

    # ensures that we have the same number of unique output points
    # (even if absolute labels are not preserved)
    assert np.unique(idx_ref[:, 0]).shape == np.unique(y_pred).shape
    assert_allclose(dist_pred, dist_ref[:, 0])
def fit(self, X, y):
    """Fit the model using X as training data

    Parameters
    ----------
    X : {array-like, sparse matrix, BallTree, KDTree}
        Training data, shape [n_samples, n_features],
    """
    X = check_array(X, accept_sparse='csr')
    y = np.asarray(y, dtype='int')
    y_unique = np.unique(y)

    index = np.arange(len(y), dtype='int')

    if len(y_unique) == 0:
        raise ValueError('The training set must have at least '
                         'one document category!')

    # define nearest neighbors search objects for each category
    self._mod = [NearestNeighbors(n_neighbors=1,
                                  leaf_size=self.leaf_size,
                                  algorithm=self.algorithm,
                                  n_jobs=self.n_jobs,
                                  # euclidean metric by default
                                  metric='cosine',
                                  ) for el in range(len(y_unique))]

    index_mapping = []
    for imod, y_val in enumerate(y_unique):
        mask = (y == y_val)
        index_mapping.append(index[mask])
        self._mod[imod].fit(X[mask])

    self.index_mapping = index_mapping
def __knn_sklearn(X, k, n_jobs=-1, verbose=False, **kwargs):
    nn = NearestNeighbors(n_neighbors=k + 1, n_jobs=n_jobs,
                          algorithm='ball_tree', **kwargs)
    nn.fit(X)
    if verbose:
        print('Indexing done.')
    dist, ind = nn.kneighbors(X, k + 1, return_distance=True)
    if verbose:
        print('Query done.')
    return dist[:, 1:].astype(X.dtype), ind[:, 1:]
def encode(self, data, metric='euclidean'):
    """
    Employ a nearest-neighbor rule to encode the given ``data`` using the codebook.

    Parameters
    ----------
    data : real array-like, shape(n_samples, n_features)
        Data matrix, each row represents a sample.

    metric : string
        One of the following valid options as defined for function
        http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html.

        Valid options include:

         - euclidean
         - cityblock
         - l1
         - cosine

    Returns
    -------
    encoded_data : real array-like, shape(n_samples, n_features)
        ``data``, as represented by the prototypes in codebook.
    ts_symbols : list, shape(n_samples, 1)
        A discrete symbolic time series
    """
    nbrs = NearestNeighbors(n_neighbors=1, algorithm='auto', metric=metric).fit(self.protos)
    _, self.__symbols = nbrs.kneighbors(data)
    self.__encoding = self.protos[self.__symbols]
    return (self.__encoding, self.__symbols)
def encode(self, data, metric='euclidean'):
    """
    Employ a nearest-neighbor rule to encode the given ``data`` using the codebook.

    Parameters
    ----------
    data : real array-like, shape(n_samples, n_features)
        Data matrix, each row represents a sample.

    metric : string
        One of the following valid options as defined for function
        `http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html`.

        Valid options include:

         - euclidean
         - cityblock
         - l1
         - cosine

    Returns
    -------
    encoded_data : real array-like, shape(n_samples, n_features)
        ``data``, as represented by the prototypes in codebook.
    ts_symbols : list, shape(n_samples, 1)
        A discrete symbolic time series
    """
    nbrs = NearestNeighbors(n_neighbors=1, algorithm='auto', metric=metric).fit(self.protos)
    _, self.__symbols = nbrs.kneighbors(data)
    self.__encoding = self.protos[self.__symbols]
    return (self.__encoding, self.__symbols)
def fit(self, data):
    """
    Learn data, and construct a vector codebook.

    Parameters
    ----------
    data : real array-like, shape(n_samples, n_features)
        Data matrix, each row represents a sample.

    Returns
    -------
    self : object
        The instance itself
    """
    [n_samples, _] = data.shape
    self.protos = data[self.rng.choice(n_samples, self.n_protos), ]
    # avg_p = np.mean(data, 0)
    # dist_from_avg_p = np.sum(pairwise_distances(avg_p, data))
    # ndistortion = []

    for iteration in range(self.iterations):
        sample = data[self.rng.choice(n_samples, 1), ]

        t = iteration / float(self.iterations)
        lrate = self.lrate_i * (self.lrate_f / float(self.lrate_i)) ** t
        epsilon = self.epsilon_i * (self.epsilon_f / float(self.epsilon_i)) ** t

        D = pairwise_distances(sample, self.protos, metric='euclidean', n_jobs=self.n_jobs)
        I = np.argsort(np.argsort(D))

        H = np.exp(-I / epsilon).ravel()

        diff = sample - self.protos
        for proto_id in range(self.n_protos):
            self.protos[proto_id, :] += lrate * H[proto_id] * diff[proto_id, :]

        # nbrs = NearestNeighbors(n_neighbors=1, algorithm='auto').fit(protos)
        # distances, _ = nbrs.kneighbors(data)
        # ndistortion.append( np.sum(distances) / dist_from_avg_p )

    return self
def encode(self, data, metric='euclidean'):
    """
    Employ a nearest-neighbor rule to encode the given ``data`` using the codebook.

    Parameters
    ----------
    data : real array-like, shape(n_samples, n_features)
        Data matrix, each row represents a sample.

    metric : string
        One of the following valid options as defined for function
        http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html.

        Valid options include:

         - euclidean
         - cityblock
         - l1
         - cosine

    Returns
    -------
    encoded_data : real array-like, shape(n_samples, n_features)
        ``data``, as represented by the prototypes in codebook.
    ts_symbols : list, shape(n_samples, 1)
        A discrete symbolic time series
    """
    # Perform a proposed data mining procedure as described in [Laskaris2004].
    mds = MDS(1, random_state=self.rng)
    protos_1d = mds.fit_transform(self.protos).ravel()
    sorted_protos_1d = np.argsort(protos_1d)
    sprotos = self.protos[sorted_protos_1d]

    nbrs = NearestNeighbors(n_neighbors=1, algorithm='auto', metric=metric).fit(sprotos)
    _, self.__symbols = nbrs.kneighbors(data)
    self.__encoding = sprotos[self.__symbols]
    return (self.__encoding, self.__symbols)
def __init__(self, n_neighbors=5, loss='L2'):
    if loss in ['L1', 'L2', 'SMAPE']:
        loss = {'L1': L1, 'L2': L2, 'SMAPE': SMAPE}[loss]
    self.loss = loss
    self.n_neighbors = n_neighbors
    self.model = NearestNeighbors(n_neighbors, algorithm='auto', n_jobs=-1)
    self.solver = lambda x: solver(x, loss)
def compute_distances(cls, inst_id):
    global feat_nn
    global feat_ids

    it = cls.objects.annotate(height=F('face__bbox_y2') - F('face__bbox_y1')).filter(
        height__gte=0.1).order_by('id')

    if feat_nn is None:
        _print('Loading features...')
        feats = list(it[::5])
        feat_ids = np.array([f.id for f in feats])
        feat_vectors = [f.load_features() for f in feats]
        X = np.vstack(feat_vectors)
        _print('Constructing KNN tree...')
        feat_nn = NearestNeighbors().fit(X)
        _print('Done!')

    # Erase distances from previous computation
    prev = list(cls.objects.filter(distto__isnull=False))
    for feat in prev:
        feat.distto = None
    cls.objects.bulk_update(prev)

    dists, indices = feat_nn.kneighbors([cls.objects.get(face=inst_id).load_features()], 1000)

    for dist, feat_id in zip(dists[0], feat_ids[indices[0]]):
        feat = cls.objects.get(id=feat_id)
        feat.distto = dist
        feat.save()
def identity_detect(videos, exemplar, features):
    log.debug('Loading features')
    ids, vectors = zip(*[((i, j), f.load_features())
                         for i, vid_features in enumerate(features)
                         for j, f in enumerate(vid_features)])

    log.debug('Building k-nn tree')
    feat_nn = NearestNeighbors().fit(np.vstack(vectors))

    log.debug('Doing look-up')
    exemplar_vector = FaceFeatures.objects.get(
        face=exemplar, labeler__name='facenet').load_features()
    dists, id_indices = feat_nn.kneighbors([exemplar_vector], min(10000, len(vectors)))

    face_map = defaultdict(list)
    for (dist, k) in zip(dists[0], id_indices[0]):
        (i, j) = ids[k]
        if dist > FEATURE_DISTANCE_THRESHOLD:
            break
        face_map[videos[i].id].append(features[i][j])

    return [face_map[video.id] for video in videos]

# Remove faces with negative coords and small height
def __init__(self):
    SingleClassifier.SingleClassifier.__init__(self)
    # weak classifier
    algorithms = ['brute', 'ball_tree', 'kd_tree']
    self.clf = NearestNeighbors(n_neighbors=2, algorithm='ball_tree')
def fit(self, X, y=None):
    """Fit the model according to the given training data.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Samples.

    Returns
    -------
    self : detector
        Return self.
    """
    X = check_array(X)

    self._knn = NearestNeighbors(
        metric=self.metric,
        metric_params=self.metric_params,
        n_jobs=self.n_jobs,
        n_neighbors=self.n_neighbors,
        p=self.p
    ).fit(X)

    self.y_score_ = self.anomaly_score()
    self.threshold_ = np.percentile(self.y_score_, 100.0 * (1.0 - self.fpr))

    return self
def calc_mahalanobis(x, y, n_neighbors):
    from sklearn.neighbors import DistanceMetric, NearestNeighbors
    DistanceMetric.get_metric('mahalanobis', V=np.cov(x))
    nn = NearestNeighbors(n_neighbors=n_neighbors, algorithm='brute',
                          metric='mahalanobis', metric_params={'V': np.cov(x)})
    return nn.fit(x).kneighbors(y)
def train_and_score(metric, training, testing, ks):
    print "Training and scoring"
    scores = []
    knn = NearestNeighbors(metric=metric, algorithm="brute")
    knn.fit(training)
    for k in ks:
        print "Evaluating for", k, "neighbors"
        neighbor_indices = knn.kneighbors(testing, n_neighbors=k, return_distance=False)
        all_predicted_scores = []
        all_labels = []
        for user_id in xrange(testing.shape[0]):
            user_row = testing[user_id, :]
            _, interaction_indices = user_row.nonzero()
            interacted = set(interaction_indices)
            non_interacted = set(xrange(testing.shape[1])) - interacted
            n_samples = min(len(non_interacted), len(interacted))
            sampled_interacted = random.sample(interacted, n_samples)
            sampled_non_interacted = random.sample(non_interacted, n_samples)
            indices = list(sampled_interacted)
            indices.extend(sampled_non_interacted)
            labels = [1] * n_samples
            labels.extend([0] * n_samples)
            neighbors = training[neighbor_indices[user_id, :], :]
            predicted_scores = neighbors.mean(axis=0)
            for idx in indices:
                all_predicted_scores.append(predicted_scores[0, idx])
            all_labels.extend(labels)
        print len(all_labels), len(all_predicted_scores)
        auc = roc_auc_score(all_labels, all_predicted_scores)
        print "k", k, "AUC", auc
def __init__(self):
    self.knnModel = NearestNeighbors(n_neighbors=15)
    self.log = logging.getLogger(__name__)
def train(self, userFeatureTable, ratingsMat):
    # ad hoc fix, make sure feature's range is similar
    userFeatureTable.loc[:, "age"] = userFeatureTable.loc[:, "age"] / 10.
    self.knnModel = NearestNeighbors(n_neighbors=10, algorithm='ball_tree').fit(userFeatureTable)
    # ratingMat is the rating matrix
    self.ratingsMat = ratingsMat
    self.userFeatureTable = userFeatureTable
    self.userIds = self.userFeatureTable.index  # the actual order seen by the knnmodel
def find_knn(self, target_matrix, target_features):
    neighbors = NearestNeighbors(n_neighbors=self.__args.n_neighbors,
                                 algorithm=self.__args.alg).fit(target_matrix.values)
    distances, indexes = neighbors.kneighbors(target_features)
    return distances, indexes
def fit(atributos):
    neighbor = NearestNeighbors(metric='euclidean')
    neighbor.fit(atributos)
    return neighbor
def index(self, metric='cosine'):
    alg = 'brute' if (metric == 'cosine') else 'auto'
    if not SKLEARN:
        raise WordVectorBoxException("Needs sklearn to work")
    self._nn = NearestNeighbors(metric=metric, algorithm=alg)
    self._nn.fit(self.W)
    return self