Python sklearn.neighbors module: KNeighborsClassifier() example source code

We extracted the following 50 code examples from open-source Python projects to illustrate how sklearn.neighbors.KNeighborsClassifier() is used.
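Before the project excerpts, here is a minimal self-contained sketch of the basic fit/predict/score workflow (the split ratio and k below are arbitrary illustrative choices, not taken from any of the projects):

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Load a small labelled dataset and hold out part of it for evaluation.
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Fit a k-nearest-neighbour classifier (n_neighbors=5 is also sklearn's default).
clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(X_train, y_train)

print(clf.predict(X_test[:5]))    # predicted labels for the first few test rows
print(clf.score(X_test, y_test))  # mean accuracy on the held-out set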

Project: MachineLearningBasics    Author: zoebchhatriwala    | project source | file source
def main():

    iris = datasets.load_iris()
    x = iris.data
    y = iris.target

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.5)

    clrTree = tree.DecisionTreeClassifier()
    clrTree = clrTree.fit(x_train, y_train)
    outTree = clrTree.predict(x_test)

    clrKN = KNeighborsClassifier()
    clrKN = clrKN.fit(x_train, y_train)
    outKN = clrKN.predict(x_test)

    # Prediction accuracy
    print("Accuracy for Decision Tree Classifier: " + str(accuracy_score(y_test, outTree)*100)+"%")
    print("Accuracy for KNeighbors Classifier: " + str(accuracy_score(y_test, outKN)*100)+"%")
Project: johnson-county-ddj-public    Author: dssg    | project source | file source
def get_feature_importance(self,clf, model_name ):
        clfs = {'RandomForestClassifier':'feature_importances',
                'ExtraTreesClassifier': 'feature_importances',
                'AdaBoostClassifier': 'feature_importances',
                'LogisticRegression': 'coef',
                'svm.SVC': 'coef',
                'GradientBoostingClassifier': 'feature_importances',
                'GaussianNB': None,
                'DecisionTreeClassifier': 'feature_importances',
                'SGDClassifier': 'coef',
                'KNeighborsClassifier': None,
                'linear.SVC': 'coef'}

        if clfs[model_name] == 'feature_importances':
            return  list(clf.feature_importances_)
        elif clfs[model_name] == 'coef':
            return  list(clf.coef_.tolist())
        else:
            return None
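KNeighborsClassifier maps to None above because it exposes neither feature_importances_ nor coef_. If importances are still needed for a fitted k-NN model, permutation importance is a model-agnostic substitute; a minimal sketch on synthetic data (requires scikit-learn >= 0.22, newer than the snippets in this listing):

from sklearn.datasets import make_classification
from sklearn.inspection import permutation_importance
from sklearn.neighbors import KNeighborsClassifier

X, y = make_classification(n_samples=200, n_features=5, random_state=0)
knn = KNeighborsClassifier().fit(X, y)

# Measure how much the score drops when each feature is shuffled;
# a larger drop means the feature matters more to the model.
result = permutation_importance(knn, X, y, n_repeats=10, random_state=0)
print(result.importances_mean)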
Project: rltk    Author: usc-isi-i2    | project source | file source
def get_classifier_class(class_name):
    name_table = {
        'svm': SVC,
        'k_neighbors': KNeighborsClassifier,
        'gaussian_process': GaussianProcessClassifier,
        'decision_tree': DecisionTreeClassifier,
        'random_forest': RandomForestClassifier,
        'ada_boost': AdaBoostClassifier,
        'mlp': MLPClassifier,
        'gaussian_naive_bayes': GaussianNB,
        'quadratic_discriminant_analysis': QuadraticDiscriminantAnalysis
    }

    if class_name not in name_table:
        raise ValueError('No such classifier')

    return name_table[class_name]
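A hypothetical call site for this factory (the keyword value is illustrative, and X_train / y_train are assumed to exist elsewhere):

cls = get_classifier_class('k_neighbors')
clf = cls(n_neighbors=5)       # instantiate the returned class with its own kwargs
clf.fit(X_train, y_train)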
Project: oss-github-analysis-project    Author: itu-oss-project-team    | project source | file source
def __create_classifiers(self):
        classifiers = list()
        classifiers.append({"func": linear_model.SGDClassifier(loss="log"),
                            "name": "sgd"})
        classifiers.append({"func": neighbors.KNeighborsClassifier(1, weights='distance'),
                            "name": "knn1"})
        classifiers.append({"func": neighbors.KNeighborsClassifier(3, weights='distance'),
                            "name": "knn3"})
        classifiers.append({"func": neighbors.KNeighborsClassifier(5, weights='distance'),
                            "name": "knn5"})
        classifiers.append({"func": GaussianNB(),
                            "name": "naive_bayes"})

        # classifiers.append({"func": tree.DecisionTreeClassifier(), "name": "decision_tree"})
        # classifiers.append({"func": MLPClassifier(max_iter=10000), "name": "mlp"})
        # classifiers.append({"func": RandomForestClassifier(), "name": "random_forest"})
        return classifiers
Project: johnson-county-ddj-public    Author: dssg    | project source | file source
def define_model(self, model, parameters, n_cores = 0):
        clfs = {'RandomForestClassifier': RandomForestClassifier(n_estimators=50, n_jobs=7),
                'ExtraTreesClassifier': ExtraTreesClassifier(n_estimators=10, n_jobs=7, criterion='entropy'),
                'AdaBoostClassifier': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
                'LogisticRegression': LogisticRegression(penalty='l1', C=1e5),
                'svm.SVC': svm.SVC(kernel='linear', probability=True, random_state=0),
                'GradientBoostingClassifier': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
                'GaussianNB': GaussianNB(),
                'DecisionTreeClassifier': DecisionTreeClassifier(),
                'SGDClassifier': SGDClassifier(loss="hinge", penalty="l2", n_jobs=7),
                'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=3), 
                'linear.SVC': svm.LinearSVC() }

        if model not in clfs:
            raise ConfigError("Unsupported model {}".format(model))

        clf = clfs[model]
        clf.set_params(**parameters)
        return clf
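A hypothetical invocation from inside the same class, with set_params applying the overrides on top of the listed defaults (the parameter dict is illustrative):

clf = self.define_model('KNeighborsClassifier', {'n_neighbors': 10})
clf.fit(X_train, y_train)      # X_train / y_train assumed to be defined elsewhere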
Project: Stock-Market-Prediction    Author: Diptiranjan1    | project source | file source
def do_ml(ticker):
    X, y, df = extract_featuresets(ticker)

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,
                                                        y,
                                                        test_size=0.25)

    #clf = neighbors.KNeighborsClassifier()

    clf = VotingClassifier([('lsvc',svm.LinearSVC()),
                            ('knn',neighbors.KNeighborsClassifier()),
                            ('rfor',RandomForestClassifier())])


    clf.fit(X_train, y_train)
    confidence = clf.score(X_test, y_test)
    print('accuracy:',confidence)
    predictions = clf.predict(X_test)
    print('predicted class counts:',Counter(predictions))
    print()
    print()
    return confidence

# examples of running:
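The excerpt cuts off at the comment above; a hypothetical invocation (the ticker symbol is a placeholder, not recovered from the original source) would be:

confidence = do_ml('AAPL')  # 'AAPL' is a placeholder ticker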
Project: Generative-ConvACs    Author: HUJI-Deep    | project source | file source
def knn_masked_data(trX,trY,missing_data_dir, input_shape, k):

    raw_im_data = np.loadtxt(join(script_dir,missing_data_dir,'index.txt'),delimiter=' ',dtype=str)
    raw_mask_data = np.loadtxt(join(script_dir,missing_data_dir,'index_mask.txt'),delimiter=' ',dtype=str)
    # Using 'brute' method since we only want to do one query per classifier
    # so this will be quicker as it avoids overhead of creating a search tree
    knn_m = KNeighborsClassifier(algorithm='brute',n_neighbors=k)
    prob_Y_hat = np.zeros((raw_im_data.shape[0],int(np.max(trY)+1)))
    total_images = raw_im_data.shape[0]
    pbar = progressbar.ProgressBar(widgets=[progressbar.FormatLabel('\rProcessed %(value)d of %(max)d Images '), progressbar.Bar()], maxval=total_images, term_width=50).start()
    for i in range(total_images):
        mask_im=load_image(join(script_dir,missing_data_dir,raw_mask_data[i][0]), input_shape,1).reshape(np.prod(input_shape))
        mask = np.logical_not(mask_im > eps) # since mask is 1 at missing locations
        v_im=load_image(join(script_dir,missing_data_dir,raw_im_data[i][0]), input_shape, 255).reshape(np.prod(input_shape))
        rep_mask = np.tile(mask,(trX.shape[0],1))
        # Corrupt whole training set according to the current mask
        corr_trX = np.multiply(trX, rep_mask)        
        knn_m.fit(corr_trX, trY)
        prob_Y_hat[i,:] = knn_m.predict_proba(v_im.reshape(1,-1))
        pbar.update(i)
    pbar.finish()
    return prob_Y_hat
Project: sudokuextract    Author: hbldh    | project source | file source
def _load_sklearn_default_classifier():
    if sys.version_info[0] == 2:
        file_name = "sklearn_classifier_py2.pklz"
        protocol = 2
    else:
        file_name = "sklearn_classifier_py3.pklz"
        protocol = 3

    file_path = resource_filename('sudokuextract.data', file_name)
    if resource_exists('sudokuextract.data', file_name):
        f = gzip.open(file_path, 'rb')
        classifier = pickle.load(f)
        f.close()
    else:
        classifier = KNeighborsClassifier(n_neighbors=10)
        classifier = fit_combined_classifier(classifier)
        f = gzip.open(file_path, 'wb')
        pickle.dump(classifier, f, protocol=protocol)
        f.close()

    return classifier
Project: sudokuextract    Author: hbldh    | project source | file source
def _load_sudokuextract_default_classifier():
    file_name = "sudokuextract_classifier.pklz"
    protocol = 2

    file_path = resource_filename('sudokuextract.data', file_name)
    if resource_exists('sudokuextract.data', file_name):
        f = gzip.open(file_path, 'rb')
        classifier_json = pickle.load(f)
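        # NOTE: KNeighborsClassifier's positional parameters are
        # (n_neighbors, weights, algorithm, leaf_size, p, metric), so the
        # 'metric' and 'p' values below actually land in the algorithm and
        # leaf_size slots; keyword arguments would be safer here.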
        classifier = KNeighborsClassifier(classifier_json.get('n_neighbors'),
                                          classifier_json.get('weights'),
                                          classifier_json.get('metric'),
                                          classifier_json.get('p'))
        classifier.fit(np.array(classifier_json.get('data')),
                       np.array(classifier_json.get('labels')))
        f.close()
    else:
        classifier = KNeighborsClassifier(n_neighbors=10)
        classifier = fit_combined_classifier(classifier)
        f = gzip.open(file_path, 'wb')
        pickle.dump(classifier.to_json(), f, protocol=protocol)
        f.close()
    return classifier
Project: gpam_stats    Author: ricoms    | project source | file source
def n3_error_rate_nearest_neighbor_classifier(data):

    features = data.columns[:-1]
    mistakes = 0
    n = data.shape[0]

    for i in range(n):
        bad_df = data.index.isin([i])
        good_df = ~bad_df

        knn = KNeighborsClassifier( n_neighbors=1 )
        knn.fit(data.iloc[good_df].iloc[:, :-1], data.iloc[good_df].iloc[: ,-1])
        temp = np.array(data.iloc[i ,:-1]).reshape(1,-1)
        mistake = 1 if data.iloc[i, -1] != knn.predict(temp) else 0

        mistakes = mistakes + mistake

    n3 = (1.0 * mistakes) / n
    if n3 > 1:
        n3 = 1
    return n3
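The loop above is the leave-one-out error of a 1-nearest-neighbour classifier (the N3 measure), which is bounded by 1 by construction. A more compact sketch of the same computation with sklearn's model-selection utilities (assuming, as above, that the last column holds the label):

from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.neighbors import KNeighborsClassifier

def n3_loo(data):
    X, y = data.iloc[:, :-1], data.iloc[:, -1]
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=1), X, y, cv=LeaveOneOut())
    return 1.0 - scores.mean()  # fraction of leave-one-out misclassifications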
Project: AirTicketPredicting    Author: junlulocky    | project source | file source
def parameterChoosing(self):
        # Set the parameters by cross-validation
        tuned_parameters = [{'weights': ['uniform', 'distance'],
                             'n_neighbors': range(2,60)
                             }
                            ]


        clf = GridSearchCV(neighbors.KNeighborsClassifier(), tuned_parameters, cv=5, scoring='precision_weighted')
        clf.fit(self.X_train, self.y_train.ravel())

        print "Best parameters set found on development set:\n"
        print clf.best_params_

        print "Grid scores on development set:\n"
        for params, mean_score, scores in clf.grid_scores_:
            print "%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params)

        print "Detailed classification report:\n"
        y_true, y_pred = self.y_test, clf.predict(self.X_test)
        print classification_report(y_true, y_pred)
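Note that grid_scores_ was removed in scikit-learn 0.20; on newer versions the same report reads from cv_results_, roughly:

means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))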
Project: nlvr_tau_nlp_final_proj    Author: udiNaveh    | project source | file source
def check_word2vec(embed_dict, embeds, key_words = ['of', 'is', 'a', 'yellow', 'circle', 'box']):

    KN = KNeighborsClassifier(n_neighbors=3)

    print('fitting pseudo-KNN...')
    KN.fit(embeds, [1]*len(embeds))
    inds = KN.kneighbors(embeds, return_distance=False)
    # print(inds)

    embeds_list = embeds.tolist()
    for word in key_words:
        req_words = []
        ind = embeds_list.index(embed_dict[word].tolist())
        req_inds = inds[ind]
        for idx in req_inds:
            for w in embed_dict:
                if (embed_dict[w] == embeds[idx]).all()==True:
                    req_words.append(w)
        print('for:', word, ', the 3nn are:', req_words)
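Fitting a KNeighborsClassifier on constant dummy labels just to call kneighbors() works, but the unsupervised sklearn.neighbors.NearestNeighbors is the intended tool for pure neighbour lookups; a sketch against the same embeds array:

from sklearn.neighbors import NearestNeighbors

nn = NearestNeighbors(n_neighbors=3)
nn.fit(embeds)                                      # no labels required
inds = nn.kneighbors(embeds, return_distance=False)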
Project: 5th_place_solution_facebook_check_ins    Author: aikinogard    | project source | file source
def knn_ps2(df_cell_train_feats, y_train, df_cell_test_feats):
    def prepare_feats(df):
        df_new = pd.DataFrame()
        df_new["year"] = (1 + df["year"]) * 10.
        df_new["hour"] = (1 + df["hour"]) * 4.
        df_new["weekday"] = (1 + df["weekday"]) * 3.11
        df_new["month"] = (1 + df["month"]) * 2.11
        df_new["accuracy"] = df["accuracy"].apply(lambda x: np.log10(x)) * 10.
        df_new["x"] = df["x"] * 465.
        df_new["y"] = df["y"] * 975.
        return df_new
    logging.info("train knn_ps2 model")
    df_cell_train_feats_knn = prepare_feats(df_cell_train_feats)
    clf = KNeighborsClassifier(n_neighbors=np.floor(np.sqrt(len(y_train))/5.3).astype(int),
                               weights=lambda x: x ** -2, metric='manhattan', n_jobs=-1)
    clf.fit(df_cell_train_feats_knn, y_train)
    df_cell_test_feats_knn = prepare_feats(df_cell_test_feats)
    y_test_pred = clf.predict_proba(df_cell_test_feats_knn)
    return y_test_pred
Project: TextStageProcessor    Author: mhyhre    | project source | file source
def classification_knn(self):
        self.signals.PrintInfo.emit("KNN algorithm")
        output_dir = self.output_dir + 'knn_out/'
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # Hash the documents into feature vectors, then split them
        # into training and test sets.
        vectorizer = HashingVectorizer()
        fdata = vectorizer.fit_transform(self.fdata)
        trainingSet = fdata[:self.split]
        testSet = fdata[self.split:]

        # Create and train the classifier, then classify the test set
        classificator = KNeighborsClassifier(n_neighbors=self.knn_n_neighbors)
        classificator.fit(trainingSet, self.trainingClass)
        results = classificator.predict(testSet)
        proba = classificator.predict_proba(testSet)

        self.write_results_to_file(output_dir + 'results.csv', results, proba, classificator.classes_, self.test_filenames)
        out_text = self.compile_result_string(results, proba, classificator.classes_, self.test_filenames)
        self.signals.PrintInfo.emit(out_text)
Project: stock-price-prediction    Author: chinuy    | project source | file source
def buildModel(dataset, method, parameters):
    """
    Build final model for predicting real testing data
    """
    features = dataset.columns[0:-1]

    if method == 'RNN':
        clf = performRNNlass(dataset[features], dataset['UpDown'])
        return clf

    elif method == 'RF':
        clf = RandomForestClassifier(n_estimators=1000, n_jobs=-1)

    elif method == 'KNN':
        clf = neighbors.KNeighborsClassifier()

    elif method == 'SVM':
        c = parameters[0]
        g =  parameters[1]
        clf = SVC(C=c, gamma=g)

    elif method == 'ADA':
        clf = AdaBoostClassifier()

    return clf.fit(dataset[features], dataset['UpDown'])
Project: static-gesture-recognition    Author: windmark    | project source | file source
def splitValidateModel(self, visualizePredictions = False):
    (label_vector, input_vector) = loadData(self.featureFile)

    indexArray = range(0, len(input_vector))
    trainData, testData, trainLabels, expectedLabels, trainIndices, testIndices = \
      cross_validation.train_test_split(input_vector, label_vector, indexArray, test_size=(1.0 - self.percentSplit))

    kNNClassifier = neighbors.KNeighborsClassifier(self.n_neighbors, weights='distance')
    kNNClassifier.fit(trainData, trainLabels) 
    predictedLabels = kNNClassifier.predict(testData)

    print("Classification report for classifier %s:\n%s\n"
          % ('k-NearestNeighbour', metrics.classification_report(expectedLabels, predictedLabels)))
    print("Confusion matrix:\n%s" % metrics.confusion_matrix(expectedLabels, predictedLabels))
    print('Split Validation training :: Done.\n')

    if visualizePredictions:
      self.__visualizePredictedDataset__(input_vector, testIndices, predictedLabels, expectedLabels)
Project: static-gesture-recognition    Author: windmark    | project source | file source
def trainLimited(self, featureFile, n_datapoints):
    (label_vector, input_vector) = loadData(featureFile)

    trainData, testData, trainLabels, testLabels = \
      cross_validation.train_test_split(input_vector, label_vector, test_size=(0))

    n_totalrows = int((len(label_vector)/n_datapoints))
    for n in range(0, n_totalrows):
      limited_label_vector = trainLabels[0: (n+1) * n_datapoints]
      limited_input_vector = trainData[0: (n+1) * n_datapoints]

      kNNClassifier = neighbors.KNeighborsClassifier(self.n_neighbors, weights='distance')
      kNNClassifier.fit(limited_input_vector, limited_label_vector)

      scores = cross_validation.cross_val_score(kNNClassifier, limited_input_vector, limited_label_vector, cv = 5)
      print '%f on %d datapoints' % ((sum(scores) / len(scores)), len(limited_label_vector))
Project: yttresearch-machine-learning-algorithms-analysis    Author: gdemos01    | project source | file source
def KNNClassifier(action):

        # Setting our classifier to K-Nearest Neighbors
        clf = KNeighborsClassifier(n_neighbors=5)

        dir = input('Give Data Directory: ')

        if int(action) == 1:
                print('Loading Data')
                PopularityClassifier.loadData(dir)   
                PopularityClassifier.youtubePopular(dir,clf,0)
                PopularityClassifier.twitterPopular(dir,clf,0)       
                PopularityClassifier.bothPopular(dir,clf,0)
        elif int(action) == 2:
                print('Loading Data')
                ViralityClassifier.loadData(dir)
                ViralityClassifier.youtubeViral(dir,clf,0)
                ViralityClassifier.twitterViral(dir,clf,0)
                ViralityClassifier.bothViral(dir,clf,0)
        else:
                print('Loading Data')
                ViralityAndPopularityClassifier.loadData(dir)
                ViralityAndPopularityClassifier.youtubeViralAndPopular(dir,clf,0)
                ViralityAndPopularityClassifier.twitterViralAndPopular(dir,clf,0)
                ViralityAndPopularityClassifier.bothViralAndPopular(dir,clf,0)
Project: yellowbrick    Author: DistrictDataLabs    | project source | file source
def test_init(self):
        """
        Testing the init method
        """
        model = neighbors.KNeighborsClassifier(3)
        viz = DecisionBoundariesVisualizer(model)

        self.assertEquals(viz.step_size, 0.0025)
        self.assertEqual(viz.name, 'KNeighborsClassifier')
        self.assertEqual(viz.estimator, model)

        self.assertIsNone(viz.classes_)
        self.assertIsNone(viz.features_)
        self.assertIsNotNone(viz.markers)
        self.assertIsNotNone(viz.scatter_alpha)
        self.assertTrue(viz.show_scatter)

        self.assertIsNone(viz.Z)
        self.assertIsNone(viz.xx)
        self.assertIsNone(viz.yy)
        self.assertIsNone(viz.class_labels)
        self.assertIsNone(viz.title)
        self.assertIsNone(viz.x)
        self.assertIsNone(viz.y)
Project: yellowbrick    Author: DistrictDataLabs    | project source | file source
def test_draw_ax_show_scatter_False(self):
        """Test that the matplotlib functions are being called when the
        scatter plot isn't drawn
        """
        model = neighbors.KNeighborsClassifier(3)
        viz = DecisionBoundariesVisualizer(
            model, features=['one', 'two'], show_scatter=False)
        fitted_viz = viz.fit(X_two_cols, y=y)
        fitted_viz.ax = mock.Mock()
        fitted_viz.ax.pcolormesh = mock.MagicMock()
        fitted_viz.ax.scatter = mock.MagicMock()
        fitted_viz.ax.legend = mock.MagicMock()

        fitted_viz.draw(X_two_cols, y=y)
        self.assertEquals(len(fitted_viz.ax.pcolormesh.mock_calls), 1)
        self.assertEquals(len(fitted_viz.ax.scatter.mock_calls), 0)
        self.assertEquals(len(fitted_viz.ax.legend.mock_calls), 1)
Project: yellowbrick    Author: DistrictDataLabs    | project source | file source
def test_finalize(self):
        model = neighbors.KNeighborsClassifier(3)
        viz = DecisionBoundariesVisualizer(
            model, features=['one', 'two'], show_scatter=False)
        fitted_viz = viz.fit(X_two_cols, y=y)
        fitted_viz.draw(X_two_cols, y=y)

        fitted_viz.ax = mock.Mock()
        fitted_viz.ax.legend = mock.MagicMock()
        fitted_viz.ax.set_xlabel = mock.MagicMock()
        fitted_viz.ax.set_ylabel = mock.MagicMock()

        fitted_viz.poof()

        fitted_viz.ax.legend.assert_called_once_with(loc='best', frameon=True)
        fitted_viz.ax.set_xlabel.assert_called_once_with('one')
        fitted_viz.ax.set_ylabel.assert_called_once_with('two')
Project: DataMiningCompetitionFirstPrize    Author: lzddzh    | project source | file source
def learn(x, y, test_x):
    weight_list = []
    for j in range(len(y)):
        if y[j] == "0":
            weight_list.append(variables.weight_0_gdbt)
        if y[j] == "1000":
            weight_list.append(variables.weight_1000_gdbt)
        if y[j] == "1500":
            weight_list.append(variables.weight_1500_gdbt)
        if y[j] == "2000":
            weight_list.append(variables.weight_2000_gdbt)

    clf = KNeighborsClassifier(1, weight_list).fit(x, y)
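    # NOTE: the second positional argument to KNeighborsClassifier above is
    # `weights`, which must be 'uniform', 'distance', or a callable; passing a
    # per-sample list like weight_list is almost certainly a bug, since fit()
    # has no per-sample weighting.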

    prediction_list = clf.predict(test_x)
    return prediction_list
Project: python-machine-learning-book    Author: jeremyn    | project source | file source
def use_sbs_with_knn(columns, X_train, X_test, y_train, y_test):
    knn = KNeighborsClassifier(n_neighbors=2)
    sbs = SBS(knn, k_features=1)
    sbs.fit(X_train, y_train)

    k_feat = [len(k) for k in sbs.subsets_]
    plt.plot(k_feat, sbs.scores_, marker='o')
    plt.ylim([0.7, 1.1])
    plt.ylabel('Accuracy')
    plt.xlabel('Number of features')
    plt.grid()
    plt.show()

    k5 = list(sbs.subsets_[8])
    print(columns[1:][k5])

    knn.fit(X_train, y_train)
    print("Training accuracy: %s" % knn.score(X_train, y_train))
    print("Test accuracy: %s" % knn.score(X_test, y_test))

    knn.fit(X_train[:, k5], y_train)
    print("Training accuracy: %s" % knn.score(X_train[:, k5], y_train))
    print("Test accuracy: %s" % knn.score(X_test[:, k5], y_test))
Project: ML    Author: saurabhsuman47    | project source | file source
def knn_cv(post_features, post_class, n_folds, n_neighbors, length_dataset = -1):

    if(length_dataset == -1):
        length_dataset = len(post_class)
    cv = KFold(n = length_dataset, n_folds = n_folds, shuffle = True)
    train_accuracy = []
    test_accuracy = []

    for train,test in cv:
        knn = neighbors.KNeighborsClassifier(n_neighbors = n_neighbors)
        knn.fit(post_features[train],post_class[train])
        train_accuracy.append(knn.score(post_features[train], post_class[train]))
        test_accuracy.append(knn.score(post_features[test], post_class[test]))

#    return (sum(train_accuracy)/n_folds), (sum(test_accuracy)/n_folds)
    return np.mean(train_accuracy), np.mean(test_accuracy)
Project: Stock-Market-Analysis-and-Prediction    Author: samshara    | project source | file source
def performKNNClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel):
    """
    KNN binary Classification
    """
    clf = KNeighborsClassifier(3)
    clf.fit(X_train, y_train)

    if savemodel == True:
        #fname_out = '{}-{}.pickle'.format(fout, datetime.now().date())
        fname_out = fout+'.pickle'
        with open(fname_out, 'wb') as f:
            pickle.dump(clf, f, -1)    

    accuracy = clf.score(X_test, y_test)

    return accuracy
Project: python_utils    Author: Jayhello    | project source | file source
def cross_validation():
    x_train, x_test, y_train, y_test = load_data()
    k_lst = list(range(1, 30))
    lst_scores = []

    for k in k_lst:
        knn = KNeighborsClassifier(n_neighbors=k)
        scores = cross_val_score(knn, x_train, y_train, cv=10, scoring='accuracy')
        lst_scores.append(scores.mean())

    # changing to misclassification error
    MSE = [1 - x for x in lst_scores]
    optimal_k = k_lst[MSE.index(min(MSE))]
    print "The optimal number of neighbors is %d" % optimal_k
    # plot misclassification error vs k
    # plt.plot(k_lst, MSE)
    # plt.ylabel('Misclassification Error')
    plt.plot(k_lst, lst_scores)
    plt.xlabel('Number of Neighbors K')
    plt.ylabel('correct classification rate')
    plt.show()
Project: Parallel-SGD    Author: angadgill    | project source | file source
def test_neighbors_iris():
    # Sanity checks on the iris dataset
    # Puts three points of each label in the plane and performs a
    # nearest neighbor query on points near the decision boundary.

    for algorithm in ALGORITHMS:
        clf = neighbors.KNeighborsClassifier(n_neighbors=1,
                                             algorithm=algorithm)
        clf.fit(iris.data, iris.target)
        assert_array_equal(clf.predict(iris.data), iris.target)

        clf.set_params(n_neighbors=9, algorithm=algorithm)
        clf.fit(iris.data, iris.target)
        assert_true(np.mean(clf.predict(iris.data) == iris.target) > 0.95)

        rgs = neighbors.KNeighborsRegressor(n_neighbors=5, algorithm=algorithm)
        rgs.fit(iris.data, iris.target)
        assert_greater(np.mean(rgs.predict(iris.data).round() == iris.target),
                       0.95)
Project: Parallel-SGD    Author: angadgill    | project source | file source
def test_neighbors_digits():
    # Sanity check on the digits dataset
    # the 'brute' algorithm has been observed to fail if the input
    # dtype is uint8 due to overflow in distance calculations.

    X = digits.data.astype('uint8')
    Y = digits.target
    (n_samples, n_features) = X.shape
    train_test_boundary = int(n_samples * 0.8)
    train = np.arange(0, train_test_boundary)
    test = np.arange(train_test_boundary, n_samples)
    (X_train, Y_train, X_test, Y_test) = X[train], Y[train], X[test], Y[test]

    clf = neighbors.KNeighborsClassifier(n_neighbors=1, algorithm='brute')
    score_uint8 = clf.fit(X_train, Y_train).score(X_test, Y_test)
    score_float = clf.fit(X_train.astype(float), Y_train).score(
        X_test.astype(float), Y_test)
    assert_equal(score_uint8, score_float)
Project: Parallel-SGD    Author: angadgill    | project source | file source
def test_cross_val_score_multilabel():
    X = np.array([[-3, 4], [2, 4], [3, 3], [0, 2], [-3, 1],
                  [-2, 1], [0, 0], [-2, -1], [-1, -2], [1, -2]])
    y = np.array([[1, 1], [0, 1], [0, 1], [0, 1], [1, 1],
                  [0, 1], [1, 0], [1, 1], [1, 0], [0, 0]])
    clf = KNeighborsClassifier(n_neighbors=1)
    scoring_micro = make_scorer(precision_score, average='micro')
    scoring_macro = make_scorer(precision_score, average='macro')
    scoring_samples = make_scorer(precision_score, average='samples')
    score_micro = cval.cross_val_score(clf, X, y, scoring=scoring_micro, cv=5)
    score_macro = cval.cross_val_score(clf, X, y, scoring=scoring_macro, cv=5)
    score_samples = cval.cross_val_score(clf, X, y,
                                         scoring=scoring_samples, cv=5)
    assert_almost_equal(score_micro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 3])
    assert_almost_equal(score_macro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
    assert_almost_equal(score_samples, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
Project: classify_dream_of_the_red_chamber    Author: MrQianJinSi    | project source | file source
def knn_clf(observations, n_neighbors):
  # pick two chapter ranges as training data
  range1 = [20, 30]
  len1 = len(range(range1[0], range1[1]))
  range2 = [110, 120]
  len2 = len(range(range2[0], range2[1]))

  training_index = list(range(range1[0], range1[1])) + list(range(range2[0],
    range2[1]))
  training_data = observations[training_index, :]
  training_label = np.ones(len1+len2, dtype='int32')
  training_label[len1:] = 2
  # train the k-NN classifier
  knn = KNeighborsClassifier(n_neighbors=n_neighbors)  # , weights='distance'
  knn.fit(training_data, training_label) 
  # predict on all observations
  knn_pre = knn.predict(observations)

  print('Predictions for the first 80 chapters:')
  for i in range(8):
    print(knn_pre[i*10:(i+1)*10])

  print('Predictions for the last 40 chapters:')
  for i in range(8,12):
    print(knn_pre[i*10:(i+1)*10])
Project: ISM2017    Author: ybayle    | project source | file source
def classify(train=None, test=None, data=None, res_dir="res/", disp=True, outfilename=None):
    """Description of compare
    compare multiple classifier and display the best one
    """
    utils.print_success("Comparison of differents classifiers")
    if data is not None:
        train_features = data["train_features"]
        train_groundtruths = data["train_groundtruths"]
        test_features = data["test_features"]
        test_groundtruths = data["test_groundtruths"]
    else:
        train = utils.abs_path_file(train)
        test = utils.abs_path_file(test)
        train_features, train_groundtruths = read_file(train)
        test_features, test_groundtruths = read_file(test)
    if not utils.create_dir(res_dir):
        res_dir = utils.abs_path_dir(res_dir)
    classifiers = {
        "RandomForest": RandomForestClassifier(n_jobs=-1)
        # "RandomForest": RandomForestClassifier(n_estimators=5),
        # "KNeighbors":KNeighborsClassifier(3),
        # "GaussianProcess":GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
        # "DecisionTree":DecisionTreeClassifier(max_depth=5),
        # "MLP":MLPClassifier(),
        # "AdaBoost":AdaBoostClassifier(),
        # "GaussianNB":GaussianNB(),
        # "QDA":QuadraticDiscriminantAnalysis(),
        # "SVM":SVC(kernel="linear", C=0.025),
        # "GradientBoosting":GradientBoostingClassifier(),
        # "ExtraTrees":ExtraTreesClassifier(),
        # "LogisticRegression":LogisticRegression(),
        # "LinearDiscriminantAnalysis":LinearDiscriminantAnalysis()
    }
    for key in classifiers:
        utils.print_success(key)
        clf = classifiers[key]
        utils.print_info("\tFit")
        clf.fit(train_features, train_groundtruths)
        utils.print_info("\tPredict")
        predictions = clf.predict(test_features)
    return predictions
Project: Machine-Learning    Author: grasses    | project source | file source
def __init__(self, config = conf, split = 0.3, clf = KNeighborsClassifier(), auto_rebuild = False, debug = False):
        self.clf = clf
        self.conf = conf
        self.split = split
        self.debug = debug
        self.auto_rebuild = auto_rebuild
        self.init()
Project: Machine-Learning    Author: grasses    | project source | file source
def __init__(self, conf = conf, clf = KNeighborsClassifier(), debug = False):
        self.clf = clf
        self.conf = conf
        self.debug = debug
        self.base = os.path.dirname(os.path.realpath(__file__))
        self.vote_db = {}
        self.letter_db = {}
        self.writer_db = {}
        self.total = self.right = 0
Project: XTREE    Author: ai-se    | project source | file source
def knn(train, test, smoteit=True):
  "kNN"
  if smoteit:
    train = SMOTE(train)
  neigh = KNeighborsClassifier()
  train_DF = formatData(train)
  test_DF = formatData(test)
  features = train_DF.columns[:-2]
  klass = train_DF[train_DF.columns[-2]]
  # set_trace()
  neigh.fit(train_DF[features], klass)
  preds = neigh.predict(test_DF[test_DF.columns[:-2]]).tolist()
  return preds
Project: Supply-demand-forecasting    Author: LevinJ    | project source | file source
def setClf(self):
        clf = KNeighborsClassifier(n_neighbors = 33)
        min_max_scaler = preprocessing.MinMaxScaler()
        self.clf = Pipeline([('scaler', min_max_scaler), ('estimator', clf)])
        return
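The detail worth copying here is the Pipeline itself: k-NN distances are dominated by whichever feature has the largest raw scale, and bundling the MinMaxScaler with the estimator guarantees the scaler is fit only on the training portion of each cross-validation fold.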
Project: base_function    Author: Rockyzsu    | project source | file source
def lession_4():
    iris = datasets.load_iris()
    iris_X = iris.data
    iris_y = iris.target
    # print iris_X[:2]
    # print iris_y
    X_train,X_test,y_train,y_test = train_test_split(iris_X,iris_y,test_size=0.3)
    knn = KNeighborsClassifier()
    knn.fit(X_train,y_train)
    print knn.predict(X_test)
    print y_test

# dataset usage
Project: oss-github-analysis-project    Author: itu-oss-project-team    | project source | file source
def knn_classify(self, out_folder_path, training_set, test_set, training_labels, test_labels, k=1, msg=""):
        print("message: " + msg)
        out_file_pre_path = os.path.join(out_folder_path, "knn" + str(k) + msg)  # Any output file should extend this path

        knn_classifier = neighbors.KNeighborsClassifier(k, weights='distance')
        knn_classifier.fit(training_set, training_labels)
        predicted = knn_classifier.predict(test_set)

        success = accuracy_score(test_labels, predicted, normalize=False)
        conf_matrix = self.__retrieve_confusion_matrix(test_labels, predicted, out_file_pre_path)
        return conf_matrix, success
Project: srep    Author: Answeror    | project source | file source
def __init__(self):
        from sklearn.neighbors import KNeighborsClassifier as KNN
        self.clf = KNN()
Project: sentiment-analysis    Author: lplping    | project source | file source
def KnnClass(x_train,y_train):
    from sklearn.neighbors import KNeighborsClassifier
    clf=KNeighborsClassifier()
    clf.fit(x_train,y_train)
    return clf


#========Decision Tree ========#
Project: StrepHit    Author: Wikidata    | project source | file source
def get_models(test):
    return [
        (LinearSVC, {
            'C': [0.01, 0.1, 1.0, 10.0],
            'multi_class': ['ovr', 'crammer_singer'],
        }),
    ] + ([
        (KNeighborsClassifier, {
            'weights': ['uniform', 'distance'],
        }),
        (SVC, {
            'C': [0.01, 0.1, 1.0, 10.0, 100.0],
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'decision_function_shape': ['ovr', 'ovo'],
        }),
        (RandomForestClassifier, {
            'criterion': ['gini', 'entropy'],
            'min_samples_split': [5, 10, 25],
            'min_samples_leaf': [5, 10, 25],
            'n_estimators': [5, 10, 50, 100],
        })
    ] if not test else [])
Project: gpam_stats    Author: ricoms    | project source | file source
def n4_non_linearity_of_nearest_neighbor_classifier( data, random_seed = 42, iterations = 20 ):

    def generate_interpolated_data_cl(data, cl, features, labels):
        points_in_class = data[data[labels] == cl].index.tolist()
        data_interpolated = pd.DataFrame(columns = features + [labels])

        for a, b in random_combinations(points_in_class):
            new_point = linear_interpolation(data.iloc[a, :-1], data.iloc[b, :-1] )
            df = pd.DataFrame([new_point + [cl]], columns = features + [labels] )
            data_interpolated = data_interpolated.append(df)

        return data_interpolated

    def get_n4_for_iteration(data):  

        labels = data.columns[-1]
        features = data.columns[:-1].tolist()
        classes = data.iloc[:, -1].unique()
        data_to_interpolate = data.copy()

        knn = KNeighborsClassifier(n_neighbors=1)
        knn.fit(data[features], data[labels])

        for cl in classes:
            data_interpolated = generate_interpolated_data_cl(data_to_interpolate, cl, features, labels)
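            # NOTE: data_interpolated is overwritten on every pass of this loop,
            # so only the last class's interpolated points are scored below --
            # accumulating the per-class frames is probably what was intended.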

        mistakes = 1 - knn.score(data_interpolated[features], data_interpolated[labels])

        return mistakes

    random.seed( random_seed )
    n4 = []

    for i in range(iterations):
        mistakes = get_n4_for_iteration(data)
        n4.append(mistakes)

    return np.mean(n4)
Project: meinkurve    Author: michgur    | project source | file source
def __init__(self,data_file):
        self.file = data_file
        df = pd.read_csv(data_file)
        X = np.array(df.drop(['class'], 1))
        y = np.array(df['class'])
        self.size = sum(1 for line in open(data_file))

        X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2)

        clf = neighbors.KNeighborsClassifier()
        self.prediction = clf.fit(X_train, y_train)
Project: avito-contest    Author: fmilepe    | project source | file source
def KNN(X, y):
    print("Iniciando treinamento do KNN")
    clf = KNeighborsClassifier(n_jobs=6,leaf_size=15)
    kf = KFold(len(y),n_folds=20)
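    # NOTE: kf is created but never used, and the score below is computed on the
    # first 10000 training rows, so it measures training (not held-out) accuracy.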
    clf.fit(X,y)

    X_score = X[:10000]
    y_score = y[:10000]
    score = clf.score(X_score, y_score)
    print("KNN score: ", score)

    return clf
Project: MixtureOfExperts    Author: krishnakalyan3    | project source | file source
def knn_model(X, y):
    neigh = KNeighborsClassifier(n_neighbors=3)
    neigh.fit(X, y)
    return neigh
Project: intellead-classification    Author: intellead    | project source | file source
def classification(lead):
    #classifiers = [
    #    ('ab', AdaBoostClassifier()),
    #    ('dt', DecisionTreeClassifier(max_depth=5)),
    #    ('kn', KNeighborsClassifier(16)),
    #]
    inputs = get_dataset_input_from_database(lead.keys())
    outputs = get_dataset_output_from_database()
    print('The total number of examples in the dataset is: %d' % (len(inputs)))
    inputs_training, inputs_test, outputs_training, outputs_test = train_test_split(inputs, outputs, test_size=0.3, random_state=42)
    print('The number of examples used for training are: %d' % (len(inputs_training)))
    print('The number of examples used for testing are: %d' % (len(inputs_test)))
    knn = KNeighborsClassifier(n_neighbors=7, p=2)
    knn.fit(inputs_training, np.ravel(outputs_training))
    print('[K=7] The probability of the algorithm to be right is: %f%%' % (knn.score(inputs_test, outputs_test) * 100))
    #voting_classifier = VotingClassifier(estimators=classifiers, voting='hard')
    #voting_classifier = voting_classifier.fit(inputs_training, np.ravel(outputs_training))
    #print('The probability of the machine to be right is: %f%%' % (voting_classifier.score(inputs_test, outputs_test) * 100))
    print('Lead data:')
    print(lead)
    data_to_predict = convert_dict_to_tuple(lead)
    print('Lead data to predict:')
    print(data_to_predict)
    lead_status = knn.predict(data_to_predict)
    lead_status_value = lead_status[0]
    #lead_status = voting_classifier.predict(data_to_predict)
    print('According to lead data, his status is: %d' % (lead_status_value))
    print('[0] unqualified [1] qualified')
    proba = knn.predict_proba(data_to_predict)
    max_proba = max(proba[0])
    print('Proba is: %d%%' %(max_proba*100))
    lead_status_dict = dict()
    dict.update(lead_status_dict, value=str(lead_status_value))
    dict.update(lead_status_dict, proba=str(max_proba))
    return lead_status_dict
Project: Audio-classification-using-Bag-of-Frames-approach    Author: amogh3892    | project source | file source
def knn_predict(training_samples, training_labels, test_samples, test_lables,k_neighbours = 5,weights = "uniform",algorithm = "auto"):
    from sklearn.neighbors import KNeighborsClassifier

    clf = KNeighborsClassifier(n_neighbors = k_neighbours, weights =weights, algorithm = algorithm)

    t0 = time()
    clf.fit(training_samples,training_labels)
    training_time = round(time()-t0, 3)

    t0 = time()
    pred = clf.predict(test_samples)
    test_time = round(time()-t0, 3)

    from sklearn.metrics import accuracy_score

    acc = accuracy_score(pred,test_lables)

    no_features = np.array(training_samples).shape[1]
    training_samples = np.array(training_samples).shape[0]
    test_samples = np.array(test_samples).shape[0]

    with open("Temp\\results.txt","w") as outfile:
        outfile.write("Alogirthm : {}\n".format("KNN"))
        outfile.write("K  = {}\n".format(k_neighbours))
        outfile.write("weight = {}\n".format(weights))
        outfile.write("algorithm = {}\n".format(algorithm))
        outfile.write("No of features : {}\n".format(no_features))
        outfile.write("No of training samples : {}\n".format(training_samples))
        outfile.write("No of test samples : {}\n".format(test_samples))
        outfile.write("Training time : {}\n".format(training_time))
        outfile.write("Test time : {}\n".format(test_time))
        outfile.write("Accuracy : {}\n".format(acc))

    with open("Temp\\result_labels.csv","wb") as outfile:
        np.savetxt(outfile,pred)
Project: MENGEL    Author: CodeSpaceHQ    | project source | file source
def train_random_forest():
    # Selecting the model
    return mp.ModelProperties(), RandomForestClassifier(n_estimators=100)  # the default n_estimators is 10


# http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
Project: MENGEL    Author: CodeSpaceHQ    | project source | file source
def train_knn():
    # Selecting the model
    return mp.ModelProperties(), neighbors.KNeighborsClassifier() # default is 5 neighbors


# http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn-svm-svc
Project: AirTicketPredicting    Author: junlulocky    | project source | file source
def __init__(self, isTrain, isOutlierRemoval=0):
        super(ClassificationKNN, self).__init__(isTrain, isOutlierRemoval)
        # data preprocessing
        self.dataPreprocessing()

        # first parameter is the K neighbors
        # 'uniform' assigns uniform weights to each neighbor
        # 'distance' assigns weights proportional to the inverse of the distance from the query point
        # default metric is euclidean distance
        self.clf = neighbors.KNeighborsClassifier(2, weights='uniform')
Project: Sentences-analysis    Author: sungminoh    | project source | file source
def build_classifier(self):
        self.classifier = KNeighborsClassifier(n_neighbors=1)
        self.classifier.fit(self.coordinates, self.labels)