The following 50 code examples, extracted from open source Python projects, illustrate how to use sklearn.neighbors.KNeighborsClassifier().
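Before the project snippets, here is a minimal self-contained sketch (not taken from any project below) of the basic fit/predict/score cycle:

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Load a toy dataset and hold out a test split
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.3, random_state=0)

# Fit a 5-nearest-neighbor classifier (n_neighbors=5 is the default)
clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(X_train, y_train)

print(clf.predict(X_test[:5]))    # hard class predictions
print(clf.score(X_test, y_test))  # mean accuracy on the test split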
def main():
    iris = datasets.load_iris()
    x = iris.data
    y = iris.target
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.5)

    clrTree = tree.DecisionTreeClassifier()
    clrTree = clrTree.fit(x_train, y_train)
    outTree = clrTree.predict(x_test)

    clrKN = KNeighborsClassifier()
    clrKN = clrKN.fit(x_train, y_train)
    outKN = clrKN.predict(x_test)

    # Prediction accuracy
    print("Accuracy for Decision Tree Classifier: " + str(accuracy_score(y_test, outTree) * 100) + "%")
    print("Accuracy for KNeighbors Classifier: " + str(accuracy_score(y_test, outKN) * 100) + "%")
def get_feature_importance(self, clf, model_name):
    clfs = {'RandomForestClassifier': 'feature_importances',
            'ExtraTreesClassifier': 'feature_importances',
            'AdaBoostClassifier': 'feature_importances',
            'LogisticRegression': 'coef',
            'svm.SVC': 'coef',
            'GradientBoostingClassifier': 'feature_importances',
            'GaussianNB': None,
            'DecisionTreeClassifier': 'feature_importances',
            'SGDClassifier': 'coef',
            'KNeighborsClassifier': None,
            'linear.SVC': 'coef'}

    if clfs[model_name] == 'feature_importances':
        return list(clf.feature_importances_)
    elif clfs[model_name] == 'coef':
        return list(clf.coef_.tolist())
    else:
        return None
def get_classifier_class(class_name):
    name_table = {
        'svm': SVC,
        'k_neighbors': KNeighborsClassifier,
        'gaussian_process': GaussianProcessClassifier,
        'decision_tree': DecisionTreeClassifier,
        'random_forest': RandomForestClassifier,
        'ada_boost': AdaBoostClassifier,
        'mlp': MLPClassifier,
        'gaussian_naive_bayes': GaussianNB,
        'quadratic_discriminant_analysis': QuadraticDiscriminantAnalysis
    }

    if class_name not in name_table:
        raise ValueError('No such classifier')

    return name_table[class_name]
def __create_classifiers(self):
    classifiers = list()
    classifiers.append({"func": linear_model.SGDClassifier(loss="log"), "name": "sgd"})
    classifiers.append({"func": neighbors.KNeighborsClassifier(1, weights='distance'), "name": "knn1"})
    classifiers.append({"func": neighbors.KNeighborsClassifier(3, weights='distance'), "name": "knn3"})
    classifiers.append({"func": neighbors.KNeighborsClassifier(5, weights='distance'), "name": "knn5"})
    classifiers.append({"func": GaussianNB(), "name": "naive_bayes"})
    # classifiers.append({"func": tree.DecisionTreeClassifier(), "name": "decision_tree"})
    # classifiers.append({"func": MLPClassifier(max_iter=10000), "name": "mlp"})
    # classifiers.append({"func": RandomForestClassifier(), "name": "random_forest"})
    return classifiers
def define_model(self, model, parameters, n_cores=0):
    clfs = {
        'RandomForestClassifier': RandomForestClassifier(n_estimators=50, n_jobs=7),
        'ExtraTreesClassifier': ExtraTreesClassifier(n_estimators=10, n_jobs=7, criterion='entropy'),
        'AdaBoostClassifier': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                                                 algorithm="SAMME", n_estimators=200),
        'LogisticRegression': LogisticRegression(penalty='l1', C=1e5),
        'svm.SVC': svm.SVC(kernel='linear', probability=True, random_state=0),
        'GradientBoostingClassifier': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5,
                                                                 max_depth=6, n_estimators=10),
        'GaussianNB': GaussianNB(),
        'DecisionTreeClassifier': DecisionTreeClassifier(),
        'SGDClassifier': SGDClassifier(loss="hinge", penalty="l2", n_jobs=7),
        'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=3),
        'linear.SVC': svm.LinearSVC()
    }

    if model not in clfs:
        raise ConfigError("Unsupported model {}".format(model))

    clf = clfs[model]
    clf.set_params(**parameters)
    return clf
def do_ml(ticker):
    X, y, df = extract_featuresets(ticker)
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.25)

    # clf = neighbors.KNeighborsClassifier()
    clf = VotingClassifier([('lsvc', svm.LinearSVC()),
                            ('knn', neighbors.KNeighborsClassifier()),
                            ('rfor', RandomForestClassifier())])
    clf.fit(X_train, y_train)
    confidence = clf.score(X_test, y_test)
    print('accuracy:', confidence)
    predictions = clf.predict(X_test)
    print('predicted class counts:', Counter(predictions))
    print()
    print()
    return confidence

# examples of running:
def knn_masked_data(trX, trY, missing_data_dir, input_shape, k):
    raw_im_data = np.loadtxt(join(script_dir, missing_data_dir, 'index.txt'),
                             delimiter=' ', dtype=str)
    raw_mask_data = np.loadtxt(join(script_dir, missing_data_dir, 'index_mask.txt'),
                               delimiter=' ', dtype=str)

    # Using 'brute' method since we only want to do one query per classifier,
    # so this will be quicker as it avoids the overhead of creating a search tree
    knn_m = KNeighborsClassifier(algorithm='brute', n_neighbors=k)
    prob_Y_hat = np.zeros((raw_im_data.shape[0], int(np.max(trY) + 1)))

    total_images = raw_im_data.shape[0]
    pbar = progressbar.ProgressBar(
        widgets=[progressbar.FormatLabel('\rProcessed %(value)d of %(max)d Images '),
                 progressbar.Bar()],
        maxval=total_images, term_width=50).start()

    for i in range(total_images):
        mask_im = load_image(join(script_dir, missing_data_dir, raw_mask_data[i][0]),
                             input_shape, 1).reshape(np.prod(input_shape))
        mask = np.logical_not(mask_im > eps)  # since mask is 1 at missing locations
        v_im = load_image(join(script_dir, missing_data_dir, raw_im_data[i][0]),
                          input_shape, 255).reshape(np.prod(input_shape))
        rep_mask = np.tile(mask, (trX.shape[0], 1))
        # Corrupt whole training set according to the current mask
        corr_trX = np.multiply(trX, rep_mask)
        knn_m.fit(corr_trX, trY)
        prob_Y_hat[i, :] = knn_m.predict_proba(v_im.reshape(1, -1))
        pbar.update(i)

    pbar.finish()
    return prob_Y_hat
def _load_sklearn_default_classifier():
    if sys.version_info[0] == 2:
        file_name = "sklearn_classifier_py2.pklz"
        protocol = 2
    else:
        file_name = "sklearn_classifier_py3.pklz"
        protocol = 3

    file_path = resource_filename('sudokuextract.data', file_name)
    if resource_exists('sudokuextract.data', file_name):
        f = gzip.open(file_path, 'rb')
        classifier = pickle.load(f)
        f.close()
    else:
        classifier = KNeighborsClassifier(n_neighbors=10)
        classifier = fit_combined_classifier(classifier)
        f = gzip.open(file_path, 'wb')
        pickle.dump(classifier, f, protocol=protocol)
        f.close()
    return classifier
def _load_sudokuextract_default_classifier():
    file_name = "sudokuextract_classifier.pklz"
    protocol = 2
    file_path = resource_filename('sudokuextract.data', file_name)
    if resource_exists('sudokuextract.data', file_name):
        f = gzip.open(file_path, 'rb')
        classifier_json = pickle.load(f)
        # Keyword arguments used here: the original passed these positionally,
        # which would have bound 'metric' and 'p' to 'algorithm' and 'leaf_size'.
        classifier = KNeighborsClassifier(n_neighbors=classifier_json.get('n_neighbors'),
                                          weights=classifier_json.get('weights'),
                                          metric=classifier_json.get('metric'),
                                          p=classifier_json.get('p'))
        classifier.fit(np.array(classifier_json.get('data')),
                       np.array(classifier_json.get('labels')))
        f.close()
    else:
        classifier = KNeighborsClassifier(n_neighbors=10)
        classifier = fit_combined_classifier(classifier)
        f = gzip.open(file_path, 'wb')
        pickle.dump(classifier.to_json(), f, protocol=protocol)
        f.close()
    return classifier
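The two loaders above persist the classifier with gzip-compressed pickles. For current scikit-learn, joblib is the commonly recommended way to persist estimators; a minimal sketch, with the file name purely illustrative:

import joblib
from sklearn.datasets import load_digits
from sklearn.neighbors import KNeighborsClassifier

X, y = load_digits(return_X_y=True)
clf = KNeighborsClassifier(n_neighbors=10).fit(X, y)

joblib.dump(clf, "knn_model.joblib")           # handles large numpy arrays efficiently
clf_restored = joblib.load("knn_model.joblib")
print(clf_restored.score(X, y))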
def n3_error_rate_nearest_neighbor_classifier(data):
    features = data.columns[:-1]
    mistakes = 0
    n = data.shape[0]

    for i in range(n):
        bad_df = data.index.isin([i])
        good_df = ~bad_df

        knn = KNeighborsClassifier(n_neighbors=1)
        knn.fit(data.iloc[good_df].iloc[:, :-1], data.iloc[good_df].iloc[:, -1])

        temp = np.array(data.iloc[i, :-1]).reshape(1, -1)
        mistake = 1 if data.iloc[i, -1] != knn.predict(temp) else 0
        mistakes = mistakes + mistake

    n3 = (1.0 * mistakes) / n
    if n3 > 1:
        n3 = 1
    return n3
def parameterChoosing(self):
    # Set the parameters by cross-validation
    tuned_parameters = [{'weights': ['uniform', 'distance'],
                         'n_neighbors': range(2, 60)}]

    clf = GridSearchCV(neighbors.KNeighborsClassifier(), tuned_parameters, cv=5,
                       scoring='precision_weighted')
    clf.fit(self.X_train, self.y_train.ravel())

    print("Best parameters set found on development set:\n")
    print(clf.best_params_)

    print("Grid scores on development set:\n")
    # grid_scores_ was removed in scikit-learn 0.20; cv_results_ holds the same information
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r\n" % (mean, std * 2, params))

    print("Detailed classification report:\n")
    y_true, y_pred = self.y_test, clf.predict(self.X_test)
    print(classification_report(y_true, y_pred))
def check_word2vec(embed_dict, embeds, key_words=['of', 'is', 'a', 'yellow', 'circle', 'box']):
    KN = KNeighborsClassifier(n_neighbors=3)
    print('fitting pseudo-KNN...')
    KN.fit(embeds, [1] * len(embeds))  # dummy labels: only the neighbor index structure is used
    inds = KN.kneighbors(embeds, return_distance=False)
    # print(inds)
    embeds_list = embeds.tolist()
    for word in key_words:
        req_words = []
        ind = embeds_list.index(embed_dict[word].tolist())
        req_inds = inds[ind]
        for idx in req_inds:
            for w in embed_dict:
                if (embed_dict[w] == embeds[idx]).all() == True:
                    req_words.append(w)
        print('for:', word, ', the 3nn are:', req_words)
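The example above fits a classifier with dummy labels purely to call kneighbors(). sklearn.neighbors.NearestNeighbors exposes the same query without needing labels at all; a minimal sketch with an illustrative stand-in embedding matrix:

import numpy as np
from sklearn.neighbors import NearestNeighbors

embeds = np.random.rand(100, 50)  # stand-in embedding matrix (illustrative)
nn = NearestNeighbors(n_neighbors=3).fit(embeds)
inds = nn.kneighbors(embeds, return_distance=False)  # same result as the pseudo-KNN query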
def knn_ps2(df_cell_train_feats, y_train, df_cell_test_feats):
    def prepare_feats(df):
        df_new = pd.DataFrame()
        df_new["year"] = (1 + df["year"]) * 10.
        df_new["hour"] = (1 + df["hour"]) * 4.
        df_new["weekday"] = (1 + df["weekday"]) * 3.11
        df_new["month"] = (1 + df["month"]) * 2.11
        df_new["accuracy"] = df["accuracy"].apply(lambda x: np.log10(x)) * 10.
        df_new["x"] = df["x"] * 465.
        df_new["y"] = df["y"] * 975.
        return df_new

    logging.info("train knn_ps2 model")
    df_cell_train_feats_knn = prepare_feats(df_cell_train_feats)
    clf = KNeighborsClassifier(n_neighbors=np.floor(np.sqrt(len(y_train)) / 5.3).astype(int),
                               weights=lambda x: x ** -2, metric='manhattan', n_jobs=-1)
    clf.fit(df_cell_train_feats_knn, y_train)
    df_cell_test_feats_knn = prepare_feats(df_cell_test_feats)
    y_test_pred = clf.predict_proba(df_cell_test_feats_knn)
    return y_test_pred
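A callable passed as weights (like lambda x: x ** -2 above) receives the array of neighbor distances and must return an array of the same shape containing the corresponding weights. A minimal sketch of an equivalent named function:

import numpy as np
from sklearn.neighbors import KNeighborsClassifier

def inverse_square_weights(distances):
    # distances has shape (n_queries, n_neighbors); note a zero distance would
    # divide by zero, so real code may want to guard against exact duplicates
    return distances ** -2

clf = KNeighborsClassifier(n_neighbors=5, weights=inverse_square_weights)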
def classification_knn(self):
    # (The original status message and comments were mis-encoded; restored in English from context.)
    self.signals.PrintInfo.emit("KNN algorithm")

    output_dir = self.output_dir + 'knn_out/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Vectorize the documents with a hashing vectorizer
    vectorizer = HashingVectorizer()
    fdata = vectorizer.fit_transform(self.fdata)

    # Split the vectorized data into training and test parts
    trainingSet = fdata[:self.split]
    testSet = fdata[self.split:]

    # Create and train the classifier, then predict labels and class probabilities
    classificator = KNeighborsClassifier(n_neighbors=self.knn_n_neighbors)
    classificator.fit(trainingSet, self.trainingClass)
    results = classificator.predict(testSet)
    proba = classificator.predict_proba(testSet)

    self.write_results_to_file(output_dir + 'results.csv', results, proba,
                               classificator.classes_, self.test_filenames)
    out_text = self.compile_result_string(results, proba, classificator.classes_,
                                          self.test_filenames)
    self.signals.PrintInfo.emit(out_text)
def buildModel(dataset, method, parameters):
    """
    Build final model for predicting real testing data
    """
    features = dataset.columns[0:-1]

    if method == 'RNN':
        clf = performRNNlass(dataset[features], dataset['UpDown'])
        return clf
    elif method == 'RF':
        clf = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
    elif method == 'KNN':
        clf = neighbors.KNeighborsClassifier()
    elif method == 'SVM':
        c = parameters[0]
        g = parameters[1]
        clf = SVC(C=c, gamma=g)
    elif method == 'ADA':
        clf = AdaBoostClassifier()

    return clf.fit(dataset[features], dataset['UpDown'])
def splitValidateModel(self, visualizePredictions=False):
    (label_vector, input_vector) = loadData(self.featureFile)

    indexArray = range(0, len(input_vector))
    trainData, testData, trainLabels, expectedLabels, trainIndices, testIndices = \
        cross_validation.train_test_split(input_vector, label_vector, indexArray,
                                          test_size=(1.0 - self.percentSplit))

    kNNClassifier = neighbors.KNeighborsClassifier(self.n_neighbors, weights='distance')
    kNNClassifier.fit(trainData, trainLabels)
    predictedLabels = kNNClassifier.predict(testData)

    print("Classification report for classifier %s:\n%s\n" % (
        'k-NearestNeighbour', metrics.classification_report(expectedLabels, predictedLabels)))
    print("Confusion matrix:\n%s" % metrics.confusion_matrix(expectedLabels, predictedLabels))
    print('Split Validation training :: Done.\n')

    if visualizePredictions:
        self.__visualizePredictedDataset__(input_vector, testIndices, predictedLabels, expectedLabels)
def trainLimited(self, featureFile, n_datapoints):
    (label_vector, input_vector) = loadData(featureFile)

    trainData, testData, trainLabels, testLabels = \
        cross_validation.train_test_split(input_vector, label_vector, test_size=0)

    n_totalrows = int(len(label_vector) / n_datapoints)
    for n in range(0, n_totalrows):
        limited_label_vector = trainLabels[0:(n + 1) * n_datapoints]
        limited_input_vector = trainData[0:(n + 1) * n_datapoints]

        kNNClassifier = neighbors.KNeighborsClassifier(self.n_neighbors, weights='distance')
        kNNClassifier.fit(limited_input_vector, limited_label_vector)

        scores = cross_validation.cross_val_score(kNNClassifier, limited_input_vector,
                                                  limited_label_vector, cv=5)
        print('%f on %d datapoints' % ((sum(scores) / len(scores)), len(limited_label_vector)))
def KNNClassifier(action):
    # Set our classifier to a k-nearest-neighbors classifier
    clf = KNeighborsClassifier(n_neighbors=5)
    dir = input('Give Data Directory: ')

    if int(action) == 1:
        print('Loading Data')
        PopularityClassifier.loadData(dir)
        PopularityClassifier.youtubePopular(dir, clf, 0)
        PopularityClassifier.twitterPopular(dir, clf, 0)
        PopularityClassifier.bothPopular(dir, clf, 0)
    elif int(action) == 2:
        print('Loading Data')
        ViralityClassifier.loadData(dir)
        ViralityClassifier.youtubeViral(dir, clf, 0)
        ViralityClassifier.twitterViral(dir, clf, 0)
        ViralityClassifier.bothViral(dir, clf, 0)
    else:
        print('Loading Data')
        ViralityAndPopularityClassifier.loadData(dir)
        ViralityAndPopularityClassifier.youtubeViralAndPopular(dir, clf, 0)
        ViralityAndPopularityClassifier.twitterViralAndPopular(dir, clf, 0)
        ViralityAndPopularityClassifier.bothViralAndPopular(dir, clf, 0)
def test_init(self):
    """
    Testing the init method
    """
    model = neighbors.KNeighborsClassifier(3)
    viz = DecisionBoundariesVisualizer(model)

    self.assertEquals(viz.step_size, 0.0025)
    self.assertEqual(viz.name, 'KNeighborsClassifier')
    self.assertEqual(viz.estimator, model)

    self.assertIsNone(viz.classes_)
    self.assertIsNone(viz.features_)
    self.assertIsNotNone(viz.markers)
    self.assertIsNotNone(viz.scatter_alpha)
    self.assertTrue(viz.show_scatter)

    self.assertIsNone(viz.Z)
    self.assertIsNone(viz.xx)
    self.assertIsNone(viz.yy)
    self.assertIsNone(viz.class_labels)
    self.assertIsNone(viz.title)
    self.assertIsNone(viz.x)
    self.assertIsNone(viz.y)
def test_draw_ax_show_scatter_False(self):
    """Test that the matplotlib functions are being called when the
    scatter plot isn't drawn
    """
    model = neighbors.KNeighborsClassifier(3)
    viz = DecisionBoundariesVisualizer(
        model, features=['one', 'two'], show_scatter=False)
    fitted_viz = viz.fit(X_two_cols, y=y)
    fitted_viz.ax = mock.Mock()
    fitted_viz.ax.pcolormesh = mock.MagicMock()
    fitted_viz.ax.scatter = mock.MagicMock()
    fitted_viz.ax.legend = mock.MagicMock()

    fitted_viz.draw(X_two_cols, y=y)

    self.assertEquals(len(fitted_viz.ax.pcolormesh.mock_calls), 1)
    self.assertEquals(len(fitted_viz.ax.scatter.mock_calls), 0)
    self.assertEquals(len(fitted_viz.ax.legend.mock_calls), 1)
def test_finalize(self):
    model = neighbors.KNeighborsClassifier(3)
    viz = DecisionBoundariesVisualizer(
        model, features=['one', 'two'], show_scatter=False)
    fitted_viz = viz.fit(X_two_cols, y=y)
    fitted_viz.draw(X_two_cols, y=y)

    fitted_viz.ax = mock.Mock()
    fitted_viz.ax.legend = mock.MagicMock()
    fitted_viz.ax.set_xlabel = mock.MagicMock()
    fitted_viz.ax.set_ylabel = mock.MagicMock()

    fitted_viz.poof()

    fitted_viz.ax.legend.assert_called_once_with(loc='best', frameon=True)
    fitted_viz.ax.set_xlabel.assert_called_once_with('one')
    fitted_viz.ax.set_ylabel.assert_called_once_with('two')
def learn(x, y, test_x):
    weight_list = []
    for j in range(len(y)):
        if y[j] == "0":
            weight_list.append(variables.weight_0_gdbt)
        if y[j] == "1000":
            weight_list.append(variables.weight_1000_gdbt)
        if y[j] == "1500":
            weight_list.append(variables.weight_1500_gdbt)
        if y[j] == "2000":
            weight_list.append(variables.weight_2000_gdbt)

    # NOTE: the second positional argument of KNeighborsClassifier is `weights`,
    # which expects 'uniform', 'distance', or a callable over neighbor distances,
    # not a per-sample weight list as passed here.
    clf = KNeighborsClassifier(1, weight_list).fit(x, y)
    prediction_list = clf.predict(test_x)
    return prediction_list
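For reference, a sketch of the weights values KNeighborsClassifier actually accepts:

from sklearn.neighbors import KNeighborsClassifier

clf_uniform = KNeighborsClassifier(n_neighbors=1, weights='uniform')    # every neighbor counts equally
clf_distance = KNeighborsClassifier(n_neighbors=1, weights='distance')  # closer neighbors count more
clf_custom = KNeighborsClassifier(n_neighbors=1,
                                  weights=lambda d: 1.0 / (1.0 + d))    # any callable over distances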
def use_sbs_with_knn(columns, X_train, X_test, y_train, y_test):
    knn = KNeighborsClassifier(n_neighbors=2)
    sbs = SBS(knn, k_features=1)
    sbs.fit(X_train, y_train)

    k_feat = [len(k) for k in sbs.subsets_]
    plt.plot(k_feat, sbs.scores_, marker='o')
    plt.ylim([0.7, 1.1])
    plt.ylabel('Accuracy')
    plt.xlabel('Number of features')
    plt.grid()
    plt.show()

    k5 = list(sbs.subsets_[8])
    print(columns[1:][k5])

    knn.fit(X_train, y_train)
    print("Training accuracy: %s" % knn.score(X_train, y_train))
    print("Test accuracy: %s" % knn.score(X_test, y_test))

    knn.fit(X_train[:, k5], y_train)
    print("Training accuracy: %s" % knn.score(X_train[:, k5], y_train))
    print("Test accuracy: %s" % knn.score(X_test[:, k5], y_test))
def knn_cv(post_features, post_class, n_folds, n_neighbors, length_dataset=-1):
    if length_dataset == -1:
        length_dataset = len(post_class)

    cv = KFold(n=length_dataset, n_folds=n_folds, shuffle=True)
    train_accuracy = []
    test_accuracy = []
    for train, test in cv:
        knn = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)
        knn.fit(post_features[train], post_class[train])
        train_accuracy.append(knn.score(post_features[train], post_class[train]))
        test_accuracy.append(knn.score(post_features[test], post_class[test]))

    # return (sum(train_accuracy)/n_folds), (sum(test_accuracy)/n_folds)
    return np.mean(train_accuracy), np.mean(test_accuracy)
def performKNNClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel):
    """
    KNN binary Classification
    """
    clf = KNeighborsClassifier(3)
    clf.fit(X_train, y_train)

    if savemodel == True:
        # fname_out = '{}-{}.pickle'.format(fout, datetime.now().date())
        fname_out = fout + '.pickle'
        with open(fname_out, 'wb') as f:
            pickle.dump(clf, f, -1)

    accuracy = clf.score(X_test, y_test)
    return accuracy
def cross_validation():
    x_train, x_test, y_train, y_test = load_data()

    k_lst = list(range(1, 30))
    lst_scores = []

    for k in k_lst:
        knn = KNeighborsClassifier(n_neighbors=k)
        scores = cross_val_score(knn, x_train, y_train, cv=10, scoring='accuracy')
        lst_scores.append(scores.mean())

    # changing to misclassification error
    MSE = [1 - x for x in lst_scores]
    optimal_k = k_lst[MSE.index(min(MSE))]
    print("The optimal number of neighbors is %d" % optimal_k)

    # plot misclassification error vs k
    # plt.plot(k_lst, MSE)
    # plt.ylabel('Misclassification Error')
    plt.plot(k_lst, lst_scores)
    plt.xlabel('Number of Neighbors K')
    plt.ylabel('correct classification rate')
    plt.show()
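The same search over k can be written with GridSearchCV, which also refits the best model on the full training set; a minimal sketch on synthetic data (the real features and labels would replace make_classification):

from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

X, y = make_classification(n_samples=300, random_state=0)  # synthetic stand-in data
search = GridSearchCV(KNeighborsClassifier(),
                      {'n_neighbors': list(range(1, 30))},
                      cv=10, scoring='accuracy')
search.fit(X, y)
print(search.best_params_)           # the optimal k found by cross-validation
best_knn = search.best_estimator_    # already refit on all of X, y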
def test_neighbors_iris():
    # Sanity checks on the iris dataset
    # Puts three points of each label in the plane and performs a
    # nearest neighbor query on points near the decision boundary.

    for algorithm in ALGORITHMS:
        clf = neighbors.KNeighborsClassifier(n_neighbors=1, algorithm=algorithm)
        clf.fit(iris.data, iris.target)
        assert_array_equal(clf.predict(iris.data), iris.target)

        clf.set_params(n_neighbors=9, algorithm=algorithm)
        clf.fit(iris.data, iris.target)
        assert_true(np.mean(clf.predict(iris.data) == iris.target) > 0.95)

        rgs = neighbors.KNeighborsRegressor(n_neighbors=5, algorithm=algorithm)
        rgs.fit(iris.data, iris.target)
        assert_greater(np.mean(rgs.predict(iris.data).round() == iris.target), 0.95)
def test_neighbors_digits():
    # Sanity check on the digits dataset
    # the 'brute' algorithm has been observed to fail if the input
    # dtype is uint8 due to overflow in distance calculations.

    X = digits.data.astype('uint8')
    Y = digits.target
    (n_samples, n_features) = X.shape
    train_test_boundary = int(n_samples * 0.8)
    train = np.arange(0, train_test_boundary)
    test = np.arange(train_test_boundary, n_samples)
    (X_train, Y_train, X_test, Y_test) = X[train], Y[train], X[test], Y[test]

    clf = neighbors.KNeighborsClassifier(n_neighbors=1, algorithm='brute')
    score_uint8 = clf.fit(X_train, Y_train).score(X_test, Y_test)
    score_float = clf.fit(X_train.astype(float), Y_train).score(
        X_test.astype(float), Y_test)
    assert_equal(score_uint8, score_float)
def test_cross_val_score_multilabel():
    X = np.array([[-3, 4], [2, 4], [3, 3], [0, 2], [-3, 1],
                  [-2, 1], [0, 0], [-2, -1], [-1, -2], [1, -2]])
    y = np.array([[1, 1], [0, 1], [0, 1], [0, 1], [1, 1],
                  [0, 1], [1, 0], [1, 1], [1, 0], [0, 0]])
    clf = KNeighborsClassifier(n_neighbors=1)
    scoring_micro = make_scorer(precision_score, average='micro')
    scoring_macro = make_scorer(precision_score, average='macro')
    scoring_samples = make_scorer(precision_score, average='samples')
    score_micro = cval.cross_val_score(clf, X, y, scoring=scoring_micro, cv=5)
    score_macro = cval.cross_val_score(clf, X, y, scoring=scoring_macro, cv=5)
    score_samples = cval.cross_val_score(clf, X, y, scoring=scoring_samples, cv=5)
    assert_almost_equal(score_micro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 3])
    assert_almost_equal(score_macro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
    assert_almost_equal(score_samples, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
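The test above works because KNeighborsClassifier natively supports multilabel targets supplied as a 2-D binary indicator array; fit and predict then both operate on that shape. A minimal sketch:

import numpy as np
from sklearn.neighbors import KNeighborsClassifier

X = np.array([[0., 0.], [1., 1.], [2., 2.], [3., 3.]])
Y = np.array([[1, 0], [1, 1], [0, 1], [0, 1]])  # two binary labels per sample

clf = KNeighborsClassifier(n_neighbors=1).fit(X, Y)
print(clf.predict([[0.9, 0.9]]))  # 2-D output, one row of labels per query point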
def knn_clf(observations, n_neighbors):
    # (The original inline comments and print strings were mis-encoded;
    # restored in English from context.)

    # Build the training set from two fixed index ranges
    range1 = [20, 30]
    len1 = len(range(range1[0], range1[1]))
    range2 = [110, 120]
    len2 = len(range(range2[0], range2[1]))
    training_index = list(range(range1[0], range1[1])) + list(range(range2[0], range2[1]))
    training_data = observations[training_index, :]
    training_label = np.ones(len1 + len2, dtype='int32')
    training_label[len1:] = 2

    # Fit the classifier (the original hard-coded n_neighbors=3, shadowing the parameter)
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)  # , weights='distance')
    knn.fit(training_data, training_label)

    # Predict all observations
    knn_pre = knn.predict(observations)
    print('Predictions for observations 0-79:')
    for i in range(8):
        print(knn_pre[i * 10:(i + 1) * 10])
    print('Predictions for observations 80-119:')
    for i in range(8, 12):
        print(knn_pre[i * 10:(i + 1) * 10])
def classify(train=None, test=None, data=None, res_dir="res/", disp=True, outfilename=None):
    """Compare multiple classifiers and display the best one"""
    utils.print_success("Comparison of different classifiers")

    if data is not None:
        train_features = data["train_features"]
        train_groundtruths = data["train_groundtruths"]
        test_features = data["test_features"]
        test_groundtruths = data["test_groundtruths"]
    else:
        train = utils.abs_path_file(train)
        test = utils.abs_path_file(test)
        train_features, train_groundtruths = read_file(train)
        test_features, test_groundtruths = read_file(test)

    if not utils.create_dir(res_dir):
        res_dir = utils.abs_path_dir(res_dir)

    classifiers = {
        "RandomForest": RandomForestClassifier(n_jobs=-1)
        # "RandomForest": RandomForestClassifier(n_estimators=5),
        # "KNeighbors": KNeighborsClassifier(3),
        # "GaussianProcess": GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
        # "DecisionTree": DecisionTreeClassifier(max_depth=5),
        # "MLP": MLPClassifier(),
        # "AdaBoost": AdaBoostClassifier(),
        # "GaussianNB": GaussianNB(),
        # "QDA": QuadraticDiscriminantAnalysis(),
        # "SVM": SVC(kernel="linear", C=0.025),
        # "GradientBoosting": GradientBoostingClassifier(),
        # "ExtraTrees": ExtraTreesClassifier(),
        # "LogisticRegression": LogisticRegression(),
        # "LinearDiscriminantAnalysis": LinearDiscriminantAnalysis()
    }

    for key in classifiers:
        utils.print_success(key)
        clf = classifiers[key]
        utils.print_info("\tFit")
        clf.fit(train_features, train_groundtruths)
        utils.print_info("\tPredict")
        predictions = clf.predict(test_features)
    return predictions
def __init__(self, config=conf, split=0.3, clf=KNeighborsClassifier(), auto_rebuild=False, debug=False):
    self.clf = clf
    self.conf = config  # the original read the module-level `conf` here, ignoring the `config` parameter
    self.split = split
    self.debug = debug
    self.auto_rebuild = auto_rebuild
    self.init()
def __init__(self, conf=conf, clf=KNeighborsClassifier(), debug=False):
    self.clf = clf
    self.conf = conf
    self.debug = debug
    self.base = os.path.dirname(os.path.realpath(__file__))
    self.vote_db = {}
    self.letter_db = {}
    self.writer_db = {}
    self.total = self.right = 0
def knn(train, test, smoteit=True):
    "kNN"
    if smoteit:
        train = SMOTE(train)

    neigh = KNeighborsClassifier()
    train_DF = formatData(train)
    test_DF = formatData(test)
    features = train_DF.columns[:-2]
    klass = train_DF[train_DF.columns[-2]]
    # set_trace()
    neigh.fit(train_DF[features], klass)
    preds = neigh.predict(test_DF[test_DF.columns[:-2]]).tolist()
    return preds
def setClf(self):
    clf = KNeighborsClassifier(n_neighbors=33)
    min_max_scaler = preprocessing.MinMaxScaler()
    self.clf = Pipeline([('scaler', min_max_scaler), ('estimator', clf)])
    return
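KNN is distance-based, so unscaled features with large ranges dominate the metric; wrapping the scaler and the estimator in a Pipeline, as above, applies the same scaling at both fit and predict time. A minimal usage sketch with illustrative data:

import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

X = np.array([[1., 1000.], [2., 2000.], [3., 1500.], [4., 800.]])  # illustrative data
y = np.array([0, 1, 1, 0])

pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('estimator', KNeighborsClassifier(n_neighbors=1))])
pipe.fit(X, y)                       # the scaler is fit on the training data only
print(pipe.predict([[2.5, 1600.]]))  # the same transform is applied before prediction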
def lession_4():
    iris = datasets.load_iris()
    iris_X = iris.data
    iris_y = iris.target

    # print(iris_X[:2])
    # print(iris_y)

    X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, test_size=0.3)
    knn = KNeighborsClassifier()
    knn.fit(X_train, y_train)
    print(knn.predict(X_test))
    print(y_test)

# dataset usage
def knn_classify(self, out_folder_path, training_set, test_set, training_labels, test_labels, k=1, msg=""):
    print("message: " + msg)
    out_file_pre_path = os.path.join(out_folder_path, "knn" + str(k) + msg)  # Any output file should extend this path

    knn_classifier = neighbors.KNeighborsClassifier(k, weights='distance')
    knn_classifier.fit(training_set, training_labels)
    predicted = knn_classifier.predict(test_set)
    success = accuracy_score(test_labels, predicted, normalize=False)
    conf_matrix = self.__retrieve_confusion_matrix(test_labels, predicted, out_file_pre_path)
    return conf_matrix, success
def __init__(self):
    from sklearn.neighbors import KNeighborsClassifier as KNN
    self.clf = KNN()
def KnnClass(x_train, y_train):
    from sklearn.neighbors import KNeighborsClassifier
    clf = KNeighborsClassifier()
    clf.fit(x_train, y_train)
    return clf

#======== Decision Tree ========#
def get_models(test):
    return [
        (LinearSVC, {
            'C': [0.01, 0.1, 1.0, 10.0],
            'multi_class': ['ovr', 'crammer_singer'],
        }),
    ] + ([
        (KNeighborsClassifier, {
            'weights': ['uniform', 'distance'],
        }),
        (SVC, {
            'C': [0.01, 0.1, 1.0, 10.0, 100.0],
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'decision_function_shape': ['ovr', 'ovo'],
        }),
        (RandomForestClassifier, {
            'criterion': ['gini', 'entropy'],
            'min_samples_split': [5, 10, 25],
            'min_samples_leaf': [5, 10, 25],
            'n_estimators': [5, 10, 50, 100],
        })
    ] if not test else [])
def n4_non_linearity_of_nearest_neighbor_classifier(data, random_seed=42, iterations=20):

    def generate_interpolated_data_cl(data, cl, features, labels):
        points_in_class = data[data[labels] == cl].index.tolist()
        data_interpolated = pd.DataFrame(columns=features + [labels])
        for a, b in random_combinations(points_in_class):
            new_point = linear_interpolation(data.iloc[a, :-1], data.iloc[b, :-1])
            df = pd.DataFrame([new_point + [cl]], columns=features + [labels])
            data_interpolated = data_interpolated.append(df)
        return data_interpolated

    def get_n4_for_iteration(data):
        labels = data.columns[-1]
        features = data.columns[:-1].tolist()
        classes = data.iloc[:, -1].unique()
        data_to_interpolate = data.copy()

        knn = KNeighborsClassifier(n_neighbors=1)
        knn.fit(data[features], data[labels])

        # Interpolate within every class, then score once on the combined set.
        # (The original overwrote `mistakes` in each loop pass, so only the
        # last class was ever scored.)
        data_interpolated = pd.concat(
            [generate_interpolated_data_cl(data_to_interpolate, cl, features, labels)
             for cl in classes])
        mistakes = 1 - knn.score(data_interpolated[features], data_interpolated[labels])
        return mistakes

    random.seed(random_seed)
    n4 = []
    for i in range(iterations):
        mistakes = get_n4_for_iteration(data)
        n4.append(mistakes)
    return np.mean(n4)
def __init__(self, data_file):
    self.file = data_file
    df = pd.read_csv(data_file)
    X = np.array(df.drop(['class'], 1))
    y = np.array(df['class'])
    self.size = sum(1 for line in open(data_file))

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2)
    clf = neighbors.KNeighborsClassifier()
    # fit() returns the fitted estimator itself, so despite its name
    # self.prediction holds the trained classifier, not predictions
    self.prediction = clf.fit(X_train, y_train)
def KNN(X, y):
    print("Starting KNN training")
    clf = KNeighborsClassifier(n_jobs=6, leaf_size=15)
    kf = KFold(len(y), n_folds=20)  # note: this fold object is created but never used
    clf.fit(X, y)

    # Score on the first 10000 training rows (an optimistic, in-sample estimate)
    X_score = X[:10000]
    y_score = y[:10000]
    score = clf.score(X_score, y_score)
    print("KNN score: ", score)
    return clf
def knn_model(X, y):
    neigh = KNeighborsClassifier(n_neighbors=3)
    neigh.fit(X, y)
    return neigh
def classification(lead):
    # classifiers = [
    #     ('ab', AdaBoostClassifier()),
    #     ('dt', DecisionTreeClassifier(max_depth=5)),
    #     ('kn', KNeighborsClassifier(16)),
    # ]

    inputs = get_dataset_input_from_database(lead.keys())
    outputs = get_dataset_output_from_database()

    print('The total number of examples in the dataset is: %d' % (len(inputs)))

    inputs_training, inputs_test, outputs_training, outputs_test = train_test_split(
        inputs, outputs, test_size=0.3, random_state=42)

    print('The number of examples used for training are: %d' % (len(inputs_training)))
    print('The number of examples used for testing are: %d' % (len(inputs_test)))

    knn = KNeighborsClassifier(n_neighbors=7, p=2)
    knn.fit(inputs_training, np.ravel(outputs_training))

    print('[K=7] The probability of the algorithm to be right is: %f%%' %
          (knn.score(inputs_test, outputs_test) * 100))

    # voting_classifier = VotingClassifier(estimators=classifiers, voting='hard')
    # voting_classifier = voting_classifier.fit(inputs_training, np.ravel(outputs_training))
    # print('The probability of the machine to be right is: %f%%' %
    #       (voting_classifier.score(inputs_test, outputs_test) * 100))

    print('Lead data:')
    print(lead)

    data_to_predict = convert_dict_to_tuple(lead)

    print('Lead data to predict:')
    print(data_to_predict)

    lead_status = knn.predict(data_to_predict)
    lead_status_value = lead_status[0]
    # lead_status = voting_classifier.predict(data_to_predict)

    print('According to lead data, his status is: %d' % (lead_status_value))
    print('[0] unqualified [1] qualified')

    proba = knn.predict_proba(data_to_predict)
    max_proba = max(proba[0])

    print('Proba is: %d%%' % (max_proba * 100))

    lead_status_dict = dict()
    dict.update(lead_status_dict, value=str(lead_status_value))
    dict.update(lead_status_dict, proba=str(max_proba))

    return lead_status_dict
def knn_predict(training_samples, training_labels, test_samples, test_lables,
                k_neighbours=5, weights="uniform", algorithm="auto"):
    from sklearn.neighbors import KNeighborsClassifier
    clf = KNeighborsClassifier(n_neighbors=k_neighbours, weights=weights, algorithm=algorithm)

    t0 = time()
    clf.fit(training_samples, training_labels)
    training_time = round(time() - t0, 3)

    t0 = time()
    pred = clf.predict(test_samples)
    test_time = round(time() - t0, 3)

    from sklearn.metrics import accuracy_score
    acc = accuracy_score(pred, test_lables)

    no_features = np.array(training_samples).shape[1]
    training_samples = np.array(training_samples).shape[0]
    test_samples = np.array(test_samples).shape[0]

    with open("Temp\\results.txt", "w") as outfile:
        outfile.write("Algorithm : {}\n".format("KNN"))
        outfile.write("K = {}\n".format(k_neighbours))
        outfile.write("weight = {}\n".format(weights))
        outfile.write("algorithm = {}\n".format(algorithm))
        outfile.write("No of features : {}\n".format(no_features))
        outfile.write("No of training samples : {}\n".format(training_samples))
        outfile.write("No of test samples : {}\n".format(test_samples))
        outfile.write("Training time : {}\n".format(training_time))
        outfile.write("Test time : {}\n".format(test_time))
        outfile.write("Accuracy : {}\n".format(acc))

    with open("Temp\\result_labels.csv", "wb") as outfile:
        np.savetxt(outfile, pred)
def train_random_forest():
    # Selecting the model
    return mp.ModelProperties(), RandomForestClassifier(n_estimators=100)  # default estimators is 10

# http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
def train_knn():
    # Selecting the model
    return mp.ModelProperties(), neighbors.KNeighborsClassifier()  # default is 5 neighbors

# http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn-svm-svc
def __init__(self, isTrain, isOutlierRemoval=0):
    super(ClassificationKNN, self).__init__(isTrain, isOutlierRemoval)
    # data preprocessing
    self.dataPreprocessing()

    # first parameter is the K neighbors
    # 'uniform' assigns uniform weights to each neighbor
    # 'distance' assigns weights proportional to the inverse of the distance from the query point
    # default metric is euclidean distance
    self.clf = neighbors.KNeighborsClassifier(2, weights='uniform')
def build_classifier(self):
    self.classifier = KNeighborsClassifier(n_neighbors=1)
    self.classifier.fit(self.coordinates, self.labels)