The following 50 code examples, extracted from open-source Python projects, illustrate how to use sklearn.ensemble.AdaBoostClassifier().
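Before the extracted examples, here is a minimal, self-contained sketch of the basic pattern most of them follow: wrap a weak base estimator in AdaBoostClassifier, fit on training data, and score on held-out data. The dataset and parameter values are illustrative choices only, not taken from any of the listed projects; note also that recent scikit-learn releases rename the base_estimator argument to estimator.

from sklearn.datasets import load_iris
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Illustrative data; the projects below supply their own feature matrices.
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# A depth-1 decision tree ("stump") is the conventional weak learner for AdaBoost.
clf = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=1),
    n_estimators=100,
    learning_rate=0.5)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))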
def get_feature_importance(self, clf, model_name):
    clfs = {'RandomForestClassifier': 'feature_importances',
            'ExtraTreesClassifier': 'feature_importances',
            'AdaBoostClassifier': 'feature_importances',
            'LogisticRegression': 'coef',
            'svm.SVC': 'coef',
            'GradientBoostingClassifier': 'feature_importances',
            'GaussianNB': None,
            'DecisionTreeClassifier': 'feature_importances',
            'SGDClassifier': 'coef',
            'KNeighborsClassifier': None,
            'linear.SVC': 'coef'}

    if clfs[model_name] == 'feature_importances':
        return list(clf.feature_importances_)
    elif clfs[model_name] == 'coef':
        return list(clf.coef_.tolist())
    else:
        return None
def get_classifier_class(class_name):
    name_table = {
        'svm': SVC,
        'k_neighbors': KNeighborsClassifier,
        'gaussian_process': GaussianProcessClassifier,
        'decision_tree': DecisionTreeClassifier,
        'random_forest': RandomForestClassifier,
        'ada_boost': AdaBoostClassifier,
        'mlp': MLPClassifier,
        'gaussian_naive_bayes': GaussianNB,
        'quadratic_discriminant_analysis': QuadraticDiscriminantAnalysis
    }

    if class_name not in name_table:
        raise ValueError('No such classifier')

    return name_table[class_name]
def define_model(self, model, parameters, n_cores=0):
    clfs = {'RandomForestClassifier': RandomForestClassifier(n_estimators=50, n_jobs=7),
            'ExtraTreesClassifier': ExtraTreesClassifier(n_estimators=10, n_jobs=7, criterion='entropy'),
            'AdaBoostClassifier': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
            'LogisticRegression': LogisticRegression(penalty='l1', C=1e5),
            'svm.SVC': svm.SVC(kernel='linear', probability=True, random_state=0),
            'GradientBoostingClassifier': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
            'GaussianNB': GaussianNB(),
            'DecisionTreeClassifier': DecisionTreeClassifier(),
            'SGDClassifier': SGDClassifier(loss="hinge", penalty="l2", n_jobs=7),
            'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=3),
            'linear.SVC': svm.LinearSVC()}

    if model not in clfs:
        raise ConfigError("Unsupported model {}".format(model))

    clf = clfs[model]
    clf.set_params(**parameters)
    return clf
def __init__(self, isTrain, isOutlierRemoval):
    super(ClassificationAdaBoost, self).__init__(isTrain, isOutlierRemoval)
    # data preprocessing
    self.dataPreprocessing()

    self.dt_stump = DecisionTreeClassifier(max_depth=10)
    self.ada = AdaBoostClassifier(
        base_estimator=self.dt_stump,
        learning_rate=1,
        n_estimators=7,
        algorithm="SAMME.R")
    # self.dt_stump = DecisionTreeClassifier(max_depth=14)
    # self.ada = AdaBoostClassifier(
    #     base_estimator=self.dt_stump,
    #     learning_rate=1,
    #     n_estimators=50,
    #     algorithm="SAMME")
def __init__(self, data_block, predictors=[], cv_folds=10,
             scoring_metric='accuracy', additional_display_metrics=[]):

    base_classification.__init__(
        self, alg=AdaBoostClassifier(), data_block=data_block,
        predictors=predictors, cv_folds=cv_folds,
        scoring_metric=scoring_metric,
        additional_display_metrics=additional_display_metrics)

    self.model_output = pd.Series(self.default_parameters)
    self.model_output['Feature_Importance'] = "-"

    # Set parameters to default values:
    self.set_parameters(set_default=True)
def get_classifier(self):
    algo = self.algo
    if algo == "GBT":
        return GradientBoostingClassifier()
    elif algo == "RF":
        return RandomForestClassifier()
    elif algo == "ADB":
        return AdaBoostClassifier()
    elif algo == "DT":
        return DecisionTreeClassifier()
    elif algo == "NB":
        return BernoulliNB()
    elif algo == "SGD":
        return SGDClassifier()
    elif algo == "SVC":
        return LinearSVC()
    elif algo == "MLPC":
        return MLPClassifier(activation='logistic', batch_size='auto',
                             early_stopping=True, hidden_layer_sizes=(100,),
                             learning_rate='adaptive', learning_rate_init=0.1,
                             max_iter=5000, random_state=1, solver='lbfgs',
                             tol=0.0001, validation_fraction=0.1,
                             verbose=False, warm_start=False)
    return 0
def performAdaBoostClass(X_train, y_train, X_test, y_test, fout, savemodel):
    """
    Ada Boosting binary Classification
    """
    # n = parameters[0]
    # l = parameters[1]

    clf = AdaBoostClassifier()
    clf.fit(X_train, y_train)

    # if savemodel == True:
    #     fname_out = '{}-{}.pickle'.format(fout, datetime.now())
    #     with open(fname_out, 'wb') as f:
    #         cPickle.dump(clf, f, -1)

    accuracy = clf.score(X_test, y_test)

    print "AdaBoost: ", accuracy
def buildModel(dataset, method, parameters):
    """
    Build final model for predicting real testing data
    """
    features = dataset.columns[0:-1]

    if method == 'RNN':
        clf = performRNNlass(dataset[features], dataset['UpDown'])
        return clf

    elif method == 'RF':
        clf = RandomForestClassifier(n_estimators=1000, n_jobs=-1)

    elif method == 'KNN':
        clf = neighbors.KNeighborsClassifier()

    elif method == 'SVM':
        c = parameters[0]
        g = parameters[1]
        clf = SVC(C=c, gamma=g)

    elif method == 'ADA':
        clf = AdaBoostClassifier()

    return clf.fit(dataset[features], dataset['UpDown'])
def learn(x, y, test_x):
    # set sample weight
    weight_list = []
    for j in range(len(y)):
        if y[j] == "0":
            weight_list.append(variables.weight_0_ada)
        if y[j] == "1000":
            weight_list.append(variables.weight_1000_ada)
        if y[j] == "1500":
            weight_list.append(variables.weight_1500_ada)
        if y[j] == "2000":
            weight_list.append(variables.weight_2000_ada)

    clf = AdaBoostClassifier(n_estimators=variables.n_estimators_ada,
                             learning_rate=variables.learning_rate_ada).fit(
        x, y, np.asarray(weight_list))

    prediction_list = clf.predict(test_x)
    prediction_list_prob = clf.predict_proba(test_x)

    return prediction_list, prediction_list_prob
def init_clf(clf_used, params=None):
    if params is not None:
        params_used = params
    elif clf_used == 'svm':
        params_used = svm_params
    elif clf_used == 'ada_boost':
        params_used = rf_params
    elif clf_used == 'lr':
        params_used = lr_params
    else:
        params_used = rf_params

    if clf_used == 'svm':
        clf = SVC(**params_used)
    elif clf_used == 'ada_boost':
        rf = RandomForestClassifier(**rf_params)
        clf = AdaBoostClassifier(base_estimator=rf, **params_used)
    elif clf_used == 'lr':
        clf = LogisticRegressionCV(**params_used)
    else:
        clf = RandomForestClassifier(**params_used)
    return clf
def performAdaBoostClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel):
    """
    Ada Boosting binary Classification
    """
    # n = parameters[0]
    # l = parameters[1]

    clf = AdaBoostClassifier()
    clf.fit(X_train, y_train)

    if savemodel == True:
        # fname_out = '{}-{}.pickle'.format(fout, datetime.now())
        fname_out = fout + '.pickle'
        with open(fname_out, 'wb') as f:
            pickle.dump(clf, f, -1)

    accuracy = clf.score(X_test, y_test)

    return accuracy
def test_AdaBoostClassifier(*data):
    '''
    test AdaBoost score with different numbers of classifiers
    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    clf = ensemble.AdaBoostClassifier(learning_rate=0.1)
    clf.fit(X_train, y_train)
    ## graph
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    estimators_num = len(clf.estimators_)
    X = range(1, estimators_num + 1)
    ax.plot(list(X), list(clf.staged_score(X_train, y_train)), label="Training score")
    ax.plot(list(X), list(clf.staged_score(X_test, y_test)), label="Testing score")
    ax.set_xlabel("estimator num")
    ax.set_ylabel("score")
    ax.legend(loc="best")
    ax.set_title("AdaBoostClassifier")
    plt.show()
def test_AdaBoostClassifier_learning_rate(*data):
    '''
    test performance with different learning rates
    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    learning_rates = np.linspace(0.01, 1)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    training_scores = []
    testing_scores = []
    for learning_rate in learning_rates:
        clf = ensemble.AdaBoostClassifier(learning_rate=learning_rate, n_estimators=500)
        clf.fit(X_train, y_train)
        training_scores.append(clf.score(X_train, y_train))
        testing_scores.append(clf.score(X_test, y_test))
    ax.plot(learning_rates, training_scores, label="Training score")
    ax.plot(learning_rates, testing_scores, label="Testing score")
    ax.set_xlabel("learning rate")
    ax.set_ylabel("score")
    ax.legend(loc="best")
    ax.set_title("AdaBoostClassifier")
    plt.show()
def test_gridsearch():
    # Check that base trees can be grid-searched.
    # AdaBoost classification
    boost = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
    parameters = {'n_estimators': (1, 2),
                  'base_estimator__max_depth': (1, 2),
                  'algorithm': ('SAMME', 'SAMME.R')}
    clf = GridSearchCV(boost, parameters)
    clf.fit(iris.data, iris.target)

    # AdaBoost regression
    boost = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(),
                              random_state=0)
    parameters = {'n_estimators': (1, 2),
                  'base_estimator__max_depth': (1, 2)}
    clf = GridSearchCV(boost, parameters)
    clf.fit(boston.data, boston.target)
def classify(train=None, test=None, data=None, res_dir="res/", disp=True, outfilename=None):
    """Description of compare
    compare multiple classifiers and display the best one
    """
    utils.print_success("Comparison of different classifiers")
    if data is not None:
        train_features = data["train_features"]
        train_groundtruths = data["train_groundtruths"]
        test_features = data["test_features"]
        test_groundtruths = data["test_groundtruths"]
    else:
        train = utils.abs_path_file(train)
        test = utils.abs_path_file(test)
        train_features, train_groundtruths = read_file(train)
        test_features, test_groundtruths = read_file(test)
    if not utils.create_dir(res_dir):
        res_dir = utils.abs_path_dir(res_dir)
    classifiers = {
        "RandomForest": RandomForestClassifier(n_jobs=-1)
        # "RandomForest": RandomForestClassifier(n_estimators=5),
        # "KNeighbors": KNeighborsClassifier(3),
        # "GaussianProcess": GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
        # "DecisionTree": DecisionTreeClassifier(max_depth=5),
        # "MLP": MLPClassifier(),
        # "AdaBoost": AdaBoostClassifier(),
        # "GaussianNB": GaussianNB(),
        # "QDA": QuadraticDiscriminantAnalysis(),
        # "SVM": SVC(kernel="linear", C=0.025),
        # "GradientBoosting": GradientBoostingClassifier(),
        # "ExtraTrees": ExtraTreesClassifier(),
        # "LogisticRegression": LogisticRegression(),
        # "LinearDiscriminantAnalysis": LinearDiscriminantAnalysis()
    }
    for key in classifiers:
        utils.print_success(key)
        clf = classifiers[key]
        utils.print_info("\tFit")
        clf.fit(train_features, train_groundtruths)
        utils.print_info("\tPredict")
        predictions = clf.predict(test_features)
    return predictions
def constructModel(corpus, classList, features, modelOutput):
    """
    Trains a Decision Tree model on the test corpus.

    Args:
        corpus: A list of lists, containing the GC content, coverage, and class number.
        classList: A list of class names.
        features: List of variables used by each contig.
        modelOutput: Location to save model as GraphViz DOT, or False to save no model.
    Returns:
        classifier: A DecisionTreeClassifier object that has been trained on the test corpus.
    """
    corpus.sort()  # just in case
    X = []
    Y = []

    for item in corpus:
        X.append(item[:-1])  # all but the last item
        Y.append(item[-1])  # only the last item
    X_train, X_test, Y_train, Y_test = mscv.train_test_split(X, Y, test_size=0.3, random_state=0)

    # TODO: implement classifier testing and comparison, now only baggingClassifier is used as per paper
    # treeClassifier = tree.DecisionTreeClassifier()
    # treeClassifier = treeClassifier.fit(X_train, Y_train)
    # click.echo("Decision tree classifier built, score is %s out of 1.00" % treeClassifier.score(X_test, Y_test))

    baggingClassifier = ensemble.BaggingClassifier()
    baggingClassifier = baggingClassifier.fit(X_train, Y_train)
    click.echo("Bagging classifier built, score is %s out of 1.00" % baggingClassifier.score(X_test, Y_test))

    # forestClassifier = ensemble.RandomForestClassifier(n_estimators=10)
    # forestClassifier = forestClassifier.fit(X_train, Y_train)
    # click.echo("Random forest classifier built, score is %s out of 1.00" % forestClassifier.score(X_test, Y_test))

    # adaClassifier = ensemble.AdaBoostClassifier(n_estimators=100)
    # adaClassifier = adaClassifier.fit(X_train, Y_train)
    # click.echo("AdaBoost classifier built, score is %s out of 1.00" % adaClassifier.score(X_test, Y_test))

    # gradientClassifier = ensemble.GradientBoostingClassifier(n_estimators=100)
    # gradientClassifier = gradientClassifier.fit(X_train, Y_train)
    # click.echo("Gradient tree boosting classifier built, score is %s out of 1.00" % gradientClassifier.score(X_test, Y_test))

    if modelOutput:
        with open(modelOutput, 'w') as dotfile:
            tree.export_graphviz(baggingClassifier, out_file=dotfile, feature_names=features,
                                 class_names=classList, filled=True, rounded=True,
                                 special_characters=True)
    return baggingClassifier
def adaboost(train, test, smoteit=True):
    "ADABOOST"
    if smoteit:
        train = SMOTE(train)

    clf = AdaBoostClassifier()
    train_DF = formatData(train)
    test_DF = formatData(test)
    features = train_DF.columns[:-2]
    klass = train_DF[train_DF.columns[-2]]
    # set_trace()
    clf.fit(train_DF[features], klass)
    preds = clf.predict(test_DF[test_DF.columns[:-2]]).tolist()
    return preds
def adaBoost(self, settings, data=None, dropna=True):
    df = self.__loadData(data, dropna)
    features = df.columns[:-1]
    X = df[features]
    y = df.iloc[:, -1].values

    seed = 7
    num_trees = 500
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    print kfold
    model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
    results = model_selection.cross_val_score(model, X, y, cv=kfold)
    model.fit(X, y)
    print results.mean()
    print model.score(X, y)
    return True
def classification(lead):
    # classifiers = [
    #     ('ab', AdaBoostClassifier()),
    #     ('dt', DecisionTreeClassifier(max_depth=5)),
    #     ('kn', KNeighborsClassifier(16)),
    # ]

    inputs = get_dataset_input_from_database(lead.keys())
    outputs = get_dataset_output_from_database()

    print('The total number of examples in the dataset is: %d' % (len(inputs)))

    inputs_training, inputs_test, outputs_training, outputs_test = train_test_split(
        inputs, outputs, test_size=0.3, random_state=42)

    print('The number of examples used for training are: %d' % (len(inputs_training)))
    print('The number of examples used for testing are: %d' % (len(inputs_test)))

    knn = KNeighborsClassifier(n_neighbors=7, p=2)
    knn.fit(inputs_training, np.ravel(outputs_training))

    print('[K=7] The probability of the algorithm to be right is: %f%%' % (knn.score(inputs_test, outputs_test) * 100))

    # voting_classifier = VotingClassifier(estimators=classifiers, voting='hard')
    # voting_classifier = voting_classifier.fit(inputs_training, np.ravel(outputs_training))
    # print('The probability of the machine to be right is: %f%%' % (voting_classifier.score(inputs_test, outputs_test) * 100))

    print('Lead data:')
    print(lead)

    data_to_predict = convert_dict_to_tuple(lead)

    print('Lead data to predict:')
    print(data_to_predict)

    lead_status = knn.predict(data_to_predict)
    lead_status_value = lead_status[0]
    # lead_status = voting_classifier.predict(data_to_predict)

    print('According to lead data, his status is: %d' % (lead_status_value))
    print('[0] unqualified [1] qualified')

    proba = knn.predict_proba(data_to_predict)
    max_proba = max(proba[0])

    print('Proba is: %d%%' % (max_proba * 100))

    lead_status_dict = dict()
    dict.update(lead_status_dict, value=str(lead_status_value))
    dict.update(lead_status_dict, proba=str(max_proba))

    return lead_status_dict
def adaboost_predict(training_samples, training_labels, test_samples, test_lables,
                     n_estimators=50, learning_rate=1.0):
    from sklearn.ensemble import AdaBoostClassifier
    clf = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate)

    t0 = time()
    clf.fit(training_samples, training_labels)
    training_time = round(time() - t0, 3)

    t0 = time()
    pred = clf.predict(test_samples)
    test_time = round(time() - t0, 3)

    from sklearn.metrics import accuracy_score
    acc = accuracy_score(pred, test_lables)

    no_features = np.array(training_samples).shape[1]
    training_samples = np.array(training_samples).shape[0]
    test_samples = np.array(test_samples).shape[0]

    with open("Temp\\results.txt", "w") as outfile:
        outfile.write("Alogirthm : {}\n".format("Adaboost"))
        outfile.write("Estimators = {}\n".format(n_estimators))
        outfile.write("Learning rate = {}\n".format(learning_rate))
        outfile.write("No of features : {}\n".format(no_features))
        outfile.write("No of training samples : {}\n".format(training_samples))
        outfile.write("No of test samples : {}\n".format(test_samples))
        outfile.write("Training time : {}\n".format(training_time))
        outfile.write("Test time : {}\n".format(test_time))
        outfile.write("Accuracy : {}\n".format(acc))

    with open("Temp\\result_labels.csv", "wb") as outfile:
        np.savetxt(outfile, pred)
def __init__(self, isTrain):
    super(ClassificationHmmGeneralize, self).__init__(isTrain)
    # data preprocessing
    self.dataPreprocessing()

    self.dt_stump = DecisionTreeClassifier(max_depth=10)
    self.ada = AdaBoostClassifier(
        base_estimator=self.dt_stump,
        learning_rate=1,
        n_estimators=5,
        algorithm="SAMME.R")

    # load the general data
    # feature 0~7: flight number dummy variables
    # feature 8: departure date; feature 9: observed date state;
    # feature 10: minimum price; feature 11: maximum price
    # feature 12: output; feature 13: current price
    # feature 14: flight index
    self.X_general = np.load('inputGeneralClf_HmmParsed/X_train.npy')
    self.y_general = np.load('inputGeneralClf_HmmParsed/y_train.npy')
    self.y_general = self.y_general.reshape((self.y_general.shape[0], 1))
    self.y_general_price = np.load('inputGeneralClf_HmmParsed/y_train_price.npy')
    self.y_general_price = self.y_general_price.reshape((self.y_general_price.shape[0], 1))
    self.y_general_index = np.load('inputGeneralClf_HmmParsed/y_index.npy')
    self.y_general_index = self.y_general_index.reshape((self.y_general_index.shape[0], 1))

    self.routes_general = ["BGY_OTP",  # route 1
                           "BUD_VKO",  # route 2
                           "CRL_OTP",  # route 3
                           "CRL_WAW",  # route 4
                           "LTN_OTP",  # route 5
                           "LTN_PRG",  # route 6
                           "OTP_BGY",  # route 7
                           "OTP_CRL",  # route 8
                           "OTP_LTN",  # route 9
                           "PRG_LTN",  # route 10
                           "VKO_BUD",  # route 11
                           "WAW_CRL"]  # route 12
def get_data_preprocessor_balancing(params, y):
    d_balancing = params['layer_dict_list'][1]

    if params['balancing'] == str(d_balancing['None']) or params['balancing'] == 'None':
        # for fp: ['ExtraTreesClassifier', 'LinearSVC'] + clf: ['DecisionTreeClassifier',
        # 'ExtraTreesClassifier', 'LinearSVC', 'SVC', 'RandomForestClassifier', 'SGDClassifier']
        params['class_weight'] = None
        # for clf: ['AdaBoostClassifier', 'GradientBoostingClassifier']
        params['sample_weight'] = None
    elif params['balancing'] == str(d_balancing['weighting']) or params['balancing'] == 'weighting':
        # for fp: ['ExtraTreesClassifier', 'LinearSVC'] + clf: ['DecisionTreeClassifier',
        # 'ExtraTreesClassifier', 'LinearSVC', 'SVC', 'RandomForestClassifier', 'SGDClassifier']
        params['class_weight'] = 'auto'
        # for clf: ['AdaBoostClassifier', 'GradientBoostingClassifier']
        if len(y.shape) > 1:
            offsets = [2 ** i for i in range(y.shape[1])]
            y_ = np.sum(y * offsets, axis=1)
        else:
            y_ = y
        unique, counts = np.unique(y_, return_counts=True)
        cw = 1. / counts
        cw = cw / np.mean(cw)
        sample_weight = np.ones(y_.shape)
        for i, ue in enumerate(unique):
            mask = y_ == ue
            sample_weight[mask] *= cw[i]
        params['sample_weight'] = sample_weight

    return params
def generate_filter(X_train, y_train):
    # clf = RidgeClassifierCV(alphas=[0.01, 0.1, 1, 10])
    clf = RandomForestClassifier(n_jobs=4)
    # clf = AdaBoostClassifier()
    clf.fit(X_train, y_train)
    return clf
def perform_adaboost(self, X_train_std, y_train, X_test_std, y_test):
    ## perform adaboost
    ada = AdaBoostClassifier(n_estimators=10)
    ada.fit(X_train_std, y_train)
    train_score = cross_val_score(ada, X_train_std, y_train)
    print('The training accuracy is {:.2f}%'.format(train_score.mean() * 100))
    test_score = cross_val_score(ada, X_test_std, y_test)
    print('The test accuracy is {:.2f}%'.format(test_score.mean() * 100))

    X = X_test_std
    y = y_test
    resolution = 0.01
    # Z = svm.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'green', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y_test))])
    X = X_test_std
    y = y_test

    # plot the decision surface
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = ada.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.3, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                    alpha=0.5, c=cmap(idx),
                    marker=markers[idx], label=cl)

    plt.show()
def fitAndPredict(self):
    # classifier = LogisticRegression()
    # classifier.fit(self.trainingSet, self.trainingLabel)
    # pred_labels = classifier.predict(self.testSet)
    # print 'Logistic:'
    # print classification_report(self.testLabel, pred_labels)

    classifier = SVC()
    classifier.fit(self.trainingSet, self.trainingLabel)
    pred_labels = {}
    for user in self.testDict:
        pred_labels[user] = classifier.predict([self.model.docvecs[user]])
    # print 'SVM:'
    # print classification_report(self.testLabel, pred_labels)
    return pred_labels

    # classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
    #                                         max_depth=1, random_state=0)
    # classifier.fit(self.trainingSet, self.trainingLabel)
    # pred_labels = classifier.predict(self.testSet)
    # print 'GBDT:'
    # print classification_report(self.testLabel, pred_labels)
    #
    # clf = AdaBoostClassifier(n_estimators=100)
    # classifier.fit(self.trainingSet, self.trainingLabel)
    # pred_labels = classifier.predict(self.testSet)
    # print 'AdaBoost:'
    # print classification_report(self.testLabel, pred_labels)
    #
    # clf = RandomForestClassifier(n_estimators=10)
    # classifier.fit(self.trainingSet, self.trainingLabel)
    # pred_labels = classifier.predict(self.testSet)
    # print 'Random Forest:'
    # print classification_report(self.testLabel, pred_labels)
def performAdaBoostClass(X_train, y_train, X_test, y_test, parameters, savemodel):
    """
    Ada Boosting binary Classification
    """
    # n = parameters[0]
    # l = parameters[1]

    clf = AdaBoostClassifier()
    clf.fit(X_train, y_train)

    accuracy = clf.score(X_test, y_test)

    return accuracy
def test_sample_weight_elm():
    """Smoke test - AdaBoostClassifier should work with ELMClassifier."""
    X = Xdigits_binary[:50]
    y = ydigits_binary[:50]

    elm = ELMClassifier(n_hidden=20)
    clf = AdaBoostClassifier(n_estimators=3, base_estimator=elm)
    clf.fit(X, y)
    assert_greater(clf.score(X, y), 0.9)
def getBestOne(self, name):
    # if the classifier has already been generated, load it
    try:
        from sklearn.externals import joblib
        clf = joblib.load(name + '.pkl')
        return clf
    except:
        pass

    # if the classifier does not exist,
    # search for the best loop time
    bestAccuracyRate, n_estimators = 0, 1
    for loopTimes in range(2, 200):
        sclf = AdaBoostClassifier(base_estimator=self.clf,
                                  learning_rate=1,
                                  n_estimators=loopTimes,
                                  algorithm='SAMME')
        # cross validation to get the score
        X_train, X_test, Y_train, Y_test = train_test_split(self.X, self.Y, test_size=0.1, random_state=0)
        sclf.fit(X_train, Y_train)
        accuracyRate = sclf.score(X_test, Y_test)
        if accuracyRate > bestAccuracyRate:
            bestAccuracyRate = accuracyRate
            n_estimators = loopTimes

    # save the classifier as a dump
    joblib.dump(sclf, name + '.pkl')
    return AdaBoostClassifier(base_estimator=self.clf,
                              learning_rate=1,
                              n_estimators=n_estimators,
                              algorithm='SAMME')
def ada_boost_classifier(self, data, target, learning_rate=1, n_estimators=400, enable_ada=False):
    ada_boost = AdaBoostClassifier(
        base_estimator=self.clf,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        algorithm="SAMME.R")
    ada_boost.fit(data, target)
    if not enable_ada:
        self.clf = ada_boost
    print "AdaBoost training finished"
def ada_boost_classifier_err(self, data, target, learning_rate=1, n_estimators=400, show_score=False):
    ada_boost = AdaBoostClassifier(
        base_estimator=self.clf,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        algorithm="SAMME.R")
    ada_boost.fit(data, target)
    score = ada_boost.score(data, target)
    if not show_score:
        print "Fitness score: " + str(score)
    return 1.0 - score
def varius_classifiers():
    # List of tuples of a classifier and its parameters.
    clf_list = []

    clf_linearsvm = LinearSVC()
    params_linearsvm = {"C": [0.5, 1, 5, 10, 100, 10**10], "tol": [0.1, 0.0000000001], "class_weight": ['balanced']}
    clf_list.append((clf_linearsvm, params_linearsvm))

    clf_tree = DecisionTreeClassifier()
    params_tree = {"min_samples_split": [2, 5, 10, 20], "criterion": ('gini', 'entropy')}
    clf_list.append((clf_tree, params_tree))

    clf_random_tree = RandomForestClassifier()
    params_random_tree = {"n_estimators": [2, 3, 5], "criterion": ('gini', 'entropy')}
    clf_list.append((clf_random_tree, params_random_tree))

    clf_adaboost = AdaBoostClassifier()
    params_adaboost = {"n_estimators": [20, 30, 50, 100]}
    clf_list.append((clf_adaboost, params_adaboost))

    clf_knn = KNeighborsClassifier()
    params_knn = {"n_neighbors": [2, 5], "p": [2, 3]}
    clf_list.append((clf_knn, params_knn))

    clf_log = LogisticRegression()
    params_log = {"C": [0.5, 1, 10, 10**2, 10**10, 10**20], "tol": [0.1, 0.00001, 0.0000000001], "class_weight": ['balanced']}
    clf_list.append((clf_log, params_log))

    clf_lda = LinearDiscriminantAnalysis()
    params_lda = {"n_components": [0, 1, 2, 5, 10]}
    clf_list.append((clf_lda, params_lda))

    logistic = LogisticRegression()
    rbm = BernoulliRBM()
    clf_rbm = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
    params_rbm = {"logistic__tol": [0.0000000001, 10**-20],
                  "logistic__C": [0.05, 1, 10, 10**2, 10**10, 10**20],
                  "logistic__class_weight": ['balanced'],
                  "rbm__n_components": [2, 3, 4]}
    clf_list.append((clf_rbm, params_rbm))

    return clf_list
def exportPresentationData(classifier, action):
    dir = input('Give Data Directory: ')

    if int(classifier) == 1:
        clf = GradientBoostingClassifier()
        classify(dir, clf, action)
    elif int(classifier) == 2:
        clf = LogisticRegression()
        classify(dir, clf, action)
    elif int(classifier) == 3:
        clf = KNeighborsClassifier(n_neighbors=5)
        classify(dir, clf, action)
    elif int(classifier) == 4:
        clf = DecisionTreeClassifier()
        classify(dir, clf, action)
    elif int(classifier) == 5:
        clf = svm.LinearSVC()
        classify_type2(dir, clf, action)
    elif int(classifier) == 6:
        clf = RandomForestClassifier()
        classify(dir, clf, action)
    elif int(classifier) == 7:
        clf = ExtraTreesClassifier()
        classify(dir, clf, action)
    elif int(classifier) == 8:
        clf = IsolationForest()
        classify_type2(dir, clf, action)
    elif int(classifier) == 9:
        clf = AdaBoostClassifier(n_estimators=100)
        classify(dir, clf, action)
    elif int(classifier) == 10:
        clf = BaggingClassifier(DecisionTreeClassifier())
        classify(dir, clf, action)
    elif int(classifier) == 11:
        clf1 = GradientBoostingClassifier()
        clf2 = AdaBoostClassifier()
        clf = VotingClassifier(estimators=[('abdt', clf1), ('gbdt', clf2)], voting='soft')
        classify(dir, clf, action)
def exportPresentationData(classifier, action, dir):
    if int(classifier) == 1:
        clf = GradientBoostingClassifier()
        classify(dir, clf, action)
    elif int(classifier) == 2:
        clf = LogisticRegression()
        classify(dir, clf, action)
    elif int(classifier) == 3:
        clf = KNeighborsClassifier(n_neighbors=5)
        classify(dir, clf, action)
    elif int(classifier) == 4:
        clf = DecisionTreeClassifier()
        classify(dir, clf, action)
    elif int(classifier) == 5:
        clf = svm.LinearSVC()
        classify_type2(dir, clf, action)
    elif int(classifier) == 6:
        clf = RandomForestClassifier()
        classify(dir, clf, action)
    elif int(classifier) == 7:
        clf = ExtraTreesClassifier()
        classify(dir, clf, action)
    elif int(classifier) == 8:
        clf = IsolationForest()
        classify_type2(dir, clf, action)
    elif int(classifier) == 9:
        clf = AdaBoostClassifier(n_estimators=100)
        classify(dir, clf, action)
    elif int(classifier) == 10:
        clf = BaggingClassifier(DecisionTreeClassifier())
        classify(dir, clf, action)
    elif int(classifier) == 11:
        clf1 = GradientBoostingClassifier()
        clf2 = AdaBoostClassifier()
        clf = VotingClassifier(estimators=[('abdt', clf1), ('gbdt', clf2)], voting='soft')
        classify(dir, clf, action)
def define_clfs_params(self):
    '''
    Defines all relevant parameters and classes for classifier objects.
    Edit these if you wish to change parameters.
    '''
    # These are the classifiers
    self.clfs = {
        'RF': RandomForestClassifier(n_estimators=50, n_jobs=-1),
        'ET': ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='entropy'),
        'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth=[1, 5, 10, 15]), algorithm="SAMME", n_estimators=200),
        'LR': LogisticRegression(penalty='l1', C=1e5),
        'SVM': svm.SVC(kernel='linear', probability=True, random_state=0),
        'GB': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
        'NB': GaussianNB(),
        'DT': DecisionTreeClassifier(),
        'SGD': SGDClassifier(loss='log', penalty='l2'),
        'KNN': KNeighborsClassifier(n_neighbors=3)
    }
    # These are the parameters which will be run through
    self.params = {
        'RF': {'n_estimators': [1, 10, 100, 1000], 'max_depth': [10, 15, 20, 30, 40, 50, 60, 70, 100],
               'max_features': ['sqrt', 'log2'], 'min_samples_split': [2, 5, 10], 'random_state': [1]},
        'LR': {'penalty': ['l1', 'l2'], 'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10], 'random_state': [1]},
        'SGD': {'loss': ['log'], 'penalty': ['l2', 'l1', 'elasticnet'], 'random_state': [1]},
        'ET': {'n_estimators': [1, 10, 100, 1000], 'criterion': ['gini', 'entropy'], 'max_depth': [1, 3, 5, 10, 15],
               'max_features': ['sqrt', 'log2'], 'min_samples_split': [2, 5, 10], 'random_state': [1]},
        'AB': {'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1, 10, 100, 1000], 'random_state': [1]},
        'GB': {'n_estimators': [1, 10, 100, 1000], 'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.5],
               'subsample': [0.1, 0.5, 1.0], 'max_depth': [1, 3, 5, 10, 20, 50, 100], 'random_state': [1]},
        'NB': {},
        'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1, 2, 15, 20, 30, 40, 50],
               'max_features': ['sqrt', 'log2'], 'min_samples_split': [2, 5, 10], 'random_state': [1]},
        'SVM': {'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10], 'kernel': ['linear'], 'random_state': [1]},
        'KNN': {'n_neighbors': [1, 5, 10, 25, 50, 100], 'weights': ['uniform', 'distance'],
                'algorithm': ['auto', 'ball_tree', 'kd_tree']}
    }
def runner(i): sem.acquire() print("learn begin %s" % i) clf = ensemble.AdaBoostClassifier(naive_bayes.GaussianNB()) clf = clf.fit(traindata, trainlabel[i]) svms.append((i, clf)) result[i] = clf.predict_proba(testdata) dbresult[i] = clf.predict_proba(dbdata) #print("label %s done\n%s" # % (i, metrics.classification_report(testlabel[i], result[i]))) #print metrics.confusion_matrix(testlabel[i], result) sem.release()
def runner(i): sem.acquire() print("learn begin %s" % i) clf = ensemble.AdaBoostClassifier(svm.LinearSVC()) clf = clf.fit(traindata, trainlabel[i]) svms.append((i, clf)) result[i] = clf.predict_proba(testdata) dbresult[i] = clf.predict_proba(dbdata) #print("label %s done\n%s" # % (i, metrics.classification_report(testlabel[i], result[i]))) #print metrics.confusion_matrix(testlabel[i], result) sem.release()
def __init__(self, genres, data, type='knn', name='', clf_kwargs=None):
    self.logger = get_logger('classifier')
    self.display_name = name

    self.genres = genres
    self.m_genres = {genre: i for i, genre in enumerate(genres)}
    self.randstate = np.random.RandomState()
    self.scaler = StandardScaler()

    clf_kwargs = {} if not clf_kwargs else clf_kwargs
    if type in ['svm', 'mlp']:
        clf_kwargs['random_state'] = self.randstate

    if type == 'knn':
        self.proto_clf = KNeighborsClassifier(**clf_kwargs)
    elif type == 'svm':
        self.proto_clf = SVC(**clf_kwargs)
    elif type == 'dtree':
        self.proto_clf = DecisionTreeClassifier(**clf_kwargs)
    elif type == 'gnb':
        self.proto_clf = GaussianNB(**clf_kwargs)
    elif type == 'perc':
        self.proto_clf = Perceptron(**clf_kwargs)
    elif type == 'mlp':
        self.proto_clf = MLPClassifier(**clf_kwargs)
    elif type == 'ada':
        self.proto_clf = AdaBoostClassifier(**clf_kwargs)
    else:
        raise LookupError('Classifier type "{}" is invalid'.format(type))

    self._convert_data(data)

    self.logger.info('Classifier: {} (params={})'.format(
        self.proto_clf.__class__.__name__, clf_kwargs))
def train_model(text_matrix, categories):
    # model = AdaBoostClassifier(
    #     DecisionTreeClassifier(max_depth=3),
    #     n_estimators=500,
    #     algorithm="SAMME")
    model = RandomForestClassifier(n_estimators=100, max_depth=8)
    model.fit(text_matrix, categories)
    return model
def set_adaboost_classifier(self):
    return SkLearner(ensemble.AdaBoostClassifier())
def getModels():
    result = []
    result.append("LinearRegression")
    result.append("BayesianRidge")
    result.append("ARDRegression")
    result.append("ElasticNet")
    result.append("HuberRegressor")
    result.append("Lasso")
    result.append("LassoLars")
    result.append("Rigid")
    result.append("SGDRegressor")
    result.append("SVR")
    result.append("MLPClassifier")
    result.append("KNeighborsClassifier")
    result.append("SVC")
    result.append("GaussianProcessClassifier")
    result.append("DecisionTreeClassifier")
    result.append("RandomForestClassifier")
    result.append("AdaBoostClassifier")
    result.append("GaussianNB")
    result.append("LogisticRegression")
    result.append("QuadraticDiscriminantAnalysis")
    return result
def test_AdaBoostClassifier_base_classifier(*data):
    '''
    test AdaBoost classifier with different categories of base classifier
    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    from sklearn.naive_bayes import GaussianNB
    X_train, X_test, y_train, y_test = data
    fig = plt.figure()
    ax = fig.add_subplot(2, 1, 1)

    clf = ensemble.AdaBoostClassifier(learning_rate=0.1)
    clf.fit(X_train, y_train)
    ## graph
    estimators_num = len(clf.estimators_)
    X = range(1, estimators_num + 1)
    ax.plot(list(X), list(clf.staged_score(X_train, y_train)), label="Training score")
    ax.plot(list(X), list(clf.staged_score(X_test, y_test)), label="Testing score")
    ax.set_xlabel("estimator num")
    ax.set_ylabel("score")
    ax.legend(loc="lower right")
    ax.set_ylim(0, 1)
    ax.set_title("AdaBoostClassifier with Decision Tree")

    ax = fig.add_subplot(2, 1, 2)
    clf = ensemble.AdaBoostClassifier(learning_rate=0.1, base_estimator=GaussianNB())
    clf.fit(X_train, y_train)
    ## graph
    estimators_num = len(clf.estimators_)
    X = range(1, estimators_num + 1)
    ax.plot(list(X), list(clf.staged_score(X_train, y_train)), label="Training score")
    ax.plot(list(X), list(clf.staged_score(X_test, y_test)), label="Testing score")
    ax.set_xlabel("estimator num")
    ax.set_ylabel("score")
    ax.legend(loc="lower right")
    ax.set_ylim(0, 1)
    ax.set_title("AdaBoostClassifier with Gaussian Naive Bayes")
    plt.show()
def test_AdaBoostClassifier_algorithm(*data):
    '''
    test performance with different algorithms
    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    algorithms = ['SAMME.R', 'SAMME']
    fig = plt.figure()
    learning_rates = [0.05, 0.1, 0.5, 0.9]
    for i, learning_rate in enumerate(learning_rates):
        ax = fig.add_subplot(2, 2, i + 1)
        for i, algorithm in enumerate(algorithms):
            clf = ensemble.AdaBoostClassifier(learning_rate=learning_rate,
                                              algorithm=algorithm)
            clf.fit(X_train, y_train)
            ## graph
            estimators_num = len(clf.estimators_)
            X = range(1, estimators_num + 1)
            ax.plot(list(X), list(clf.staged_score(X_train, y_train)),
                    label="%s:Training score" % algorithms[i])
            ax.plot(list(X), list(clf.staged_score(X_test, y_test)),
                    label="%s:Testing score" % algorithms[i])
        ax.set_xlabel("estimator num")
        ax.set_ylabel("score")
        ax.legend(loc="lower right")
        ax.set_title("learning rate:%f" % learning_rate)
    fig.suptitle("AdaBoostClassifier")
    plt.show()
def ensemble_classify():
    label_list = get_labels()
    tweet_list = get_labelled_tweets()

    # vectorise using tf-idf
    vectoriser = TfidfVectorizer(min_df=3,
                                 max_features=None,
                                 strip_accents='unicode',
                                 analyzer='word',
                                 token_pattern=r'\w{1,}',
                                 ngram_range=(1, 2),
                                 use_idf=1,
                                 smooth_idf=1,
                                 sublinear_tf=1,)

    ## do transformation into vector
    vectoriser.fit(tweet_list)
    vectorised_tweet_list = vectoriser.transform(tweet_list)

    train_vector, test_vector, train_labels, test_labels = train_test_split(vectorised_tweet_list,
                                                                            label_list,
                                                                            test_size=0.8,
                                                                            random_state=42)

    n_estimators = 10  # number of weak learners
    model = AdaBoostClassifier(n_estimators=n_estimators)
    ada_classifier = model.fit(train_vector, train_labels)
    result = ada_classifier.predict(test_vector)

    # output result to csv
    create_directory('data')
    result.tofile("data/tfidf_ada.csv", sep=',')
    save_model(ada_classifier, 'tfidf_ada')

    # evaluation
    binarise_result = label_binarize(result, classes=class_list)
    binarise_labels = label_binarize(test_labels, classes=class_list)

    generate_eval_metrics(binarise_result, 'tfidf_ada', binarise_labels)
def test_classification_toy():
    # Check classification on a toy dataset.
    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg, random_state=0)
        clf.fit(X, y_class)
        assert_array_equal(clf.predict(T), y_t_class)
        assert_array_equal(np.unique(np.asarray(y_t_class)), clf.classes_)
        assert_equal(clf.predict_proba(T).shape, (len(T), 2))
        assert_equal(clf.decision_function(T).shape, (len(T),))
def test_iris():
    # Check consistency on dataset iris.
    classes = np.unique(iris.target)
    clf_samme = prob_samme = None

    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg)
        clf.fit(iris.data, iris.target)

        assert_array_equal(classes, clf.classes_)
        proba = clf.predict_proba(iris.data)
        if alg == "SAMME":
            clf_samme = clf
            prob_samme = proba
        assert_equal(proba.shape[1], len(classes))
        assert_equal(clf.decision_function(iris.data).shape[1], len(classes))

        score = clf.score(iris.data, iris.target)
        assert score > 0.9, "Failed with algorithm %s and score = %f" % \
            (alg, score)

    # Somewhat hacky regression test: prior to
    # ae7adc880d624615a34bafdb1d75ef67051b8200,
    # predict_proba returned SAMME.R values for SAMME.
    clf_samme.algorithm = "SAMME.R"
    assert_array_less(0,
                      np.abs(clf_samme.predict_proba(iris.data) - prob_samme))
def test_pickle():
    # Check pickability.
    import pickle

    # Adaboost classifier
    for alg in ['SAMME', 'SAMME.R']:
        obj = AdaBoostClassifier(algorithm=alg)
        obj.fit(iris.data, iris.target)
        score = obj.score(iris.data, iris.target)
        s = pickle.dumps(obj)

        obj2 = pickle.loads(s)
        assert_equal(type(obj2), obj.__class__)
        score2 = obj2.score(iris.data, iris.target)
        assert_equal(score, score2)

    # Adaboost regressor
    obj = AdaBoostRegressor(random_state=0)
    obj.fit(boston.data, boston.target)
    score = obj.score(boston.data, boston.target)
    s = pickle.dumps(obj)

    obj2 = pickle.loads(s)
    assert_equal(type(obj2), obj.__class__)
    score2 = obj2.score(boston.data, boston.target)
    assert_equal(score, score2)
def test_error():
    # Test that it gives proper exception on deficient input.
    assert_raises(ValueError,
                  AdaBoostClassifier(learning_rate=-1).fit,
                  X, y_class)

    assert_raises(ValueError,
                  AdaBoostClassifier(algorithm="foo").fit,
                  X, y_class)

    assert_raises(ValueError,
                  AdaBoostClassifier().fit,
                  X, y_class, sample_weight=np.asarray([-1]))
def test_base_estimator():
    # Test different base estimators.
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import SVC

    # XXX doesn't work with y_class because RF doesn't support classes_
    # Shouldn't AdaBoost run a LabelBinarizer?
    clf = AdaBoostClassifier(RandomForestClassifier())
    clf.fit(X, y_regr)

    clf = AdaBoostClassifier(SVC(), algorithm="SAMME")
    clf.fit(X, y_class)

    from sklearn.ensemble import RandomForestRegressor
    from sklearn.svm import SVR

    clf = AdaBoostRegressor(RandomForestRegressor(), random_state=0)
    clf.fit(X, y_regr)

    clf = AdaBoostRegressor(SVR(), random_state=0)
    clf.fit(X, y_regr)

    # Check that an empty discrete ensemble fails in fit, not predict.
    X_fail = [[1, 1], [1, 1], [1, 1], [1, 1]]
    y_fail = ["foo", "bar", 1, 2]
    clf = AdaBoostClassifier(SVC(), algorithm="SAMME")
    assert_raises_regexp(ValueError, "worse than random",
                         clf.fit, X_fail, y_fail)
def test_sample_weight_missing():
    from sklearn.linear_model import LogisticRegression
    from sklearn.cluster import KMeans

    clf = AdaBoostClassifier(KMeans(), algorithm="SAMME")
    assert_raises(ValueError, clf.fit, X, y_regr)

    clf = AdaBoostRegressor(KMeans())
    assert_raises(ValueError, clf.fit, X, y_regr)