The following 28 code examples, extracted from open-source Python projects, illustrate how to use sklearn.ensemble.BaggingClassifier().
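For orientation, here is a minimal, self-contained sketch of typical BaggingClassifier usage (not drawn from the projects below); the iris dataset and every hyperparameter value are arbitrary choices made for illustration only.

# Illustrative sketch only: bag decision trees on iris and compare the
# out-of-bag accuracy estimate with held-out test accuracy.
from sklearn.datasets import load_iris
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

clf = BaggingClassifier(
    DecisionTreeClassifier(),  # base estimator (a decision tree is also the default)
    n_estimators=50,           # number of bootstrapped estimators
    max_samples=0.8,           # fraction of training samples drawn for each estimator
    bootstrap=True,            # draw samples with replacement
    oob_score=True,            # estimate generalization error from out-of-bag samples
    random_state=0,
)
clf.fit(X_train, y_train)

print("OOB accuracy:  %.3f" % clf.oob_score_)
print("Test accuracy: %.3f" % clf.score(X_test, y_test))

Passing the base estimator positionally keeps the sketch compatible with both older scikit-learn releases (keyword base_estimator) and recent ones (keyword estimator).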
def test_classification():
    # Check classification for various parameter settings.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)
    grid = ParameterGrid({"max_samples": [0.5, 1.0],
                          "max_features": [1, 2, 4],
                          "bootstrap": [True, False],
                          "bootstrap_features": [True, False]})

    for base_estimator in [None,
                           DummyClassifier(),
                           Perceptron(),
                           DecisionTreeClassifier(),
                           KNeighborsClassifier(),
                           SVC()]:
        for params in grid:
            BaggingClassifier(base_estimator=base_estimator,
                              random_state=rng,
                              **params).fit(X_train, y_train).predict(X_test)
def test_warm_start(random_state=42):
    # Test if fitting incrementally with warm start gives a forest of the
    # right size and the same results as a normal fit.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)

    clf_ws = None
    for n_estimators in [5, 10]:
        if clf_ws is None:
            clf_ws = BaggingClassifier(n_estimators=n_estimators,
                                       random_state=random_state,
                                       warm_start=True)
        else:
            clf_ws.set_params(n_estimators=n_estimators)
        clf_ws.fit(X, y)
        assert_equal(len(clf_ws), n_estimators)

    clf_no_ws = BaggingClassifier(n_estimators=10, random_state=random_state,
                                  warm_start=False)
    clf_no_ws.fit(X, y)

    assert_equal(set([tree.random_state for tree in clf_ws]),
                 set([tree.random_state for tree in clf_no_ws]))
def test_warm_start_equal_n_estimators():
    # Test that nothing happens when fitting without increasing n_estimators
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)

    clf = BaggingClassifier(n_estimators=5, warm_start=True, random_state=83)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    # modify X to nonsense values, this should not change anything
    X_train += 1.

    assert_warns_message(UserWarning,
                         "Warm-start fitting without increasing n_estimators does not",
                         clf.fit, X_train, y_train)
    assert_array_equal(y_pred, clf.predict(X_test))
def test_warm_start_equivalence():
    # warm started classifier with 5+5 estimators should be equivalent to
    # one classifier with 10 estimators
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)

    clf_ws = BaggingClassifier(n_estimators=5, warm_start=True,
                               random_state=3141)
    clf_ws.fit(X_train, y_train)
    clf_ws.set_params(n_estimators=10)
    clf_ws.fit(X_train, y_train)
    y1 = clf_ws.predict(X_test)

    clf = BaggingClassifier(n_estimators=10, warm_start=False,
                            random_state=3141)
    clf.fit(X_train, y_train)
    y2 = clf.predict(X_test)

    assert_array_almost_equal(y1, y2)
def test_base():
    # Check BaseEnsemble methods.
    ensemble = BaggingClassifier(base_estimator=Perceptron(), n_estimators=3)

    iris = load_iris()
    ensemble.fit(iris.data, iris.target)
    ensemble.estimators_ = []  # empty the list and create estimators manually

    ensemble._make_estimator()
    ensemble._make_estimator()
    ensemble._make_estimator()
    ensemble._make_estimator(append=False)

    assert_equal(3, len(ensemble))
    assert_equal(3, len(ensemble.estimators_))

    assert_true(isinstance(ensemble[0], Perceptron))
def __init__(self, info, verbose=True, debug_mode=False):
    self.label_num = info['label_num']
    self.target_num = info['target_num']
    self.task = info['task']
    self.metric = info['metric']
    self.postprocessor = None
    #self.postprocessor = MultiLabelEnsemble(LogisticRegression(), balance=True)  # To calibrate proba
    self.postprocessor = MultiLabelEnsemble(LogisticRegression(), balance=False)  # To calibrate proba
    if debug_mode >= 2:
        self.name = "RandomPredictor"
        self.model = RandomPredictor(self.target_num)
        self.predict_method = self.model.predict_proba
        return
    if info['task'] == 'regression':
        if info['is_sparse'] == True:
            self.name = "BaggingRidgeRegressor"
            self.model = BaggingRegressor(base_estimator=Ridge(), n_estimators=1, verbose=verbose)  # unfortunately, no warm start...
        else:
            self.name = "GradientBoostingRegressor"
            self.model = GradientBoostingRegressor(n_estimators=1, max_depth=4, min_samples_split=14, verbose=verbose, warm_start=True)
        self.predict_method = self.model.predict  # Always predict probabilities
    else:
        if info['has_categorical']:  # Out of laziness, we do not convert categorical variables...
            self.name = "RandomForestClassifier"
            self.model = RandomForestClassifier(n_estimators=1, verbose=verbose)  # unfortunately, no warm start...
        elif info['is_sparse']:
            self.name = "BaggingNBClassifier"
            self.model = BaggingClassifier(base_estimator=BernoulliNB(), n_estimators=1, verbose=verbose)  # unfortunately, no warm start...
        else:
            self.name = "GradientBoostingClassifier"
            self.model = eval(self.name + "(n_estimators=1, verbose=" + str(verbose) + ", random_state=1, warm_start=True)")
        if info['task'] == 'multilabel.classification':
            self.model = MultiLabelEnsemble(self.model)
        self.predict_method = self.model.predict_proba
def constructModel(corpus, classList, features, modelOutput):
    """
    Trains a Decision Tree model on the test corpus.

    Args:
        corpus: A list of lists, containing the GC content, coverage, and class number.
        classList: A list of class names.
        features: List of variables used by each contig.
        modelOutput: Location to save model as GraphViz DOT, or False to save no model.

    Returns:
        classifier: A DecisionTreeClassifier object that has been trained on the test corpus.
    """
    corpus.sort()  # just in case
    X = []
    Y = []

    for item in corpus:
        X.append(item[:-1])  # all but the last item
        Y.append(item[-1])   # only the last item

    X_train, X_test, Y_train, Y_test = mscv.train_test_split(X, Y, test_size=0.3, random_state=0)

    # TODO: implement classifier testing and comparison, now only baggingClassifier is used as per paper
    #treeClassifier = tree.DecisionTreeClassifier()
    #treeClassifier = treeClassifier.fit(X_train, Y_train)
    #click.echo("Decision tree classifier built, score is %s out of 1.00" % treeClassifier.score(X_test, Y_test))

    baggingClassifier = ensemble.BaggingClassifier()
    baggingClassifier = baggingClassifier.fit(X_train, Y_train)
    click.echo("Bagging classifier built, score is %s out of 1.00" % baggingClassifier.score(X_test, Y_test))

    #forestClassifier = ensemble.RandomForestClassifier(n_estimators=10)
    #forestClassifier = forestClassifier.fit(X_train, Y_train)
    #click.echo("Random forest classifier built, score is %s out of 1.00" % forestClassifier.score(X_test, Y_test))

    #adaClassifier = ensemble.AdaBoostClassifier(n_estimators=100)
    #adaClassifier = adaClassifier.fit(X_train, Y_train)
    #click.echo("AdaBoost classifier built, score is %s out of 1.00" % adaClassifier.score(X_test, Y_test))

    #gradientClassifier = ensemble.GradientBoostingClassifier(n_estimators=100)
    #gradientClassifier = gradientClassifier.fit(X_train, Y_train)
    #click.echo("Gradient tree boosting classifier built, score is %s out of 1.00" % gradientClassifier.score(X_test, Y_test))

    if modelOutput:
        with open(modelOutput, 'w') as dotfile:
            tree.export_graphviz(baggingClassifier, out_file=dotfile, feature_names=features,
                                 class_names=classList, filled=True, rounded=True,
                                 special_characters=True)

    return baggingClassifier
def __init__(self, n_estimators=100, tie_break=1, default_label=0, random_state=None):
    """Sets up the MDR ensemble

    Parameters
    ----------
    n_estimators: int (default: 100)
        Number of MDR models to include in the ensemble
    tie_break: int (default: 1)
        Default label in case there's a tie in a set of feature pair values
    default_label: int (default: 0)
        Default label in case there's no data for a set of feature pair values
    random_state: int, RandomState instance or None (default: None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used by np.random.

    Returns
    -------
    None

    """
    self.n_estimators = n_estimators
    self.tie_break = tie_break
    self.default_label = default_label
    self.random_state = random_state
    self.feature_map = defaultdict(lambda: default_label)
    self.ensemble = BaggingClassifier(base_estimator=MDR(tie_break=tie_break, default_label=default_label),
                                      n_estimators=n_estimators, random_state=random_state)
def __init__(self):
    self.learner = BaggingClassifier(KNeighborsClassifier())
def exportPresentationData(classifier, action):
    dir = input('Give Data Directory: ')

    if int(classifier) == 1:
        clf = GradientBoostingClassifier()
        classify(dir, clf, action)
    elif int(classifier) == 2:
        clf = LogisticRegression()
        classify(dir, clf, action)
    elif int(classifier) == 3:
        clf = KNeighborsClassifier(n_neighbors=5)
        classify(dir, clf, action)
    elif int(classifier) == 4:
        clf = DecisionTreeClassifier()
        classify(dir, clf, action)
    elif int(classifier) == 5:
        clf = svm.LinearSVC()
        classify_type2(dir, clf, action)
    elif int(classifier) == 6:
        clf = RandomForestClassifier()
        classify(dir, clf, action)
    elif int(classifier) == 7:
        clf = ExtraTreesClassifier()
        classify(dir, clf, action)
    elif int(classifier) == 8:
        clf = IsolationForest()
        classify_type2(dir, clf, action)
    elif int(classifier) == 9:
        clf = AdaBoostClassifier(n_estimators=100)
        classify(dir, clf, action)
    elif int(classifier) == 10:
        clf = BaggingClassifier(DecisionTreeClassifier())
        classify(dir, clf, action)
    elif int(classifier) == 11:
        clf1 = GradientBoostingClassifier()
        clf2 = AdaBoostClassifier()
        clf = VotingClassifier(estimators=[('abdt', clf1), ('gbdt', clf2)], voting='soft')
        classify(dir, clf, action)
def exportPresentationData(classifier, action, dir):
    if int(classifier) == 1:
        clf = GradientBoostingClassifier()
        classify(dir, clf, action)
    elif int(classifier) == 2:
        clf = LogisticRegression()
        classify(dir, clf, action)
    elif int(classifier) == 3:
        clf = KNeighborsClassifier(n_neighbors=5)
        classify(dir, clf, action)
    elif int(classifier) == 4:
        clf = DecisionTreeClassifier()
        classify(dir, clf, action)
    elif int(classifier) == 5:
        clf = svm.LinearSVC()
        classify_type2(dir, clf, action)
    elif int(classifier) == 6:
        clf = RandomForestClassifier()
        classify(dir, clf, action)
    elif int(classifier) == 7:
        clf = ExtraTreesClassifier()
        classify(dir, clf, action)
    elif int(classifier) == 8:
        clf = IsolationForest()
        classify_type2(dir, clf, action)
    elif int(classifier) == 9:
        clf = AdaBoostClassifier(n_estimators=100)
        classify(dir, clf, action)
    elif int(classifier) == 10:
        clf = BaggingClassifier(DecisionTreeClassifier())
        classify(dir, clf, action)
    elif int(classifier) == 11:
        clf1 = GradientBoostingClassifier()
        clf2 = AdaBoostClassifier()
        clf = VotingClassifier(estimators=[('abdt', clf1), ('gbdt', clf2)], voting='soft')
        classify(dir, clf, action)
def learn(x, y, test_x):
    clf = BaggingClassifier(KNeighborsClassifier(1, 'distance'),
                            max_samples=variables.max_samples_knnBag,
                            max_features=variables.max_features_knnBag,
                            n_jobs=variables.n_jobs_knnBag,
                            n_estimators=variables.n_estimators_knnBag,
                            bootstrap=variables.bootstrap_knnBag,
                            bootstrap_features=variables.bootstrap_features_knnBag,
                            random_state=variables.random_knnBag
                            ).fit(x, y)
    prediction_list = clf.predict(test_x)
    return prediction_list
def runner(i):
    sem.acquire()
    print("learn begin %s" % i)
    clf = ensemble.BaggingClassifier(naive_bayes.GaussianNB())
    clf = clf.fit(traindata, trainlabel[i])
    svms.append((i, clf))
    result[i] = clf.predict_proba(testdata)
    dbresult[i] = clf.predict_proba(dbdata)
    #print("label %s done\n%s"
    #      % (i, metrics.classification_report(testlabel[i], result[i])))
    #print metrics.confusion_matrix(testlabel[i], result)
    sem.release()
def trainerb(traindata, trainlabel):
    clf = ensemble.BaggingClassifier(
        linear_model.LogisticRegression())
    #clf.verbose = 1
    #clf.tol = clf.tol / 10
    clf = clf.fit(traindata, trainlabel)
    return clf
def runner(i):
    sem.acquire()
    print("learn begin %s" % i)
    clf = ensemble.BaggingClassifier(svm.LinearSVC())
    clf = clf.fit(traindata, trainlabel[i])
    svms.append((i, clf))
    result[i] = clf.predict_proba(testdata)
    dbresult[i] = clf.predict_proba(dbdata)
    #print("label %s done\n%s"
    #      % (i, metrics.classification_report(testlabel[i], result[i])))
    #print metrics.confusion_matrix(testlabel[i], result)
    sem.release()
def runner(i):
    sem.acquire()
    print("learn begin %s" % i)
    clf = ensemble.BaggingClassifier(neighbors.KNeighborsClassifier())
    clf = clf.fit(traindata, trainlabel[i])
    svms.append((i, clf))
    result[i] = clf.predict_proba(testdata)
    dbresult[i] = clf.predict_proba(dbdata)
    print("label %s done\n%s"
          % (i, metrics.classification_report(testlabel[i], result[i])))
    #print metrics.confusion_matrix(testlabel[i], result)
    sem.release()
def set_bagging_classifier(self):
    return SkLearner(ensemble.BaggingClassifier(tree.DecisionTreeClassifier()))
def test_probability():
    # Predict probabilities.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)

    with np.errstate(divide="ignore", invalid="ignore"):
        # Normal case
        ensemble = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                     random_state=rng).fit(X_train, y_train)

        assert_array_almost_equal(np.sum(ensemble.predict_proba(X_test), axis=1),
                                  np.ones(len(X_test)))

        assert_array_almost_equal(ensemble.predict_proba(X_test),
                                  np.exp(ensemble.predict_log_proba(X_test)))

        # Degenerate case, where some classes are missing
        ensemble = BaggingClassifier(base_estimator=LogisticRegression(),
                                     random_state=rng,
                                     max_samples=5).fit(X_train, y_train)

        assert_array_almost_equal(np.sum(ensemble.predict_proba(X_test), axis=1),
                                  np.ones(len(X_test)))

        assert_array_almost_equal(ensemble.predict_proba(X_test),
                                  np.exp(ensemble.predict_log_proba(X_test)))
def test_oob_score_classification():
    # Check that oob prediction is a good estimation of the generalization
    # error.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)

    for base_estimator in [DecisionTreeClassifier(), SVC()]:
        clf = BaggingClassifier(base_estimator=base_estimator,
                                n_estimators=100,
                                bootstrap=True,
                                oob_score=True,
                                random_state=rng).fit(X_train, y_train)

        test_score = clf.score(X_test, y_test)

        assert_less(abs(test_score - clf.oob_score_), 0.1)

        # Test with few estimators
        assert_warns(UserWarning,
                     BaggingClassifier(base_estimator=base_estimator,
                                       n_estimators=1,
                                       bootstrap=True,
                                       oob_score=True,
                                       random_state=rng).fit,
                     X_train,
                     y_train)
def test_error():
    # Test that it gives proper exception on deficient input.
    X, y = iris.data, iris.target
    base = DecisionTreeClassifier()

    # Test max_samples
    assert_raises(ValueError,
                  BaggingClassifier(base, max_samples=-1).fit, X, y)
    assert_raises(ValueError,
                  BaggingClassifier(base, max_samples=0.0).fit, X, y)
    assert_raises(ValueError,
                  BaggingClassifier(base, max_samples=2.0).fit, X, y)
    assert_raises(ValueError,
                  BaggingClassifier(base, max_samples=1000).fit, X, y)
    assert_raises(ValueError,
                  BaggingClassifier(base, max_samples="foobar").fit, X, y)

    # Test max_features
    assert_raises(ValueError,
                  BaggingClassifier(base, max_features=-1).fit, X, y)
    assert_raises(ValueError,
                  BaggingClassifier(base, max_features=0.0).fit, X, y)
    assert_raises(ValueError,
                  BaggingClassifier(base, max_features=2.0).fit, X, y)
    assert_raises(ValueError,
                  BaggingClassifier(base, max_features=5).fit, X, y)
    assert_raises(ValueError,
                  BaggingClassifier(base, max_features="foobar").fit, X, y)

    # Test support of decision_function
    assert_false(hasattr(BaggingClassifier(base).fit(X, y), 'decision_function'))
def test_gridsearch():
    # Check that bagging ensembles can be grid-searched.
    # Transform iris into a binary classification task
    X, y = iris.data, iris.target
    y[y == 2] = 1

    # Grid search with scoring based on decision_function
    parameters = {'n_estimators': (1, 2),
                  'base_estimator__C': (1, 2)}

    GridSearchCV(BaggingClassifier(SVC()),
                 parameters,
                 scoring="roc_auc").fit(X, y)
def test_bagging_with_pipeline():
    estimator = BaggingClassifier(make_pipeline(SelectKBest(k=1),
                                                DecisionTreeClassifier()),
                                  max_features=2)
    estimator.fit(iris.data, iris.target)
def test_bagging_sample_weight_unsupported_but_passed():
    estimator = BaggingClassifier(DummyZeroEstimator())
    rng = check_random_state(0)

    estimator.fit(iris.data, iris.target).predict(iris.data)
    assert_raises(ValueError, estimator.fit, iris.data, iris.target,
                  sample_weight=rng.randint(10, size=(iris.data.shape[0])))
def test_warm_start_with_oob_score_fails():
    # Check using oob_score and warm_start simultaneously fails
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    clf = BaggingClassifier(n_estimators=5, warm_start=True, oob_score=True)
    assert_raises(ValueError, clf.fit, X, y)
def test_oob_score_removed_on_warm_start():
    X, y = make_hastie_10_2(n_samples=2000, random_state=1)

    clf = BaggingClassifier(n_estimators=50, oob_score=True)
    clf.fit(X, y)

    clf.set_params(warm_start=True, oob_score=False, n_estimators=100)
    clf.fit(X, y)

    assert_raises(AttributeError, getattr, clf, "oob_score_")
def challenge():
    ## use dev openml to run
    # Download task, run learner, publish results
    task = tasks.get_task(14951)
    ## clf = BaggingClassifier(SVC(), n_estimators = 128)

    '''
    clf = RandomForestClassifier(n_estimators = 128, class_weight = 'balanced_subsample')
    '''

    '''
    clf = BaggingClassifier(ExtraTreeClassifier(), n_estimators = 20)
    '''

    '''
    param_grid = {'max_depth': np.linspace(1, 15, num = 15, dtype = np.int64),
                  'class_weight': ['balanced', 'balanced_subsample', None],
                  'min_samples_split': np.linspace(1, 15, num = 15, dtype = np.int64),
                  'criterion': ['gini', 'entropy']
                  }
    base_clf = RandomForestClassifier(n_estimators = 20)
    clf = GridSearchCV(base_clf, param_grid = param_grid, scoring = 'roc_auc',
                       cv = 10, pre_dispatch = '2*n_jobs', n_jobs = 4)
    '''

    '''
    ## grid search - gamma and C, grid_den = 20, time needed = 13.36s
    grid_den = 1
    param_grid = {#'C': np.logspace(-5, 5, num = grid_den, base = 2.0),
                  'gamma': np.logspace(-5, 5, num = grid_den, base = 2.0)
                  }
    clf = GridSearchCV(SVC(probability = True), param_grid = param_grid, scoring = 'roc_auc',
                       cv = 10, pre_dispatch = '2*n_jobs', n_jobs = 4)
    '''

    clf = KNeighborsClassifier(n_neighbors = 5, algorithm = 'brute', metric = 'cosine')

    run = runs.run_task(task, clf)

    return_code, response = run.publish()

    # get the run id for reference
    if (return_code == 200):
        response_dict = xmltodict.parse(response)
        run_id = response_dict['oml:upload_run']['oml:run_id']
        print("Uploaded run with id %s. Check it at www.openml.org/r/%s" % (run_id, run_id))
def test_sparse_classification():
    # Check classification for various parameter settings on sparse input.

    class CustomSVC(SVC):
        """SVC variant that records the nature of the training set"""

        def fit(self, X, y):
            super(CustomSVC, self).fit(X, y)
            self.data_type_ = type(X)
            return self

    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)
    parameter_sets = [
        {"max_samples": 0.5,
         "max_features": 2,
         "bootstrap": True,
         "bootstrap_features": True},
        {"max_samples": 1.0,
         "max_features": 4,
         "bootstrap": True,
         "bootstrap_features": True},
        {"max_features": 2,
         "bootstrap": False,
         "bootstrap_features": True},
        {"max_samples": 0.5,
         "bootstrap": True,
         "bootstrap_features": False},
    ]

    for sparse_format in [csc_matrix, csr_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)
        for params in parameter_sets:
            for f in ['predict', 'predict_proba', 'predict_log_proba', 'decision_function']:
                # Trained on sparse format
                sparse_classifier = BaggingClassifier(
                    base_estimator=CustomSVC(decision_function_shape='ovr'),
                    random_state=1,
                    **params
                ).fit(X_train_sparse, y_train)
                sparse_results = getattr(sparse_classifier, f)(X_test_sparse)

                # Trained on dense format
                dense_classifier = BaggingClassifier(
                    base_estimator=CustomSVC(decision_function_shape='ovr'),
                    random_state=1,
                    **params
                ).fit(X_train, y_train)
                dense_results = getattr(dense_classifier, f)(X_test)
                assert_array_equal(sparse_results, dense_results)

                sparse_type = type(X_train_sparse)
                types = [i.data_type_ for i in sparse_classifier.estimators_]

                assert all([t == sparse_type for t in types])
def test_base_estimator():
    # Check base_estimator and its default values.
    rng = check_random_state(0)

    # Classification
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)

    ensemble = BaggingClassifier(None,
                                 n_jobs=3,
                                 random_state=0).fit(X_train, y_train)

    assert_true(isinstance(ensemble.base_estimator_, DecisionTreeClassifier))

    ensemble = BaggingClassifier(DecisionTreeClassifier(),
                                 n_jobs=3,
                                 random_state=0).fit(X_train, y_train)

    assert_true(isinstance(ensemble.base_estimator_, DecisionTreeClassifier))

    ensemble = BaggingClassifier(Perceptron(),
                                 n_jobs=3,
                                 random_state=0).fit(X_train, y_train)

    assert_true(isinstance(ensemble.base_estimator_, Perceptron))

    # Regression
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    ensemble = BaggingRegressor(None,
                                n_jobs=3,
                                random_state=0).fit(X_train, y_train)

    assert_true(isinstance(ensemble.base_estimator_, DecisionTreeRegressor))

    ensemble = BaggingRegressor(DecisionTreeRegressor(),
                                n_jobs=3,
                                random_state=0).fit(X_train, y_train)

    assert_true(isinstance(ensemble.base_estimator_, DecisionTreeRegressor))

    ensemble = BaggingRegressor(SVR(),
                                n_jobs=3,
                                random_state=0).fit(X_train, y_train)
    assert_true(isinstance(ensemble.base_estimator_, SVR))