def do_ml(ticker):
    """Train a three-member voting ensemble for ``ticker`` and score it.

    Features and labels come from ``extract_featuresets(ticker)``; 25% of
    the samples are held out.  The hold-out accuracy and a count of the
    predicted classes are printed, and the accuracy is returned.
    """
    features, labels, _frame = extract_featuresets(ticker)
    split = cross_validation.train_test_split(features, labels, test_size=0.25)
    X_train, X_test, y_train, y_test = split

    # Disabled alternative kept in the original: a lone KNeighborsClassifier.
    members = [
        ('lsvc', svm.LinearSVC()),
        ('knn', neighbors.KNeighborsClassifier()),
        ('rfor', RandomForestClassifier()),
    ]
    clf = VotingClassifier(members)
    clf.fit(X_train, y_train)

    confidence = clf.score(X_test, y_test)
    print('accuracy:', confidence)
    print('predicted class counts:', Counter(clf.predict(X_test)))
    print()
    print()
    return confidence


# examples of running:
def article_trainers(articles: ArticleDB):
    """
    Run repeated models against article db to predict validity score
    for articles.

    Every (classifier, parameter grid) pair is handed to ``train_model``
    with ``probabilities=True``.  The first four results are then combined
    into a soft-voting ensemble, which is itself passed to ``train_model``
    with an empty grid.  Nothing is returned.
    """
    candidates = (
        (DecisionTreeClassifier, {}),
        (RandomForestClassifier, {}),
        (LogisticRegression, {'C': [0.01, 0.1, 1, 10, 100]}),
        (MultinomialNB, {'alpha': [0.1, 1.0, 10.0, 100.0]}),
        (LinearSVC, {'C': [0.01, 0.1, 1, 10, 100]}),
    )

    def _named(model):
        # VotingClassifier wants (name, estimator) pairs; the name is str(model).
        return str(model), model

    trained_models = [
        _named(train_model(articles, classifier, param_grid, probabilities=True))
        for classifier, param_grid in candidates
    ]

    # Only the first four members vote; the LinearSVC result is left out
    # (presumably because soft voting needs predict_proba -- TODO confirm).
    ensemble_learner = VotingClassifier(estimators=trained_models[:4],
                                        voting='soft')
    train_model(articles, ensemble_learner, {})
def test_estimator_init():
    """Check the error ``fit`` raises for each invalid constructor argument."""
    lr = LogisticRegression(random_state=1)

    # (expected exception, expected message, VotingClassifier kwargs)
    cases = [
        (AttributeError,
         'Invalid `estimators` attribute, `estimators` should be a list of '
         '(string, estimator) tuples',
         {'estimators': []}),
        (ValueError,
         "Voting must be 'soft' or 'hard'; got (voting='error')",
         {'estimators': [('lr', lr)], 'voting': 'error'}),
        (ValueError,
         'Number of classifiers and weights must be equal; '
         'got 2 weights, 1 estimators',
         {'estimators': [('lr', lr)], 'weights': [1, 2]}),
    ]

    for exc_type, message, kwargs in cases:
        eclf = VotingClassifier(**kwargs)
        assert_raise_message(exc_type, message, eclf.fit, X, y)
def __init__(self, api, lobes=False):
    """
    api   = stored unchanged on the instance as ``self.api``
    lobes = a dict of {name: classifier} to use in the VotingClassifier;
            when falsy, defaults to a RandomForestClassifier ('rf', 7 trees,
            fixed seed) and a DecisionTreeClassifier ('dt')
    """
    self.api = api
    self._trained = False
    self.split = splitTrainTestData
    self.prep = prepDataframe

    if not lobes:
        lobes = dict(
            rf=RandomForestClassifier(n_estimators=7, random_state=666),
            dt=DecisionTreeClassifier(),
        )

    # Hard (majority-label) voting over every lobe, using all available cores.
    self.lobe = VotingClassifier(estimators=list(lobes.items()),
                                 voting='hard',
                                 n_jobs=-1)
def _voting(estimators, **kwargs):
    """Build a VotingClassifier from labelled estimators.

    Each estimator must expose ``shStr`` and ``lgStr`` attributes.  The
    ``shStr`` value doubles as the member name inside the ensemble, and the
    returned classifier gets its own ``shStr``/``lgStr`` made by joining the
    members' labels with ' + '.  Extra keyword arguments are forwarded to
    ``VotingClassifier`` (``n_jobs`` is already fixed to 1 here).
    """
    separator = ' + '
    members = [(est.shStr, est) for est in estimators]

    clfObj = VotingClassifier(members, n_jobs=1, **kwargs)
    clfObj.lgStr = separator.join(est.lgStr for est in estimators)
    clfObj.shStr = separator.join(est.shStr for est in estimators)
    return clfObj
def classification(lead):
    """Fit a 7-nearest-neighbour model on the stored dataset and classify ``lead``.

    The dataset inputs are loaded for the keys of ``lead``; 30% of the
    examples are held out (fixed seed 42) to report accuracy.  Progress and
    results are printed along the way.

    Returns a dict with two string values: ``'value'`` (the predicted
    status) and ``'proba'`` (the highest class probability for the lead).
    """
    # A hard-voting ensemble (AdaBoost, depth-5 decision tree, 16-neighbour
    # KNN) existed here only as commented-out code; the KNN path below is
    # the one that runs.
    features = get_dataset_input_from_database(lead.keys())
    targets = get_dataset_output_from_database()
    print('The total number of examples in the dataset is: %d' % len(features))

    split = train_test_split(features, targets, test_size=0.3, random_state=42)
    train_x, test_x, train_y, test_y = split
    print('The number of examples used for training are: %d' % len(train_x))
    print('The number of examples used for testing are: %d' % len(test_x))

    knn = KNeighborsClassifier(n_neighbors=7, p=2)
    knn.fit(train_x, np.ravel(train_y))
    accuracy_pct = knn.score(test_x, test_y) * 100
    print('[K=7] The probability of the algorithm to be right is: %f%%'
          % accuracy_pct)

    print('Lead data:')
    print(lead)
    data_to_predict = convert_dict_to_tuple(lead)
    print('Lead data to predict:')
    print(data_to_predict)

    lead_status_value = knn.predict(data_to_predict)[0]
    print('According to lead data, his status is: %d' % lead_status_value)
    print('[0] unqualified [1] qualified')

    max_proba = max(knn.predict_proba(data_to_predict)[0])
    print('Proba is: %d%%' % (max_proba * 100))

    return {'value': str(lead_status_value), 'proba': str(max_proba)}
def fit_voting(self):
    """Build a weighted soft-voting ensemble over externally computed models.

    For each model name an ``ExternalModel`` is created that maps the
    validation and test documents to result files under ``self.data_dir``.
    The per-model weights are then searched with ``basinhopping`` so that
    the weighted sum of validation scores picks the right label as often as
    possible.  The first model's weight is pinned to 1 during the search
    (the arg-max is invariant to scaling all weights), and the final weights
    are normalised to sum to 1.

    Returns a ``(name, estimator)`` tuple where ``name`` is
    ``'vote(<comma-joined model names>)'`` and ``estimator`` is a
    ``VotingClassifier`` whose ``le_`` and ``estimators_`` attributes are
    assigned directly here instead of via ``fit``.
    """
    voting = 'soft'
    names = [
        # 'svm(word_n_grams,char_n_grams,all_caps,hashtags,punctuations,punctuation_last,emoticons,emoticon_last,'
        # 'elongated,negation_count)',
        # 'logreg(w2v_doc)',
        # 'logreg(w2v_word_avg_google)',
        'word2vec_bayes',
        'cnn_word(embedding=google)',
        'rnn_word(embedding=google)',
    ]
    classifiers = [ExternalModel({
        self.val_docs: os.path.join(self.data_dir, 'results/val/{}.json'.format(name)),
        self.test_docs: os.path.join(self.data_dir, 'results/test/{}.json'.format(name)),
    }) for name in names]

    all_scores = []
    for classifier in classifiers:
        scores = classifier.predict_proba(self.val_docs)
        if voting == 'hard':
            # Not reached while `voting` is fixed to 'soft' above.
            scores = Binarizer(1 / 3).transform(scores)
        all_scores.append(scores)
    # Stacked per-model scores; indexed below as (model, sample, class).
    all_scores = np.array(all_scores)
    all_scores_first, all_scores_rest = all_scores[0], all_scores[1:]

    le = LabelEncoder().fit(self.classes_)
    val_label_indexes = le.transform(self.val_labels())

    n_free_weights = len(classifiers) - 1

    def negative_hit_count(w_):
        """Return minus the number of validation labels predicted correctly."""
        # Sum the weighted remaining models FIRST, then add the first model
        # exactly once.  Adding before summing would broadcast the first
        # model's scores over every remaining model and count them
        # (n_models - 1) times, contradicting the w_0 = 1 assumption that
        # the final weight vector below relies on.
        weighted_rest = (all_scores_rest * w_.reshape((len(w_), 1, 1))).sum(axis=0)
        combined = all_scores_first + weighted_rest
        return -(val_label_indexes == np.argmax(combined, axis=1)).sum()

    # assume w_0=1 as w is invariant to scaling
    w = basinhopping(
        negative_hit_count,
        np.ones(n_free_weights),
        niter=1000,
        minimizer_kwargs=dict(method='L-BFGS-B',
                              bounds=[(0, None)] * n_free_weights),
    ).x
    w = np.hstack([[1], w])
    w /= w.sum()
    logging.info('w: {}'.format(w))

    estimator = VotingClassifier(list(zip(names, classifiers)), voting=voting, weights=w)
    # The members are already "trained" externally, so the fitted-state
    # attributes are filled in by hand rather than by calling fit().
    estimator.le_ = le
    estimator.estimators_ = classifiers
    return 'vote({})'.format(','.join(names)), estimator
def train_gesture_classifier(userlist, foldername):
    """
    Train a soft-voting (linear SVC + random forest) gesture classifier.

    For every user, the crop table ``<user>_loc.csv`` and all ``.jpg`` files
    in that user's folder are read; each image, its crop box and the class
    derived from the first character of the file name are turned into a
    training sample by ``generate_training_set``.

    :param userlist: iterable of user ids; each is appended to ``foldername``
                     to form that user's folder
    :param foldername: path prefix, concatenated directly with the user id
    :return: the fitted VotingClassifier
    """
    # Letter -> class index, in order.  'J' and 'Z' are not in the mapping.
    gesture_letters = 'ABCDEFGHIKLMNOPQRSTUVWXY'
    class_alpha_dict = {letter: label
                        for label, letter in enumerate(gesture_letters)}
    crop_columns = ('top_left_x', 'top_left_y',
                    'bottom_right_x', 'bottom_right_y')

    print("Generating training features for gesture classifier...")
    work_arr = []
    for user in userlist:
        user_dir = foldername + user + '/'
        crop_df = pd.read_csv(user_dir + user + '_loc.csv',
                              index_col=0, header=0)
        for filename in os.listdir(user_dir):
            if not filename.endswith('.jpg'):
                continue
            img_arr = imread(user_dir + filename, as_grey=True)
            row_key = user + '/' + filename
            crop_box = tuple(crop_df.loc[row_key, column]
                             for column in crop_columns)
            label = class_alpha_dict[filename[0]]
            work_arr.append((img_arr,) + crop_box + (label,))

    samples = list(map(generate_training_set, work_arr))
    del work_arr
    print("Garbage collector deleted objects:", gc.collect())

    # Each sample is indexed as (features, label).
    random.shuffle(samples)
    y_train = [sample[1] for sample in samples]
    x_train = [sample[0] for sample in samples]
    del samples
    print("Size of gesture classifier training set:", len(y_train))

    rfc_classifier = RandomForestClassifier(n_estimators=500,
                                            max_features='sqrt',
                                            n_jobs=8,
                                            warm_start=False)
    svc_classifier = SVC(cache_size=6000, kernel='linear', tol=1e-3,
                         decision_function_shape='ovr', C=1,
                         probability=True)
    members = [('sv', svc_classifier), ('rf1', rfc_classifier)]
    voting_classifier = VotingClassifier(estimators=members, voting='soft')
    voting_classifier.fit(x_train, y_train)
    print("Gesture classifier training complete.")
    return voting_classifier
def exportPresentationData(classifier, action):
    """Prompt for a data directory and run the selected classifier on it.

    ``classifier`` is converted with ``int()`` and selects one of eleven
    models (1-11).  Choices 5 (LinearSVC) and 8 (IsolationForest) are run
    through ``classify_type2``; every other choice goes through
    ``classify``.  An unknown number does nothing.  ``action`` is forwarded
    untouched.
    """
    data_dir = input('Give Data Directory: ')
    choice = int(classifier)

    # Lambdas keep construction (and name lookup) limited to the chosen model.
    factories = {
        1: lambda: GradientBoostingClassifier(),
        2: lambda: LogisticRegression(),
        3: lambda: KNeighborsClassifier(n_neighbors=5),
        4: lambda: DecisionTreeClassifier(),
        5: lambda: svm.LinearSVC(),
        6: lambda: RandomForestClassifier(),
        7: lambda: ExtraTreesClassifier(),
        8: lambda: IsolationForest(),
        9: lambda: AdaBoostClassifier(n_estimators=100),
        10: lambda: BaggingClassifier(DecisionTreeClassifier()),
        # NOTE(review): the labels look swapped ('abdt' names the gradient
        # boosting member, 'gbdt' the AdaBoost one) -- kept as-is.
        11: lambda: VotingClassifier(
            estimators=[('abdt', GradientBoostingClassifier()),
                        ('gbdt', AdaBoostClassifier())],
            voting='soft'),
    }

    build = factories.get(choice)
    if build is None:
        return

    clf = build()
    if choice in (5, 8):
        classify_type2(data_dir, clf, action)
    else:
        classify(data_dir, clf, action)
def create_voting_classifier_ensemble(model_tuple_list):
    '''
    Build a soft-voting ensemble and fit it on the module-level
    training data (X_train_b, y_train_b).

    INPUT
    - model_tuple_list: list of model tuples (name, model)
    OUTPUT
    - the fitted VotingClassifier
    '''
    # fit() returns the estimator itself, so the call can be chained.
    return VotingClassifier(model_tuple_list, voting='soft').fit(X_train_b,
                                                                 y_train_b)
def test_predictproba_hardvoting():
    """predict_proba must raise AttributeError on a hard-voting ensemble."""
    members = [(label, LogisticRegression()) for label in ('lr1', 'lr2')]
    eclf = VotingClassifier(estimators=members, voting='hard')
    assert_raise_message(AttributeError,
                         "predict_proba is not available when voting='hard'",
                         eclf.predict_proba, X)
def test_notfitted():
    """predict_proba on an unfitted ensemble must raise NotFittedError."""
    members = [(label, LogisticRegression()) for label in ('lr1', 'lr2')]
    eclf = VotingClassifier(estimators=members, voting='soft')
    expected = ("This VotingClassifier instance is not fitted yet. "
                "Call 'fit' with appropriate arguments before using "
                "this method.")
    assert_raise_message(NotFittedError, expected, eclf.predict_proba, X)
def test_majority_label_iris():
    """Check classification by majority label on dataset iris."""
    members = [
        ('lr', LogisticRegression(random_state=123)),
        ('rf', RandomForestClassifier(random_state=123)),
        ('gnb', GaussianNB()),
    ]
    eclf = VotingClassifier(estimators=members, voting='hard')

    # Mean 5-fold accuracy is expected to be 0.95 to two decimals.
    fold_scores = cross_val_score(eclf, X, y, cv=5, scoring='accuracy')
    assert_almost_equal(fold_scores.mean(), 0.95, decimal=2)
def test_tie_situation():
    """Check voting classifier selects smaller class label in tie situation."""
    lr = LogisticRegression(random_state=123)
    rf = RandomForestClassifier(random_state=123)
    eclf = VotingClassifier(estimators=[('lr', lr), ('rf', rf)],
                            voting='hard')

    # The two members disagree on sample 73 (2 vs 1); the ensemble is
    # expected to resolve the tie to the smaller label.
    sample = 73
    for model, expected_label in ((lr, 2), (rf, 1), (eclf, 1)):
        assert_equal(model.fit(X, y).predict(X)[sample], expected_label)
def test_predict_on_toy_problem():
    """Manually check predicted class labels for toy dataset.

    Each base classifier, and then the hard- and soft-voting ensembles built
    from them with equal weights, must reproduce the training labels exactly.
    """
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()

    X = np.array([[-1.1, -1.5],
                  [-1.2, -1.4],
                  [-3.4, -2.2],
                  [1.1, 1.2],
                  [2.1, 1.4],
                  [3.1, 2.3]])
    y = np.array([1, 1, 1, 2, 2, 2])
    expected = [1, 1, 1, 2, 2, 2]

    # Compare the labels element by element.  Wrapping both sides in all()
    # collapses each to True (every label is non-zero), so such a check
    # could never fail regardless of what was predicted.
    for clf in (clf1, clf2, clf3):
        assert_equal(clf.fit(X, y).predict(X).tolist(), expected)

    for voting in ('hard', 'soft'):
        eclf = VotingClassifier(estimators=[
            ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
            voting=voting,
            weights=[1, 1, 1])
        assert_equal(eclf.fit(X, y).predict(X).tolist(), expected)
def test_multilabel():
    """Check if error is raised for multilabel classification.

    Fitting on multilabel targets may raise NotImplementedError, which is
    tolerated; a fit that succeeds also passes.  Any other exception
    propagates and fails the test.
    """
    X, y = make_multilabel_classification(n_classes=2,
                                          n_labels=1,
                                          allow_unlabeled=False,
                                          random_state=123)
    ovr = OneVsRestClassifier(SVC(kernel='linear'))
    eclf = VotingClassifier(estimators=[('ovr', ovr)], voting='hard')

    try:
        eclf.fit(X, y)
    except NotImplementedError:
        pass
def test_gridsearch():
    """Check GridSearch support."""
    members = [
        ('lr', LogisticRegression(random_state=1)),
        ('rf', RandomForestClassifier(random_state=1)),
        ('gnb', GaussianNB()),
    ]
    eclf = VotingClassifier(estimators=members, voting='soft')

    # Searches a nested member parameter (lr__C) together with the
    # ensemble's own voting mode and weights.
    search_space = dict(
        lr__C=[1.0, 100.0],
        voting=['soft', 'hard'],
        weights=[[0.5, 0.5, 0.5], [1.0, 0.5, 0.5]],
    )
    grid = GridSearchCV(estimator=eclf, param_grid=search_space, cv=5)
    grid.fit(iris.data, iris.target)
def ensemble(algs, alg_names, ensemble_name=None, in_ensemble=None,
             weights=None, voting="soft"):
    """Assemble a VotingClassifier from a selection of algorithms.

    :param algs: sequence of estimators
    :param alg_names: names for ``algs``, index-aligned with it
    :param ensemble_name: explicit display name; when ``None`` a name is
        generated from the selected algorithm names
    :param in_ensemble: optional index-aligned sequence of truthy/falsy
        flags choosing which algorithms take part; ``None`` selects all
    :param weights: forwarded to ``VotingClassifier``; also switches the
        generated name's prefix to "Weighted Ensemble of "
    :param voting: forwarded to ``VotingClassifier``
    :return: dict with keys ``"alg"`` (the unfitted VotingClassifier) and
        ``"name"`` (its display name)
    """
    # Estimators for the ensemble, as (name, estimator) pairs
    estimators = [
        (alg_names[index], alg)
        for index, alg in enumerate(algs)
        if in_ensemble is None or in_ensemble[index]
    ]

    if ensemble_name is not None:
        # Use provided name
        name = ensemble_name
    else:
        prefix = ("Weighted Ensemble of " if weights is not None
                  else "Ensemble of ")
        # join() puts separators only BETWEEN names, so nothing has to be
        # trimmed afterwards; blindly slicing off two characters mangled
        # the prefix ("Ensemble o") when no algorithm was selected.
        name = prefix + ", ".join(label for label, _ in estimators)

    # Create ensemble
    alg = VotingClassifier(estimators=estimators, voting=voting,
                           weights=weights)

    # Return ensemble and name
    return {"alg": alg, "name": name}
def predictAndTestEnsemble(X, y, Xtest, ytest, classifiers=None, selectKBest=0):
    """
    Trains a hard-voting ensemble of classifiers on a training dataset and
    returns its predictions for that same training dataset and for an
    out-of-sample test dataset.

    :param X: The matrix of training feature vectors
    :type X: list
    :param y: The labels corresponding to the training feature vectors
    :type y: list
    :param Xtest: The matrix of test feature vectors
    :type Xtest: list
    :param ytest: The labels corresponding to the test feature vectors
        (accepted for interface compatibility; not used for fitting or
        feature selection)
    :type ytest: list
    :param classifiers: Classifier specs to use in the ensemble. A spec
        containing "knn" or "forest" must end in "-<int>" (number of
        neighbours / trees); a spec containing "svm" selects a linear SVC
    :type classifiers: list of str
    :param selectKBest: The number of best features to select (chi2);
        0 or less disables selection
    :type selectKBest: int
    :return: Two sequences: predicted labels for the training data and
        predicted labels for the test data. On any error the exception is
        reported via prettyPrintError and ([], []) is returned.
    """
    try:
        # Prepare the data
        X, y, Xtest = numpy.array(X), numpy.array(y), numpy.array(Xtest)

        # Define classifiers
        ensembleClassifiers = []
        for c in classifiers or []:
            spec = c.lower()
            if "knn" in spec:
                K = int(c.split('-')[-1])
                clf = neighbors.KNeighborsClassifier(n_neighbors=K)
            elif "svm" in spec:
                clf = svm.SVC(kernel='linear', C=1)
            elif "forest" in spec:
                E = int(c.split('-')[-1])
                clf = ensemble.RandomForestClassifier(n_estimators=E)
            else:
                # Fail loudly instead of silently re-registering whatever
                # classifier the previous iteration happened to build.
                raise ValueError("Unsupported classifier: %s" % c)
            ensembleClassifiers.append((c, clf))

        # Select K best features if applicable. The selector is fitted on
        # the TRAINING data only and then applied to both sets, so the test
        # columns match what the model was trained on and the test labels
        # never influence feature selection.
        if selectKBest > 0:
            selector = SelectKBest(chi2, k=selectKBest).fit(X, y)
            X_new = selector.transform(X)
            Xtest_new = selector.transform(Xtest)
        else:
            X_new, Xtest_new = X, Xtest

        # Train and fit the voting classifier
        voting = VotingClassifier(estimators=ensembleClassifiers, voting='hard')
        prettyPrint("Fitting ensemble model")
        voting = voting.fit(X_new, y)

        prettyPrint("Validating model")
        predicted = voting.predict(X_new)

        # Same for the test dataset
        prettyPrint("Testing the model")
        predicted_test = voting.predict(Xtest_new)

    except Exception as e:
        # Deliberate best-effort boundary: report and hand back empty results.
        prettyPrintError(e)
        return [], []

    return predicted, predicted_test
def define_model(modelname):
    """
    Return a new, unfitted classifier for a short model code.

    Input
    ----
    modelname: str
        One of 'LR', 'NN', 'DT', 'RF', 'NB', 'SVM', 'ET', 'SGD', 'AB',
        'GB', 'VC' or 'VC2'. 'VC' and 'VC2' are soft-voting ensembles.

    Output
    ------
    clf: model object
        Model Object Classifier

    Raises
    ------
    ConfigError
        If ``modelname`` is not one of the codes above.
    """
    if modelname == 'LR':
        return linear_model.LogisticRegression()
    if modelname == 'NN':
        return neighbors.KNeighborsClassifier()
    if modelname == 'DT':
        return tree.DecisionTreeClassifier()
    if modelname == 'RF':
        return ensemble.RandomForestClassifier()
    if modelname == 'NB':
        return naive_bayes.GaussianNB()
    if modelname == 'SVM':
        return svm.SVC()
    if modelname == 'ET':
        return ensemble.ExtraTreesClassifier()
    if modelname == 'SGD':
        return linear_model.SGDClassifier()
    if modelname == 'AB':
        # AdaBoost over depth-1 trees (decision stumps).
        return ensemble.AdaBoostClassifier(
            tree.DecisionTreeClassifier(max_depth=1)
        )
    if modelname == 'GB':
        return ensemble.GradientBoostingClassifier()

    # NOTE(review): the voting ensembles below pass min_samples_split=1;
    # newer scikit-learn releases appear to require >= 2 -- confirm against
    # the version this project pins.
    if modelname == 'VC':
        return ensemble.VotingClassifier(estimators=[
            ('RFC', ensemble.RandomForestClassifier(
                n_estimators=10, max_depth=None,
                min_samples_split=1, random_state=0)),
            ('ETC', ensemble.ExtraTreesClassifier(
                max_depth=None, max_features=5, n_estimators=10,
                random_state=0, min_samples_split=1)),
            ('ABC', ensemble.AdaBoostClassifier())],
            voting='soft')
    if modelname == 'VC2':
        return ensemble.VotingClassifier(estimators=[
            ('LR', linear_model.LogisticRegression(C=0.1, random_state=1)),
            ('RFC', ensemble.RandomForestClassifier(
                max_depth=None, n_estimators=10,
                random_state=0, min_samples_split=1)),
            ('ETC', ensemble.ExtraTreesClassifier(
                max_depth=None, max_features=5, n_estimators=10,
                random_state=0, min_samples_split=1))],
            voting='soft')

    # Format the actual parameter: the undefined name `model` used to turn
    # this into a NameError instead of the intended ConfigError.
    raise ConfigError("Can't find the model: {}".format(modelname))
def test_predict_proba_on_toy_problem():
    """Calculate predicted probabilities on toy dataset."""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])

    # Reference probabilities for each base classifier on X.
    clf1_res = np.array([[0.59790391, 0.40209609],
                         [0.57622162, 0.42377838],
                         [0.50728456, 0.49271544],
                         [0.40241774, 0.59758226]])
    clf2_res = np.array([[0.8, 0.2],
                         [0.8, 0.2],
                         [0.2, 0.8],
                         [0.3, 0.7]])
    clf3_res = np.array([[0.9985082, 0.0014918],
                         [0.99845843, 0.00154157],
                         [0., 1.],
                         [0., 1.]])

    def weighted_mean(row, col):
        # Mirrors weights=[2, 1, 1]: clf1 counts twice, total weight 4.
        return (2*clf1_res[row][col]
                + clf2_res[row][col]
                + clf3_res[row][col]) / 4

    members = [('lr', clf1), ('rf', clf2), ('gnb', clf3)]
    eclf = VotingClassifier(estimators=members,
                            voting='soft',
                            weights=[2, 1, 1])
    eclf_res = eclf.fit(X, y).predict_proba(X)

    for row, col in ((0, 0), (1, 1), (2, 1), (3, 1)):
        assert_almost_equal(weighted_mean(row, col),
                            eclf_res[row][col],
                            decimal=1)

    # With hard voting, predict_proba has to fail with AttributeError.
    attribute_error_seen = False
    try:
        eclf = VotingClassifier(estimators=members, voting='hard')
        eclf.fit(X, y).predict_proba(X)
    except AttributeError:
        attribute_error_seen = True
    if not attribute_error_seen:
        raise AssertionError('AttributeError for voting == "hard"'
                             ' and with predict_proba not raised')