def trained_models():
    """Fit a set of baseline classifiers on the breast-cancer dataset.

    The data is split 70/30 with ``random_state=12345``; only the training
    part is used. The estimators themselves are created with their default
    (unseeded) settings.

    Returns:
        dict: fitted estimators keyed by ``'RF'``, ``'LR'``,
        ``'SVC_w_linear_kernel'``, ``'Dummy'`` and ``'SVC_wo_linear_kernel'``.
    """
    bunch = datasets.load_breast_cancer()
    X_train, _X_test, y_train, _y_test = train_test_split(
        bunch.data, bunch.target, test_size=0.3, random_state=12345)

    # Listed in the order in which the estimators are fitted.
    to_fit = [
        ('RF', RandomForestClassifier()),
        ('LR', LogisticRegression()),
        ('SVC_w_linear_kernel', SVC(kernel='linear')),
        ('SVC_wo_linear_kernel', SVC()),
        ('Dummy', DummyClassifier()),
    ]
    fitted = {}
    for key, estimator in to_fit:
        estimator.fit(X_train, y_train)
        fitted[key] = estimator

    # Key order of the returned mapping differs from the fitting order.
    result_order = ('RF', 'LR', 'SVC_w_linear_kernel', 'Dummy',
                    'SVC_wo_linear_kernel')
    return {key: fitted[key] for key in result_order}
def get_feature_importance(self, clf, model_name):
    """Return the per-feature weights of a fitted classifier as a list.

    Args:
        clf: fitted estimator exposing ``feature_importances_`` or ``coef_``,
            depending on ``model_name``.
        model_name: key naming the estimator family (e.g.
            ``'RandomForestClassifier'``, ``'LogisticRegression'``).

    Returns:
        ``list(clf.feature_importances_)`` for tree/ensemble models,
        ``clf.coef_`` converted to (nested) lists for linear models, and
        ``None`` for models without an importance attribute.

    Raises:
        KeyError: if ``model_name`` is not one of the known names.
    """
    importance_source = {
        'RandomForestClassifier': 'feature_importances',
        'ExtraTreesClassifier': 'feature_importances',
        'AdaBoostClassifier': 'feature_importances',
        'LogisticRegression': 'coef',
        'svm.SVC': 'coef',
        'GradientBoostingClassifier': 'feature_importances',
        'GaussianNB': None,
        'DecisionTreeClassifier': 'feature_importances',
        'SGDClassifier': 'coef',
        'KNeighborsClassifier': None,
        'linear.SVC': 'coef',
    }
    source = importance_source[model_name]
    if source == 'feature_importances':
        return list(clf.feature_importances_)
    if source == 'coef':
        return list(clf.coef_.tolist())
    return None
def parameterChoosing(self):
    """Grid-search the penalty and ``C`` of a logistic regression and report it.

    Fits a 5-fold ``GridSearchCV`` (scoring ``precision_weighted``) on
    ``self.X_train`` / ``self.y_train`` over L1 and L2 penalties with
    ``C`` in ``np.logspace(-5, 5)``, then prints the best parameters, the
    per-setting scores and a classification report on ``self.X_test`` /
    ``self.y_test``. Nothing is returned.

    Output uses single-argument ``print(...)`` so the method parses on both
    Python 2 and Python 3 and prints the same text.
    """
    # Set the parameters by cross-validation
    tuned_parameters = [{'penalty': ['l1'], 'C': np.logspace(-5, 5)},
                        {'penalty': ['l2'], 'C': np.logspace(-5, 5)}]
    clf = GridSearchCV(linear_model.LogisticRegression(tol=1e-6),
                       tuned_parameters, cv=5, scoring='precision_weighted')
    clf.fit(self.X_train, self.y_train.ravel())

    print("Best parameters set found on development set:\n")
    print(clf.best_params_)
    print("Grid scores on development set:\n")
    # NOTE(review): `grid_scores_` is the legacy results attribute; newer
    # scikit-learn releases expose `cv_results_` instead — confirm the
    # pinned version before upgrading.
    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params))
    print("Detailed classification report:\n")
    y_true, y_pred = self.y_test, clf.predict(self.X_test)
    print(classification_report(y_true, y_pred))
def define_model(self, model, parameters, n_cores = 0):
    """Build the classifier registered under ``model`` and apply ``parameters``.

    Only the requested estimator is instantiated: the registry maps each
    name to a zero-argument factory rather than to a ready-made instance,
    so a lookup no longer constructs every supported model.

    Args:
        model: registry key, e.g. ``'RandomForestClassifier'`` or ``'svm.SVC'``.
        parameters: dict of keyword arguments forwarded to ``set_params``.
        n_cores: accepted for interface compatibility; not used here (the
            parallel estimators are created with ``n_jobs=7``).

    Returns:
        The configured (unfitted) estimator.

    Raises:
        ConfigError: if ``model`` is not a supported key.
    """
    factories = {
        'RandomForestClassifier': lambda: RandomForestClassifier(
            n_estimators=50, n_jobs=7),
        'ExtraTreesClassifier': lambda: ExtraTreesClassifier(
            n_estimators=10, n_jobs=7, criterion='entropy'),
        'AdaBoostClassifier': lambda: AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=1), algorithm="SAMME",
            n_estimators=200),
        'LogisticRegression': lambda: LogisticRegression(penalty='l1', C=1e5),
        'svm.SVC': lambda: svm.SVC(
            kernel='linear', probability=True, random_state=0),
        'GradientBoostingClassifier': lambda: GradientBoostingClassifier(
            learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
        'GaussianNB': lambda: GaussianNB(),
        'DecisionTreeClassifier': lambda: DecisionTreeClassifier(),
        'SGDClassifier': lambda: SGDClassifier(
            loss="hinge", penalty="l2", n_jobs=7),
        'KNeighborsClassifier': lambda: KNeighborsClassifier(n_neighbors=3),
        'linear.SVC': lambda: svm.LinearSVC(),
    }
    if model not in factories:
        raise ConfigError("Unsupported model {}".format(model))
    clf = factories[model]()
    clf.set_params(**parameters)
    return clf
def test_homonym(H, sent, features, C=1.0):
    """Measure how well logistic regression separates two homonyms.

    Sentences matching ``H[0]`` are labelled 0 and those matching ``H[1]``
    are labelled 1. Their feature rows are stacked, passed through
    ``normalize(..., norm='l2')`` and scored with 10-fold stratified
    cross-validation.

    Args:
        H: pair of words to tell apart.
        sent: sentence collection handed to ``matching``.
        features: callable turning matched sentences into a feature matrix.
        C: inverse regularisation strength of the classifier.

    Returns:
        dict with ``'word1_count'``, ``'word2_count'``, ``'majority'`` (share
        of the larger class) and ``'kfold_acc'`` (array of per-fold accuracy).
    """
    feats_first = features(matching(sent, H[0]))
    feats_second = features(matching(sent, H[1]))
    labels_first = numpy.zeros(len(feats_first))
    labels_second = numpy.ones(len(feats_second))

    X = normalize(numpy.vstack([feats_first, feats_second]), norm='l2')
    y = numpy.hstack([labels_first, labels_second])

    classifier = LogisticRegression(C=C)
    hits = []
    sizes = []
    for train_idx, test_idx in StratifiedKFold(y, n_folds=10):
        classifier.fit(X[train_idx], y[train_idx])
        held_out = y[test_idx]
        hits.append((classifier.predict(X[test_idx]) == held_out).sum())
        sizes.append(len(held_out))

    hits = numpy.array(hits, dtype='float')
    sizes = numpy.array(sizes, dtype='float')
    n_first, n_second = len(labels_first), len(labels_second)
    return {
        'word1_count': n_first,
        'word2_count': n_second,
        'majority': max(n_first, n_second) * 1.0 / len(y),
        'kfold_acc': hits / sizes,
    }
def train_xgboost():
    """Train an ``XGBRegressor`` predicting survival from image features and age.

    For every case in the module-level ``folder_names_train`` the four
    modality arrays (flair, t1, t1ce, t2) are loaded from ``training/``,
    averaged over axis 0 and concatenated column-wise; the patient's age
    from ``survival_data.csv`` is appended as the last column.

    Cases without a ``Survival``/``Age`` entry in the CSV are reported and
    dropped, so feature rows and targets stay aligned. (Previously a missing
    label added a 0 target without an age, leaving the arrays with
    different lengths.)

    Returns:
        The fitted ``xgb.XGBRegressor``.
    """
    df = pd.read_csv('survival_data.csv', index_col=0, encoding='UTF-7')

    def _modality_means(modality):
        # One row per case: the stored array averaged over its first axis.
        return np.array([
            np.mean(np.load('training/%s_%s.nii.gz.npy' % (str(case_id), modality)),
                    axis=0)
            for case_id in folder_names_train])

    image_features = np.concatenate(
        [_modality_means(m) for m in ('flair', 't1', 't1ce', 't2')], axis=1)

    survival = []
    age = []
    kept_rows = []
    missing = 0
    for row, case_id in enumerate(folder_names_train):
        key = str(case_id)
        try:
            survival_value = df.at[key, 'Survival']
            age_value = df.at[key, 'Age']
        except (KeyError, ValueError) as e:
            # Either lookup failing means the case has no usable label.
            missing += 1
            print(missing, str(e), "Label Not found, deleting entry")
            continue
        survival.append(survival_value)
        age.append(age_value)
        kept_rows.append(row)

    y = np.array(survival, dtype=float)
    age_column = np.array(age, dtype=float).reshape(-1, 1)
    x = np.concatenate(
        (image_features[np.array(kept_rows, dtype=int)], age_column), axis=1)

    clf = xgb.XGBRegressor()
    clf.fit(x, y)
    return clf
def __init__(
        self, data_block, predictors=None, cv_folds=10,
        scoring_metric='accuracy', additional_display_metrics=None):
    """Set up a logistic-regression model on top of ``base_classification``.

    Args:
        data_block: data container forwarded to the base class.
        predictors: list of predictor names; ``None`` (default) means an
            empty list.
        cv_folds: number of cross-validation folds.
        scoring_metric: name of the primary scoring metric.
        additional_display_metrics: extra metrics to display; ``None``
            (default) means an empty list.

    The list defaults are created per call so that instances never share
    one list object.
    """
    if predictors is None:
        predictors = []
    if additional_display_metrics is None:
        additional_display_metrics = []

    base_classification.__init__(
        self, alg=LogisticRegression(), data_block=data_block,
        predictors=predictors, cv_folds=cv_folds,
        scoring_metric=scoring_metric,
        additional_display_metrics=additional_display_metrics
        )

    self.model_output = pd.Series(self.default_parameters)
    # Placeholder until coefficients are available.
    self.model_output['Coefficients'] = "-"

    # Set parameters to default values:
    self.set_parameters(set_default=True)
def test_model_detection(self):
    """``Optimizer.model_module`` names the library each model comes from."""
    logistic = LogisticRegression()
    cases = [
        (logistic, 'sklearn'),
        (Pipeline([('log', logistic)]), 'pipeline'),
        (XGBClassifier(), 'xgboost'),
        (NNModel(100, 10), 'keras'),
    ]
    # Build every optimizer first, then check them in the same order.
    optimizers = [Optimizer(model, [], lambda x: x) for model, _ in cases]
    for optimizer, (_, expected_module) in zip(optimizers, cases):
        self.assertEqual(optimizer.model_module, expected_module)
def build_models_NLP(train_pos_vec, train_neg_vec):
    """Fit a BernoulliNB and a LogisticRegression model on the training vectors.

    Positive vectors are labelled ``"pos"`` and negative vectors ``"neg"``;
    the positive examples come first in the combined training set.

    Returns:
        tuple: ``(nb_model, lr_model)``, both fitted.
    """
    labels = ["pos"] * len(train_pos_vec) + ["neg"] * len(train_neg_vec)
    samples = list(train_pos_vec) + list(train_neg_vec)

    # BernoulliNB with alpha=1.0 and binarize=None; LogisticRegression with defaults.
    nb_model = BernoulliNB(alpha=1.0, binarize=None, class_prior=None, fit_prior=True)
    lr_model = LogisticRegression()
    nb_model.fit(samples, labels)
    lr_model.fit(samples, labels)
    return nb_model, lr_model
def build_models_DOC(train_pos_vec, train_neg_vec):
    """Fit a GaussianNB and a LogisticRegression model on the training vectors.

    Positive vectors are labelled ``"pos"`` and negative vectors ``"neg"``;
    the positive examples come first in the combined training set.

    Returns:
        tuple: ``(nb_model, lr_model)``, both fitted.
    """
    labels = ["pos"] * len(train_pos_vec) + ["neg"] * len(train_neg_vec)
    samples = list(train_pos_vec) + list(train_neg_vec)

    # Both estimators are used with their default parameters.
    nb_model = GaussianNB()
    lr_model = LogisticRegression()
    nb_model.fit(samples, labels)
    lr_model.fit(samples, labels)
    return nb_model, lr_model
def learns(tests, trains, indep=lambda x: x[:-1], dep=lambda x: x[-1],
           rf=Abcd(), lg=Abcd(), dt=Abcd(), nb=Abcd()):
    """Train four classifiers and feed their predictions to result recorders.

    ``trainTest`` turns ``tests``/``trains`` into ``x1, y1`` (training) and
    ``x2, y2`` (testing). A random forest, logistic regression, Gaussian
    naive Bayes and an entropy decision tree are fitted in that order; every
    prediction on ``x2`` is reported to the matching recorder as
    ``recorder(predicted=..., actual=...)``.

    Args:
        tests, trains: raw rows handed to ``trainTest``.
        indep, dep: extractors for the independent / dependent parts of a row.
        rf, lg, dt, nb: recorders for the forest, logistic regression,
            decision tree and naive Bayes results. The defaults are created
            once at definition time, so they are shared between calls.

    Returns:
        None; results are only available through the recorders.
    """
    x1, y1, x2, y2 = trainTest(tests, trains, indep, dep)

    # (estimator, recorder) pairs in the order they are trained and scored.
    runs = (
        (RandomForestClassifier(n_estimators=50), rf),
        (linear_model.LogisticRegression(C=1e5), lg),
        (GaussianNB(), nb),
        (DecisionTreeClassifier(criterion="entropy", random_state=1), dt),
    )
    for estimator, record in runs:
        estimator.fit(x1, y1)
        for n, got in enumerate(estimator.predict(x2)):
            record(predicted=got, actual=y2[n])
def test_dsapp_lr(data):
    """``ScaledLogisticRegression`` predicts like the explicit pipeline.

    The reference pipeline chains ``MinMaxScaler``, ``CutOff`` and a plain
    ``LogisticRegression``; both models are fitted on the training split and
    must agree on every test prediction.
    """
    X_train, y_train = data['X_train'], data['y_train']

    dsapp_lr = ScaledLogisticRegression()
    dsapp_lr.fit(X_train, y_train)

    reference = Pipeline([
        ('minmax_scaler', preprocessing.MinMaxScaler()),
        ('dsapp_cutoff', CutOff()),
        ('lr', linear_model.LogisticRegression()),
    ])
    reference.fit(X_train, y_train)

    got = dsapp_lr.predict(data['X_test'])
    expected = reference.predict(data['X_test'])
    assert np.all(got == expected)
def cv_reg_lr(trX, trY, vaX, vaY, Cs=(0.01, 0.05, 0.1, 0.5, 1., 5., 10., 50., 100.)):
    """Pick the ``C`` giving the best validation accuracy.

    One ``LR`` model is fitted per value in ``Cs``; training and validation
    accuracy are printed for each, followed by a summary of the winner.

    Args:
        trX, trY: training features and labels.
        vaX, vaY: validation features and labels.
        Cs: sequence of ``C`` values to try (immutable default; any indexable
            sequence works).

    Returns:
        The fitted model with the highest validation accuracy (first one on
        ties, per ``np.argmax``).

    Output uses single-argument ``print(...)`` so the function parses on
    both Python 2 and Python 3 and prints the same text.
    """
    tr_accs = []
    va_accs = []
    models = []
    for C in Cs:
        model = LR(C=C)
        model.fit(trX, trY)
        tr_acc = metrics.accuracy_score(trY, model.predict(trX))
        va_acc = metrics.accuracy_score(vaY, model.predict(vaX))
        print('%.4f %.4f %.4f' % (C, tr_acc, va_acc))
        tr_accs.append(tr_acc)
        va_accs.append(va_acc)
        models.append(model)
    best = np.argmax(va_accs)
    print('best model C: %.4f tr_acc: %.4f va_acc: %.4f' % (Cs[best], tr_accs[best], va_accs[best]))
    return models[best]
def train_and_predict(self, param_dict, predict_on='val'):
    """Train a logistic regression and predict on the chosen evaluation set.

    Args:
        param_dict: dictionary providing the ``'penalty'`` and ``'C'``
            settings for the classifier.
        predict_on: ``'test'`` evaluates on the test set; any other value
            (default ``'val'``) evaluates on the validation set.

    Returns:
        The predictions produced by ``self.predict_on_data`` for the
        selected features. The fitted model is kept in ``self.model``.
    """
    loader = self.data_loader
    predict_X = loader.test_X if predict_on == 'test' else loader.val_X

    self.model = linear_model.LogisticRegression(
        penalty=param_dict['penalty'], C=param_dict['C'])
    self.model.fit(self.data_loader.train_X, self.data_loader.train_Y)
    return self.predict_on_data(predict_X)
def test_build_param_grid_set_estimator():
    """``build_param_grid`` expands a pipeline whose step is a list of estimators.

    Estimators carrying their own grid get one entry each; the ones without
    a grid are grouped into a single trailing entry.
    """
    svc_linear = SVC()
    logreg = LogisticRegression()
    svc_poly = SVC()
    sgd = SGDClassifier()

    selector = set_grid(SelectKBest(), k=[2, 3])
    pipeline = Pipeline([('sel', selector), ('clf', None)])
    estimator = set_grid(
        pipeline,
        clf=[set_grid(svc_linear, kernel=['linear']),
             logreg,
             set_grid(svc_poly, kernel=['poly'], degree=[2, 3]),
             sgd])

    expected = [
        {'clf': [svc_linear], 'clf__kernel': ['linear'], 'sel__k': [2, 3]},
        {'clf': [svc_poly], 'clf__kernel': ['poly'], 'clf__degree': [2, 3],
         'sel__k': [2, 3]},
        {'clf': [logreg, sgd], 'sel__k': [2, 3]},
    ]
    assert build_param_grid(estimator) == expected
def test_make_grid_search():
    """``make_grid_search`` handles a bare estimator, a gridded one, and a list."""
    X, y = load_iris(return_X_y=True)
    lr = LogisticRegression()
    svc = set_grid(SVC(), kernel=['poly'], degree=[2, 3])

    searches = [
        make_grid_search(lr, cv=5),  # empty grid
        make_grid_search(svc, cv=5),
        make_grid_search([lr, svc], cv=5),
    ]
    for search, expected_candidates in zip(searches, (1, 2, 3)):
        search.fit(X, y)
        assert search.cv == 5
        assert len(search.cv_results_['params']) == expected_candidates

    # In the combined search the two SVC candidates carry their degrees and
    # the remaining candidate is the logistic regression.
    combined = searches[2].cv_results_
    svc_mask = combined['param_root'] == svc
    assert svc_mask.sum() == 2
    assert combined['param_root__degree'][svc_mask].tolist() == [2, 3]
    assert combined['param_root'][~svc_mask].tolist() == [lr]
def convert(model, feature_names, target):
    """Convert a fitted LogisticRegression model to the protobuf spec.

    Parameters
    ----------
    model: LogisticRegression
        A trained LogisticRegression model (must expose ``coef_``).

    feature_names: [str]
        Name of the input columns.

    target: str
        Name of the output column.

    Returns
    -------
    model_spec: An object of type Model_pb.
        Protobuf representation of the model

    Raises
    ------
    RuntimeError
        If scikit-learn is not available.
    """
    if not _HAS_SKLEARN:
        raise RuntimeError('scikit-learn not found. scikit-learn conversion API is disabled.')

    # Reject wrong estimator types and unfitted models before converting.
    _sklearn_util.check_expected_type(model, LogisticRegression)
    _sklearn_util.check_fitted(model, lambda m: hasattr(m, 'coef_'))

    spec = _convert(model, feature_names, target)
    return _MLModel(spec)
def test_stacked_classfier_extkfold(self):
    """StackedClassifier with an externally supplied K-fold scores above 0.9 on iris."""
    X, y = self.iris.data, self.iris.target
    base_learners = [
        RandomForestClassifier(n_estimators=40, criterion='gini', random_state=1),
        RidgeClassifier(random_state=1),
    ]
    meta_learner = LogisticRegression(random_state=1)
    stacker = StackedClassifier(meta_learner,
                                base_learners,
                                n_folds=3,
                                verbose=0,
                                Kfold=StratifiedKFold(y, 3),
                                stack_by_proba=False,
                                oob_score_flag=True,
                                oob_metrics=log_loss)
    stacker.fit(X, y)
    score = stacker.score(X, y)
    self.assertGreater(score, 0.9, "Failed with score = {0}".format(score))
def test_fwls_classfier(self):
    """FWLSClassifier with an all-ones feature function scores above 0.9 on iris."""
    X, y = self.iris.data, self.iris.target

    def feature_func(x):
        # Constant meta-features with the same shape as the input.
        return np.ones(x.shape)

    base_learners = [
        RandomForestClassifier(n_estimators=40, criterion='gini', random_state=1),
        RidgeClassifier(random_state=1),
    ]
    meta_learner = LogisticRegression(random_state=1)
    model = FWLSClassifier(meta_learner,
                           base_learners,
                           feature_func=feature_func,
                           n_folds=3,
                           verbose=0,
                           Kfold=StratifiedKFold(y, 3),
                           stack_by_proba=False)
    model.fit(X, y)
    score = model.score(X, y)
    self.assertGreater(score, 0.9, "Failed with score = {0}".format(score))
def prec_log(X_train, y_train, X_test, y_test):
    """Fit a SAG logistic regression and log its accuracy on the test set.

    Dense inputs are flattened to ``(n_samples, -1)``; sparse inputs are
    passed through untouched. (An unconditional second reshape used to run
    after the ``issparse`` guard and made the guard pointless; it is gone.)

    Args:
        X_train, X_test: dense arrays or sparse matrices of features.
        y_train, y_test: label arrays.

    Returns:
        tuple: ``(clf, y_pred)`` — the fitted classifier and its predictions
        on ``X_test``.
    """
    from sklearn.linear_model import LogisticRegression

    if not issparse(X_train):
        X_train = X_train.reshape((X_train.shape[0], -1))
    if not issparse(X_test):
        X_test = X_test.reshape((X_test.shape[0], -1))
    LOGGER.info('start predict: X_train.shape={},y_train.shape={},X_test.shape={},y_test.shape={}'.format(
        X_train.shape, y_train.shape, X_test.shape, y_test.shape))

    clf = LogisticRegression(solver='sag', n_jobs=-1, verbose=1)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Fraction of exact label matches, logged as a percentage.
    prec = float(np.sum(y_pred == y_test)) / len(y_test)
    LOGGER.info('prec_log={:.6f}%'.format(prec*100.0))
    return clf, y_pred
def distance(self, d1, d2):
    """Classifier-based distance between two datasets.

    Summary statistics of ``d1`` (label 0) and ``d2`` (label 1) are pooled,
    an L1-penalised logistic regression with ``C=1e5`` is fitted on them,
    and its accuracy on that same pooled set is rescaled as
    ``2 * (accuracy - 0.5)``.

    Returns:
        float: the rescaled training accuracy.
    """
    # Extract summary statistics from the dataset
    stats_first = self.statistics_calc.statistics(d1)
    stats_second = self.statistics_calc.statistics(d2)

    # Pool both sets and label them by origin.
    pooled = np.concatenate((stats_first, stats_second), axis=0)
    origin = np.concatenate(
        (np.zeros(shape=(len(stats_first), 1)),
         np.ones(shape=(len(stats_second), 1))),
        axis=0).ravel()

    classifier = linear_model.LogisticRegression(C=1e5, penalty='l1')
    classifier.fit(pooled, origin)
    accuracy = classifier.score(pooled, origin)
    return 2.0 * (accuracy - 0.5)
def cross_validation_accuracy(clf, X, labels, k):
    """Average testing accuracy over k folds of cross-validation (not implemented).

    Intended contract: split the data with sklearn's KFold (no random seed,
    no shuffling), evaluate ``clf`` on each fold and average the accuracies.

    Params:
      clf......A LogisticRegression classifier.
      X........A csr_matrix of features.
      labels...The true labels for each instance in X
      k........The number of cross-validation folds.

    Returns:
      Intended: the average testing accuracy of the classifier over each
      fold of cross-validation. Currently a stub that returns None.
    """
    # TODO: implement
    return None
def fit_best_classifier(docs, labels, best_result):
    """Fit a classifier on all training data with the best setting (not implemented).

    Intended contract: using the best setting from eval_all_combinations,
    re-vectorize all the training data and fit a LogisticRegression
    classifier to it, with no cross-validation.

    Params:
      docs..........List of training document strings.
      labels........The true labels for each training document (0 or 1)
      best_result...Element of eval_all_combinations with highest accuracy

    Returns:
      Intended:
        clf.....A LogisticRegression classifier fit to all training data.
        vocab...The dict from feature name to column index.
      Currently a stub that returns None.
    """
    # TODO: implement
    return None
def top_coefs(clf, label, n, vocab):
    """Find the n features with the highest coefficients for a label (not implemented).

    See the .coef_ attribute of LogisticRegression.

    Params:
      clf.....LogisticRegression classifier
      label...1 or 0; if 1, return the top coefficients for the positive
              class; else for negative.
      n.......The number of coefficients to return.
      vocab...Dict from feature name to column index.

    Returns:
      Intended: list of (feature_name, coefficient) tuples, SORTED in
      descending order of the coefficient for the given class label.
      Currently a stub that returns None.
    """
    # TODO: implement
    return None
def print_top_misclassified(test_docs, test_labels, X_test, clf, n):
    """Print the n most confidently misclassified test documents (not implemented).

    Intended contract: use the .predict_proba function of
    LogisticRegression <https://goo.gl/4WXbYA> to get the predicted
    probability of each class for each instance, select the incorrectly
    classified documents, and sort them in descending order of the predicted
    probability for the incorrect class. E.g., if document i is
    misclassified as positive, the probability of the positive class is used
    when sorting.

    Params:
      test_docs.....List of strings, one per test document
      test_labels...Array of true testing labels
      X_test........csr_matrix for test data
      clf...........LogisticRegression classifier fit on all training data.
      n.............The number of documents to print.

    Returns:
      Nothing; see Log.txt for example printed output. Currently a stub
      that prints nothing.
    """
    # TODO: implement
    return None
def task73(features):
    """Fit a two-sample logistic regression on per-label word counts.

    Args:
        features: sequence of ``(label, word)`` rows. After conversion with
            ``numpy.array`` the label is compared to the string ``'-1'``.

    Returns:
        tuple: ``(words, model)`` where ``words`` lists the distinct words
        (column order of the count vectors) and ``model`` is the fitted
        ``LogisticRegression``.

    The column of each word is looked up in a dict built once, instead of
    calling ``words.index`` for every row.
    """
    features = numpy.array(features)
    words = list(set(features[:, 1]))
    column_of = {word: column for column, word in enumerate(words)}

    pos_vec = numpy.zeros(len(words))
    neg_vec = numpy.zeros(len(words))
    for feature in features:
        index = column_of[feature[1]]
        # NOTE(review): rows labelled '-1' are counted in `pos_vec`, which is
        # then trained as class 1 — this looks inverted; confirm the intended
        # label mapping before relying on the sign of the predictions.
        if feature[0] == '-1':
            pos_vec[index] += 1
        else:
            neg_vec[index] += 1

    model = linear_model.LogisticRegression()
    model.fit([pos_vec, neg_vec], [1, -1])
    return (words, model)
def test_lr_on_data(X_train, y_train, X_validate, y_validate, X_test, y_test):
    """Train an L-BFGS logistic regression and report thresholded metrics.

    ``y_train`` is flattened from a sequence of sequences before fitting.
    For the validation set and then the test set, the probability of the
    last class is taken as the score, a threshold is searched with
    ``find_threshold_logistic`` on that same set, and precision / recall /
    F1 at that threshold are logged.

    Returns:
        tuple: ``(precision, recall, f1)`` measured on the test set.
    """
    y_train_flatten = list(itertools.chain.from_iterable(y_train))

    # Train LR Model
    lr = LogisticRegression(solver='lbfgs')
    lr.fit(X_train, y_train_flatten)

    def _last_class_scores(X):
        # Probability of the last class for every row of X.
        return array([row[-1] for row in lr.predict_proba(X)])

    # Test model on validation set
    scores_val = _last_class_scores(X_validate)
    threshold_val = find_threshold_logistic(y_validate, scores_val, scores_val)
    precision_val, recall_val, f1_val = evaluate_with_threshold(
        y_validate, scores_val, scores_val, threshold_val)
    globals.logger.info("Found threshold: %f. Precision/recall/f1 over validation set: %f/%f/%f" %
                        (threshold_val, precision_val, recall_val, f1_val))

    # Test model on test set
    scores_test = _last_class_scores(X_test)
    threshold_test = find_threshold_logistic(y_test, scores_test, scores_test, verbose=True)
    precision, recall, f1 = evaluate_with_threshold(
        y_test, scores_test, scores_test, threshold_test)
    globals.logger.info("Found threshold: %f. Precision/recall/f1 over test set: %f/%f/%f" %
                        (threshold_test, precision, recall, f1))
    return precision, recall, f1
def train_using_logistic(feat1, feat2):
    """Fit a weakly regularised logistic regression separating two feature sets.

    Rows of ``feat1`` get label 1 and rows of ``feat2`` get label 2. Shapes
    and the training-set score are printed.

    Returns:
        The fitted ``LogisticRegression`` (``C=1e5``).
    """
    n_plus, n_minus = len(feat1), len(feat2)
    X = np.concatenate((feat1, feat2), axis=0)
    # 0/1 indicator per source, shifted so the classes are 1 and 2.
    y = np.concatenate((np.zeros(n_plus), np.ones(n_minus)), axis=0) + 1
    print(X.shape, y.shape, n_plus, n_minus, feat1.shape, feat2.shape)

    logreg = linear_model.LogisticRegression(C=1e5)
    logreg.fit(X, y)
    print("Score using logistic regression on training data is ", logreg.score(X, y))
    return logreg
def generate_LR_model(file_name):
    """Train a SAG logistic regression on selected columns and save it.

    The frame returned by ``read_from_file`` is filtered to the label and
    the encoded feature columns; the first selected column is the target
    and the rest are the features. The fitted model is dumped to
    ``'LR.model'`` with joblib and the training time in seconds is printed.

    Changes for portability: ``DataFrame.values`` replaces the removed
    ``DataFrame.as_matrix()``, and output uses single-argument
    ``print(...)`` so the function parses on Python 2 and 3.

    Args:
        file_name: path handed to ``read_from_file``.

    Returns:
        The fitted ``LogisticRegression``.
    """
    train_df = read_from_file(file_name)
    selected_train_df = train_df.filter(regex='label|connectionType_.*|telecomsOperator_.*|sitesetID_.*|positionType_.*|gender_.*|haveBaby_.*|age_scaled')
    train_np = selected_train_df.values
    y = train_np[:, 0]
    X = train_np[:, 1:]

    print('Train Logistic Regression Model...')
    start_time = datetime.datetime.now()
    clf = linear_model.LogisticRegression(penalty='l2', C=1.0, solver='sag', n_jobs=-1, tol=1e-6, max_iter=200)  # , class_weight='balanced')
    clf.fit(X, y)
    end_time = datetime.datetime.now()
    print('Training Done..., Time Cost: ')
    print((end_time - start_time).seconds)

    print('Save Model...')
    joblib.dump(clf, 'LR.model')
    return clf
def get_classifier(method='logistic_regression'):
    """Return a pre-configured, unfitted classifier for ``method``.

    Args:
        method: ``'logistic_regression'``, ``'random_forest'`` or
            ``'gradient_boosting'``.

    Returns:
        The matching estimator, or ``None`` when ``method`` is not one of
        the supported names.
    """
    if method == 'logistic_regression':
        return LogisticRegression(C=1e3,
                                  tol=0.01,
                                  multi_class='ovr',
                                  solver='liblinear',
                                  n_jobs=-1,
                                  random_state=123)
    elif method == 'random_forest':
        return RandomForestClassifier(n_estimators=250,
                                      bootstrap=False,
                                      n_jobs=-1,
                                      random_state=123)
    elif method == 'gradient_boosting':
        return xgb.XGBClassifier(max_depth=10,
                                 subsample=0.7,
                                 n_estimators=500,
                                 min_child_weight=0.05,
                                 colsample_bytree=0.3,
                                 learning_rate=0.1)
    return None
def TL():
    """Train a TF-IDF + logistic-regression URL classifier.

    Reads ``./data/data.csv`` (first column: URL, second column: label),
    shuffles the rows, vectorises the URLs with ``TfidfVectorizer`` using
    the ``getTokens`` tokenizer, fits a ``LogisticRegression`` on an 80/20
    split and prints the test-set score.

    Returns:
        tuple: ``(vectorizer, lgs)`` — the fitted vectorizer and classifier.
    """
    allurls = './data/data.csv'  # path to the file with all URLs
    # NOTE(review): `error_bad_lines` is the older pandas spelling for
    # skipping malformed rows; confirm against the installed pandas version.
    allurlscsv = pd.read_csv(allurls, sep=',', error_bad_lines=False)
    allurlsdata = np.array(pd.DataFrame(allurlscsv))

    # Shuffle rows in place. `random.shuffle` must not be used on a 2-D
    # array: its element swap operates on row views and ends up duplicating
    # some rows while losing others.
    np.random.shuffle(allurlsdata)

    y = [d[1] for d in allurlsdata]  # all labels
    corpus = [d[0] for d in allurlsdata]  # the URL belonging to each label
    vectorizer = TfidfVectorizer(tokenizer=getTokens)  # custom tokenizer
    X = vectorizer.fit_transform(corpus)

    # 80/20 train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    lgs = LogisticRegression()
    lgs.fit(X_train, y_train)
    print(lgs.score(X_test, y_test))  # accuracy on the held-out 20%
    return vectorizer, lgs
def test_mdr_sklearn_pipeline():
    """MDR works as a transformer step inside a scikit-learn pipeline."""
    features = np.array([[2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
                         [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
                         [0, 0], [0, 0], [0, 0], [1, 1], [1, 1]])
    classes = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

    pipeline = make_pipeline(MDR(), LogisticRegression())
    folds = StratifiedKFold(n_splits=5, shuffle=True)
    cv_scores = cross_val_score(pipeline, features, classes, cv=folds)

    # Only checks that cross-validation ran and produced a positive mean.
    mean_score = np.mean(cv_scores)
    assert mean_score > 0.
def test_mdr_sklearn_pipeline_parallel():
    """MDR works as a transformer step when cross-validation runs with ``n_jobs=-1``."""
    features = np.array([[2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
                         [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
                         [0, 0], [0, 0], [0, 0], [1, 1], [1, 1]])
    classes = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

    pipeline = make_pipeline(MDR(), LogisticRegression())
    folds = StratifiedKFold(n_splits=5, shuffle=True)
    cv_scores = cross_val_score(pipeline, features, classes, cv=folds, n_jobs=-1)

    # Only checks that cross-validation ran and produced a positive mean.
    mean_score = np.mean(cv_scores)
    assert mean_score > 0.
def fitAndPredict(self):
    """Fit an SVC on the training data and predict once per test user.

    The fitted classifier is stored in ``self.classifier``. Each user in
    ``self.testDict`` is predicted from the single feature
    ``self.BDS[user]``.

    Returns:
        dict: user -> prediction array returned by ``SVC.predict``.
    """
    svm = SVC()
    self.classifier = svm
    svm.fit(self.trainingSet, self.trainingLabel)
    return {user: svm.predict([[self.BDS[user]]]) for user in self.testDict}
def fitAndPredict(self):
    """Compare logistic regression and SVM on bag-of-words features.

    A gensim dictionary is built over the training and test texts together,
    the bag-of-words corpus is turned into a dense document-by-term matrix
    and, when the module-level ``PCA_Applied`` flag is set, reduced with
    PCA to ``PCA_nComponents`` components. Each classifier is fitted on the
    training rows and its classification report on the test rows is printed.

    The two identical fit/predict/report sequences are folded into one
    loop, and output uses single-argument ``print(...)`` so the method
    parses on Python 2 and 3. Nothing is returned.
    """
    corpus = self.trainingSet + self.testSet
    dictionary = corpora.Dictionary(corpus)
    corpus = [dictionary.doc2bow(text) for text in corpus]
    # corpus2dense yields terms x documents; transpose to documents x terms.
    text_matrix = gensim.matutils.corpus2dense(corpus, num_terms=len(dictionary.token2id)).T

    if PCA_Applied:
        pca = PCA(n_components=PCA_nComponents)
        text_matrix = pca.fit_transform(text_matrix)

    # Training documents come first in the combined matrix.
    n_train = len(self.trainingSet)
    train_matrix = text_matrix[0:n_train]
    test_matrix = text_matrix[n_train:]

    for title, classifier in (('Logistic:', LogisticRegression()), ('SVM:', SVC())):
        classifier.fit(train_matrix, self.trainingLabel)
        pred_labels = classifier.predict(test_matrix)
        print(title)
        print(classification_report(self.testLabel, pred_labels))
def fitAndPredict(self):
    """Compare logistic regression and SVM on TF-IDF features.

    A gensim dictionary is built over the training and test texts together,
    the bag-of-words corpus is re-weighted with a ``TfidfModel`` and turned
    into a dense document-by-term matrix. When the module-level
    ``PCA_Applied`` flag is set, it is reduced with PCA to
    ``PCA_nComponents`` components. Each classifier is fitted on the
    training rows and its classification report on the test rows is printed.

    The two identical fit/predict/report sequences are folded into one
    loop, and output uses single-argument ``print(...)`` so the method
    parses on Python 2 and 3. Nothing is returned.
    """
    corpus = self.trainingSet + self.testSet
    dictionary = corpora.Dictionary(corpus)
    corpus = [dictionary.doc2bow(text) for text in corpus]
    model = models.TfidfModel(corpus)
    corpus = list(model[corpus])
    # corpus2dense yields terms x documents; transpose to documents x terms.
    text_matrix = gensim.matutils.corpus2dense(corpus, num_terms=len(dictionary.token2id)).T

    if PCA_Applied:
        pca = PCA(n_components=PCA_nComponents)
        text_matrix = pca.fit_transform(text_matrix)

    # Training documents come first in the combined matrix.
    n_train = len(self.trainingSet)
    train_matrix = text_matrix[0:n_train]
    test_matrix = text_matrix[n_train:]

    for title, classifier in (('Logistic:', LogisticRegression()), ('SVM:', SVC())):
        classifier.fit(train_matrix, self.trainingLabel)
        pred_labels = classifier.predict(test_matrix)
        print(title)
        print(classification_report(self.testLabel, pred_labels))
def fitAndPredict(self):
    """Fit an SVC on the training data and predict once per test user.

    Each user in ``self.testDict`` is predicted from the three features
    ``self.MUD[user]``, ``self.RUD[user]`` and ``self.QUD[user]``.

    Returns:
        dict: user -> prediction array returned by ``SVC.predict``.
    """
    svm = SVC()
    svm.fit(self.trainingSet, self.trainingLabel)
    return {
        user: svm.predict([[self.MUD[user], self.RUD[user], self.QUD[user]]])
        for user in self.testDict
    }
def prepare_fit_model_for_factors(model_type, x_train, y_train):
    """Fit a default linear model matching the task type.

    Args:
        model_type (str): 'classification' (LogisticRegression) or
            'regression' (LinearRegression).
        x_train: training features.
        y_train: training targets.

    Returns:
        (sklearn.base.BaseEstimator): the fitted model, or None when
        ``model_type`` is neither of the two supported values.
    """
    if model_type == 'classification':
        algorithm = LogisticRegression()
    elif model_type == 'regression':
        algorithm = LinearRegression()
    else:
        # Unknown task type: nothing to fit.
        return None

    algorithm.fit(x_train, y_train)
    return algorithm
def __init__(self, info, verbose=True, debug_mode=False):
    """Pick and configure a model from the dataset description ``info``.

    Args:
        info: dict with at least ``'label_num'``, ``'target_num'``,
            ``'task'`` and ``'metric'``; ``'is_sparse'`` and
            ``'has_categorical'`` are read when choosing the model.
        verbose: verbosity forwarded to the chosen estimator.
        debug_mode: values ``>= 2`` select a ``RandomPredictor`` and skip
            the rest of the selection.

    Sets ``self.name``, ``self.model``, ``self.predict_method`` and
    ``self.postprocessor``.

    The gradient-boosting classifier is now constructed directly instead of
    through ``eval`` of a built source string, and the dead
    ``postprocessor = None`` assignment is dropped.
    """
    self.label_num = info['label_num']
    self.target_num = info['target_num']
    self.task = info['task']
    self.metric = info['metric']
    # To calibrate proba
    self.postprocessor = MultiLabelEnsemble(LogisticRegression(), balance=False)

    if debug_mode >= 2:
        self.name = "RandomPredictor"
        self.model = RandomPredictor(self.target_num)
        self.predict_method = self.model.predict_proba
        return

    if info['task'] == 'regression':
        if info['is_sparse'] == True:
            self.name = "BaggingRidgeRegressor"
            # unfortunately, no warm start...
            self.model = BaggingRegressor(base_estimator=Ridge(), n_estimators=1, verbose=verbose)
        else:
            self.name = "GradientBoostingRegressor"
            self.model = GradientBoostingRegressor(n_estimators=1, max_depth=4, min_samples_split=14, verbose=verbose, warm_start=True)
        self.predict_method = self.model.predict
    else:
        if info['has_categorical']:
            # Categorical variables are not converted, so use a forest.
            self.name = "RandomForestClassifier"
            # unfortunately, no warm start...
            self.model = RandomForestClassifier(n_estimators=1, verbose=verbose)
        elif info['is_sparse']:
            self.name = "BaggingNBClassifier"
            # unfortunately, no warm start...
            self.model = BaggingClassifier(base_estimator=BernoulliNB(), n_estimators=1, verbose=verbose)
        else:
            self.name = "GradientBoostingClassifier"
            self.model = GradientBoostingClassifier(n_estimators=1, verbose=verbose, random_state=1, warm_start=True)
        if info['task'] == 'multilabel.classification':
            self.model = MultiLabelEnsemble(self.model)
        # Classification always predicts probabilities (bound after any
        # multilabel wrapping above).
        self.predict_method = self.model.predict_proba
def run_predict_logistic_regression(X_train, Y_train, X_test, Y_test):
    """Fit a default LogisticRegression and print its train/test 0-1 scores.

    Returns:
        The fitted classifier.
    """
    clf = LogisticRegression().fit(X_train, Y_train)
    pred = clf.predict(X_test)
    train_score = zero_one_score(Y_train, clf.predict(X_train))
    test_score = zero_one_score(Y_test, pred)
    print('Logistic 0-1 error. \n Training: ', train_score, '\n Test:', test_score)
    return clf
def train_clf(self, trainfiles):
    """Train the token classifier on all documents in ``trainfiles``.

    Features are built one document at a time with
    ``self.make_featmat_rep`` (because of local context) and stacked; the
    labels of all documents are concatenated in the same order. The fitted
    classifier is stored in ``self.clf``.

    Args:
        trainfiles: iterable of training files; ``yield_tokens_labels``
            yields ``(tokens, labels)`` pairs for each.
    """
    documents = (
        pair
        for trainfile in trainfiles
        for pair in yield_tokens_labels(trainfile)
    )
    final_labels = []
    per_document_features = []
    for tokens, labels in documents:
        final_labels.extend(labels)
        per_document_features.append(self.make_featmat_rep(tokens))
    featmat = np.vstack(per_document_features)

    print("training classifier")
    model = logreg(class_weight='balanced', random_state=1)
    model.fit(featmat, final_labels)
    self.clf = model
def classify(train=None, test=None, data=None, res_dir="res/", disp=True, outfilename=None):
    """Fit the configured classifiers and return predictions on the test set.

    Features and ground truths come from ``data`` when it is given,
    otherwise they are read from the ``train`` and ``test`` files. Only a
    random forest is currently enabled in the classifier table.

    Args:
        train, test: paths to the feature files (used when ``data`` is None).
        data: optional dict with ``"train_features"``,
            ``"train_groundtruths"``, ``"test_features"`` and
            ``"test_groundtruths"``.
        res_dir: result directory; created or resolved, not otherwise used
            here.
        disp, outfilename: accepted but not used in this function.

    Returns:
        The predictions of the classifier on the test features.
    """
    utils.print_success("Comparison of differents classifiers")
    if data is None:
        train = utils.abs_path_file(train)
        test = utils.abs_path_file(test)
        train_features, train_groundtruths = read_file(train)
        test_features, test_groundtruths = read_file(test)
    else:
        train_features = data["train_features"]
        train_groundtruths = data["train_groundtruths"]
        test_features = data["test_features"]
        test_groundtruths = data["test_groundtruths"]

    if not utils.create_dir(res_dir):
        res_dir = utils.abs_path_dir(res_dir)

    # Other candidates (KNeighbors, SVM, GradientBoosting, LogisticRegression,
    # ...) are currently disabled.
    classifiers = {
        "RandomForest": RandomForestClassifier(n_jobs=-1)
    }
    for key, clf in classifiers.items():
        utils.print_success(key)
        utils.print_info("\tFit")
        clf.fit(train_features, train_groundtruths)
        utils.print_info("\tPredict")
        predictions = clf.predict(test_features)
    return predictions
