The following 50 code examples, extracted from open-source Python projects, illustrate how to use sklearn.linear_model.SGDClassifier().
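Before the project-specific examples, here is a minimal, self-contained sketch of the basic SGDClassifier fit/score workflow. The synthetic dataset and every parameter value below are illustrative assumptions, not taken from any of the projects that follow:

from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Synthetic binary classification problem (illustrative only).
X, y = make_classification(n_samples=1000, n_features=20, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# SGD updates are sensitive to feature scale, so standardize in a pipeline.
clf = make_pipeline(StandardScaler(),
                    SGDClassifier(loss="hinge", penalty="l2",
                                  alpha=1e-3, random_state=0))
clf.fit(X_train, y_train)
print("Test accuracy: %.3f" % clf.score(X_test, y_test))

The examples below vary the same knobs seen here: the loss (hinge, log, perceptron), the penalty (l1, l2, elasticnet), and the regularization strength alpha.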
def get_feature_importance(self, clf, model_name):
    clfs = {'RandomForestClassifier': 'feature_importances',
            'ExtraTreesClassifier': 'feature_importances',
            'AdaBoostClassifier': 'feature_importances',
            'LogisticRegression': 'coef',
            'svm.SVC': 'coef',
            'GradientBoostingClassifier': 'feature_importances',
            'GaussianNB': None,
            'DecisionTreeClassifier': 'feature_importances',
            'SGDClassifier': 'coef',
            'KNeighborsClassifier': None,
            'linear.SVC': 'coef'}
    if clfs[model_name] == 'feature_importances':
        return list(clf.feature_importances_)
    elif clfs[model_name] == 'coef':
        return list(clf.coef_.tolist())
    else:
        return None
def __init__(self, filename, target_map, classifier='svm'):
    self.seed_ = 0
    self.filename_ = filename
    self.target_map_ = target_map
    self.target_ids_ = (np.unique(target_map.keys())).astype(np.int32)
    self.epoch_no_ = 0
    self.st_time_ = time.time()

    # Setup classifier
    print('-------------------------------')
    print('====> Building Classifier, setting class weights')
    if classifier == 'svm':
        self.clf_hyparams_ = {'C': [0.01, 0.1, 1.0, 10.0, 100.0],
                              'class_weight': ['balanced']}
        self.clf_base_ = LinearSVC(random_state=self.seed_)
    elif classifier == 'sgd':
        self.clf_hyparams_ = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0],
                              'class_weight': ['auto']}  # 'loss':['hinge'],
        self.clf_ = SGDClassifier(loss='log', penalty='l2', shuffle=False,
                                  random_state=self.seed_, warm_start=True,
                                  n_jobs=-1, n_iter=1, verbose=4)
    else:
        raise Exception('Unknown classifier type %s. Choose from [sgd, svm, gradient-boosting, extra-trees]' % classifier)
def __create_classifiers(self):
    classifiers = list()
    classifiers.append({"func": linear_model.SGDClassifier(loss="log"), "name": "sgd"})
    classifiers.append({"func": neighbors.KNeighborsClassifier(1, weights='distance'), "name": "knn1"})
    classifiers.append({"func": neighbors.KNeighborsClassifier(3, weights='distance'), "name": "knn3"})
    classifiers.append({"func": neighbors.KNeighborsClassifier(5, weights='distance'), "name": "knn5"})
    classifiers.append({"func": GaussianNB(), "name": "naive_bayes"})
    # classifiers.append({"func": tree.DecisionTreeClassifier(), "name": "decision_tree"})
    # classifiers.append({"func": MLPClassifier(max_iter=10000), "name": "mlp"})
    # classifiers.append({"func": RandomForestClassifier(), "name": "random_forest"})
    return classifiers
def define_model(self, model, parameters, n_cores=0):
    clfs = {'RandomForestClassifier': RandomForestClassifier(n_estimators=50, n_jobs=7),
            'ExtraTreesClassifier': ExtraTreesClassifier(n_estimators=10, n_jobs=7, criterion='entropy'),
            'AdaBoostClassifier': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
            'LogisticRegression': LogisticRegression(penalty='l1', C=1e5),
            'svm.SVC': svm.SVC(kernel='linear', probability=True, random_state=0),
            'GradientBoostingClassifier': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
            'GaussianNB': GaussianNB(),
            'DecisionTreeClassifier': DecisionTreeClassifier(),
            'SGDClassifier': SGDClassifier(loss="hinge", penalty="l2", n_jobs=7),
            'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=3),
            'linear.SVC': svm.LinearSVC()}
    if model not in clfs:
        raise ConfigError("Unsupported model {}".format(model))
    clf = clfs[model]
    clf.set_params(**parameters)
    return clf
def try_params(n_iterations, params):
    n_iterations = int(round(n_iterations))
    print("n_iterations:", n_iterations)
    pprint(params)

    if params['scaler']:
        scaler = eval("{}()".format(params['scaler']))
        x_train_ = scaler.fit_transform(data['x_train'].astype(float))
        x_test_ = scaler.transform(data['x_test'].astype(float))
        local_data = {'x_train': x_train_, 'y_train': data['y_train'],
                      'x_test': x_test_, 'y_test': data['y_test']}
    else:
        local_data = data

    # we need a copy because at the next small round the best params will be re-used
    params_ = dict(params)
    params_.pop('scaler')

    clf = SGD(n_iter=n_iterations, **params_)
    return train_and_eval_sklearn_classifier(clf, local_data)
def learn(self, features, labels):
    """
    Fits the classifier.

    If its state is empty, the classifier is fitted; otherwise it is
    partially fitted. See sklearn's SGDClassifier fit and partial_fit
    methods.

    Args:
        features (:obj:`list` of :obj:`list` of :obj:`float`)
        labels (:obj:`list` of :obj:`str`): Labels for each set of features.
            New labels are learnt.
    """
    labels = np.ravel(labels)
    self.__learn_labels(labels)
    if len(labels) == 0:
        return

    labels = self.labels.transform(labels)
    if self.feature_length > 0 and hasattr(self.clf, 'partial_fit'):
        # FIXME? check docs, may need to pass class=[...]
        self.clf = self.clf.partial_fit(features, labels)
    else:
        self.clf = self.clf.fit(features, labels)
    self.feature_length = len(features[0])
def make_classifier(self, name, ids, labels):
    """Train an SVM classifier on the loaded texts.

    Creates a classifier that is stored on the object under the name
    `name`.

    Args:
        name (str): Name for the classifier.
        ids (list): A list of N ids of texts already stored in the
            TextClassifier.
        labels (list): A list of N labels, one for each text id in ids.

    Note:
        Uses the `Scikit-learn <http://scikit-learn.org/>`_ classifier.
    """
    if not all(np.in1d(ids, self.ids)):
        raise ValueError("Some text ids are not stored.")
    setattr(self, name, SGDClassifier())
    classifier = getattr(self, name)
    indices = np.searchsorted(self.ids, ids)
    classifier.fit(self.tfidf_mat[indices, :], labels)
def test_build_param_grid_set_estimator():
    clf1 = SVC()
    clf2 = LogisticRegression()
    clf3 = SVC()
    clf4 = SGDClassifier()
    estimator = set_grid(Pipeline([('sel', set_grid(SelectKBest(), k=[2, 3])),
                                   ('clf', None)]),
                         clf=[set_grid(clf1, kernel=['linear']),
                              clf2,
                              set_grid(clf3, kernel=['poly'], degree=[2, 3]),
                              clf4])
    param_grid = [{'clf': [clf1], 'clf__kernel': ['linear'], 'sel__k': [2, 3]},
                  {'clf': [clf3], 'clf__kernel': ['poly'],
                   'clf__degree': [2, 3], 'sel__k': [2, 3]},
                  {'clf': [clf2, clf4], 'sel__k': [2, 3]}]
    assert build_param_grid(estimator) == param_grid
def get_sgdc(self):
    return Pipeline([
        ('tfidf', TfidfVectorizer(stop_words=sw.words('dutch'),
                                  norm='l2', use_idf=True)),
        ('feat_select', SelectPercentile(percentile=10)),
        ('clf', SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                              epsilon=0.1, eta0=0.0, fit_intercept=True,
                              l1_ratio=0.15, learning_rate='optimal',
                              loss='log', n_iter=10, n_jobs=1, penalty='l2',
                              power_t=0.5, random_state=None, shuffle=True,
                              verbose=0, warm_start=False))
    ])
def run(self):
    training_x, training_y, training_ids = self.get_training_data()
    test_x, test_y, test_ids = self.get_test_data()
    clf = self.define_model(self.model_name, self.model_params)
    clf.fit(training_x, training_y)
    res_predict = clf.predict(test_x)
    if ((self.model_name == "SGDClassifier" and
            (clf.loss == "hinge" or clf.loss == "perceptron")) or
            self.model_name == "linear.SVC"):
        res = list(clf.decision_function(test_x))
    else:
        res = list(clf.predict_proba(test_x)[:, 1])
    # fp, fn, tp, tn = self.compute_confusion_matrix(res[:,0], test_y)
    result_dictionary = {'training_ids': training_ids,
                         'predictions_test_y': list(res_predict),
                         'prob_prediction_test_y': res,
                         'test_y': list(test_y),
                         'test_ids': list(test_ids),
                         'model_name': self.model_name,
                         'model_params': self.model_params,
                         'label': self.label,
                         'feature_columns_used': self.cols_to_use,
                         'config': self.config,
                         'feature_importance': self.get_feature_importance(clf, self.model_name),
                         'columned_used_for_feat_importance': list(training_x.columns.values)}
    return result_dictionary, clf
def demo():
    import sys
    sys.path.append('../core')
    from tools import make_XOR_dataset

    X, Y = make_XOR_dataset()
    N, L = Y.shape

    from sklearn import linear_model
    h_ = linear_model.SGDClassifier(n_iter=100)
    from CC import RCC
    cc = RCC(h=h_)
    e = Ensemble(n_estimators=10, base_estimator=cc)
    e.fit(X, Y)
    # test it
    print(e.predict(X))
    print("vs")
    print(Y)
def get_classifier(self):
    algo = self.algo
    if algo == "GBT":
        return GradientBoostingClassifier()
    elif algo == "RF":
        return RandomForestClassifier()
    elif algo == "ADB":
        return AdaBoostClassifier()
    elif algo == "DT":
        return DecisionTreeClassifier()
    elif algo == "NB":
        return BernoulliNB()
    elif algo == "SGD":
        return SGDClassifier()
    elif algo == "SVC":
        return LinearSVC()
    elif algo == "MLPC":
        return MLPClassifier(activation='logistic', batch_size='auto',
                             early_stopping=True, hidden_layer_sizes=(100,),
                             learning_rate='adaptive', learning_rate_init=0.1,
                             max_iter=5000, random_state=1, solver='lbfgs',
                             tol=0.0001, validation_fraction=0.1,
                             verbose=False, warm_start=False)
    return 0
def run_regression(train_embeds, train_labels, test_embeds, test_labels):
    np.random.seed(1)
    from sklearn.linear_model import SGDClassifier
    from sklearn.dummy import DummyClassifier
    from sklearn.metrics import f1_score
    from sklearn.multioutput import MultiOutputClassifier

    dummy = MultiOutputClassifier(DummyClassifier())
    dummy.fit(train_embeds, train_labels)
    log = MultiOutputClassifier(SGDClassifier(loss="log"), n_jobs=10)
    log.fit(train_embeds, train_labels)

    f1 = 0
    for i in range(test_labels.shape[1]):
        print("F1 score", f1_score(test_labels[:, i],
                                   log.predict(test_embeds)[:, i],
                                   average="micro"))
    for i in range(test_labels.shape[1]):
        print("Random baseline F1 score", f1_score(test_labels[:, i],
                                                   dummy.predict(test_embeds)[:, i],
                                                   average="micro"))
def test_cat():
    print('Testing categorization...')
    filein = 'test_lookup.csv'
    fileout = 'test_cat.csv'
    df = pd.read_csv(filein)
    model = linear_model.SGDClassifier(loss='log')
    catData = df[~df.category.isnull()]
    uncatData = df[df.category.isnull()]
    print(str(float(len(catData)) / float(len(df)) * 100.) +
          "% of transactions categorized with lookup.")
    ts.train_model(catData, model, embeddings, model_type='logreg', new_run=True)
    ts.use_model(uncatData, model, embeddings, 0.0, model_type='logreg')
    df = pd.concat([catData, uncatData])
    df.sort_index(inplace=True)
    df.to_csv(fileout, index=False)
def train_and_pickle_classifier():
    import numpy as np
    from sklearn.linear_model import SGDClassifier

    clf = SGDClassifier(loss='log', random_state=1, n_iter=1)

    csv_filename = os.path.join('datasets', 'movie_data.csv')
    doc_stream = stream_docs(path=csv_filename)

    classes = np.array([0, 1])
    for _ in range(45):
        X_train, y_train = get_minibatch(doc_stream, size=1000)
        if X_train is None:
            break
        else:
            X_train = vect.transform(X_train)
            clf.partial_fit(X_train, y_train, classes=classes)

    X_test, y_test = get_minibatch(doc_stream, size=5000)
    X_test = vect.transform(X_test)
    print("Test accuracy: %.3f" % clf.score(X_test, y_test))

    clf = clf.partial_fit(X_test, y_test)
    pickle.dump(clf, open(CLF_FILENAME, 'wb'), protocol=4)
def __init__(self, path, etype, **kwargs):
    super(EnsembleModel, self).__init__(path, etype=etype, **kwargs)
    self.basedir = "models/ensemble/"
    self.goldstd = kwargs.get("goldstd")
    self.data = {}
    self.offsets = []
    self.pipeline = Pipeline([
        # ('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
        # ('clf', SGDClassifier())
        # ('clf', svm.NuSVC(nu=0.01))
        ('clf', RandomForestClassifier(class_weight={False: 1, True: 1}, n_jobs=-1,
                                       criterion="entropy", warm_start=True))
        # ('clf', tree.DecisionTreeClassifier(criterion="entropy")),
        # ('clf', MultinomialNB())
        # ('clf', GaussianNB())
        # ('clf', svm.SVC(kernel="rbf", degree=2, C=1)),
        # ('clf', svm.SVC(kernel="linear", C=2))
        # ('clf', DummyClassifier(strategy="constant", constant=True))
    ])
def __init__(self, corpus, relationtype, modelname="scikit_classifier"):
    super(ScikitRE, self).__init__()
    self.modelname = relationtype + "_" + modelname
    self.relationtype = relationtype
    self.pairtype = relationtype
    self.corpus = corpus
    self.pairs = []
    self.features = []
    self.labels = []
    self.pred = []
    self.clusters = word2vec.load_clusters("corpora/Thaliana/documents-processed-clusters.txt")
    self.posfmeasure = make_scorer(f1_score, average='binary', pos_label=True)
    self.generate_data(corpus, modelname, relationtype)
    self.text_clf = Pipeline([
        ('vect', CountVectorizer(analyzer='char_wb', ngram_range=(3, 20), min_df=0.0, max_df=0.7)),
        # ('vect', CountVectorizer(ngram_range=(1,3), binary=False, max_features=None)),
        # ('tfidf', TfidfTransformer(use_idf=True, norm="l2")),
        # ('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
        # ('clf', SGDClassifier())
        # ('clf', svm.NuSVC(nu=0.01))
        # ('clf', RandomForestClassifier(class_weight={False:1, True:2}, n_jobs=-1))
        ('clf', MultinomialNB(alpha=0.01, fit_prior=False))
        # ('clf', DummyClassifier(strategy="constant", constant=True))
    ])
def test_transform_linear_model():
    for clf in (LogisticRegression(C=0.1),
                LinearSVC(C=0.01, dual=False),
                SGDClassifier(alpha=0.001, n_iter=50, shuffle=True, random_state=0)):
        for thresh in (None, ".09*mean", "1e-5 * median"):
            for func in (np.array, sp.csr_matrix):
                X = func(data)
                clf.set_params(penalty="l1")
                clf.fit(X, y)
                X_new = assert_warns(DeprecationWarning, clf.transform, X, thresh)
                if isinstance(clf, SGDClassifier):
                    assert_true(X_new.shape[1] <= X.shape[1])
                else:
                    assert_less(X_new.shape[1], X.shape[1])
                clf.set_params(penalty="l2")
                clf.fit(X_new, y)
                pred = clf.predict(X_new)
                assert_greater(np.mean(pred == y), 0.7)
def test_prefit():
    """Test all possible combinations of the prefit parameter."""
    # Passing a prefit parameter with the selected model
    # and fitting an unfit model with prefit=False should give same results.
    clf = SGDClassifier(alpha=0.1, n_iter=10, shuffle=True, random_state=0)
    model = SelectFromModel(clf)
    model.fit(data, y)
    X_transform = model.transform(data)
    clf.fit(data, y)
    model = SelectFromModel(clf, prefit=True)
    assert_array_equal(model.transform(data), X_transform)

    # Check that the model is rewritten if prefit=False and a fitted model is
    # passed
    model = SelectFromModel(clf, prefit=False)
    model.fit(data, y)
    assert_array_equal(model.transform(data), X_transform)

    # Check that prefit=True and calling fit raises a ValueError
    model = SelectFromModel(clf, prefit=True)
    assert_raises(ValueError, model.fit, data, y)
def score_function(field):
    stats = field.get_stats()
    if "Creature" not in stats:
        return 0
    else:
        return stats["Creature"]

# res = modelling.run_simulation(universe, check_stop_function, score_function, verbose=True, times=30)
# print res
# print np.asarray(res).mean()

# random        1000 10 [193, 37, 97, 224, 349, 165, 251, 130, 184, 335]
# SGDClassifier 1000 10 [9, 106, 127, 11, 187, 38, 193, 114, 236, 27]
# random        500 20 [63, 24, 38, 14, 30, 65, 29, 60, 28, 25, 93, 44, 51, 26, 104, 56, 53, 38, 23, 42] mean 45.299999999999997
# SGDClassifier 500 20 [116, 52, 50, 82, 109, 49, 109, 37, 25, 115, 130, 180, 52, 52, 113, 46, 34, 135, 26, 33] mean 77.25
# random        500 20 [71, 24, 57, 56, 34, 14, 75, 66, 41, 56, 29, 69, 30, 72, 40, 57, 49, 24, 41, 48] mean 47.65
# SGDClassifier 500 20 [175, 40, 117, 96, 119, 116, 58, 134, 67, 87, 73, 147, 124, 125, 82, 139, 78, 110, 74, 100] mean 103.05
# random        500 30 [42, 32, 62, 34, 30, 44, 51, 35, 63, 59, 50, 40, 75, 59, 50, 33, 45, 95, 82, 41, 43, 89, 94, 66, 64, 46, 34, 82, 66, 76]
#               mean 56.0666666667
# SGDClassifier 500 30 [62, 85, 72, 42, 17, 48, 74, 53, 42, 73, 57, 29, 82, 51, 80, 84, 86, 73, 51, 36, 85, 85, 46, 59, 68, 33, 44, 38, 62, 26]
#               mean 58.1
def compute_sgd(data):
    logging.info('Computing SGD')
    n_splits = 10
    folder = StratifiedKFold(n_splits=n_splits, shuffle=True)
    for ix_first, ix_second in tqdm_notebook(
            folder.split(np.zeros(data['y_train'].shape[0]), data['y_train']),
            total=n_splits):
        # {'en__l1_ratio': 0.0001, 'en__alpha': 1e-05}
        model = SGDClassifier(loss='log',
                              penalty='elasticnet',
                              fit_intercept=True,
                              n_iter=100,
                              shuffle=True,
                              n_jobs=-1,
                              l1_ratio=0.0001,
                              alpha=1e-05,
                              class_weight=None)
        model = model.fit(data['X_train'][ix_first, :], data['y_train'][ix_first])
        data['y_train_pred'][ix_second] = logit(model.predict_proba(data['X_train'][ix_second, :])[:, 1])
        data['y_test_pred'].append(logit(model.predict_proba(data['X_test'])[:, 1]))
    data['y_test_pred'] = np.array(data['y_test_pred']).T.mean(axis=1)
    return data
def classify(n=50):
    # clf = MultinomialNB(fit_prior=False)
    # clf = SVC(gamma=2, C=1, class_weight={0.0: 0.063829777, 1.0: 1.0})
    clf = SGDClassifier(loss="log", penalty="l1", class_weight={0.0: 0.022, 1.0: 1.0})
    clf.fit(mat[:n], rel[:n])
    return clf
def initialModeling(data):
    X, y = processData(data)
    global n
    n = X.shape[1]
    print("I'm training the model using", X.shape[0], "samples and", n, "features.\n")
    global model
    model = SGDClassifier(loss="log", alpha=100, verbose=1)
    model.fit(X, y)

# 6th: update model
def __init__(self, classifier=None):
    if classifier:
        self.clf = classifier
    else:
        self.clf = SGDClassifier(loss="log", penalty="l2", shuffle=True, n_iter=2500)
    self.labels = preprocessing.LabelEncoder()
    self.feature_length = -1
def predict(self, features, verbose=False):
    """
    Probability estimates for each feature vector.

    See sklearn's SGDClassifier predict and predict_proba methods.

    Args:
        features (:obj:`list` of :obj:`list` of :obj:`float`)
        verbose: Boolean, optional. If True, returns an array where each
            element is a dictionary whose keys are labels and whose values
            are the respective probabilities. Defaults to False.

    Returns:
        Array of arrays of numbers, or array of dictionaries if verbose is
        True.
    """
    probs = self.clf.predict_proba(features)
    if verbose:
        labels = self.labels.classes_
        res = []
        for prob in probs:
            vals = {}
            for i, val in enumerate(prob):
                label = labels[i]
                vals[label] = val
            res.append(vals)
        return res
    else:
        return probs
def do_l2norm(X_data):
    x_normalized = preprocessing.normalize(X_data, norm='l2')
    return x_normalized

# svm = SGDClassifier(loss='hinge')
# https://ljalphabeta.gitbooks.io/python-/content/kernelsvm.html
def use_SGD(X_data, y_data):
    clf = SGDClassifier(loss="hinge", penalty="l2")
    clf.fit(X_data, y_data)
    return clf

# def use_KNN(X_data,y_data):
# def use_RandomForest(X_data,y_data):
def test_basic(self, single_chunk_classification):
    X, y = single_chunk_classification
    a = lm.PartialSGDClassifier(classes=[0, 1], random_state=0, max_iter=1000, tol=1e-3)
    b = lm_.SGDClassifier(random_state=0, max_iter=1000, tol=1e-3)
    a.fit(X, y)
    b.partial_fit(X, y, classes=[0, 1])
    assert_estimator_equal(a, b, exclude='loss_function_')
def test_init_no_file():
    mm = sgdc_modelmanager.SGDCModelManager()
    assert isinstance(mm, sgdc_modelmanager.SGDCModelManager)
    assert isinstance(mm.clf, Pipeline)
    assert isinstance(mm.clf.named_steps['clf'], SGDClassifier)
def test_init():
    mm = sgdc_modelmanager.SGDCModelManager('sgdcmodel.pickle')
    assert isinstance(mm, modelmanager.ModelManager)
    assert isinstance(mm.clf, Pipeline)
    assert isinstance(mm.clf.named_steps['clf'], SGDClassifier)
def test_init():
    mm = mnb_modelmanager.MNBModelManager('sgdcmodel.pickle')
    assert isinstance(mm, modelmanager.ModelManager)
    assert isinstance(mm.clf, Pipeline)
    assert isinstance(mm.clf.named_steps['clf'], SGDClassifier)
def test_init():
    ct = classifytext.ClassifyText()
    assert isinstance(ct.mm, sgdc_modelmanager.SGDCModelManager)
    assert isinstance(ct.mm.clf, Pipeline)
    assert isinstance(ct.mm.clf.named_steps['clf'], SGDClassifier)
def test_init_sgdc():
    ct = classifytext.ClassifyText(type=classifytext.SGDC)
    assert isinstance(ct.mm, sgdc_modelmanager.SGDCModelManager)
    assert isinstance(ct.mm.clf, Pipeline)
    assert isinstance(ct.mm.clf.named_steps['clf'], SGDClassifier)
def train_clf(self, X, idxss, rs):
    N = sum(len(idx) for idx in idxss)
    n_epochs = self.compute_epochs(N)

    if self.optimization == 'fastxml':
        penalty = 'l1'
    else:
        penalty = 'l2'

    X_train, y_train = self.build_XY(X, idxss, rs)

    in_liblinear = X_train.shape[0] > (self.auto_weight * self.max_leaf_size)
    if self.engine == 'liblinear' or (self.engine == 'auto' and in_liblinear):
        if self.loss == 'log':
            # No control over penalty
            clf = LogisticRegression(solver='liblinear', random_state=rs,
                                     tol=1, C=self.C, penalty=penalty)
        else:
            clf = LinearSVC(C=self.C, fit_intercept=self.bias,
                            max_iter=n_epochs, class_weight='balanced',
                            penalty=penalty, random_state=rs)
    else:
        clf = SGDClassifier(loss=self.loss, penalty=penalty, n_iter=n_epochs,
                            alpha=self.alpha, fit_intercept=self.bias,
                            class_weight='balanced', random_state=rs)

    clf.fit(X_train, y_train)

    # Halves the memory requirement
    clf.coef_ = sparsify(clf.coef_, self.eps)
    if self.bias:
        clf.intercept_ = clf.intercept_.astype('float32')

    return clf, CLF(clf.coef_, clf.intercept_)
def demo():
    import sys
    sys.path.append('../core')
    from tools import make_XOR_dataset

    X, Y = make_XOR_dataset()
    N, L = Y.shape

    br = BR(L, linear_model.SGDClassifier(n_iter=100))
    br.fit(X, Y)
    # test it
    print(br.predict(X))
    print("vs")
    print(Y)
def demo():
    import sys
    sys.path.append('../core')
    from tools import make_XOR_dataset
    from BR import BR

    set_printoptions(precision=3, suppress=True)

    X, Y = make_XOR_dataset()
    N, L = Y.shape

    print("CLASSIFICATION")
    h = linear_model.SGDClassifier(n_iter=100)
    nn = ELM(8, f=tanh, h=BR(-1, h))
    nn.fit(X, Y)
    # test it
    print(nn.predict(X))
    print("vs")
    print(Y)

    print("REGRESSION")
    r = ELM(100, h=linear_model.LinearRegression())
    r.fit(X, Y)
    print(Y)
    print(r.predict(X))

    print("REGRESSION OI")
    r = ELM_OI(100, h=BR(-1, h=linear_model.SGDRegressor()))
    r.fit(X, Y)
    print(Y)
    print(r.predict(X))
def demo():
    import sys
    from molearn.core.tools import make_XOR_dataset

    X, Y = make_XOR_dataset()
    N, L = Y.shape
    print(Y)
    print("vs")

    print("RCC")
    cc = RCC(SGDClassifier(n_iter=100, loss='log'))
    cc.fit(X, Y)
    print(cc.predict(X))

    print("MCC")
    mcc = MCC(SGDClassifier(n_iter=100, loss='log'), M=1000)
    mcc.fit(X, Y)
    Yp = mcc.predict(X, M=50)
    print("with 50 iterations ...")
    print(Yp)
    Yp = mcc.predict(X, 'default')
    print("with default (%d) iterations ..." % 1000)
    print(Yp)

    print("PCC")
    pcc = PCC(SGDClassifier(n_iter=100, loss='log'))
    pcc.fit(X, Y)
    print(pcc.predict(X))
def fit(self, dataset, filename):
    self.logger.debug("fit")
    self.clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                               alpha=1e-3, n_iter=5,
                                               random_state=42)),
                         ])
    self.clf.fit(dataset.get_dataset()['data'], dataset.get_dataset()['target'])
    joblib.dump(self.clf, filename + ".pkl", compress=9)
def model_trainer(model_dict, X_train, y_train, adv=None, rd=None, rev=None):
    """Trains and returns SVM. Also save SVM to file."""
    print('Training model...')
    start_time = time.time()
    abs_path_m = resolve_path_m(model_dict)
    svm_model = model_dict['svm_type']
    C = model_dict['penconst']
    penalty = model_dict['penalty']
    if adv is None:
        adv_mag = None

    # Create model based on parameters
    if svm_model == 'linear':
        dual = True
        if penalty == 'l1':
            dual = False
        clf = svm.LinearSVC(C=C, penalty=penalty, dual=dual)
        # clf = linear_model.SGDClassifier(alpha=C, l1_ratio=0)
    elif svm_model != 'linear':
        clf = svm.SVC(C=C, kernel=svm_model)

    # Train model
    clf.fit(X_train, y_train)
    print('Finish training in {:d}s'.format(int(time.time() - start_time)))

    # Save model
    joblib.dump(clf, abs_path_m + get_svm_model_name(model_dict, rd, rev) + '.pkl')
    return clf

#------------------------------------------------------------------------------#
def __init__(self, name, kwargs):
    from sklearn.linear_model import SGDClassifier
    super(GCSGDClassifier, self).__init__(name, SGDClassifier, kwargs)
def get_model_score(training, validation):
    model = linear_model.SGDClassifier(loss='log', n_iter=5)
    model.fit(get_input_data(training), get_output_data(training))
    curr_score = model.score(get_input_data(validation), get_output_data(validation))
    return curr_score
def __init__(self):
    # loss="log" makes it use logistic regression
    self.model = linear_model.SGDClassifier(loss="log", n_iter=5)
def main():
    # Before release
    movie_info_before_release = load_movie_info_before_release()
    print('***Before release***')
    X = create_input(movie_info_before_release)
    Y = create_output_before_release(movie_info_before_release)

    clf = linear_model.SGDClassifier(loss='log')
    test_classifier(clf, X, Y, 'before_release')

    clf = GaussianNB()
    test_classifier(clf, X, Y, 'before_release')

    clf = RandomForestClassifier(n_estimators=10, max_depth=10)
    test_classifier(clf, X, Y, 'before_release')

    # After release
    movie_info = load_movie_info()
    print('***After release***')
    X = create_input(movie_info)
    Y = create_output(movie_info)

    clf = linear_model.SGDClassifier(loss='log')
    test_classifier(clf, X, Y, 'after_release')

    clf = GaussianNB()
    test_classifier(clf, X, Y, 'after_release')

    clf = RandomForestClassifier(n_estimators=10, max_depth=10)
    test_classifier(clf, X, Y, 'after_release')
def run_regression(train_embeds, train_labels, test_embeds, test_labels):
    np.random.seed(1)
    from sklearn.linear_model import SGDClassifier
    from sklearn.dummy import DummyClassifier
    from sklearn.metrics import f1_score

    dummy = DummyClassifier()
    dummy.fit(train_embeds, train_labels)
    log = SGDClassifier(loss="log", n_jobs=10)
    log.fit(train_embeds, train_labels)
    print("F1 score:", f1_score(test_labels, log.predict(test_embeds), average="micro"))
    print("Random baseline f1 score:", f1_score(test_labels, dummy.predict(test_embeds), average="micro"))
def run_regression(train_embeds, train_labels, test_embeds, test_labels):
    np.random.seed(1)
    from sklearn.linear_model import SGDClassifier
    from sklearn.dummy import DummyClassifier
    from sklearn.metrics import f1_score

    dummy = DummyClassifier()
    dummy.fit(train_embeds, train_labels)
    log = SGDClassifier(loss="log", n_jobs=55)
    log.fit(train_embeds, train_labels)
    print("Test scores")
    print(f1_score(test_labels, log.predict(test_embeds), average="micro"))
    print("Train scores")
    print(f1_score(train_labels, log.predict(train_embeds), average="micro"))
    print("Random baseline")
    print(f1_score(test_labels, dummy.predict(test_embeds), average="micro"))
def get_data_preprocessor_balancing(params, y):
    d_balancing = params['layer_dict_list'][1]

    if params['balancing'] == str(d_balancing['None']) or params['balancing'] == 'None':
        # for fp: ['ExtraTreesClassifier', 'LinearSVC'] + clf: ['DecisionTreeClassifier',
        # 'ExtraTreesClassifier', 'LinearSVC', 'SVC', 'RandomForestClassifier', 'SGDClassifier']
        params['class_weight'] = None
        # for clf: ['AdaBoostClassifier', 'GradientBoostingClassifier']
        params['sample_weight'] = None
    elif params['balancing'] == str(d_balancing['weighting']) or params['balancing'] == 'weighting':
        # for fp: ['ExtraTreesClassifier', 'LinearSVC'] + clf: ['DecisionTreeClassifier',
        # 'ExtraTreesClassifier', 'LinearSVC', 'SVC', 'RandomForestClassifier', 'SGDClassifier']
        params['class_weight'] = 'auto'
        # for clf: ['AdaBoostClassifier', 'GradientBoostingClassifier']
        if len(y.shape) > 1:
            offsets = [2 ** i for i in range(y.shape[1])]
            y_ = np.sum(y * offsets, axis=1)
        else:
            y_ = y
        unique, counts = np.unique(y_, return_counts=True)
        cw = 1. / counts
        cw = cw / np.mean(cw)
        sample_weight = np.ones(y_.shape)
        for i, ue in enumerate(unique):
            mask = y_ == ue
            sample_weight[mask] *= cw[i]
        params['sample_weight'] = sample_weight

    return params
def run_cat(filename, modelname, fileout, embeddings, new_run=True, run_parse=True,
            model_type='logreg', C=10.0, alpha=1.0, cutoff=0.50, n_iter=1):
    # pull relevant data and run parsing and classification
    df = pd.read_csv(filename)
    if len(df.columns) == 2:
        # make sure columns have the right names
        df.columns = ['raw', 'amount']

    if new_run:
        # initialize the model
        if model_type == 'logreg':
            model = linear_model.SGDClassifier(loss='log', warm_start=True,
                                               n_iter=n_iter, alpha=alpha)
        elif model_type == 'passive-aggressive':
            model = linear_model.PassiveAggressiveClassifier(C=C, warm_start=True)
        elif model_type == 'naive-bayes':
            model = naive_bayes.GaussianNB()
        else:
            raise NameError('model_type must be logreg, passive-aggressive, or naive-bayes')
    else:
        # load a saved, pre-trained model
        modelFileLoad = open(modelname, 'rb')
        model = pickle.load(modelFileLoad)

    fileCities = dirs.data_dir + 'cities_by_state.pickle'
    us_cities = pd.read_pickle(fileCities)

    df = cat_df(df, model, us_cities, embeddings, new_run, run_parse,
                cutoff=cutoff, model_type=model_type)

    df.to_csv(fileout, index=False)

    # Saving logistic regression model from training set 1
    modelFileSave = open(modelname, 'wb')
    pickle.dump(model, modelFileSave)
    modelFileSave.close()

# ------ testing functions
def model_fitting(train_set, train_labels, classifier_name, n_jobs=cpu_count()):
    """
    The fitting process with sklearn algorithms.

    :param train_set: numpy array, required
    :param train_labels: list, required
    :param classifier_name: string, required
    :param n_jobs: integer, required
    :return: object - fit classifier model according to the given training data
    """
    classifier_list = {
        "svm_linear": SVC(probability=True, kernel='linear', C=1.0),
        "svm_poly": SVC(probability=True, kernel='poly', C=1.0),
        "svm_rbf": SVC(probability=True, kernel='rbf', C=1.0, gamma=0.01),
        "linear_svc": LinearSVC(penalty='l2', loss='squared_hinge', dual=True,
                                tol=0.1, C=1.0, multi_class='ovr',
                                fit_intercept=True, intercept_scaling=1,
                                random_state=None, max_iter=3000),
        "knn": KNeighborsClassifier(n_neighbors=100, weights='distance',
                                    leaf_size=30, n_jobs=n_jobs),
        "random_forests": RandomForestClassifier(n_estimators=350, criterion='entropy',
                                                 min_samples_split=2, min_samples_leaf=1,
                                                 max_leaf_nodes=600, n_jobs=n_jobs),
        "logistic_regression": LogisticRegression(penalty='l2', dual=False, tol=0.0001,
                                                  C=2.4, fit_intercept=True,
                                                  intercept_scaling=1, random_state=None,
                                                  solver='liblinear', max_iter=1000,
                                                  multi_class='ovr', warm_start=False,
                                                  n_jobs=n_jobs),
        "decision_trees": DecisionTreeClassifier(criterion='gini', splitter='best',
                                                 max_depth=None, min_samples_split=2,
                                                 min_samples_leaf=100,
                                                 min_weight_fraction_leaf=0.0,
                                                 max_features=None, random_state=None,
                                                 max_leaf_nodes=None, presort=False),
        "sgd": SGDClassifier(alpha=.0001, n_iter=500, penalty="elasticnet",
                             n_jobs=n_jobs),
        "neural_network": Classifier(layers=[Layer("Sigmoid", units=14),
                                             Layer("Sigmoid", units=13),
                                             Layer("Sigmoid", units=12),
                                             Layer("Sigmoid", units=10),
                                             Layer("Softmax")],
                                     learning_rate=0.01, n_iter=200, batch_size=10,
                                     regularize='L1', n_stable=50, dropout_rate=0,
                                     verbose=True),
        "GBC": GradientBoostingClassifier(max_depth=10, max_leaf_nodes=850,
                                          min_samples_leaf=15, learning_rate=0.1),
        "XGB": XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
                             gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=10,
                             min_child_weight=2, missing=None, n_estimators=100,
                             nthread=n_jobs, reg_alpha=0, objective='binary:logistic',
                             reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
                             subsample=1)}
    return classifier_list[classifier_name].fit(train_set, train_labels)
def fit(self, dataset, filename):
    self.logger.debug("fit")
    self.clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', SGDClassifier(loss='log', penalty='l2',
                                               alpha=1e-3, n_iter=5,
                                               random_state=42)),
                         ])
    self.clf.fit(dataset.get_dataset()['data'], dataset.get_dataset()['target'])
    joblib.dump(self.clf, filename + ".pkl", compress=9)
def train_classifier(download=True, parameters=None, ngram_range=(1, 1)):
    """Train the intent classifier."""
    if download:
        download_wiki()

    path = os.path.join(l.TOPDIR, 'train.json')
    training_set = json.load(open(path))
    path = os.path.join(l.TOPDIR, 'wiki.json')
    wiki_set = json.load(open(path))

    target_names = list(set([i['unit'] for i in training_set + wiki_set]))
    train_data, train_target = [], []
    for example in training_set + wiki_set:
        train_data.append(clean_text(example['text']))
        train_target.append(target_names.index(example['unit']))

    tfidf_model = TfidfVectorizer(sublinear_tf=True,
                                  ngram_range=ngram_range,
                                  stop_words='english')

    matrix = tfidf_model.fit_transform(train_data)

    if parameters is None:
        parameters = {'loss': 'log', 'penalty': 'l2', 'n_iter': 50,
                      'alpha': 0.00001, 'fit_intercept': True}

    clf = SGDClassifier(**parameters).fit(matrix, train_target)
    obj = {'tfidf_model': tfidf_model,
           'clf': clf,
           'target_names': target_names}
    path = os.path.join(l.TOPDIR, 'clf.pickle')
    pickle.dump(obj, open(path, 'wb'))  # binary mode required for pickle under Python 3

###############################################################################