The following 50 code examples, extracted from open-source Python projects, illustrate how to use sklearn.svm.SVC.
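Before the project excerpts, here is a minimal, self-contained sketch of the basic SVC workflow; the dataset and parameter values are illustrative assumptions, not drawn from any project below.

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Load a toy dataset, hold out a test split, fit an RBF-kernel SVC, and score it.
X, y = datasets.load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

clf = SVC(kernel='rbf', C=1.0, gamma='scale')  # illustrative parameter choices
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))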
def trained_models():
    dataset = datasets.load_breast_cancer()
    X = dataset.data
    y = dataset.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=12345)
    rf = RandomForestClassifier()
    rf.fit(X_train, y_train)
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    svc_w_linear_kernel = SVC(kernel='linear')
    svc_w_linear_kernel.fit(X_train, y_train)
    svc_wo_linear_kernel = SVC()
    svc_wo_linear_kernel.fit(X_train, y_train)
    dummy = DummyClassifier()
    dummy.fit(X_train, y_train)
    return {'RF': rf, 'LR': lr, 'SVC_w_linear_kernel': svc_w_linear_kernel,
            'Dummy': dummy, 'SVC_wo_linear_kernel': svc_wo_linear_kernel}
def evaluate_svm(train_data, train_labels, test_data, test_labels, n_jobs=-1):
    """
    Evaluates a representation using a Linear SVM.
    It uses 3-fold cross validation for selecting the C parameter.
    :param train_data:
    :param train_labels:
    :param test_data:
    :param test_labels:
    :param n_jobs:
    :return: the test accuracy
    """
    # Scale data to 0-1
    scaler = MinMaxScaler()
    train_data = scaler.fit_transform(train_data)
    test_data = scaler.transform(test_data)

    parameters = {'kernel': ['linear'],
                  'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]}
    model = svm.SVC(max_iter=10000)
    clf = grid_search.GridSearchCV(model, parameters, n_jobs=n_jobs, cv=3)
    clf.fit(train_data, train_labels)
    lin_svm_test = clf.score(test_data, test_labels)
    return lin_svm_test
def get_feature_importance(self, clf, model_name):
    clfs = {'RandomForestClassifier': 'feature_importances',
            'ExtraTreesClassifier': 'feature_importances',
            'AdaBoostClassifier': 'feature_importances',
            'LogisticRegression': 'coef',
            'svm.SVC': 'coef',
            'GradientBoostingClassifier': 'feature_importances',
            'GaussianNB': None,
            'DecisionTreeClassifier': 'feature_importances',
            'SGDClassifier': 'coef',
            'KNeighborsClassifier': None,
            'linear.SVC': 'coef'}

    if clfs[model_name] == 'feature_importances':
        return list(clf.feature_importances_)
    elif clfs[model_name] == 'coef':
        return list(clf.coef_.tolist())
    else:
        return None
def get_classifier_class(class_name):
    name_table = {
        'svm': SVC,
        'k_neighbors': KNeighborsClassifier,
        'gaussian_process': GaussianProcessClassifier,
        'decision_tree': DecisionTreeClassifier,
        'random_forest': RandomForestClassifier,
        'ada_boost': AdaBoostClassifier,
        'mlp': MLPClassifier,
        'gaussian_naive_bayes': GaussianNB,
        'quadratic_discriminant_analysis': QuadraticDiscriminantAnalysis
    }
    if class_name not in name_table:
        raise ValueError('No such classifier')
    return name_table[class_name]
def define_model(self, model, parameters, n_cores=0):
    clfs = {'RandomForestClassifier': RandomForestClassifier(n_estimators=50, n_jobs=7),
            'ExtraTreesClassifier': ExtraTreesClassifier(n_estimators=10, n_jobs=7, criterion='entropy'),
            'AdaBoostClassifier': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                                                     algorithm="SAMME", n_estimators=200),
            'LogisticRegression': LogisticRegression(penalty='l1', C=1e5),
            'svm.SVC': svm.SVC(kernel='linear', probability=True, random_state=0),
            'GradientBoostingClassifier': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5,
                                                                     max_depth=6, n_estimators=10),
            'GaussianNB': GaussianNB(),
            'DecisionTreeClassifier': DecisionTreeClassifier(),
            'SGDClassifier': SGDClassifier(loss="hinge", penalty="l2", n_jobs=7),
            'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=3),
            'linear.SVC': svm.LinearSVC()}

    if model not in clfs:
        raise ConfigError("Unsupported model {}".format(model))

    clf = clfs[model]
    clf.set_params(**parameters)
    return clf
def __init__(self, data_block, predictors=[], cv_folds=10,
             scoring_metric='accuracy', additional_display_metrics=[]):
    base_classification.__init__(
        self, alg=SVC(), data_block=data_block, predictors=predictors,
        cv_folds=cv_folds, scoring_metric=scoring_metric,
        additional_display_metrics=additional_display_metrics)

    self.model_output = pd.Series(self.default_parameters)
    self.model_output['Coefficients'] = "-"

    # Set parameters to default values:
    self.set_parameters(set_default=True)

    # Check if probabilities are enabled:
    if not self.alg.get_params()['probability']:
        self.probabilities_available = False
def quiz15():
    X, Y, N = read_file("features.train")
    Y_0 = (Y == 0).astype(int)
    c_l = []
    w_l = []
    for i in range(-6, 4, 2):
        c = 10 ** i
        c_l.append(c)
        clf = svm.SVC(C=c, kernel='linear', shrinking=False)
        clf.fit(X, Y_0)
        w = clf.coef_.flatten()
        norm_w = np.linalg.norm(w, ord=2)
        w_l.append(norm_w)
        print("C = ", c, ' norm(w) =', norm_w)
    plt.semilogx(c_l, w_l)
    plt.savefig("h5_q15.png", dpi=300)
def train_and_predict(self, param_dict, predict_on='val'):
    """Initializes an SVM classifier according to the desired parameter
    settings, trains it, and returns the predictions on the appropriate
    evaluation dataset.

    Args:
        param_dict: A dictionary with keys representing parameter names and
            values representing settings for those parameters.
        predict_on: The dataset used for evaluating the model. Can be set to
            'test' to get final results.

    Returns: The predicted Y labels.
    """
    if predict_on == 'test':
        predict_X = self.data_loader.test_X
    else:
        predict_X = self.data_loader.val_X

    self.model = SVC(C=param_dict['C'], kernel=param_dict['kernel'],
                     gamma=param_dict['beta'])
    self.model.fit(self.data_loader.train_X, self.data_loader.train_Y)
    preds = self.predict_on_data(predict_X)
    return preds
def example_of_aggregating_sim_matrix(raw_data, labels, num_subjects, num_epochs_per_subj):
    # aggregate the kernel matrix to save memory
    svm_clf = svm.SVC(kernel='precomputed', shrinking=False, C=1)
    clf = Classifier(svm_clf, num_processed_voxels=1000, epochs_per_subj=num_epochs_per_subj)
    rearranged_data = raw_data[num_epochs_per_subj:] + raw_data[0:num_epochs_per_subj]
    rearranged_labels = labels[num_epochs_per_subj:] + labels[0:num_epochs_per_subj]
    clf.fit(list(zip(rearranged_data, rearranged_data)), rearranged_labels,
            num_training_samples=num_epochs_per_subj * (num_subjects - 1))
    predict = clf.predict()
    print(predict)
    print(clf.decision_function())
    test_labels = labels[0:num_epochs_per_subj]
    incorrect_predict = hamming(predict, np.asanyarray(test_labels)) * num_epochs_per_subj
    logger.info(
        'when aggregating the similarity matrix to save memory, '
        'the accuracy is %d / %d = %.2f' %
        (num_epochs_per_subj - incorrect_predict, num_epochs_per_subj,
         (num_epochs_per_subj - incorrect_predict) * 1.0 / num_epochs_per_subj)
    )
    # when the kernel matrix is computed in portions, the test data is already in
    print(clf.score(None, test_labels))
def example_of_cross_validation_using_model_selection(raw_data, labels, num_subjects, num_epochs_per_subj):
    # NOTE: this method does not work for sklearn.svm.SVC with precomputed kernel
    # when the kernel matrix is computed in portions; also, this method only works
    # for self-correlation, i.e. correlation between the same data matrix.

    # no shrinking, set C=1
    svm_clf = svm.SVC(kernel='precomputed', shrinking=False, C=1)
    #logit_clf = LogisticRegression()
    clf = Classifier(svm_clf, epochs_per_subj=num_epochs_per_subj)
    # doing leave-one-subject-out cross validation
    # no shuffling in cv
    skf = model_selection.StratifiedKFold(n_splits=num_subjects, shuffle=False)
    scores = model_selection.cross_val_score(clf, list(zip(raw_data, raw_data)),
                                             y=labels, cv=skf)
    print(scores)
    logger.info('the overall cross validation accuracy is %.2f' % np.mean(scores))
def example_of_correlating_two_components(raw_data, raw_data2, labels, num_subjects, num_epochs_per_subj):
    # aggregate the kernel matrix to save memory
    svm_clf = svm.SVC(kernel='precomputed', shrinking=False, C=1)
    clf = Classifier(svm_clf, epochs_per_subj=num_epochs_per_subj)
    num_training_samples = num_epochs_per_subj * (num_subjects - 1)
    clf.fit(list(zip(raw_data[0:num_training_samples], raw_data2[0:num_training_samples])),
            labels[0:num_training_samples])
    X = list(zip(raw_data[num_training_samples:], raw_data2[num_training_samples:]))
    predict = clf.predict(X)
    print(predict)
    print(clf.decision_function(X))
    test_labels = labels[num_training_samples:]
    incorrect_predict = hamming(predict, np.asanyarray(test_labels)) * num_epochs_per_subj
    logger.info(
        'when aggregating the similarity matrix to save memory, '
        'the accuracy is %d / %d = %.2f' %
        (num_epochs_per_subj - incorrect_predict, num_epochs_per_subj,
         (num_epochs_per_subj - incorrect_predict) * 1.0 / num_epochs_per_subj)
    )
    # when the kernel matrix is computed in portions, the test data is already in
    print(clf.score(X, test_labels))
def example_of_correlating_two_components_aggregating_sim_matrix(raw_data, raw_data2, labels,
                                                                 num_subjects, num_epochs_per_subj):
    # aggregate the kernel matrix to save memory
    svm_clf = svm.SVC(kernel='precomputed', shrinking=False, C=1)
    clf = Classifier(svm_clf, num_processed_voxels=1000, epochs_per_subj=num_epochs_per_subj)
    num_training_samples = num_epochs_per_subj * (num_subjects - 1)
    clf.fit(list(zip(raw_data, raw_data2)), labels,
            num_training_samples=num_training_samples)
    predict = clf.predict()
    print(predict)
    print(clf.decision_function())
    test_labels = labels[num_training_samples:]
    incorrect_predict = hamming(predict, np.asanyarray(test_labels)) * num_epochs_per_subj
    logger.info(
        'when aggregating the similarity matrix to save memory, '
        'the accuracy is %d / %d = %.2f' %
        (num_epochs_per_subj - incorrect_predict, num_epochs_per_subj,
         (num_epochs_per_subj - incorrect_predict) * 1.0 / num_epochs_per_subj)
    )
    # when the kernel matrix is computed in portions, the test data is already in
    print(clf.score(None, test_labels))

# python3 classification.py face_scene bet.nii.gz face_scene/prefrontal_top_mask.nii.gz face_scene/fs_epoch_labels.npy
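The four examples above all pass kernel='precomputed' to SVC and hand it to a library-specific Classifier wrapper. As a reference for what plain scikit-learn expects from that interface, here is a minimal sketch with made-up data: fit() receives the train-vs-train Gram matrix and predict() the test-vs-train Gram matrix.

import numpy as np
from sklearn.svm import SVC

rng = np.random.RandomState(0)
X_train, X_test = rng.randn(20, 5), rng.randn(5, 5)
y_train = rng.randint(0, 2, 20)

gram_train = X_train @ X_train.T  # (n_train, n_train) linear-kernel Gram matrix
gram_test = X_test @ X_train.T    # (n_test, n_train) kernel between test and train

clf = SVC(kernel='precomputed', shrinking=False, C=1)
clf.fit(gram_train, y_train)
print(clf.predict(gram_test))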
def test_build_param_grid_set_estimator():
    clf1 = SVC()
    clf2 = LogisticRegression()
    clf3 = SVC()
    clf4 = SGDClassifier()
    estimator = set_grid(Pipeline([('sel', set_grid(SelectKBest(), k=[2, 3])),
                                   ('clf', None)]),
                         clf=[set_grid(clf1, kernel=['linear']),
                              clf2,
                              set_grid(clf3, kernel=['poly'], degree=[2, 3]),
                              clf4])
    param_grid = [{'clf': [clf1], 'clf__kernel': ['linear'], 'sel__k': [2, 3]},
                  {'clf': [clf3], 'clf__kernel': ['poly'], 'clf__degree': [2, 3], 'sel__k': [2, 3]},
                  {'clf': [clf2, clf4], 'sel__k': [2, 3]}]
    assert build_param_grid(estimator) == param_grid
def test_make_grid_search():
    X, y = load_iris(return_X_y=True)
    lr = LogisticRegression()
    svc = set_grid(SVC(), kernel=['poly'], degree=[2, 3])
    gs1 = make_grid_search(lr, cv=5)  # empty grid
    gs2 = make_grid_search(svc, cv=5)
    gs3 = make_grid_search([lr, svc], cv=5)
    for gs, n_results in [(gs1, 1), (gs2, 2), (gs3, 3)]:
        gs.fit(X, y)
        assert gs.cv == 5
        assert len(gs.cv_results_['params']) == n_results

    svc_mask = gs3.cv_results_['param_root'] == svc
    assert svc_mask.sum() == 2
    assert gs3.cv_results_['param_root__degree'][svc_mask].tolist() == [2, 3]
    assert gs3.cv_results_['param_root'][~svc_mask].tolist() == [lr]
def train(train_dataTables, human_marks):
    global classifier
    samples = []
    target = []
    for nn, dataTable in enumerate(train_dataTables):
        for i in xrange(dataTable.row):
            for j in xrange(dataTable.col):
                mention = dataTable[i][j]
                if mention.cid == -1:
                    continue
                eids = dataTable.get_eids(i, j)
                words = dataTable.get_words(i, j)
                entites = dataTable.get_entities(i, j)
                true_id = human_marks[nn][i][j]['id']
                for ii, entity in enumerate(mention.candidates):
                    prior = entity.popular
                    SR = mention.getSR(ii, entites)
                    res = int(true_id == entity.id)
                    samples.append([prior, SR])
                    target.append(res)
    from sklearn import svm
    classifier = svm.SVC(probability=True)
    classifier.fit(samples, target)
def run(self):
    training_x, training_y, training_ids = self.get_training_data()
    test_x, test_y, test_ids = self.get_test_data()
    clf = self.define_model(self.model_name, self.model_params)
    clf.fit(training_x, training_y)
    res_predict = clf.predict(test_x)

    if ((self.model_name == "SGDClassifier" and (clf.loss == "hinge" or clf.loss == "perceptron"))
            or self.model_name == "linear.SVC"):
        res = list(clf.decision_function(test_x))
    else:
        res = list(clf.predict_proba(test_x)[:, 1])
    # fp, fn, tp, tn = self.compute_confusion_matrix(res[:,0], test_y)

    result_dictionary = {'training_ids': training_ids,
                         'predictions_test_y': list(res_predict),
                         'prob_prediction_test_y': res,
                         'test_y': list(test_y),
                         'test_ids': list(test_ids),
                         'model_name': self.model_name,
                         'model_params': self.model_params,
                         'label': self.label,
                         'feature_columns_used': self.cols_to_use,
                         'config': self.config,
                         'feature_importance': self.get_feature_importance(clf, self.model_name),
                         'columned_used_for_feat_importance': list(training_x.columns.values)}
    return result_dictionary, clf
def svc_model(self, X, y, x_test, y_test, x_val, y_val, i, j):
    X, y = shuffle(X, y, random_state=self.SEED)
    clf = SVC(C=self.C, kernel='rbf', gamma=self.gamma,
              cache_size=self.cache_size, verbose=0, random_state=self.SEED)
    model = clf.fit(X, y)
    yhat_train = model.predict(X)
    yhat_val = model.predict(x_val)
    yhat_test = model.predict(x_test)

    train_error = (1 - accuracy_score(y, yhat_train)) * 100
    val_error = (1 - accuracy_score(y_val, yhat_val)) * 100
    test_error = (1 - accuracy_score(y_test, yhat_test)) * 100
    self.warn_log.append([i, train_error, val_error, test_error])
    return model
def __init__(self, isTrain, isOutlierRemoval=0):
    """
    The linear models ``LinearSVC()`` and ``SVC(kernel='linear')`` yield slightly
    different decision boundaries. This can be a consequence of the following
    differences:
    - ``LinearSVC`` minimizes the squared hinge loss while ``SVC`` minimizes the
      regular hinge loss.
    - ``LinearSVC`` uses the One-vs-All (also known as One-vs-Rest) multiclass
      reduction while ``SVC`` uses the One-vs-One multiclass reduction.
    :return:
    """
    super(ClassificationSVM, self).__init__(isTrain, isOutlierRemoval)
    # data preprocessing
    self.dataPreprocessing()

    self.clf = svm.SVC()  # define the SVM classifier

    C = 1.0  # SVM regularization parameter
    self.svc = svm.SVC(kernel='linear', C=C, max_iter=100000)
    self.rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=C)
    self.poly_svc = svm.SVC(kernel='poly', coef0=1, degree=3, C=C)
    self.lin_svc = svm.LinearSVC(C=C)
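The docstring above summarizes why LinearSVC and SVC(kernel='linear') can disagree. A minimal sketch that makes the comparison concrete; the dataset and settings here are illustrative assumptions, not taken from the project above.

from sklearn.datasets import load_iris
from sklearn.svm import SVC, LinearSVC

X, y = load_iris(return_X_y=True)

# SVC uses one-vs-one: 3 classes give 3 pairwise binary problems.
ovo = SVC(kernel='linear', C=1.0).fit(X, y)
# LinearSVC uses one-vs-rest: one weight vector per class, squared hinge loss.
ovr = LinearSVC(C=1.0, max_iter=10000).fit(X, y)

print(ovo.decision_function(X[:1]).shape)  # (1, 3): one score per class pair
print(ovr.decision_function(X[:1]).shape)  # (1, 3): one score per class
print(ovo.coef_.shape, ovr.coef_.shape)    # both (3, 4) here, but with different meanings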
def parameterChoosing(self):
    # Set the parameters by cross-validation
    tuned_parameters = [{'kernel': ['rbf'],
                         'gamma': np.logspace(-4, 3, 30),
                         'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]},
                        {'kernel': ['poly'],
                         'degree': [1, 2, 3, 4],
                         'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000],
                         'coef0': np.logspace(-4, 3, 30)},
                        {'kernel': ['linear'],
                         'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]}]

    clf = GridSearchCV(svm.SVC(C=1), tuned_parameters, cv=5, scoring='precision_weighted')
    clf.fit(self.X_train, self.y_train.ravel())

    print "Best parameters set found on development set:\n"
    print clf.best_params_
    print "Grid scores on development set:\n"
    for params, mean_score, scores in clf.grid_scores_:
        print "%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params)

    print "Detailed classification report:\n"
    y_true, y_pred = self.y_test, clf.predict(self.X_test)
    print classification_report(y_true, y_pred)
def svm_classify(X, label, split_ratios, C):
    """
    Trains a linear SVM on the data.
    The input C specifies the penalty factor of the SVM.
    """
    train_size = int(len(X) * split_ratios[0])
    val_size = int(len(X) * split_ratios[1])
    train_data, valid_data, test_data = (X[0:train_size],
                                         X[train_size:train_size + val_size],
                                         X[train_size + val_size:])
    train_label, valid_label, test_label = (label[0:train_size],
                                            label[train_size:train_size + val_size],
                                            label[train_size + val_size:])

    print('training SVM...')
    clf = svm.SVC(C=C, kernel='linear')
    clf.fit(train_data, train_label.ravel())

    p = clf.predict(train_data)
    train_acc = accuracy_score(train_label, p)
    p = clf.predict(valid_data)
    valid_acc = accuracy_score(valid_label, p)
    p = clf.predict(test_data)
    test_acc = accuracy_score(test_label, p)
    return [train_acc, valid_acc, test_acc]
def setup(self):
    """
    This function ...
    :return:
    """
    # Call the setup of the base class
    super(Classifier, self).setup()

    # Create the vector classifier
    self.vector_classifier = svm.SVC(gamma=0.001, C=100.)  # support vector classification

    # Determine the path to the collection directory for the current mode
    collection_mode_path = os.path.join(self.collection_user_path, self.config.mode)

    # Determine the paths to the 'yes' and 'no' saturation collection directories
    self.yes_path = os.path.join(collection_mode_path, "yes")
    self.no_path = os.path.join(collection_mode_path, "no")

    # Determine the path to the classification directory for the current mode
    self.classification_mode_path = os.path.join(self.classification_user_path, self.config.mode)

# -----------------------------------------------------------------
def test_visualize():
    pytest.importorskip('graphviz')
    X, y = make_classification(n_samples=100, n_classes=2, flip_y=.2, random_state=0)
    clf = SVC(random_state=0)
    grid = {'C': [.1, .5, .9]}
    gs = dcv.GridSearchCV(clf, grid).fit(X, y)
    assert hasattr(gs, 'dask_graph_')

    with tmpdir() as d:
        gs.visualize(filename=os.path.join(d, 'mydask'))
        assert os.path.exists(os.path.join(d, 'mydask.png'))

    # Doesn't work if not fitted
    gs = dcv.GridSearchCV(clf, grid)
    with pytest.raises(NotFittedError):
        gs.visualize()
def performSVMClass(X_train, y_train, X_test, y_test, fout, savemodel):
    """
    SVM binary Classification
    """
    # c = parameters[0]
    # g = parameters[1]
    clf = SVC()
    clf.fit(X_train, y_train)

    # if savemodel == True:
    #     fname_out = '{}-{}.pickle'.format(fout, datetime.now())
    #     with open(fname_out, 'wb') as f:
    #         cPickle.dump(clf, f, -1)

    accuracy = clf.score(X_test, y_test)
    return accuracy
def performSVMClass(X_train, y_train, X_test, y_test):
    classifier = svm.SVC()
    classifier.fit(X_train, y_train)
    results = classifier.predict(X_test)

    # colors = {1:'red', 0:'blue'}
    # df = pd.DataFrame(dict(adj=X_test[:,5], return_=X_test[:,50], label=results))
    # fig, ax = plt.subplots()
    # ax.scatter(df['adj'], df['return_'], c=df['label'].apply(lambda x: colors[x]))
    # # ax.scatter(X_test[:,5], X_test[:,50], c=y_test_list.apply(lambda x: colors[x]))
    # plt.show()
    # print y_pred
    # cm = confusion_matrix(y_test, results)
    # print cm
    # plt.figure()
    # plot_confusion_matrix(cm)
    # plt.show()

    num_correct = (results == y_test).sum()
    # float() avoids integer division under Python 2
    recall = float(num_correct) / len(y_test)
    # print "SVM model accuracy (%): ", recall * 100, "%"
    return recall * 100
def performSVMClass(X_train, y_train, X_test, y_test, fout, savemodel):
    """
    SVM binary Classification
    """
    # c = parameters[0]
    # g = parameters[1]
    clf = SVC()
    clf.fit(X_train, y_train)

    # if savemodel == True:
    #     fname_out = '{}-{}.pickle'.format(fout, datetime.now())
    #     with open(fname_out, 'wb') as f:
    #         cPickle.dump(clf, f, -1)

    accuracy = clf.score(X_test, y_test)
    print "SVM: ", accuracy
def support_vector_machine(self, sensors_set):
    features = list(self.dataset.get_sensors_set_features(sensors_set))
    print("SUPPORT VECTOR MACHINE.....")
    print("CLASSIFICATION BASED ON THESE SENSORS: ", self.dataset.get_remained_sensors(sensors_set))
    print("NUMBER OF FEATURES: ", len(features))

    train_features, train_classes, test_features, test_classes = self.__get_sets_for_classification(
        self.dataset.get_train, self.dataset.get_test, features)
    train_features_scaled, test_features_scaled = util.scale_features(train_features, test_features)

    classifier_svm = SVC(C=const.PAR_SVM_C[sensors_set],
                         gamma=const.PAR_SVM_GAMMA[sensors_set], verbose=False)
    classifier_svm.fit(train_features_scaled, train_classes)
    test_prediction = classifier_svm.predict(test_features_scaled)
    acc = accuracy_score(test_classes, test_prediction)

    print("ACCURACY : " + str(acc))
    print("END SUPPORT VECTOR MACHINE.....")

    if not os.path.exists(const.DIR_RESULTS):
        os.makedirs(const.DIR_RESULTS)
    file_content = "acc\n" + str(acc)
    with open(const.DIR_RESULTS + "/" + str(sensors_set) + const.FILE_SUPPORT_VECTOR_MACHINE_RESULTS, 'w') as f:
        f.write(file_content)

# use different algorithms changing target classes, try all combinations of two target classes
def test_support_vector_classifier(self):
    for dtype in self.number_data_type.keys():
        scikit_model = SVC(kernel='rbf', gamma=1.2, C=1)
        data = self.scikit_data['data'].astype(dtype)
        target = self.scikit_data['target'].astype(dtype) > self.scikit_data['target'].astype(dtype).mean()
        scikit_model, spec = self._sklearn_setup(scikit_model, dtype, data, target)
        coreml_model = create_model(spec)
        for idx in range(0, 10):
            test_data = data[idx].reshape(1, -1)
            try:
                self.assertEqual(
                    scikit_model.predict(test_data)[0],
                    bool(int(coreml_model.predict({'data': test_data})['target'])),
                    msg="{} != {} for Dtype: {}".format(
                        scikit_model.predict(test_data)[0],
                        bool(int(coreml_model.predict({'data': test_data})['target'])),
                        dtype))
            except RuntimeError:
                print("{} not supported. ".format(dtype))
def learn(training_data, training_labels, show_score=False, store=False):
    print("Start Learning....")
    clf = SVC(kernel='linear', probability=True, C=1)
    clf.fit(training_data, training_labels)
    print("Done Learning.")

    if store:
        print("Pickling classifier...")
        pickle.dump(clf, open(path_config.CLASSIFIER_PICKLING_FILE, 'wb'))
        print("Done Pickling.")

    if show_score:
        print("Scoring classifier ...")
        print("Data-Level Training Set Prediction Accuracy: %s" % clf.score(training_data, training_labels))
def classification_linear_svm(self):
    self.signals.PrintInfo.emit("Linear SVM")
    output_dir = self.output_dir + 'linear_svm_out/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    vectorizer = HashingVectorizer()
    fdata = vectorizer.fit_transform(self.fdata)
    trainingSet = fdata[:self.split]
    testSet = fdata[self.split:]

    classificator = SVC(kernel="linear", probability=True, C=self.linear_svm_c)
    classificator.fit(trainingSet, self.trainingClass)
    results = classificator.predict(testSet)
    proba = classificator.predict_proba(testSet)

    self.write_results_to_file(output_dir + 'results.csv', results, proba,
                               classificator.classes_, self.test_filenames)
    out_text = self.compile_result_string(results, proba, classificator.classes_,
                                          self.test_filenames)
    self.signals.PrintInfo.emit(out_text)
def classification_rbf_svm(self):
    self.signals.PrintInfo.emit("RBF SVM")
    output_dir = self.output_dir + 'rbf_svm_out/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    vectorizer = HashingVectorizer()
    fdata = vectorizer.fit_transform(self.fdata)
    trainingSet = fdata[:self.split]
    testSet = fdata[self.split:]

    classificator = SVC(gamma=2, probability=True, C=self.rbf_svm_c)
    classificator.fit(trainingSet, self.trainingClass)
    results = classificator.predict(testSet)
    proba = classificator.predict_proba(testSet)

    self.write_results_to_file(output_dir + 'results.csv', results, proba,
                               classificator.classes_, self.test_filenames)
    out_text = self.compile_result_string(results, proba, classificator.classes_,
                                          self.test_filenames)
    self.signals.PrintInfo.emit(out_text)
def fitAndPredict(self):
    # classifier = LogisticRegression()
    # classifier.fit(self.trainingSet, self.trainingLabel)
    # pred_labels = classifier.predict(self.testSet)
    # print 'Logistic:'
    # print classification_report(self.testLabel, pred_labels)

    self.classifier = SVC()
    self.classifier.fit(self.trainingSet, self.trainingLabel)
    pred_labels = {}
    for user in self.testDict:
        pred_labels[user] = self.classifier.predict([[self.BDS[user]]])
    # print 'SVM:'
    # print classification_report(self.testLabel, pred_labels)

    # classifier = DecisionTreeClassifier(criterion='entropy')
    # classifier.fit(self.trainingSet, self.trainingLabel)
    # pred_labels = classifier.predict(self.testSet)
    # print 'Decision Tree:'
    # print classification_report(self.testLabel, pred_labels)
    # return self.trainingSet, self.trainingLabel, self.testSet, self.testLabel
    return pred_labels
def fitAndPredict(self):
    corpus = self.trainingSet + self.testSet
    dictionary = corpora.Dictionary(corpus)
    corpus = [dictionary.doc2bow(text) for text in corpus]
    text_matrix = gensim.matutils.corpus2dense(corpus, num_terms=len(dictionary.token2id)).T

    if PCA_Applied:
        pca = PCA(n_components=PCA_nComponents)
        text_matrix = pca.fit_transform(text_matrix)

    classifier = LogisticRegression()
    classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
    pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
    print 'Logistic:'
    print classification_report(self.testLabel, pred_labels)

    classifier = SVC()
    classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
    pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
    print 'SVM:'
    print classification_report(self.testLabel, pred_labels)
def fitAndPredict(self):
    corpus = self.trainingSet + self.testSet
    dictionary = corpora.Dictionary(corpus)
    corpus = [dictionary.doc2bow(text) for text in corpus]
    model = models.TfidfModel(corpus)
    corpus = [text for text in model[corpus]]
    text_matrix = gensim.matutils.corpus2dense(corpus, num_terms=len(dictionary.token2id)).T

    if PCA_Applied:
        pca = PCA(n_components=PCA_nComponents)
        text_matrix = pca.fit_transform(text_matrix)

    classifier = LogisticRegression()
    classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
    pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
    print 'Logistic:'
    print classification_report(self.testLabel, pred_labels)

    classifier = SVC()
    classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
    pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
    print 'SVM:'
    print classification_report(self.testLabel, pred_labels)
def fitAndPredict(self):
    # classifier = LogisticRegression()
    # classifier.fit(self.trainingSet, self.trainingLabel)
    # pred_labels = classifier.predict(self.testSet)
    # print 'Logistic:'
    # print classification_report(self.testLabel, pred_labels)

    pred_labels = {}
    classifier = SVC()
    classifier.fit(self.trainingSet, self.trainingLabel)
    for user in self.testDict:
        pred_labels[user] = classifier.predict([[self.MUD[user], self.RUD[user], self.QUD[user]]])
    # print 'SVM:'
    # print classification_report(self.testLabel, pred_labels)
    return pred_labels

    # classifier = DecisionTreeClassifier(criterion='entropy')
    # classifier.fit(self.trainingSet, self.trainingLabel)
    # pred_labels = classifier.predict(self.testSet)
    # print 'Decision Tree:'
    # print classification_report(self.testLabel, pred_labels)
    # return self.trainingSet, self.trainingLabel, self.testSet, self.testLabel
def buildModel(dataset, method, parameters):
    """
    Build final model for predicting real testing data
    """
    features = dataset.columns[0:-1]

    if method == 'RNN':
        clf = performRNNlass(dataset[features], dataset['UpDown'])
        return clf
    elif method == 'RF':
        clf = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
    elif method == 'KNN':
        clf = neighbors.KNeighborsClassifier()
    elif method == 'SVM':
        c = parameters[0]
        g = parameters[1]
        clf = SVC(C=c, gamma=g)
    elif method == 'ADA':
        clf = AdaBoostClassifier()

    return clf.fit(dataset[features], dataset['UpDown'])
def Training_model():
    # Load the word-count feature matrix (the garbled comments in the original
    # were mojibake; they are paraphrased in English here from context)
    f = open("f://emotion/mysite/weibo_emotion/emotion_file/data_count.txt")
    f.readline()  # skip the header line
    data = np.loadtxt(f)

    # Load the class labels
    f1 = open("f://emotion/mysite/weibo_emotion/emotion_file/data_jixing.txt")
    leibie = np.loadtxt(f1)
    f.close()
    f1.close()

    # TF-IDF transformation
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(data)
    data1 = tfidf.toarray()

    # Train the SVM classifier
    clf = svm.SVC()
    clf.fit(data1, leibie)  # training the svc model
    return clf
def train():
    training_set = []
    training_labels = []
    os.chdir("/Users/muyunyan/Desktop/EC500FINAL/logo/")
    counter = 0
    a = os.listdir(".")
    for i in a:
        os.chdir(i)
        print(i)
        for d in os.listdir("."):
            img = cv2.imread(d)
            res = cv2.resize(img, (250, 250))
            gray_image = cv2.cvtColor(res, cv2.COLOR_BGR2GRAY)
            xarr = np.squeeze(np.array(gray_image).astype(np.float32))
            m, v = cv2.PCACompute(xarr)
            arr = np.array(v)
            flat_arr = arr.ravel()
            training_set.append(flat_arr)
            training_labels.append(i)
        os.chdir("..")

    trainData = training_set
    responses = training_labels
    # the original assigned the classifier to a local named `svm`, which shadows
    # the sklearn `svm` module and breaks the call; renamed to `clf`
    clf = svm.SVC()
    clf.fit(trainData, responses)
    return clf
def CAL_v(name, label_p, label_n, oracle, n_features, ftype, test_x, test_y):
    online = OnlineBase(name, label_p, label_n, oracle, n_features, ftype, error=.5)
    x, y = online.collect_pts(100, -1)
    i = 0
    q = online.get_n_query()

    C_range = np.logspace(-2, 5, 10, base=10)
    gamma_range = np.logspace(-5, 1, 10, base=10)
    param_grid = dict(gamma=gamma_range, C=C_range)

    while q < 3500:
        i += 1
        # h_ = ex.fit(x, y)
        cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2, random_state=42)
        grid = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=cv, verbose=0, n_jobs=-1)
        grid.fit(x, y)
        h_ = grid.best_estimator_

        online_ = OnlineBase('', label_p, label_n, h_.predict, n_features, ftype, error=.1)
        x_, _ = online_.collect_pts(10, 200)
        if x_ is not None and len(x_) > 0:
            x.extend(x_)
            y.extend(oracle(x_))
        q += online_.get_n_query()

        pred_y = h_.predict(test_x)
        print len(x), q, sm.accuracy_score(test_y, pred_y)
def grid_retrain_in_x(self):
    gamma_range = np.logspace(-15, 3, 19, base=2)
    param_grid = dict(gamma=gamma_range)

    if len(np.unique(self.y_ex)) < 2:
        return 1, 1

    try:
        cv = StratifiedShuffleSplit(self.y_ex, n_iter=5, test_size=.2)
        grid = GridSearchCV(SVC(C=1e5), param_grid=param_grid, cv=cv, n_jobs=-1)
        grid.fit(self.X_ex, self.y_ex)
        rbf_svc2 = grid.best_estimator_
    except ValueError:
        rbf_svc2 = SVC(C=1e5)
        rbf_svc2.fit(self.X_ex, self.y_ex)

    self.set_clf2(rbf_svc2)
    return self.benchmark()
def grid_search(self):
    C_range = np.logspace(-5, 15, 21, base=2)
    param_grid = dict(C=C_range)
    cv = StratifiedShuffleSplit(self.y_ex, n_iter=5, test_size=0.2, random_state=42)
    grid = GridSearchCV(SVC(kernel='poly', max_iter=10000), param_grid=param_grid,
                        cv=cv, n_jobs=1, verbose=0)

    logger.info('start grid search for Linear')
    grid.fit(self.X_ex, self.y_ex)
    logger.info('end grid search for Linear')

    scores = [x[1] for x in grid.grid_scores_]

    # final train
    clf = grid.best_estimator_
    pred_train = clf.predict(self.X_ex)
    pred_val = clf.predict(self.val_x)
    pred_test = clf.predict(self.test_x)

    r = Result(self.name + ' (X)', 'Poly', len(self.X_ex),
               sm.accuracy_score(self.y_ex, pred_train),
               sm.accuracy_score(self.val_y, pred_val),
               sm.accuracy_score(self.test_y, pred_test))
    return r
def fit_model(X, y):
    classifier = svm.SVC()
    parameters = {'kernel': ['poly', 'rbf', 'sigmoid'],
                  'degree': [1, 2, 3],
                  'C': [0.1, 1, 10]}
    f1_scorer = make_scorer(performance_metric, greater_is_better=True)
    clf = GridSearchCV(classifier, param_grid=parameters, scoring=f1_scorer)
    clf.fit(X, y)
    return clf

# Read student data
def create_model(self, training_articles):
    model = OneVsRestClassifier(svm.SVC(probability=True))
    features = []
    labels = []
    i = 0
    for article in training_articles:
        print("Generating features for article " + str(i) + "...")
        google_cloud_response = self.analyze_text_google_cloud(article["article"])
        relevant_entities = self.get_relevant_entities(google_cloud_response["entities"],
                                                       article["market"]["entities"],
                                                       article["market"]["wikipedia_urls"])

        # Only count this article if a relevant entity is present
        if relevant_entities:
            article_features = self.article_features(relevant_entities, article["market"],
                                                     google_cloud_response, article["article"])
            features.append(article_features)
            labels.append(article["label"])
        else:
            print("Skipping article " + str(i) + "...")
        i = i + 1

    print("Performing feature scaling...")
    scaler = preprocessing.StandardScaler().fit(features)
    features_scaled = scaler.transform(features)

    print("Fitting model...")
    model.fit(features_scaled, labels)

    print("Saving model...")
    joblib.dump(scaler, "data_analysis/caler.pkl")
    joblib.dump(model, "data_analysis/model.pkl")

    print("Done!")

# For use in prod
def classify(train=None, test=None, data=None, res_dir="res/", disp=True, outfilename=None):
    """Description of compare
    Compare multiple classifiers and display the best one.
    """
    utils.print_success("Comparison of different classifiers")

    if data is not None:
        train_features = data["train_features"]
        train_groundtruths = data["train_groundtruths"]
        test_features = data["test_features"]
        test_groundtruths = data["test_groundtruths"]
    else:
        train = utils.abs_path_file(train)
        test = utils.abs_path_file(test)
        train_features, train_groundtruths = read_file(train)
        test_features, test_groundtruths = read_file(test)

    if not utils.create_dir(res_dir):
        res_dir = utils.abs_path_dir(res_dir)

    classifiers = {
        "RandomForest": RandomForestClassifier(n_jobs=-1)
        # "RandomForest": RandomForestClassifier(n_estimators=5),
        # "KNeighbors": KNeighborsClassifier(3),
        # "GaussianProcess": GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
        # "DecisionTree": DecisionTreeClassifier(max_depth=5),
        # "MLP": MLPClassifier(),
        # "AdaBoost": AdaBoostClassifier(),
        # "GaussianNB": GaussianNB(),
        # "QDA": QuadraticDiscriminantAnalysis(),
        # "SVM": SVC(kernel="linear", C=0.025),
        # "GradientBoosting": GradientBoostingClassifier(),
        # "ExtraTrees": ExtraTreesClassifier(),
        # "LogisticRegression": LogisticRegression(),
        # "LinearDiscriminantAnalysis": LinearDiscriminantAnalysis()
    }

    for key in classifiers:
        utils.print_success(key)
        clf = classifiers[key]
        utils.print_info("\tFit")
        clf.fit(train_features, train_groundtruths)
        utils.print_info("\tPredict")
        predictions = clf.predict(test_features)
    return predictions
def classify(n=50):
    # clf = MultinomialNB(fit_prior=False)
    # clf = SVC(gamma=2, C=1, class_weight={0.0: 0.063829777, 1.0: 1.0})
    clf = SGDClassifier(loss="log", penalty="l1", class_weight={0.0: 0.022, 1.0: 1.0})
    clf.fit(mat[:n], rel[:n])
    return clf
def baseline_svm():
    # (the garbled comments in the original were mojibake; paraphrased in English from context)
    train_data = pd.read_csv(r"data/train.csv")
    print u"Training data info:\n", train_data.info()
    print u"Training data description:\n", train_data.describe()
    # display_data(train_data)          # visualize the raw data
    # display_with_process(train_data)  # visualize the preprocessed data

    process_data = pre_processData(train_data, 'process_train_data')  # preprocess the training data
    train_data = process_data.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # select feature columns
    train_np = train_data.as_matrix()  # convert to a numpy array

    '''train the model'''
    X = train_np[:, 1:]
    y = train_np[:, 0]
    model = svm.SVC(C=1.0, tol=1e-6).fit(X, y)
    # print pd.DataFrame({"columns": list(train_data.columns)[1:], "coef_": list(model.coef_.T)})

    '''predict on the test data'''
    test_data = pd.read_csv(r"data/test.csv")
    process_test_data = pre_processData(test_data, 'process_test_data')  # preprocess the test data
    test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    test_np = test_data.as_matrix()
    predict = model.predict(test_np)
    result = pd.DataFrame(data={'PassengerId': process_test_data['PassengerId'].as_matrix(),
                                'Survived': predict.astype(np.int32)})
    result.to_csv(r'baseline_svm_result/prediction.csv', index=False)
    # baseline submission score: 0.76077
def baseline_svm_crossValidate():
    # (the garbled comments in the original were mojibake; paraphrased in English from context)
    origin_train_data = pd.read_csv(r"data/train.csv")
    process_data = pre_processData(origin_train_data, 'process_train_data')  # preprocess the training data
    process_data_train, process_data_cv = train_test_split(process_data, test_size=0.2)
    train_data = process_data_train.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # select feature columns
    train_np = train_data.as_matrix()  # convert to a numpy array

    '''train the model'''
    X_train = train_np[:, 1:]
    y_train = train_np[:, 0]
    model = svm.SVC(kernel='rbf', tol=1e-6).fit(X_train, y_train)
    # print pd.DataFrame({"columns": list(train_data.columns)[1:], "coef_": list(model.coef_.T)})

    cv_data = process_data_cv.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    cv_np = cv_data.as_matrix()
    X_cv = cv_np[:, 1:]
    y_cv = cv_np[:, 0]
    predictions = model.predict(X_cv)
    print np.float32(np.sum(predictions == y_cv)) / np.float32(predictions.shape[0])

    error_items = origin_train_data.loc[origin_train_data['PassengerId'].isin(
        process_data_cv[predictions != y_cv]['PassengerId'].values)]
    predictions_item = pd.DataFrame(data=process_data_cv[predictions != y_cv]['PassengerId'])
    predictions_item.columns = ['error_PassengerId']
    # error_items = error_items.reset_index(drop=True)
    error_result = pd.concat([error_items, predictions_item], axis=1)
    error_result.to_csv(r'error.csv', index=False)

    '''predict on the test data'''
    '''test_data = pd.read_csv(r"data/test.csv")
    process_test_data = pre_processData(test_data, 'process_test_data', optimize=False)
    test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    test_np = test_data.as_matrix()
    predict = model.predict(test_np)
    result = pd.DataFrame(data={'PassengerId': process_test_data['PassengerId'].as_matrix(),
                                'Survived': predict.astype(np.int32)})
    result.to_csv(r'svm_result/prediction.csv', index=False)'''
    # baseline with cross validation
def svc_model(self):
    model = svm.SVC(probability=True, C=0.3, kernel='linear')
    return model
def use_SVM(X_data, y_data):
    p_gamma = 0.1
    p_C = 10
    svm = SVC(kernel='rbf', random_state=0, gamma=p_gamma, C=p_C, probability=True)
    svm.fit(X_data, y_data)
    joblib.dump(svm, "./sklearn_model/svm_trainval1_{param1}_{param2}".format(param1=p_gamma, param2=p_C))
    return svm
def classifier_train(feature_matrix_0, feature_matrix_1, algorithm='SVM'):
    """
    Trains a binary classifier using the SVM algorithm with the following parameters

    Arguments
        feature_matrix_0: Matrix with examples for Class 0
        feature_matrix_1: Matrix with examples for Class 1
        algorithm: Currently only SVM is supported

    Outputs
        classifier: trained classifier (scikit object)
        mu_ft, std_ft: normalization parameters for the data
    """
    # Create vector Y (class labels)
    class0 = np.zeros((feature_matrix_0.shape[0], 1))
    class1 = np.ones((feature_matrix_1.shape[0], 1))

    # Concatenate feature matrices and their respective labels
    y = np.concatenate((class0, class1), axis=0)
    features_all = np.concatenate((feature_matrix_0, feature_matrix_1), axis=0)

    # Normalize inputs
    mu_ft = np.mean(features_all)
    std_ft = np.std(features_all)
    X = (features_all - mu_ft) / std_ft

    # Train SVM, using default parameters
    classifier = svm.SVC()
    classifier.fit(X, y)
    return classifier, mu_ft, std_ft