The following 50 code examples, extracted from open-source Python projects, illustrate how to use sklearn.cross_validation.train_test_split().
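Before the project examples, here is a minimal, self-contained sketch of the basic call pattern that the snippets below share. The variable names are illustrative, and note that sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20; on current versions the same function is imported from sklearn.model_selection instead.

# Minimal usage sketch (assumes an older scikit-learn where sklearn.cross_validation still exists;
# on modern versions import train_test_split from sklearn.model_selection instead).
from sklearn.cross_validation import train_test_split
from sklearn import datasets

iris = datasets.load_iris()
X, y = iris.data, iris.target

# Hold out 25% of the rows for testing; random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
print(X_train.shape, X_test.shape)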
def main():
    iris = datasets.load_iris()
    x = iris.data
    y = iris.target
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.5)

    clrTree = tree.DecisionTreeClassifier()
    clrTree = clrTree.fit(x_train, y_train)
    outTree = clrTree.predict(x_test)

    clrKN = KNeighborsClassifier()
    clrKN = clrKN.fit(x_train, y_train)
    outKN = clrKN.predict(x_test)

    # Prediction accuracy
    print("Accuracy for Decision Tree Classifier: " + str(accuracy_score(y_test, outTree)*100) + "%")
    print("Accuracy for KNeighbors Classifier: " + str(accuracy_score(y_test, outKN)*100) + "%")
def main():
    iris = datasets.load_iris()
    x = iris.data
    y = iris.target
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.5)

    clr = NewClassifier()
    clr.fit(x_train, y_train)
    prediction = clr.predict(x_test)

    # Prediction accuracy
    print("Accuracy: " + str(accuracy_score(y_test, prediction) * 100) + "%")

# Run main
def do_ml(ticker):
    X, y, df = extract_featuresets(ticker)
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.25)

    #clf = neighbors.KNeighborsClassifier()
    clf = VotingClassifier([('lsvc', svm.LinearSVC()),
                            ('knn', neighbors.KNeighborsClassifier()),
                            ('rfor', RandomForestClassifier())])
    clf.fit(X_train, y_train)
    confidence = clf.score(X_test, y_test)
    print('accuracy:', confidence)

    predictions = clf.predict(X_test)
    print('predicted class counts:', Counter(predictions))
    print()
    print()
    return confidence

# examples of running:
def threshold_estimate(x, y):
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(x, y, test_size=0.1, random_state=0)
    weight = float(len(y_train[y_train == 0]))/float(len(y_train[y_train == 1]))
    w1 = np.array([1]*y_train.shape[0])
    w1[y_train == 1] = weight
    print("samples: %d %d %f" % (x_train.shape[0], x_test.shape[0], weight))

    estimator = xgb.XGBClassifier(max_depth=10, learning_rate=0.1, n_estimators=1000, nthread=50)
    estimator.fit(x_train, y_train, sample_weight=w1)

    y_scores = estimator.predict_proba(x_test)[:,1]
    precision, recall, thresholds = precision_recall_curve(y_test, y_scores)
    f1 = 2*precision[2:]*recall[2:]/(precision[2:]+recall[2:])
    m_idx = np.argmax(f1)
    m_thresh = thresholds[2+m_idx]
    print("%d %f %f" % (precision.shape[0], f1[m_idx], m_thresh))
    return m_thresh

# Estimate threshold for the classifier using inner-round cross validation
def load_data():
    global training_data, testing_data

    lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)

    xs = lfw_people.data
    ys = lfw_people.target

    inputs = []
    labels = list(ys)

    for face in xs:
        V = Vol(50, 37, 1, 0.0)
        V.w = list(face)
        inputs.append(augment(V, 30))

    x_tr, x_te, y_tr, y_te = train_test_split(inputs, labels, test_size=0.25)

    training_data = zip(x_tr, y_tr)
    testing_data = zip(x_te, y_te)

    print 'Dataset made...'
def best_shape_clustering(mols, nb_layers, k_range=range(3, 20), train_ratio=0.8, cluster_key='shape_cid'):
    from sklearn.cross_validation import train_test_split
    from sklearn.metrics import silhouette_score

    shape_df = mols['dynamic'].apply(lambda x: temporal_shape(x, nb_layers))
    train_idx, test_idx = train_test_split(shape_df.index.values, train_size=train_ratio)
    train_mat = np.array(list(shape_df[shape_df.index.isin(train_idx)].values))
    full_mat = np.array(list(shape_df.values))

    centroids = None
    labels = None
    best_score = 0
    for k in k_range:
        res = cluster_shapes(train_mat, full_mat, k)
        score = silhouette_score(full_mat, res[1])
        if score > best_score:
            centroids = res[0]
            labels = res[1]
            best_score = score

    mols[cluster_key] = labels
    return mols, centroids
def rede_neural(X, y):
    print("Starting neural network training")
    X2 = normalize(X)

    clf = MLPClassifier(hidden_layer_sizes=(100, 50), activation='tanh',
                        algorithm='adam', alpha=1e-5, learning_rate='constant',
                        tol=1e-8, learning_rate_init=0.0002,
                        early_stopping=True, validation_fraction=0.2)

    kf = KFold(len(y), n_folds=3)
    i = 0
    for train, test in kf:
        start = time.time()
        i = i + 1
        print("Training fold", i)
        # split the dataset into train and test
        #X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=1)
        X_train, X_test, y_train, y_test = X2[train], X2[test], y[train], y[test]
        # fit
        clf.fit(X_train, y_train)
        print("score:", clf.score(X_test, y_test), "(", (time.time()-start)/60.0, "minutes )")

    return clf
def split_to_test_and_train(data, labels, entities, test_size=DEFAULT_TEST_SIZE):
    d_train, d_test, l_train, l_test, c_train, c_test = train_test_split(data, labels, entities, test_size=test_size)

    d_test_2 = []
    l_test_2 = []
    c_test_2 = []

    train_dict = {}
    for d in d_train:
        train_dict[d] = 1

    for d, l, c in zip(d_test, l_test, c_test):
        if (train_dict.has_key(d)):
            continue
        d_test_2.append(d)
        l_test_2.append(l)
        c_test_2.append(c)

    return (d_train, d_test_2, l_train, l_test_2, c_train, c_test_2)

# utility to extract entities from preprocessed files
def get_train_test(pandas_data, target_col):
    # Separating target from the rest of the data
    x = pandas_data.drop(target_col, 1)
    x = data_scaling.scale_numeric_data(x)

    # Selection of training/target data for validation and training.
    target_loc = pandas_data.columns.get_loc(target_col)
    data = pd.DataFrame.as_matrix(pandas_data)
    y = data[:, target_loc]
    x = pd.DataFrame.as_matrix(x)

    # Selecting training and test sets
    return cross_validation.train_test_split(x, y, test_size=0.2)

# Removes the target column from the input data.
# Returns two DataFrames.
def getDatas(dataset_dir_name):
    movie_reviews = load_files(dataset_dir_name)

    doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = train_test_split(
        movie_reviews.data, movie_reviews.target, test_size=0.2, random_state=0)

    # build a word tokenizer from the CountVectorizer
    vectorizer = CountVectorizer(binary=True, decode_error=u'ignore')
    word_tokenizer = vectorizer.build_tokenizer()

    # segment each document into a list of terms
    doc_terms_list_train = list(getChList(doc_str) for doc_str in doc_str_list_train)
    doc_terms_list_test = list(getChList(doc_str) for doc_str in doc_str_list_test)

    return vectorizer, doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test, doc_terms_list_train
def fastLapModel(xList, labels, names, multiple=0, full_set=0):
    X = numpy.array(xList)
    y = numpy.array(labels)
    featureNames = []
    featureNames = numpy.array(names)

    # take fixed holdout set 30% of data rows
    xTrain, xTest, yTrain, yTest = train_test_split(
        X, y, test_size=0.30, random_state=531)

    # for final model (no CV)
    if full_set:
        xTrain = X
        yTrain = y

    check_set(xTrain, xTest, yTrain, yTest)
    print "Fitting the model to the data set..."

    # train random forest at a range of ensemble sizes in order to see how the
    # mse changes
    mseOos = []
    m = 10 ** multiple
    nTreeList = range(500 * m, 1000 * m, 100 * m)
    # iTrees = 10000
    for iTrees in nTreeList:
        depth = None
        maxFeat = int(np.sqrt(np.shape(xTrain)[1])) + 1  # try tweaking
        RFmd = ensemble.RandomForestRegressor(n_estimators=iTrees, max_depth=depth,
                                              max_features=maxFeat, oob_score=False,
                                              random_state=531, n_jobs=-1)
        # RFmd.n_features = 5
        RFmd.fit(xTrain, yTrain)

        # Accumulate mse on test set
        prediction = RFmd.predict(xTest)
        mseOos.append(mean_squared_error(yTest, prediction))

    # plot training and test errors vs number of trees in ensemble
    plot.plot(nTreeList, mseOos)
    plot.xlabel('Number of Trees in Ensemble')
    plot.ylabel('Mean Squared Error')
    #plot.ylim([0.0, 1.1*max(mseOob)])
    plot.show()

    print("MSE")
    print(mseOos[-1])
    return xTrain, xTest, yTrain, yTest, RFmd
def build_decision_tree(filename):
    """Build and score a decision tree classifier from a CSV file."""
    f = open(filename, 'r')
    reader = csv.reader(f)
    x = []
    y = []
    for line in reader:
        if line[1] in ['1', '2', '3']:  # keep only rows whose class label is 1, 2 or 3
            x.append(line[2:4] + line[5:])
            y.append(line[1])

    x_train, x_test, y_train, y_test = cross_validation.train_test_split(x, y, test_size=0.2, random_state=42)
    clf = tree.DecisionTreeClassifier(max_depth=5)
    clf = clf.fit(x_train, y_train)
    score = clf.score(x_test, y_test)
    print score
    return clf, score
def train_xgboost():
    df = pd.read_csv('data/stage1_labels.csv')
    print(df.head())

    x = np.array([np.mean(np.load('npy_result/%s.npy' % str(id)), axis=0) for id in df['id'].tolist()])
    y = df['cancer'].as_matrix()

    trn_x, val_x, trn_y, val_y = cross_validation.train_test_split(x, y, random_state=42, stratify=y, test_size=0.20)

    clf = xgb.XGBRegressor(max_depth=10, n_estimators=1500, min_child_weight=9,
                           learning_rate=0.05, nthread=8, subsample=0.80,
                           colsample_bytree=0.80, seed=4242)

    clf.fit(trn_x, trn_y, eval_set=[(val_x, val_y)], verbose=True,
            eval_metric='logloss', early_stopping_rounds=50)
    return clf
def pipeline(iteration, C, gamma, random_seed):
    x_train, _x, y_train, _y = train_test_split(train_x, train_y, test_size=0.4, random_state=random_seed)
    print x_train.shape

    clf = SVC(C=C, kernel='rbf', gamma=gamma, probability=True, cache_size=7000,
              class_weight='balanced', verbose=True, random_state=random_seed)
    clf.fit(x_train, y_train)

    # predict test set
    pred = clf.predict_proba(test_x)
    test_result = pd.DataFrame(columns=["Idx", "score"])
    test_result.Idx = test_Idx
    test_result.score = pred[:,1]
    test_result.to_csv('./test/svm_{0}.csv'.format(iteration), index=None)

    # predict val set
    pred = clf.predict_proba(val_x)
    val_result = pd.DataFrame(columns=["Idx", "score"])
    val_result.Idx = val_Idx
    val_result.score = pred[:,1]
    val_result.to_csv('./val/svm_{0}.csv'.format(iteration), index=None)
def splitValidateModel(self, visualizePredictions=False):
    (label_vector, input_vector) = loadData(self.featureFile)

    indexArray = range(0, len(input_vector))
    trainData, testData, trainLabels, expectedLabels, trainIndices, testIndices = \
        cross_validation.train_test_split(input_vector, label_vector, indexArray, test_size=(1.0 - self.percentSplit))

    kNNClassifier = neighbors.KNeighborsClassifier(self.n_neighbors, weights='distance')
    kNNClassifier.fit(trainData, trainLabels)
    predictedLabels = kNNClassifier.predict(testData)

    print("Classification report for classifier %s:\n%s\n" % ('k-NearestNeighbour', metrics.classification_report(expectedLabels, predictedLabels)))
    print("Confusion matrix:\n%s" % metrics.confusion_matrix(expectedLabels, predictedLabels))
    print('Split Validation training :: Done.\n')

    if visualizePredictions:
        self.__visualizePredictedDataset__(input_vector, testIndices, predictedLabels, expectedLabels)
def trainLimited(self, featureFile, n_datapoints):
    (label_vector, input_vector) = loadData(featureFile)

    trainData, testData, trainLabels, testLabels = \
        cross_validation.train_test_split(input_vector, label_vector, test_size=(0))

    n_totalrows = int((len(label_vector)/n_datapoints))
    for n in range(0, n_totalrows):
        limited_label_vector = trainLabels[0: (n+1) * n_datapoints]
        limited_input_vector = trainData[0: (n+1) * n_datapoints]

        kNNClassifier = neighbors.KNeighborsClassifier(self.n_neighbors, weights='distance')
        kNNClassifier.fit(limited_input_vector, limited_label_vector)

        scores = cross_validation.cross_val_score(kNNClassifier, limited_input_vector, limited_label_vector, cv=5)
        print '%f on %d datapoints' % ((sum(scores) / len(scores)), len(limited_label_vector))
def trainLimitedSoftmax(self, featureFile, n_datapoints):
    (label_vector, input_vector) = self.__loadData__(featureFile)

    n_totalrows = int((len(label_vector)/n_datapoints))
    k = []

    trainData, testData, trainLabels, testLabels = \
        cross_validation.train_test_split(input_vector, label_vector, test_size=(0.2))

    for n in range(0, n_totalrows):
        limited_label_vector = trainLabels[0: (n+1) * n_datapoints]
        limited_input_vector = trainData[0: (n+1) * n_datapoints]

        _, maxVal = self.trainSoftmaxWithData(limited_input_vector, limited_label_vector, 1000)
        print 'Total Average Value: %s \n\n' % (maxVal)
        k.append(maxVal)

    print('Limited Softmax training result ----------')
    for i in range(0, len(k)):
        print '%f on %d datapoints' % (k[i], (n_datapoints * (i+1)))
    print '------------------------------------------'
def load_split_data(grid_variable):
    """
    Load train_validation and validation data sets for testing and tuning
    different machine learning models.
    """
    # Set work directory
    os.chdir('C://Users//thep3//OneDrive//Documents//Kaggle//Facebook V - Predicting Check Ins//data//')

    # Load data
    train = pd.read_csv("train_modified.csv", sep=",")
    grid_variables = ['grid_cell_20x40', 'grid_cell_50x50', 'grid_cell_100x100',
                      'grid_cell_50x100', 'grid_cell_75x150', 'grid_cell_100x200']
    grid_variables.remove(grid_variable)
    train = train.drop(grid_variables, 1)
    train, test = train_test_split(train, test_size=0.3, random_state=0)

    # Return data
    return train, test
def split_dataset(data_set, split=0.5):
    '''
    Split the dataset into a train set and a test set according to 'split'.
    :param data_set: a Bunch object
    :param split: fraction of the data to hold out as the test set
    :return: x_train, x_test, y_train, y_test: training data and target values
    '''
    print('splitting dataset......')
    start_time = time.time()

    x_train, x_test, y_train, y_test = cross_validation.train_test_split(data_set.data, data_set.target,
                                                                         test_size=split, random_state=0)

    print('splitting took %.2f s' % (time.time() - start_time))
    # train_set=(x_train,y_train)
    # test_set=(x_test,y_test)
    # return train_set,test_set
    return x_train, x_test, y_train, y_test
def test_onehot():
    data = load_boston()
    X, y = data['data'], data['target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=333)

    train = pd.DataFrame(X_train)
    test = pd.DataFrame(X_test)

    t_train, t_test = onehot_features(train.copy(deep=True), test.copy(deep=True),
                                      [8, 1, 12], full=False, dummy_na=True)
    assert t_train.shape[1] == t_test.shape[1]
    assert t_train.shape[1] == 441

    t_train, t_test = onehot_features(train.copy(deep=True), test.copy(deep=True),
                                      [8, 1, 12], full=True, dummy_na=False)
    assert t_train.shape[1] == t_test.shape[1]
    assert t_train.shape[1] == 500
def baseline_logisticRegression():
    train_data = pd.read_csv(r"data/train.csv")
    #print u"Data info:\n", train_data.info()
    #print u'Data description:\n', train_data.describe()
    #display_data(train_data)  # inspect the raw data
    #display_with_process(train_data)  # inspect the data after preprocessing

    process_data = pre_processData(train_data, 'process_train_data')  # preprocess the training data
    train_data = process_data.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # keep only the feature columns
    train_np = train_data.as_matrix()  # convert to a numpy array

    '''train the model'''
    X = train_np[:,1:]
    y = train_np[:,0]
    #=X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    #=model = linear_model.LogisticRegression(C=1.0, tol=1e-6).fit(X_train, y_train)
    model = linear_model.LogisticRegression(C=1.0, tol=1e-6).fit(X, y)
    print pd.DataFrame({"columns": list(train_data.columns)[1:], "coef_": list(model.coef_.T)})
    #=prediction = model.predict(X_test)
    #=cv_error = pd.DataFrame(data=list(X_test[np.where(prediction!=y_test)]), columns=list(train_data.columns)[1:])
    #=cv_error.to_csv(r'error.csv', index=True)
    #=print np.float32(np.sum(prediction == y_test))/np.float32(prediction.shape[0])

    '''predict on the test set'''
    test_data = pd.read_csv(r"data/test.csv")
    process_test_data = pre_processData(test_data, 'process_test_data')  # preprocess the test data
    test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    test_np = test_data.as_matrix()
    predict = model.predict(test_np)
    result = pd.DataFrame(data={'PassengerId': process_test_data['PassengerId'].as_matrix(), 'Survived': predict.astype(np.int32)})
    result.to_csv(r'baseline_logisticRegression_result/prediction.csv', index=False)
    #clf = linear_model.LogisticRegression(C=1.0, tol=1e-6)
    #print cross_validation.cross_val_score(clf, X, y, cv=5)

# baseline SVM score: 0.78947
def baseline_randomForest():
    train_data = pd.read_csv(r"data/train.csv")
    print u"Data info:\n", train_data.info()
    print u'Data description:\n', train_data.describe()
    #display_data(train_data)  # inspect the raw data
    #display_with_process(train_data)  # inspect the data after preprocessing

    process_data = pre_processData(train_data, 'process_train_data', optimize=False)  # preprocess the training data
    train_data = process_data.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # keep only the feature columns
    train_np = train_data.as_matrix()  # convert to a numpy array

    '''train the model'''
    X = train_np[:,1:]
    y = train_np[:,0]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    model = RandomForestClassifier(n_estimators=100).fit(X, y)
    #predictions = model.predict(X_test)
    #print np.float32(np.sum(predictions == y_test))/np.float32(predictions.shape[0])

    '''predict on the test set'''
    test_data = pd.read_csv(r"data/test.csv")
    process_test_data = pre_processData(test_data, 'process_test_data', optimize=False)  # preprocess the test data
    test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    test_np = test_data.as_matrix()
    predict = model.predict(test_np)
    result = pd.DataFrame(data={'PassengerId': process_test_data['PassengerId'].as_matrix(), 'Survived': predict.astype(np.int32)})
    result.to_csv(r'baseline_randomForest_result/prediction.csv', index=False)

# baseline SVM with cross validation
def baseline_svm_crossValidate():
    origin_train_data = pd.read_csv(r"data/train.csv")
    process_data = pre_processData(origin_train_data, 'process_train_data')  # preprocess the training data
    process_data_train, process_data_cv = train_test_split(process_data, test_size=0.2)

    train_data = process_data_train.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # keep only the feature columns
    train_np = train_data.as_matrix()  # convert to a numpy array

    '''train the model'''
    X_train = train_np[:,1:]
    y_train = train_np[:,0]
    model = svm.SVC(kernel='rbf', tol=1e-6).fit(X_train, y_train)
    #print pd.DataFrame({"columns": list(train_data.columns)[1:], "coef_": list(model.coef_.T)})

    cv_data = process_data_cv.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    cv_np = cv_data.as_matrix()
    X_cv = cv_np[:,1:]
    y_cv = cv_np[:,0]
    predictions = model.predict(X_cv)
    print np.float32(np.sum(predictions == y_cv))/np.float32(predictions.shape[0])

    error_items = origin_train_data.loc[origin_train_data['PassengerId'].isin(process_data_cv[predictions != y_cv]['PassengerId'].values)]
    predictions_item = pd.DataFrame(data=process_data_cv[predictions != y_cv]['PassengerId'])
    predictions_item.columns = ['error_PassengerId']
    # error_items = error_items.reset_index(drop=True)
    error_result = pd.concat([error_items, predictions_item], axis=1)
    error_result.to_csv(r'error.csv', index=False)

    '''predict on the test set'''
    '''test_data = pd.read_csv(r"data/test.csv")
    process_test_data = pre_processData(test_data, 'process_test_data', optimize=False)  # preprocess the test data
    test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    test_np = test_data.as_matrix()
    predict = model.predict(test_np)
    result = pd.DataFrame(data={'PassengerId': process_test_data['PassengerId'].as_matrix(), 'Survived': predict.astype(np.int32)})
    result.to_csv(r'svm_result/prediction.csv', index=False)'''

# baseline logistic regression with cross validation
def baseline_logisticRegression_crossValidate():
    origin_train_data = pd.read_csv(r"data/train.csv")
    process_data = fe_preprocessData(origin_train_data, 'process_train_data')  # preprocess the training data
    process_data_train, process_data_cv = train_test_split(process_data, test_size=0.2)

    train_data = process_data_train.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # keep only the feature columns
    train_np = train_data.as_matrix()  # convert to a numpy array

    '''train the model'''
    X_train = train_np[:,1:]
    y_train = train_np[:,0]
    model = linear_model.LogisticRegression(C=1.0, tol=1e-6).fit(X_train, y_train)
    print pd.DataFrame({'columns': list(train_data.columns[1:]), 'coef_': list(model.coef_.T)})

    cv_data = process_data_cv.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    cv_np = cv_data.as_matrix()
    X_cv = cv_np[:,1:]
    y_cv = cv_np[:,0]
    predictions = model.predict(X_cv)
    print np.float32(np.sum(predictions == y_cv))/np.float32(predictions.shape[0])

    '''save the misclassified cross-validation rows for inspection'''
    error_items = origin_train_data.loc[origin_train_data['PassengerId'].isin(process_data_cv[predictions != y_cv]['PassengerId'].values)]
    predictions_item = pd.DataFrame(data=process_data_cv[predictions != y_cv]['PassengerId'])
    predictions_item.columns = ['error_PassengerId']
    error_result = pd.concat([error_items, predictions_item], axis=1)
    error_result.to_csv(r'error.csv', index=False)
    #=print pd.DataFrame({"columns": list(train_data.columns)[1:], "coef_": list(model.coef_.T)})
    #=prediction = model.predict(X_test)
    #=print np.float32(np.sum(prediction == y_test))/np.float32(prediction.shape[0])

    '''predict on the test set'''
    '''test_data = pd.read_csv(r"data/test.csv")
    process_test_data = fe_preprocessData(test_data, 'process_test_data', optimize=True)  # preprocess the test data
    test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    test_np = test_data.as_matrix()
    predict = model.predict(test_np)
    result = pd.DataFrame(data={'PassengerId': process_test_data['PassengerId'].as_matrix(), 'Survived': predict.astype(np.int32)})
    result.to_csv(r'logisticRegression_result/prediction.csv', index=False)'''
    #clf = linear_model.LogisticRegression(C=1.0, tol=1e-6)
    #print cross_validation.cross_val_score(clf, X, y, cv=5)
def optimize_logisticRegression():
    train_data = pd.read_csv(r"data/train.csv")
    print u"Data info:\n", train_data.info()
    print u'Data description:\n', train_data.describe()
    #display_data(train_data)  # inspect the raw data
    #display_with_process(train_data)  # inspect the data after preprocessing

    process_data = fe_preprocessData(train_data, 'process_train_data')  # preprocess the training data
    train_data = process_data.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # keep only the feature columns
    train_np = train_data.as_matrix()  # convert to a numpy array

    '''train the model'''
    X = train_np[:,1:]
    y = train_np[:,0]
    #=X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    #=model = linear_model.LogisticRegression(C=1.0, tol=1e-6).fit(X_train, y_train)
    model = linear_model.LogisticRegression(C=1.0, tol=1e-6).fit(X, y)
    print pd.DataFrame({"columns": list(train_data.columns)[1:], "coef_": list(model.coef_.T)})

    '''predict on the test set'''
    test_data = pd.read_csv(r"data/test.csv")
    process_test_data = fe_preprocessData(test_data, 'process_test_data')  # preprocess the test data
    test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    test_np = test_data.as_matrix()
    predict = model.predict(test_np)
    result = pd.DataFrame(data={'PassengerId': process_test_data['PassengerId'].as_matrix(), 'Survived': predict.astype(np.int32)})
    result.to_csv(r'optimize_logisticRegression_result/prediction.csv', index=False)
    #clf = linear_model.LogisticRegression(C=1.0, tol=1e-6)
    #print cross_validation.cross_val_score(clf, X, y, cv=5)
def logistic_test(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)
    model = LogisticRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print 'First round:', metrics.accuracy_score(y_test, y_pred)

    # tune parameter C
    crange = [0.01, 0.1, 1, 10, 100]
    for num in crange:
        model = LogisticRegression(C=num)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print 'C=', num, ',score=', metrics.accuracy_score(y_test, y_pred)
def svm_test(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)
    model = svm.LinearSVC(C=1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print 'First round:', metrics.accuracy_score(y_test, y_pred)

    # tune parameter C
    crange = [0.01, 0.1, 1, 10, 100]
    for num in crange:
        model = svm.LinearSVC(C=num)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print 'C=', num, ',score=', metrics.accuracy_score(y_test, y_pred)
def nb_test(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    model = MultinomialNB()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print metrics.accuracy_score(y_test, y_pred)
def rf_test(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)
    rf_model = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    print metrics.accuracy_score(y_test, y_pred)

# plot confusion_matrix, 'col' is the y target
def constructModel(corpus, classList, features, modelOutput):
    """
    Trains a Decision Tree model on the test corpus.

    Args:
        corpus: A list of lists, containing the GC content, coverage, and class number.
        classList: A list of class names.
        features: List of variables used by each contig.
        modelOutput: Location to save model as GraphViz DOT, or False to save no model.

    Returns:
        classifier: A DecisionTreeClassifier object that has been trained on the test corpus.
    """
    corpus.sort()  # just in case

    X = []
    Y = []
    for item in corpus:
        X.append(item[:-1])  # all but the last item
        Y.append(item[-1])   # only the last item
    X_train, X_test, Y_train, Y_test = mscv.train_test_split(X, Y, test_size=0.3, random_state=0)

    # TODO: implement classifier testing and comparison, now only baggingClassifier is used as per paper
    #treeClassifier = tree.DecisionTreeClassifier()
    #treeClassifier = treeClassifier.fit(X_train, Y_train)
    #click.echo("Decision tree classifier built, score is %s out of 1.00" % treeClassifier.score(X_test, Y_test))
    baggingClassifier = ensemble.BaggingClassifier()
    baggingClassifier = baggingClassifier.fit(X_train, Y_train)
    click.echo("Bagging classifier built, score is %s out of 1.00" % baggingClassifier.score(X_test, Y_test))
    #forestClassifier = ensemble.RandomForestClassifier(n_estimators=10)
    #forestClassifier = forestClassifier.fit(X_train, Y_train)
    #click.echo("Random forest classifier built, score is %s out of 1.00" % forestClassifier.score(X_test, Y_test))
    #adaClassifier = ensemble.AdaBoostClassifier(n_estimators=100)
    #adaClassifier = adaClassifier.fit(X_train, Y_train)
    #click.echo("AdaBoost classifier built, score is %s out of 1.00" % adaClassifier.score(X_test, Y_test))
    #gradientClassifier = ensemble.GradientBoostingClassifier(n_estimators=100)
    #gradientClassifier = gradientClassifier.fit(X_train, Y_train)
    #click.echo("Gradient tree boosting classifier built, score is %s out of 1.00" % gradientClassifier.score(X_test, Y_test))

    if modelOutput:
        with open(modelOutput, 'w') as dotfile:
            tree.export_graphviz(baggingClassifier, out_file=dotfile, feature_names=features,
                                 class_names=classList, filled=True, rounded=True,
                                 special_characters=True)
    return baggingClassifier
def train_test():
    df = pd.read_csv("data_preprocessed.csv", header=None)
    label_cols = df.columns[0:2]
    Y = df[label_cols]
    feature_cols = df.columns[2:len(df.columns)]
    X = df[feature_cols]

    X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=1)

    train_df = pd.concat([y_train, X_train], axis=1)
    test_df = pd.concat([y_test, X_test], axis=1)
    return train_df, test_df
def train():
    os.chdir(dname)
    for selected_stock in onlyfiles:
        df = pd.read_csv(os.path.join('data_files', selected_stock))

        # preprocessing the data
        df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']]
        # measure of volatility
        df['HL_PCT'] = (df['Adj. High'] - df['Adj. Low']) / df['Adj. Low'] * 100.0
        df['PCT_change'] = (df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'] * 100.0
        df = df[['Adj. Close', 'HL_PCT', 'PCT_change', 'Adj. Volume']]

        forecast_col = 'Adj. Close'
        df.fillna(value=-99999, inplace=True)
        forecast_out = int(math.ceil(0.01 * len(df)))
        df['label'] = df[forecast_col].shift(-forecast_out)

        X = np.array(df.drop(['label'], 1))
        X = preprocessing.scale(X)
        X_lately = X[-forecast_out:]
        X = X[:-forecast_out]
        df.dropna(inplace=True)
        y = np.array(df['label'])

        X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2)

        svr = SVR()
        pickle.dump(svr, open(join(dname+'/models/svr_unfit/', selected_stock+'svr.sav'), 'wb'))
        svr.fit(X_train, y_train)

        lr = LinearRegression()
        pickle.dump(lr, open(join(dname+'/models/lr_unfit/', selected_stock+'lr.sav'), 'wb'))
        lr.fit(X_train, y_train)

        mlp = MLPRegressor()
        pickle.dump(mlp, open(join(dname+'/models/mlp_unfit/', selected_stock+'mlp.sav'), 'wb'))
        mlp.fit(X_train, y_train)

        pickle.dump(svr, open(join(dname+'/models/svr_fit/', selected_stock+'svr.sav'), 'wb'))
        pickle.dump(lr, open(join(dname+'/models/lr_fit/', selected_stock+'lr.sav'), 'wb'))
        pickle.dump(mlp, open(join(dname+'/models/mlp_fit/', selected_stock+'mlp.sav'), 'wb'))

        print(selected_stock + " - trained")
def read(self, nb_classes, img_rows=IMAGE_SIZE, img_cols=IMAGE_SIZE, img_channels=3):
    images, labels = extract_data('./train/')
    labels = np.reshape(labels, [-1])  # numpy.reshape
    X_train, X_test, y_train, y_test = train_test_split(images, labels, test_size=0.3,
                                                        random_state=random.randint(0, 100))
    X_valid, X_test, y_valid, y_test = train_test_split(images, labels, test_size=0.5,
                                                        random_state=random.randint(0, 100))

    X_train = X_train.reshape(X_train.shape[0], img_rows, img_cols, 3)
    X_valid = X_valid.reshape(X_valid.shape[0], img_rows, img_cols, 3)
    X_test = X_test.reshape(X_test.shape[0], img_rows, img_cols, 3)
    input_shape = (img_rows, img_cols, 3)

    # the data, shuffled and split between train and test sets
    print('X_train shape:', X_train.shape)
    print(X_train.shape[0], 'train samples')
    print(X_valid.shape[0], 'valid samples')
    print(X_test.shape[0], 'test samples')

    # convert class vectors to binary class matrices
    Y_train = np_utils.to_categorical(y_train, nb_classes)
    Y_valid = np_utils.to_categorical(y_valid, nb_classes)
    Y_test = np_utils.to_categorical(y_test, nb_classes)

    X_train = X_train.astype('float32')
    X_valid = X_valid.astype('float32')
    X_test = X_test.astype('float32')
    X_train /= 255
    X_valid /= 255
    X_test /= 255

    self.X_train = X_train
    self.X_valid = X_valid
    self.X_test = X_test
    self.Y_train = Y_train
    self.Y_valid = Y_valid
    self.Y_test = Y_test
def create_train_test_split(xs, ys, num_catagories, train_test_split_percentage):
    train_test_split_idxs = np.array([itm for itm in range(0, xs.shape[0])])
    [idxs_train, idxs_test, temp1, temp2] = train_test_split(train_test_split_idxs, train_test_split_idxs,
                                                             test_size=train_test_split_percentage,
                                                             random_state=42)

    xs_train, xs_test = xs[idxs_train], xs[idxs_test]
    ys_train, ys_test = ys[idxs_train], ys[idxs_test]

    ys_train = one_hot(ys_train, num_catagories)
    ys_test = one_hot(ys_test, num_catagories)

    return [xs_train, xs_test, ys_train, ys_test]
def splitData(features, labels, testSize=0.3):
    '''
    Split data into train and test sets
    @param features: Features generated from data
    @param labels: symptom severity label for each note
    @param testSize: fraction of data to use for testing models
    @return feats_train: the features for training
    @return feats_test: the features for testing
    @return labels_train: symptom severity labels corresponding to training features
    @return labels_test: symptom severity labels corresponding to test features
    '''
    feats_train, feats_test, labels_train, labels_test = cross_validation.train_test_split(
        features, labels, test_size=testSize, random_state=15)
    return (feats_train, feats_test, labels_train, labels_test)
def naive_bayes(X, t):
    # leave-one-out strategy to get average accuracy
    n = len(t)
    true_num = 0
    for i in range(n):
        X_train = list(X)
        del X_train[i]
        y_train = list(t)
        del y_train[i]
        X_test = X[i]
        y_test = [t[i]]
        prior, likelihood, num = train_naive_bayes(X_train, y_train)
        if test_naive_bayes(X_test, y_test, prior, likelihood, num):
            true_num += 1
    accuracy = 1.0 * true_num / n

    # 8/2 split
    pre = []
    rec = []
    for _ in range(100):
        X_train, X_test, t_train, t_test = train_test_split(X, t, test_size=0.2)
        prior, likelihood, num = train_naive_bayes(X_train, t_train)
        precision, recall = test_naive_bayes(X_test, t_test, prior, likelihood, num)
        pre.append(precision)
        rec.append(recall)
    pre = sum(pre) / len(pre)
    rec = sum(rec) / len(rec)
    F = 2 / (1/pre + 1/rec)
    return accuracy, pre, rec, F
def lession_4():
    iris = datasets.load_iris()
    iris_X = iris.data
    iris_y = iris.target
    # print iris_X[:2]
    # print iris_y

    X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, test_size=0.3)
    knn = KNeighborsClassifier()
    knn.fit(X_train, y_train)
    print knn.predict(X_test)
    print y_test

# dataset usage
def case1():
    from sklearn import datasets
    news = datasets.fetch_20newsgroups(subset='all')
    # print len(news.data)
    # print len(news.target)
    # print '*'*10
    # print news.data[0]
    # print '*'*10
    # print news.target[0]

    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    vec = CountVectorizer()
    x = vec.fit_transform(news.data)
    # print x.shape
    # print x[:2]
    print x[:10,:10].toarray()
    TFIDF = TfidfTransformer()
    x_tfidf = TFIDF.fit_transform(x)
    print x_tfidf[:10,:10].toarray()

    from sklearn.cross_validation import train_test_split
    Xtrain, Xtest, ytrain, ytest = train_test_split(x, news.target, test_size=0.3, random_state=233)
    tf_Xtrain, tf_Xtest, tf_ytrain, tf_ytest = train_test_split(x_tfidf, news.target, test_size=0.3, random_state=233)

    from sklearn.naive_bayes import MultinomialNB
    mnb = MultinomialNB()
    tf_mnb = MultinomialNB()
    mnb.fit(Xtrain, ytrain)
    tf_mnb.fit(tf_Xtrain, tf_ytrain)
def main():
    digits = load_digits()
    x_train, x_test, y_train_, y_test_ = cross_validation.train_test_split(digits.data, digits.target,
                                                                           test_size=0.2, random_state=0)
    lb = preprocessing.LabelBinarizer()
    lb.fit(digits.target)
    y_train = lb.transform(y_train_)
    y_test = lb.transform(y_test_)

    sess = tf.InteractiveSession()

    x = tf.placeholder(tf.float32, shape=[None, 64])
    y_ = tf.placeholder(tf.float32, shape=[None, 10])

    w_1 = weight_variable([64, 32])
    b_1 = bias_variable([32])
    h_1 = tf.nn.relu(tf.matmul(x, w_1) + b_1)

    w_2 = weight_variable([32, 10])
    b_2 = bias_variable([10])
    y = tf.nn.softmax(tf.matmul(h_1, w_2) + b_2)

    cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

    sess.run(tf.initialize_all_variables())

    for i in range(1000):
        train_step.run(feed_dict={x: x_train, y_: y_train})

    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print(accuracy.eval(feed_dict={x: x_test, y_: y_test}))
def main():
    digits = load_digits()
    x_train, x_test, y_train_, y_test_ = cross_validation.train_test_split(digits.data, digits.target,
                                                                           test_size=0.2, random_state=0)
    lb = preprocessing.LabelBinarizer()
    lb.fit(digits.target)
    y_train = lb.transform(y_train_)
    y_test = lb.transform(y_test_)

    sess = tf.InteractiveSession()

    x = tf.placeholder(tf.float32, shape=[None, 64])
    y_ = tf.placeholder(tf.float32, shape=[None, 10])
    phase_train = tf.placeholder(tf.bool, name='phase_train')

    w_1 = weight_variable([64, 32])
    b_1 = bias_variable([32])
    t_1 = tf.matmul(x, w_1) + b_1
    bn = batch_norm(t_1, 1, phase_train)
    h_1 = binarized_ops.binarized(bn)

    w_2 = weight_variable([32, 10])
    b_2 = bias_variable([10])
    y = tf.nn.softmax(tf.matmul(h_1, w_2) + b_2)

    cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

    sess.run(tf.initialize_all_variables())

    for i in range(1000):
        train_step.run(feed_dict={x: x_train, y_: y_train, phase_train: True})

    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print(accuracy.eval(feed_dict={x: x_test, y_: y_test, phase_train: False}))