The following 23 code examples, extracted from open-source Python projects, demonstrate how to use sklearn.datasets.load_svmlight_file().
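Before turning to the project examples, a minimal sketch of the basic call pattern may be useful. load_svmlight_file() parses a LIBSVM/SVMlight-format file and returns the feature matrix as a scipy.sparse CSR matrix together with the targets as a NumPy array. The file path and the n_features value below are placeholders for illustration, not taken from any of the projects that follow.

from sklearn.datasets import load_svmlight_file

# X is a scipy.sparse CSR matrix, y is a numpy array of labels.
# 'data/train.libsvm' is a placeholder path.
X, y = load_svmlight_file('data/train.libsvm')

# Fixing n_features makes separately loaded files (e.g. a train/test pair)
# share the same number of columns; 123 is an arbitrary example value.
X_fixed, y_fixed = load_svmlight_file('data/train.libsvm', n_features=123)

print(X.shape, y.shape)

Several of the examples below rely on exactly this n_features behavior when loading train and test files separately, and call .todense() afterwards where a dense matrix is required.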
Example 1
def load_realsim(folder=REALSIM, one_hot=True, partitions_proportions=None,
                 shuffle=False, as_tensor=True):
    X, y = sk_dt.load_svmlight_file(folder + "/real-sim")
    y = np.array([int(yy) for yy in y])
    if one_hot:
        y = to_one_hot_enc(y)
    res = [Dataset(data=X, target=y)]
    if partitions_proportions:
        res = redivide_data(res, shuffle=shuffle, partition_proportions=partitions_proportions)
        res = Datasets.from_list(res)
    if as_tensor:
        [dat.convert_to_tensor() for dat in res]
    return res


# noinspection PyPep8Naming
Example 2
def run(dataset_name, n_features, n_repeat=5, n_learning_round=5):
    base_dir = os.path.join(os.getcwd(), '../targets/%s/' % dataset_name)
    model_file = os.path.join(base_dir, 'train.scale.model')

    result = Result(dataset_name + '-' + 'active')
    for repeat in range(0, n_repeat):
        print 'Round %d of %d' % (repeat, n_repeat - 1)

        ex = LibSVMOnline(dataset_name, model_file, (1, -1), n_features, 'uniform', 1e-1)
        X_test, y_test = load_svmlight_file(os.path.join(base_dir, 'test.scale'), n_features)
        X_test = X_test.todense()

        for i in result.index:
            q_by_u = result.Q_by_U[i]
            main = ActiveLearning(ex, (None, None), (X_test, y_test), n_features,
                                  q_by_u * (n_features + 1), n_learning_round)
            L_unif, L_test = main.do()

            result.L_unif[i].append(L_unif)
            result.L_test[i].append(L_test)
            result.nquery[i].append(ex.get_n_query())

    print result
Example 3
def load_svm(name):
    data = load_svmlight_file(name)
    return data[0], data[1]
Example 4
def load_dataset(path_train, n_features, path_valid=None, path_test=None,
                 multilabel=False, classes_=None):
    le = LabelEncoder2(multilabel=multilabel)
    if path_valid is None and path_test is None:
        # TODO zero_based=True?
        X, Y = load_svmlight_file(path_train, dtype=np.float32,
                                  n_features=n_features, multilabel=multilabel)
        if classes_ is None:
            le.fit(Y)
            Y = le.transform(Y)
        else:
            le.set_classes(classes_)
            Y = le.transform(Y)
        return X, Y, None, None, le
    elif path_test is None:
        X, Y, Xvalid, Yvalid = load_svmlight_files((path_train, path_valid),
                                                   dtype=np.float32,
                                                   n_features=n_features,
                                                   multilabel=multilabel)
        if classes_ is None:
            le.fit(np.concatenate((Y, Yvalid), axis=0))
            Y = le.transform(Y)
            Yvalid = le.transform(Yvalid)
        else:
            le.set_classes(classes_)
            Y = le.transform(Y)
            Yvalid = le.transform(Yvalid)
        return X, Y, Xvalid, Yvalid, le
    else:
        X, Y, Xvalid, Yvalid, Xtest, Ytest = load_svmlight_files((path_train, path_valid, path_test),
                                                                 dtype=np.float32,
                                                                 n_features=n_features,
                                                                 multilabel=multilabel)
        if classes_ is None:
            le.fit(np.concatenate((Y, Yvalid, Ytest), axis=0))
            Y = le.transform(Y)
            Yvalid = le.transform(Yvalid)
            Ytest = le.transform(Ytest)
        else:
            le.set_classes(classes_)
            Y = le.transform(Y)
            Yvalid = le.transform(Yvalid)
        return X, Y, Xvalid, Yvalid, Xtest, Ytest, le
Example 5
def train(self, depgraphs):
    """
    :param depgraphs : list of DependencyGraph as the training data
    :type depgraphs : DependencyGraph
    """
    try:
        input_file = tempfile.NamedTemporaryFile(
            prefix='transition_parse.train',
            dir=tempfile.gettempdir(),
            delete=False)
        self._create_training_examples_arc_eager(depgraphs, input_file)
        input_file.close()
        # Using the temporary file to train the libsvm classifier
        x_train, y_train = load_svmlight_file(input_file.name)
        # The parameter is set according to the paper:
        # Algorithms for Deterministic Incremental Dependency Parsing by Joakim Nivre
        # this is very slow.
        self._model = svm.SVC(
            kernel='poly',
            degree=2,
            coef0=0,
            gamma=0.2,
            C=0.5,
            verbose=False,
            probability=True)

        print('Training support vector machine...')
        self._model.fit(x_train, y_train)
        print('done!')
    finally:
        os.remove(input_file.name)
Example 6
def train(self, depgraphs, modelfile):
    """
    :param depgraphs : list of DependencyGraph as the training data
    :type depgraphs : DependencyGraph
    :param modelfile : file name to save the trained model
    :type modelfile : str
    """
    try:
        input_file = tempfile.NamedTemporaryFile(
            prefix='transition_parse.train',
            dir=tempfile.gettempdir(),
            delete=False)

        if self._algorithm == self.ARC_STANDARD:
            self._create_training_examples_arc_std(depgraphs, input_file)
        else:
            self._create_training_examples_arc_eager(depgraphs, input_file)

        input_file.close()
        # Using the temporary file to train the libsvm classifier
        x_train, y_train = load_svmlight_file(input_file.name)
        # The parameter is set according to the paper:
        # Algorithms for Deterministic Incremental Dependency Parsing by Joakim Nivre
        # Todo : because of probability = True => very slow due to
        # cross-validation. Need to improve the speed here
        model = svm.SVC(
            kernel='poly',
            degree=2,
            coef0=0,
            gamma=0.2,
            C=0.5,
            verbose=True,
            probability=True)

        model.fit(x_train, y_train)
        # Save the model to file name (as pickle)
        pickle.dump(model, open(modelfile, 'wb'))
    finally:
        remove(input_file.name)
Example 7
def load_step_res(step_fold_identifier):
    file_name_base = hashlib.sha224(step_fold_identifier).hexdigest()
    logger.info("loading [%s] from [%s]", step_fold_identifier, file_name_base)

    with open(file_name_base + ".train", "rb") as f:
        X_train, y_train = load_svmlight_file(f)
        X_train = X_train.toarray()
    with open(file_name_base + ".test", "rb") as f:
        X_test, y_test = load_svmlight_file(f)
        X_test = X_test.toarray()

    return (X_train, y_train, X_test, y_test)
Example 8
def load_data(path, dense=False):
    """Load data from a CSV or libsvm format file.

    Args:
        path (str): A path to the CSV or libsvm format file containing data.
        dense (boolean): An optional variable indicating if the return matrix
            should be dense. By default, it is false.
    """
    with open(path, 'r') as f:
        line = f.readline().strip()

    if ':' in line:
        X, y = load_svmlight_file(path)
        X = X.astype(np.float32)
        if dense:
            X = X.todense()
    elif ',' in line:
        X = np.loadtxt(path, delimiter=',',
                       skiprows=0 if is_number(line.split(',')[0]) else 1)
        y = X[:, 0]
        X = X[:, 1:]
    else:
        raise NotImplementedError, "Neither CSV nor LibSVM formatted file."

    return X, y
Example 9
def run(train_data, test_data, n_features, labels, gamma, C, feature_type='uniform'):
    print train_data
    assert os.path.isfile(train_data), '%s is not a file' % train_data
    assert os.path.isfile(test_data), '%s is not a file' % test_data

    X, Y = load_svmlight_file(train_data, n_features=n_features)
    Xt, Yt = load_svmlight_file(test_data, n_features=n_features)
    Xt = Xt.todense()

    if gamma is None:
        gamma = 1.0 / n_features
    if C is None:
        C = 1

    rbf_svc = svm.SVC(kernel='rbf', gamma=gamma, C=C).fit(X, Y)

    print '--------------- original -----------------'
    baseline = sm.accuracy_score(Yt, rbf_svc.predict(Xt))
    print 'original: %f' % baseline

    CAL_v(train_data, labels[1], labels[0], rbf_svc.predict, n_features, feature_type, Xt, Yt)

# run('data/diabetes.aa', 'data/diabetes.ab', 8, (+1, -1), gamma=2.0, C=.5, feature_type='uniform')
# run('data/breast-cancer.aa', 'data/breast-cancer.ab', 10, (1, 0), gamma=0.5, C=.125, feature_type='uniform')
# run('data/australian.aa', 'data/australian.ab', 14, gamma=0.03125, C=.125, feature_type='uniform')
# run('./data/fourclass.aa', './data/fourclass.ab', 2, (1, -1), gamma=8.0, C=128, feature_type='uniform')
Example 10
def run(dataset_name, n_features):
    base_dir = os.path.join(os.getcwd(), '../targets/%s/' % dataset_name)
    model_file = os.path.join(base_dir, 'train.scale.model')

    result = Result('baseline')
    n_repeat = 10
    for repeat in range(0, n_repeat):
        print 'Round %d of %d' % (repeat, n_repeat - 1)

        # load model and collect QSV
        ex = LibSVMOnline(dataset_name, model_file, (1, -1), n_features, 'uniform', 1e-1)
        # generate test score
        X_test, y_test = load_svmlight_file(os.path.join(base_dir, 'test.scale'), n_features)
        X_test = X_test.todense()

        train_x, train_y = [], []
        for i in result.index:
            q_by_u = result.Q_by_U[i]
            ex.collect_up_to_budget(q_by_u * (n_features + 1))
            train_x.extend(ex.pts_near_b)
            train_y.extend(ex.pts_near_b_labels)

            base = Baseline(ex.batch_predict, (train_x, train_y), (X_test, y_test), n_features)
            L_unif, L_test = base.do()

            result.L_unif[i].append(L_unif)
            result.L_test[i].append(L_test)
            result.nquery[i].append(ex.get_n_query())
            # print ex.get_n_query() / (n_features + 1), ',', L_unif, ',', L_test

    print result
Example 11
def run(train_data, test_data, n_features, gamma, C, feature_type='uniform'):
    X, Y = load_svmlight_file(train_data, n_features=n_features)
    Xt, Yt = load_svmlight_file(test_data, n_features=n_features)

    rbf_svc = svm.SVC(kernel='rbf', gamma=gamma, C=C).fit(X, Y)

    ex = GridRBFSolver(train_data, rbf_svc.predict, Xt, Yt, feature_type, 1e-9)
    ex.do(1500)
Example 12
def run(train_data, test_data, n_features, labels, gamma, C, feature_type='uniform'):
    print train_data
    assert os.path.isfile(train_data), '%s is not a file' % train_data
    assert os.path.isfile(test_data), '%s is not a file' % test_data

    X, Y = load_svmlight_file(train_data, n_features=n_features)
    Xt, Yt = load_svmlight_file(test_data, n_features=n_features)
    Xt = Xt.todense()

    if gamma is None:
        gamma = 1.0 / n_features
    if C is None:
        C = 1

    rbf_svc = svm.SVC(kernel='rbf', gamma=gamma, C=C).fit(X, Y)

    print '--------------- original -----------------'
    baseline = sm.accuracy_score(Yt, rbf_svc.predict(Xt))
    print 'original: %f' % baseline

    retrain_in_x_with_grid(train_data, labels[1], labels[0], rbf_svc.predict,
                           n_features, feature_type, Xt, Yt, None)
    retrain_in_f_with_grid(train_data, labels[1], labels[0], rbf_svc.predict,
                           n_features, feature_type, Xt, Yt, None)

# run('data/diabetes.aa', 'data/diabetes.ab', 8, (+1, -1), gamma=2.0, C=.5, feature_type='uniform')
# run('data/breast-cancer.aa', 'data/breast-cancer.ab', 10, (1, 0), gamma=0.5, C=.125, feature_type='uniform')
# run('data/australian.aa', 'data/australian.ab', 14, gamma=0.03125, C=.125, feature_type='uniform')
# run('./data/fourclass.aa', './data/fourclass.ab', 2, (1, -1), gamma=8.0, C=128, feature_type='uniform')
Example 13
def load_file(self, file_path):
    data = load_svmlight_file(file_path)
    return data[0], data[1]
Example 14
def train(self, depgraphs, modelfile, verbose=True):
    """
    :param depgraphs : list of DependencyGraph as the training data
    :type depgraphs : DependencyGraph
    :param modelfile : file name to save the trained model
    :type modelfile : str
    """
    try:
        input_file = tempfile.NamedTemporaryFile(
            prefix='transition_parse.train',
            dir=tempfile.gettempdir(),
            delete=False)

        if self._algorithm == self.ARC_STANDARD:
            self._create_training_examples_arc_std(depgraphs, input_file)
        else:
            self._create_training_examples_arc_eager(depgraphs, input_file)

        input_file.close()
        # Using the temporary file to train the libsvm classifier
        x_train, y_train = load_svmlight_file(input_file.name)
        # The parameter is set according to the paper:
        # Algorithms for Deterministic Incremental Dependency Parsing by Joakim Nivre
        # Todo : because of probability = True => very slow due to
        # cross-validation. Need to improve the speed here
        model = svm.SVC(
            kernel='poly',
            degree=2,
            coef0=0,
            gamma=0.2,
            C=0.5,
            verbose=verbose,
            probability=True)

        model.fit(x_train, y_train)
        # Save the model to file name (as pickle)
        pickle.dump(model, open(modelfile, 'wb'))
    finally:
        remove(input_file.name)
Example 15
def load_dataset(self, X=None):
    if self.conf.verbosity > 1:
        print "Loading dataset..."

    if X is None:
        self.X_train, self.tl = load_svmlight_file(self.conf.fname_in,
                                                   dtype=np.float32,
                                                   multilabel=False)
        # we're saving tl (target labels) just in case they exist and the user needs them - since
        # this is unsupervised learning, we completely ignore the labels and don't expect them to exist
    else:
        self.X_train = X

    self.X_train = self.X_train.todense()

    if (self.conf.mod1size + self.conf.mod2size) != self.X_train.shape[1]:
        raise ValueError("Provided dimensionality of 1st modality (" + str(self.conf.mod1size) +
                         ") and 2nd modality (" + str(self.conf.mod2size) + ") "
                         "does not sum to the dimensionality provided in the input file (" +
                         str(self.X_train.shape[1]) + ")")

    # indices of missing modalities (stored for later)
    self.idxMissingFirst = []
    self.idxMissingSecond = []

    # generate training data for modality translation
    self.X_first = []
    self.X_second = []

    bothMissing = both = 0

    if self.conf.ignore_zeroes:
        # zeroes are not treated as missing modalities
        # I have no idea why this might be useful, but ok :D
        # since idxMissing* are left empty, this is the only
        # place where we should take care of this
        for i in range(self.X_train.shape[0]):
            both += 1
            self.X_first.append(np.ravel(self.X_train[i, :self.conf.mod1size]))
            self.X_second.append(np.ravel(self.X_train[i, self.conf.mod1size:]))
    else:
        # zero vectors are treated as missing modalities (default)
        for i in range(self.X_train.shape[0]):
            if not np.any(self.X_train[i, :self.conf.mod1size]):  # first missing
                if np.any(self.X_train[i, self.conf.mod1size:]):  # second not missing
                    # second ok, need to reconstruct first
                    self.idxMissingFirst.append(i)
                else:
                    bothMissing += 1  # missing both
            else:  # first ok
                if not np.any(self.X_train[i, self.conf.mod1size:]):  # second missing
                    self.idxMissingSecond.append(i)
                else:
                    # both ok -> use them to train translator
                    both += 1
                    self.X_first.append(np.ravel(self.X_train[i, :self.conf.mod1size]))
                    self.X_second.append(np.ravel(self.X_train[i, self.conf.mod1size:]))

    if self.conf.verbosity > 1:
        print "Both modalities present:", both, "\nMissing 1st:", len(self.idxMissingFirst), "\nMissing 2nd:", len(self.idxMissingSecond)
        print "Missing both modalities:", bothMissing, "\n"

    self.X_first = np.array(self.X_first)
    self.X_second = np.array(self.X_second)
Example 16
def run(dataset):
    n_features = len(meta[dataset]['val_name'])
    result_online = Result('%s-%s' % (dataset, 'aws-online'), aws=True)
    result_baseline = Result('%s-%s' % (dataset, 'aws-baseline'), aws=True)
    result_active = Result('%s-%s' % (dataset, 'aws-active'), aws=True)

    for repeat in range(0, n_repeat):
        print 'Round %d of %d' % (repeat, n_repeat - 1)

        ex = AWSOnline(meta[dataset]['model_id'], 1, 0, n_features,
                       meta[dataset]['val_name'], ftype='uniform', error=.1)

        test_x, test_y = load_svmlight_file(
            '/Users/Fan/dev/ML/code/binary-classifiers/targets/%s/test.scale' % dataset,
            n_features)
        test_x = test_x.todense()
        test_y = [a if a == 1 else 0 for a in test_y]

        train_x, train_y = [], []
        for i in result_active.index:
            q_by_u = result_active.Q_by_U[i]
            print 'Active learning with budget %d / %d' % (q_by_u, q_by_u * (n_features + 1))
            main = ActiveLearning(ex, (None, None), (test_x, test_y), n_features,
                                  q_by_u * (n_features + 1), 5)
            L_unif, L_test = main.do()

            result_active.L_unif[i].append(L_unif)
            result_active.L_test[i].append(L_test)
            result_active.nquery[i].append(ex.get_n_query())

        ex = AWSOnline(meta[dataset]['model_id'], 1, 0, n_features,
                       meta[dataset]['val_name'], ftype='uniform', error=.1)

        for i in result_online.index:
            q_by_u = result_online.Q_by_U[i]
            print 'collecting up to budget %d / %d' % (q_by_u, q_by_u * (n_features + 1))
            ex.collect_up_to_budget(q_by_u * (n_features + 1))
            train_x.extend(ex.pts_near_b)
            train_y.extend(ex.pts_near_b_labels)

            print 'retraining with %d points' % len(train_y)

            # online
            e = RBFKernelRetraining(ex.batch_predict, (train_x, train_y), (test_x, test_y), n_features)
            L_unif, L_test = e.grid_retrain_in_x()
            result_online.L_unif[i].append(L_unif)
            result_online.L_test[i].append(L_test)
            result_online.nquery[i].append(ex.get_n_query())

            # baseline
            e = Baseline(ex.batch_predict, (train_x, train_y), (test_x, test_y), n_features)
            L_unif, L_test = e.do()
            result_baseline.L_unif[i].append(L_unif)
            result_baseline.L_test[i].append(L_test)
            result_baseline.nquery[i].append(ex.get_n_query())

    print result_online
    print result_baseline
    print result_active