The following 23 code examples, extracted from open-source Python projects, demonstrate how to use sklearn.datasets.load_svmlight_file().
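Before turning to the project examples, a minimal sketch of the basic call pattern may be useful. load_svmlight_file() parses a LIBSVM/SVMlight-format file and returns the feature matrix as a scipy.sparse CSR matrix together with the targets as a NumPy array. The file path and the n_features value below are placeholders for illustration, not taken from any of the projects that follow.

from sklearn.datasets import load_svmlight_file

# X is a scipy.sparse CSR matrix, y is a numpy array of labels.
# 'data/train.libsvm' is a placeholder path.
X, y = load_svmlight_file('data/train.libsvm')

# Fixing n_features makes separately loaded files (e.g. a train/test pair)
# share the same number of columns; 123 is an arbitrary example value.
X_fixed, y_fixed = load_svmlight_file('data/train.libsvm', n_features=123)

print(X.shape, y.shape)

Several of the examples below rely on exactly this n_features behavior when loading train and test files separately, and call .todense() afterwards where a dense matrix is required.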
Example 1
def load_realsim(folder=REALSIM, one_hot=True, partitions_proportions=None,
                 shuffle=False, as_tensor=True):
    X, y = sk_dt.load_svmlight_file(folder + "/real-sim")
    y = np.array([int(yy) for yy in y])
    if one_hot:
        y = to_one_hot_enc(y)
    res = [Dataset(data=X, target=y)]
    if partitions_proportions:
        res = redivide_data(res, shuffle=shuffle, partition_proportions=partitions_proportions)
        res = Datasets.from_list(res)
    if as_tensor:
        [dat.convert_to_tensor() for dat in res]
    return res


# noinspection PyPep8Naming
Example 2
def run(dataset_name, n_features, n_repeat=5, n_learning_round=5):
    base_dir = os.path.join(os.getcwd(), '../targets/%s/' % dataset_name)
    model_file = os.path.join(base_dir, 'train.scale.model')

    result = Result(dataset_name + '-' + 'active')
    for repeat in range(0, n_repeat):
        print 'Round %d of %d' % (repeat, n_repeat - 1)

        ex = LibSVMOnline(dataset_name, model_file, (1, -1), n_features, 'uniform', 1e-1)
        X_test, y_test = load_svmlight_file(os.path.join(base_dir, 'test.scale'), n_features)
        X_test = X_test.todense()

        for i in result.index:
            q_by_u = result.Q_by_U[i]
            main = ActiveLearning(ex, (None, None), (X_test, y_test), n_features,
                                  q_by_u * (n_features + 1), n_learning_round)
            L_unif, L_test = main.do()

            result.L_unif[i].append(L_unif)
            result.L_test[i].append(L_test)
            result.nquery[i].append(ex.get_n_query())

    print result
Example 3
def load_svm(name):
    data = load_svmlight_file(name)
    return data[0], data[1]
Example 4
def load_dataset(path_train, n_features, path_valid=None, path_test=None,
                 multilabel=False, classes_=None):
    le = LabelEncoder2(multilabel=multilabel)
    if path_valid is None and path_test is None:
        # TODO zero_based=True?
        X, Y = load_svmlight_file(path_train, dtype=np.float32,
                                  n_features=n_features, multilabel=multilabel)
        if classes_ is None:
            le.fit(Y)
            Y = le.transform(Y)
        else:
            le.set_classes(classes_)
            Y = le.transform(Y)
        return X, Y, None, None, le
    elif path_test is None:
        X, Y, Xvalid, Yvalid = load_svmlight_files((path_train, path_valid),
                                                   dtype=np.float32,
                                                   n_features=n_features,
                                                   multilabel=multilabel)
        if classes_ is None:
            le.fit(np.concatenate((Y, Yvalid), axis=0))
            Y = le.transform(Y)
            Yvalid = le.transform(Yvalid)
        else:
            le.set_classes(classes_)
            Y = le.transform(Y)
            Yvalid = le.transform(Yvalid)
        return X, Y, Xvalid, Yvalid, le
    else:
        X, Y, Xvalid, Yvalid, Xtest, Ytest = load_svmlight_files((path_train, path_valid, path_test),
                                                                 dtype=np.float32,
                                                                 n_features=n_features,
                                                                 multilabel=multilabel)
        if classes_ is None:
            le.fit(np.concatenate((Y, Yvalid, Ytest), axis=0))
            Y = le.transform(Y)
            Yvalid = le.transform(Yvalid)
            Ytest = le.transform(Ytest)
        else:
            le.set_classes(classes_)
            Y = le.transform(Y)
            Yvalid = le.transform(Yvalid)
        return X, Y, Xvalid, Yvalid, Xtest, Ytest, le
Example 5
def train(self, depgraphs):
    """
    :param depgraphs : list of DependencyGraph as the training data
    :type depgraphs : DependencyGraph
    """
    try:
        input_file = tempfile.NamedTemporaryFile(
            prefix='transition_parse.train',
            dir=tempfile.gettempdir(),
            delete=False)
        self._create_training_examples_arc_eager(depgraphs, input_file)
        input_file.close()
        # Using the temporary file to train the libsvm classifier
        x_train, y_train = load_svmlight_file(input_file.name)
        # The parameter is set according to the paper:
        # Algorithms for Deterministic Incremental Dependency Parsing by Joakim Nivre
        # this is very slow.
        self._model = svm.SVC(
            kernel='poly',
            degree=2,
            coef0=0,
            gamma=0.2,
            C=0.5,
            verbose=False,
            probability=True)

        print('Training support vector machine...')
        self._model.fit(x_train, y_train)
        print('done!')
    finally:
        os.remove(input_file.name)
Example 6
def train(self, depgraphs, modelfile):
    """
    :param depgraphs : list of DependencyGraph as the training data
    :type depgraphs : DependencyGraph
    :param modelfile : file name to save the trained model
    :type modelfile : str
    """
    try:
        input_file = tempfile.NamedTemporaryFile(
            prefix='transition_parse.train',
            dir=tempfile.gettempdir(),
            delete=False)

        if self._algorithm == self.ARC_STANDARD:
            self._create_training_examples_arc_std(depgraphs, input_file)
        else:
            self._create_training_examples_arc_eager(depgraphs, input_file)

        input_file.close()
        # Using the temporary file to train the libsvm classifier
        x_train, y_train = load_svmlight_file(input_file.name)
        # The parameter is set according to the paper:
        # Algorithms for Deterministic Incremental Dependency Parsing by Joakim Nivre
        # Todo : because of probability = True => very slow due to
        # cross-validation. Need to improve the speed here
        model = svm.SVC(
            kernel='poly',
            degree=2,
            coef0=0,
            gamma=0.2,
            C=0.5,
            verbose=True,
            probability=True)

        model.fit(x_train, y_train)
        # Save the model to file name (as pickle)
        pickle.dump(model, open(modelfile, 'wb'))
    finally:
        remove(input_file.name)
Example 7
def load_step_res(step_fold_identifier):
    file_name_base = hashlib.sha224(step_fold_identifier).hexdigest()
    logger.info("loading [%s] from [%s]", step_fold_identifier, file_name_base)

    with open(file_name_base + ".train", "rb") as f:
        X_train, y_train = load_svmlight_file(f)
        X_train = X_train.toarray()
    with open(file_name_base + ".test", "rb") as f:
        X_test, y_test = load_svmlight_file(f)
        X_test = X_test.toarray()

    return (X_train, y_train, X_test, y_test)
Example 8
def load_data(path, dense=False):
    """Load data from a CSV or libsvm format file.

    Args:
        path (str): A path to the CSV or libsvm format file containing data.
        dense (boolean): An optional variable indicating if the return matrix
            should be dense. By default, it is false.
    """
    with open(path, 'r') as f:
        line = f.readline().strip()

    if ':' in line:
        X, y = load_svmlight_file(path)
        X = X.astype(np.float32)
        if dense:
            X = X.todense()
    elif ',' in line:
        X = np.loadtxt(path, delimiter=',',
                       skiprows=0 if is_number(line.split(',')[0]) else 1)
        y = X[:, 0]
        X = X[:, 1:]
    else:
        raise NotImplementedError, "Neither CSV nor LibSVM formatted file."

    return X, y
Example 9
def run(train_data, test_data, n_features, labels, gamma, C, feature_type='uniform'):
    print train_data
    assert os.path.isfile(train_data), '%s is not a file' % train_data
    assert os.path.isfile(test_data), '%s is not a file' % test_data

    X, Y = load_svmlight_file(train_data, n_features=n_features)
    Xt, Yt = load_svmlight_file(test_data, n_features=n_features)
    Xt = Xt.todense()

    if gamma is None:
        gamma = 1.0 / n_features
    if C is None:
        C = 1

    rbf_svc = svm.SVC(kernel='rbf', gamma=gamma, C=C).fit(X, Y)

    print '--------------- original -----------------'
    baseline = sm.accuracy_score(Yt, rbf_svc.predict(Xt))
    print 'original: %f' % baseline

    CAL_v(train_data, labels[1], labels[0], rbf_svc.predict, n_features, feature_type, Xt, Yt)

# run('data/diabetes.aa', 'data/diabetes.ab', 8, (+1, -1), gamma=2.0, C=.5, feature_type='uniform')
# run('data/breast-cancer.aa', 'data/breast-cancer.ab', 10, (1, 0), gamma=0.5, C=.125, feature_type='uniform')
# run('data/australian.aa', 'data/australian.ab', 14, gamma=0.03125, C=.125, feature_type='uniform')
# run('./data/fourclass.aa', './data/fourclass.ab', 2, (1, -1), gamma=8.0, C=128, feature_type='uniform')
Example 10
def run(dataset_name, n_features):
    base_dir = os.path.join(os.getcwd(), '../targets/%s/' % dataset_name)
    model_file = os.path.join(base_dir, 'train.scale.model')

    result = Result('baseline')
    n_repeat = 10
    for repeat in range(0, n_repeat):
        print 'Round %d of %d' % (repeat, n_repeat - 1)

        # load model and collect QSV
        ex = LibSVMOnline(dataset_name, model_file, (1, -1), n_features, 'uniform', 1e-1)
        # generate test score
        X_test, y_test = load_svmlight_file(os.path.join(base_dir, 'test.scale'), n_features)
        X_test = X_test.todense()

        train_x, train_y = [], []
        for i in result.index:
            q_by_u = result.Q_by_U[i]
            ex.collect_up_to_budget(q_by_u * (n_features + 1))
            train_x.extend(ex.pts_near_b)
            train_y.extend(ex.pts_near_b_labels)

            base = Baseline(ex.batch_predict, (train_x, train_y), (X_test, y_test), n_features)
            L_unif, L_test = base.do()

            result.L_unif[i].append(L_unif)
            result.L_test[i].append(L_test)
            result.nquery[i].append(ex.get_n_query())
            # print ex.get_n_query() / (n_features + 1), ',', L_unif, ',', L_test

    print result
Example 11
def run(train_data, test_data, n_features, gamma, C, feature_type='uniform'):
    X, Y = load_svmlight_file(train_data, n_features=n_features)
    Xt, Yt = load_svmlight_file(test_data, n_features=n_features)

    rbf_svc = svm.SVC(kernel='rbf', gamma=gamma, C=C).fit(X, Y)

    ex = GridRBFSolver(train_data, rbf_svc.predict, Xt, Yt, feature_type, 1e-9)
    ex.do(1500)
Example 12
def run(train_data, test_data, n_features, labels, gamma, C, feature_type='uniform'):
    print train_data
    assert os.path.isfile(train_data), '%s is not a file' % train_data
    assert os.path.isfile(test_data), '%s is not a file' % test_data

    X, Y = load_svmlight_file(train_data, n_features=n_features)
    Xt, Yt = load_svmlight_file(test_data, n_features=n_features)
    Xt = Xt.todense()

    if gamma is None:
        gamma = 1.0 / n_features
    if C is None:
        C = 1

    rbf_svc = svm.SVC(kernel='rbf', gamma=gamma, C=C).fit(X, Y)

    print '--------------- original -----------------'
    baseline = sm.accuracy_score(Yt, rbf_svc.predict(Xt))
    print 'original: %f' % baseline

    retrain_in_x_with_grid(train_data, labels[1], labels[0], rbf_svc.predict,
                           n_features, feature_type, Xt, Yt, None)
    retrain_in_f_with_grid(train_data, labels[1], labels[0], rbf_svc.predict,
                           n_features, feature_type, Xt, Yt, None)

# run('data/diabetes.aa', 'data/diabetes.ab', 8, (+1, -1), gamma=2.0, C=.5, feature_type='uniform')
# run('data/breast-cancer.aa', 'data/breast-cancer.ab', 10, (1, 0), gamma=0.5, C=.125, feature_type='uniform')
# run('data/australian.aa', 'data/australian.ab', 14, gamma=0.03125, C=.125, feature_type='uniform')
# run('./data/fourclass.aa', './data/fourclass.ab', 2, (1, -1), gamma=8.0, C=128, feature_type='uniform')
Example 13
def load_file(self, file_path):
    data = load_svmlight_file(file_path)
    return data[0], data[1]
Example 14
def train(self, depgraphs, modelfile, verbose=True):
    """
    :param depgraphs : list of DependencyGraph as the training data
    :type depgraphs : DependencyGraph
    :param modelfile : file name to save the trained model
    :type modelfile : str
    """
    try:
        input_file = tempfile.NamedTemporaryFile(
            prefix='transition_parse.train',
            dir=tempfile.gettempdir(),
            delete=False)

        if self._algorithm == self.ARC_STANDARD:
            self._create_training_examples_arc_std(depgraphs, input_file)
        else:
            self._create_training_examples_arc_eager(depgraphs, input_file)

        input_file.close()
        # Using the temporary file to train the libsvm classifier
        x_train, y_train = load_svmlight_file(input_file.name)
        # The parameter is set according to the paper:
        # Algorithms for Deterministic Incremental Dependency Parsing by Joakim Nivre
        # Todo : because of probability = True => very slow due to
        # cross-validation. Need to improve the speed here
        model = svm.SVC(
            kernel='poly',
            degree=2,
            coef0=0,
            gamma=0.2,
            C=0.5,
            verbose=verbose,
            probability=True)

        model.fit(x_train, y_train)
        # Save the model to file name (as pickle)
        pickle.dump(model, open(modelfile, 'wb'))
    finally:
        remove(input_file.name)
Example 15
def load_dataset(self, X=None):
    if self.conf.verbosity > 1:
        print "Loading dataset..."

    if X is None:
        self.X_train, self.tl = load_svmlight_file(self.conf.fname_in,
                                                   dtype=np.float32,
                                                   multilabel=False)
        # we're saving tl (target labels) just in case they exist and the user needs them - since
        # this is unsupervised learning, we completely ignore the labels and don't expect them to exist
    else:
        self.X_train = X

    self.X_train = self.X_train.todense()

    if (self.conf.mod1size + self.conf.mod2size) != self.X_train.shape[1]:
        raise ValueError("Provided dimensionality of 1st modality (" + str(self.conf.mod1size) +
                         ") and 2nd modality (" + str(self.conf.mod2size) + ") "
                         "does not sum to the dimensionality provided in the input file (" +
                         str(self.X_train.shape[1]) + ")")

    # indices of missing modalities (stored for later)
    self.idxMissingFirst = []
    self.idxMissingSecond = []

    # generate training data for modality translation
    self.X_first = []
    self.X_second = []

    bothMissing = both = 0

    if self.conf.ignore_zeroes:
        # zeroes are not treated as missing modalities
        # I have no idea why this might be useful, but ok :D
        # since idxMissing* are left empty, this is the only
        # place where we should take care of this
        for i in range(self.X_train.shape[0]):
            both += 1
            self.X_first.append(np.ravel(self.X_train[i, :self.conf.mod1size]))
            self.X_second.append(np.ravel(self.X_train[i, self.conf.mod1size:]))
    else:
        # zero vectors are treated as missing modalities (default)
        for i in range(self.X_train.shape[0]):
            if not np.any(self.X_train[i, :self.conf.mod1size]):  # first missing
                if np.any(self.X_train[i, self.conf.mod1size:]):  # second not missing
                    # second ok, need to reconstruct first
                    self.idxMissingFirst.append(i)
                else:
                    bothMissing += 1  # missing both
            else:  # first ok
                if not np.any(self.X_train[i, self.conf.mod1size:]):  # second missing
                    self.idxMissingSecond.append(i)
                else:
                    # both ok -> use them to train translator
                    both += 1
                    self.X_first.append(np.ravel(self.X_train[i, :self.conf.mod1size]))
                    self.X_second.append(np.ravel(self.X_train[i, self.conf.mod1size:]))

    if self.conf.verbosity > 1:
        print "Both modalities present:", both, "\nMissing 1st:", len(self.idxMissingFirst), "\nMissing 2nd:", len(self.idxMissingSecond)
        print "Missing both modalities:", bothMissing, "\n"

    self.X_first = np.array(self.X_first)
    self.X_second = np.array(self.X_second)
Example 16
def run(dataset):
    n_features = len(meta[dataset]['val_name'])
    result_online = Result('%s-%s' % (dataset, 'aws-online'), aws=True)
    result_baseline = Result('%s-%s' % (dataset, 'aws-baseline'), aws=True)
    result_active = Result('%s-%s' % (dataset, 'aws-active'), aws=True)

    for repeat in range(0, n_repeat):
        print 'Round %d of %d' % (repeat, n_repeat - 1)

        ex = AWSOnline(meta[dataset]['model_id'], 1, 0, n_features,
                       meta[dataset]['val_name'], ftype='uniform', error=.1)

        test_x, test_y = load_svmlight_file(
            '/Users/Fan/dev/ML/code/binary-classifiers/targets/%s/test.scale' % dataset,
            n_features)
        test_x = test_x.todense()
        test_y = [a if a == 1 else 0 for a in test_y]

        train_x, train_y = [], []
        for i in result_active.index:
            q_by_u = result_active.Q_by_U[i]
            print 'Active learning with budget %d / %d' % (q_by_u, q_by_u * (n_features + 1))
            main = ActiveLearning(ex, (None, None), (test_x, test_y), n_features,
                                  q_by_u * (n_features + 1), 5)
            L_unif, L_test = main.do()

            result_active.L_unif[i].append(L_unif)
            result_active.L_test[i].append(L_test)
            result_active.nquery[i].append(ex.get_n_query())

        ex = AWSOnline(meta[dataset]['model_id'], 1, 0, n_features,
                       meta[dataset]['val_name'], ftype='uniform', error=.1)

        for i in result_online.index:
            q_by_u = result_online.Q_by_U[i]
            print 'collecting up to budget %d / %d' % (q_by_u, q_by_u * (n_features + 1))
            ex.collect_up_to_budget(q_by_u * (n_features + 1))
            train_x.extend(ex.pts_near_b)
            train_y.extend(ex.pts_near_b_labels)

            print 'retraining with %d points' % len(train_y)

            # online
            e = RBFKernelRetraining(ex.batch_predict, (train_x, train_y), (test_x, test_y), n_features)
            L_unif, L_test = e.grid_retrain_in_x()
            result_online.L_unif[i].append(L_unif)
            result_online.L_test[i].append(L_test)
            result_online.nquery[i].append(ex.get_n_query())

            # baseline
            e = Baseline(ex.batch_predict, (train_x, train_y), (test_x, test_y), n_features)
            L_unif, L_test = e.do()
            result_baseline.L_unif[i].append(L_unif)
            result_baseline.L_test[i].append(L_test)
            result_baseline.nquery[i].append(ex.get_n_query())

    print result_online
    print result_baseline
    print result_active