Python sklearn.decomposition module: TruncatedSVD() example source code

The following code examples, extracted from open-source Python projects, illustrate how to use sklearn.decomposition.TruncatedSVD().
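
For orientation, here is a minimal, self-contained sketch of the typical TruncatedSVD workflow (latent semantic analysis on a TF-IDF matrix); the toy corpus and parameter values below are illustrative only, not taken from any of the projects listed:

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    "truncated svd works directly on sparse matrices",
    "latent semantic analysis is truncated svd applied to tf-idf",
    "pca requires centering, truncated svd does not",
]

# Build a sparse document-term matrix and project it onto 2 latent components.
X = TfidfVectorizer().fit_transform(corpus)
svd = TruncatedSVD(n_components=2, random_state=0)
X_reduced = svd.fit_transform(X)          # dense array of shape (n_docs, 2)

print(X_reduced.shape)
print(svd.explained_variance_ratio_.sum())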

Project: skutil    Author: tgsmith61591    | project source | file source
def test_selective_tsvd():
    original = X
    cols = [original.columns[0], original.columns[1]]  # Only perform on first two columns...
    compare_cols = np.array(
        original[['petal length (cm)', 'petal width (cm)']].as_matrix())  # should be the same as the trans cols

    transformer = SelectiveTruncatedSVD(cols=cols, n_components=1).fit(original)
    transformed = transformer.transform(original)

    untouched_cols = np.array(transformed[['petal length (cm)', 'petal width (cm)']].as_matrix())
    assert_array_almost_equal(compare_cols, untouched_cols)
    assert 'Concept1' in transformed.columns
    assert transformed.shape[1] == 3
    assert isinstance(transformer.get_decomposition(), TruncatedSVD)
    assert SelectiveTruncatedSVD().get_decomposition() is None  # default None

    # test the selective mixin
    assert isinstance(transformer.cols, list)
Project: dask-ml    Author: dask    | project source | file source
def test_basic(algorithm):
    a = dd.TruncatedSVD(random_state=0, algorithm=algorithm)
    b = sd.TruncatedSVD(random_state=0)
    b.fit(Xdense)
    a.fit(dXdense)

    np.testing.assert_allclose(a.components_, b.components_, atol=1e-3)
    assert_estimator_equal(a, b, exclude=['components_',
                                          'explained_variance_'],
                           atol=1e-3)
    assert a.explained_variance_.shape == b.explained_variance_.shape
    np.testing.assert_allclose(a.explained_variance_,
                               b.explained_variance_,
                               rtol=0.01)

# The rest come straight from scikit-learn, with dask arrays substituted
Project: StrepHit    Author: Wikidata    | project source | file source
def __init__(self, language='en', window_width=2, collapse_fes=True, target_size=None):
        """ Initializes the extractor.

            :param language: The language of the sentences that will be used
            :param window_width: how many tokens to look at before and after each
             token when building its features.
            :param collapse_fes: Whether to collapse FEs to a single token
             or to keep them split.
        """
        self.language = language
        self.tagger = TTPosTagger(language)
        self.window_width = window_width
        self.collapse_fes = collapse_fes
        self.unk_feature = 'UNK'
        self.vectorizer = DictVectorizer()
        self.target_size = target_size
        self.reducer = TruncatedSVD(target_size) if target_size else None
        self.vocabulary = set()
        self.label_index = {}
        self.lu_index = {}
        self.stopwords = set(w.lower() for w in StopWords().words(language))
        self.start()
Project: newsrecommender    Author: Newsrecommender    | project source | file source
def reduce_dimensionality(self, X, n_features):
        """
        Apply PCA or SVD to reduce dimension to n_features.
        :param X:
        :param n_features:
        :return:
        """
        # Initialize reduction method: PCA or SVD
        if self.is_pca == 'PCA':
            reducer = PCA(n_components=n_features)
        if self.is_pca == 'SVD':
            reducer = TruncatedSVD(n_components=n_features)

        # Fit and transform data to n_features-dimensional space
        reducer.fit(X)
        self.X = reducer.transform(X)
        logging.debug("Reduced number of features to {0}".format(n_features))
        logging.debug("Percentage explained: %s\n" % reducer.explained_variance_ratio_.sum())
        return self.X
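
For context, a standalone sketch of the same PCA-versus-SVD switch used above, outside the class; the reduce_dims name, the method argument, and the random data are illustrative assumptions, not part of the newsrecommender project:

import numpy as np
from sklearn.decomposition import PCA, TruncatedSVD

def reduce_dims(X, n_features, method='SVD'):
    # Choose the reducer the same way the class above does with its is_pca flag.
    if method == 'PCA':
        reducer = PCA(n_components=n_features)
    else:
        reducer = TruncatedSVD(n_components=n_features)
    return reducer.fit_transform(X)

X_reduced = reduce_dims(np.random.rand(30, 12), n_features=3)
print(X_reduced.shape)  # (30, 3)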
Project: atap    Author: foxbook    | project source | file source
def create_pipeline(estimator, reduction=False):

    steps = [
        ('normalize', TextNormalizer()),
        ('vectorize', TfidfVectorizer(
            tokenizer=identity, preprocessor=None, lowercase=False
        ))
    ]

    if reduction:
        steps.append((
            'reduction', TruncatedSVD(n_components=10000)
        ))

    # Add the estimator
    steps.append(('classifier', estimator))
    return Pipeline(steps)
Project: document_classification    Author: scotthlee    | project source | file source
def decompose(doc_vecs, n_features=100, normalize=False, flip=False):
    svd = TruncatedSVD(n_features)  
    if normalize:   
        if flip:
            lsa = make_pipeline(svd, Normalizer(copy=False))
            doc_mat = lsa.fit_transform(doc_vecs.transpose())
            doc_mat = doc_mat.transpose()
        else:
            lsa = make_pipeline(svd, Normalizer(copy=False))        
            doc_mat = lsa.fit_transform(doc_vecs)
        return doc_mat
    else:
        if flip:
            doc_mat = svd.fit_transform(doc_vecs.transpose())
            doc_mat = doc_mat.transpose()
        else:
            doc_mat = svd.fit_transform(doc_vecs)
        return doc_mat
Project: wende    Author: h404bi    | project source | file source
def init_model():
        # "question trunk" features
        f_trunk = QuestionTrunkVectorizer(tokenizer=tokenize)

        # Word2Vec vector features
        f_word2vec = Question2VecVectorizer(tokenizer=tokenize)

        # combined features (400 dimensions)
        union_features = FeatureUnion([
            ('f_trunk_lsa', Pipeline([
                ('trunk', f_trunk),
                # dimensionality reduction / feature extraction: latent semantic analysis (LSA)
                ('lsa', TruncatedSVD(n_components=200, n_iter=10))
            ])),
            ('f_word2vec', f_word2vec),
        ])

        model = Pipeline([('union', union_features), ('clf', LinearSVC(C=0.02))])
        return model
Project: themarketingtechnologist    Author: thomhopmans    | project source | file source
def reduce_dimensionality(X, n_features):
        """
        Apply PCA or SVD to reduce dimension to n_features.
        :param X:
        :param n_features:
        :return:
        """
        # Initialize reduction method: PCA or SVD
        # reducer = PCA(n_components=n_features)
        reducer = TruncatedSVD(n_components=n_features)
        # Fit and transform data to n_features-dimensional space
        reducer.fit(X)
        X = reducer.transform(X)
        logging.debug("Reduced number of features to {0}".format(n_features))
        logging.debug("Percentage explained: %s\n" % reducer.explained_variance_ratio_.sum())
        return X
Project: Kaggle_HomeDepot    Author: ChenglongChen    | project source | file source
def transform(self):
        # ngrams
        obs_ngrams = list(map(lambda x: ngram_utils._ngrams(x.split(" "), self.obs_ngram, "_"), self.obs_corpus))
        target_ngrams = list(map(lambda x: ngram_utils._ngrams(x.split(" "), self.target_ngram, "_"), self.target_corpus))
        # cooccurrence ngrams
        cooc_terms = list(map(lambda lst1,lst2: self._get_cooc_terms(lst1, lst2, "X"), obs_ngrams, target_ngrams))
        ## tfidf
        tfidf = self._init_word_ngram_tfidf(ngram=1)
        X = tfidf.fit_transform(cooc_terms)
        ## svd
        svd = TruncatedSVD(n_components=self.svd_dim, 
                n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
        return svd.fit_transform(X)


# 2nd in CrowdFlower (preprocessing_mikhail.py)
Project: Kaggle_HomeDepot    Author: ChenglongChen    | project source | file source
def transform(self):
        ## get common vocabulary
        tfidf = self._init_word_ngram_tfidf(self.ngram)
        tfidf.fit(list(self.obs_corpus) + list(self.target_corpus))
        vocabulary = tfidf.vocabulary_
        ## obs tfidf
        tfidf = self._init_word_ngram_tfidf(self.ngram, vocabulary)
        X_obs = tfidf.fit_transform(self.obs_corpus)
        ## target tfidf
        tfidf = self._init_word_ngram_tfidf(self.ngram, vocabulary)
        X_target = tfidf.fit_transform(self.target_corpus)
        ## svd
        svd = TruncatedSVD(n_components = self.svd_dim, 
                n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
        svd.fit(scipy.sparse.vstack((X_obs, X_target)))
        X_obs = svd.transform(X_obs)
        X_target = svd.transform(X_target)
        ## cosine similarity
        sim = list(map(dist_utils._cosine_sim, X_obs, X_target))
        sim = np.asarray(sim).squeeze()
        return sim
Project: Informed-Finance-Canary    Author: Darthone    | project source | file source
def tfidf(corpus, corpusKeys):
    #TODO clean this up
    #discard any stop words - saves on processing
    stopset = list(stopwords.words('english'))
    stopset.append('000')
    stopset.extend([str(x) for x in range(9999)])
    vectorizer = TfidfVectorizer(stop_words=stopset, use_idf=True, ngram_range=(2,3))

    #matrix of input set
    X = (vectorizer.fit_transform(corpus)).toarray()
    size_matrix = X.shape[0] 
    lsa = TruncatedSVD(n_components=size_matrix, n_iter=100)
    terms = vectorizer.get_feature_names()
    records = []
    for i, comp in enumerate(X):
        termsInComp = zip(terms, comp)
        sortedTerms = sorted(termsInComp, key=lambda x: x[1], reverse=True) [:10]

        #List with all the terms gathered from the tfidf vectorizer
        termList = [term[0] + '.' for term in sortedTerms]

        # List with Article ID and list of tfidf terms
        records.append((vader(corpusKeys[i], termList), termList))
    return records
Project: Parallel-SGD    Author: angadgill    | project source | file source
def test_random_hasher():
    # test random forest hashing on circles dataset
    # make sure that it is linearly separable.
    # even after projected to two SVD dimensions
    # Note: Not all random_states produce perfect results.
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=1)
    X, y = datasets.make_circles(factor=0.5)
    X_transformed = hasher.fit_transform(X)

    # test fit and transform:
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=1)
    assert_array_equal(hasher.fit(X).transform(X).toarray(),
                       X_transformed.toarray())

    # one leaf active per data point per forest
    assert_equal(X_transformed.shape[0], X.shape[0])
    assert_array_equal(X_transformed.sum(axis=1), hasher.n_estimators)
    svd = TruncatedSVD(n_components=2)
    X_reduced = svd.fit_transform(X_transformed)
    linear_clf = LinearSVC()
    linear_clf.fit(X_reduced, y)
    assert_equal(linear_clf.score(X_reduced, y), 1.)
Project: hidi    Author: VEVO    | project source | file source
def __init__(self, **svd_kwargs):
        super(SVDTransform, self).__init__(TruncatedSVD, **svd_kwargs)
Project: onionstack    Author: ntddk    | project source | file source
def main():
    features = []

    for i in list:
        im = cv2.imread(i)
        hist, bins = np.histogram(im.ravel(), 256, [0, 256])
        features.append(hist)

    lsa = TruncatedSVD(10)
    features = lsa.fit_transform(features)
    features = Normalizer(copy = False).fit_transform(features)

    km = KMeans(
        init='k-means++',
        n_clusters=n_clusters,
    )
    km.fit(features)

    for i in range(n_clusters):
        if not os.path.exists('./result/' + str(i)):
            os.makedirs('./result/' + str(i))

    cnt = 0

    for i in list:
        filename = i.split('/')[-1]
        print filename,
        print km.labels_[cnt]
        shutil.copyfile(i, './result/' +  str(km.labels_[cnt]) + '/' + filename)
        cnt += 1
Project: skutil    Author: tgsmith61591    | project source | file source
def fit(self, X, y=None):
        """Fit the transformer.

        Parameters
        ----------

        X : Pandas ``DataFrame``, shape=(n_samples, n_features)
            The Pandas frame to fit. The frame will only
            be fit on the prescribed ``cols`` (see ``__init__``) or
            all of them if ``cols`` is None. Furthermore, ``X`` will
            not be altered in the process of the fit.

        y : None
            Passthrough for ``sklearn.pipeline.Pipeline``. Even
            if explicitly set, will not change behavior of ``fit``.

        Returns
        -------

        self
        """
        # check on state of X and cols
        X, self.cols = validate_is_pd(X, self.cols)
        cols = _cols_if_none(X, self.cols)

        # fails thru if names don't exist:
        self.svd_ = TruncatedSVD(
            n_components=self.n_components,
            algorithm=self.algorithm,
            n_iter=self.n_iter).fit(X[cols].as_matrix())

        return self
Project: skutil    Author: tgsmith61591    | project source | file source
def get_decomposition(self):
        """Overridden from the :class:``skutil.decomposition.decompose._BaseSelectiveDecomposer`` class,
        this method returns the internal decomposition class: 
        ``sklearn.decomposition.TruncatedSVD``

        Returns
        -------
        self.svd_ : ``sklearn.decomposition.TruncatedSVD``
            The fit internal decomposition class
        """
        return self.svd_ if hasattr(self, 'svd_') else None
Project: SIF    Author: PrincetonML    | project source | file source
def get_pc(data, We, weight4ind, params):
    "Comput the principal component"

    def get_weighted_average(We, x, w):
        "Compute the weighted average vectors"
        n_samples = x.shape[0]
        emb = np.zeros((n_samples, We.shape[1]))
        for i in xrange(n_samples):
            emb[i,:] = w[i,:].dot(We[x[i,:],:]) / np.count_nonzero(w[i,:])
        return emb

    for i in data:
        i[0].populate_embeddings(words)
        if not params.task == "sentiment":
            i[1].populate_embeddings(words)
    if params.task == "ent":
        (scores,g1x,g1mask,g2x,g2mask) = data_io.getDataEntailment(data)
        if params.weightfile:
            g1mask = data_io.seq2weight(g1x, g1mask, weight4ind)
    elif params.task == "sim":
        (scores,g1x,g1mask,g2x,g2mask) = data_io.getDataSim(data, -1)
        if params.weightfile:
            g1mask = data_io.seq2weight(g1x, g1mask, weight4ind)
    elif params.task == "sentiment":
        (scores,g1x,g1mask) = data_io.getDataSentiment(data)
        if params.weightfile:
            g1mask = data_io.seq2weight(g1x, g1mask, weight4ind)
    emb = get_weighted_average(We, g1x, g1mask)
    svd = TruncatedSVD(n_components=params.npc, n_iter=7, random_state=0)
    svd.fit(emb)
    return svd.components_
Project: SIF    Author: PrincetonML    | project source | file source
def compute_pc(X,npc=1):
    """
    Compute the principal components. DO NOT MAKE THE DATA ZERO MEAN!
    :param X: X[i,:] is a data point
    :param npc: number of principal components to remove
    :return: component_[i,:] is the i-th pc
    """
    svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
    svd.fit(X)
    return svd.components_
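
The components returned by compute_pc are typically used to subtract their projection from the embeddings (the "remove the principal component" step in SIF); below is a minimal self-contained sketch of that follow-up with the same TruncatedSVD settings, assuming plain NumPy (the remove_pc name and toy data are illustrative, not part of the SIF code above):

import numpy as np
from sklearn.decomposition import TruncatedSVD

def remove_pc(X, npc=1):
    # Fit the top `npc` principal directions without centering, then subtract
    # the projection of X onto them, leaving the residual embeddings.
    svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
    svd.fit(X)
    pc = svd.components_                  # shape (npc, n_features)
    return X - X.dot(pc.T).dot(pc)

emb = np.random.rand(5, 8)
emb_residual = remove_pc(emb, npc=1)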
Project: dask-ml    Author: dask    | project source | file source
def test_algorithms():
    svd_a = sd.TruncatedSVD(30, algorithm="arpack")
    svd_r = dd.TruncatedSVD(30, algorithm="tsqr", random_state=42)

    Xa = svd_a.fit_transform(Xdense)[:, :6]
    Xr = svd_r.fit_transform(dXdense)[:, :6]
    assert_array_almost_equal(Xa, Xr, decimal=5)

    comp_a = np.abs(svd_a.components_)
    comp_r = np.abs(svd_r.components_)
    # All elements are equal, but some elements are more equal than others.
    assert_array_almost_equal(comp_a[:9], comp_r[:9])
    assert_array_almost_equal(comp_a[9:], comp_r[9:], decimal=2)
Project: dask-ml    Author: dask    | project source | file source
def test_attributes():
    for n_components in (10, 25, 41):
        tsvd = dd.TruncatedSVD(n_components).fit(dXdense)
        assert tsvd.n_components == n_components
        assert tsvd.components_.shape == (n_components, n_features)
Project: dask-ml    Author: dask    | project source | file source
def test_too_many_components():
    for n_components in (n_features, n_features + 1):
        tsvd = dd.TruncatedSVD(n_components=n_components)
        with pytest.raises(ValueError):
            tsvd.fit(dXdense)
Project: dask-ml    Author: dask    | project source | file source
def test_inverse_transform():
    # We need a lot of components for the reconstruction to be "almost
    # equal" in all positions. XXX Test means or sums instead?
    a = dd.TruncatedSVD(n_components=52, random_state=42, n_iter=5)
    b = sd.TruncatedSVD(n_components=52, random_state=42)
    b.fit(Xdense)
    Xt = a.fit_transform(dXdense)
    Xinv = a.inverse_transform(Xt)
    assert_array_almost_equal(Xinv.compute(), Xdense, decimal=1)
Project: nlp-lt    Author: minven    | project source | file source
def truncated_svd(self):
        # https://github.com/chrisjmccormick/LSA_Classification/blob/master/inspect_LSA.py
        svd = TruncatedSVD(self.dimensions)   
        lsa = make_pipeline(svd, Normalizer(copy=False))
        X_reduced = lsa.fit_transform(self.bag_of_words_matrix)
        print(svd.components_[0])
        print(svd.explained_variance_ratio_) 
        print(svd.explained_variance_ratio_.sum())
Project: 100knock2016    Author: tmu-nlp    | project source | file source
def compress_the_dimension():
    X = io.loadmat("X_matrix")['PPMI']
    # a = PCA(300)
    # a.fit(X)
    decomp = TruncatedSVD(n_components=300, n_iter=7)
    decomp.fit(X)
    truncated_X = decomp.transform(X)

    return truncated_X
Project: atap    Author: foxbook    | project source | file source
def score_models(models, loader):
    for model in models:

        name = model.named_steps['classifier'].__class__.__name__
        if 'reduction' in model.named_steps:
            name += " (TruncatedSVD)"

        scores = {
            'model': str(model),
            'name': name,
            'accuracy': [],
            'precision': [],
            'recall': [],
            'f1': [],
            'time': [],
        }

        for X_train, X_test, y_train, y_test in loader:
            start = time.time()
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            scores['time'].append(time.time() - start)
            scores['accuracy'].append(accuracy_score(y_test, y_pred))
            scores['precision'].append(precision_score(y_test, y_pred, average='weighted'))
            scores['recall'].append(recall_score(y_test, y_pred, average='weighted'))
            scores['f1'].append(f1_score(y_test, y_pred, average='weighted'))

        yield scores
Project: visualize-tsne    Author: YontiLevin    | project source | file source
def _perform_svd(self):
        if self._svd and self.data_vectors.shape[1] > 50:
            print('dimension reduction using svd')
            print ('dimension before: {}'.format(str(self.data_vectors.shape[1])))
            self.data_vectors = TruncatedSVD(n_components=50, random_state=0).fit_transform(self.data_vectors)
            print ('dimension after: {}'.format(str(self.data_vectors.shape[1])))
Project: lazyprogrammer    Author: inhwane    | project source | file source
def main():
    svd = TruncatedSVD()
    Z = svd.fit_transform(X)
    plt.scatter(Z[:,0], Z[:,1])
    for i in xrange(D):
        plt.annotate(s=index_word_map[i], xy=(Z[i,0], Z[i,1]))
    plt.show()
Project: TextStageProcessor    Author: mhyhre    | project source | file source
def make_ward_clustering(self, short_filenames, input_texts):

        output_dir = self.output_dir + 'WARD/'
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        if self.need_tf_idf:
            self.signals.PrintInfo.emit("?????? TF-IDF...")
            idf_filename = output_dir + 'tf_idf.csv'
            msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
            self.signals.PrintInfo.emit(msg)

        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(input_texts)

        svd = TruncatedSVD(2)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        X = lsa.fit_transform(X)

        ward = AgglomerativeClustering(n_clusters=self.ward_clusters_count, linkage='ward')
        predict_result = ward.fit_predict(X)

        self.signals.PrintInfo.emit('\nBreakdown by clusters:\n')

        clasters_output = ''
        for claster_index in range(max(predict_result) + 1):
            clasters_output += ('Cluster ' + str(claster_index) + ':\n')
            for predict, document in zip(predict_result, short_filenames):
                if predict == claster_index:
                    clasters_output += ('  ' + str(document) + '\n')
            clasters_output += '\n'
        self.signals.PrintInfo.emit(clasters_output)
        self.signals.PrintInfo.emit('Saved to: ' + str(output_dir + 'clusters.txt'))
        writeStringToFile(clasters_output, output_dir + 'clusters.txt')

        self.draw_clusters_plot(X, predict_result, short_filenames)
Project: TextStageProcessor    Author: mhyhre    | project source | file source
def make_spectral_clustering(self, short_filenames, input_texts):

        output_dir = self.output_dir + 'spectral/'
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        if self.need_tf_idf:
            self.signals.PrintInfo.emit("?????? TF-IDF...")
            idf_filename = output_dir + 'tf_idf.csv'
            msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
            self.signals.PrintInfo.emit(msg)

        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(input_texts)

        svd = TruncatedSVD(2)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        X = lsa.fit_transform(X)

        spectral = SpectralClustering(n_clusters=self.spectral_clusters_count)
        predict_result = spectral.fit_predict(X)
        self.signals.PrintInfo.emit('\nBreakdown by clusters:\n')

        clasters_output = ''
        for claster_index in range(max(predict_result) + 1):
            clasters_output += ('Cluster ' + str(claster_index) + ':\n')
            for predict, document in zip(predict_result, short_filenames):
                if predict == claster_index:
                    clasters_output += ('  ' + str(document) + '\n')
            clasters_output += '\n'
        self.signals.PrintInfo.emit(clasters_output)
        self.signals.PrintInfo.emit('Saved to: ' + str(output_dir + 'clusters.txt'))
        writeStringToFile(clasters_output, output_dir + 'clusters.txt')

        self.draw_clusters_plot(X, predict_result, short_filenames)

    # aa = Affinity Propagation
Project: TextStageProcessor    Author: mhyhre    | project source | file source
def make_aa_clustering(self, short_filenames, input_texts):

        output_dir = self.output_dir + 'affinity_propagation/'
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        if self.need_tf_idf:
            self.signals.PrintInfo.emit("?????? TF-IDF...")
            idf_filename = output_dir + 'tf_idf.csv'
            msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
            self.signals.PrintInfo.emit(msg)

        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(input_texts)

        svd = TruncatedSVD(2)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        X = lsa.fit_transform(X)

        aa_clusterizator = AffinityPropagation(damping=self.aa_damping,
                                               max_iter=self.aa_max_iter,
                                               convergence_iter=self.aa_no_change_stop)

        predict_result = aa_clusterizator.fit_predict(X)
        self.signals.PrintInfo.emit('\nBreakdown by clusters:\n')

        clasters_output = ''
        for claster_index in range(max(predict_result) + 1):
            clasters_output += ('Cluster ' + str(claster_index) + ':\n')
            for predict, document in zip(predict_result, short_filenames):
                if predict == claster_index:
                    clasters_output += ('  ' + str(document) + '\n')
            clasters_output += '\n'
        self.signals.PrintInfo.emit(clasters_output)
        self.signals.PrintInfo.emit('Saved to: ' + str(output_dir + 'clusters.txt'))
        writeStringToFile(clasters_output, output_dir + 'clusters.txt')

        self.draw_clusters_plot(X, predict_result, short_filenames)
Project: TextStageProcessor    Author: mhyhre    | project source | file source
def make_birch_clustering(self, short_filenames, input_texts):

        output_dir = self.output_dir + 'birch/'
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        if self.need_tf_idf:
            self.signals.PrintInfo.emit("?????? TF-IDF...")
            idf_filename = output_dir + 'tf_idf.csv'
            msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
            self.signals.PrintInfo.emit(msg)

        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(input_texts)

        svd = TruncatedSVD(2)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        X = lsa.fit_transform(X)

        birch = Birch(threshold=self.birch_threshold,
                      branching_factor=self.birch_branching_factor,
                      n_clusters=self.birch_clusters_count)

        predict_result = birch.fit_predict(X)
        self.signals.PrintInfo.emit('\nBreakdown by clusters:\n')

        clasters_output = ''
        for claster_index in range(max(predict_result) + 1):
            clasters_output += ('Cluster ' + str(claster_index) + ':\n')
            for predict, document in zip(predict_result, short_filenames):
                if predict == claster_index:
                    clasters_output += ('  ' + str(document) + '\n')
            clasters_output += '\n'
        self.signals.PrintInfo.emit(clasters_output)
        self.signals.PrintInfo.emit('Saved to: ' + str(output_dir + 'clusters.txt'))
        writeStringToFile(clasters_output, output_dir + 'clusters.txt')

        self.draw_clusters_plot(X, predict_result, short_filenames)
Project: mlprojects-py    Author: srinathperera    | project source | file source
def doPCA(X, output_columns_count):
    # Reduce the data with TruncatedSVD (LSA-style, no centering) and normalize the result
    svd = TruncatedSVD(output_columns_count)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)

    X = lsa.fit_transform(X)
    return X
Project: eezzy    Author: 3Blades    | project source | file source
def SVD_results(data, n_comps=None):
    svd = SVD(n_components=n_comps)
    model = svd.fit(data)
    out_data = {'model' : model, 'reconstruction error': svd.reconstruction_err_ }
    return 'SVD', out_data
Project: subjectClassify_py    Author: haohhxx    | project source | file source
def SVD_Vec(matData, dimension):
    svd = TruncatedSVD(n_components=dimension)
    newData = svd.fit_transform(matData)
    return newData
Project: DocumentClassification    Author: bahmanh    | project source | file source
def featuresByLSA(features,ncomponents=100):
    svd = TruncatedSVD(n_components=ncomponents)
    normalizer =  Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    dtm_lsa = lsa.fit_transform(features)
    return dtm_lsa
Project: partisan-discourse    Author: DistrictDataLabs    | project source | file source
def construct_pipeline(classifier):
    """
    This function creates a feature extraction pipeline that accepts data
    from a CorpusLoader and appends the classification model to the end of
    the pipeline, returning a newly constructed Pipeline object that is
    ready to be fit and trained!
    """

    return Pipeline([
        # Create a Feature Union of Text Stats and Bag of Words
        ('union', FeatureUnion(
            transformer_list = [

                # Pipeline for pulling document structure features
                ('stats', Pipeline([
                    ('stats', TextStats()),
                    ('vect', DictVectorizer()),
                ])),

                # Pipeline for creating a bag of words TF-IDF vector
                ('bow', Pipeline([
                    ('tokens', TextNormalizer()),
                    ('tfidf',  TfidfVectorizer(
                        tokenizer=identity, preprocessor=None, lowercase=False
                    )),
                    ('best', TruncatedSVD(n_components=1000)),
                ])),

            ],

            # weight components in feature union
            transformer_weights = {
                'stats': 0.15,
                'bow': 0.85,
            },
        )),

        # Append the estimator to the end of the pipeline
        ('classifier', classifier),
    ])
Project: Kaggle_HomeDepot    Author: ChenglongChen    | project source | file source
def transform(self):
        tfidf = self._init_word_ngram_tfidf(self.ngram)
        X = tfidf.fit_transform(self.obs_corpus)
        svd = TruncatedSVD(n_components = self.svd_dim, 
                n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
        return svd.fit_transform(X)
Project: Kaggle_HomeDepot    Author: ChenglongChen    | project source | file source
def transform(self):
        tfidf = self._init_char_ngram_tfidf(self.ngram)
        X = tfidf.fit_transform(self.obs_corpus)
        svd = TruncatedSVD(n_components=self.svd_dim, 
                n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
        return svd.fit_transform(X)


# ------------------------ Cooccurrence LSA -------------------------------
# 1st in CrowdFlower
Project: Kaggle_HomeDepot    Author: ChenglongChen    | project source | file source
def transform(self):
        ## tfidf
        tfidf = self._init_word_ngram_tfidf(ngram=self.ngram)
        X_obs = tfidf.fit_transform(self.obs_corpus)
        X_target = tfidf.fit_transform(self.target_corpus)
        X_tfidf = scipy.sparse.hstack([X_obs, X_target]).tocsr()
        ## svd
        svd = TruncatedSVD(n_components=self.svd_dim, 
                n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
        X_svd = svd.fit_transform(X_tfidf)
        return X_svd


# -------------------------------- TSNE ------------------------------------------
# 2nd in CrowdFlower (preprocessing_mikhail.py)
Project: kaggle-quora-solution-8th    Author: qqgeogor    | project source | file source
def svd(train,test,dims=20,it=15,file_name='tf_idf',path='data/'):
    svd=TruncatedSVD(n_iter=it,random_state=1123,n_components=dims)
    svd.fit(train)
    pd.to_pickle(svd.transform(train),path+'train_svd_'+str(dims)+'_'+file_name+'.pkl')
    pd.to_pickle(svd.transform(test),path+'test_svd_'+str(dims)+'_'+file_name+'.pkl')
    return 'Success'


# In[3]:
Project: kaggle-quora-solution-8th    Author: qqgeogor    | project source | file source
def svd(train,test,dims=100,it=15,file_name='tf_idf',path='data/'):
    svd=TruncatedSVD(n_iter=it,random_state=1123,n_components=dims)
    svd.fit(train)
    pd.to_pickle(svd.transform(train),path+'train_svd_'+str(dims)+'_'+file_name+'.pkl')
    pd.to_pickle(svd.transform(test),path+'test_svd_'+str(dims)+'_'+file_name+'.pkl')
    return 'Success'


# In[12]:
Project: bnp    Author: mpearmain    | project source | file source
def buildKB16(n_comp = 200, seed_value = 123):
    ## data
    # read the training/test data  
    print('Importing Data')
    xtrain = pd.read_csv('../input/xtrain_kb6099.csv')
    xtest = pd.read_csv('../input/xtest_kb6099.csv')

    # separate 
    id_train = xtrain.ID; xtrain.drop('ID', axis = 1, inplace = True)
    ytrain = xtrain.target; xtrain.drop('target', axis = 1, inplace = True)
    id_test = xtest.ID; xtest.drop('ID', axis = 1, inplace = True)

    # fit SVD
    svd = TruncatedSVD(n_components = n_comp,n_iter=5, random_state= seed_value)
    svd.fit(xtrain)
    xtrain = svd.transform(xtrain)
    xtest = svd.transform(xtest)
    xtrain = pd.DataFrame(xtrain)
    xtest = pd.DataFrame(xtest)

    ## store the results
    # add indices etc
    xtrain = pd.DataFrame(xtrain)
    xtrain['ID'] = id_train
    xtrain['target'] = ytrain
#
    xtest = pd.DataFrame(xtest)
    xtest['ID'] = id_test
#
#
#    # save the files
    xtrain.to_csv('../input/xtrain_kb16c'+str(n_comp)+'.csv', index = False, header = True)
    xtest.to_csv('../input/xtest_kb16c'+str(n_comp)+'.csv', index = False, header = True)

    return
Project: AE_ts    Author: RobRomijnders    | project source | file source
def plot_z_run(z_run, label, ):
    from sklearn.decomposition import TruncatedSVD
    f1, ax1 = plt.subplots(2, 1)

    PCA_model = TruncatedSVD(n_components=3).fit(z_run)
    z_run_reduced = PCA_model.transform(z_run)
    ax1[0].scatter(z_run_reduced[:, 0], z_run_reduced[:, 1], c=label, marker='*', linewidths=0)
    ax1[0].set_title('PCA on z_run')
    from sklearn.manifold import TSNE
    tSNE_model = TSNE(verbose=2, perplexity=80, min_grad_norm=1E-12, n_iter=3000)
    z_run_tsne = tSNE_model.fit_transform(z_run)
    ax1[1].scatter(z_run_tsne[:, 0], z_run_tsne[:, 1], c=label, marker='*', linewidths=0)
    ax1[1].set_title('tSNE on z_run')
    return
Project: jamespy_py3    Author: jskDr    | project source | file source
def cv_gp_kernel(self, kernel, n, cv=5):
        X = self.X
        y = self.y
        Xn = TruncatedSVD(n).fit_transform(X)
        cv = cross_val_score(GaussianProcessClassifier(kernel=kernel), Xn, y, cv=cv)
        return cv
Project: Informed-Finance-Canary    Author: Darthone    | project source | file source
def tfidf(corpus, corpusKeys, use_dict=False):
    #TODO clean this up
    #discard any stop words - saves on processing
    stopset = list(stopwords.words('english'))
    stopset.append('000')
    stopset.extend([str(x) for x in range(9999)])
    vectorizer = TfidfVectorizer(stop_words=stopset, use_idf=True, ngram_range=(2,3))

    #matrix of input set
    X = (vectorizer.fit_transform(corpus)).toarray()
    size_matrix = X.shape[0] 
    lsa = TruncatedSVD(n_components=size_matrix, n_iter=100)
    terms = vectorizer.get_feature_names()
    records = []
    if use_dict:
        records = {}

    for i, comp in enumerate(X):
        termsInComp = zip(terms, comp)
        sortedTerms = sorted(termsInComp, key=lambda x: x[1], reverse=True) [:10]

        #List with all the terms gathered from the tfidf vectorizer
        termList = [term[0] + '.' for term in sortedTerms]

        # List with Article ID and list of tfidf terms
        if use_dict:
            records[corpusKeys[i]] = ((vader(corpusKeys[i], termList), termList))
        else:
            records.append((vader(corpusKeys[i], termList), termList))
    return records
Project: Informed-Finance-Canary    Author: Darthone    | project source | file source
def tfidf():
    qry = (StockArticle.select(Article.id, Article.title, Article.content, Article.date, Stock.id.alias('stock_id'), Stock.ticker, StockArticle).join(Stock, on=(StockArticle.stock_id == Stock.id)).join(Article, on=(StockArticle.article_id == Article.id)).where((Stock.ticker == 'GM.N') | (Stock.ticker == 'TGT.N') | (Stock.ticker == 'UAA') | (Stock.ticker == 'UAA.N'), Article.date > '2015-01-01').naive())
    corpusDict = {article.article_id : article.content for article in qry }
    corpus = corpusDict.values()
    corpusKeys = corpusDict.keys()

    #discard any stop words - saves on processing
    stopset = list(stopwords.words('english'))
    stopset.append('000')
    for i in range(9999):
        stopset.append(str(i))
    vectorizer = TfidfVectorizer(stop_words=stopset, use_idf=True, ngram_range=(2,3))

    #matrix of input set
    X = vectorizer.fit_transform(corpus)
    X = X.toarray()
    size_matrix = X.shape[0] 
    lsa = TruncatedSVD(n_components=size_matrix, n_iter=100)
    #lsa.fit(X)
    terms = vectorizer.get_feature_names()
    tfidfList = []
    for i, comp in enumerate(X):
        termsInComp = zip(terms,comp)
        sortedTerms = sorted(termsInComp, key=lambda x: x[1], reverse=True) [:10]

        #List with all the terms gathered from the tfidf vectorizer
        termList = [term[0] + '.' for term in sortedTerms]

        # List with Article ID and list of tfidf terms
        tfidfList = [corpusKeys[i],termList]

        vader(tfidfList)
Project: Machine-Learning    Author: zjuzpz    | project source | file source
def getSVD(data):
    svd = TruncatedSVD(n_components=50, n_iter=5)
    matrix = solution(data)
    svd_matrix = svd.fit_transform(matrix)
    return svd_matrix