The following code examples, extracted from open-source Python projects, illustrate how to use sklearn.datasets.fetch_20newsgroups().
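Before the project-specific examples, here is a minimal, self-contained sketch of the call itself (the two categories chosen are arbitrary, for illustration only): fetch_20newsgroups() downloads and caches the corpus on first use and returns a Bunch whose .data, .target, and .target_names fields the examples below rely on.

from sklearn.datasets import fetch_20newsgroups

# Load only the training split of two illustrative categories, stripping
# headers, footers, and quoted replies so models cannot key on metadata.
bunch = fetch_20newsgroups(
    subset='train',                              # 'train', 'test', or 'all'
    categories=['sci.space', 'rec.autos'],       # None loads all 20 groups
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)

print(len(bunch.data))        # raw message texts (list of str)
print(bunch.target[:5])       # integer class labels
print(bunch.target_names)     # class names, e.g. ['rec.autos', 'sci.space']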
def test_bag_of_words_for_series():
    dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                                 remove=('headers', 'footers', 'quotes'))
    series = XSeries(dataset.data[:10])
    assert series.data_type == str

    translator = str.maketrans('', '', string.punctuation)
    tokenizer_transformer = XSeriesTransformer(
        transform_function=lambda text: text.lower().translate(translator).strip().split()
    )
    transformed_series = tokenizer_transformer.fit_transform(series)
    # print(transformed_series)

    bag_transform = BagOfWordsTransformer()
    transformed_series = bag_transform.fit_transform(transformed_series)
    # print(transformed_series)

    assert type(transformed_series) == XDataFrame
def load_20ng_dataset_bow():
    """
    Loads the 20NG dataset as dense tf-idf features.

    :return: train_data, train_labels, test_data, test_labels
    """
    newsgroups_train = fetch_20newsgroups(subset='train')
    newsgroups_test = fetch_20newsgroups(subset='test')

    # Convert data to tf-idf
    vectorizer = TfidfVectorizer(min_df=0.01, max_df=0.95)
    train_data = vectorizer.fit_transform(newsgroups_train.data)
    test_data = vectorizer.transform(newsgroups_test.data)

    train_data = train_data.todense()
    test_data = test_data.todense()

    train_labels = newsgroups_train.target
    test_labels = newsgroups_test.target

    return train_data, train_labels, test_data, test_labels
def demo_command(args):
    def create_data_file(partition, filename, samples):
        data = pandas.DataFrame(
            {TEXT_NAME: partition.data,
             LABEL_NAME: [partition.target_names[target] for target in partition.target]}).dropna()[:samples]
        data.to_csv(filename, index=False)
        return filename

    os.makedirs(args.directory, exist_ok=True)
    print("Download a portion of the 20 Newsgroups data and create train.csv and test.csv.")
    newsgroups_train = fetch_20newsgroups(subset="train", remove=("headers", "footers", "quotes"))
    newsgroups_test = fetch_20newsgroups(subset="test", remove=("headers", "footers", "quotes"))
    train_filename = create_data_file(newsgroups_train, os.path.join(args.directory, "train.csv"), 1000)
    test_filename = create_data_file(newsgroups_test, os.path.join(args.directory, "test.csv"), 100)
    model_directory = os.path.join(args.directory, "model")

    print("Train a model.\n")
    cmd = "train bow %s --save-model %s --epochs 5 --logging progress\n" % (train_filename, model_directory)
    print("mycroft " + cmd)
    default_main(cmd.split())

    print("\nEvaluate it on the test data.\n")
    cmd = "evaluate %s %s\n" % (model_directory, test_filename)
    print("mycroft " + cmd)
    default_main(cmd.split())

    print("\n(Note that there is not enough training data here to generate accurate predictions.)")
def get_data():
    from sklearn.datasets import fetch_20newsgroups
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

    vectorizer = CountVectorizer()
    categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

    # Train set
    newsgroups_train = fetch_20newsgroups(subset='train',
                                          categories=categories, shuffle=True)
    X_train = vectorizer.fit_transform(newsgroups_train.data)
    y_train = newsgroups_train.target

    # Test set
    newsgroups_test = fetch_20newsgroups(subset='test',
                                         categories=categories, shuffle=True)
    X_test = vectorizer.transform(newsgroups_test.data)
    y_test = newsgroups_test.target

    return X_train, y_train, X_test, y_test
def test_build(self):
    newsgroups_train = fetch_20newsgroups(subset='train',
                                          remove=('headers', 'footers', 'quotes'))
    count_vectorizer = CountVectorizer()
    X_counts = count_vectorizer.fit_transform(newsgroups_train.data)
    corpus = CorpusFromScikit(
        X=X_counts,
        y=newsgroups_train.target,
        feature_vocabulary=count_vectorizer.vocabulary_,
        category_names=newsgroups_train.target_names,
        raw_texts=newsgroups_train.data
    ).build()
    self.assertEqual(corpus.get_categories()[:2], ['alt.atheism', 'comp.graphics'])
    self.assertEqual(corpus
                     .get_term_freq_df()
                     .assign(score=corpus.get_scaled_f_scores('alt.atheism'))
                     .sort_values(by='score', ascending=False).index.tolist()[:5],
                     ['atheism', 'atheists', 'islam', 'atheist', 'belief'])
    self.assertGreater(len(corpus.get_texts()[0]), 5)
def test_build(self):
    from sklearn.datasets import fetch_20newsgroups
    from sklearn.feature_extraction.text import CountVectorizer

    newsgroups_train = fetch_20newsgroups(subset='train',
                                          remove=('headers', 'footers', 'quotes'))
    count_vectorizer = CountVectorizer()
    X_counts = count_vectorizer.fit_transform(newsgroups_train.data)
    term_doc_mat = TermDocMatrixFromScikit(
        X=X_counts,
        y=newsgroups_train.target,
        feature_vocabulary=count_vectorizer.vocabulary_,
        category_names=newsgroups_train.target_names).build()
    self.assertEqual(term_doc_mat.get_categories()[:2], ['alt.atheism', 'comp.graphics'])
    self.assertEqual(term_doc_mat
                     .get_term_freq_df()
                     .assign(score=term_doc_mat.get_scaled_f_scores('alt.atheism'))
                     .sort_values(by='score', ascending=False).index.tolist()[:5],
                     ['atheism', 'atheists', 'islam', 'atheist', 'belief'])
def newsgroups_class_distrib():
    from sklearn.datasets import fetch_20newsgroups

    ngroup_test = fetch_20newsgroups(subset='test',
                                     remove=('headers', 'footers', 'quotes'),
                                     categories=None)
    ngroup_train = fetch_20newsgroups(subset='train',
                                      remove=('headers', 'footers', 'quotes'),
                                      categories=None)
    test_data = ngroup_test.data
    train_data = ngroup_train.data
    test_groups = ngroup_test.target
    train_groups = ngroup_train.target

    n = 2000
    train_groups = train_groups[:n]
    test_groups = test_groups[:n]

    # `density` replaces the removed `normed` histogram argument.
    plt.figure()
    plt.hist(train_groups, 20, density=True, range=(0, 19))
    plt.title("train groups")

    plt.figure()
    plt.hist(test_groups, 20, density=True, range=(0, 19))
    plt.title("test groups")

    plt.show()
def newsgroups(*, path=None, key=None, limit=None):
    """
    Return a list of newsgroup messages from the 20 newsgroups dataset.

    Arguments:
    - path(str): Unused in this case. Dataset is managed by sklearn.
    - key(str): Unused.
    - limit(int): If given, return at most this many messages.
    """
    # This is going to download the dataset the first time we
    # run this function. Ideally we can populate these datasets
    # ahead of time.
    from sklearn.datasets import fetch_20newsgroups

    if limit:
        return fetch_20newsgroups(subset='train').data[:limit]
    return fetch_20newsgroups(subset='train').data
def test_bag_of_words_for_series_pipeline():
    dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                                 remove=('headers', 'footers', 'quotes'))

    n = 100
    series = XSeries(dataset.data[:n])
    assert series.data_type == str

    translator = str.maketrans('', '', string.punctuation)
    tokenizer_transformer = XSeriesTransformer(
        transform_function=lambda text: text.lower().translate(translator).strip().split()
    )
    # series = tokenizer_transformer.transform(series)

    Y = np.random.binomial(1, 0.5, n)

    pipeline = PipeLineChain([
        ('preprocessing', XSeriesTransformer(
            transform_function=lambda text: text.lower().translate(translator).strip().split()
        )),
        ('extractor', BagOfWordsTransformer()),
        ('pca', PCA(n_components=10)),
        # ('svc', LinearSVC())
    ])

    pipeline = pipeline.fit(series)
    transformed_series = pipeline.transform(series)
    # print(transformed_series)
def setUp(self):
    """Load the test data (20 Newsgroups corpus)."""
    newsdata = fetch_20newsgroups(data_home="./data/")
    self.ids = [str(i) for i in range(len(newsdata.target))]
    self.texts = newsdata.data
    self.labels = [newsdata.target_names[idx] for idx in newsdata.target]
    self.tc = TextClassifier(self.texts, self.ids)
def case1():
    from sklearn import datasets
    news = datasets.fetch_20newsgroups(subset='all')

    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    vec = CountVectorizer()
    x = vec.fit_transform(news.data)
    print(x[:10, :10].toarray())

    TFIDF = TfidfTransformer()
    x_tfidf = TFIDF.fit_transform(x)
    print(x_tfidf[:10, :10].toarray())

    # train_test_split now lives in sklearn.model_selection
    from sklearn.model_selection import train_test_split
    Xtrain, Xtest, ytrain, ytest = train_test_split(
        x, news.target, test_size=0.3, random_state=233)
    tf_Xtrain, tf_Xtest, tf_ytrain, tf_ytest = train_test_split(
        x_tfidf, news.target, test_size=0.3, random_state=233)

    from sklearn.naive_bayes import MultinomialNB
    mnb = MultinomialNB()
    tf_mnb = MultinomialNB()
    mnb.fit(Xtrain, ytrain)
    tf_mnb.fit(tf_Xtrain, tf_ytrain)
def __init__(self, cfg=None):
    super().__init__()
    self.__dataset__ = fetch_20newsgroups(subset=cfg['subset'],
                                          categories=cfg['categories'],
                                          shuffle=cfg['shuffle'],
                                          random_state=cfg['random_state'])
def get_datasets_20newsgroup(subset='train', categories=None, shuffle=True, random_state=42):
    """
    Retrieve data from 20 newsgroups.

    :param subset: 'train', 'test' or 'all'
    :param categories: list of newsgroup names to keep (None keeps all)
    :param shuffle: whether to shuffle the documents
    :param random_state: seed used to shuffle the dataset
    :return: data and labels of the newsgroups
    """
    datasets = fetch_20newsgroups(subset=subset, categories=categories,
                                  shuffle=shuffle, random_state=random_state)
    return datasets
def main():
    # parameters
    num_features = 400  # vocabulary size

    # load data
    print("loading 20 newsgroups dataset...")
    categories = ['rec.autos', 'rec.sport.hockey', 'comp.graphics', 'sci.space']
    tic = time()
    dataset = fetch_20newsgroups(shuffle=True, random_state=0,
                                 categories=categories,
                                 remove=('headers', 'footers', 'quotes'))
    train_corpus = dataset.data  # list of documents from the selected categories
    train_labels = dataset.target
    toc = time()
    print("elapsed time: %.4f sec" % (toc - tic))

    # tf-idf vectorizer
    tfidf = TfidfVectorizer(max_df=0.5, max_features=num_features,
                            min_df=2, stop_words='english', use_idf=True)
    X_tfidf = tfidf.fit_transform(train_corpus).toarray()

    # append document labels
    train_labels = train_labels.reshape(-1, 1)
    X_all = np.hstack([train_labels, X_tfidf])

    # distribute the data
    sc = SparkContext('local', 'log_reg')
    rdd = sc.parallelize(X_all)
    labeled_corpus = rdd.map(parse_doc)
    train_RDD, test_RDD = labeled_corpus.randomSplit([8, 2], seed=0)

    # distributed logistic regression
    print("training logistic regression...")
    model = LogisticRegressionWithLBFGS.train(train_RDD, regParam=1, regType='l1',
                                              numClasses=len(categories))

    # evaluate the model on test data
    labels_and_preds = test_RDD.map(lambda p: (p.label, model.predict(p.features)))
    test_err = labels_and_preds.filter(lambda vp: vp[0] != vp[1]).count() / float(test_RDD.count())
    print("log-reg test error: ", test_err)

    # model.save(sc, './log_reg_lbfgs_model')
def fetch_data(self, subset='train', categories=None):
    """Fetch the 20 newsgroups data and cache it under the given subset key.

    Arguments:
        subset -> string -- which split to load: 'train' / 'test' / 'all'
    """
    rand = np.random.mtrand.RandomState(8675309)
    data = fetch_20newsgroups(subset=subset,
                              categories=categories,
                              shuffle=True,
                              random_state=rand)
    self.data[subset] = data
def load_newsgroups():
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.datasets import fetch_20newsgroups

    newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
    vectorizer = TfidfVectorizer(max_features=2000, dtype=np.float64, sublinear_tf=True)
    x_sparse = vectorizer.fit_transform(newsgroups.data)
    x = np.asarray(x_sparse.todense())
    y = newsgroups.target
    print('News group data shape ', x.shape)
    print("News group number of clusters: ", np.unique(y).size)
    return x, y
def get_data():
    data = fetch_20newsgroups(subset='all', shuffle=True,
                              remove=('headers', 'footers', 'quotes'))
    return data
def download_articles(name, categories, subset):
    data = {}
    print("Downloading articles")
    newsgroups_data = fetch_20newsgroups(subset=subset, categories=categories, remove=())
    for i in range(len(newsgroups_data['data'])):
        line = newsgroups_data['data'][i]
        data[str(len(data))] = {'text': line,
                                'label': newsgroups_data['target_names'][newsgroups_data['target'][i]]}
    print(len(data))

    raw_data_dir = os.path.join('..', 'data', '20ng', name)
    print("Saving to", raw_data_dir)
    fh.makedirs(raw_data_dir)
    fh.write_to_json(data, os.path.join(raw_data_dir, subset + '.json'))
def test_20news():
    try:
        data = datasets.fetch_20newsgroups(
            subset='all', download_if_missing=False, shuffle=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    # Extract a reduced dataset
    data2cats = datasets.fetch_20newsgroups(
        subset='all', categories=data.target_names[-1:-3:-1], shuffle=False)
    # Check that the ordering of the target_names is the same
    # as the ordering in the full dataset
    assert_equal(data2cats.target_names, data.target_names[-2:])
    # Assert that we have only 0 and 1 as labels
    assert_equal(np.unique(data2cats.target).tolist(), [0, 1])

    # Check that the number of filenames is consistent with data/target
    assert_equal(len(data2cats.filenames), len(data2cats.target))
    assert_equal(len(data2cats.filenames), len(data2cats.data))

    # Check that the first entry of the reduced dataset corresponds to
    # the first entry of the corresponding category in the full dataset
    entry1 = data2cats.data[0]
    category = data2cats.target_names[data2cats.target[0]]
    label = data.target_names.index(category)
    entry2 = data.data[np.where(data.target == label)[0][0]]
    assert_equal(entry1, entry2)
def test_20news_length_consistency():
    """Checks the length consistencies within the bunch

    This is a non-regression test for a bug present in 0.16.1.
    """
    try:
        data = datasets.fetch_20newsgroups(
            subset='all', download_if_missing=False, shuffle=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")
    # Extract the full dataset
    data = datasets.fetch_20newsgroups(subset='all')

    assert_equal(len(data['data']), len(data.data))
    assert_equal(len(data['target']), len(data.target))
    assert_equal(len(data['filenames']), len(data.filenames))
def test_train_model():
    data = fetch_20newsgroups(
        random_state=42,
        categories=['sci.crypt', 'sci.electronics', 'sci.med', 'sci.space'])
    limit = 200
    if limit is not None:
        data['target'] = data['target'][:limit]
        data['data'] = data['data'][:limit]
    n_domains = int(len(data['target']) / 5)
    docs = [
        {
            'html': '\n'.join('<p>{}</p>'.format(t) for t in text.split('\n')),
            'url': 'http://example-{}.com/{}'.format(n % n_domains, n),
            'relevant': {'sci.space': True, 'sci.med': None}.get(
                data['target_names'][target], False),
        }
        for n, (text, target) in enumerate(zip(data['data'], data['target']))]
    result = train_model(docs)
    pprint(attr.asdict(result.meta))
    assert lst_as_dict(result.meta.advice) == [
        {'kind': 'Notice',
         'text': "The quality of the classifier is very good, ROC AUC is 0.96. "
                 "You can label more pages if you want to improve quality, "
                 "but it's better to start crawling "
                 "and check the quality of crawled pages.",
         },
    ]
    assert lst_as_dict(result.meta.description) == [
        {'heading': 'Dataset',
         'text': '200 documents, 159 labeled across 40 domains.'},
        {'heading': 'Class balance',
         'text': '33% relevant, 67% not relevant.'},
        {'heading': 'Metrics', 'text': ''},
        {'heading': 'Accuracy', 'text': '0.881 ± 0.122'},
        {'heading': 'ROC AUC', 'text': '0.964 ± 0.081'}]
    assert len(result.meta.weights['pos']) > 0
    assert len(result.meta.weights['neg']) > 0
    assert isinstance(result.model, BaseModel)
    assert hasattr(result.model, 'predict_proba')
def main(): """ Cluster the newsgroups dataset and measure against labels. In this script, we're doing a grid search against various TFIDF representations of the newsgroups dataset. We want a TFIDF representation that has a good unsupervised representation. We're measuring the quality of that unsupervised representation by how well it matches up to the actual supervised labels of the newsgroups dataset. """ newsgroups = fetch_20newsgroups( subset='train', categories=CATEGORIES, shuffle=True ) print("Loaded data") gridsearch = GridSearchCV( Pipeline([ ('vec', TfidfVectorizer()), ('cluster', ClusteringWithSupervision( cluster_instance=MiniBatchKMeans())) ]), { 'vec__stop_words': (None, 'english') } ) print("Defined pipeline. Beginning fit.") gridsearch.fit(newsgroups.data, newsgroups.target) print_best_worst(gridsearch.cv_results_) best_estimator = gridsearch.best_estimator_ predicted = best_estimator.predict(newsgroups.data) print( classification_report( newsgroups.target, predicted, target_names=newsgroups.target_names))
def load_20news(setName):
    newsgroups_subset = fetch_20newsgroups(subset=setName,
                                           remove=('headers', 'footers'))  # , 'quotes'

    totalLineNum = 0
    readDocNum = 0
    print("Loading 20 newsgroup %s data..." % setName)

    setDocNum = len(newsgroups_subset.data)
    orig_docs_name = []
    orig_docs_cat = []
    orig_docs_words = []

    catNum = len(newsgroups_subset.target_names)
    cats_docsWords = [[] for i in range(catNum)]
    cats_docNames = [[] for i in range(catNum)]

    emptyFileNum = 0

    for d, text in enumerate(newsgroups_subset.data):
        if d % 50 == 49 or d == setDocNum - 1:
            print("\r%d %d\r" % (d + 1, totalLineNum), end='')

        lines = text.split("\n")
        if len(text) == 0 or len(lines) == 0:
            emptyFileNum += 1
            continue

        readDocNum += 1
        totalLineNum += len(lines)

        catID = newsgroups_subset.target[d]
        category = newsgroups_subset.target_names[catID]

        text = " ".join(lines)
        wordsInSentences, wc = extractSentenceWords(text)
        filename = newsgroups_subset.filenames[d]
        filename = os.path.basename(filename)
        orig_docs_words.append(wordsInSentences)
        orig_docs_name.append(filename)
        orig_docs_cat.append(catID)
        cats_docsWords[catID].append(wordsInSentences)
        cats_docNames[catID].append(filename)

    print("Done. %d docs read, %d empty docs skipped. Totally %d lines" %
          (readDocNum, emptyFileNum, totalLineNum))
    return setDocNum, orig_docs_words, orig_docs_name, orig_docs_cat, \
        cats_docsWords, cats_docNames, newsgroups_subset.target_names
def initializeData(data):
    # graphics_train = fetch_20newsgroups(subset=dataSet,
    #     categories=categories, shuffle=True, random_state=42)
    wnl = WordNetLemmatizer()
    stop_words = text.ENGLISH_STOP_WORDS

    # List of dicts, each element represents the word-to-count mapping for one document
    termDictList = []
    # Dictionary storing, for each term, the number of documents that contain it
    termDocCountDict = {}
    # set of terms
    termSet = set()
    # list of ints, each element is the total number of terms in one tokenized document
    termCountList = []

    # get document frequency for each term
    for i in range(len(data)):
        document = data[i].lower()
        words = set(word_tokenize(document))
        for word in words:
            if word.isalpha():
                term = wnl.lemmatize(word)
                if term not in stop_words:
                    if term not in termDocCountDict:
                        termDocCountDict[term] = 0
                    termDocCountDict[term] += 1

    # get termDict and termSet
    for i in range(len(data)):
        termDict = {}
        termCount = 0
        document = data[i].lower()
        words = word_tokenize(document)
        for word in words:
            if word.isalpha():
                term = wnl.lemmatize(word)
                if term not in stop_words:
                    if term in termDocCountDict:
                        if termDocCountDict[term] >= 110 and termDocCountDict[term] <= 11000:
                            termSet.add(term)
                            termCount += 1
                            # fill in termDict
                            if term not in termDict:
                                termDict[term] = 0
                            termDict[term] += 1
                        else:
                            del termDocCountDict[term]
        termDictList.append(termDict)
        termCountList.append(termCount)

    return (termDictList, termCountList, termDocCountDict, termSet)
def main(): """ Train a classifier on the 20 newsgroups dataset. The purpose of this is mostly trying to figure out how to turn text into really good vector representations for classification... which are also hopefully good vector representations for unsupervised learning too. """ # We don't really use our interfaces for iterating over datasets... # but maybe we will in the future. train = fetch_20newsgroups( subset='train', # categories=CATEGORIES, shuffle=True ) test = fetch_20newsgroups( subset='test', # categories=CATEGORIES, shuffle=True ) print("Loaded data.", len(set(train.target)), "classes.") glove_vectors = glove_simple() print("Loaded word vectors") pipeline = Pipeline([ # ('vec', TfidfVectorizer()), ('vec', WordVectorSum(vector_dict=glove_vectors)), # ('svd', TruncatedSVD()), ('fit', SGDClassifier()) ]) print("Defined pipeline. Beginning fit.") gridsearch = GridSearchCV( pipeline, { # 'vec__stop_words': ('english',), # 'svd__n_components': (2, 100, 500, 1000), # 'vec__min_df': (1, 0.01, 0.1, 0.4), # 'vec__max_df': (0.5, 0.75, 0.9, 1.0), # 'vec__max_features': (100, 1000, 10000) } ) gridsearch.fit(train.data, train.target) print("Completed fit. Beginning prediction") predicted = gridsearch.predict(test.data) print("Completed prediction.") accuracy = np.mean(predicted == test.target) print("Accuracy was", accuracy) print("Best params", gridsearch.best_params_) print_best_worst(gridsearch.cv_results_) print( classification_report( test.target, predicted, target_names=test.target_names))