The following code examples, extracted from open-source Python projects, illustrate how to use sklearn.datasets.fetch_20newsgroups().
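Before the project-specific examples, here is a minimal, self-contained sketch of the call itself (the two categories chosen are arbitrary, for illustration only): fetch_20newsgroups() downloads and caches the corpus on first use and returns a Bunch whose .data, .target, and .target_names fields the examples below rely on.

from sklearn.datasets import fetch_20newsgroups

# Load only the training split of two illustrative categories, stripping
# headers, footers, and quoted replies so models cannot key on metadata.
bunch = fetch_20newsgroups(
    subset='train',                              # 'train', 'test', or 'all'
    categories=['sci.space', 'rec.autos'],       # None loads all 20 groups
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)

print(len(bunch.data))        # raw message texts (list of str)
print(bunch.target[:5])       # integer class labels
print(bunch.target_names)     # class names, e.g. ['rec.autos', 'sci.space']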
def test_bag_of_words_for_series():
    dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                                 remove=('headers', 'footers', 'quotes'))
    series = XSeries(dataset.data[:10])
    assert series.data_type == str

    translator = str.maketrans('', '', string.punctuation)
    tokenizer_transformer = XSeriesTransformer(
        transform_function=lambda text: text.lower().translate(translator).strip().split()
    )
    transformed_series = tokenizer_transformer.fit_transform(series)
    # print(transformed_series)

    bag_transform = BagOfWordsTransformer()
    transformed_series = bag_transform.fit_transform(transformed_series)
    # print(transformed_series)

    assert type(transformed_series) == XDataFrame
def load_20ng_dataset_bow():
    """
    Loads the 20NG dataset as dense tf-idf features.

    :return: train_data, train_labels, test_data, test_labels
    """
    newsgroups_train = fetch_20newsgroups(subset='train')
    newsgroups_test = fetch_20newsgroups(subset='test')

    # Convert data to tf-idf
    vectorizer = TfidfVectorizer(min_df=0.01, max_df=0.95)
    train_data = vectorizer.fit_transform(newsgroups_train.data)
    test_data = vectorizer.transform(newsgroups_test.data)

    train_data = train_data.todense()
    test_data = test_data.todense()

    train_labels = newsgroups_train.target
    test_labels = newsgroups_test.target

    return train_data, train_labels, test_data, test_labels
def demo_command(args):
    def create_data_file(partition, filename, samples):
        data = pandas.DataFrame(
            {TEXT_NAME: partition.data,
             LABEL_NAME: [partition.target_names[target] for target in partition.target]}).dropna()[:samples]
        data.to_csv(filename, index=False)
        return filename

    os.makedirs(args.directory, exist_ok=True)
    print("Download a portion of the 20 Newsgroups data and create train.csv and test.csv.")
    newsgroups_train = fetch_20newsgroups(subset="train", remove=("headers", "footers", "quotes"))
    newsgroups_test = fetch_20newsgroups(subset="test", remove=("headers", "footers", "quotes"))
    train_filename = create_data_file(newsgroups_train, os.path.join(args.directory, "train.csv"), 1000)
    test_filename = create_data_file(newsgroups_test, os.path.join(args.directory, "test.csv"), 100)
    model_directory = os.path.join(args.directory, "model")

    print("Train a model.\n")
    cmd = "train bow %s --save-model %s --epochs 5 --logging progress\n" % (train_filename, model_directory)
    print("mycroft " + cmd)
    default_main(cmd.split())

    print("\nEvaluate it on the test data.\n")
    cmd = "evaluate %s %s\n" % (model_directory, test_filename)
    print("mycroft " + cmd)
    default_main(cmd.split())

    print("\n(Note that there is not enough training data here to generate accurate predictions.)")
def get_data():
    from sklearn.datasets import fetch_20newsgroups
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

    vectorizer = CountVectorizer()
    categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

    # Train set
    newsgroups_train = fetch_20newsgroups(subset='train',
                                          categories=categories, shuffle=True)
    X_train = vectorizer.fit_transform(newsgroups_train.data)
    y_train = newsgroups_train.target

    # Test set
    newsgroups_test = fetch_20newsgroups(subset='test',
                                         categories=categories, shuffle=True)
    X_test = vectorizer.transform(newsgroups_test.data)
    y_test = newsgroups_test.target

    return X_train, y_train, X_test, y_test
def test_build(self):
    newsgroups_train = fetch_20newsgroups(subset='train',
                                          remove=('headers', 'footers', 'quotes'))
    count_vectorizer = CountVectorizer()
    X_counts = count_vectorizer.fit_transform(newsgroups_train.data)
    corpus = CorpusFromScikit(
        X=X_counts,
        y=newsgroups_train.target,
        feature_vocabulary=count_vectorizer.vocabulary_,
        category_names=newsgroups_train.target_names,
        raw_texts=newsgroups_train.data
    ).build()
    self.assertEqual(corpus.get_categories()[:2], ['alt.atheism', 'comp.graphics'])
    self.assertEqual(corpus
                     .get_term_freq_df()
                     .assign(score=corpus.get_scaled_f_scores('alt.atheism'))
                     .sort_values(by='score', ascending=False).index.tolist()[:5],
                     ['atheism', 'atheists', 'islam', 'atheist', 'belief'])
    self.assertGreater(len(corpus.get_texts()[0]), 5)
def test_build(self):
    from sklearn.datasets import fetch_20newsgroups
    from sklearn.feature_extraction.text import CountVectorizer

    newsgroups_train = fetch_20newsgroups(subset='train',
                                          remove=('headers', 'footers', 'quotes'))
    count_vectorizer = CountVectorizer()
    X_counts = count_vectorizer.fit_transform(newsgroups_train.data)
    term_doc_mat = TermDocMatrixFromScikit(
        X=X_counts,
        y=newsgroups_train.target,
        feature_vocabulary=count_vectorizer.vocabulary_,
        category_names=newsgroups_train.target_names).build()
    self.assertEqual(term_doc_mat.get_categories()[:2], ['alt.atheism', 'comp.graphics'])
    self.assertEqual(term_doc_mat
                     .get_term_freq_df()
                     .assign(score=term_doc_mat.get_scaled_f_scores('alt.atheism'))
                     .sort_values(by='score', ascending=False).index.tolist()[:5],
                     ['atheism', 'atheists', 'islam', 'atheist', 'belief'])
def newsgroups_class_distrib():
    from sklearn.datasets import fetch_20newsgroups

    ngroup_test = fetch_20newsgroups(subset='test',
                                     remove=('headers', 'footers', 'quotes'),
                                     categories=None)
    ngroup_train = fetch_20newsgroups(subset='train',
                                      remove=('headers', 'footers', 'quotes'),
                                      categories=None)
    test_data = ngroup_test.data
    train_data = ngroup_train.data
    test_groups = ngroup_test.target
    train_groups = ngroup_train.target

    n = 2000
    train_groups = train_groups[:n]
    test_groups = test_groups[:n]

    # `density` replaces the removed `normed` histogram argument.
    plt.figure()
    plt.hist(train_groups, 20, density=True, range=(0, 19))
    plt.title("train groups")

    plt.figure()
    plt.hist(test_groups, 20, density=True, range=(0, 19))
    plt.title("test groups")

    plt.show()
def newsgroups(*, path=None, key=None, limit=None):
    """
    Return a list of newsgroup messages from the 20 newsgroups dataset.

    Arguments:
    - path(str): Unused in this case. Dataset is managed by sklearn.
    - key(str): Unused.
    - limit(int): If given, return at most this many messages.
    """
    # This is going to download the dataset the first time we
    # run this function. Ideally we can populate these datasets
    # ahead of time.
    from sklearn.datasets import fetch_20newsgroups

    if limit:
        return fetch_20newsgroups(subset='train').data[:limit]
    return fetch_20newsgroups(subset='train').data
def test_bag_of_words_for_series_pipeline():
    dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                                 remove=('headers', 'footers', 'quotes'))

    n = 100
    series = XSeries(dataset.data[:n])
    assert series.data_type == str

    translator = str.maketrans('', '', string.punctuation)
    tokenizer_transformer = XSeriesTransformer(
        transform_function=lambda text: text.lower().translate(translator).strip().split()
    )
    # series = tokenizer_transformer.transform(series)

    Y = np.random.binomial(1, 0.5, n)

    pipeline = PipeLineChain([
        ('preprocessing', XSeriesTransformer(
            transform_function=lambda text: text.lower().translate(translator).strip().split()
        )),
        ('extractor', BagOfWordsTransformer()),
        ('pca', PCA(n_components=10)),
        # ('svc', LinearSVC())
    ])

    pipeline = pipeline.fit(series)
    transformed_series = pipeline.transform(series)
    # print(transformed_series)
def setUp(self):
    """Load the test data (20 Newsgroups corpus)."""
    newsdata = fetch_20newsgroups(data_home="./data/")
    self.ids = [str(i) for i in range(len(newsdata.target))]
    self.texts = newsdata.data
    self.labels = [newsdata.target_names[idx] for idx in newsdata.target]
    self.tc = TextClassifier(self.texts, self.ids)
def case1():
    from sklearn import datasets
    news = datasets.fetch_20newsgroups(subset='all')

    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    vec = CountVectorizer()
    x = vec.fit_transform(news.data)
    print(x[:10, :10].toarray())

    TFIDF = TfidfTransformer()
    x_tfidf = TFIDF.fit_transform(x)
    print(x_tfidf[:10, :10].toarray())

    # train_test_split now lives in sklearn.model_selection
    from sklearn.model_selection import train_test_split
    Xtrain, Xtest, ytrain, ytest = train_test_split(
        x, news.target, test_size=0.3, random_state=233)
    tf_Xtrain, tf_Xtest, tf_ytrain, tf_ytest = train_test_split(
        x_tfidf, news.target, test_size=0.3, random_state=233)

    from sklearn.naive_bayes import MultinomialNB
    mnb = MultinomialNB()
    tf_mnb = MultinomialNB()
    mnb.fit(Xtrain, ytrain)
    tf_mnb.fit(tf_Xtrain, tf_ytrain)
def __init__(self, cfg=None):
    super().__init__()
    self.__dataset__ = fetch_20newsgroups(subset=cfg['subset'],
                                          categories=cfg['categories'],
                                          shuffle=cfg['shuffle'],
                                          random_state=cfg['random_state'])
def get_datasets_20newsgroup(subset='train', categories=None, shuffle=True, random_state=42):
    """
    Retrieve data from 20 newsgroups.

    :param subset: 'train', 'test' or 'all'
    :param categories: list of newsgroup names to keep (None keeps all)
    :param shuffle: whether to shuffle the documents
    :param random_state: seed used to shuffle the dataset
    :return: data and labels of the newsgroups
    """
    datasets = fetch_20newsgroups(subset=subset, categories=categories,
                                  shuffle=shuffle, random_state=random_state)
    return datasets
def main():
    # parameters
    num_features = 400  # vocabulary size

    # load data
    print("loading 20 newsgroups dataset...")
    categories = ['rec.autos', 'rec.sport.hockey', 'comp.graphics', 'sci.space']
    tic = time()
    dataset = fetch_20newsgroups(shuffle=True, random_state=0,
                                 categories=categories,
                                 remove=('headers', 'footers', 'quotes'))
    train_corpus = dataset.data  # list of documents from the selected categories
    train_labels = dataset.target
    toc = time()
    print("elapsed time: %.4f sec" % (toc - tic))

    # tf-idf vectorizer
    tfidf = TfidfVectorizer(max_df=0.5, max_features=num_features,
                            min_df=2, stop_words='english', use_idf=True)
    X_tfidf = tfidf.fit_transform(train_corpus).toarray()

    # append document labels
    train_labels = train_labels.reshape(-1, 1)
    X_all = np.hstack([train_labels, X_tfidf])

    # distribute the data
    sc = SparkContext('local', 'log_reg')
    rdd = sc.parallelize(X_all)
    labeled_corpus = rdd.map(parse_doc)
    train_RDD, test_RDD = labeled_corpus.randomSplit([8, 2], seed=0)

    # distributed logistic regression
    print("training logistic regression...")
    model = LogisticRegressionWithLBFGS.train(train_RDD, regParam=1, regType='l1',
                                              numClasses=len(categories))

    # evaluate the model on test data
    labels_and_preds = test_RDD.map(lambda p: (p.label, model.predict(p.features)))
    test_err = labels_and_preds.filter(lambda vp: vp[0] != vp[1]).count() / float(test_RDD.count())
    print("log-reg test error: ", test_err)

    # model.save(sc, './log_reg_lbfgs_model')
def fetch_data(self, subset='train', categories=None):
    """Fetch the 20 newsgroups data and cache it under the given subset key.

    Arguments:
        subset -> string -- which split to load: 'train' / 'test' / 'all'
    """
    rand = np.random.mtrand.RandomState(8675309)
    data = fetch_20newsgroups(subset=subset,
                              categories=categories,
                              shuffle=True,
                              random_state=rand)
    self.data[subset] = data
def load_newsgroups():
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.datasets import fetch_20newsgroups

    newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
    vectorizer = TfidfVectorizer(max_features=2000, dtype=np.float64, sublinear_tf=True)
    x_sparse = vectorizer.fit_transform(newsgroups.data)
    x = np.asarray(x_sparse.todense())
    y = newsgroups.target
    print('News group data shape ', x.shape)
    print("News group number of clusters: ", np.unique(y).size)
    return x, y
def get_data():
    data = fetch_20newsgroups(subset='all', shuffle=True,
                              remove=('headers', 'footers', 'quotes'))
    return data
def download_articles(name, categories, subset):
    data = {}
    print("Downloading articles")
    newsgroups_data = fetch_20newsgroups(subset=subset, categories=categories, remove=())
    for i in range(len(newsgroups_data['data'])):
        line = newsgroups_data['data'][i]
        data[str(len(data))] = {'text': line,
                                'label': newsgroups_data['target_names'][newsgroups_data['target'][i]]}
    print(len(data))

    raw_data_dir = os.path.join('..', 'data', '20ng', name)
    print("Saving to", raw_data_dir)
    fh.makedirs(raw_data_dir)
    fh.write_to_json(data, os.path.join(raw_data_dir, subset + '.json'))
def test_20news():
    try:
        data = datasets.fetch_20newsgroups(
            subset='all', download_if_missing=False, shuffle=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    # Extract a reduced dataset
    data2cats = datasets.fetch_20newsgroups(
        subset='all', categories=data.target_names[-1:-3:-1], shuffle=False)
    # Check that the ordering of the target_names is the same
    # as the ordering in the full dataset
    assert_equal(data2cats.target_names, data.target_names[-2:])
    # Assert that we have only 0 and 1 as labels
    assert_equal(np.unique(data2cats.target).tolist(), [0, 1])

    # Check that the number of filenames is consistent with data/target
    assert_equal(len(data2cats.filenames), len(data2cats.target))
    assert_equal(len(data2cats.filenames), len(data2cats.data))

    # Check that the first entry of the reduced dataset corresponds to
    # the first entry of the corresponding category in the full dataset
    entry1 = data2cats.data[0]
    category = data2cats.target_names[data2cats.target[0]]
    label = data.target_names.index(category)
    entry2 = data.data[np.where(data.target == label)[0][0]]
    assert_equal(entry1, entry2)
def test_20news_length_consistency():
    """Checks the length consistencies within the bunch

    This is a non-regression test for a bug present in 0.16.1.
    """
    try:
        data = datasets.fetch_20newsgroups(
            subset='all', download_if_missing=False, shuffle=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")
    # Extract the full dataset
    data = datasets.fetch_20newsgroups(subset='all')

    assert_equal(len(data['data']), len(data.data))
    assert_equal(len(data['target']), len(data.target))
    assert_equal(len(data['filenames']), len(data.filenames))
def test_train_model():
    data = fetch_20newsgroups(
        random_state=42,
        categories=['sci.crypt', 'sci.electronics', 'sci.med', 'sci.space'])
    limit = 200
    if limit is not None:
        data['target'] = data['target'][:limit]
        data['data'] = data['data'][:limit]
    n_domains = int(len(data['target']) / 5)
    docs = [
        {
            'html': '\n'.join('<p>{}</p>'.format(t) for t in text.split('\n')),
            'url': 'http://example-{}.com/{}'.format(n % n_domains, n),
            'relevant': {'sci.space': True, 'sci.med': None}.get(
                data['target_names'][target], False),
        }
        for n, (text, target) in enumerate(zip(data['data'], data['target']))]
    result = train_model(docs)
    pprint(attr.asdict(result.meta))
    assert lst_as_dict(result.meta.advice) == [
        {'kind': 'Notice',
         'text': "The quality of the classifier is very good, ROC AUC is 0.96. "
                 "You can label more pages if you want to improve quality, "
                 "but it's better to start crawling "
                 "and check the quality of crawled pages.",
         },
    ]
    assert lst_as_dict(result.meta.description) == [
        {'heading': 'Dataset',
         'text': '200 documents, 159 labeled across 40 domains.'},
        {'heading': 'Class balance',
         'text': '33% relevant, 67% not relevant.'},
        {'heading': 'Metrics', 'text': ''},
        {'heading': 'Accuracy', 'text': '0.881 ± 0.122'},
        {'heading': 'ROC AUC', 'text': '0.964 ± 0.081'}]
    assert len(result.meta.weights['pos']) > 0
    assert len(result.meta.weights['neg']) > 0
    assert isinstance(result.model, BaseModel)
    assert hasattr(result.model, 'predict_proba')
def main(): """ Cluster the newsgroups dataset and measure against labels. In this script, we're doing a grid search against various TFIDF representations of the newsgroups dataset. We want a TFIDF representation that has a good unsupervised representation. We're measuring the quality of that unsupervised representation by how well it matches up to the actual supervised labels of the newsgroups dataset. """ newsgroups = fetch_20newsgroups( subset='train', categories=CATEGORIES, shuffle=True ) print("Loaded data") gridsearch = GridSearchCV( Pipeline([ ('vec', TfidfVectorizer()), ('cluster', ClusteringWithSupervision( cluster_instance=MiniBatchKMeans())) ]), { 'vec__stop_words': (None, 'english') } ) print("Defined pipeline. Beginning fit.") gridsearch.fit(newsgroups.data, newsgroups.target) print_best_worst(gridsearch.cv_results_) best_estimator = gridsearch.best_estimator_ predicted = best_estimator.predict(newsgroups.data) print( classification_report( newsgroups.target, predicted, target_names=newsgroups.target_names))
def load_20news(setName):
    newsgroups_subset = fetch_20newsgroups(subset=setName,
                                           remove=('headers', 'footers'))  # , 'quotes'

    totalLineNum = 0
    readDocNum = 0
    print("Loading 20 newsgroup %s data..." % setName)

    setDocNum = len(newsgroups_subset.data)
    orig_docs_name = []
    orig_docs_cat = []
    orig_docs_words = []

    catNum = len(newsgroups_subset.target_names)
    cats_docsWords = [[] for i in range(catNum)]
    cats_docNames = [[] for i in range(catNum)]

    emptyFileNum = 0

    for d, text in enumerate(newsgroups_subset.data):
        if d % 50 == 49 or d == setDocNum - 1:
            print("\r%d %d\r" % (d + 1, totalLineNum), end='')

        lines = text.split("\n")
        if len(text) == 0 or len(lines) == 0:
            emptyFileNum += 1
            continue

        readDocNum += 1
        totalLineNum += len(lines)

        catID = newsgroups_subset.target[d]
        category = newsgroups_subset.target_names[catID]

        text = " ".join(lines)
        wordsInSentences, wc = extractSentenceWords(text)
        filename = newsgroups_subset.filenames[d]
        filename = os.path.basename(filename)
        orig_docs_words.append(wordsInSentences)
        orig_docs_name.append(filename)
        orig_docs_cat.append(catID)
        cats_docsWords[catID].append(wordsInSentences)
        cats_docNames[catID].append(filename)

    print("Done. %d docs read, %d empty docs skipped. Totally %d lines" %
          (readDocNum, emptyFileNum, totalLineNum))
    return setDocNum, orig_docs_words, orig_docs_name, orig_docs_cat, \
        cats_docsWords, cats_docNames, newsgroups_subset.target_names
def initializeData(data):
    # graphics_train = fetch_20newsgroups(subset=dataSet,
    #     categories=categories, shuffle=True, random_state=42)
    wnl = WordNetLemmatizer()
    stop_words = text.ENGLISH_STOP_WORDS

    # List of dicts, each element represents the word-to-count mapping for one document
    termDictList = []
    # Dictionary storing, for each term, the number of documents that contain it
    termDocCountDict = {}
    # set of terms
    termSet = set()
    # list of ints, each element is the total number of terms in one tokenized document
    termCountList = []

    # get document frequency for each term
    for i in range(len(data)):
        document = data[i].lower()
        words = set(word_tokenize(document))
        for word in words:
            if word.isalpha():
                term = wnl.lemmatize(word)
                if term not in stop_words:
                    if term not in termDocCountDict:
                        termDocCountDict[term] = 0
                    termDocCountDict[term] += 1

    # get termDict and termSet
    for i in range(len(data)):
        termDict = {}
        termCount = 0
        document = data[i].lower()
        words = word_tokenize(document)
        for word in words:
            if word.isalpha():
                term = wnl.lemmatize(word)
                if term not in stop_words:
                    if term in termDocCountDict:
                        if termDocCountDict[term] >= 110 and termDocCountDict[term] <= 11000:
                            termSet.add(term)
                            termCount += 1
                            # fill in termDict
                            if term not in termDict:
                                termDict[term] = 0
                            termDict[term] += 1
                        else:
                            del termDocCountDict[term]
        termDictList.append(termDict)
        termCountList.append(termCount)

    return (termDictList, termCountList, termDocCountDict, termSet)
def main(): """ Train a classifier on the 20 newsgroups dataset. The purpose of this is mostly trying to figure out how to turn text into really good vector representations for classification... which are also hopefully good vector representations for unsupervised learning too. """ # We don't really use our interfaces for iterating over datasets... # but maybe we will in the future. train = fetch_20newsgroups( subset='train', # categories=CATEGORIES, shuffle=True ) test = fetch_20newsgroups( subset='test', # categories=CATEGORIES, shuffle=True ) print("Loaded data.", len(set(train.target)), "classes.") glove_vectors = glove_simple() print("Loaded word vectors") pipeline = Pipeline([ # ('vec', TfidfVectorizer()), ('vec', WordVectorSum(vector_dict=glove_vectors)), # ('svd', TruncatedSVD()), ('fit', SGDClassifier()) ]) print("Defined pipeline. Beginning fit.") gridsearch = GridSearchCV( pipeline, { # 'vec__stop_words': ('english',), # 'svd__n_components': (2, 100, 500, 1000), # 'vec__min_df': (1, 0.01, 0.1, 0.4), # 'vec__max_df': (0.5, 0.75, 0.9, 1.0), # 'vec__max_features': (100, 1000, 10000) } ) gridsearch.fit(train.data, train.target) print("Completed fit. Beginning prediction") predicted = gridsearch.predict(test.data) print("Completed prediction.") accuracy = np.mean(predicted == test.target) print("Accuracy was", accuracy) print("Best params", gridsearch.best_params_) print_best_worst(gridsearch.cv_results_) print( classification_report( test.target, predicted, target_names=test.target_names))