The following 12 code examples, extracted from open-source Python projects, illustrate how to use sklearn.datasets.load_files().
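For reference, a minimal sketch of calling load_files directly (the 'corpus/' path is hypothetical). load_files reads one subfolder per category and returns a Bunch whose data, target, target_names, and filenames fields the examples below rely on:

    from sklearn.datasets import load_files

    # hypothetical corpus laid out as corpus/<category>/<file>.txt
    bunch = load_files('corpus/', encoding='utf-8', decode_error='ignore')
    print(bunch.target_names)   # category names, taken from the subfolder names
    print(len(bunch.data))      # one decoded string per file
    print(bunch.target[:5])     # integer label per file, indexing into target_names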
from sklearn import datasets


def load_files(path, encoding='gbk'):
    """
    :param path: root directory with a structure such as the following:
        container_folder/
            category_1_folder/
                file_1.txt
                file_2.txt
                ...
                file_42.txt
            category_2_folder/
                file_43.txt
                file_44.txt
    :param encoding: text encoding used to decode the files
    :return: Bunch object
    """
    return datasets.load_files(path, encoding=encoding,
                               decode_error='ignore', shuffle=False)
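For instance, a usage sketch of this wrapper (the 'reviews/' path is hypothetical). Because the wrapper passes shuffle=False, files come back in deterministic sorted order, which keeps the filename/label pairing stable across runs:

    bunch = load_files('reviews/')
    for path, label in zip(bunch.filenames, bunch.target):
        print(path, bunch.target_names[label])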
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer


def getDatas(dataset_dir_name):
    movie_reviews = load_files(dataset_dir_name)
    doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = \
        train_test_split(movie_reviews.data, movie_reviews.target,
                         test_size=0.2, random_state=0)
    # binary bag-of-words; byte sequences that fail to decode are ignored
    vectorizer = CountVectorizer(binary=True, decode_error=u'ignore')
    word_tokenizer = vectorizer.build_tokenizer()
    # segment each document into a list of Chinese terms (getChList is defined elsewhere)
    doc_terms_list_train = list(getChList(doc_str) for doc_str in doc_str_list_train)
    doc_terms_list_test = list(getChList(doc_str) for doc_str in doc_str_list_test)
    return (vectorizer, doc_str_list_train, doc_str_list_test,
            doc_class_list_train, doc_class_list_test, doc_terms_list_train)
from sklearn.datasets import load_files


def get_datasets_localdata(container_path=None, categories=None, load_content=True,
                           encoding='utf-8', shuffle=True, random_state=42):
    """
    Load text files with categories as subfolder names.
    Individual samples are assumed to be files stored in a two-level folder structure.
    :param container_path: the path of the container
    :param categories: list of classes to choose; all classes are chosen by default (if empty or omitted)
    :param shuffle: whether to shuffle the list
    :param random_state: integer seed used to shuffle the dataset
    :return: data and labels of the dataset
    """
    datasets = load_files(container_path=container_path, categories=categories,
                          load_content=load_content, shuffle=shuffle,
                          encoding=encoding, random_state=random_state)
    return datasets
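A possible call site for get_datasets_localdata, with an assumed corpus path:

    datasets = get_datasets_localdata(container_path='data/text_corpus', encoding='utf-8')
    x_text, y = datasets.data, datasets.target
    print(len(x_text), 'documents across', len(datasets.target_names), 'classes')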
def __init__(self, cfg=None):
    """
    Load text files with categories as subfolder names.
    Individual samples are assumed to be files stored in a two-level folder structure.
    :param cfg: dict with the keys container_path (the path of the container),
        categories (list of classes to choose; all classes by default),
        load_content, shuffle, encoding, and random_state (shuffle seed)
    """
    super().__init__()
    self.__dataset__ = load_files(container_path=cfg['container_path'],
                                  categories=cfg['categories'],
                                  load_content=cfg['load_content'],
                                  shuffle=cfg['shuffle'],
                                  encoding=cfg['encoding'],
                                  random_state=cfg['random_state'])
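The constructor only shows which keys it reads from cfg; a sketch of such a dict follows (values are illustrative, and TextFolderDataset is a hypothetical name for the enclosing class, which the excerpt does not show):

    cfg = {
        'container_path': 'data/text_corpus',  # hypothetical corpus location
        'categories': None,                    # None selects every subfolder as a class
        'load_content': True,
        'shuffle': True,
        'encoding': 'utf-8',
        'random_state': 42,
    }
    dataset = TextFolderDataset(cfg)  # hypothetical class name; only __init__ is shown above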
def test_default_empty_load_files():
    res = load_files(LOAD_FILES_ROOT)
    assert_equal(len(res.filenames), 0)
    assert_equal(len(res.target_names), 0)
    assert_equal(res.DESCR, None)
def test_default_load_files():
    res = load_files(LOAD_FILES_ROOT)
    assert_equal(len(res.filenames), 1)
    assert_equal(len(res.target_names), 2)
    assert_equal(res.DESCR, None)
    assert_equal(res.data, [b("Hello World!\n")])
def test_load_files_w_categories_desc_and_encoding():
    category = os.path.abspath(TEST_CATEGORY_DIR1).split('/').pop()
    res = load_files(LOAD_FILES_ROOT, description="test",
                     categories=category, encoding="utf-8")
    assert_equal(len(res.filenames), 1)
    assert_equal(len(res.target_names), 1)
    assert_equal(res.DESCR, "test")
    assert_equal(res.data, [u("Hello World!\n")])
def test_load_files_wo_load_content():
    res = load_files(LOAD_FILES_ROOT, load_content=False)
    assert_equal(len(res.filenames), 1)
    assert_equal(len(res.target_names), 2)
    assert_equal(res.DESCR, None)
    assert_equal(res.get('data'), None)
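The four tests above depend on module-level fixtures (LOAD_FILES_ROOT, TEST_CATEGORY_DIR1/2) and helpers (assert_equal and the b/u literal helpers) that the excerpts omit. A sketch of a setup under which the last three tests pass, assuming the empty-root test runs against the bare root before the category folders and sample file are created:

    import tempfile

    # root folder with two category subfolders and exactly one sample file
    LOAD_FILES_ROOT = tempfile.mkdtemp()
    TEST_CATEGORY_DIR1 = tempfile.mkdtemp(dir=LOAD_FILES_ROOT)
    TEST_CATEGORY_DIR2 = tempfile.mkdtemp(dir=LOAD_FILES_ROOT)
    with tempfile.NamedTemporaryFile(dir=TEST_CATEGORY_DIR1, delete=False) as sample_file:
        sample_file.write(b"Hello World!\n")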
import os
import time

import numpy as np
import tensorflow as tf
from six.moves import xrange
from sklearn.datasets import load_files

import facenet


def main(args):
    with tf.Graph().as_default():
        with tf.Session() as sess:
            # create output directory if it doesn't exist
            output_dir = os.path.expanduser(args.output_dir)
            if not os.path.isdir(output_dir):
                os.makedirs(output_dir)

            # load the model
            print("Loading trained model...\n")
            meta_file, ckpt_file = facenet.get_model_filenames(
                os.path.expanduser(args.trained_model_dir))
            facenet.load_model(args.trained_model_dir, meta_file, ckpt_file)

            # grab all image paths and labels
            print("Finding image paths and targets...\n")
            data = load_files(args.data_dir, load_content=False, shuffle=False)
            labels_array = data['target']
            paths = data['filenames']

            # get input and output tensors
            images_placeholder = tf.get_default_graph().get_tensor_by_name("input:0")
            embeddings = tf.get_default_graph().get_tensor_by_name("embeddings:0")
            phase_train_placeholder = tf.get_default_graph().get_tensor_by_name("phase_train:0")

            image_size = images_placeholder.get_shape()[1]
            embedding_size = embeddings.get_shape()[1]

            # run forward pass to calculate embeddings
            print('Generating embeddings from images...\n')
            start_time = time.time()
            batch_size = args.batch_size
            nrof_images = len(paths)
            nrof_batches = int(np.ceil(1.0 * nrof_images / batch_size))
            emb_array = np.zeros((nrof_images, embedding_size))
            for i in xrange(nrof_batches):
                start_index = i * batch_size
                end_index = min((i + 1) * batch_size, nrof_images)
                paths_batch = paths[start_index:end_index]
                images = facenet.load_data(paths_batch, do_random_crop=False,
                                           do_random_flip=False,
                                           image_size=image_size, do_prewhiten=True)
                feed_dict = {images_placeholder: images,
                             phase_train_placeholder: False}
                emb_array[start_index:end_index, :] = sess.run(embeddings, feed_dict=feed_dict)

            time_avg_forward_pass = (time.time() - start_time) / float(nrof_images)
            print("Forward pass took avg of %.3f[seconds/image] for %d images\n" %
                  (time_avg_forward_pass, nrof_images))

            print("Finally saving embeddings and gallery to: %s" % output_dir)
            # save the gallery and embeddings (signatures) as numpy arrays to disk
            np.save(os.path.join(output_dir, "gallery.npy"), labels_array)
            np.save(os.path.join(output_dir, "signatures.npy"), emb_array)