The following 10 code examples, extracted from open source Python projects, show how to use sklearn.datasets.get_data_home().
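Before the project examples, a minimal standalone sketch of the API itself may help; the /tmp/sklearn_cache path below is only an illustrative choice. get_data_home() resolves the scikit-learn data cache directory (by default ~/scikit_learn_data, or the SCIKIT_LEARN_DATA environment variable if set), creating it if it does not exist, and clear_data_home() deletes that directory together with any downloaded datasets.

from sklearn.datasets import get_data_home, clear_data_home

# Resolve (and create, if missing) the default data cache directory.
default_home = get_data_home()
print(default_home)  # e.g. /home/<user>/scikit_learn_data

# An explicit location can also be passed; it is created if absent.
custom_home = get_data_home(data_home="/tmp/sklearn_cache")
print(custom_home)

# Remove the custom cache directory and everything in it.
clear_data_home(data_home=custom_home)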
def split(p):
    output = os.path.join(get_data_home(), "kddcup.parq")
    if not os.path.exists(output):
        dtype = {
            1: 'category',
            2: 'category',
            3: 'category',
            41: 'category',
        }
        df = pd.read_csv(p, header=None, dtype=dtype)
        cat_cols = df.select_dtypes(include=['category']).columns
        df[cat_cols] = df[cat_cols].apply(lambda col: col.cat.codes)
        df.columns = list(string.ascii_letters[:len(df.columns)])
        ddf = dd.from_pandas(df, npartitions=16)
        ddf.to_parquet(output)
    return output
def fetch_load_letters(data_dir=None):
    path = os.path.join(get_data_home(data_dir), 'letter-recognition.data')
    if not os.path.exists(path):
        from urllib import request
        url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data'
        print('Downloading letter-recognition dataset from {}...'.format(url))
        request.urlretrieve(url=url, filename=path)
    else:
        print('Found letter-recognition in {}!'.format(path))

    X, y = [], []
    with open(path) as f:
        reader = csv.reader(f)
        for row in reader:
            y.append(row[0])
            X.append(row[1:])

    labels, label_idx = np.unique(y, return_inverse=True)
    return np.asarray(X, dtype=float), label_idx
def download():
    p = os.path.join(get_data_home(), "kddcup.data.gz")
    if os.path.exists(p):
        return p
    r = requests.get(URL, stream=True)
    with open(p, "wb") as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
    return p
def test_data_home():
    # get_data_home will point to a pre-existing folder
    data_home = get_data_home(data_home=DATA_HOME)
    assert_equal(data_home, DATA_HOME)
    assert_true(os.path.exists(data_home))

    # clear_data_home will delete both the content and the folder itself
    clear_data_home(data_home=data_home)
    assert_false(os.path.exists(data_home))

    # if the folder is missing it will be created again
    data_home = get_data_home(data_home=DATA_HOME)
    assert_true(os.path.exists(data_home))
def setup_module(module):
    data_home = get_data_home()
    if not exists(join(data_home, '20news_home')):
        raise SkipTest("Skipping dataset loading doctests")
def setup_module(module):
    data_home = get_data_home()
    if not exists(join(data_home, 'lfw_home')):
        raise SkipTest("Skipping dataset loading doctests")
def setup_module():
    check_skip_network()

    # skip the test in rcv1.rst if the dataset is not already loaded
    rcv1_dir = os.path.join(get_data_home(), "RCV1")
    if not os.path.exists(rcv1_dir):
        raise SkipTest("Download RCV1 dataset to run this test.")
def stream_reuters_documents(data_path=None):
    """Iterate over documents of the Reuters dataset.

    The Reuters archive will automatically be downloaded and uncompressed if
    the `data_path` directory does not exist.

    Documents are represented as dictionaries with 'body' (str),
    'title' (str), 'topics' (list(str)) keys.

    """
    DOWNLOAD_URL = ('http://archive.ics.uci.edu/ml/machine-learning-databases/'
                    'reuters21578-mld/reuters21578.tar.gz')
    ARCHIVE_FILENAME = 'reuters21578.tar.gz'

    if data_path is None:
        data_path = os.path.join(get_data_home(), "reuters")
    if not os.path.exists(data_path):
        """Download the dataset."""
        print("downloading dataset (once and for all) into %s" % data_path)
        os.mkdir(data_path)

        def progress(blocknum, bs, size):
            total_sz_mb = '%.2f MB' % (size / 1e6)
            current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
            if _not_in_sphinx():
                print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb),
                      end='')

        archive_path = os.path.join(data_path, ARCHIVE_FILENAME)
        urllib.request.urlretrieve(DOWNLOAD_URL, filename=archive_path,
                                   reporthook=progress)
        if _not_in_sphinx():
            print('\r', end='')
        print("untarring Reuters dataset...")
        tarfile.open(archive_path, 'r:gz').extractall(data_path)
        print("done.")

    parser = ReutersParser()
    for filename in glob(os.path.join(data_path, "*.sgm")):
        for doc in parser.parse(open(filename, 'rb')):
            yield doc


###############################################################################
# Main
# ----
#
# Create the vectorizer and limit the number of features to a reasonable
# maximum
def fetch_load_isolet(data_dir=None):
    train = 'isolet1+2+3+4.data.Z'
    test = 'isolet5.data.Z'
    path_train = os.path.join(get_data_home(data_dir), train)
    path_test = os.path.join(get_data_home(data_dir), test)

    if not os.path.exists(path_train[:-2]) or not os.path.exists(path_test[:-2]):
        from urllib import request
        url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/isolet/'
        if not os.path.exists(path_train[:-2]):
            if not os.path.exists(path_train):
                print('Downloading Isolated Letter Speech Recognition data set from {}...'.format(url))
                request.urlretrieve(url=url + train, filename=path_train)
            # os.system('gzip -d ' + path_train)
            decompress_z(path_train)
        if not os.path.exists(path_test[:-2]):
            if not os.path.exists(path_test):
                print('Downloading Isolated Letter Speech Recognition data set from {}...'.format(url))
                request.urlretrieve(url=url + test, filename=path_test)
            # os.system('gzip -d ' + path_test)
            decompress_z(path_test)
    else:
        print('Found Isolated Letter Speech Recognition data set!')

    xtr, ytr = [], []
    with open(path_train[:-2]) as f:
        reader = csv.reader(f)
        for row in reader:
            xtr.append(row[:-1])
            ytr.append(int(float(row[-1])))
    labels, ytr = np.unique(ytr, return_inverse=True)

    xte, yte = [], []
    with open(path_test[:-2]) as f:
        reader = csv.reader(f)
        for row in reader:
            xte.append(row[:-1])
            yte.append(int(float(row[-1])))
    labels, yte = np.unique(yte, return_inverse=True)

    return np.asarray(xtr, dtype=float), np.asarray(xte, dtype=float), ytr, yte
def stream_reuters_documents(data_path=None):
    """Iterate over documents of the Reuters dataset.

    The Reuters archive will automatically be downloaded and uncompressed if
    the `data_path` directory does not exist.

    Documents are represented as dictionaries with 'body' (str),
    'title' (str), 'topics' (list(str)) keys.

    """
    DOWNLOAD_URL = ('http://archive.ics.uci.edu/ml/machine-learning-databases/'
                    'reuters21578-mld/reuters21578.tar.gz')
    ARCHIVE_FILENAME = 'reuters21578.tar.gz'

    if data_path is None:
        data_path = os.path.join(get_data_home(), "reuters")
    if not os.path.exists(data_path):
        """Download the dataset."""
        print("downloading dataset (once and for all) into %s" % data_path)
        os.mkdir(data_path)

        def progress(blocknum, bs, size):
            total_sz_mb = '%.2f MB' % (size / 1e6)
            current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
            if _not_in_sphinx():
                print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb),
                      end='')

        archive_path = os.path.join(data_path, ARCHIVE_FILENAME)
        urllib.request.urlretrieve(DOWNLOAD_URL, filename=archive_path,
                                   reporthook=progress)
        if _not_in_sphinx():
            print('\r', end='')
        print("untarring Reuters dataset...")
        tarfile.open(archive_path, 'r:gz').extractall(data_path)
        print("done.")

    parser = ReutersParser()
    for filename in glob(os.path.join(data_path, "*.sgm")):
        for doc in parser.parse(open(filename, 'rb')):
            yield doc


###############################################################################
# Main
###############################################################################

# Create the vectorizer and limit the number of features to a reasonable
# maximum