我们从 Python 开源项目中提取了以下 50 个代码示例，用于说明如何使用 pandas.MultiIndex()。
def _set_display_options(self, dataframe, display_schema):
    """
    Replaces the dimension options with those that the user has specified manually
    e.g. change 'm' to 'mobile'

    :param dataframe: The data frame whose index values should be relabelled.
        It is copied; the argument itself is not modified.
    :param display_schema: A dict with a 'dimensions' mapping of dimension key
        to dimension settings. Only dimensions that carry a 'display_options'
        mapping (raw value -> display value) are relabelled.
    :return: A copy of *dataframe* with the display values in its index.
    """
    dataframe = dataframe.copy()

    for key, dimension in display_schema['dimensions'].items():
        if 'display_options' not in dimension:
            continue

        display_options = dimension['display_options']
        index = dataframe.index

        # Looking the level up validates `key`; an empty level has nothing to relabel.
        if not len(index.get_level_values(key)):
            continue

        if isinstance(index, pd.MultiIndex):
            names = list(index.names)
            position = names.index(key) if key in names else key

            # Translate the level values in *level order*. Using the order in
            # which values appear in the rows would pair display values with
            # the wrong level entries whenever the rows are not sorted.
            display_values = [display_options.get(value, value)
                              for value in index.levels[position]]
            dataframe.index = index.set_levels(display_values, level=key)

        else:
            # Translate every row label so duplicated labels keep their rows.
            dataframe.index = pd.Index([display_options.get(value, value) for value in index],
                                       name=index.name)

    return dataframe
def test_three_iterations_no_metadata(self):
    """Check `_compute_summary` for two depths with three iterations each and no counts."""
    sample_ids = ['S1', 'S2', 'S3']
    depth_iter = pd.MultiIndex.from_product([[1, 200], [1, 2, 3]],
                                            names=['depth', 'iter'])
    data = pd.DataFrame(data=[[1, 2, 3, 4, 5, 6] for _ in sample_ids],
                        columns=depth_iter, index=sample_ids)

    # No counts provided because no metadata
    obs = _compute_summary(data, 'sample-id')

    # Every sample holds the same values, so the expected rows only differ
    # in the sample id.
    shallow = [1, 1, 1., 1.04, 1.18, 1.5, 2., 2.5, 2.82, 2.96, 3.]
    deep = [200, 1, 4., 4.04, 4.18, 4.5, 5., 5.5, 5.82, 5.96, 6.]
    expected_rows = [[sample_id] + stats
                     for sample_id in sample_ids
                     for stats in (shallow, deep)]
    exp = pd.DataFrame(data=expected_rows,
                       columns=['sample-id', 'depth', 'count', 'min', '2%',
                                '9%', '25%', '50%', '75%', '91%', '98%',
                                'max'])
    pdt.assert_frame_equal(exp, obs)
def test_two_iterations_with_metadata_were_values_are_identical(self):
    """Check `_compute_summary` for one 'pet' group when a counts frame is passed."""
    depth_iter = pd.MultiIndex.from_product([[1, 200], [1, 2]],
                                            names=['depth', 'iter'])
    group = ['milo']
    data = pd.DataFrame(data=[[3, 6, 9, 9]], columns=depth_iter, index=group)
    counts = pd.DataFrame(data=[[3, 3, 3, 3]], columns=depth_iter, index=group)

    obs = _compute_summary(data, 'pet', counts=counts)

    expected_rows = [
        ['milo', 1, 3., 3.06, 3.27, 3.75, 4.5, 5.25, 5.73, 5.94, 6., 3],
        # Both iterations at depth 200 are 9, so every statistic is 9.
        ['milo', 200] + [9.] * 9 + [3],
    ]
    exp = pd.DataFrame(data=expected_rows,
                       columns=['pet', 'depth', 'min', '2%', '9%', '25%',
                                '50%', '75%', '91%', '98%', 'max', 'count'])
    pdt.assert_frame_equal(exp, obs)
def test_some_duplicates_in_category(self):
    """Check `_reindex_with_metadata` when two of three samples share a 'pet' value."""
    names = ['depth', 'iter']
    columns = pd.MultiIndex.from_tuples(
        [(1, 1), (1, 2), (200, 1), (200, 2), ('pet', '')], names=names)
    data = pd.DataFrame(data=[[1, 2, 3, 4, 'russ'],
                              [5, 6, 7, 8, 'milo'],
                              [9, 10, 11, 12, 'russ']],
                        columns=columns, index=['S1', 'S2', 'S3'])

    obs = _reindex_with_metadata('pet', ['pet'], data)

    # The expected columns keep 'pet' and '' as (unused) level entries.
    exp_col = pd.MultiIndex(levels=[[1, 200, 'pet'], [1, 2, '']],
                            labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
                            names=names)
    exp_ind = pd.Index(['milo', 'russ'], name='pet')

    expected_values = pd.DataFrame(data=[[5, 6, 7, 8], [5, 6, 7, 8]],
                                   columns=exp_col, index=exp_ind)
    pdt.assert_frame_equal(expected_values, obs[0])

    expected_counts = pd.DataFrame(data=[[1] * 4, [2] * 4],
                                   columns=exp_col, index=exp_ind)
    pdt.assert_frame_equal(expected_counts, obs[1])
def test_all_identical(self):
    """Check `_reindex_with_metadata` when every sample has the same 'pet' value."""
    names = ['depth', 'iter']
    columns = pd.MultiIndex.from_tuples(
        [(1, 1), (1, 2), (200, 1), (200, 2), ('pet', '')], names=names)
    data = pd.DataFrame(data=[[1, 2, 3, 4, 'russ'],
                              [5, 6, 7, 8, 'russ'],
                              [9, 10, 11, 12, 'russ']],
                        columns=columns, index=['S1', 'S2', 'S3'])

    obs = _reindex_with_metadata('pet', ['pet'], data)

    # The expected columns keep 'pet' and '' as (unused) level entries.
    exp_col = pd.MultiIndex(levels=[[1, 200, 'pet'], [1, 2, '']],
                            labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
                            names=names)
    exp_ind = pd.Index(['russ'], name='pet')

    expected_values = pd.DataFrame(data=[[5, 6, 7, 8]],
                                   columns=exp_col, index=exp_ind)
    pdt.assert_frame_equal(expected_values, obs[0])

    expected_counts = pd.DataFrame(data=[[3] * 4],
                                   columns=exp_col, index=exp_ind)
    pdt.assert_frame_equal(expected_counts, obs[1])
def write_data(self, result_dict):
    """Store every entry of *result_dict* in ``self.hdf_store``.

    Each value must be a pandas DataFrame or Series without missing values.

    :param result_dict: Mapping of store key to DataFrame/Series.
    :raises ValueError: If a value has an unsupported type or contains NaN.
    """
    for name, payload in six.iteritems(result_dict):
        if isinstance(payload, pd.DataFrame):
            contains_nan = payload.isnull().any().any()
        elif isinstance(payload, pd.Series):
            contains_nan = payload.isnull().any()
        else:
            raise ValueError("PandasHDFDataHandler doesn't support type "
                             "{} (in key {})".format(type(payload), name))

        if contains_nan:
            raise ValueError("data {} have nan".format(name))

        timer_label = "Writing generated data {} to hdf5 file".format(name)
        with SimpleTimer(timer_label, end_in_new_line=False):
            doubly_multi_indexed = (isinstance(payload, pd.DataFrame) and
                                    isinstance(payload.index, pd.MultiIndex) and
                                    isinstance(payload.columns, pd.MultiIndex))
            if doubly_multi_indexed:
                # Frames with a MultiIndex on both axes use the store's default format.
                self.hdf_store.put(name, payload)
            else:
                self.hdf_store.put(name, payload, format='table')

    self.hdf_store.flush(fsync=True)
def merge_interictal_preictal(interictal, preictal):
    """
    Merges the *interictal* and *preictal* data frames to a single data frame. Also sorts the multilevel index.

    Both input frames are sorted in place before they are concatenated.

    :param interictal: A data frame containing the interictal samples.
    :param preictal: A data frame containing the preictal samples.
    :return: A data frame containing both interictal and preictal data. The multilevel index of the data frame
             is sorted.
    """
    logging.info("Merging interictal and preictal datasets")
    try:
        # `sort_index(level=...)` replaces `DataFrame.sortlevel`, which newer
        # pandas releases no longer provide.
        preictal.sort_index(level='segment', inplace=True)
        if isinstance(preictal.columns, pd.MultiIndex):
            preictal.sort_index(axis=1, level=0, inplace=True)

        interictal.sort_index(level='segment', inplace=True)
        if isinstance(interictal.columns, pd.MultiIndex):
            interictal.sort_index(axis=1, level=0, inplace=True)
    except TypeError:
        # Best effort: a failed pre-sort is logged and the merge continues.
        logging.warning("TypeError when trying to merge interictal and preictal sets.")

    dataset = pd.concat((interictal, preictal))
    dataset.sort_index(level='segment', inplace=True)
    return dataset
def test_k_fold_segment_split():
    """
    Test function for the k-fold segment split.

    Two cross validators built with the same random state must yield
    identical training and test folds.
    """
    interictal_classes = np.zeros(120)
    preictal_classes = np.ones(120)
    classes = np.concatenate((interictal_classes, preictal_classes,))
    segments = np.arange(12)
    i = np.arange(240)

    index = pd.MultiIndex.from_product([segments, np.arange(20)], names=('segment', 'start_sample'))

    dataframe = pd.DataFrame({'Preictal': classes, 'i': i}, index=index)

    # With a 6-fold cross validator, we expect each held-out fold to contain exactly 2 segments, one from each class
    cv1 = SegmentCrossValidator(dataframe, n_folds=6, shuffle=True, random_state=42)
    cv2 = SegmentCrossValidator(dataframe, n_folds=6, shuffle=True, random_state=42)

    for (training_fold1, test_fold1), (training_fold2, test_fold2) in zip(cv1, cv2):
        # Compare the folds of the two validators with each other; comparing
        # a fold with itself would always pass.
        assert np.all(training_fold1 == training_fold2) and np.all(test_fold1 == test_fold2)
def load_preictal_dataframes(feature_folder, sliding_frames=False, **kwargs):
    """
    Convenience function for loading preictal dataframes. Sets the 'Preictal' column to 1.

    :param feature_folder: The folder to load the feature data from.
    :param sliding_frames: If True, the data frame will be extended using sliding frames over the feature windows.
    :param kwargs: keyword arguments to use for loading the features.
    :return: A DataFrame of preictal data with a 'Preictal' column set to 1.
    """
    preictal = load_feature_files(feature_folder,
                                  class_name="preictal",
                                  sliding_frames=sliding_frames,
                                  **kwargs)
    preictal['Preictal'] = 1

    # `sort_index(level=...)` replaces `DataFrame.sortlevel`, which newer
    # pandas releases no longer provide.
    preictal.sort_index(level='segment', inplace=True)
    if isinstance(preictal.columns, pd.MultiIndex):
        preictal.sort_index(axis=1, level=0, inplace=True)
    return preictal
def load_interictal_dataframes(feature_folder, sliding_frames=False, **kwargs):
    """
    Convenience function for loading interictal dataframes. Sets the 'Preictal' column to 0.

    :param feature_folder: The folder to load the feature data from.
    :param sliding_frames: If True, the data frame will be extended using sliding frames over the feature windows.
    :param kwargs: keyword arguments to use for loading the features.
    :return: A DataFrame of interictal data with a 'Preictal' column set to 0.
    """
    # Load the *interictal* class; requesting "preictal" here would label
    # preictal samples with Preictal == 0.
    interictal = load_feature_files(feature_folder,
                                    class_name="interictal",
                                    sliding_frames=sliding_frames,
                                    **kwargs)
    interictal['Preictal'] = 0

    # `sort_index(level=...)` replaces `DataFrame.sortlevel`, which newer
    # pandas releases no longer provide.
    interictal.sort_index(level='segment', inplace=True)
    if isinstance(interictal.columns, pd.MultiIndex):
        interictal.sort_index(axis=1, level=0, inplace=True)
    return interictal
def create_sliding_frames(dataframe, frame_length=12):
    """
    Wrapper for the extend_data_with_sliding_frames function which works with numpy arrays.
    This version does the data-frame conversion for us.

    :param dataframe: The dataframe to extend.
    :param frame_length: The frame length to use in the resulting extended data frame.
    :return: A new data frame where the original dataframe has been extended with sliding frames.
    """
    # Forward the requested frame length so the extended array matches the
    # column index built below.
    # NOTE(review): assumes the helper takes the frame length as its second
    # positional argument - confirm against its signature.
    extended_array = extend_data_with_sliding_frames(dataframe.values, frame_length)

    # We should preserve the columns of the dataframe, otherwise
    # concatenating different dataframes along the row-axis will give
    # wrong results
    window_columns = dataframe.columns
    column_index = pd.MultiIndex.from_product([range(frame_length), window_columns],
                                              names=['window', 'feature'])
    return pd.DataFrame(data=extended_array, columns=column_index)
def testTwoDimensionalCumulativeDistribution(self):
    """Check CumulativeDistribution of X over the (Y, Z) pairs with unit weights."""
    frame = pd.DataFrame({"X": [1, 1, 1, 2, 2, 3, 4],
                          "Y": [1, 2, 0, 1, 1, 1, 1],
                          "Z": [1, 0, 0, 0, 0, 0, 0]})
    weights = np.array([1] * 7)

    metric = metrics.CumulativeDistribution("X", ["Y", "Z"])
    output = metric(frame, weights)

    expected_index = pd.MultiIndex(levels=[[0, 1, 2], [0, 1]],
                                   labels=[[0, 1, 1, 2], [0, 0, 1, 0]],
                                   names=["Y", "Z"])
    correct = pd.DataFrame(np.array([1 / 14., 12 / 14., 13 / 14., 1.]),
                           columns=[""],
                           index=expected_index)

    # Index, columns and values (within 1e-10) must all agree.
    self.assertTrue(
        all(output.index == correct.index) and
        all(output.columns == correct.columns) and
        all(abs(output.values - correct.values) < 1e-10))
def testShuffledTwoDimensionalCumulativeDistribution(self):
    """Check CumulativeDistribution of X over (Y, Z) when the rows are randomly permuted."""
    frame = pd.DataFrame({"X": [1, 1, 1, 2, 2, 3, 4],
                          "Y": [1, 2, 0, 1, 1, 1, 1],
                          "Z": [1, 0, 0, 0, 0, 0, 0]})
    weights = np.array([1] * 7)

    metric = metrics.CumulativeDistribution("X", ["Y", "Z"])
    shuffled = frame.iloc[np.random.permutation(7)]
    output = metric(shuffled, weights)

    expected_index = pd.MultiIndex(levels=[[0, 1, 2], [0, 1]],
                                   labels=[[0, 1, 1, 2], [0, 0, 1, 0]],
                                   names=["Y", "Z"])
    correct = pd.DataFrame(np.array([1 / 14., 12 / 14., 13 / 14., 1.]),
                           columns=[""],
                           index=expected_index)

    # Index, columns and values (within 1e-10) must all agree.
    self.assertTrue(
        all(output.index == correct.index) and
        all(output.columns == correct.columns) and
        all(abs(output.values - correct.values) < 1e-10))
def testRelativeToSplitJackknife(self):
    """Check Sum(X) split by Y, relative to Z == 0, with jackknife standard errors."""
    data = pd.DataFrame(
        {"X": [1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8],
         "Y": [1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3, 3],
         "Z": [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]})

    metric = metrics.Sum("X")
    comparison = comparisons.AbsoluteDifference("Z", 0)
    se_method = standard_errors.Jackknife()

    analysis = core.Analyze(data).split_by("Y").relative_to(comparison)
    output = analysis.with_standard_errors(se_method).calculate(metric).run()

    rowindex = pd.MultiIndex(levels=[[1, 2, 3], [1]],
                             labels=[[0, 1, 2], [0, 0, 0]],
                             names=["Y", "Z"])
    expected_values = np.array(
        [[-3.0, np.sqrt(5 * np.var([0, -1, -2, -3, -4, -5]))],
         [-3.0, np.sqrt(5 * np.var([3, 2, 1, -8, -7, -6]))],
         [-3.0, np.sqrt(5 * np.var([6, 5, 4, -11, -10, -9]))]])
    correct = pd.DataFrame(
        expected_values,
        columns=("sum(X) Absolute Difference",
                 "sum(X) Absolute Difference Jackknife SE"),
        index=rowindex)

    self.assertTrue(output.equals(correct))
def testDataframeRelativeTo(self):
    """Check the Distribution of X over Z as an absolute difference relative to Y == 0."""
    frame = pd.DataFrame({"X": range(11),
                          "Y": np.concatenate((np.zeros(6), np.ones(5))),
                          "Z": np.concatenate((np.zeros(3), np.ones(8)))})

    metric = metrics.Distribution("X", ["Z"])
    comparison = comparisons.AbsoluteDifference("Y", 0)
    output = core.Analyze(frame).relative_to(comparison).calculate(metric).run()

    expected_index = pd.MultiIndex(levels=[[1.], [0., 1.]],
                                   labels=[[0, 0], [0, 1]],
                                   names=["Y", "Z"])
    correct = pd.DataFrame(np.array([-0.2, 0.2]),
                           columns=["X Distribution Absolute Difference"],
                           index=expected_index)

    # Index, columns and values (within 1e-10) must all agree.
    self.assertTrue(
        all(output.index == correct.index) and
        all(output.columns == correct.columns) and
        np.all(abs(output.values - correct.values) < 1e-10))
def testSplitDataframe(self):
    """Check the Distribution of X over Z when the analysis is split by Y."""
    frame = pd.DataFrame({"X": range(11),
                          "Y": np.concatenate((np.zeros(6), np.ones(5))),
                          "Z": np.concatenate((np.zeros(3), np.ones(8)))})

    metric = metrics.Distribution("X", ["Z"])
    output = core.Analyze(frame).split_by(["Y"]).calculate(metric).run()

    expected_index = pd.MultiIndex(levels=[[0.0, 1.0], [0.0, 1.0]],
                                   labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
                                   names=["Y", "Z"])
    correct = pd.DataFrame(np.array([0.2, 0.8, 0.0, 1.0]),
                           columns=["X Distribution"],
                           index=expected_index)

    # Index, columns and values (within 1e-10) must all agree.
    self.assertTrue(
        all(output.index == correct.index) and
        all(output.columns == correct.columns) and
        np.all(abs(output.values - correct.values) < 1e-10))
def _isnull_old(obj):
    """Detect missing values. Treat None, NaN, INF, -INF as null.

    Parameters
    ----------
    obj : ndarray or object value
        Scalar, array-like or pandas object to inspect.

    Returns
    -------
    boolean ndarray or boolean

    Raises
    ------
    NotImplementedError
        If `obj` is a MultiIndex.
    """
    if lib.isscalar(obj):
        return lib.checknull_old(obj)

    # hack (for now) because MI registers as ndarray
    if isinstance(obj, pd.MultiIndex):
        raise NotImplementedError("isnull is not defined for MultiIndex")

    if isinstance(obj, (ABCSeries, np.ndarray, pd.Index)):
        return _isnull_ndarraylike_old(obj)

    if isinstance(obj, ABCGeneric):
        return obj._constructor(obj._data.isnull(func=_isnull_old))

    if isinstance(obj, list) or hasattr(obj, '__array__'):
        return _isnull_ndarraylike_old(np.asarray(obj))

    # Anything else is only null when it is None itself.
    return obj is None
def test_equals_op_multiindex(self):
    """Check element-wise ``==`` between a MultiIndex and other indexes."""
    # GH9785
    # test comparisons of multiindex
    from pandas.compat import StringIO

    csv_text = StringIO('a,b,c\n1,2,3\n4,5,6')
    df = pd.read_csv(csv_text, index_col=[0, 1])
    tm.assert_numpy_array_equal(df.index == df.index,
                                np.array([True, True]))

    same_values = MultiIndex.from_tuples([(1, 2), (4, 5)])
    tm.assert_numpy_array_equal(df.index == same_values,
                                np.array([True, True]))

    one_differs = MultiIndex.from_tuples([(1, 2), (4, 6)])
    tm.assert_numpy_array_equal(df.index == one_differs,
                                np.array([True, False]))

    # Comparing against an index of a different length must raise.
    longer = MultiIndex.from_tuples([(1, 2), (4, 5), (8, 9)])
    with tm.assertRaisesRegexp(ValueError, "Lengths must match"):
        df.index == longer

    index_a = Index(['foo', 'bar', 'baz'])
    with tm.assertRaisesRegexp(ValueError, "Lengths must match"):
        df.index == index_a

    tm.assert_numpy_array_equal(index_a == longer,
                                np.array([False, False, False]))
def test_stack_ints(self):
    """Check that stacking several integer column levels equals stacking them one by one."""
    column_tuples = list(itertools.product(range(3), repeat=3))
    df = DataFrame(np.random.randn(30, 27),
                   columns=MultiIndex.from_tuples(column_tuples))

    assert_frame_equal(df.stack(level=[1, 2]),
                       df.stack(level=1).stack(level=1))
    assert_frame_equal(df.stack(level=[-2, -1]),
                       df.stack(level=1).stack(level=1))

    # Repeat with column levels that are named 0, 1 and 2.
    df_named = df.copy()
    df_named.columns.set_names(range(3), inplace=True)
    assert_frame_equal(df_named.stack(level=[1, 2]),
                       df_named.stack(level=1).stack(level=1))
def test_unstack_level_binding(self):
    """Check the frame produced by unstacking two levels and stacking one back."""
    # GH9856
    source_index = pd.MultiIndex(
        levels=[[u('foo'), u('bar')],
                [u('one'), u('two')],
                [u('a'), u('b')]],
        labels=[[0, 0, 1, 1], [0, 1, 0, 1], [1, 0, 1, 0]],
        names=[u('first'), u('second'), u('third')])
    series = pd.Series(0, index=source_index)

    result = series.unstack([1, 2]).stack(0)

    expected_mi = pd.MultiIndex(levels=[['foo', 'bar'], ['one', 'two']],
                                labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
                                names=['first', 'second'])
    expected_values = np.array([[np.nan, 0],
                                [0, np.nan],
                                [np.nan, 0],
                                [0, np.nan]],
                               dtype=np.float64)
    expected = pd.DataFrame(expected_values,
                            index=expected_mi,
                            columns=pd.Index(['a', 'b'], name='third'))

    assert_frame_equal(result, expected)
def test_unstack_to_series(self):
    """Check DataFrame.unstack() to a Series: round trip, NA values and repetition."""
    # check reversibility
    unstacked = self.frame.unstack()
    self.assertTrue(isinstance(unstacked, Series))

    undo = unstacked.unstack().T
    assert_frame_equal(undo, self.frame)

    # check NA handling
    data = DataFrame({'x': [1, 2, np.nan], 'y': [3.0, 4, np.nan]})
    data.index = Index(['a', 'b', 'c'])
    result = data.unstack()

    midx = MultiIndex(levels=[['x', 'y'], ['a', 'b', 'c']],
                      labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])
    expected = Series([1, 2, np.nan, 3, 4, np.nan], index=midx)
    assert_series_equal(result, expected)

    # check composability of unstack
    old_data = data.copy()
    for _ in range(4):
        data = data.unstack()
    # Four consecutive unstacks are expected to give back the original frame.
    assert_frame_equal(old_data, data)
def bar_fueltype_and_country_totals(dfs, keys, figsize=(12,8)):
    """Draw one bar chart per country from the frame selected by `lookup(dfs, keys)`.

    Countries are the first column level when the columns are a MultiIndex,
    otherwise the columns themselves.

    Returns the figure and the axes of the last country drawn (or the object
    returned by `plt.subplots` when there are no countries).
    """
    df = lookup(dfs, keys)

    if isinstance(df.columns, pd.MultiIndex):
        countries = df.columns.levels[0]
    else:
        countries = df.columns

    subplots = gather_nrows_ncols(len(countries))
    fig, ax = plt.subplots(*subplots, figsize=figsize)

    # Flat iterator over the axes for both grid shapes.
    ax_iter = ax.flat if sum(subplots) > 2 else np.array(ax).flat

    for country in countries:
        ax = next(ax_iter)
        df[country].plot.bar(ax=ax, sharex=True, rot=55, legend=None)
        ax.ticklabel_format(axis='y', style='sci', scilimits=(-2,2))
        ax.set_title(country)

    fig.tight_layout(pad=0.5)
    return fig, ax
def _get_header_iterable(self):
    """Reformats all but the last header rows.

    Columns whose first-level label equals ORG_ROW_NAMES are left out.

    Returns a list with one entry per header row. For MultiIndex columns,
    every row but the last groups consecutive repeated labels into sub-lists;
    the last row is a flat list. For flat columns a single flat row is returned.
    """
    df_clean = self.df.loc[:, self.df.columns.get_level_values(0) != ORG_ROW_NAMES]

    if not isinstance(df_clean.columns, pd.MultiIndex):
        return [df_clean.columns.tolist()]

    # Materialise the transposed tuples: on Python 3 `zip` returns an
    # iterator, which has no len().
    transpose_tuples = list(zip(*df_clean.columns.tolist()))
    last_row = len(transpose_tuples) - 1

    header_values = []
    for i, t in enumerate(transpose_tuples):
        if i < last_row:
            # Not the last row, aggregate repeated items, e.g. [['aa', 'aa', 'aa'], ['bb', 'bb', 'bb']]
            header_values.append([list(g) for _, g in itertools.groupby(t)])
        else:
            # For the last row keep all elements in single list, e.g. ['a', 'b', 'c', 'a', 'b', 'c']
            header_values.append(list(t))

    return header_values
def _perform_operation(self, dataframe, key, schema, value_func, operation):
    """Apply `operation` to the metric returned by `value_func` and store it in `dataframe`.

    The result is written in place to a new column named '<metric>_<key>'
    (or (reference, '<metric>_<key>') when the columns are a MultiIndex).

    :param dataframe: The data frame to read from and add the result column to.
    :param key: Suffix appended to the metric name to build the new column name.
    :param schema: Passed through to `value_func`.
    :param value_func: Callable `(dataframe, schema, reference=...)` returning the
        metric; its `name` is used to build the new column name.
    :param operation: Callable applied to the metric (per group for a MultiIndex index).
    """
    # Check for references
    if isinstance(dataframe.columns, pd.MultiIndex):
        # Each reference appears once per metric column in level 0; process
        # every reference only once, in order of first appearance.
        references = dataframe.columns.get_level_values(0).unique().tolist()
    else:
        references = [None]

    for reference in references:
        metric_df = value_func(dataframe, schema, reference=reference)

        if reference is None:
            operation_key = '{}_{}'.format(metric_df.name, key)
        else:
            operation_key = (reference, '{}_{}'.format(metric_df.name[1], key))

        if isinstance(dataframe.index, pd.MultiIndex):
            # Group by every index level except the first.
            unstack_levels = list(range(1, dataframe.index.nlevels))
            dataframe[operation_key] = metric_df.groupby(level=unstack_levels).apply(operation)
        else:
            dataframe[operation_key] = operation(metric_df)
def _render_data(self, dataframe, display_schema):
    """Render every row of `dataframe` into a dict and return the list of dicts.

    The first dimensions of the schema (one per index level) are rendered
    from the row's index value; the remaining ones are passed on as column
    dimensions when rendering the metrics.
    """
    index = dataframe.index
    n_row_dims = len(index.levels) if isinstance(index, pd.MultiIndex) else 1

    dimensions = list(display_schema['dimensions'].items())
    row_dimensions = dimensions[:n_row_dims]
    column_dimensions = dimensions[n_row_dims:]

    rendered = []
    for idx, df_row in dataframe.iterrows():
        # A flat index yields scalars; wrap them so both cases are tuples.
        idx_values = idx if isinstance(idx, tuple) else (idx,)

        record = {}
        for name, value in self._render_dimension_data(idx_values, row_dimensions):
            record[name] = value

        metric_items = self._render_metric_data(df_row, column_dimensions,
                                                display_schema['metrics'],
                                                display_schema.get('references'))
        for name, value in metric_items:
            record[name] = value

        rendered.append(record)

    return rendered
def create_multi_index(arr):
    '''Build a pandas.MultiIndex from the coordinates of a DataArray

    Parameters
    ----------
    arr: xarray.DataArray

    Returns
    -------
    index: pandas.MultiIndex
        The cartesian product of the coordinate values of every dimension
        in ``arr.dims``, with the dimension names as level names.
    '''
    dim_names = arr.dims
    coord_values = [getattr(arr, dim_name).values for dim_name in dim_names]
    return pd.MultiIndex.from_product(coord_values, names=dim_names)
def setUp(self):
    """Build ``self.samples``: 20 rows of increasing timestamps.

    Every row holds four consecutive values of a running clock, stored under
    the columns ('fn1', 'begin'), ('fn1', 'end'), ('fn2', 'begin') and
    ('fn2', 'end').
    """
    samples = []
    t = 1.0
    for _ in range(20):
        sample = []
        # Record the clock, then advance it by the next step.
        for step in (1.1, 0.2, 1.5, 0.1):
            sample.append(t)
            t += step
        samples.append(sample)

    # `from_product` yields the same index as spelling out levels and
    # `labels`, without the `labels` keyword that newer pandas no longer accepts.
    columns = pd.MultiIndex.from_product([['fn1', 'fn2'], ['begin', 'end']])
    self.samples = pd.DataFrame(data=samples, columns=columns)
def read_seurat_hdf5(hdf5_file):
    """Read the 'seurat_matrix' group of an HDF5 file into a DataFrame.

    The stored matrix is transposed, and its column labels - strings of the
    form 'condition|replicate|cell|grna' - are replaced by a MultiIndex with
    the levels 'condition', 'replicate', 'cell', 'grna' and 'gene'.

    :param hdf5_file: Path of the HDF5 file to read.
    :return: The transposed matrix as a DataFrame with MultiIndex columns.
    """
    import h5py

    with h5py.File(hdf5_file, 'r') as handle:
        # `dataset[()]` reads the whole dataset; it replaces the `.value`
        # attribute that newer h5py releases no longer provide.
        cols = handle.get("seurat_matrix/columns")[()]
        rows = handle.get("seurat_matrix/rows")[()]
        df = handle.get("seurat_matrix/matrix")[()]
    seurat_matrix = pd.DataFrame(df, index=cols, columns=rows).T

    # add info as multiindex columns
    # Split once and build real lists: on Python 3 `map` returns one-shot
    # iterators, which would be exhausted before the index is built.
    fields = list(seurat_matrix.columns.str.split("|"))
    condition = [field[0] for field in fields]
    replicate = [field[1] for field in fields]
    cell = [field[2] for field in fields]
    grna = [field[3] for field in fields]

    gene = []
    for name in grna:
        parts = name.split("_")
        # Second '_' field when there is one, else the first four characters.
        gene.append(parts[1] if len(parts) > 1 else parts[0][:4])

    seurat_matrix.columns = pd.MultiIndex.from_arrays(
        [condition, replicate, cell, grna, gene],
        names=['condition', 'replicate', 'cell', 'grna', 'gene'])

    return seurat_matrix
def test_observed_otus(self):
    """Check the 'observed_otus' rarefaction data for a 2-feature, 3-sample table."""
    sample_ids = ['S1', 'S2', 'S3']
    table = biom.Table(np.array([[150, 100, 100], [50, 100, 100]]),
                       ['O1', 'O2'], sample_ids)

    obs = _compute_rarefaction_data(feature_table=table,
                                    min_depth=1,
                                    max_depth=200,
                                    steps=2,
                                    iterations=1,
                                    phylogeny=None,
                                    metrics=['observed_otus'])

    exp_ind = pd.MultiIndex.from_product([[1, 200], [1]],
                                         names=['depth', 'iter'])
    exp = pd.DataFrame(data=[[1, 2] for _ in sample_ids],
                       columns=exp_ind, index=sample_ids)
    pdt.assert_frame_equal(obs['observed_otus'], exp)
def test_multiple_metrics(self):
    """Check the rarefaction data when 'observed_otus' and 'shannon' are requested together."""
    sample_ids = ['S1', 'S2', 'S3']
    table = biom.Table(np.array([[150, 100, 100], [50, 100, 100]]),
                       ['O1', 'O2'], sample_ids)

    obs = _compute_rarefaction_data(feature_table=table,
                                    min_depth=1,
                                    max_depth=200,
                                    steps=2,
                                    iterations=1,
                                    phylogeny=None,
                                    metrics=['observed_otus', 'shannon'])

    exp_ind = pd.MultiIndex.from_product([[1, 200], [1]],
                                         names=['depth', 'iter'])

    expected_otus = pd.DataFrame(data=[[1, 2] for _ in sample_ids],
                                 columns=exp_ind, index=sample_ids)
    pdt.assert_frame_equal(obs['observed_otus'], expected_otus)

    expected_shannon = pd.DataFrame(data=[[0., 0.811278124459],
                                          [0., 1.],
                                          [0., 1.]],
                                    columns=exp_ind, index=sample_ids)
    pdt.assert_frame_equal(obs['shannon'], expected_shannon)
def test_one_iteration_no_metadata(self):
    """Check `_compute_summary` for two depths with one iteration each and no counts."""
    sample_ids = ['S1', 'S2', 'S3']
    depth_iter = pd.MultiIndex.from_product([[1, 200], [1]],
                                            names=['depth', 'iter'])
    data = pd.DataFrame(data=[[1, 2] for _ in sample_ids],
                        columns=depth_iter, index=sample_ids)

    # No counts provided because no metadata
    obs = _compute_summary(data, 'sample-id')

    # A single iteration means every statistic equals the observed value.
    shallow = [1, 1] + [1.] * 9
    deep = [200, 1] + [2.] * 9
    expected_rows = [[sample_id] + stats
                     for sample_id in sample_ids
                     for stats in (shallow, deep)]
    exp = pd.DataFrame(data=expected_rows,
                       columns=['sample-id', 'depth', 'count', 'min', '2%',
                                '9%', '25%', '50%', '75%', '91%', '98%',
                                'max'])
    pdt.assert_frame_equal(exp, obs)
def test_two_iterations_no_metadata(self):
    """Check `_compute_summary` for two depths with two iterations each and no counts."""
    sample_ids = ['S1', 'S2', 'S3']
    depth_iter = pd.MultiIndex.from_product([[1, 200], [1, 2]],
                                            names=['depth', 'iter'])
    data = pd.DataFrame(data=[[1, 2, 3, 4] for _ in sample_ids],
                        columns=depth_iter, index=sample_ids)

    # No counts provided because no metadata
    obs = _compute_summary(data, 'sample-id')

    # Every sample holds the same values, so the expected rows only differ
    # in the sample id.
    shallow = [1, 1, 1., 1.02, 1.09, 1.25, 1.5, 1.75, 1.91, 1.98, 2.]
    deep = [200, 1, 3., 3.02, 3.09, 3.25, 3.5, 3.75, 3.91, 3.98, 4.]
    expected_rows = [[sample_id] + stats
                     for sample_id in sample_ids
                     for stats in (shallow, deep)]
    exp = pd.DataFrame(data=expected_rows,
                       columns=['sample-id', 'depth', 'count', 'min', '2%',
                                '9%', '25%', '50%', '75%', '91%', '98%',
                                'max'])
    pdt.assert_frame_equal(exp, obs)
def test_unique_metadata_groups(self):
    """Check `_reindex_with_metadata` when every sample has its own 'pet' value."""
    names = ['depth', 'iter']
    columns = pd.MultiIndex.from_tuples(
        [(1, 1), (1, 2), (200, 1), (200, 2), ('pet', '')], names=names)
    data = pd.DataFrame(data=[[1, 2, 3, 4, 'russ'],
                              [5, 6, 7, 8, 'milo'],
                              [9, 10, 11, 12, 'peanut']],
                        columns=columns, index=['S1', 'S2', 'S3'])

    obs = _reindex_with_metadata('pet', ['pet'], data)

    # The expected columns keep 'pet' and '' as (unused) level entries.
    exp_col = pd.MultiIndex(levels=[[1, 200, 'pet'], [1, 2, '']],
                            labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
                            names=names)
    exp_ind = pd.Index(['milo', 'peanut', 'russ'], name='pet')

    expected_values = pd.DataFrame(data=[[5, 6, 7, 8],
                                         [9, 10, 11, 12],
                                         [1, 2, 3, 4]],
                                   columns=exp_col, index=exp_ind)
    pdt.assert_frame_equal(expected_values, obs[0])

    expected_counts = pd.DataFrame(data=[[1] * 4 for _ in range(3)],
                                   columns=exp_col, index=exp_ind)
    pdt.assert_frame_equal(expected_counts, obs[1])
def test_multiple_categories(self):
    """Check `_reindex_with_metadata` for each of two metadata categories ('pet', 'toy')."""
    names = ['depth', 'iter']
    categories = ['pet', 'toy']
    columns = pd.MultiIndex.from_tuples(
        [(1, 1), (1, 2), (200, 1), (200, 2), ('pet', ''), ('toy', '')],
        names=names)
    data = pd.DataFrame(data=[[1, 2, 3, 4, 'russ', 'stick'],
                              [5, 6, 7, 8, 'milo', 'yeti'],
                              [9, 10, 11, 12, 'peanut', 'stick']],
                        columns=columns, index=['S1', 'S2', 'S3'])

    # The expected columns keep the category names and '' as (unused) level entries.
    exp_col = pd.MultiIndex(levels=[[1, 200, 'pet', 'toy'], [1, 2, '']],
                            labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
                            names=names)

    # Group by 'pet': every sample is its own group.
    obs = _reindex_with_metadata('pet', categories, data)
    exp_ind = pd.Index(['milo', 'peanut', 'russ'], name='pet')

    exp = pd.DataFrame(data=[[5, 6, 7, 8], [9, 10, 11, 12], [1, 2, 3, 4]],
                       columns=exp_col, index=exp_ind)
    pdt.assert_frame_equal(exp, obs[0])

    exp = pd.DataFrame(data=[[1] * 4 for _ in range(3)],
                       columns=exp_col, index=exp_ind)
    pdt.assert_frame_equal(exp, obs[1])

    # Group by 'toy': two samples share 'stick'.
    obs = _reindex_with_metadata('toy', categories, data)
    exp_ind = pd.Index(['stick', 'yeti'], name='toy')

    exp = pd.DataFrame(data=[[5, 6, 7, 8], [5, 6, 7, 8]],
                       columns=exp_col, index=exp_ind)
    pdt.assert_frame_equal(exp, obs[0])

    exp = pd.DataFrame(data=[[2] * 4, [1] * 4],
                       columns=exp_col, index=exp_ind)
    pdt.assert_frame_equal(exp, obs[1])
def normalize_segment_names(dataframe, inplace=False):
    """
    Makes the segment index of the dataframe have names which correspond to the original .mat segment names.

    :param dataframe: The dataframe with segment names. Its index must hold (filename, frame) pairs.
    :param inplace: If True, the segment index will be changed in place in the given data frame.
    :return: A DataFrame where the segment name part of the index has been canonicalized. If inplace is True, the
             original dataframe is returned, otherwise a copy is returned.
    """
    # `.values` yields the index entries as tuples; it replaces
    # `Index.get_values()`, which newer pandas releases no longer provide.
    index_values = dataframe.index.values
    fixed_values = [(fileutils.get_segment_name(filename), frame) for filename, frame in index_values]

    if not inplace:
        dataframe = dataframe.copy()

    dataframe.index = pd.MultiIndex.from_tuples(fixed_values, names=dataframe.index.names)
    return dataframe
def reshape_frames(dataframe, frame_length=12):
    """
    Returns a new dataframe with the given frame length.

    :param dataframe: A pandas DataFrame with one window per row.
    :param frame_length: The desired number of windows for each feature frame. Must divide the number of windows in
                         *dataframe* evenly.
    :return: A new pandas DataFrame with the desired window frame width. The columns of the new data-frame will be
             multi-index so that future concatenation of data frames align properly.
    :raises ValueError: If the number of windows is not divisible by *frame_length*.
    """
    # Assert that the length of the data frame is divisible by
    # frame_length
    n_windows, window_width = dataframe.shape

    if n_windows % frame_length != 0:
        raise ValueError("The dataframe has {} windows which"
                         " is not divisible by the frame"
                         " length {}".format(n_windows, frame_length))

    values = dataframe.values
    # Integer division: `reshape` needs an int row count, and `/` yields a
    # float on Python 3.
    n_frames = n_windows // frame_length
    frame_width = window_width * frame_length

    window_columns = dataframe.columns
    column_index = pd.MultiIndex.from_product([range(frame_length), window_columns],
                                              names=['window', 'feature'])
    reshaped_frame = pd.DataFrame(data=values.reshape(n_frames, frame_width),
                                  columns=column_index)

    # The former trailing `sortlevel(axis=1)` call discarded its result, so
    # the frame was always returned in construction order; that order is kept.
    return reshaped_frame
def get_zeroth_quarter_idx(self, stacked_last_per_qtr):
    """
    Keeps the releases whose event date is on or after the simulation date
    and picks, per simulation date and sid, the first of them - the upcoming
    release.

    Parameters
    ----------
    stacked_last_per_qtr : pd.DataFrame
        A DataFrame with index of calendar dates, sid, and normalized
        quarters with each row being the latest estimate for the row's
        index values, sorted by event date.

    Returns
    -------
    next_releases_per_date_index : pd.MultiIndex
        An index of calendar dates, sid, and normalized quarters, for only
        the rows that have a next event.
    """
    event_dates = stacked_last_per_qtr[EVENT_DATE_FIELD_NAME]
    simulation_dates = stacked_last_per_qtr.index.get_level_values(
        SIMULATION_DATES
    )
    upcoming = stacked_last_per_qtr.loc[event_dates >= simulation_dates]

    per_date_and_sid = upcoming.groupby(
        level=[SIMULATION_DATES, SID_FIELD_NAME],
        as_index=False,
    )
    # Here we take advantage of the fact that `stacked_last_per_qtr` is
    # sorted by event date: the first row of each group is the next release.
    next_releases_per_date = per_date_and_sid.nth(0)
    return next_releases_per_date.index
def get_zeroth_quarter_idx(self, stacked_last_per_qtr):
    """
    Keeps the releases whose event date is on or before the simulation date
    and picks, per simulation date and sid, the last of them - the most
    recent release.

    Parameters
    ----------
    stacked_last_per_qtr : pd.DataFrame
        A DataFrame with index of calendar dates, sid, and normalized
        quarters with each row being the latest estimate for the row's
        index values, sorted by event date.

    Returns
    -------
    previous_releases_per_date_index : pd.MultiIndex
        An index of calendar dates, sid, and normalized quarters, for only
        the rows that have a previous event.
    """
    event_dates = stacked_last_per_qtr[EVENT_DATE_FIELD_NAME]
    simulation_dates = stacked_last_per_qtr.index.get_level_values(
        SIMULATION_DATES
    )
    already_released = stacked_last_per_qtr.loc[event_dates <= simulation_dates]

    per_date_and_sid = already_released.groupby(
        level=[SIMULATION_DATES, SID_FIELD_NAME],
        as_index=False,
    )
    # Here we take advantage of the fact that `stacked_last_per_qtr` is
    # sorted by event date: the last row of each group is the latest release.
    previous_releases_per_date = per_date_and_sid.nth(-1)
    return previous_releases_per_date.index
def validate(self, obj, value):
    """Validate `value` via the parent class and optionally sort its columns.

    When the 'lexsort' metadata is set and the validated value has MultiIndex
    columns, the columns are sorted by their first level.
    """
    value = super(PandasDataFrame, self).validate(obj, value)
    if self.get_metadata('lexsort') and isinstance(value.columns, pd.MultiIndex):
        # `sort_index(axis=1, level=0)` replaces `sortlevel(0, axis=1)`,
        # which newer pandas releases no longer provide.
        value = value.sort_index(axis=1, level=0)
    return value
def testTwoDimensionalDistribution(self):
    """Check Distribution of X over the (Y, Z) pairs with unit weights."""
    frame = pd.DataFrame({"X": [1, 1, 1, 2, 2, 3, 4],
                          "Y": [1, 2, 0, 1, 1, 1, 1],
                          "Z": [1, 0, 0, 0, 0, 0, 0]})
    weights = np.array([1] * 7)

    metric = metrics.Distribution("X", ["Y", "Z"])
    output = metric(frame, weights)

    expected_index = pd.MultiIndex(levels=[[0, 1, 2], [0, 1]],
                                   labels=[[1, 2, 0, 1], [1, 0, 0, 0]],
                                   names=["Y", "Z"])
    correct = pd.DataFrame(np.array([1 / 14., 1 / 14., 1 / 14., 11 / 14.]),
                           columns=[""],
                           index=expected_index)

    self.assertTrue(output.equals(correct))
def testShuffledDataframeRelativeToJackknife(self):
    """Check Distribution relative to Y == 0 with jackknife SEs on randomly permuted rows."""
    # Same as test above, but also testing that reordering the data doesn't
    # change results, up to order.
    frame = pd.DataFrame({"X": range(11),
                          "Y": np.concatenate((np.zeros(6), np.ones(5))),
                          "Z": np.concatenate((np.zeros(3), np.ones(8)))})

    metric = metrics.Distribution("X", ["Z"])
    se_method = standard_errors.Jackknife()

    shuffled = frame.iloc[np.random.permutation(11)]
    analysis = core.Analyze(shuffled).relative_to(
        comparisons.AbsoluteDifference("Y", 0))
    output = analysis.with_standard_errors(se_method).calculate(metric).run()

    # Bring the output into a canonical row order.
    output = output.reset_index()
    output = output.sort_values(by=["Y", "Z"])
    output = output.set_index(["Y", "Z"])

    expected_index = pd.MultiIndex(levels=[[1.], [0., 1.]],
                                   labels=[[0, 0], [0, 1]],
                                   names=["Y", "Z"])
    correct = pd.DataFrame(
        np.array([[-0.2, 0.18100283490], [0.2, 0.18100283490]]),
        columns=["X Distribution Absolute Difference",
                 "X Distribution Absolute Difference Jackknife SE"],
        index=expected_index)
    # Apply the same canonical ordering to the expectation.
    correct = correct.reset_index()
    correct = correct.sort_values(by=["Y", "Z"])
    correct = correct.set_index(["Y", "Z"])

    # Index, columns and values (within 1e-10) must all agree.
    self.assertTrue(
        all(output.index == correct.index) and
        all(output.columns == correct.columns) and
        np.all(abs(output.values - correct.values) < 1e-10))
def groupby_deco(func):
    """Decorator that lets a method written for a DataFrame also accept a DataFrameGroupBy.

    When the wrapped method receives a ``DataFrameGroupBy`` it is applied to
    every group; if the combined result is a Series with a MultiIndex, it is
    unstacked. Any other argument is passed straight to the method.

    :param func: Method with the signature ``func(self, thing, *args, **kwargs)``.
    :return: The wrapping method.
    """
    import functools

    # Keep the wrapped method's name and docstring on the wrapper.
    @functools.wraps(func)
    def func_wrapper(self, thing, *args, **kwargs):
        if not isinstance(thing, pd.core.groupby.DataFrameGroupBy):
            return func(self, thing, *args, **kwargs)

        agg = thing.apply(lambda x: func(self, x, *args, **kwargs))
        is_series = isinstance(agg, pd.core.series.Series)
        has_multiindex = isinstance(agg.index, pd.MultiIndex)
        if is_series and has_multiindex:
            return agg.unstack()
        return agg

    return func_wrapper
def _isnull_new(obj):
    """Detect missing values in a scalar, array-like or pandas object.

    Scalars are checked with `lib.checknull`; array-likes and pandas objects
    return an element-wise result. Any other object is null only when it is
    None. Raises NotImplementedError for a MultiIndex.
    """
    if lib.isscalar(obj):
        return lib.checknull(obj)

    # hack (for now) because MI registers as ndarray
    if isinstance(obj, pd.MultiIndex):
        raise NotImplementedError("isnull is not defined for MultiIndex")

    if isinstance(obj, (ABCSeries, np.ndarray, pd.Index)):
        return _isnull_ndarraylike(obj)

    if isinstance(obj, ABCGeneric):
        return obj._constructor(obj._data.isnull(func=isnull))

    if isinstance(obj, list) or hasattr(obj, '__array__'):
        return _isnull_ndarraylike(np.asarray(obj))

    return obj is None
def test_get_level_values_box(self):
    """Check that values read from a datetime level come back as Timestamp objects."""
    from pandas import MultiIndex

    dates = date_range('1/1/2000', periods=4)
    index = MultiIndex(levels=[dates, [0, 1]],
                       labels=[[0, 0, 1, 1, 2, 2, 3, 3],
                               [0, 1, 0, 1, 0, 1, 0, 1]])

    first_value = index.get_level_values(0)[0]
    self.assertTrue(isinstance(first_value, Timestamp))
def setUp(self):
    """Create one sample index of every kind in ``self.indices`` and register them."""
    self.indices = {
        'unicodeIndex': tm.makeUnicodeIndex(100),
        'strIndex': tm.makeStringIndex(100),
        'dateIndex': tm.makeDateIndex(100),
        'periodIndex': tm.makePeriodIndex(100),
        'tdIndex': tm.makeTimedeltaIndex(100),
        'intIndex': tm.makeIntIndex(100),
        # NOTE(review): built with makeIntIndex, same as 'intIndex' - confirm
        # whether a dedicated range-index factory was intended.
        'rangeIndex': tm.makeIntIndex(100),
        'floatIndex': tm.makeFloatIndex(100),
        'boolIndex': Index([True, False]),
        'catIndex': tm.makeCategoricalIndex(100),
        'empty': Index([]),
        'tuples': MultiIndex.from_tuples(lzip(['foo', 'bar', 'baz'],
                                              [1, 2, 3])),
    }
    self.setup_indices()
def test_construction_list_mixed_tuples(self):
    """Check that a mixed list of tuples and scalars builds a plain Index in either order."""
    # 10697
    # if we are constructing from a mixed list of tuples, make sure that we
    # are independent of the sorting order
    idx1 = Index([('A', 1), 'B'])
    # Two separate assertions: the assert methods return None, so chaining
    # them with `and` would never evaluate the second check.
    self.assertIsInstance(idx1, Index)
    self.assertNotIsInstance(idx1, MultiIndex)

    idx2 = Index(['B', ('A', 1)])
    self.assertIsInstance(idx2, Index)
    self.assertNotIsInstance(idx2, MultiIndex)
def test_str_attribute(self):
    """Check the ``.str`` accessor of Index: strip methods, unsupported types, split, startswith."""
    # GH9068
    names = Index([' jack', 'jill ', ' jesse ', 'frank'])
    for method in ['strip', 'rstrip', 'lstrip']:
        expected = Index([getattr(str, method)(x) for x in names.values])
        tm.assert_index_equal(
            getattr(Index.str, method)(names.str), expected)

    # create a few instances that are not able to use .str accessor
    non_string_indices = [Index(range(5)),
                          tm.makeDateIndex(10),
                          MultiIndex.from_tuples([('foo', '1'), ('bar', '3')]),
                          PeriodIndex(start='2000', end='2010', freq='A')]
    for idx in non_string_indices:
        with self.assertRaisesRegexp(AttributeError,
                                     'only use .str accessor'):
            idx.str.repeat(2)

    # split without expansion keeps one list per element ...
    sentences = Index(['a b c', 'd e', 'f'])
    expected = Index([['a', 'b', 'c'], ['d', 'e'], ['f']])
    tm.assert_index_equal(sentences.str.split(), expected)
    tm.assert_index_equal(sentences.str.split(expand=False), expected)

    # ... while expand=True produces a MultiIndex padded with NaN.
    expected = MultiIndex.from_tuples([('a', 'b', 'c'),
                                       ('d', 'e', np.nan),
                                       ('f', np.nan, np.nan)])
    tm.assert_index_equal(sentences.str.split(expand=True), expected)

    # test boolean case, should return np.array instead of boolean Index
    labels = Index(['a1', 'a2', 'b1', 'b2'])
    expected = np.array([True, True, False, False])
    tm.assert_numpy_array_equal(labels.str.startswith('a'), expected)
    self.assertIsInstance(labels.str.startswith('a'), np.ndarray)

    s = Series(range(4), index=labels)
    expected = Series(range(2), index=['a1', 'a2'])
    tm.assert_series_equal(s[s.index.str.startswith('a')], expected)
def test_reindex_doesnt_preserve_type_if_target_is_empty_index(self):
    """Check that reindexing to an empty typed index takes the dtype of the target."""
    # GH7774
    idx = pd.Index(list('abc'))

    def get_reindex_type(target):
        reindexed_index = idx.reindex(target)[0]
        return reindexed_index.dtype.type

    self.assertEqual(get_reindex_type(pd.Int64Index([])), np.int64)
    self.assertEqual(get_reindex_type(pd.Float64Index([])), np.float64)
    self.assertEqual(get_reindex_type(pd.DatetimeIndex([])), np.datetime64)

    # An empty MultiIndex target: check the dtype of each level.
    empty_target = pd.MultiIndex([pd.Int64Index([]), pd.Float64Index([])],
                                 [[], []])
    reindexed = idx.reindex(empty_target)[0]
    self.assertEqual(reindexed.levels[0].dtype.type, np.int64)
    self.assertEqual(reindexed.levels[1].dtype.type, np.float64)
def test_pivot_index_none(self):
    """Check DataFrame.pivot when no `index` argument is given (the existing index is used)."""
    # gh-3962
    data = {
        'index': ['A', 'B', 'C', 'C', 'B', 'A'],
        'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'],
        'values': [1., 2., 3., 3., 2., 1.]
    }

    frame = DataFrame(data).set_index('index')
    result = frame.pivot(columns='columns', values='values')
    expected = DataFrame({
        'One': {'A': 1., 'B': 2., 'C': 3.},
        'Two': {'A': 1., 'B': 2., 'C': 3.}
    })

    expected.index.name, expected.columns.name = 'index', 'columns'
    assert_frame_equal(result, expected)

    # omit values
    result = frame.pivot(columns='columns')

    expected.columns = pd.MultiIndex.from_tuples([('values', 'One'),
                                                  ('values', 'Two')],
                                                 names=[None, 'columns'])
    expected.index.name = 'index'
    assert_frame_equal(result, expected, check_names=False)
    self.assertEqual(result.index.name, 'index',)
    self.assertEqual(result.columns.names, (None, 'columns'))
    expected.columns = expected.columns.droplevel(0)

    # The former rebinding of `data` at this point was never used (`frame`
    # is not rebuilt from it), so it has been dropped.
    result = frame.pivot(columns='columns', values='values')

    expected.columns.name = 'columns'
    assert_frame_equal(result, expected)