我们从Python开源项目中，提取了以下50个代码示例，用于说明如何使用pandas.Categorical()。
def break_info(self, range=None):
    """
    Gather the break information of this scale into a dict.

    Parameters
    ----------
    range : optional
        Range to report. When ``None``, ``self.dimension()`` is used.

    Returns
    -------
    dict
        Keys ``'range'``, ``'labels'``, ``'major'`` and ``'minor'``.
        ``'minor'`` is always an empty list.
    """
    if range is None:
        range = self.dimension()

    # for discrete, limits != range
    breaks = self.get_breaks(self.limits)

    if breaks is None:
        # both names refer to the same empty list
        major = labels = []
    else:
        labels = self.get_labels(breaks)
        major = self.map(pd.Categorical(breaks.keys()))

    return {'range': range,
            'labels': labels,
            'major': major,
            'minor': []}
def map(self, data, layout):
    """
    Add a ``PANEL`` column to ``data`` that assigns each row to a panel.

    The column is an ordered categorical that uses the categories of
    ``layout['PANEL']``. Empty data only receives an empty ``PANEL``
    column and is returned as is.
    """
    def as_panel(values):
        # ordered categorical that shares the layout's panel categories
        return pd.Categorical(
            values,
            categories=layout['PANEL'].cat.categories,
            ordered=True)

    if len(data) == 0:
        data['PANEL'] = as_panel([])
        return data

    facet_vals = eval_facet_vars(data, self.vars, self.plot.environment)
    data, facet_vals = add_missing_facets(
        data, layout, self.vars, facet_vals)

    # assign each point to a panel
    keys = join_keys(facet_vals, layout, self.vars)
    data['PANEL'] = match(keys['x'], keys['y'], start=1)
    data = data.sort_values('PANEL', kind='mergesort')

    # matching dtype
    data['PANEL'] = as_panel(data['PANEL'])
    data.reset_index(drop=True, inplace=True)
    return data
def from_categorical(cls, categorical, missing_value=None):
    """
    Build a LabelArray out of a pandas categorical.

    Parameters
    ----------
    categorical : pd.Categorical
        The categorical object to convert.
    missing_value : bytes, unicode, or None, optional
        The missing value to use for this LabelArray.

    Returns
    -------
    la : LabelArray
        The LabelArray representation of this categorical.
    """
    categories = categorical.categories
    return LabelArray(categorical, missing_value, categories)
def as_categorical(self, name=None):
    """
    Convert self to a pandas categorical.

    Only arrays with a single dimension are accepted, since that is all
    pandas supports; anything else raises ``ValueError``.
    """
    if len(self.shape) > 1:
        raise ValueError("Can't convert a 2D array to a categorical.")

    with ignore_pandas_nan_categorical_warning():
        codes = self.as_int_array()
        # We need to make a copy because pandas >= 0.17 fails if this
        # buffer isn't writeable.
        categories = self.categories.copy()
        return pd.Categorical.from_codes(
            codes,
            categories,
            ordered=False,
            name=name,
        )
def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
    """
    Convert the columns listed in ``self.cat_cols_`` to categoricals.

    Parameters
    ----------
    X : pd.DataFrame or dask DataFrame
        Input frame. It is copied when it has a ``copy`` method; a dask
        frame is passed through ``categorize()`` first.
    y : ignored

    Returns
    -------
    The frame with the selected columns converted. When
    ``self.cat_cols_`` maps a column to a non-empty collection of
    categories, those categories are applied; columns flagged in
    ``self.ordered`` are made ordered.
    """
    is_dask = isinstance(X, dd.DataFrame)
    if is_dask:
        X = X.categorize()
    X = X.copy() if hasattr(X, 'copy') else X

    categories = self.cat_cols_
    # Only mapping-like containers carry explicit categories per column.
    lookup = getattr(categories, 'get', None)

    for k in categories:
        cat = lookup(k, None) if lookup is not None else None
        ordered = self.ordered.get(k, False)

        # can't use Categorical constructor since dask compat
        if not is_dask:
            X[k] = pd.Categorical(X[k])

        # Explicit None/length test: plain truthiness raises for
        # array-like category containers (ndarray, Index).
        if cat is not None and len(cat) > 0:
            X[k] = X[k].cat.set_categories(cat)
        if ordered:
            X[k] = X[k].cat.as_ordered()
    return X
def inverse_transform(self, X):
    """
    Rebuild a DataFrame from an encoded 2-D array.

    The leading ``len(self.non_cat_columns_)`` columns of ``X`` are taken
    over unchanged. For every categorical column, the block of ``X``
    given by ``self.cat_blocks_`` is reduced with ``argmax`` along axis 1
    and the resulting codes are turned back into a categorical. Columns
    are returned in the order of ``self.columns_``.
    """
    n_plain = len(self.non_cat_columns_)
    pieces = [pd.DataFrame(X[:, :n_plain], columns=self.non_cat_columns_)]

    for col in self.cat_columns_:
        codes = X[:, self.cat_blocks_[col]].argmax(1)
        values = pd.Categorical.from_codes(
            codes, self.categories_map_[col], ordered=self.ordered_map_[col])
        pieces.append(pd.Series(values, name=col))

    return pd.concat(pieces, axis=1)[self.columns_]
def _get_table(self, column, is_size=True): cols = list(range(5)) cols.append(self.header.index(column)) header = [self.header[c] for c in cols] rows = [ [row[c] for c in cols] for row in self.rows ] if is_size: for row in rows: row[5] = parse_size(row[5]) table = pd.DataFrame.from_records(rows, columns=header) table = table.rename(columns={ 'prog' : 'Program', 'prog2' : 'Program2', 'threads' : 'Threads', 'dataset' : 'Dataset', 'qcut' : 'Quality', }) table['Threads'] = pd.to_numeric(table['Threads']) table['Dataset'] = pd.Categorical(table['Dataset']) table['Program'] = pd.Categorical(table['Program']) table['Program2'] = pd.Categorical(table['Program2']) return table
def test_reindex_dtype(self):
    # Reindex a CategoricalIndex with a list target and with a Categorical
    # target, first without and then with an unused category on the source.
    values = ['a', 'b', 'c', 'a']
    target = ['a', 'c']
    wide = ['a', 'b', 'c', 'd']

    res, indexer = CategoricalIndex(values).reindex(target)
    tm.assert_index_equal(res, Index(['a', 'a', 'c']), exact=True)
    tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2]))

    res, indexer = CategoricalIndex(values).reindex(Categorical(target))
    expected = CategoricalIndex(['a', 'a', 'c'], categories=['a', 'c'])
    tm.assert_index_equal(res, expected, exact=True)
    tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2]))

    res, indexer = CategoricalIndex(values, categories=wide).reindex(target)
    expected = Index(['a', 'a', 'c'], dtype='object')
    tm.assert_index_equal(res, expected, exact=True)
    tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2]))

    res, indexer = CategoricalIndex(
        values, categories=wide).reindex(Categorical(target))
    expected = CategoricalIndex(['a', 'a', 'c'], categories=['a', 'c'])
    tm.assert_index_equal(res, expected, exact=True)
    tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2]))
def test_categorical(self):
    # GH 8974
    from pandas import Categorical, Series

    candidates = (
        Categorical(list('abc')),
        Categorical(list('abc'), categories=['cegfab'], ordered=True),
    )
    # both the bare categorical and a Series wrapping it must be inferred
    # as 'categorical'
    for arr in candidates:
        self.assertEqual(lib.infer_dtype(arr), 'categorical')
        self.assertEqual(lib.infer_dtype(Series(arr)), 'categorical')
def setUp(self):
    # Fixed seed so the random fixtures are reproducible.
    np.random.seed(24)
    self.s = DataFrame({'A': np.random.permutation(range(6))})
    self.df = DataFrame({'A': [0, 1], 'B': np.random.randn(2)})

    # two separate identity callables
    self.f = lambda x: x
    self.g = lambda x: x

    def h(x, foo='bar'):
        # one CSS colour declaration, indexed and named like ``x``
        return pd.Series(['color: %s' % foo], index=x.index, name=x.name)

    self.h = h
    self.styler = Styler(self.df)
    self.attrs = pd.DataFrame({'A': ['color: red', 'color: blue']})

    # float, object and categorical columns side by side
    mixed = pd.DataFrame({'f': [1., 2.],
                          'o': ['a', 'b'],
                          'c': pd.Categorical(['a', 'b'])})
    self.dataframes = [self.df, mixed]
def test_to_csv_from_csv_categorical(self):
    # CSV with categoricals should result in the same output as when one
    # would add a "normal" Series/DataFrame.
    def csv_text(obj):
        # serialise ``obj`` to CSV and hand back the text
        buf = StringIO()
        obj.to_csv(buf)
        return buf.getvalue()

    letters = ['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']
    s = Series(pd.Categorical(letters))
    s2 = Series(letters)
    self.assertEqual(csv_text(s), csv_text(s2))

    df = DataFrame({"s": s})
    df2 = DataFrame({"s": s2})
    self.assertEqual(csv_text(df), csv_text(df2))
def test_dataframe_dummies_with_categorical(self):
    # A categorical column is expanded into dummy columns alongside the
    # other encoded columns of the fixture frame.
    df = self.df
    df['cat'] = pd.Categorical(['x', 'y', 'y'])
    result = get_dummies(df, sparse=self.sparse)

    column_order = ['C', 'A_a', 'A_b', 'B_b', 'B_c', 'cat_x', 'cat_y']
    expected = DataFrame({'C': [1, 2, 3],
                          'A_a': [1., 0, 1],
                          'A_b': [0., 1, 0],
                          'B_b': [1., 1, 0],
                          'B_c': [0., 0, 1],
                          'cat_x': [1., 0, 0],
                          'cat_y': [0., 1, 1]})
    expected = expected[column_order]
    assert_frame_equal(result, expected)

# GH12402 Add a new parameter `drop_first` to avoid collinearity
def test_unexpected_keyword(self):
    # GH8597
    df = DataFrame(np.random.randn(5, 2), columns=['jim', 'joe'])
    ca = pd.Categorical([0, 0, 2, 2, 3, np.nan])
    ts = df['joe'].copy()
    ts[2] = np.nan

    def rejects_keyword():
        # every call below must fail with the same TypeError message
        return assertRaisesRegexp(TypeError, 'unexpected keyword')

    with rejects_keyword():
        df.drop('joe', axis=1, in_place=True)

    with rejects_keyword():
        df.reindex([1, 0], inplace=True)

    with rejects_keyword():
        ca.fillna(0, inplace=True)

    with rejects_keyword():
        ts.fillna(0, in_place=True)

# See gh-12301
def test_describe_typefiltering_category_bool(self):
    # describe() over a frame mixing category, string, bool, numeric and
    # timestamp columns, with different ``include`` selections.
    df = DataFrame({'A_cat': pd.Categorical(['foo', 'foo', 'bar'] * 8),
                    'B_str': ['a', 'b', 'c', 'd'] * 6,
                    'C_bool': [True] * 12 + [False] * 12,
                    'D_num': np.arange(24.) + .5,
                    'E_ts': tm.makeTimeSeries()[:24].index})

    # the default selection is expected to keep only the numeric column
    desc = df.describe()
    expected_cols = ['D_num']
    per_column = {k: df[k].describe() for k in expected_cols}
    expected = DataFrame(per_column, columns=expected_cols)
    assert_frame_equal(desc, expected)

    desc = df.describe(include=["category"])
    self.assertTrue(desc.columns.tolist() == ["A_cat"])

    # 'all' includes numpy-dtypes + category
    desc1 = df.describe(include="all")
    desc2 = df.describe(include=[np.generic, "category"])
    assert_frame_equal(desc1, desc2)
def test_setitem(self):
    # int/positional
    c = self.factor.copy()
    for position, value in ((0, 'b'), (-1, 'a')):
        c[position] = value
        self.assertEqual(c[position], value)

    # boolean
    c = self.factor.copy()
    mask = np.zeros(len(c), dtype='bool')
    mask[0] = True
    mask[-1] = True
    c[mask] = 'c'

    expected = Categorical.from_array(
        ['c', 'b', 'b', 'a', 'a', 'c', 'c', 'c'], ordered=True)
    self.assert_categorical_equal(c, expected)
def test_constructor_unsortable(self):
    # it works!
    arr = np.array([1, 2, 3, datetime.now()], dtype='O')
    factor = Categorical.from_array(arr, ordered=False)
    self.assertFalse(factor.ordered)

    def build_ordered():
        return Categorical.from_array(arr, ordered=True)

    # this however will raise as cannot be sorted (on PY3 or older
    # numpies)
    if compat.PY3 or LooseVersion(np.__version__) < "1.10":
        self.assertRaises(TypeError, build_ordered)
    else:
        build_ordered()
def test_is_equal_dtype(self):
    # test dtype comparisons between cats
    values = list('aabca')
    c1 = Categorical(values, categories=list('abc'), ordered=False)
    c2 = Categorical(values, categories=list('cab'), ordered=False)
    c3 = Categorical(values, categories=list('cab'), ordered=True)

    # every categorical matches itself
    for cat in (c1, c2, c3):
        self.assertTrue(cat.is_dtype_equal(cat))

    for other in (c2, c3, Index(list('aabca')), c1.astype(object)):
        self.assertFalse(c1.is_dtype_equal(other))

    self.assertTrue(c1.is_dtype_equal(CategoricalIndex(c1)))

    for other in (CategoricalIndex(c1, categories=list('cab')),
                  CategoricalIndex(c1, ordered=True)):
        self.assertFalse(c1.is_dtype_equal(other))
def test_constructor_with_generator(self):
    # This was raising an Error in isnull(single_val).any() because isnull
    # returned a scalar for a generator
    xrange = range
    exp = Categorical([0, 1, 2])

    from_generator = Categorical((x for x in [0, 1, 2]))
    self.assertTrue(from_generator.equals(exp))

    from_range = Categorical(xrange(3))
    self.assertTrue(from_range.equals(exp))

    # This uses xrange internally
    from pandas.core.index import MultiIndex
    MultiIndex.from_product([range(5), ['a', 'b', 'c']])

    # check that categories accept generators and sequences
    cat = pd.Categorical([0, 1, 2], categories=(x for x in [0, 1, 2]))
    self.assertTrue(cat.equals(exp))

    cat = pd.Categorical([0, 1, 2], categories=xrange(3))
    self.assertTrue(cat.equals(exp))
def test_empty_print(self):
    # repr of empty categoricals: unordered, ordered and without categories
    labels = ["a", "b", "c"]

    factor = Categorical([], labels)
    expected = ("[], Categories (3, object): [a, b, c]")
    # hack because array_repr changed in numpy > 1.6.x
    actual = repr(factor)
    self.assertEqual(actual, expected)
    self.assertEqual(expected, actual)

    factor = Categorical([], labels, ordered=True)
    expected = ("[], Categories (3, object): [a < b < c]")
    actual = repr(factor)
    self.assertEqual(expected, actual)

    factor = Categorical([], [])
    expected = ("[], Categories (0, object): []")
    self.assertEqual(expected, repr(factor))
def test_categories_assigments(self):
    # Assigning new categories of equal length relabels the values;
    # a longer or shorter list must raise ValueError.
    s = pd.Categorical(["a", "b", "c", "a"])
    exp = np.array([1, 2, 3, 1])
    s.categories = [1, 2, 3]
    self.assert_numpy_array_equal(s.__array__(), exp)
    self.assert_numpy_array_equal(s.categories, np.array([1, 2, 3]))

    # lengthen
    with self.assertRaises(ValueError):
        s.categories = [1, 2, 3, 4]

    # shorten
    with self.assertRaises(ValueError):
        s.categories = [1, 2]
def test_ordered_api(self):
    # GH 9347
    # (constructor keywords, expected categories, expected ordered flag)
    cases = [
        ({'ordered': False}, ['a', 'b', 'c'], False),
        ({'categories': ['b', 'c', 'a'], 'ordered': False},
         ['b', 'c', 'a'], False),
        ({'ordered': True}, ['a', 'b', 'c'], True),
        ({'categories': ['b', 'c', 'a'], 'ordered': True},
         ['b', 'c', 'a'], True),
    ]
    for kwargs, categories, is_ordered in cases:
        cat = pd.Categorical(["a", "c", "b"], **kwargs)
        self.assertTrue(cat.categories.equals(Index(categories)))
        if is_ordered:
            self.assertTrue(cat.ordered)
        else:
            self.assertFalse(cat.ordered)
def test_remove_categories(self):
    cat = Categorical(["a", "b", "c", "a"], ordered=True)
    old = cat.copy()
    new = Categorical(["a", "b", np.nan, "a"], categories=["a", "b"],
                      ordered=True)

    # first inplace == False
    for removal in ("c", ["c"]):
        res = cat.remove_categories(removal)
        self.assert_categorical_equal(cat, old)
        self.assert_categorical_equal(res, new)

    # inplace == True
    res = cat.remove_categories("c", inplace=True)
    self.assert_categorical_equal(cat, new)
    self.assertIsNone(res)

    # removal is not in categories
    with self.assertRaises(ValueError):
        cat.remove_categories(["c"])
def test_shift(self):
    # GH 9416
    cat = pd.Categorical(['a', 'b', 'c', 'd', 'a'])

    # shift forward
    forward = cat.shift(1)
    forward_expected = pd.Categorical([np.nan, 'a', 'b', 'c', 'd'])
    self.assert_categorical_equal(forward, forward_expected)
    self.assert_categorical_equal(cat[:-1], forward[1:])

    # shift back
    backward = cat.shift(-2)
    backward_expected = pd.Categorical(
        ['c', 'd', 'a', np.nan, np.nan], categories=['a', 'b', 'c', 'd'])
    self.assert_categorical_equal(backward, backward_expected)
    self.assert_categorical_equal(cat[2:], backward[:-2])

    # shift by zero
    self.assert_categorical_equal(cat, cat.shift(0))
def test_dtypes(self):
    # GH8143
    index = ['cat', 'obj', 'num']
    cat = pd.Categorical(['a', 'b', 'c'])
    obj = pd.Series(['a', 'b', 'c'])
    num = pd.Series([1, 2, 3])
    df = pd.concat([pd.Series(cat), obj, num], axis=1, keys=index)

    # comparing dtypes with a dtype name must flag only the matching column
    checks = (
        ('object', [False, True, False]),
        ('int64', [False, False, True]),
        ('category', [True, False, False]),
    )
    for dtype_name, flags in checks:
        result = df.dtypes == dtype_name
        expected = Series(flags, index=index)
        tm.assert_series_equal(result, expected)
def test_codes_dtypes(self):
    # GH 8453
    def labels(count):
        return ['foo%05d' % i for i in range(count)]

    result = Categorical(['foo', 'bar', 'baz'])
    self.assertTrue(result.codes.dtype == 'int8')

    result = Categorical(labels(400))
    self.assertTrue(result.codes.dtype == 'int16')

    result = Categorical(labels(40000))
    self.assertTrue(result.codes.dtype == 'int32')

    # adding cats
    result = Categorical(['foo', 'bar', 'baz'])
    self.assertTrue(result.codes.dtype == 'int8')
    result = result.add_categories(labels(400))
    self.assertTrue(result.codes.dtype == 'int16')

    # removing cats
    result = result.remove_categories(labels(300))
    self.assertTrue(result.codes.dtype == 'int8')
def test_reshaping(self):
    # Unstacking a categorical column of a panel-derived frame.
    p = tm.makePanel()
    p['str'] = 'foo'
    df = p.to_frame()
    df['category'] = df['str'].astype('category')
    result = df['category'].unstack()

    c = Categorical(['foo'] * len(p.major_axis))
    columns = {letter: c.copy() for letter in 'ABCD'}
    expected = DataFrame(columns,
                         columns=Index(list('ABCD'), name='minor'),
                         index=p.major_axis.set_names('major'))
    tm.assert_frame_equal(result, expected)
def test_reindex(self):
    index = pd.date_range('20000101', periods=3)
    categories = ['a', 'b', 'c']

    # reindexing to an invalid Categorical
    s = Series(['a', 'b', 'c'], dtype='category')
    result = s.reindex(index)
    expected = Series(Categorical(values=[np.nan, np.nan, np.nan],
                                  categories=categories))
    expected.index = index
    tm.assert_series_equal(result, expected)

    # partial reindexing
    expected = Series(Categorical(values=['b', 'c'],
                                  categories=categories))
    expected.index = [1, 2]
    result = s.reindex([1, 2])
    tm.assert_series_equal(result, expected)

    expected = Series(Categorical(values=['c', np.nan],
                                  categories=categories))
    expected.index = [2, 3]
    result = s.reindex([2, 3])
    tm.assert_series_equal(result, expected)
def test_nan_handling(self):
    # Nans are represented as -1 in labels
    s = Series(Categorical(["a", "b", np.nan, "a"]))
    self.assert_numpy_array_equal(s.cat.categories, np.array(["a", "b"]))
    self.assert_numpy_array_equal(s.values.codes, np.array([0, 1, -1, 0]))

    with_nan = np.array(["a", "b", np.nan], dtype=np.object_)

    # If categories have nan included, the label should point to that
    # instead
    with tm.assert_produces_warning(FutureWarning):
        s2 = Series(Categorical(["a", "b", np.nan, "a"],
                                categories=["a", "b", np.nan]))
    self.assert_numpy_array_equal(s2.cat.categories, with_nan)
    self.assert_numpy_array_equal(s2.values.codes, np.array([0, 1, 2, 0]))

    # Changing categories should also make the replaced category np.nan
    s3 = Series(Categorical(["a", "b", "c", "a"]))
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        s3.cat.categories = ["a", "b", np.nan]
    self.assert_numpy_array_equal(s3.cat.categories, with_nan)
    self.assert_numpy_array_equal(s3.values.codes, np.array([0, 1, 2, 0]))
def test_sequence_like(self):
    # GH 7839
    # make sure can iterate
    df = DataFrame({"id": [1, 2, 3, 4, 5, 6],
                    "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
    df['grade'] = Categorical(df['raw_grade'])

    # basic sequencing testing
    result = list(df.grade.values)
    expected = np.array(df.grade.values).tolist()
    tm.assert_almost_equal(result, expected)

    # iteration
    for t in df.itertuples(index=False):
        str(t)

    for row, s in df.iterrows():
        str(s)

    # Stringify the column being iterated; this loop used to re-use the
    # stale row ``s`` from the loop above, so columns were never exercised.
    for c, col in df.iteritems():
        str(col)
def test_describe(self):
    # Categoricals should not show up together with numerical columns
    result = self.cat.describe()
    self.assertEqual(len(result.columns), 1)

    # In a frame, describe() for the cat should be the same as for string
    # arrays (count, unique, top, freq)
    cat = Categorical(["a", "b", "b", "b"], categories=['a', 'b', 'c'],
                      ordered=True)
    result = Series(cat).describe()
    expected = Series([4, 2, "b", 3],
                      index=['count', 'unique', 'top', 'freq'])
    tm.assert_series_equal(result, expected)

    letters = ["a", "b", "c", "c"]
    cat = pd.Series(pd.Categorical(letters))
    df3 = pd.DataFrame({"cat": cat, "s": ["a", "b", "c", "c"]})
    res = df3.describe()
    self.assert_numpy_array_equal(res["cat"].values, res["s"].values)
def test_repr(self):
    # The expected strings must reproduce the column padding of the Series
    # repr; with the padding collapsed to single spaces the comparisons
    # can never match.
    # NOTE(review): padding widths reconstructed from the usual Series
    # repr layout -- confirm against the pandas version under test.
    a = pd.Series(pd.Categorical([1, 2, 3, 4]))
    exp = u("0    1\n1    2\n2    3\n3    4\n" +
            "dtype: category\nCategories (4, int64): [1, 2, 3, 4]")

    self.assertEqual(exp, a.__unicode__())

    a = pd.Series(pd.Categorical(["a", "b"] * 25))
    exp = u("0     a\n1     b\n" + "     ..\n" + "48    a\n49    b\n" +
            "dtype: category\nCategories (2, object): [a, b]")
    with option_context("display.max_rows", 5):
        self.assertEqual(exp, repr(a))

    levs = list("abcdefghijklmnopqrstuvwxyz")
    a = pd.Series(pd.Categorical(
        ["a", "b"], categories=levs, ordered=True))
    exp = u("0    a\n1    b\n" + "dtype: category\n"
            "Categories (26, object): [a < b < c < d ... w < x < y < z]")
    self.assertEqual(exp, a.__unicode__())
def test_categorical_series_repr(self):
    # Expected values are multi-line reprs: one row per element followed by
    # the dtype line and the categories line. Line breaks and padding are
    # part of the expected text.
    # NOTE(review): layout reconstructed from the usual Series repr --
    # confirm against the pandas version under test.
    s = pd.Series(pd.Categorical([1, 2, 3]))
    exp = """0    1
1    2
2    3
dtype: category
Categories (3, int64): [1, 2, 3]"""

    self.assertEqual(repr(s), exp)

    s = pd.Series(pd.Categorical(np.arange(10)))
    exp = """0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: category
Categories (10, int64): [0, 1, 2, 3, ..., 6, 7, 8, 9]"""

    self.assertEqual(repr(s), exp)
def test_categorical_series_repr_ordered(self):
    # Same layout as the unordered case, but the categories line joins the
    # categories with ' < '. Line breaks and padding are part of the
    # expected text.
    # NOTE(review): layout reconstructed from the usual Series repr --
    # confirm against the pandas version under test.
    s = pd.Series(pd.Categorical([1, 2, 3], ordered=True))
    exp = """0    1
1    2
2    3
dtype: category
Categories (3, int64): [1 < 2 < 3]"""

    self.assertEqual(repr(s), exp)

    s = pd.Series(pd.Categorical(np.arange(10), ordered=True))
    exp = """0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: category
Categories (10, int64): [0 < 1 < 2 < 3 ... 6 < 7 < 8 < 9]"""

    self.assertEqual(repr(s), exp)
def test_categorical_series_repr_period_ordered(self):
    # Expected values are multi-line reprs; line breaks and padding are
    # part of the expected text.
    # NOTE(review): the three-space padding and the wrapped categories
    # line of the hourly case are reconstructed -- confirm against the
    # pandas version under test.
    idx = pd.period_range('2011-01-01 09:00', freq='H', periods=5)
    s = pd.Series(pd.Categorical(idx, ordered=True))
    exp = """0   2011-01-01 09:00
1   2011-01-01 10:00
2   2011-01-01 11:00
3   2011-01-01 12:00
4   2011-01-01 13:00
dtype: category
Categories (5, period): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 <
                         2011-01-01 13:00]"""

    self.assertEqual(repr(s), exp)

    idx = pd.period_range('2011-01', freq='M', periods=5)
    s = pd.Series(pd.Categorical(idx, ordered=True))
    exp = """0   2011-01
1   2011-02
2   2011-03
3   2011-04
4   2011-05
dtype: category
Categories (5, period): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]"""

    self.assertEqual(repr(s), exp)
def test_mode(self):
    # mode() of an ordered categorical Series keeps categories and order.
    categories = [5, 4, 3, 2, 1]

    # (input values, expected modes)
    cases = (
        ([1, 1, 2, 4, 5, 5, 5], [5]),
        ([1, 1, 1, 4, 5, 5, 5], [5, 1]),
        ([1, 2, 3, 4, 5], []),
    )
    for values, modes in cases:
        s = Series(Categorical(values, categories=categories,
                               ordered=True))
        res = s.mode()
        exp = Series(Categorical(modes, categories=categories,
                                 ordered=True))
        tm.assert_series_equal(res, exp)
def test_slicing(self):
    # Slicing a categorical Series and rows of a frame holding a
    # pd.cut column.
    cat = Series(Categorical([1, 2, 3, 4]))
    flipped = cat[::-1]
    exp = np.array([4, 3, 2, 1])
    self.assert_numpy_array_equal(flipped.__array__(), exp)

    bins = [0, 25, 50, 75, 100]
    df = DataFrame({'value': (np.arange(100) + 1).astype('int64')})
    df['D'] = pd.cut(df.value, bins=bins)

    # single row by position
    expected = Series([11, '(0, 25]'], index=['value', 'D'], name=10)
    result = df.iloc[10]
    tm.assert_series_equal(result, expected)

    # positional slice of rows
    expected = DataFrame({'value': np.arange(11, 21).astype('int64')},
                         index=np.arange(10, 20).astype('int64'))
    expected['D'] = pd.cut(expected.value, bins=bins)
    result = df.iloc[10:20]
    tm.assert_frame_equal(result, expected)

    # single row by label
    expected = Series([9, '(0, 25]'], index=['value', 'D'], name=8)
    result = df.loc[8]
    tm.assert_series_equal(result, expected)
def test_append(self):
    # Appending a frame to itself keeps the categorical column.
    df = pd.DataFrame({
        "cats": pd.Categorical(["a", "b"], categories=["a", "b"]),
        "vals": [1, 2],
    })

    doubled = pd.Categorical(["a", "b", "a", "b"], categories=["a", "b"])
    exp = pd.DataFrame({"cats": doubled, "vals": [1, 2, 1, 2]},
                       index=pd.Index([0, 1, 0, 1]))
    res = df.append(df)
    tm.assert_frame_equal(exp, res)

    # Concat should raise if the two categoricals do not have the same
    # categories
    other = pd.Categorical(["a", "b"], categories=["a", "b", "c"])
    df_wrong_categories = pd.DataFrame({"cats": other, "vals": [1, 2]})

    with self.assertRaises(ValueError):
        df.append(df_wrong_categories)
def test_pickle_v0_14_1(self):
    # we have the name warning
    # 10482
    with tm.assert_produces_warning(UserWarning):
        cat = pd.Categorical(values=['a', 'b', 'c'],
                             categories=['a', 'b', 'c', 'd'],
                             name='foobar', ordered=False)

    data_dir = tm.get_data_path()
    pickle_path = os.path.join(data_dir, 'categorical_0_14_1.pickle')

    # This code was executed once on v0.14.1 to generate the pickle:
    #
    # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'],
    #                   name='foobar')
    # with open(pickle_path, 'wb') as f: pickle.dump(cat, f)
    #
    loaded = pd.read_pickle(pickle_path)
    self.assert_categorical_equal(cat, loaded)
def test_concat_categorical(self):
    # See GH 10177
    labels = ["one", "one", "two", "one", "two", "two", "one"]

    df1 = pd.DataFrame(np.arange(18, dtype='int64').reshape(6, 3),
                       columns=["a", "b", "c"])
    df2 = pd.DataFrame(np.arange(14, dtype='int64').reshape(7, 2),
                       columns=["a", "c"])
    df2['h'] = pd.Series(pd.Categorical(labels))

    df_concat = pd.concat((df1, df2), axis=0).reset_index(drop=True)

    # rows coming from df1 have no 'h', rows coming from df2 have no 'b'
    df_expected = pd.DataFrame({
        'a': [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12],
        'b': [1, 4, 7, 10, 13, 16] + [np.nan] * 7,
        'c': [2, 5, 8, 11, 14, 17, 1, 3, 5, 7, 9, 11, 13],
    })
    df_expected['h'] = pd.Series(pd.Categorical([None] * 6 + labels))

    tm.assert_frame_equal(df_expected, df_concat)
def test_categorical(model_and_func):
    # Fit the same specification three ways (model.from_formula, the
    # formula function and the array interface) and compare R-squared.
    formula = 'y ~ 1 + d + x1'
    y = np.random.randn(1000)
    x1 = np.random.randn(1000)
    d = pd.Categorical(np.random.randint(0, 4, 1000))
    data = pd.DataFrame({'y': y, 'x1': x1, 'd': d})
    data['Intercept'] = 1.0

    model, func = model_and_func
    mod = model.from_formula(formula, data)
    res3 = mod.fit()
    res2 = func(formula, data).fit()
    exog = data[['Intercept', 'x1', 'd']]
    res = model(data.y, exog, None, None).fit()

    assert_allclose(res.rsquared, res2.rsquared)
    assert_allclose(res2.rsquared, res3.rsquared)
    assert mod.formula == formula
def test_mixed_input(data):
    # Effects and clusters given as a mix of categorical, string and
    # integer columns.
    y = PanelData(data.y)
    nt = y.values2d.shape[0]

    effects = np.random.randint(0, 5, size=nt)
    prim = ['a', 'b', 'c', 'd', 'e']
    temp = {}
    temp['effect.0'] = pd.Categorical(pd.Series(effects, index=y.index))
    temp['effect.1'] = pd.Series(np.random.choice(prim, size=nt),
                                 index=y.index)
    effects = pd.DataFrame(temp, index=y.index)
    mod = PanelOLS(data.y, data.x, other_effects=effects)
    mod.fit()

    clusters = np.random.randint(0, y.shape[2] // 2, size=(nt, 2))
    temp = {}
    # every two-letter lowercase combination
    prim = [''.join(pair)
            for pair in product(ascii_lowercase, ascii_lowercase)]
    temp['var.cluster.0'] = pd.Series(np.random.choice(prim, size=nt),
                                      index=y.index)
    temp['var.cluster.1'] = pd.Series(clusters[:, 1], index=y.index)
    clusters = pd.DataFrame(temp, index=y.index)
    mod.fit(cov_type='clustered', clusters=clusters)
def test_general_demean_oneway(panel):
    # general_demean with one grouping column must agree with the
    # dedicated entity/time demeaning and with a dummy-variable projection.
    y = PanelData(panel)

    expected = y.demean('entity')
    g = pd.DataFrame(y.entity_ids, index=y.index)
    actual = y.general_demean(g)
    assert_allclose(expected.values2d, actual.values2d)

    expected = y.demean('time')
    g = pd.DataFrame(y.time_ids, index=y.index)
    actual = y.general_demean(g)
    assert_allclose(expected.values2d, actual.values2d)

    # random groups: compare with residuals of a regression on dummies
    g = pd.DataFrame(np.random.randint(0, 10, g.shape), index=y.index)
    actual = y.general_demean(g)
    g = pd.Categorical(g.iloc[:, 0])
    d = pd.get_dummies(g)
    beta = np.linalg.lstsq(d, y.values2d)[0]
    expected = y.values2d - d @ beta
    assert_allclose(expected, actual.values2d)
def test_general_demean_twoway(panel):
    # general_demean with two grouping columns must agree with
    # demean('both') and with a dummy-variable projection.
    y = PanelData(panel)

    expected = y.demean('both')
    g = pd.DataFrame(y.entity_ids, index=y.index)
    g['column2'] = pd.Series(y.time_ids.squeeze(), index=y.index)
    actual = y.general_demean(g)
    assert_allclose(expected.values2d, actual.values2d)

    # random groups: compare with residuals of a regression on dummies
    g = pd.DataFrame(np.random.randint(0, 10, g.shape), index=y.index)
    actual = y.general_demean(g)
    d1 = pd.get_dummies(pd.Categorical(g.iloc[:, 0]))
    # drop_first is applied to the second set of dummies only
    d2 = pd.get_dummies(pd.Categorical(g.iloc[:, 1]), drop_first=True)
    d = np.c_[d1, d2]
    beta = np.linalg.lstsq(d, y.values2d)[0]
    expected = y.values2d - d @ beta
    assert_allclose(expected - actual.values2d,
                    np.zeros_like(actual.values2d),
                    atol=1e-7)
def test_general_weighted_demean_oneway(panel):
    # Weighted general_demean with one grouping column must agree with the
    # weighted entity/time demeaning and with a weighted dummy projection.
    y = PanelData(panel)
    draws = np.random.chisquare(10, (y.dataframe.shape[0], 1)) / 10
    w = PanelData(pd.DataFrame(draws, index=y.index))

    expected = y.demean('entity', weights=w)
    g = PanelData(pd.DataFrame(y.entity_ids, index=y.index))
    actual = y.general_demean(g, w)
    assert_allclose(expected.values2d, actual.values2d)

    expected = y.demean('time', weights=w)
    g = PanelData(pd.DataFrame(y.time_ids, index=y.index))
    actual = y.general_demean(g, w)
    assert_allclose(expected.values2d, actual.values2d)

    # random groups: compare with residuals of a weighted regression
    random_ids = np.random.randint(0, 10, g.dataframe.shape)
    g = PanelData(pd.DataFrame(random_ids, index=y.index))
    actual = y.general_demean(g, w)
    g = pd.Categorical(g.dataframe.iloc[:, 0])
    d = pd.get_dummies(g)
    root_w = np.sqrt(w.values2d)
    wd = root_w * d
    wy = np.sqrt(w.values2d) * y.values2d
    expected = wy - wd @ np.linalg.lstsq(wd, wy)[0]
    assert_allclose(expected, actual.values2d, atol=1e-14)
def test_general_unit_weighted_demean_twoway(panel):
    # Weighted general_demean with two grouping columns must agree with
    # the weighted demean('both') and with a weighted dummy projection.
    np.random.seed(12345)
    y = PanelData(panel)
    draws = np.random.chisquare(10, (y.dataframe.shape[0], 1)) / 10
    w = PanelData(pd.DataFrame(draws, index=y.index))

    expected = y.demean('both', weights=w)
    g = pd.DataFrame(y.entity_ids, index=y.index)
    g['column2'] = pd.Series(y.time_ids.squeeze(), index=y.index)
    actual = y.general_demean(g, weights=w)
    assert_allclose(expected.values2d - actual.values2d,
                    np.zeros_like(actual.values2d),
                    atol=1e-7)

    # random groups: compare with residuals of a weighted regression
    g = pd.DataFrame(np.random.randint(0, 10, g.shape), index=y.index)
    actual = y.general_demean(g, weights=w)
    d1 = pd.get_dummies(pd.Categorical(g.iloc[:, 0]))
    # drop_first is applied to the second set of dummies only
    d2 = pd.get_dummies(pd.Categorical(g.iloc[:, 1]), drop_first=True)
    d = np.c_[d1, d2]
    wd = np.sqrt(w.values2d) * d
    wy = np.sqrt(w.values2d) * y.values2d
    expected = wy - wd @ np.linalg.lstsq(wd, wy)[0]
    assert_allclose(expected - actual.values2d,
                    np.zeros_like(actual.values2d),
                    atol=1e-7)
def filtered_table(table,
                   v_gene_coverage,  # at least
                   j_gene_coverage,  # at least
                   v_gene_evalue,  # at most
                   ):
    """
    Discard the following rows in the table:
    - no J assigned
    - stop codon found
    - V gene coverage less than v_gene_coverage
    - J gene coverage less than j_gene_coverage
    - V gene E-value greater than v_gene_evalue

    The input table is not modified. The 'V_gene' column of the result is
    converted to a categorical.

    Return a tuple (filtered table, FilteringStatistics). The statistics
    record the row count before filtering and after each filtering step.
    """
    stats = FilteringStatistics()
    stats.n = len(table)

    # Both V and J must be assigned
    # (Note V_gene and J_gene columns use empty strings instead of NA)
    assigned = (table['V_gene'] != '') & (table['J_gene'] != '')
    # Take an explicit copy: the column assignment below must act on an
    # independent frame, never on a selection tied to the caller's table.
    filtered = table[assigned].copy()
    stats.vjassigned = len(filtered)
    filtered['V_gene'] = pd.Categorical(filtered['V_gene'])

    # Filter out sequences that have a stop codon
    filtered = filtered[filtered.stop == 'no']
    stats.stop = len(filtered)

    # Filter out sequences with a too high V gene hit E-value
    filtered = filtered[filtered.V_evalue <= v_gene_evalue]
    stats.v_evalue = len(filtered)

    # Filter out sequences with too low V gene coverage
    filtered = filtered[filtered.V_covered >= v_gene_coverage]
    stats.v_coverage = len(filtered)

    # Filter out sequences with too low J gene coverage
    filtered = filtered[filtered.J_covered >= j_gene_coverage]
    stats.j_coverage = len(filtered)

    return filtered, stats
def generate_agents(df, country, population):
    """
    Build a DataFrame holding ``population`` agents for ``country``.

    ``df`` is indexed by country and supplies the "GDP", "Unemployment"
    and "Fertility" values used to draw each agent's income, employment
    status and attachment. The random generator is seeded with 0.
    """
    # Turn this on for truly random output from each process.
    # pid = mp.current_process()._identity[0]
    rand = np.random.mtrand.RandomState(0)

    country_data = df[df.index == country].to_dict("records")[0]
    countries = list(df.index)

    gdp = country_data["GDP"]
    income_array = gdp / 10 * rand.chisquare(10, population).astype('float32')

    unemployment_rate = float(country_data["Unemployment"] / 100.0)
    employment_array = rand.choice(
        [True, False], population,
        p=[1 - unemployment_rate, unemployment_rate])

    # fertility-scaled draw, normalised by the largest fertility in df
    attachment_array = (country_data["Fertility"] *
                        rand.triangular(0.0, 0.5, 1.0, population) /
                        df["Fertility"].max()).astype('float32')

    columns = {
        "Country": pd.Categorical([country] * population, countries),
        "Income": income_array,
        "Employed": employment_array.astype('bool'),
        "Attachment": attachment_array,
        "Location": pd.Categorical([country] * population, countries),
        "Migration": 0,
    }
    return pd.DataFrame(columns, columns=world_columns)
def y_transform(Y, data, flatten):
    """
    Transform the target column ``data[Y]`` according to ``flatten``.

    Parameters
    ----------
    Y : column label
        Name of the target column in ``data``.
    data : pd.DataFrame
        Frame that holds the target column.
    flatten : str, int or float
        * ``'mean'``, ``'median'``, ``'mode'``: boolean frame that is True
          where the value is >= that statistic.
        * integer (Python or NumPy): True where the value is >= flatten.
        * float (Python or NumPy): True where the value is >= the
          ``flatten`` quantile of the column.
        * ``'cat_string'``: integer codes of the column as a categorical.
        * ``'cat_numeric'``: integer codes of the column cut into (up to)
          five quantile bins.
        * ``'none'``: the column wrapped in a DataFrame.

    Returns
    -------
    pd.DataFrame for every option listed above. Any other value of
    ``flatten`` (including booleans) returns ``data[Y]`` unchanged.
    """
    import numbers

    df_y = data[Y]

    if isinstance(flatten, str):
        # below is for case where prediction is true or false
        # but the y-feature is in different format (e.g continuous)
        if flatten == 'mean':
            return pd.DataFrame(df_y >= df_y.mean())
        if flatten == 'median':
            return pd.DataFrame(df_y >= df_y.median())
        if flatten == 'mode':
            return pd.DataFrame(df_y >= df_y.mode()[0])

        # below is for case where the y-feature is converted in
        # to a categorical, either if it's a number or string.
        if flatten == 'cat_string':
            codes = pd.Series(pd.Categorical(df_y)).cat.codes
            return pd.DataFrame(codes)
        if flatten == 'cat_numeric':
            binned = pd.qcut(df_y, 5, duplicates='drop')
            return pd.DataFrame(pd.Series(binned).cat.codes)

        # for cases when y-feature is already in the format
        # where the prediction output will be.
        if flatten == 'none':
            return pd.DataFrame(df_y)
        return df_y

    # bool is an Integral subclass but was never treated as a threshold;
    # keep returning the column unchanged for it.
    if isinstance(flatten, bool):
        return df_y

    # The numbers ABCs also match NumPy scalars (np.int64, np.float32, ...)
    # which an exact ``type(x) == int`` comparison silently ignored.
    if isinstance(flatten, numbers.Integral):
        # integer input: "greater than or equal to value"
        return pd.DataFrame(df_y >= flatten)
    if isinstance(flatten, numbers.Real):
        # float input: threshold at that quantile of the column
        return pd.DataFrame(df_y >= df_y.quantile(flatten))

    return df_y
def get_scale(self, gg):
    """
    Create a scale
    """
    # A bit of introspection spares users a scale mismatch error, which
    # can occur when the aesthetic is mapped to a categorical while the
    # limits are not given in categorical form. Only mappings that use an
    # expression to convert to categorical are handled, e.g.
    # `aes(color='factor(cyl)')`. A `'cyl'` column that is already
    # categorical and mapped as `aes(color='cyl')` will still result in
    # an error. If that case proves common enough, the data could be
    # inspected as well.
    ae = self.aesthetic
    series = pd.Series(self.limits)

    # Gather the string mappings of this aesthetic over all layers;
    # layers that do not map it are skipped.
    ae_values = []
    for layer in gg.layers:
        try:
            value = layer.mapping[ae]
        except KeyError:
            continue
        if isinstance(value, six.string_types):
            ae_values.append(value)

    # Any factor-like expression means the limits are converted to
    # categorical so that the right scale can be chosen. This should
    # take care of the most common use cases.
    if any('factor(' in value or 'Categorical(' in value
           for value in ae_values):
        series = pd.Categorical(series)

    return make_scale(self.aesthetic,
                      series,
                      limits=self.limits,
                      trans=self.trans)
def test_inverse_transform(self):
    # A transform / inverse_transform round trip must give back the input,
    # both from the transformed frame and from its ``.values``.
    frame = pd.DataFrame({"A": np.arange(10),
                          "B": pd.Categorical(['a'] * 4 + ['b'] * 6)})
    df = dd.from_pandas(frame, npartitions=2)

    de = dpp.DummyEncoder()
    de.fit(df)

    assert_eq_df(df, de.inverse_transform(de.transform(df)))
    assert_eq_df(df, de.inverse_transform(de.transform(df).values))