We extracted the following 50 code examples from open-source Python projects to illustrate how to use pandas.Index().
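Before the project examples, here is a minimal standalone sketch (not taken from any of the projects below) of the pandas.Index patterns they rely on most: building a named index, attaching it to a DataFrame, and using index set operations.

import pandas as pd

# Build a named index and attach it to a DataFrame.
idx = pd.Index(['a', 'b', 'c'], name='sample-id')
df = pd.DataFrame({'count': [1, 2, 3]}, index=idx)

# Indexes support set operations, which several examples below use.
other = pd.Index(['b', 'c', 'd'])
print(idx.union(other))        # Index(['a', 'b', 'c', 'd'], dtype='object')
print(other.difference(idx))   # Index(['d'], dtype='object')
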
def test_filter_to_numeric(self):
    index = pd.Index(['a', 'b', 'c'], dtype=object)
    df = pd.DataFrame({'col1': ['2', '1', '3'],
                       'col2': ['two', 'one', 'three']},
                      index=index, dtype=object)
    metadata = qiime2.Metadata(df)

    obs_df = metadata.filter(column_type='numeric').to_dataframe()

    exp_df = pd.DataFrame({'col1': [2, 1, 3]}, dtype=np.int, index=index)
    pdt.assert_frame_equal(obs_df, exp_df)

    df = pd.DataFrame({'col1': ['2', '1', '3'],
                       'col2': ['2', '1', 'three'],
                       'col3': ['4.0', '5.2', '6.9']},
                      index=index, dtype=object)
    metadata = qiime2.Metadata(df)

    obs_df = metadata.filter(column_type='numeric').to_dataframe()

    exp_df = pd.DataFrame({'col1': [2, 1, 3],
                           'col3': [4.0, 5.2, 6.9]}, index=index)
    pdt.assert_frame_equal(obs_df, exp_df)
    self.assertEqual(dict(obs_df.dtypes),
                     {'col1': np.int, 'col3': np.float})

def _add_field(self, field):
    """
    Adds a new field to the container.
    """
    # self.fields is already sorted, so we just need to insert the new
    # field in the correct index.
    ls = list(self.fields)
    insort_left(ls, field)
    self.fields = pd.Index(ls)
    # unset fillable fields cache
    self._ffillable_fields = None

    self._realign_fields()
    self.last_known_prior_values = self.last_known_prior_values.reindex(
        index=self.prior_values_index,
    )
    return field

def test_some_duplicates_in_category(self):
    columns = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (200, 1),
                                         (200, 2), ('pet', '')],
                                        names=['depth', 'iter'])
    data = pd.DataFrame(data=[[1, 2, 3, 4, 'russ'], [5, 6, 7, 8, 'milo'],
                              [9, 10, 11, 12, 'russ']],
                        columns=columns, index=['S1', 'S2', 'S3'])

    obs = _reindex_with_metadata('pet', ['pet'], data)

    exp_col = pd.MultiIndex(levels=[[1, 200, 'pet'], [1, 2, '']],
                            labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
                            names=['depth', 'iter'])
    exp_ind = pd.Index(['milo', 'russ'], name='pet')
    exp = pd.DataFrame(data=[[5, 6, 7, 8], [5, 6, 7, 8]],
                       columns=exp_col, index=exp_ind)

    pdt.assert_frame_equal(exp, obs[0])

    exp = pd.DataFrame(data=[[1, 1, 1, 1], [2, 2, 2, 2]],
                       columns=exp_col, index=exp_ind)

    pdt.assert_frame_equal(exp, obs[1])

def test_all_identical(self):
    columns = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (200, 1),
                                         (200, 2), ('pet', '')],
                                        names=['depth', 'iter'])
    data = pd.DataFrame(data=[[1, 2, 3, 4, 'russ'], [5, 6, 7, 8, 'russ'],
                              [9, 10, 11, 12, 'russ']],
                        columns=columns, index=['S1', 'S2', 'S3'])

    obs = _reindex_with_metadata('pet', ['pet'], data)

    exp_col = pd.MultiIndex(levels=[[1, 200, 'pet'], [1, 2, '']],
                            labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
                            names=['depth', 'iter'])
    exp_ind = pd.Index(['russ'], name='pet')
    exp = pd.DataFrame(data=[[5, 6, 7, 8]],
                       columns=exp_col, index=exp_ind)

    pdt.assert_frame_equal(exp, obs[0])

    exp = pd.DataFrame(data=[[3, 3, 3, 3]],
                       columns=exp_col, index=exp_ind)

    pdt.assert_frame_equal(exp, obs[1])

def cross_join(df1, df2):
    """
    Return a dataframe that is a cross between dataframes
    df1 and df2

    ref: https://github.com/pydata/pandas/issues/5401
    """
    if len(df1) == 0:
        return df2

    if len(df2) == 0:
        return df1

    # Add as lists so that the new index keeps the items in
    # the order that they are added together
    all_columns = pd.Index(list(df1.columns) + list(df2.columns))
    df1['key'] = 1
    df2['key'] = 1
    return pd.merge(df1, df2, on='key').loc[:, all_columns]

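For reference, a small usage sketch of the cross_join helper above (the frames are invented for illustration and assume cross_join is in scope): every row of the first frame is paired with every row of the second, and the pd.Index built from the combined column lists keeps the output column order while dropping the temporary 'key' column.

import pandas as pd

sizes = pd.DataFrame({'size': ['S', 'M']})
colors = pd.DataFrame({'color': ['red', 'blue']})

combos = cross_join(sizes, colors)
# 4 rows: (S, red), (S, blue), (M, red), (M, blue),
# with columns ordered ['size', 'color'].
print(combos)
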
def _split_sample(
        split: Callable[[object], bool], X: np.ndarray, y: np.ndarray
) -> Tuple[Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]:
    """
    Split X, y sample set in two with a split function

    :return: ((X_left, y_left), (X_right, y_right))
    """
    if split.type == 'numerical':  # compare strings with ==, not `is`
        left_indexes = X[:, split.attribute] < split.criteria
        right_indexes = ~left_indexes
    else:
        Z = (
            pd.Index(pd.unique(split.criteria))
            .get_indexer(X[:, split.attribute]))
        left_indexes = np.where(Z >= 0)[0]
        right_indexes = np.where(Z < 0)[0]

    left = X[left_indexes], y[left_indexes]
    right = X[right_indexes], y[right_indexes]
    return left, right

def get_dividend(self, order_book_id, adjusted=True):
    """
    Get dividend data for an instrument.

    :param str order_book_id: instrument id
    :param bool adjusted: whether to return adjusted dividends
    :return:
    """
    def fetchData(adjusted):
        if adjusted:
            mongo_data = self._adjusted_dividends[order_book_id].find({}, {"_id": 0})
        else:
            mongo_data = self._original_dividends[order_book_id].find({}, {"_id": 0})
        return mongo_data

    result = pd.DataFrame({
        'book_closure_date': pd.Index(pd.Timestamp(d['book_closure_date']) for d in fetchData(adjusted)),
        'ex_dividend_date': pd.Index(pd.Timestamp(d['ex_dividend_date']) for d in fetchData(adjusted)),
        'payable_date': pd.Index(pd.Timestamp(d['payable_date']) for d in fetchData(adjusted)),
        'dividend_cash_before_tax': [d['dividend_cash_before_tax'] for d in fetchData(adjusted)],
        'round_lot': [d['round_lot'] for d in fetchData(adjusted)]
    }, index=pd.Index(pd.Timestamp(d['announcement_date']) for d in fetchData(adjusted)))

    return result

def get_yield_curve(self, start_date, end_date, tenor):
    d1 = start_date.year * 10000 + start_date.month * 100 + start_date.day
    d2 = end_date.year * 10000 + end_date.month * 100 + end_date.day

    s = self._dates.searchsorted(d1)
    e = self._dates.searchsorted(d2, side='right')

    if e == len(self._dates):
        e -= 1
    if self._dates[e] == d2:
        # include end_date
        e += 1

    if e < s:
        return None

    df = pd.DataFrame(self._table[s:e])
    df.index = pd.Index(pd.Timestamp(str(d)) for d in df['date'])
    del df['date']

    df.rename(columns=lambda n: n[1:] + n[0], inplace=True)

    if tenor is not None:
        return df[tenor]
    return df

def fit_behavioral_data():
    """Fit a model for all subjects."""
    df = pd.read_pickle('data.pkl')
    subjects = df.index.get_level_values('subject').unique()
    data = np.empty((subjects.size, 10))
    cues = (0, 1)
    for i, subject in enumerate(subjects):
        print('Fitting model for subject {}'.format(subject))
        df_s = df.loc[subject]
        for cue in cues:
            ml = ML(df_s[df_s['cue'] == cue])
            r = ml.ml_estimation()
            data[i, 2*cue:(2*cue+2)] = r.x
            data[i, 2*cue+4:2*cue+6] = np.sqrt(np.diag(r.hess_inv.todense()))
            data[i, cue+8] = r.fun
    model = pd.DataFrame(data, pd.Index(subjects, name='subject'),
                         ['alpha_0', 'beta_0', 'alpha_1', 'beta_1',
                          'se_alpha_0', 'se_beta_0',
                          'se_alpha_1', 'se_beta_1',
                          'NLL_0', 'NLL_1'])
    return model

def update_table_models(self, visible=None, hidden=None):
    if visible is None and hidden is None:
        manager = self.Session.get_manager()
        for x in list(manager.hidden_columns):
            if x not in self.Session.output_object.columns:
                manager.hidden_columns.remove(x)

        hidden_cols = pd.Index(manager.hidden_columns)
        vis_cols = [x for x in self.Session.output_object.columns
                    if x not in hidden_cols]
        to_show = self.Session.output_object[vis_cols]
        to_hide = self.Session.output_object[hidden_cols]
    else:
        to_show = visible
        to_hide = hidden

    self.table_model = classes.CoqTableModel(
        to_show, session=self.Session)
    self.hidden_model = classes.CoqHiddenTableModel(
        to_hide, session=self.Session)
    self.set_columns_widget()

    self.table_model.dataChanged.connect(self.change_userdata)

def json_conversion(obj):
    """Encode additional objects to JSON."""
    try:
        # numpy isn't an explicit dependency of bowtie
        # so we can't assume it's available
        import numpy as np
        if isinstance(obj, (np.ndarray, np.generic)):
            return obj.tolist()
    except ImportError:
        pass

    try:
        # pandas isn't an explicit dependency of bowtie
        # so we can't assume it's available
        import pandas as pd
        if isinstance(obj, pd.Index):
            return obj.tolist()
    except ImportError:
        pass

    if isinstance(obj, (datetime, time, date)):
        return obj.isoformat()

    raise TypeError('Not sure how to serialize {} of type {}'.format(obj, type(obj)))

def encoders(obj):
    """Convert Python object to msgpack encodable ones."""
    try:
        # numpy isn't an explicit dependency of bowtie
        # so we can't assume it's available
        import numpy as np
        if isinstance(obj, (np.ndarray, np.generic)):
            # https://docs.scipy.org/doc/numpy/reference/arrays.scalars.html
            return obj.tolist()
    except ImportError:
        pass

    try:
        # pandas isn't an explicit dependency of bowtie
        # so we can't assume it's available
        import pandas as pd
        if isinstance(obj, pd.Index):
            return obj.tolist()
    except ImportError:
        pass

    if isinstance(obj, (datetime, time, date)):
        return obj.isoformat()

    return obj

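Both helpers above follow the same pattern: probe for optional libraries and, when the value is a pd.Index (or NumPy array), fall back to .tolist() so it becomes serializable. A hedged sketch of how json_conversion would typically be wired in, using only the standard-library json module (the call site itself is not from the original project):

import json
import pandas as pd

idx = pd.Index(['a', 'b', 'c'])
# Passing json_conversion as `default` lets json.dumps handle the Index
# by converting it to a plain list first.
payload = json.dumps({'columns': idx}, default=json_conversion)
print(payload)  # {"columns": ["a", "b", "c"]}
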
def batch_market_order(self, share_counts):
    """Place a batch market order for multiple assets.

    Parameters
    ----------
    share_counts : pd.Series[Asset -> int]
        Map from asset to number of shares to order for that asset.

    Returns
    -------
    order_ids : pd.Index[str]
        Index of ids for newly-created orders.
    """
    style = MarketOrder()
    order_args = [
        (asset, amount, style)
        for (asset, amount) in iteritems(share_counts)
        if amount
    ]
    return self.blotter.batch_order(order_args)

def test_filter_to_categorical(self):
    index = pd.Index(['a', 'b', 'c'], dtype=object)
    df = pd.DataFrame({'col1': ['2', '1', '3'],
                       'col2': ['a', 'b', 'c']},
                      index=index, dtype=object)
    metadata = qiime2.Metadata(df)

    obs_df = metadata.filter(column_type='categorical').to_dataframe()

    exp_df = pd.DataFrame({'col2': ['a', 'b', 'c']}, index=index)
    pdt.assert_frame_equal(obs_df, exp_df)

    df = pd.DataFrame({'col1': ['2', '1', '3'],
                       'col2': ['a', 'b', 'c'],
                       'col3': ['peanut', 'hotdog', 'gwar']},
                      index=index, dtype=object)
    metadata = qiime2.Metadata(df)

    obs_df = metadata.filter(column_type='categorical').to_dataframe()

    exp_df = pd.DataFrame({'col2': ['a', 'b', 'c'],
                           'col3': ['peanut', 'hotdog', 'gwar']},
                          index=index)
    pdt.assert_frame_equal(obs_df, exp_df)

def test_no_columns(self):
    fp = pkg_resources.resource_filename(
        'qiime2.tests', 'data/metadata/no-columns.tsv')

    metadata = qiime2.Metadata.load(fp)
    obs_df = metadata.to_dataframe()

    exp_index = pd.Index(['a', 'b', 'id'], name='my-index', dtype=object)
    exp_df = pd.DataFrame({}, index=exp_index, dtype=object)

    self.assertFalse(obs_df.index.empty)
    self.assertTrue(obs_df.columns.empty)
    pdt.assert_frame_equal(
        obs_df, exp_df, check_dtype=True, check_index_type=True,
        check_column_type=True, check_frame_type=True, check_names=True,
        check_exact=True)

def test_index_and_column_names(self):
    md1 = qiime2.Metadata(pd.DataFrame(
        {'a': [1, 2]},
        index=pd.Index(['id1', 'id2'], name='foo'),
        columns=pd.Index(['a'], name='abc')))
    md2 = qiime2.Metadata(pd.DataFrame(
        {'b': [3, 4]},
        index=pd.Index(['id1', 'id2'], name='bar'),
        columns=pd.Index(['b'], name='def')))

    obs = md1.merge(md2)

    exp = qiime2.Metadata(pd.DataFrame(
        {'a': [1, 2], 'b': [3, 4]}, index=['id1', 'id2']))
    self.assertEqual(obs, exp)
    self.assertIsNone(obs._dataframe.index.name)
    self.assertIsNone(obs._dataframe.columns.name)

def test_more_complex_expressions(self):
    df = pd.DataFrame({'Subject': ['subject-1', 'subject-1', 'subject-2'],
                       'SampleType': ['gut', 'tongue', 'gut']},
                      index=pd.Index(['S1', 'S2', 'S3'], name='id'))
    metadata = qiime2.Metadata(df)

    where = "Subject='subject-1' OR Subject='subject-2'"
    actual = metadata.ids(where)
    expected = {'S1', 'S2', 'S3'}
    self.assertEqual(actual, expected)

    where = "Subject='subject-1' AND Subject='subject-2'"
    actual = metadata.ids(where)
    expected = set()
    self.assertEqual(actual, expected)

    where = "Subject='subject-1' AND SampleType='gut'"
    actual = metadata.ids(where)
    expected = {'S1'}
    self.assertEqual(actual, expected)

def testMultipleCalculationsRelativeTo(self):
    data = pd.DataFrame({"X": (1, 2, 3, 10, 20, 30, 100, 200, 300),
                         "Y": (0, 1, 2, 3, 4, 5, 6, 7, 8),
                         "Experiment": ("Control", "Control", "Control",
                                        "Exp1", "Exp1", "Exp1",
                                        "Exp2", "Exp2", "Exp2")})
    comparison = comparisons.AbsoluteDifference("Experiment", "Control")
    output = core.Analyze(data).relative_to(comparison).calculate(
        (metrics.Sum("X"), metrics.Sum("Y"))).run()

    correct = pd.DataFrame(
        {"sum(X) Absolute Difference": (60 - 6, 600 - 6),
         "sum(Y) Absolute Difference": (12 - 3, 21 - 3)},
        index=pd.Index(("Exp1", "Exp2"), name="Experiment"))

    self.assertTrue(output.equals(correct))

def testRelativeToJackknife(self):
    data = pd.DataFrame({"X": [1, 2, 3, 4, 5, 6, 7, 8, 9],
                         "Y": [0, 0, 0, 1, 1, 1, 2, 2, 2]})
    metric = metrics.Sum("X")
    comparison = comparisons.AbsoluteDifference("Y", 0)
    se_method = standard_errors.Jackknife()
    output = core.Analyze(data).relative_to(comparison).with_standard_errors(
        se_method).calculate(metric).run()

    rowindex = pd.Index([1, 2], name="Y")
    correct = pd.DataFrame(
        np.array([[9.0, np.sqrt(5 * np.var([12, 11, 10, 5, 4, 3]))],
                  [18.0, np.sqrt(5 * np.var([21, 20, 19, 11, 10, 9]))]]),
        columns=("sum(X) Absolute Difference",
                 "sum(X) Absolute Difference Jackknife SE"),
        index=rowindex)

    self.assertTrue(output.equals(correct))

def testRelativeToJackknifeIncludeBaseline(self):
    data = pd.DataFrame({"X": [1, 2, 3, 4, 5, 6, 7, 8, 9],
                         "Y": [0, 0, 0, 1, 1, 1, 2, 2, 2]})
    metric = metrics.Sum("X")
    comparison = comparisons.AbsoluteDifference("Y", 0, include_base=True)
    se_method = standard_errors.Jackknife()
    output = core.Analyze(data).relative_to(comparison).with_standard_errors(
        se_method).calculate(metric).run()

    rowindex = pd.Index([0, 1, 2], name="Y")
    correct = pd.DataFrame(
        np.array([[0.0, 0.0],
                  [9.0, np.sqrt(5 * np.var([12, 11, 10, 5, 4, 3]))],
                  [18.0, np.sqrt(5 * np.var([21, 20, 19, 11, 10, 9]))]]),
        columns=("sum(X) Absolute Difference",
                 "sum(X) Absolute Difference Jackknife SE"),
        index=rowindex)

    self.assertTrue(output.equals(correct))

def testRelativeToJackknifeSingleComparisonBaselineFirst(self):
    data = pd.DataFrame({"X": [1, 2, 3, 4, 5, 6],
                         "Y": [0, 0, 0, 1, 1, 1]})
    metric = metrics.Sum("X")
    comparison = comparisons.AbsoluteDifference("Y", 0)
    se_method = standard_errors.Jackknife()
    output = core.Analyze(data).relative_to(comparison).with_standard_errors(
        se_method).calculate(metric).run()

    rowindex = pd.Index([1], name="Y")
    correct = pd.DataFrame(
        np.array([[9.0, np.sqrt(5 * np.var([12, 11, 10, 5, 4, 3]))]]),
        columns=("sum(X) Absolute Difference",
                 "sum(X) Absolute Difference Jackknife SE"),
        index=rowindex)

    self.assertTrue(output.equals(correct))

def testRelativeToJackknifeSingleComparisonBaselineSecond(self):
    data = pd.DataFrame({"X": [1, 2, 3, 4, 5, 6],
                         "Y": [0, 0, 0, 1, 1, 1]})
    metric = metrics.Sum("X")
    comparison = comparisons.AbsoluteDifference("Y", 1)
    se_method = standard_errors.Jackknife()
    output = core.Analyze(data).relative_to(comparison).with_standard_errors(
        se_method).calculate(metric).run()

    rowindex = pd.Index([0], name="Y")
    correct = pd.DataFrame(
        np.array([[-9.0, np.sqrt(5 * np.var([12, 11, 10, 5, 4, 3]))]]),
        columns=("sum(X) Absolute Difference",
                 "sum(X) Absolute Difference Jackknife SE"),
        index=rowindex)

    self.assertTrue(output.equals(correct))

def testSplitJackknife(self):
    # list(range(...)) keeps this working under Python 3, where range
    # objects cannot be concatenated with lists.
    data = pd.DataFrame(
        {"X": np.array([list(range(11)) + [5] * 10]).flatten(),
         "Y": np.array([[0] * 11 + [1] * 10]).flatten()})
    metric = metrics.Sum("X")
    se_method = standard_errors.Jackknife()
    output = core.Analyze(data).split_by("Y").with_standard_errors(
        se_method).calculate(metric).run()

    rowindex = pd.Index([0, 1], name="Y")
    correct = pd.DataFrame(
        np.array([[55.0, 10.0], [50.0, 0.0]]),
        columns=("sum(X)", "sum(X) Jackknife SE"),
        index=rowindex)

    self.assertTrue(output.equals(correct))

def test_storage_restore_schema_with_primary_key():
    data = [
        ('a',),
        ('b',),
    ]
    index = pd.Index([1, 2], name='key')
    df = pd.DataFrame(data, columns=('value',), index=index)
    storage = Storage(dataframes={'data': df})
    assert list(storage.read('data')) == [[1, 'a'], [2, 'b']]
    assert storage.describe('data') == {
        'primaryKey': 'key',
        'fields': [
            {'name': 'key', 'type': 'integer',
             'constraints': {'required': True}},
            {'name': 'value', 'type': 'string'},
        ]
    }

def test_dataframe_to_tsv_taxonomy_format(self):
    index = pd.Index(['seq1', 'seq2'], name='Feature ID', dtype=object)
    columns = ['Taxon', 'Foo', 'Bar']
    df = pd.DataFrame([['taxon1', '42', 'foo'],
                       ['taxon2', '43', 'bar']],
                      index=index, columns=columns, dtype=object)
    exp = (
        'Feature ID\tTaxon\tFoo\tBar\n'
        'seq1\ttaxon1\t42\tfoo\n'
        'seq2\ttaxon2\t43\tbar\n'
    )

    transformer = self.get_transformer(pd.DataFrame, TSVTaxonomyFormat)
    obs = transformer(df)

    with obs.open() as fh:
        self.assertEqual(fh.read(), exp)

def test_series_to_tsv_taxonomy_format(self):
    index = pd.Index(['emrakul', 'peanut'], name='Feature ID',
                     dtype=object)
    series = pd.Series(['taxon1', 'taxon2'],
                       index=index, name='Taxon', dtype=object)
    exp = (
        'Feature ID\tTaxon\n'
        'emrakul\ttaxon1\n'
        'peanut\ttaxon2\n'
    )

    transformer = self.get_transformer(pd.Series, TSVTaxonomyFormat)
    obs = transformer(series)

    with obs.open() as fh:
        self.assertEqual(fh.read(), exp)

def test_tsv_taxonomy_format_to_metadata(self):
    _, obs = self.transform_format(TSVTaxonomyFormat, qiime2.Metadata,
                                   os.path.join('taxonomy',
                                                '3-column.tsv'))

    index = pd.Index(['seq1', 'seq2'], name='Feature ID', dtype=object)
    exp_df = pd.DataFrame([['k__Foo; p__Bar', '-1.0'],
                           ['k__Foo; p__Baz', '-42.0']],
                          index=index, columns=['Taxon', 'Confidence'],
                          dtype=object)
    exp = qiime2.Metadata(exp_df)

    self.assertEqual(exp, obs)


# In-depth testing of the `_taxonomy_formats_to_dataframe` helper function,
# which does the heavy lifting for the transformers.

def test_3_columns(self):
    index = pd.Index(['seq1', 'seq2'], name='Feature ID', dtype=object)
    exp = pd.DataFrame([['k__Foo; p__Bar', '-1.0'],
                        ['k__Foo; p__Baz', '-42.0']],
                       index=index, columns=['Taxon', 'Confidence'],
                       dtype=object)

    # has_header=None (default)
    obs = _taxonomy_formats_to_dataframe(
        self.get_data_path(os.path.join('taxonomy', '3-column.tsv')))

    assert_frame_equal(obs, exp)

    # has_header=True
    obs = _taxonomy_formats_to_dataframe(
        self.get_data_path(os.path.join('taxonomy', '3-column.tsv')),
        has_header=True)

    assert_frame_equal(obs, exp)

def test_valid_but_messy_file(self):
    index = pd.Index(
        ['SEQUENCE1', 'seq2'], name='Feature ID', dtype=object)
    exp = pd.DataFrame([['k__Bar; p__Baz', 'foo'],
                        ['some; taxonomy; for; ya', 'bar baz']],
                       index=index, columns=['Taxon', 'Extra Column'],
                       dtype=object)

    # has_header=None (default)
    obs = _taxonomy_formats_to_dataframe(
        self.get_data_path(os.path.join('taxonomy',
                                        'valid-but-messy.tsv')))

    assert_frame_equal(obs, exp)

    # has_header=True
    obs = _taxonomy_formats_to_dataframe(
        self.get_data_path(os.path.join('taxonomy',
                                        'valid-but-messy.tsv')),
        has_header=True)

    assert_frame_equal(obs, exp)

def test_headerless(self):
    index = pd.Index(['seq1', 'seq2'], name='Feature ID', dtype=object)
    columns = ['Taxon', 'Unnamed Column 1', 'Unnamed Column 2']
    exp = pd.DataFrame([['k__Foo; p__Bar', 'some', 'another'],
                        ['k__Foo; p__Baz', 'column', 'column!']],
                       index=index, columns=columns, dtype=object)

    # has_header=None (default)
    obs = _taxonomy_formats_to_dataframe(
        self.get_data_path(os.path.join('taxonomy', 'headerless.tsv')))

    assert_frame_equal(obs, exp)

    # has_header=False
    obs = _taxonomy_formats_to_dataframe(
        self.get_data_path(os.path.join('taxonomy', 'headerless.tsv')),
        has_header=False)

    assert_frame_equal(obs, exp)


# In-depth testing of the `_dataframe_to_tsv_taxonomy_format` helper function,
# which does the heavy lifting for the transformers.

def find_missing_products():
    train = pd.read_csv('/Users/srinath/playground/data-science/BimboInventoryDemand/train.csv')
    train_ids = train['Producto_ID'].unique()
    test = pd.read_csv('/Users/srinath/playground/data-science/BimboInventoryDemand/test.csv')
    test_ids = test['Producto_ID'].unique()

    missing_ids = pd.Index(test_ids).difference(pd.Index(train_ids))
    print("missing ID count", len(missing_ids))

    missing_ids_df = pd.DataFrame(missing_ids, columns=["Producto_ID"])
    missing_ids_df.to_csv('missing_ids.csv', index=False)

    entries_with_missing = pd.merge(test, missing_ids_df, on='Producto_ID')
    print("Missing entries =", entries_with_missing.shape[0],
          "percentage =", entries_with_missing.shape[0] * 100 / test.shape[0])
    print("full entries count", test.shape[0])

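The pandas.Index work in find_missing_products is the Index.difference call, which returns the sorted elements of one index that are absent from another. A minimal standalone sketch with made-up product IDs:

import pandas as pd

train_ids = pd.Index([101, 102, 103])
test_ids = pd.Index([102, 103, 104, 105])

# Product IDs that appear in the test set but never in training.
missing_ids = test_ids.difference(train_ids)
print(list(missing_ids))  # [104, 105]
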
def at_time(self, time, asof=False):
    """
    Select values at particular time of day (e.g. 9:30AM).

    Parameters
    ----------
    time : datetime.time or string

    Returns
    -------
    values_at_time : type of caller
    """
    try:
        indexer = self.index.indexer_at_time(time, asof=asof)
        return self.take(indexer, convert=False)
    except AttributeError:
        raise TypeError('Index must be DatetimeIndex')

def between_time(self, start_time, end_time, include_start=True,
                 include_end=True):
    """
    Select values between particular times of the day (e.g., 9:00-9:30 AM).

    Parameters
    ----------
    start_time : datetime.time or string
    end_time : datetime.time or string
    include_start : boolean, default True
    include_end : boolean, default True

    Returns
    -------
    values_between_time : type of caller
    """
    try:
        indexer = self.index.indexer_between_time(
            start_time, end_time, include_start=include_start,
            include_end=include_end)
        return self.take(indexer, convert=False)
    except AttributeError:
        raise TypeError('Index must be DatetimeIndex')

def _isnull_old(obj):
    """Detect missing values. Treat None, NaN, INF, -INF as null.

    Parameters
    ----------
    arr: ndarray or object value

    Returns
    -------
    boolean ndarray or boolean
    """
    if lib.isscalar(obj):
        return lib.checknull_old(obj)
    # hack (for now) because MI registers as ndarray
    elif isinstance(obj, pd.MultiIndex):
        raise NotImplementedError("isnull is not defined for MultiIndex")
    elif isinstance(obj, (ABCSeries, np.ndarray, pd.Index)):
        return _isnull_ndarraylike_old(obj)
    elif isinstance(obj, ABCGeneric):
        return obj._constructor(obj._data.isnull(func=_isnull_old))
    elif isinstance(obj, list) or hasattr(obj, '__array__'):
        return _isnull_ndarraylike_old(np.asarray(obj))
    else:
        return obj is None

def test_period_resample_with_local_timezone_pytz(self):
    # GH5430
    tm._skip_if_no_pytz()
    import pytz
    local_timezone = pytz.timezone('America/Los_Angeles')

    start = datetime(year=2013, month=11, day=1, hour=0, minute=0,
                     tzinfo=pytz.utc)
    # 1 day later
    end = datetime(year=2013, month=11, day=2, hour=0, minute=0,
                   tzinfo=pytz.utc)

    index = pd.date_range(start, end, freq='H')

    series = pd.Series(1, index=index)
    series = series.tz_convert(local_timezone)
    result = series.resample('D', kind='period').mean()

    # Create the expected series
    # Index is moved back a day with the timezone conversion from UTC to
    # Pacific
    expected_index = (pd.period_range(start=start, end=end, freq='D') - 1)
    expected = pd.Series(1, index=expected_index)
    assert_series_equal(result, expected)

def test_period_resample_with_local_timezone_dateutil(self):
    # GH5430
    tm._skip_if_no_dateutil()
    import dateutil
    local_timezone = 'dateutil/America/Los_Angeles'

    start = datetime(year=2013, month=11, day=1, hour=0, minute=0,
                     tzinfo=dateutil.tz.tzutc())
    # 1 day later
    end = datetime(year=2013, month=11, day=2, hour=0, minute=0,
                   tzinfo=dateutil.tz.tzutc())

    index = pd.date_range(start, end, freq='H')

    series = pd.Series(1, index=index)
    series = series.tz_convert(local_timezone)
    result = series.resample('D', kind='period').mean()

    # Create the expected series
    # Index is moved back a day with the timezone conversion from UTC to
    # Pacific
    expected_index = (pd.period_range(start=start, end=end, freq='D') - 1)
    expected = pd.Series(1, index=expected_index)
    assert_series_equal(result, expected)

def test_dayfirst(self):
    # GH 5917
    arr = ['10/02/2014', '11/02/2014', '12/02/2014']
    expected = DatetimeIndex([datetime(2014, 2, 10), datetime(2014, 2, 11),
                              datetime(2014, 2, 12)])

    idx1 = DatetimeIndex(arr, dayfirst=True)
    idx2 = DatetimeIndex(np.array(arr), dayfirst=True)
    idx3 = to_datetime(arr, dayfirst=True)
    idx4 = to_datetime(np.array(arr), dayfirst=True)
    idx5 = DatetimeIndex(Index(arr), dayfirst=True)
    idx6 = DatetimeIndex(Series(arr), dayfirst=True)

    self.assertTrue(expected.equals(idx1))
    self.assertTrue(expected.equals(idx2))
    self.assertTrue(expected.equals(idx3))
    self.assertTrue(expected.equals(idx4))
    self.assertTrue(expected.equals(idx5))
    self.assertTrue(expected.equals(idx6))

def test_to_datetime_format(self):
    values = ['1/1/2000', '1/2/2000', '1/3/2000']

    results1 = [Timestamp('20000101'), Timestamp('20000201'),
                Timestamp('20000301')]
    results2 = [Timestamp('20000101'), Timestamp('20000102'),
                Timestamp('20000103')]
    for vals, expecteds in [(values, (Index(results1), Index(results2))),
                            (Series(values),
                             (Series(results1), Series(results2))),
                            (values[0], (results1[0], results2[0])),
                            (values[1], (results1[1], results2[1])),
                            (values[2], (results1[2], results2[2]))]:

        for i, fmt in enumerate(['%d/%m/%Y', '%m/%d/%Y']):
            result = to_datetime(vals, format=fmt)
            expected = expecteds[i]

            if isinstance(expected, Series):
                assert_series_equal(result, Series(expected))
            elif isinstance(expected, Timestamp):
                self.assertEqual(result, expected)
            else:
                self.assertTrue(result.equals(expected))

def test_asobject_tolist(self):
    idx = timedelta_range(start='1 days', periods=4, freq='D', name='idx')
    expected_list = [Timedelta('1 days'), Timedelta('2 days'),
                     Timedelta('3 days'), Timedelta('4 days')]
    expected = pd.Index(expected_list, dtype=object, name='idx')
    result = idx.asobject
    self.assertTrue(isinstance(result, Index))
    self.assertEqual(result.dtype, object)
    self.assertTrue(result.equals(expected))
    self.assertEqual(result.name, expected.name)
    self.assertEqual(idx.tolist(), expected_list)

    idx = TimedeltaIndex([timedelta(days=1), timedelta(days=2), pd.NaT,
                          timedelta(days=4)], name='idx')
    expected_list = [Timedelta('1 days'), Timedelta('2 days'), pd.NaT,
                     Timedelta('4 days')]
    expected = pd.Index(expected_list, dtype=object, name='idx')
    result = idx.asobject
    self.assertTrue(isinstance(result, Index))
    self.assertEqual(result.dtype, object)
    self.assertTrue(result.equals(expected))
    self.assertEqual(result.name, expected.name)
    self.assertEqual(idx.tolist(), expected_list)

def prior_values_index(self):
    index_values = list(
        product(
            (freq.freq_str for freq in self.unique_frequencies),
            # Only store prior values for forward-fillable fields.
            self.ffillable_fields,
        )
    )
    if index_values:
        return pd.MultiIndex.from_tuples(index_values)
    else:
        # MultiIndex doesn't gracefully support empty input, so we return
        # an empty regular Index if we have no values.
        return pd.Index(index_values)

def add_sids(self, to_add):
    """
    Add new sids to the container.
    """
    self.sids = pd.Index(
        sorted(self.sids.union(_ensure_index(to_add))),
    )
    self._realign_sids()

def drop_sids(self, to_drop):
    """
    Remove sids from the container.
    """
    self.sids = pd.Index(
        sorted(self.sids.difference(_ensure_index(to_drop))),
    )
    self._realign_sids()

def _ensure_index(x):
    if not isinstance(x, pd.Index):
        x = pd.Index(sorted(x))

    return x

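add_sids and drop_sids above lean on _ensure_index so that callers can pass either an existing pd.Index or any plain iterable of sids. A quick illustration of that pattern with toy sids (assumes _ensure_index is in scope; the values are invented):

import pandas as pd

current = pd.Index([1, 3, 5])
to_add = {2, 4}  # any iterable works; _ensure_index sorts and wraps it

updated = pd.Index(sorted(current.union(_ensure_index(to_add))))
print(list(updated))  # [1, 2, 3, 4, 5]
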
def test_df_of_assets_as_input(self):
    algo = TestRegisterTransformAlgorithm(
        sim_params=self.sim_params,
        env=TradingEnvironment(),  # new env without assets
    )
    df = self.df.copy()
    df.columns = pd.Index(map(Equity, df.columns))
    algo.run(df)
    assert isinstance(algo.sources[0], DataFrameSource)

def index(self):
    """Return dask Index instance"""
    name = self._name + '-index'
    dsk = {(name, i): (getattr, key, 'index')
           for i, key in enumerate(self._keys())}
    return Index(merge(dsk, self.dask), name,
                 self._meta.index, self.divisions)

def _daskify(obj, npartitions=None, chunksize=None):
    """Convert input to a dask-gdf object.
    """
    npartitions = npartitions or 1
    if isinstance(obj, _Frame):
        return obj
    elif isinstance(obj, (pd.DataFrame, pd.Series, pd.Index)):
        return _daskify(dd.from_pandas(obj, npartitions=npartitions))
    elif isinstance(obj, (gd.DataFrame, gd.Series, gd.index.Index)):
        return from_pygdf(obj, npartitions=npartitions)
    elif isinstance(obj, (dd.DataFrame, dd.Series, dd.Index)):
        return from_dask_dataframe(obj)
    else:
        raise TypeError("type {} is not supported".format(type(obj)))

def concat(objs):
    """Concatenate dask gdf objects

    Parameters
    ----------
    objs : sequence of DataFrame, Series, Index
        A sequence of objects to be concatenated.
    """
    objs = [_daskify(x) for x in objs]
    meta = gd.concat(_extract_meta(objs))

    name = "concat-" + uuid4().hex
    dsk = {}
    divisions = [0]
    base = 0
    lastdiv = 0
    for obj in objs:
        for k, i in obj._keys():
            dsk[name, base + i] = k, i
        base += obj.npartitions
        divisions.extend([d + lastdiv for d in obj.divisions[1:]])
        lastdiv = obj.divisions[-1]

    dasks = [o.dask for o in objs]
    dsk = merge(dsk, *dasks)
    return new_dd_object(dsk, name, meta, divisions)

def _get_return_type(meta):
    if isinstance(meta, gd.Series):
        return Series
    elif isinstance(meta, gd.DataFrame):
        return DataFrame
    elif isinstance(meta, gd.index.Index):
        return Index
    return Scalar