Python pandas module: Index() example source code
From open source Python projects, we extracted the following 50 code examples to illustrate how to use pandas.Index().
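Before the extracted examples, here is a minimal, self-contained sketch of the constructor itself; the values and the 'sample-id' name are purely illustrative.
import pandas as pd

# Build an index from a sequence, optionally fixing the dtype and giving it a name.
idx = pd.Index(['a', 'b', 'c'], dtype=object, name='sample-id')

# An Index labels the rows (or columns) of a DataFrame and supports set-like operations.
df = pd.DataFrame({'col1': [1, 2, 3]}, index=idx)
print(idx.name)                    # 'sample-id'
print(idx.union(pd.Index(['d'])))  # Index(['a', 'b', 'c', 'd'], dtype='object')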
def test_filter_to_numeric(self):
index = pd.Index(['a', 'b', 'c'], dtype=object)
df = pd.DataFrame({'col1': ['2', '1', '3'],
'col2': ['two', 'one', 'three']},
index=index, dtype=object)
metadata = qiime2.Metadata(df)
obs_df = metadata.filter(column_type='numeric').to_dataframe()
exp_df = pd.DataFrame({'col1': [2, 1, 3]}, dtype=int, index=index)
pdt.assert_frame_equal(obs_df, exp_df)
df = pd.DataFrame({'col1': ['2', '1', '3'],
'col2': ['2', '1', 'three'],
'col3': ['4.0', '5.2', '6.9']},
index=index, dtype=object)
metadata = qiime2.Metadata(df)
obs_df = metadata.filter(column_type='numeric').to_dataframe()
exp_df = pd.DataFrame({'col1': [2, 1, 3],
'col3': [4.0, 5.2, 6.9]}, index=index)
pdt.assert_frame_equal(obs_df, exp_df)
self.assertEqual(dict(obs_df.dtypes),
{'col1': int, 'col3': float})
def _add_field(self, field):
"""
Adds a new field to the container.
"""
# self.fields is already sorted, so we just need to insert the new
# field in the correct index.
ls = list(self.fields)
insort_left(ls, field)
self.fields = pd.Index(ls)
# unset fillable fields cache
self._ffillable_fields = None
self._realign_fields()
self.last_known_prior_values = self.last_known_prior_values.reindex(
index=self.prior_values_index,
)
return field
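The same pattern, detached from the container class, can be sketched like this; the field names are invented for the example.
import pandas as pd
from bisect import insort_left

fields = pd.Index(['close', 'open', 'volume'])  # assumed already sorted
ls = list(fields)
insort_left(ls, 'high')   # insert the new field while keeping sorted order
fields = pd.Index(ls)
print(fields)             # Index(['close', 'high', 'open', 'volume'], dtype='object')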
def test_some_duplicates_in_category(self):
columns = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (200, 1),
(200, 2), ('pet', '')],
names=['depth', 'iter'])
data = pd.DataFrame(data=[[1, 2, 3, 4, 'russ'], [5, 6, 7, 8, 'milo'],
[9, 10, 11, 12, 'russ']],
columns=columns, index=['S1', 'S2', 'S3'])
obs = _reindex_with_metadata('pet', ['pet'], data)
exp_col = pd.MultiIndex(levels=[[1, 200, 'pet'], [1, 2, '']],
labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
names=['depth', 'iter'])
exp_ind = pd.Index(['milo', 'russ'], name='pet')
exp = pd.DataFrame(data=[[5, 6, 7, 8], [5, 6, 7, 8]],
columns=exp_col, index=exp_ind)
pdt.assert_frame_equal(exp, obs[0])
exp = pd.DataFrame(data=[[1, 1, 1, 1], [2, 2, 2, 2]],
columns=exp_col, index=exp_ind)
pdt.assert_frame_equal(exp, obs[1])
def test_all_identical(self):
columns = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (200, 1),
(200, 2), ('pet', '')],
names=['depth', 'iter'])
data = pd.DataFrame(data=[[1, 2, 3, 4, 'russ'], [5, 6, 7, 8, 'russ'],
[9, 10, 11, 12, 'russ']],
columns=columns, index=['S1', 'S2', 'S3'])
obs = _reindex_with_metadata('pet', ['pet'], data)
exp_col = pd.MultiIndex(levels=[[1, 200, 'pet'], [1, 2, '']],
labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
names=['depth', 'iter'])
exp_ind = pd.Index(['russ'], name='pet')
exp = pd.DataFrame(data=[[5, 6, 7, 8]],
columns=exp_col, index=exp_ind)
pdt.assert_frame_equal(exp, obs[0])
exp = pd.DataFrame(data=[[3, 3, 3, 3]],
columns=exp_col, index=exp_ind)
pdt.assert_frame_equal(exp, obs[1])
def cross_join(df1, df2):
"""
Return a dataframe that is a cross between dataframes
df1 and df2
ref: https://github.com/pydata/pandas/issues/5401
"""
if len(df1) == 0:
return df2
if len(df2) == 0:
return df1
# Add as lists so that the new index keeps the items in
# the order that they are added together
all_columns = pd.Index(list(df1.columns) + list(df2.columns))
df1['key'] = 1
df2['key'] = 1
return pd.merge(df1, df2, on='key').loc[:, all_columns]
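A hedged usage sketch of the helper above with throwaway frames; note that, as written, cross_join also adds a 'key' column to its inputs.
import pandas as pd

df1 = pd.DataFrame({'a': [1, 2]})
df2 = pd.DataFrame({'b': ['x', 'y']})
out = cross_join(df1, df2)
print(out)  # 4 rows: every combination of 'a' and 'b'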
def _split_sample(
split: Callable[[object], bool], X: np.ndarray, y: np.ndarray
) -> Tuple[Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]:
"""
Split X, y sample set in two with a split function
:return: ((X_left, y_left), (X_right, y_right))
"""
if split.type == 'numerical':
left_indexes = X[:, split.attribute] < split.criteria
right_indexes = ~left_indexes
else:
Z = (
pd.Index(pd.unique(split.criteria))
.get_indexer(X[:, split.attribute]))
left_indexes = np.where(Z >= 0)[0]
right_indexes = np.where(Z < 0)[0]
left = X[left_indexes], y[left_indexes]
right = X[right_indexes], y[right_indexes]
return left, right
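The categorical branch relies on Index.get_indexer returning -1 for values that are absent from the index; a small standalone illustration (the arrays are made up):
import numpy as np
import pandas as pd

criteria = pd.Index(pd.unique(np.array(['cat', 'dog'])))
values = np.array(['dog', 'bird', 'cat', 'fish'])
z = criteria.get_indexer(values)
print(z)                    # [ 1 -1  0 -1]
print(np.where(z >= 0)[0])  # rows whose category is in the split criteria -> [0 2]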
def get_dividend(self, order_book_id, adjusted=True):
"""
Fetch dividend records for an instrument.
:param str order_book_id: instrument identifier
:param bool adjusted: whether to return the adjusted dividend records
:return:
"""
def fetchData(adjusted):
if adjusted:
mongo_data = self._adjusted_dividends[order_book_id].find({}, {"_id":0})
else:
mongo_data = self._original_dividends[order_book_id].find({}, {"_id":0})
return mongo_data
result = pd.DataFrame({
'book_closure_date': pd.Index(pd.Timestamp(d['book_closure_date']) for d in fetchData(adjusted)),
'ex_dividend_date': pd.Index(pd.Timestamp(d['ex_dividend_date']) for d in fetchData(adjusted)),
'payable_date': pd.Index(pd.Timestamp(d['payable_date']) for d in fetchData(adjusted)),
'dividend_cash_before_tax': [d['dividend_cash_before_tax'] for d in fetchData(adjusted)],
'round_lot': [d['round_lot'] for d in fetchData(adjusted)]
}, index = pd.Index(pd.Timestamp(d['announcement_date']) for d in fetchData(adjusted)))
return result
def get_yield_curve(self, start_date, end_date, tenor):
d1 = start_date.year * 10000 + start_date.month * 100 + start_date.day
d2 = end_date.year * 10000 + end_date.month * 100 + end_date.day
s = self._dates.searchsorted(d1)
e = self._dates.searchsorted(d2, side='right')
if e == len(self._dates):
e -= 1
if self._dates[e] == d2:
# include end_date
e += 1
if e < s:
return None
df = pd.DataFrame(self._table[s:e])
df.index = pd.Index(pd.Timestamp(str(d)) for d in df['date'])
del df['date']
df.rename(columns=lambda n: n[1:]+n[0], inplace=True)
if tenor is not None:
return df[tenor]
return df
def get_dividend(self, order_book_id, adjusted=True):
"""
Fetch dividend records for an instrument.
:param str order_book_id: instrument identifier
:param bool adjusted: whether to return the adjusted dividend records
:return:
"""
def fetchData(adjusted):
if adjusted:
mongo_data = self._adjusted_dividends[order_book_id].find({}, {"_id":0})
else:
mongo_data = self._original_dividends[order_book_id].find({}, {"_id":0})
return mongo_data
result = pd.DataFrame({
'book_closure_date': pd.Index(pd.Timestamp(d['book_closure_date']) for d in fetchData(adjusted)),
'ex_dividend_date': pd.Index(pd.Timestamp(d['ex_dividend_date']) for d in fetchData(adjusted)),
'payable_date': pd.Index(pd.Timestamp(d['payable_date']) for d in fetchData(adjusted)),
'dividend_cash_before_tax': [d['dividend_cash_before_tax'] for d in fetchData(adjusted)],
'round_lot': [d['round_lot'] for d in fetchData(adjusted)]
}, index = pd.Index(pd.Timestamp(d['announcement_date']) for d in fetchData(adjusted)))
return result
def get_yield_curve(self, start_date, end_date, tenor):
d1 = start_date.year * 10000 + start_date.month * 100 + start_date.day
d2 = end_date.year * 10000 + end_date.month * 100 + end_date.day
s = self._dates.searchsorted(d1)
e = self._dates.searchsorted(d2, side='right')
if e == len(self._dates):
e -= 1
if self._dates[e] == d2:
# include end_date
e += 1
if e < s:
return None
df = pd.DataFrame(self._table[s:e])
df.index = pd.Index(pd.Timestamp(str(d)) for d in df['date'])
del df['date']
df.rename(columns=lambda n: n[1:]+n[0], inplace=True)
if tenor is not None:
return df[tenor]
return df
def fit_behavioral_data():
"""Fit a model for all subjects. """
df = pd.read_pickle('data.pkl')
subjects = df.index.get_level_values('subject').unique()
data = np.empty((subjects.size, 10))
cues = (0, 1)
for i, subject in enumerate(subjects):
print('Fitting model for subject {}'.format(subject))
df_s = df.loc[subject]
for cue in cues:
ml = ML(df_s[df_s['cue']==cue])
r = ml.ml_estimation()
data[i,2*cue:(2*cue+2)] = r.x
data[i,2*cue+4:2*cue+6] = np.sqrt(np.diag(r.hess_inv.todense()))
data[i,cue+8] = r.fun
model = pd.DataFrame(data, pd.Index(subjects, name='subject'),
['alpha_0', 'beta_0', 'alpha_1', 'beta_1',
'se_alpha_0', 'se_beta_0', 'se_alpha_1', 'se_beta_1',
'NLL_0', 'NLL_1'])
return model
def update_table_models(self, visible=None, hidden=None):
if visible is None and hidden is None:
manager = self.Session.get_manager()
for x in list(manager.hidden_columns):
if x not in self.Session.output_object.columns:
manager.hidden_columns.remove(x)
hidden_cols = pd.Index(manager.hidden_columns)
vis_cols = [x for x in self.Session.output_object.columns
if not x in hidden_cols]
to_show = self.Session.output_object[vis_cols]
to_hide = self.Session.output_object[hidden_cols]
else:
to_show = visible
to_hide = hidden
self.table_model = classes.CoqTableModel(
to_show, session=self.Session)
self.hidden_model = classes.CoqHiddenTableModel(
to_hide, session=self.Session)
self.set_columns_widget()
self.table_model.dataChanged.connect(self.change_userdata)
def json_conversion(obj):
"""Encode additional objects to JSON."""
try:
# numpy isn't an explicit dependency of bowtie
# so we can't assume it's available
import numpy as np
if isinstance(obj, (np.ndarray, np.generic)):
return obj.tolist()
except ImportError:
pass
try:
# pandas isn't an explicit dependency of bowtie
# so we can't assume it's available
import pandas as pd
if isinstance(obj, pd.Index):
return obj.tolist()
except ImportError:
pass
if isinstance(obj, (datetime, time, date)):
return obj.isoformat()
raise TypeError('Not sure how to serialize {} of type {}'.format(obj, type(obj)))
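A hedged usage sketch: an encoder like this is typically passed as the default hook to json.dumps; the payload below is illustrative and assumes json_conversion is defined as above.
import json
from datetime import datetime, time, date
import pandas as pd

payload = {'labels': pd.Index(['a', 'b']), 'when': datetime(2020, 1, 1)}
print(json.dumps(payload, default=json_conversion))
# {"labels": ["a", "b"], "when": "2020-01-01T00:00:00"}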
def encoders(obj):
"""Convert Python object to msgpack encodable ones."""
try:
# numpy isn't an explicit dependency of bowtie
# so we can't assume it's available
import numpy as np
if isinstance(obj, (np.ndarray, np.generic)):
# https://docs.scipy.org/doc/numpy/reference/arrays.scalars.html
return obj.tolist()
except ImportError:
pass
try:
# pandas isn't an explicit dependency of bowtie
# so we can't assume it's available
import pandas as pd
if isinstance(obj, pd.Index):
return obj.tolist()
except ImportError:
pass
if isinstance(obj, (datetime, time, date)):
return obj.isoformat()
return obj
def batch_market_order(self, share_counts):
"""Place a batch market order for multiple assets.
Parameters
----------
share_counts : pd.Series[Asset -> int]
Map from asset to number of shares to order for that asset.
Returns
-------
order_ids : pd.Index[str]
Index of ids for newly-created orders.
"""
style = MarketOrder()
order_args = [
(asset, amount, style)
for (asset, amount) in iteritems(share_counts)
if amount
]
return self.blotter.batch_order(order_args)
def test_filter_to_categorical(self):
index = pd.Index(['a', 'b', 'c'], dtype=object)
df = pd.DataFrame({'col1': ['2', '1', '3'],
'col2': ['a', 'b', 'c']},
index=index, dtype=object)
metadata = qiime2.Metadata(df)
obs_df = metadata.filter(column_type='categorical').to_dataframe()
exp_df = pd.DataFrame({'col2': ['a', 'b', 'c']}, index=index)
pdt.assert_frame_equal(obs_df, exp_df)
df = pd.DataFrame({'col1': ['2', '1', '3'],
'col2': ['a', 'b', 'c'],
'col3': ['peanut', 'hotdog', 'gwar']},
index=index, dtype=object)
metadata = qiime2.Metadata(df)
obs_df = metadata.filter(column_type='categorical').to_dataframe()
exp_df = pd.DataFrame({'col2': ['a', 'b', 'c'],
'col3': ['peanut', 'hotdog', 'gwar']},
index=index)
pdt.assert_frame_equal(obs_df, exp_df)
def test_no_columns(self):
fp = pkg_resources.resource_filename(
'qiime2.tests', 'data/metadata/no-columns.tsv')
metadata = qiime2.Metadata.load(fp)
obs_df = metadata.to_dataframe()
exp_index = pd.Index(['a', 'b', 'id'], name='my-index', dtype=object)
exp_df = pd.DataFrame({}, index=exp_index, dtype=object)
self.assertFalse(obs_df.index.empty)
self.assertTrue(obs_df.columns.empty)
pdt.assert_frame_equal(
obs_df, exp_df, check_dtype=True, check_index_type=True,
check_column_type=True, check_frame_type=True, check_names=True,
check_exact=True)
def test_index_and_column_names(self):
md1 = qiime2.Metadata(pd.DataFrame(
{'a': [1, 2]},
index=pd.Index(['id1', 'id2'], name='foo'),
columns=pd.Index(['a'], name='abc')))
md2 = qiime2.Metadata(pd.DataFrame(
{'b': [3, 4]},
index=pd.Index(['id1', 'id2'], name='bar'),
columns=pd.Index(['b'], name='def')))
obs = md1.merge(md2)
exp = qiime2.Metadata(pd.DataFrame(
{'a': [1, 2], 'b': [3, 4]}, index=['id1', 'id2']))
self.assertEqual(obs, exp)
self.assertIsNone(obs._dataframe.index.name)
self.assertIsNone(obs._dataframe.columns.name)
def test_more_complex_expressions(self):
df = pd.DataFrame({'Subject': ['subject-1', 'subject-1', 'subject-2'],
'SampleType': ['gut', 'tongue', 'gut']},
index=pd.Index(['S1', 'S2', 'S3'], name='id'))
metadata = qiime2.Metadata(df)
where = "Subject='subject-1' OR Subject='subject-2'"
actual = metadata.ids(where)
expected = {'S1', 'S2', 'S3'}
self.assertEqual(actual, expected)
where = "Subject='subject-1' AND Subject='subject-2'"
actual = metadata.ids(where)
expected = set()
self.assertEqual(actual, expected)
where = "Subject='subject-1' AND SampleType='gut'"
actual = metadata.ids(where)
expected = {'S1'}
self.assertEqual(actual, expected)
def testMultipleCalculationsRelativeTo(self):
data = pd.DataFrame({"X": (1, 2, 3, 10, 20, 30, 100, 200, 300),
"Y": (0, 1, 2, 3, 4, 5, 6, 7, 8),
"Experiment": ("Control", "Control", "Control", "Exp1",
"Exp1", "Exp1", "Exp2", "Exp2",
"Exp2")})
comparison = comparisons.AbsoluteDifference("Experiment", "Control")
output = core.Analyze(data).relative_to(comparison).calculate(
(metrics.Sum("X"), metrics.Sum("Y"))).run()
correct = pd.DataFrame(
{"sum(X) Absolute Difference": (60 - 6, 600 - 6),
"sum(Y) Absolute Difference": (12 - 3, 21 - 3)},
index=pd.Index(
("Exp1", "Exp2"), name="Experiment"))
self.assertTrue(output.equals(correct))
def testRelativeToJackknife(self):
data = pd.DataFrame({"X": [1, 2, 3, 4, 5, 6, 7, 8, 9],
"Y": [0, 0, 0, 1, 1, 1, 2, 2, 2]})
metric = metrics.Sum("X")
comparison = comparisons.AbsoluteDifference("Y", 0)
se_method = standard_errors.Jackknife()
output = core.Analyze(data).relative_to(comparison).with_standard_errors(
se_method).calculate(metric).run()
rowindex = pd.Index([1, 2], name="Y")
correct = pd.DataFrame(
np.array([[9.0, np.sqrt(5 * np.var([12, 11, 10, 5, 4, 3]))],
[18.0, np.sqrt(5 * np.var([21, 20, 19, 11, 10, 9]))]]),
columns=("sum(X) Absolute Difference",
"sum(X) Absolute Difference Jackknife SE"),
index=rowindex)
self.assertTrue(output.equals(correct))
def testRelativeToJackknifeIncludeBaseline(self):
data = pd.DataFrame({"X": [1, 2, 3, 4, 5, 6, 7, 8, 9],
"Y": [0, 0, 0, 1, 1, 1, 2, 2, 2]})
metric = metrics.Sum("X")
comparison = comparisons.AbsoluteDifference("Y", 0, include_base=True)
se_method = standard_errors.Jackknife()
output = core.Analyze(data).relative_to(comparison).with_standard_errors(
se_method).calculate(metric).run()
rowindex = pd.Index([0, 1, 2], name="Y")
correct = pd.DataFrame(
np.array([[0.0, 0.0],
[9.0, np.sqrt(5 * np.var([12, 11, 10, 5, 4, 3]))],
[18.0, np.sqrt(5 * np.var([21, 20, 19, 11, 10, 9]))]]),
columns=("sum(X) Absolute Difference",
"sum(X) Absolute Difference Jackknife SE"),
index=rowindex)
self.assertTrue(output.equals(correct))
def testRelativeToJackknifeSingleComparisonBaselineFirst(self):
data = pd.DataFrame({"X": [1, 2, 3, 4, 5, 6], "Y": [0, 0, 0, 1, 1, 1]})
metric = metrics.Sum("X")
comparison = comparisons.AbsoluteDifference("Y", 0)
se_method = standard_errors.Jackknife()
output = core.Analyze(data).relative_to(comparison).with_standard_errors(
se_method).calculate(metric).run()
rowindex = pd.Index([1], name="Y")
correct = pd.DataFrame(
np.array([[9.0, np.sqrt(5 * np.var([12, 11, 10, 5, 4, 3]))]]),
columns=("sum(X) Absolute Difference",
"sum(X) Absolute Difference Jackknife SE"),
index=rowindex)
self.assertTrue(output.equals(correct))
def testRelativeToJackknifeSingleComparisonBaselineSecond(self):
data = pd.DataFrame({"X": [1, 2, 3, 4, 5, 6], "Y": [0, 0, 0, 1, 1, 1]})
metric = metrics.Sum("X")
comparison = comparisons.AbsoluteDifference("Y", 1)
se_method = standard_errors.Jackknife()
output = core.Analyze(data).relative_to(comparison).with_standard_errors(
se_method).calculate(metric).run()
rowindex = pd.Index([0], name="Y")
correct = pd.DataFrame(
np.array([[-9.0, np.sqrt(5 * np.var([12, 11, 10, 5, 4, 3]))]]),
columns=("sum(X) Absolute Difference",
"sum(X) Absolute Difference Jackknife SE"),
index=rowindex)
self.assertTrue(output.equals(correct))
def testSplitJackknife(self):
data = pd.DataFrame({"X": np.array([range(11) + [5] * 10]).flatten(),
"Y": np.array([[0] * 11 + [1] * 10]).flatten()})
metric = metrics.Sum("X")
se_method = standard_errors.Jackknife()
output = core.Analyze(data).split_by("Y").with_standard_errors(
se_method).calculate(metric).run()
rowindex = pd.Index([0, 1], name="Y")
correct = pd.DataFrame(
np.array([[55.0, 10.0], [50.0, 0.0]]),
columns=("sum(X)", "sum(X) Jackknife SE"),
index=rowindex)
self.assertTrue(output.equals(correct))
def test_storage_restore_schema_with_primary_key():
data = [
('a',),
('b',),
]
index = pd.Index([1, 2], name='key')
df = pd.DataFrame(data, columns=('value',), index=index)
storage = Storage(dataframes={'data': df})
assert list(storage.read('data')) == [[1, 'a'], [2, 'b']]
assert storage.describe('data') == {
'primaryKey': 'key',
'fields': [
{'name': 'key', 'type': 'integer', 'constraints': {'required': True}},
{'name': 'value', 'type': 'string'},
]
}
def test_dataframe_to_tsv_taxonomy_format(self):
index = pd.Index(['seq1', 'seq2'], name='Feature ID', dtype=object)
columns = ['Taxon', 'Foo', 'Bar']
df = pd.DataFrame([['taxon1', '42', 'foo'], ['taxon2', '43', 'bar']],
index=index, columns=columns, dtype=object)
exp = (
'Feature ID\tTaxon\tFoo\tBar\n'
'seq1\ttaxon1\t42\tfoo\n'
'seq2\ttaxon2\t43\tbar\n'
)
transformer = self.get_transformer(pd.DataFrame, TSVTaxonomyFormat)
obs = transformer(df)
with obs.open() as fh:
self.assertEqual(fh.read(), exp)
def test_series_to_tsv_taxonomy_format(self):
index = pd.Index(['emrakul', 'peanut'], name='Feature ID',
dtype=object)
series = pd.Series(['taxon1', 'taxon2'],
index=index, name='Taxon', dtype=object)
exp = (
'Feature ID\tTaxon\n'
'emrakul\ttaxon1\n'
'peanut\ttaxon2\n'
)
transformer = self.get_transformer(pd.Series, TSVTaxonomyFormat)
obs = transformer(series)
with obs.open() as fh:
self.assertEqual(fh.read(), exp)
def test_tsv_taxonomy_format_to_metadata(self):
_, obs = self.transform_format(TSVTaxonomyFormat, qiime2.Metadata,
os.path.join('taxonomy',
'3-column.tsv'))
index = pd.Index(['seq1', 'seq2'], name='Feature ID', dtype=object)
exp_df = pd.DataFrame([['k__Foo; p__Bar', '-1.0'],
['k__Foo; p__Baz', '-42.0']], index=index,
columns=['Taxon', 'Confidence'], dtype=object)
exp = qiime2.Metadata(exp_df)
self.assertEqual(exp, obs)
# In-depth testing of the `_taxonomy_formats_to_dataframe` helper function,
# which does the heavy lifting for the transformers.
def test_3_columns(self):
index = pd.Index(['seq1', 'seq2'], name='Feature ID', dtype=object)
exp = pd.DataFrame([['k__Foo; p__Bar', '-1.0'],
['k__Foo; p__Baz', '-42.0']], index=index,
columns=['Taxon', 'Confidence'], dtype=object)
# has_header=None (default)
obs = _taxonomy_formats_to_dataframe(
self.get_data_path(os.path.join('taxonomy', '3-column.tsv')))
assert_frame_equal(obs, exp)
# has_header=True
obs = _taxonomy_formats_to_dataframe(
self.get_data_path(os.path.join('taxonomy', '3-column.tsv')),
has_header=True)
assert_frame_equal(obs, exp)
def test_valid_but_messy_file(self):
index = pd.Index(
['SEQUENCE1', 'seq2'], name='Feature ID', dtype=object)
exp = pd.DataFrame([['k__Bar; p__Baz', 'foo'],
['some; taxonomy; for; ya', 'bar baz']],
index=index, columns=['Taxon', 'Extra Column'],
dtype=object)
# has_header=None (default)
obs = _taxonomy_formats_to_dataframe(
self.get_data_path(os.path.join('taxonomy',
'valid-but-messy.tsv')))
assert_frame_equal(obs, exp)
# has_header=True
obs = _taxonomy_formats_to_dataframe(
self.get_data_path(os.path.join('taxonomy',
'valid-but-messy.tsv')),
has_header=True)
assert_frame_equal(obs, exp)
def test_headerless(self):
index = pd.Index(['seq1', 'seq2'], name='Feature ID', dtype=object)
columns = ['Taxon', 'Unnamed Column 1', 'Unnamed Column 2']
exp = pd.DataFrame([['k__Foo; p__Bar', 'some', 'another'],
['k__Foo; p__Baz', 'column', 'column!']],
index=index, columns=columns, dtype=object)
# has_header=None (default)
obs = _taxonomy_formats_to_dataframe(
self.get_data_path(os.path.join('taxonomy',
'headerless.tsv')))
assert_frame_equal(obs, exp)
# has_header=False
obs = _taxonomy_formats_to_dataframe(
self.get_data_path(os.path.join('taxonomy',
'headerless.tsv')),
has_header=False)
assert_frame_equal(obs, exp)
# In-depth testing of the `_dataframe_to_tsv_taxonomy_format` helper function,
# which does the heavy lifting for the transformers.
def find_missing_products():
train = pd.read_csv('/Users/srinath/playground/data-science/BimboInventoryDemand/train.csv')
train_ids = train['Producto_ID'].unique()
test = pd.read_csv('/Users/srinath/playground/data-science/BimboInventoryDemand/test.csv')
test_ids = test['Producto_ID'].unique()
missing_ids = pd.Index(test_ids).difference(pd.Index(train_ids))
print "missing ID count ", len(missing_ids)
missing_ids_df = pd.DataFrame(missing_ids, columns=["Producto_ID"])
missing_ids_df.to_csv('missing_ids.csv', index=False)
entries_with_missing = pd.merge(test, missing_ids_df, on='Producto_ID')
print "Mising entries=", entries_with_missing.shape[0], "percentage=", entries_with_missing.shape[0]*100/test.shape[0]
print "full entries count", test.shape[0]
def at_time(self, time, asof=False):
"""
Select values at particular time of day (e.g. 9:30AM).
Parameters
----------
time : datetime.time or string
Returns
-------
values_at_time : type of caller
"""
try:
indexer = self.index.indexer_at_time(time, asof=asof)
return self.take(indexer, convert=False)
except AttributeError:
raise TypeError('Index must be DatetimeIndex')
def between_time(self, start_time, end_time, include_start=True,
include_end=True):
"""
Select values between particular times of the day (e.g., 9:00-9:30 AM).
Parameters
----------
start_time : datetime.time or string
end_time : datetime.time or string
include_start : boolean, default True
include_end : boolean, default True
Returns
-------
values_between_time : type of caller
"""
try:
indexer = self.index.indexer_between_time(
start_time, end_time, include_start=include_start,
include_end=include_end)
return self.take(indexer, convert=False)
except AttributeError:
raise TypeError('Index must be DatetimeIndex')
def _isnull_old(obj):
"""Detect missing values. Treat None, NaN, INF, -INF as null.
Parameters
----------
arr: ndarray or object value
Returns
-------
boolean ndarray or boolean
"""
if lib.isscalar(obj):
return lib.checknull_old(obj)
# hack (for now) because MI registers as ndarray
elif isinstance(obj, pd.MultiIndex):
raise NotImplementedError("isnull is not defined for MultiIndex")
elif isinstance(obj, (ABCSeries, np.ndarray, pd.Index)):
return _isnull_ndarraylike_old(obj)
elif isinstance(obj, ABCGeneric):
return obj._constructor(obj._data.isnull(func=_isnull_old))
elif isinstance(obj, list) or hasattr(obj, '__array__'):
return _isnull_ndarraylike_old(np.asarray(obj))
else:
return obj is None
def test_period_resample_with_local_timezone_pytz(self):
# GH5430
tm._skip_if_no_pytz()
import pytz
local_timezone = pytz.timezone('America/Los_Angeles')
start = datetime(year=2013, month=11, day=1, hour=0, minute=0,
tzinfo=pytz.utc)
# 1 day later
end = datetime(year=2013, month=11, day=2, hour=0, minute=0,
tzinfo=pytz.utc)
index = pd.date_range(start, end, freq='H')
series = pd.Series(1, index=index)
series = series.tz_convert(local_timezone)
result = series.resample('D', kind='period').mean()
# Create the expected series
# Index is moved back a day with the timezone conversion from UTC to
# Pacific
expected_index = (pd.period_range(start=start, end=end, freq='D') - 1)
expected = pd.Series(1, index=expected_index)
assert_series_equal(result, expected)
def test_period_resample_with_local_timezone_dateutil(self):
# GH5430
tm._skip_if_no_dateutil()
import dateutil
local_timezone = 'dateutil/America/Los_Angeles'
start = datetime(year=2013, month=11, day=1, hour=0, minute=0,
tzinfo=dateutil.tz.tzutc())
# 1 day later
end = datetime(year=2013, month=11, day=2, hour=0, minute=0,
tzinfo=dateutil.tz.tzutc())
index = pd.date_range(start, end, freq='H')
series = pd.Series(1, index=index)
series = series.tz_convert(local_timezone)
result = series.resample('D', kind='period').mean()
# Create the expected series
# Index is moved back a day with the timezone conversion from UTC to
# Pacific
expected_index = (pd.period_range(start=start, end=end, freq='D') - 1)
expected = pd.Series(1, index=expected_index)
assert_series_equal(result, expected)
def test_dayfirst(self):
# GH 5917
arr = ['10/02/2014', '11/02/2014', '12/02/2014']
expected = DatetimeIndex([datetime(2014, 2, 10), datetime(2014, 2, 11),
datetime(2014, 2, 12)])
idx1 = DatetimeIndex(arr, dayfirst=True)
idx2 = DatetimeIndex(np.array(arr), dayfirst=True)
idx3 = to_datetime(arr, dayfirst=True)
idx4 = to_datetime(np.array(arr), dayfirst=True)
idx5 = DatetimeIndex(Index(arr), dayfirst=True)
idx6 = DatetimeIndex(Series(arr), dayfirst=True)
self.assertTrue(expected.equals(idx1))
self.assertTrue(expected.equals(idx2))
self.assertTrue(expected.equals(idx3))
self.assertTrue(expected.equals(idx4))
self.assertTrue(expected.equals(idx5))
self.assertTrue(expected.equals(idx6))
def test_to_datetime_format(self):
values = ['1/1/2000', '1/2/2000', '1/3/2000']
results1 = [Timestamp('20000101'), Timestamp('20000201'),
Timestamp('20000301')]
results2 = [Timestamp('20000101'), Timestamp('20000102'),
Timestamp('20000103')]
for vals, expecteds in [(values, (Index(results1), Index(results2))),
(Series(values),
(Series(results1), Series(results2))),
(values[0], (results1[0], results2[0])),
(values[1], (results1[1], results2[1])),
(values[2], (results1[2], results2[2]))]:
for i, fmt in enumerate(['%d/%m/%Y', '%m/%d/%Y']):
result = to_datetime(vals, format=fmt)
expected = expecteds[i]
if isinstance(expected, Series):
assert_series_equal(result, Series(expected))
elif isinstance(expected, Timestamp):
self.assertEqual(result, expected)
else:
self.assertTrue(result.equals(expected))
def test_asobject_tolist(self):
idx = timedelta_range(start='1 days', periods=4, freq='D', name='idx')
expected_list = [Timedelta('1 days'), Timedelta('2 days'),
Timedelta('3 days'), Timedelta('4 days')]
expected = pd.Index(expected_list, dtype=object, name='idx')
result = idx.asobject
self.assertTrue(isinstance(result, Index))
self.assertEqual(result.dtype, object)
self.assertTrue(result.equals(expected))
self.assertEqual(result.name, expected.name)
self.assertEqual(idx.tolist(), expected_list)
idx = TimedeltaIndex([timedelta(days=1), timedelta(days=2), pd.NaT,
timedelta(days=4)], name='idx')
expected_list = [Timedelta('1 days'), Timedelta('2 days'), pd.NaT,
Timedelta('4 days')]
expected = pd.Index(expected_list, dtype=object, name='idx')
result = idx.asobject
self.assertTrue(isinstance(result, Index))
self.assertEqual(result.dtype, object)
self.assertTrue(result.equals(expected))
self.assertEqual(result.name, expected.name)
self.assertEqual(idx.tolist(), expected_list)
def prior_values_index(self):
index_values = list(
product(
(freq.freq_str for freq in self.unique_frequencies),
# Only store prior values for forward-fillable fields.
self.ffillable_fields,
)
)
if index_values:
return pd.MultiIndex.from_tuples(index_values)
else:
# MultiIndex doesn't gracefully support empty input, so we return
# an empty regular Index if we have no values.
return pd.Index(index_values)
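A quick check of why the fallback exists; in the pandas versions this code targets, building a MultiIndex from an empty list raises, so a flat empty Index is used instead (sketch only):
import pandas as pd

index_values = []
try:
    idx = pd.MultiIndex.from_tuples(index_values)
except (TypeError, ValueError):
    idx = pd.Index(index_values)
print(idx)  # Index([], dtype='object')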
def add_sids(self, to_add):
"""
Add new sids to the container.
"""
self.sids = pd.Index(
sorted(self.sids.union(_ensure_index(to_add))),
)
self._realign_sids()
def drop_sids(self, to_drop):
"""
Remove sids from the container.
"""
self.sids = pd.Index(
sorted(self.sids.difference(_ensure_index(to_drop))),
)
self._realign_sids()
def _ensure_index(x):
if not isinstance(x, pd.Index):
x = pd.Index(sorted(x))
return x
def test_df_of_assets_as_input(self):
algo = TestRegisterTransformAlgorithm(
sim_params=self.sim_params,
env=TradingEnvironment(), # new env without assets
)
df = self.df.copy()
df.columns = pd.Index(map(Equity, df.columns))
algo.run(df)
assert isinstance(algo.sources[0], DataFrameSource)
def index(self):
"""Return dask Index instance"""
name = self._name + '-index'
dsk = {(name, i): (getattr, key, 'index')
for i, key in enumerate(self._keys())}
return Index(merge(dsk, self.dask), name,
self._meta.index, self.divisions)
def _daskify(obj, npartitions=None, chunksize=None):
"""Convert input to a dask-gdf object.
"""
npartitions = npartitions or 1
if isinstance(obj, _Frame):
return obj
elif isinstance(obj, (pd.DataFrame, pd.Series, pd.Index)):
return _daskify(dd.from_pandas(obj, npartitions=npartitions))
elif isinstance(obj, (gd.DataFrame, gd.Series, gd.index.Index)):
return from_pygdf(obj, npartitions=npartitions)
elif isinstance(obj, (dd.DataFrame, dd.Series, dd.Index)):
return from_dask_dataframe(obj)
else:
raise TypeError("type {} is not supported".format(type(obj)))
def concat(objs):
"""Concantenate dask gdf objects
Parameters
----------
objs : sequence of DataFrame, Series, Index
A sequence of objects to be concatenated.
"""
objs = [_daskify(x) for x in objs]
meta = gd.concat(_extract_meta(objs))
name = "concat-" + uuid4().hex
dsk = {}
divisions = [0]
base = 0
lastdiv = 0
for obj in objs:
for k, i in obj._keys():
dsk[name, base + i] = k, i
base += obj.npartitions
divisions.extend([d + lastdiv for d in obj.divisions[1:]])
lastdiv = obj.divisions[-1]
dasks = [o.dask for o in objs]
dsk = merge(dsk, *dasks)
return new_dd_object(dsk, name, meta, divisions)