Python pandas 模块,Index() 实例源码


项目:qiime2    作者:qiime2    | 项目源码 | 文件源码
def test_filter_to_numeric(self):
        """Check ``filter(column_type='numeric')`` on string-encoded columns.

        Each scenario wraps an object-dtype frame of strings in
        ``qiime2.Metadata`` and compares the filtered frame with a frame
        holding the parsed numbers for the columns expected to survive.
        """
        index = pd.Index(['a', 'b', 'c'], dtype=object)
        df = pd.DataFrame({'col1': ['2', '1', '3'],
                           'col2': ['two', 'one', 'three']},
                          index=index, dtype=object)
        metadata = qiime2.Metadata(df)

        obs_df = metadata.filter(column_type='numeric').to_dataframe()
        # Fixed: a doubled comma after the data dict was a syntax error.
        # NOTE(review): an argument may have sat between the two commas
        # (possibly a dtype) -- confirm against the upstream test.
        exp_df = pd.DataFrame({'col1': [2, 1, 3]}, index=index)
        pdt.assert_frame_equal(obs_df, exp_df)

        # 'col2' mixes numeric strings with 'three'; only col1 and col3 are
        # expected in the filtered result.
        df = pd.DataFrame({'col1': ['2', '1', '3'],
                           'col2': ['2', '1', 'three'],
                           'col3': ['4.0', '5.2', '6.9']},
                          index=index, dtype=object)
        metadata = qiime2.Metadata(df)

        obs_df = metadata.filter(column_type='numeric').to_dataframe()
        exp_df = pd.DataFrame({'col1': [2, 1, 3],
                               'col3': [4.0, 5.2, 6.9]}, index=index)
        pdt.assert_frame_equal(obs_df, exp_df)
        # NOTE(review): the snippet ended with the dangling fragment
        # "{'col1':, 'col3': np.float})", apparently the tail of a dtype
        # assertion whose beginning is missing. It could not be
        # reconstructed; assert_frame_equal above already compares dtypes
        # by default.
项目:zipline-chinese    作者:zhanghan1990    | 项目源码 | 文件源码
def _add_field(self, field):
        # NOTE(review): this snippet is truncated. The next line is docstring
        # text whose triple quotes are missing, and the ``reindex(`` call near
        # the end has lost its arguments and closing parenthesis. Restore both
        # from the upstream source before using this code.
        Adds a new field to the container.
        # self.fields is already sorted, so we just need to insert the new
        # field in the correct index.
        ls = list(self.fields)
        # insort_left keeps ``ls`` ordered while inserting ``field``.
        insort_left(ls, field)
        self.fields = pd.Index(ls)
        # unset fillable fields cache
        self._ffillable_fields = None

        self.last_known_prior_values = self.last_known_prior_values.reindex(
        return field
项目:q2-diversity    作者:qiime2    | 项目源码 | 文件源码
def test_some_duplicates_in_category(self):
        """Reindex by 'pet' when two of the three samples share a value.

        The two frames returned by ``_reindex_with_metadata`` are compared,
        in order, with the expected frames built below.
        """
        level_names = ['depth', 'iter']
        column_tuples = [(1, 1), (1, 2), (200, 1), (200, 2), ('pet', '')]
        columns = pd.MultiIndex.from_tuples(column_tuples, names=level_names)
        rows = [[1, 2, 3, 4, 'russ'],
                [5, 6, 7, 8, 'milo'],
                [9, 10, 11, 12, 'russ']]
        data = pd.DataFrame(data=rows, columns=columns,
                            index=['S1', 'S2', 'S3'])

        obs = _reindex_with_metadata('pet', ['pet'], data)

        exp_col = pd.MultiIndex(levels=[[1, 200, 'pet'], [1, 2, '']],
                                labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
                                names=level_names)
        exp_ind = pd.Index(['milo', 'russ'], name='pet')

        # One expected value table per returned frame, in order.
        expected_tables = (
            [[5, 6, 7, 8], [5, 6, 7, 8]],
            [[1, 1, 1, 1], [2, 2, 2, 2]],
        )
        for position, table in enumerate(expected_tables):
            exp = pd.DataFrame(data=table, columns=exp_col, index=exp_ind)
            pdt.assert_frame_equal(exp, obs[position])
项目:q2-diversity    作者:qiime2    | 项目源码 | 文件源码
def test_all_identical(self):
        """Reindex by 'pet' when every sample carries the same value.

        The two frames returned by ``_reindex_with_metadata`` are compared,
        in order, with single-row expected frames.
        """
        level_names = ['depth', 'iter']
        column_tuples = [(1, 1), (1, 2), (200, 1), (200, 2), ('pet', '')]
        columns = pd.MultiIndex.from_tuples(column_tuples, names=level_names)
        rows = [[1, 2, 3, 4, 'russ'],
                [5, 6, 7, 8, 'russ'],
                [9, 10, 11, 12, 'russ']]
        data = pd.DataFrame(data=rows, columns=columns,
                            index=['S1', 'S2', 'S3'])

        obs = _reindex_with_metadata('pet', ['pet'], data)

        exp_col = pd.MultiIndex(levels=[[1, 200, 'pet'], [1, 2, '']],
                                labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
                                names=level_names)
        exp_ind = pd.Index(['russ'], name='pet')

        # One expected value table per returned frame, in order.
        expected_tables = (
            [[5, 6, 7, 8]],
            [[3, 3, 3, 3]],
        )
        for position, table in enumerate(expected_tables):
            exp = pd.DataFrame(data=table, columns=exp_col, index=exp_ind)
            pdt.assert_frame_equal(exp, obs[position])
项目:plotnine    作者:has2k1    | 项目源码 | 文件源码
def cross_join(df1, df2):
    """
    Return a dataframe that is a cross between dataframes
    df1 and df2

    Every row of ``df1`` is paired with every row of ``df2``. If either
    frame is empty the other one is returned as-is (the same object).
    The inputs are not modified.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        Columns of ``df1`` followed by the columns of ``df2``.
    """
    if len(df1) == 0:
        return df2

    if len(df2) == 0:
        return df1

    # Add as lists so that the new index keeps the items in
    # the order that they are added together
    all_columns = pd.Index(list(df1.columns) + list(df2.columns))

    # Join on a constant helper column. ``assign`` works on copies, so the
    # caller's frames do not gain a stray 'key' column as they did before.
    left = df1.assign(key=1)
    right = df2.assign(key=1)
    return pd.merge(left, right, on='key').loc[:, all_columns]
项目:extra-trees    作者:allrod5    | 项目源码 | 文件源码
def _split_sample(
            split: Callable[[object], bool], X: np.ndarray, y: np.ndarray
    ) -> Tuple[Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]:
        """
        Split X, y sample set in two with a split function

        ``split`` is read for three attributes: ``type``, ``attribute``
        (column of ``X`` to test) and ``criteria``. For a ``'numerical'``
        split, rows whose attribute value is below ``criteria`` go left.
        Otherwise rows whose attribute value is found in ``criteria`` go
        left.

        :return: ((X_left, y_left), (X_right, y_right))
        """
        column = X[:, split.attribute]
        # Fixed: compare strings with ``==``; ``is`` tests identity and only
        # happened to work when the literal was interned.
        if split.type == 'numerical':
            left_indexes = column < split.criteria
            right_indexes = ~left_indexes
        else:
            # NOTE(review): the snippet lost the ``else:`` line and the
            # receiver of ``.get_indexer``. ``pd.Index(split.criteria)`` is a
            # reconstruction -- confirm against the upstream source.
            # get_indexer yields -1 for values absent from the index.
            Z = pd.Index(split.criteria).get_indexer(column)
            left_indexes = np.where(Z >= 0)[0]
            right_indexes = np.where(Z < 0)[0]

        left = X[left_indexes], y[left_indexes]
        right = X[right_indexes], y[right_indexes]

        return left, right
项目:InplusTrader_Linux    作者:zhengwsh    | 项目源码 | 文件源码
def get_dividend(self, order_book_id, adjusted=True):
        """Return the dividend records of one instrument as a DataFrame.

        :param str order_book_id: key selecting the dividend collection
            (the original description was lost to an encoding error)
        :param bool adjusted: when true, read from
            ``self._adjusted_dividends``; otherwise from
            ``self._original_dividends``
        :return: DataFrame indexed by ``announcement_date`` with columns
            ``book_closure_date``, ``ex_dividend_date``, ``payable_date``,
            ``dividend_cash_before_tax`` and ``round_lot``
        """
        # Fixed: the ``else:`` branch was missing, so the adjusted query was
        # always overwritten by the unadjusted one.
        if adjusted:
            collection = self._adjusted_dividends[order_book_id]
        else:
            collection = self._original_dividends[order_book_id]

        # Query once and materialise the rows. The earlier code issued a
        # separate query per column, which is slower and could mix rows from
        # different snapshots.
        records = list(collection.find({}, {"_id": 0}))

        def timestamps(field):
            return pd.Index([pd.Timestamp(rec[field]) for rec in records])

        result = pd.DataFrame({
            'book_closure_date': timestamps('book_closure_date'),
            'ex_dividend_date': timestamps('ex_dividend_date'),
            'payable_date': timestamps('payable_date'),
            'dividend_cash_before_tax': [
                rec['dividend_cash_before_tax'] for rec in records],
            'round_lot': [rec['round_lot'] for rec in records]
        }, index=timestamps('announcement_date'))

        return result
项目:InplusTrader_Linux    作者:zhengwsh    | 项目源码 | 文件源码
def get_yield_curve(self, start_date, end_date, tenor):
        """Return the yield-curve rows between two dates (both inclusive).

        ``self._dates`` is searched with ``searchsorted`` (so it is assumed
        sorted) using ``YYYYMMDD`` integers; ``self._table`` is sliced with
        the resulting positions and must expose a ``date`` column.

        :param start_date: object with ``year``/``month``/``day``
        :param end_date: object with ``year``/``month``/``day``
        :param tenor: column label to return, or ``None`` for all columns
        :return: ``None`` when the range is empty, otherwise a DataFrame (or
            the selected column) indexed by timestamp
        """
        # Fixed: both expressions were cut off after the trailing ``+``; the
        # day component completes the YYYYMMDD integer.
        d1 = start_date.year * 10000 + start_date.month * 100 + start_date.day
        d2 = end_date.year * 10000 + end_date.month * 100 + end_date.day

        s = self._dates.searchsorted(d1)
        e = self._dates.searchsorted(d2, side='right')

        # Clamp to the last valid position before indexing below.
        if e == len(self._dates):
            e -= 1
        if self._dates[e] == d2:
            # include end_date (original comment was lost to an encoding
            # error)
            e += 1

        if e < s:
            return None

        df = pd.DataFrame(self._table[s:e])
        df.index = pd.Index([pd.Timestamp(str(d)) for d in df['date']])
        del df['date']

        # Move the first character of each column name to the end.
        df.rename(columns=lambda n: n[1:]+n[0], inplace=True)
        if tenor is not None:
            return df[tenor]
        return df
项目:InplusTrader_Linux    作者:zhengwsh    | 项目源码 | 文件源码
def get_dividend(self, order_book_id, adjusted=True):
        """Return the dividend records of one instrument as a DataFrame.

        :param str order_book_id: key selecting the dividend collection
            (the original description was lost to an encoding error)
        :param bool adjusted: when true, read from
            ``self._adjusted_dividends``; otherwise from
            ``self._original_dividends``
        :return: DataFrame indexed by ``announcement_date`` with columns
            ``book_closure_date``, ``ex_dividend_date``, ``payable_date``,
            ``dividend_cash_before_tax`` and ``round_lot``
        """
        # Fixed: the ``else:`` branch was missing, so the adjusted query was
        # always overwritten by the unadjusted one.
        if adjusted:
            collection = self._adjusted_dividends[order_book_id]
        else:
            collection = self._original_dividends[order_book_id]

        # Query once and materialise the rows. The earlier code issued a
        # separate query per column, which is slower and could mix rows from
        # different snapshots.
        records = list(collection.find({}, {"_id": 0}))

        def timestamps(field):
            return pd.Index([pd.Timestamp(rec[field]) for rec in records])

        result = pd.DataFrame({
            'book_closure_date': timestamps('book_closure_date'),
            'ex_dividend_date': timestamps('ex_dividend_date'),
            'payable_date': timestamps('payable_date'),
            'dividend_cash_before_tax': [
                rec['dividend_cash_before_tax'] for rec in records],
            'round_lot': [rec['round_lot'] for rec in records]
        }, index=timestamps('announcement_date'))

        return result
项目:InplusTrader_Linux    作者:zhengwsh    | 项目源码 | 文件源码
def get_yield_curve(self, start_date, end_date, tenor):
        """Return the yield-curve rows between two dates (both inclusive).

        ``self._dates`` is searched with ``searchsorted`` (so it is assumed
        sorted) using ``YYYYMMDD`` integers; ``self._table`` is sliced with
        the resulting positions and must expose a ``date`` column.

        :param start_date: object with ``year``/``month``/``day``
        :param end_date: object with ``year``/``month``/``day``
        :param tenor: column label to return, or ``None`` for all columns
        :return: ``None`` when the range is empty, otherwise a DataFrame (or
            the selected column) indexed by timestamp
        """
        # Fixed: both expressions were cut off after the trailing ``+``; the
        # day component completes the YYYYMMDD integer.
        d1 = start_date.year * 10000 + start_date.month * 100 + start_date.day
        d2 = end_date.year * 10000 + end_date.month * 100 + end_date.day

        s = self._dates.searchsorted(d1)
        e = self._dates.searchsorted(d2, side='right')

        # Clamp to the last valid position before indexing below.
        if e == len(self._dates):
            e -= 1
        if self._dates[e] == d2:
            # include end_date (original comment was lost to an encoding
            # error)
            e += 1

        if e < s:
            return None

        df = pd.DataFrame(self._table[s:e])
        df.index = pd.Index([pd.Timestamp(str(d)) for d in df['date']])
        del df['date']

        # Move the first character of each column name to the end.
        df.rename(columns=lambda n: n[1:]+n[0], inplace=True)
        if tenor is not None:
            return df[tenor]
        return df
项目:FHDMM    作者:aweinstein    | 项目源码 | 文件源码
def fit_behavioral_data():
    """Fit a model for all subjects.

    Reads ``data.pkl`` and fits one ``ML`` model per (subject, cue) pair.

    Returns a DataFrame indexed by subject. Per cue it holds the two fitted
    parameters, the square roots of the diagonal of the inverse Hessian
    (labelled ``se_*``) and one value labelled ``NLL_*``.
    """
    df = pd.read_pickle('data.pkl')
    subjects = df.index.get_level_values('subject').unique()
    # Column layout: [0:4] parameters, [4:8] standard errors, [8:10] NLL.
    data = np.empty((subjects.size, 10))
    cues = (0, 1)
    for i, subject in enumerate(subjects):
        print('Fitting model for subject {}'.format(subject))
        df_s = df.loc[subject]
        for cue in cues:
            ml = ML(df_s[df_s['cue']==cue])
            r = ml.ml_estimation()
            data[i,2*cue:(2*cue+2)] = r.x
            data[i,2*cue+4:2*cue+6] = np.sqrt(np.diag(r.hess_inv.todense()))
            # NOTE(review): the right-hand side was missing in the snippet.
            # ``r.fun`` (the objective value of the optimisation result) is a
            # reconstruction based on the 'NLL_*' column labels -- confirm.
            data[i,cue+8] = r.fun

    model = pd.DataFrame(data, pd.Index(subjects, name='subject'),
                         ['alpha_0', 'beta_0', 'alpha_1', 'beta_1',
                          'se_alpha_0', 'se_beta_0', 'se_alpha_1', 'se_beta_1',
                          'NLL_0', 'NLL_1'])
    return model
项目:coquery    作者:gkunter    | 项目源码 | 文件源码
def update_table_models(self, visible=None, hidden=None):
        """Rebuild ``self.table_model`` and ``self.hidden_model``.

        When both arguments are ``None`` the session's output object is
        split into visible and hidden columns according to the manager's
        ``hidden_columns``. Otherwise ``visible`` and ``hidden`` are used
        directly as the data of the two models.
        """
        if visible is None and hidden is None:
            manager = self.Session.get_manager()
            # Iterate over a copy because entries are removed in the loop.
            for x in list(manager.hidden_columns):
                if x not in self.Session.output_object.columns:
                    # NOTE(review): the body of this ``if`` was missing in
                    # the snippet. Dropping hidden columns that no longer
                    # exist is a reconstruction -- confirm upstream.
                    manager.hidden_columns.remove(x)
            hidden_cols = pd.Index(manager.hidden_columns)

            vis_cols = [x for x in self.Session.output_object.columns
                        if x not in hidden_cols]

            to_show = self.Session.output_object[vis_cols]
            to_hide = self.Session.output_object[hidden_cols]
        else:
            # Fixed: without this ``else:`` the computed frames were always
            # overwritten by the (None) arguments.
            to_show = visible
            to_hide = hidden

        self.table_model = classes.CoqTableModel(
            to_show, session=self.Session)
        self.hidden_model = classes.CoqHiddenTableModel(
            to_hide, session=self.Session)
项目:bowtie    作者:jwkvam    | 项目源码 | 文件源码
def json_conversion(obj):
    """Encode additional objects to JSON.

    NumPy arrays/scalars and pandas indexes become lists (or Python
    scalars); ``datetime``/``time``/``date`` become ISO-8601 strings.

    Raises
    ------
    TypeError
        If ``obj`` is none of the supported types.
    """
    # Fixed: the ``try:`` headers and ``pass`` bodies were missing, which
    # left both ``except ImportError:`` clauses dangling.
    try:
        # numpy isn't an explicit dependency of bowtie
        # so we can't assume it's available
        import numpy as np
        if isinstance(obj, (np.ndarray, np.generic)):
            return obj.tolist()
    except ImportError:
        pass

    try:
        # pandas isn't an explicit dependency of bowtie
        # so we can't assume it's available
        import pandas as pd
        if isinstance(obj, pd.Index):
            return obj.tolist()
    except ImportError:
        pass

    if isinstance(obj, (datetime, time, date)):
        return obj.isoformat()
    raise TypeError('Not sure how to serialize {} of type {}'.format(obj, type(obj)))
项目:bowtie    作者:jwkvam    | 项目源码 | 文件源码
def encoders(obj):
    """Convert Python object to msgpack encodable ones.

    NumPy arrays/scalars and pandas indexes become lists (or Python
    scalars); ``datetime``/``time``/``date`` become ISO-8601 strings. Any
    other object is returned unchanged.
    """
    # Fixed: the ``try:`` headers and ``pass`` bodies were missing, which
    # left both ``except ImportError:`` clauses dangling.
    try:
        # numpy isn't an explicit dependency of bowtie
        # so we can't assume it's available
        import numpy as np
        if isinstance(obj, (np.ndarray, np.generic)):
            return obj.tolist()
    except ImportError:
        pass

    try:
        # pandas isn't an explicit dependency of bowtie
        # so we can't assume it's available
        import pandas as pd
        if isinstance(obj, pd.Index):
            return obj.tolist()
    except ImportError:
        pass

    if isinstance(obj, (datetime, time, date)):
        return obj.isoformat()

    return obj
项目:catalyst    作者:enigmampc    | 项目源码 | 文件源码
def batch_market_order(self, share_counts):
        """Place a batch market order for multiple assets.

        Parameters
        ----------
        share_counts : pd.Series[Asset -> int]
            Map from asset to number of shares to order for that asset.

        Returns
        -------
        order_ids : pd.Index[str]
            Index of ids for newly-created orders.
        """
        # Fixed: the docstring and the list below were left unterminated.
        style = MarketOrder()
        # Falsy amounts (e.g. 0) are skipped; every order shares one style.
        order_args = [
            (asset, amount, style)
            for (asset, amount) in iteritems(share_counts)
            if amount
        ]
        return self.blotter.batch_order(order_args)
项目:qiime2    作者:qiime2    | 项目源码 | 文件源码
def test_filter_to_categorical(self):
        """Check ``filter(column_type='categorical')`` on string columns.

        'col1' holds numeric strings and is expected to be dropped; the
        remaining columns are expected to come back unchanged.
        """
        index = pd.Index(['a', 'b', 'c'], dtype=object)
        df = pd.DataFrame({'col1': ['2', '1', '3'],
                           'col2': ['a', 'b', 'c']},
                          index=index, dtype=object)
        metadata = qiime2.Metadata(df)

        obs_df = metadata.filter(column_type='categorical').to_dataframe()
        exp_df = pd.DataFrame({'col2': ['a', 'b', 'c']}, index=index)
        pdt.assert_frame_equal(obs_df, exp_df)

        df = pd.DataFrame({'col1': ['2', '1', '3'],
                           'col2': ['a', 'b', 'c'],
                           'col3': ['peanut', 'hotdog', 'gwar']},
                          index=index, dtype=object)
        metadata = qiime2.Metadata(df)

        obs_df = metadata.filter(column_type='categorical').to_dataframe()
        # NOTE(review): this call was cut off after the data dict.
        # ``index=index`` mirrors the first scenario above; confirm no
        # further arguments were lost.
        exp_df = pd.DataFrame({'col2': ['a', 'b', 'c'],
                               'col3': ['peanut', 'hotdog', 'gwar']},
                              index=index)
        pdt.assert_frame_equal(obs_df, exp_df)
项目:qiime2    作者:qiime2    | 项目源码 | 文件源码
def test_no_columns(self):
        """Loading a metadata file without columns yields an empty frame.

        The expected frame has no columns and an object-dtype index named
        'my-index'.
        """
        fp = pkg_resources.resource_filename(
            'qiime2.tests', 'data/metadata/no-columns.tsv')

        metadata = qiime2.Metadata.load(fp)
        obs_df = metadata.to_dataframe()

        exp_index = pd.Index(['a', 'b', 'id'], name='my-index', dtype=object)
        exp_df = pd.DataFrame({}, index=exp_index, dtype=object)

        # NOTE(review): the opening ``pdt.assert_frame_equal(`` and the end of
        # the argument list were missing from the snippet. The call is
        # rebuilt from the visible keyword arguments; further arguments after
        # ``check_names`` may have been lost -- confirm upstream.
        pdt.assert_frame_equal(
            obs_df, exp_df, check_dtype=True, check_index_type=True,
            check_column_type=True, check_frame_type=True, check_names=True)
项目:qiime2    作者:qiime2    | 项目源码 | 文件源码
def test_index_and_column_names(self):
        """Merging metadata whose index/column names differ.

        The merged result is compared with metadata built from a frame that
        sets no index name and no columns name.
        """
        ids = ['id1', 'id2']

        left_frame = pd.DataFrame(
            {'a': [1, 2]},
            index=pd.Index(ids, name='foo'),
            columns=pd.Index(['a'], name='abc'))
        md1 = qiime2.Metadata(left_frame)

        right_frame = pd.DataFrame(
            {'b': [3, 4]},
            index=pd.Index(ids, name='bar'),
            columns=pd.Index(['b'], name='def'))
        md2 = qiime2.Metadata(right_frame)

        obs = md1.merge(md2)

        merged_frame = pd.DataFrame({'a': [1, 2], 'b': [3, 4]}, index=ids)
        exp = qiime2.Metadata(merged_frame)
        self.assertEqual(obs, exp)
项目:qiime2    作者:qiime2    | 项目源码 | 文件源码
def test_more_complex_expressions(self):
        """``Metadata.ids`` with OR / AND combinations in the where clause."""
        sample_ids = pd.Index(['S1', 'S2', 'S3'], name='id')
        df = pd.DataFrame(
            {'Subject': ['subject-1', 'subject-1', 'subject-2'],
             'SampleType': ['gut', 'tongue', 'gut']},
            index=sample_ids)
        metadata = qiime2.Metadata(df)

        # (where clause, ids expected to match), checked in this order.
        cases = [
            ("Subject='subject-1' OR Subject='subject-2'",
             {'S1', 'S2', 'S3'}),
            ("Subject='subject-1' AND Subject='subject-2'",
             set()),
            ("Subject='subject-1' AND SampleType='gut'",
             {'S1'}),
        ]
        for where, expected in cases:
            actual = metadata.ids(where)
            self.assertEqual(actual, expected)
项目:meterstick    作者:google    | 项目源码 | 文件源码
def testMultipleCalculationsRelativeTo(self):
    # Computes Sum("X") and Sum("Y") relative to the "Control" value of
    # "Experiment" and builds the expected absolute differences.
    # NOTE(review): this snippet is truncated. The "Experiment" tuple and the
    # enclosing DataFrame call are never closed, the ``index=pd.Index(`` head
    # of the last argument line is missing, and no comparison of ``output``
    # with ``correct`` is visible. Restore from upstream before use.
    data = pd.DataFrame({"X": (1, 2, 3, 10, 20, 30, 100, 200, 300),
                         "Y": (0, 1, 2, 3, 4, 5, 6, 7, 8),
                         "Experiment": ("Control", "Control", "Control", "Exp1",
                                        "Exp1", "Exp1", "Exp2", "Exp2",

    comparison = comparisons.AbsoluteDifference("Experiment", "Control")
    output = core.Analyze(data).relative_to(comparison).calculate(
        (metrics.Sum("X"), metrics.Sum("Y"))).run()

    correct = pd.DataFrame(
        {"sum(X) Absolute Difference": (60 - 6, 600 - 6),
         "sum(Y) Absolute Difference": (12 - 3, 21 - 3)},
            ("Exp1", "Exp2"), name="Experiment"))

项目:meterstick    作者:google    | 项目源码 | 文件源码
def testRelativeToJackknife(self):
    # Builds the expected Sum("X") differences relative to Y == 0 together
    # with their jackknife standard errors.
    # NOTE(review): this snippet is truncated. The ``with_standard_errors(``
    # call chain is cut off, the ``pd.DataFrame(`` call for ``correct`` is
    # never closed (``rowindex`` is not used in the visible code), and no
    # comparison of ``output`` with ``correct`` is visible.
    data = pd.DataFrame({"X": [1, 2, 3, 4, 5, 6, 7, 8, 9],
                         "Y": [0, 0, 0, 1, 1, 1, 2, 2, 2]})

    metric = metrics.Sum("X")
    comparison = comparisons.AbsoluteDifference("Y", 0)
    se_method = standard_errors.Jackknife()
    output = core.Analyze(data).relative_to(comparison).with_standard_errors(

    rowindex = pd.Index([1, 2], name="Y")
    correct = pd.DataFrame(
        np.array([[9.0, np.sqrt(5 * np.var([12, 11, 10, 5, 4, 3]))],
                  [18.0, np.sqrt(5 * np.var([21, 20, 19, 11, 10, 9]))]]),
        columns=("sum(X) Absolute Difference",
                 "sum(X) Absolute Difference Jackknife SE"),

项目:meterstick    作者:google    | 项目源码 | 文件源码
def testRelativeToJackknifeIncludeBaseline(self):
    # Same setup as the plain jackknife comparison, but the comparison is
    # created with include_base=True and the expected frame has a Y == 0 row
    # of zeros.
    # NOTE(review): this snippet is truncated. The ``with_standard_errors(``
    # call chain is cut off, the ``pd.DataFrame(`` call for ``correct`` is
    # never closed (``rowindex`` is not used in the visible code), and no
    # comparison of ``output`` with ``correct`` is visible.
    data = pd.DataFrame({"X": [1, 2, 3, 4, 5, 6, 7, 8, 9],
                         "Y": [0, 0, 0, 1, 1, 1, 2, 2, 2]})

    metric = metrics.Sum("X")
    comparison = comparisons.AbsoluteDifference("Y", 0, include_base=True)
    se_method = standard_errors.Jackknife()
    output = core.Analyze(data).relative_to(comparison).with_standard_errors(

    rowindex = pd.Index([0, 1, 2], name="Y")
    correct = pd.DataFrame(
        np.array([[0.0, 0.0],
                  [9.0, np.sqrt(5 * np.var([12, 11, 10, 5, 4, 3]))],
                  [18.0, np.sqrt(5 * np.var([21, 20, 19, 11, 10, 9]))]]),
        columns=("sum(X) Absolute Difference",
                 "sum(X) Absolute Difference Jackknife SE"),

项目:meterstick    作者:google    | 项目源码 | 文件源码
def testRelativeToJackknifeSingleComparisonBaselineFirst(self):
    # Two groups only; the baseline is Y == 0, so the single expected row is
    # labelled Y == 1.
    # NOTE(review): this snippet is truncated. The ``with_standard_errors(``
    # call chain is cut off, the ``pd.DataFrame(`` call for ``correct`` is
    # never closed (``rowindex`` is not used in the visible code), and no
    # comparison of ``output`` with ``correct`` is visible.
    data = pd.DataFrame({"X": [1, 2, 3, 4, 5, 6], "Y": [0, 0, 0, 1, 1, 1]})

    metric = metrics.Sum("X")
    comparison = comparisons.AbsoluteDifference("Y", 0)
    se_method = standard_errors.Jackknife()
    output = core.Analyze(data).relative_to(comparison).with_standard_errors(

    rowindex = pd.Index([1], name="Y")
    correct = pd.DataFrame(
        np.array([[9.0, np.sqrt(5 * np.var([12, 11, 10, 5, 4, 3]))]]),
        columns=("sum(X) Absolute Difference",
                 "sum(X) Absolute Difference Jackknife SE"),

项目:meterstick    作者:google    | 项目源码 | 文件源码
def testRelativeToJackknifeSingleComparisonBaselineSecond(self):
    # Two groups only; the baseline is Y == 1, so the single expected row is
    # labelled Y == 0 and the expected difference is negative.
    # NOTE(review): this snippet is truncated. The ``with_standard_errors(``
    # call chain is cut off, the ``pd.DataFrame(`` call for ``correct`` is
    # never closed (``rowindex`` is not used in the visible code), and no
    # comparison of ``output`` with ``correct`` is visible.
    data = pd.DataFrame({"X": [1, 2, 3, 4, 5, 6], "Y": [0, 0, 0, 1, 1, 1]})

    metric = metrics.Sum("X")
    comparison = comparisons.AbsoluteDifference("Y", 1)
    se_method = standard_errors.Jackknife()
    output = core.Analyze(data).relative_to(comparison).with_standard_errors(

    rowindex = pd.Index([0], name="Y")
    correct = pd.DataFrame(
        np.array([[-9.0, np.sqrt(5 * np.var([12, 11, 10, 5, 4, 3]))]]),
        columns=("sum(X) Absolute Difference",
                 "sum(X) Absolute Difference Jackknife SE"),

项目:meterstick    作者:google    | 项目源码 | 文件源码
def testSplitJackknife(self):
    # Builds the expected Sum("X") per value of "Y" together with its
    # jackknife standard error.
    # NOTE(review): this snippet is truncated. The ``with_standard_errors(``
    # call chain is cut off, the ``pd.DataFrame(`` call for ``correct`` is
    # never closed (``rowindex`` is not used in the visible code), and no
    # comparison of ``output`` with ``correct`` is visible.
    # NOTE(review): ``range(11) + [5] * 10`` only works where ``range``
    # returns a list (Python 2); on Python 3 it raises TypeError.
    data = pd.DataFrame({"X": np.array([range(11) + [5] * 10]).flatten(),
                         "Y": np.array([[0] * 11 + [1] * 10]).flatten()})

    metric = metrics.Sum("X")
    se_method = standard_errors.Jackknife()
    output = core.Analyze(data).split_by("Y").with_standard_errors(

    rowindex = pd.Index([0, 1], name="Y")
    correct = pd.DataFrame(
        np.array([[55.0, 10.0], [50.0, 0.0]]),
        columns=("sum(X)", "sum(X) Jackknife SE"),

项目:tableschema-pandas-py    作者:frictionlessdata    | 项目源码 | 文件源码
def test_storage_restore_schema_with_primary_key():
    # Wraps a frame whose index is named 'key' in a Storage and checks the
    # rows and the described schema (primary key 'key').
    # NOTE(review): this snippet is truncated. The ``data`` list literal is
    # never filled or closed, the first assert has lost the call that wraps
    # ``'data'`` (unbalanced parenthesis), and the expected schema dict is
    # never closed. Restore from upstream before use.
    data = [
    index = pd.Index([1, 2], name='key')
    df = pd.DataFrame(data, columns=('value',), index=index)
    storage = Storage(dataframes={'data': df})
    assert list('data')) == [[1, 'a'], [2, 'b']]
    assert storage.describe('data') == {
        'primaryKey': 'key',
        'fields': [
            {'name': 'key', 'type': 'integer', 'constraints': {'required': True}},
            {'name': 'value', 'type': 'string'},
项目:q2-types    作者:qiime2    | 项目源码 | 文件源码
def test_dataframe_to_tsv_taxonomy_format(self):
        # Transforms an object-dtype DataFrame into TSVTaxonomyFormat and
        # compares the written text with ``exp``.
        # NOTE(review): this snippet is truncated. The ``exp`` string is
        # missing its data rows and closing parenthesis, the ``with``
        # statement has lost its context expression, and the first argument
        # of ``assertEqual`` is missing. Restore from upstream before use.
        index = pd.Index(['seq1', 'seq2'], name='Feature ID', dtype=object)
        columns = ['Taxon', 'Foo', 'Bar']
        df = pd.DataFrame([['taxon1', '42', 'foo'], ['taxon2', '43', 'bar']],
                          index=index, columns=columns, dtype=object)
        exp = (
            'Feature ID\tTaxon\tFoo\tBar\n'

        transformer = self.get_transformer(pd.DataFrame, TSVTaxonomyFormat)
        obs = transformer(df)

        with as fh:
            self.assertEqual(, exp)
项目:q2-types    作者:qiime2    | 项目源码 | 文件源码
def test_series_to_tsv_taxonomy_format(self):
        # Transforms an object-dtype Series named 'Taxon' into
        # TSVTaxonomyFormat and compares the written text with ``exp``.
        # NOTE(review): this snippet is truncated. The ``pd.Index(`` call is
        # cut off after ``name='Feature ID',``, the ``exp`` string is missing
        # its data rows and closing parenthesis, the ``with`` statement has
        # lost its context expression, and the first argument of
        # ``assertEqual`` is missing. Restore from upstream before use.
        index = pd.Index(['emrakul', 'peanut'], name='Feature ID',
        series = pd.Series(['taxon1', 'taxon2'],
                           index=index, name='Taxon', dtype=object)
        exp = (
            'Feature ID\tTaxon\n'

        transformer = self.get_transformer(pd.Series, TSVTaxonomyFormat)
        obs = transformer(series)

        with as fh:
            self.assertEqual(, exp)
项目:q2-types    作者:qiime2    | 项目源码 | 文件源码
def test_tsv_taxonomy_format_to_metadata(self):
        # Transforms a TSVTaxonomyFormat file into qiime2.Metadata and
        # compares it with metadata built from an object-dtype frame.
        # NOTE(review): the ``transform_format(`` call below is cut off after
        # its second argument; the remaining argument(s) and the closing
        # parenthesis are missing. Restore from upstream before use.
        _, obs = self.transform_format(TSVTaxonomyFormat, qiime2.Metadata,

        index = pd.Index(['seq1', 'seq2'], name='Feature ID', dtype=object)
        exp_df = pd.DataFrame([['k__Foo; p__Bar', '-1.0'],
                               ['k__Foo; p__Baz', '-42.0']], index=index,
                              columns=['Taxon', 'Confidence'], dtype=object)
        exp = qiime2.Metadata(exp_df)

        self.assertEqual(exp, obs)

# In-depth testing of the `_taxonomy_formats_to_dataframe` helper function,
# which does the heavy lifting for the transformers.
项目:q2-types    作者:qiime2    | 项目源码 | 文件源码
def test_3_columns(self):
        """Parse the 3-column taxonomy file with default and explicit header.

        Both calls are expected to produce the same object-dtype frame with
        'Taxon' and 'Confidence' columns indexed by 'Feature ID'.
        """
        index = pd.Index(['seq1', 'seq2'], name='Feature ID', dtype=object)
        exp = pd.DataFrame([['k__Foo; p__Bar', '-1.0'],
                            ['k__Foo; p__Baz', '-42.0']], index=index,
                           columns=['Taxon', 'Confidence'], dtype=object)
        path = self.get_data_path(os.path.join('taxonomy', '3-column.tsv'))

        # has_header=None (default)
        obs = _taxonomy_formats_to_dataframe(path)

        assert_frame_equal(obs, exp)

        # has_header=True
        # Fixed: this call was cut off after the path argument; the keyword
        # is taken from the comment above.
        obs = _taxonomy_formats_to_dataframe(path, has_header=True)

        assert_frame_equal(obs, exp)
项目:q2-types    作者:qiime2    | 项目源码 | 文件源码
def test_valid_but_messy_file(self):
        # Expects the helper to return the same frame with the default
        # header detection and with has_header=True.
        # NOTE(review): this snippet is truncated. The ``pd.DataFrame(`` call
        # for ``exp`` is cut off after ``columns=...,`` and both
        # ``_taxonomy_formats_to_dataframe(`` calls have lost their arguments
        # and closing parentheses. Restore from upstream before use.
        index = pd.Index(
            ['SEQUENCE1', 'seq2'], name='Feature ID', dtype=object)
        exp = pd.DataFrame([['k__Bar; p__Baz', 'foo'],
                            ['some; taxonomy; for; ya', 'bar baz']],
                           index=index, columns=['Taxon', 'Extra Column'],

        # has_header=None (default)
        obs = _taxonomy_formats_to_dataframe(

        assert_frame_equal(obs, exp)

        # has_header=True
        obs = _taxonomy_formats_to_dataframe(

        assert_frame_equal(obs, exp)
项目:q2-types    作者:qiime2    | 项目源码 | 文件源码
def test_headerless(self):
        # Expects the helper to return the same frame, with generated
        # 'Unnamed Column N' labels, with the default header detection and
        # with has_header=False.
        # NOTE(review): both ``_taxonomy_formats_to_dataframe(`` calls below
        # have lost their arguments and closing parentheses. Restore from
        # upstream before use.
        index = pd.Index(['seq1', 'seq2'], name='Feature ID', dtype=object)
        columns = ['Taxon', 'Unnamed Column 1', 'Unnamed Column 2']
        exp = pd.DataFrame([['k__Foo; p__Bar', 'some', 'another'],
                            ['k__Foo; p__Baz', 'column', 'column!']],
                           index=index, columns=columns, dtype=object)

        # has_header=None (default)
        obs = _taxonomy_formats_to_dataframe(

        assert_frame_equal(obs, exp)

        # has_header=False
        obs = _taxonomy_formats_to_dataframe(

        assert_frame_equal(obs, exp)

# In-depth testing of the `_dataframe_to_tsv_taxonomy_format` helper function,
# which does the heavy lifting for the transformers.
项目:mlprojects-py    作者:srinathperera    | 项目源码 | 文件源码
def find_missing_products(
        train_path='/Users/srinath/playground/data-science/BimboInventoryDemand/train.csv',
        test_path='/Users/srinath/playground/data-science/BimboInventoryDemand/test.csv',
        output_path='missing_ids.csv'):
    """Report product ids that occur in the test set but not in training.

    Writes the missing ids to ``output_path`` (one ``Producto_ID`` column)
    and prints how many test rows refer to them.

    Parameters
    ----------
    train_path, test_path : str
        CSV files with a ``Producto_ID`` column. The defaults are the
        original hard-coded locations.
    output_path : str
        Destination CSV for the missing ids.
    """
    train = pd.read_csv(train_path)
    train_ids = train['Producto_ID'].unique()
    test = pd.read_csv(test_path)
    test_ids = test['Producto_ID'].unique()

    missing_ids = pd.Index(test_ids).difference(pd.Index(train_ids))
    # Single-argument print calls behave the same on Python 2 and 3; the
    # previous print statements were a syntax error on Python 3.
    print("missing ID count  {}".format(len(missing_ids)))

    missing_ids_df = pd.DataFrame(missing_ids, columns=["Producto_ID"])
    missing_ids_df.to_csv(output_path, index=False)

    entries_with_missing = pd.merge(test, missing_ids_df, on='Producto_ID')

    affected = entries_with_missing.shape[0]
    total = test.shape[0]
    # Float arithmetic: integer division used to truncate small shares to 0.
    percentage = 100.0 * affected / total if total else 0.0
    print("Mising entries= {} percentage= {}".format(affected, percentage))

    print("full entries count {}".format(total))
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    作者:SignalMedia    | 项目源码 | 文件源码
def at_time(self, time, asof=False):
        """
        Select values at particular time of day (e.g. 9:30AM).

        Parameters
        ----------
        time : datetime.time or string
        asof : boolean, default False
            Forwarded unchanged to ``self.index.indexer_at_time``.

        Returns
        -------
        values_at_time : type of caller

        Raises
        ------
        TypeError
            If an AttributeError occurs while selecting, e.g. because the
            index has no ``indexer_at_time``.
        """
        # Fixed: the ``try:`` header and docstring quotes were missing.
        try:
            indexer = self.index.indexer_at_time(time, asof=asof)
            return self.take(indexer, convert=False)
        except AttributeError:
            raise TypeError('Index must be DatetimeIndex')
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    作者:SignalMedia    | 项目源码 | 文件源码
def between_time(self, start_time, end_time, include_start=True,
                 include_end=True):
        """
        Select values between particular times of the day (e.g., 9:00-9:30 AM).

        Parameters
        ----------
        start_time : datetime.time or string
        end_time : datetime.time or string
        include_start : boolean, default True
        include_end : boolean, default True

        Returns
        -------
        values_between_time : type of caller

        Raises
        ------
        TypeError
            If an AttributeError occurs while selecting, e.g. because the
            index has no ``indexer_between_time``.
        """
        # Fixed: the signature tail, the ``try:`` header and the end of the
        # indexer call were missing. ``include_end`` is restored from the
        # parameter list documented above.
        try:
            indexer = self.index.indexer_between_time(
                start_time, end_time, include_start=include_start,
                include_end=include_end)
            return self.take(indexer, convert=False)
        except AttributeError:
            raise TypeError('Index must be DatetimeIndex')
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    作者:SignalMedia    | 项目源码 | 文件源码
def _isnull_old(obj):
    """Detect missing values. Treat None, NaN, INF, -INF as null.

    Parameters
    ----------
    arr: ndarray or object value

    Returns
    -------
    boolean ndarray or boolean

    Raises
    ------
    NotImplementedError
        If ``obj`` is a MultiIndex.
    """
    # Fixed: the docstring was left unterminated and the final ``else:`` was
    # missing, which made the last return unreachable.
    if lib.isscalar(obj):
        return lib.checknull_old(obj)
    # hack (for now) because MI registers as ndarray
    elif isinstance(obj, pd.MultiIndex):
        raise NotImplementedError("isnull is not defined for MultiIndex")
    elif isinstance(obj, (ABCSeries, np.ndarray, pd.Index)):
        return _isnull_ndarraylike_old(obj)
    elif isinstance(obj, ABCGeneric):
        return obj._constructor(obj._data.isnull(func=_isnull_old))
    elif isinstance(obj, list) or hasattr(obj, '__array__'):
        return _isnull_ndarraylike_old(np.asarray(obj))
    else:
        return obj is None
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    作者:SignalMedia    | 项目源码 | 文件源码
def test_period_resample_with_local_timezone_pytz(self):
        """Daily period resample of an hourly series converted via pytz."""
        # GH5430
        import pytz

        local_timezone = pytz.timezone('America/Los_Angeles')

        # NOTE(review): both ``datetime(`` calls were cut off after
        # ``minute=0,``. ``tzinfo=pytz.utc`` is a reconstruction: the series
        # is later passed to ``tz_convert`` and the comment below speaks of
        # a conversion from UTC -- confirm against upstream.
        start = datetime(year=2013, month=11, day=1, hour=0, minute=0,
                         tzinfo=pytz.utc)
        # 1 day later
        end = datetime(year=2013, month=11, day=2, hour=0, minute=0,
                       tzinfo=pytz.utc)

        index = pd.date_range(start, end, freq='H')

        series = pd.Series(1, index=index)
        series = series.tz_convert(local_timezone)
        result = series.resample('D', kind='period').mean()

        # Create the expected series
        # Index is moved back a day with the timezone conversion from UTC to
        # Pacific
        expected_index = (pd.period_range(start=start, end=end, freq='D') - 1)
        expected = pd.Series(1, index=expected_index)
        assert_series_equal(result, expected)
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    作者:SignalMedia    | 项目源码 | 文件源码
def test_period_resample_with_local_timezone_dateutil(self):
        """Daily period resample of an hourly series converted via dateutil."""
        # GH5430
        # Importing the submodule also binds the ``dateutil`` name.
        import dateutil.tz

        local_timezone = 'dateutil/America/Los_Angeles'

        # NOTE(review): both ``datetime(`` calls were cut off after
        # ``minute=0,``. ``tzinfo=dateutil.tz.tzutc()`` is a reconstruction:
        # the series is later passed to ``tz_convert`` and the comment below
        # speaks of a conversion from UTC -- confirm against upstream.
        start = datetime(year=2013, month=11, day=1, hour=0, minute=0,
                         tzinfo=dateutil.tz.tzutc())
        # 1 day later
        end = datetime(year=2013, month=11, day=2, hour=0, minute=0,
                       tzinfo=dateutil.tz.tzutc())

        index = pd.date_range(start, end, freq='H')

        series = pd.Series(1, index=index)
        series = series.tz_convert(local_timezone)
        result = series.resample('D', kind='period').mean()

        # Create the expected series
        # Index is moved back a day with the timezone conversion from UTC to
        # Pacific
        expected_index = (pd.period_range(start=start, end=end, freq='D') - 1)
        expected = pd.Series(1, index=expected_index)
        assert_series_equal(result, expected)
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    作者:SignalMedia    | 项目源码 | 文件源码
def test_dayfirst(self):
        """Build day-first datetime indexes from several input containers.

        The visible code only constructs the indexes; none of them is
        compared with ``expected``.
        """
        # GH 5917
        arr = ['10/02/2014', '11/02/2014', '12/02/2014']
        expected = DatetimeIndex([datetime(2014, 2, 10),
                                  datetime(2014, 2, 11),
                                  datetime(2014, 2, 12)])

        # Same strings, different containers and constructors.
        idx1 = DatetimeIndex(arr, dayfirst=True)
        idx2 = DatetimeIndex(np.array(arr), dayfirst=True)

        idx3 = to_datetime(arr, dayfirst=True)
        idx4 = to_datetime(np.array(arr), dayfirst=True)

        idx5 = DatetimeIndex(Index(arr), dayfirst=True)
        idx6 = DatetimeIndex(Series(arr), dayfirst=True)
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    作者:SignalMedia    | 项目源码 | 文件源码
def test_to_datetime_format(self):
        """``to_datetime`` honours ``format`` for lists, Series and scalars.

        ``results1`` is the day-first reading ('%d/%m/%Y') of ``values`` and
        ``results2`` the month-first reading ('%m/%d/%Y').
        """
        values = ['1/1/2000', '1/2/2000', '1/3/2000']

        # Fixed: both lists were cut off after their second element. The
        # third element follows from parsing values[2] with each format.
        results1 = [Timestamp('20000101'), Timestamp('20000201'),
                    Timestamp('20000301')]
        results2 = [Timestamp('20000101'), Timestamp('20000102'),
                    Timestamp('20000103')]
        # Fixed: the head of the second case was missing; a Series input
        # pairs with the Series expectations.
        for vals, expecteds in [(values, (Index(results1), Index(results2))),
                                (Series(values),
                                 (Series(results1), Series(results2))),
                                (values[0], (results1[0], results2[0])),
                                (values[1], (results1[1], results2[1])),
                                (values[2], (results1[2], results2[2]))]:

            for i, fmt in enumerate(['%d/%m/%Y', '%m/%d/%Y']):
                result = to_datetime(vals, format=fmt)
                expected = expecteds[i]

                if isinstance(expected, Series):
                    assert_series_equal(result, Series(expected))
                elif isinstance(expected, Timestamp):
                    self.assertEqual(result, expected)
                else:
                    # Index expectations previously fell through unchecked.
                    self.assertTrue(result.equals(expected))
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    作者:SignalMedia    | 项目源码 | 文件源码
def test_asobject_tolist(self):
        """``asobject`` gives an object Index; ``tolist`` gives Timedeltas.

        Checked for a regular range and for an index that contains ``NaT``.
        """
        def check(idx, expected_list):
            expected = pd.Index(expected_list, dtype=object, name='idx')
            result = idx.asobject
            self.assertTrue(isinstance(result, Index))
            self.assertEqual(result.dtype, object)
            # Previously ``expected`` was built but never compared.
            self.assertTrue(result.equals(expected))
            self.assertEqual(result.name, expected.name)
            self.assertEqual(idx.tolist(), expected_list)

        idx = timedelta_range(start='1 days', periods=4, freq='D', name='idx')
        check(idx, [Timedelta('1 days'), Timedelta('2 days'),
                    Timedelta('3 days'), Timedelta('4 days')])

        idx = TimedeltaIndex([timedelta(days=1), timedelta(days=2), pd.NaT,
                              timedelta(days=4)], name='idx')
        check(idx, [Timedelta('1 days'), Timedelta('2 days'), pd.NaT,
                    Timedelta('4 days')])
项目:zipline-chinese    作者:zhanghan1990    | 项目源码 | 文件源码
def prior_values_index(self):
        # NOTE(review): this snippet is truncated. The ``list(`` call is never
        # closed (the callable consuming the generator below and its second
        # argument are missing), and the ``else:`` separating the two returns
        # is missing, so the second return is unreachable as shown. Restore
        # from upstream before use.
        index_values = list(
                (freq.freq_str for freq in self.unique_frequencies),
                # Only store prior values for forward-fillable fields.
        if index_values:
            return pd.MultiIndex.from_tuples(index_values)
            # MultiIndex doesn't gracefully support empty input, so we return
            # an empty regular Index if we have no values.
            return pd.Index(index_values)
项目:zipline-chinese    作者:zhanghan1990    | 项目源码 | 文件源码
def add_sids(self, to_add):
        # NOTE(review): this snippet is truncated. The next line is docstring
        # text whose triple quotes are missing, and the ``pd.Index(`` call has
        # lost its arguments and everything that followed. Restore from
        # upstream before use.
        Add new sids to the container.
        self.sids = pd.Index(
项目:zipline-chinese    作者:zhanghan1990    | 项目源码 | 文件源码
def drop_sids(self, to_drop):
        # NOTE(review): this snippet is truncated. The next line is docstring
        # text whose triple quotes are missing, and the ``pd.Index(`` call has
        # lost its arguments and everything that followed. Restore from
        # upstream before use.
        Remove sids from the container.
        self.sids = pd.Index(
项目:zipline-chinese    作者:zhanghan1990    | 项目源码 | 文件源码
def _ensure_index(x):
    """Return ``x`` unchanged if it is a pandas Index, else a sorted Index.

    Any other iterable is sorted first and then wrapped in ``pd.Index``.
    """
    if isinstance(x, pd.Index):
        return x
    ordered = sorted(x)
    return pd.Index(ordered)
项目:zipline-chinese    作者:zhanghan1990    | 项目源码 | 文件源码
def test_df_of_assets_as_input(self):
        # Relabels the columns of a copy of ``self.df`` with Equity objects
        # and asserts that the algorithm's first source is a DataFrameSource.
        # NOTE(review): the ``TestRegisterTransformAlgorithm(`` call is cut
        # off after its ``env`` argument; the remaining arguments, the closing
        # parenthesis and whatever feeds ``df`` to the algorithm are missing.
        # Restore from upstream before use.
        algo = TestRegisterTransformAlgorithm(
            env=TradingEnvironment(),  # new env without assets
        df = self.df.copy()
        df.columns = pd.Index(map(Equity, df.columns))
        assert isinstance(algo.sources[0], DataFrameSource)
项目:dask_gdf    作者:gpuopenanalytics    | 项目源码 | 文件源码
def index(self):
        """Return dask Index instance"""
        index_name = self._name + '-index'

        # One task per partition key: fetch that partition's ``index``.
        graph = {}
        for position, key in enumerate(self._keys()):
            graph[(index_name, position)] = (getattr, key, 'index')

        combined = merge(graph, self.dask)
        return Index(combined, index_name, self._meta.index, self.divisions)
项目:dask_gdf    作者:gpuopenanalytics    | 项目源码 | 文件源码
def _daskify(obj, npartitions=None, chunksize=None):
    """Convert input to a dask-gdf object.

    ``_Frame`` instances are returned as-is. pandas objects go through
    ``dd.from_pandas`` and are converted again; pygdf and dask objects use
    their dedicated converters. ``npartitions`` defaults to 1;
    ``chunksize`` is accepted but not used by this function.

    Raises
    ------
    TypeError
        If ``obj`` is none of the supported types.
    """
    # Fixed: the docstring was left unterminated and the final ``else:`` was
    # missing, which made the TypeError unreachable.
    npartitions = npartitions or 1
    if isinstance(obj, _Frame):
        return obj
    elif isinstance(obj, (pd.DataFrame, pd.Series, pd.Index)):
        return _daskify(dd.from_pandas(obj, npartitions=npartitions))
    elif isinstance(obj, (gd.DataFrame, gd.Series, gd.index.Index)):
        return from_pygdf(obj, npartitions=npartitions)
    elif isinstance(obj, (dd.DataFrame, dd.Series, dd.Index)):
        return from_dask_dataframe(obj)
    else:
        raise TypeError("type {} is not supported".format(type(obj)))
项目:dask_gdf    作者:gpuopenanalytics    | 项目源码 | 文件源码
def concat(objs):
    """Concatenate dask gdf objects

    Parameters
    ----------
    objs : sequence of DataFrame, Series, Index
        A sequence of objects to be concatenated.

    Returns
    -------
    The object built by ``new_dd_object`` from the merged graph.
    """
    # Fixed: the docstring was left unterminated (and misspelt
    # "Concantenate").
    objs = [_daskify(x) for x in objs]
    meta = gd.concat(_extract_meta(objs))

    name = "concat-" + uuid4().hex
    dsk = {}
    divisions = [0]
    base = 0
    lastdiv = 0
    for obj in objs:
        # Re-key every partition of ``obj`` under the new name, offset by
        # the number of partitions already placed.
        for k, i in obj._keys():
            dsk[name, base + i] = k, i
        base += obj.npartitions
        # Shift this object's divisions so they continue where the previous
        # object ended.
        divisions.extend([d + lastdiv for d in obj.divisions[1:]])
        lastdiv = obj.divisions[-1]

    dasks = [o.dask for o in objs]
    dsk = merge(dsk, *dasks)
    return new_dd_object(dsk, name, meta, divisions)
项目:dask_gdf    作者:gpuopenanalytics    | 项目源码 | 文件源码
def _get_return_type(meta):
    """Pick the wrapper class that matches the type of ``meta``.

    Series, DataFrame and Index metadata map to the classes of the same
    name; anything else maps to ``Scalar``.
    """
    if isinstance(meta, gd.Series):
        return Series

    if isinstance(meta, gd.DataFrame):
        return DataFrame

    is_index = isinstance(meta, gd.index.Index)
    return Index if is_index else Scalar