Python pandas module: Index() example source code
From open source Python projects, we extracted the following 50 code examples to illustrate how to use pandas.Index().
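Before the extracted examples, here is a minimal, self-contained sketch of the constructor itself; the values and the 'sample-id' name are purely illustrative.
import pandas as pd

# Build an index from a sequence, optionally fixing the dtype and giving it a name.
idx = pd.Index(['a', 'b', 'c'], dtype=object, name='sample-id')

# An Index labels the rows (or columns) of a DataFrame and supports set-like operations.
df = pd.DataFrame({'col1': [1, 2, 3]}, index=idx)
print(idx.name)                    # 'sample-id'
print(idx.union(pd.Index(['d'])))  # Index(['a', 'b', 'c', 'd'], dtype='object')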
def test_filter_to_numeric(self):
index = pd.Index(['a', 'b', 'c'], dtype=object)
df = pd.DataFrame({'col1': ['2', '1', '3'],
'col2': ['two', 'one', 'three']},
index=index, dtype=object)
metadata = qiime2.Metadata(df)
obs_df = metadata.filter(column_type='numeric').to_dataframe()
exp_df = pd.DataFrame({'col1': [2, 1, 3]}, dtype=int, index=index)
pdt.assert_frame_equal(obs_df, exp_df)
df = pd.DataFrame({'col1': ['2', '1', '3'],
'col2': ['2', '1', 'three'],
'col3': ['4.0', '5.2', '6.9']},
index=index, dtype=object)
metadata = qiime2.Metadata(df)
obs_df = metadata.filter(column_type='numeric').to_dataframe()
exp_df = pd.DataFrame({'col1': [2, 1, 3],
'col3': [4.0, 5.2, 6.9]}, index=index)
pdt.assert_frame_equal(obs_df, exp_df)
self.assertEqual(dict(obs_df.dtypes),
{'col1': int, 'col3': float})
def _add_field(self, field):
"""
Adds a new field to the container.
"""
# self.fields is already sorted, so we just need to insert the new
# field in the correct index.
ls = list(self.fields)
insort_left(ls, field)
self.fields = pd.Index(ls)
# unset fillable fields cache
self._ffillable_fields = None
self._realign_fields()
self.last_known_prior_values = self.last_known_prior_values.reindex(
index=self.prior_values_index,
)
return field
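The same pattern, detached from the container class, can be sketched like this; the field names are invented for the example.
import pandas as pd
from bisect import insort_left

fields = pd.Index(['close', 'open', 'volume'])  # assumed already sorted
ls = list(fields)
insort_left(ls, 'high')   # insert the new field while keeping sorted order
fields = pd.Index(ls)
print(fields)             # Index(['close', 'high', 'open', 'volume'], dtype='object')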
def test_some_duplicates_in_category(self):
columns = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (200, 1),
(200, 2), ('pet', '')],
names=['depth', 'iter'])
data = pd.DataFrame(data=[[1, 2, 3, 4, 'russ'], [5, 6, 7, 8, 'milo'],
[9, 10, 11, 12, 'russ']],
columns=columns, index=['S1', 'S2', 'S3'])
obs = _reindex_with_metadata('pet', ['pet'], data)
exp_col = pd.MultiIndex(levels=[[1, 200, 'pet'], [1, 2, '']],
labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
names=['depth', 'iter'])
exp_ind = pd.Index(['milo', 'russ'], name='pet')
exp = pd.DataFrame(data=[[5, 6, 7, 8], [5, 6, 7, 8]],
columns=exp_col, index=exp_ind)
pdt.assert_frame_equal(exp, obs[0])
exp = pd.DataFrame(data=[[1, 1, 1, 1], [2, 2, 2, 2]],
columns=exp_col, index=exp_ind)
pdt.assert_frame_equal(exp, obs[1])
def test_all_identical(self):
columns = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (200, 1),
(200, 2), ('pet', '')],
names=['depth', 'iter'])
data = pd.DataFrame(data=[[1, 2, 3, 4, 'russ'], [5, 6, 7, 8, 'russ'],
[9, 10, 11, 12, 'russ']],
columns=columns, index=['S1', 'S2', 'S3'])
obs = _reindex_with_metadata('pet', ['pet'], data)
exp_col = pd.MultiIndex(levels=[[1, 200, 'pet'], [1, 2, '']],
labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
names=['depth', 'iter'])
exp_ind = pd.Index(['russ'], name='pet')
exp = pd.DataFrame(data=[[5, 6, 7, 8]],
columns=exp_col, index=exp_ind)
pdt.assert_frame_equal(exp, obs[0])
exp = pd.DataFrame(data=[[3, 3, 3, 3]],
columns=exp_col, index=exp_ind)
pdt.assert_frame_equal(exp, obs[1])
def cross_join(df1, df2):
"""
Return a dataframe that is a cross between dataframes
df1 and df2
ref: https://github.com/pydata/pandas/issues/5401
"""
if len(df1) == 0:
return df2
if len(df2) == 0:
return df1
# Add as lists so that the new index keeps the items in
# the order that they are added together
all_columns = pd.Index(list(df1.columns) + list(df2.columns))
df1['key'] = 1
df2['key'] = 1
return pd.merge(df1, df2, on='key').loc[:, all_columns]
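A hedged usage sketch of the helper above with throwaway frames; note that, as written, cross_join also adds a 'key' column to its inputs.
import pandas as pd

df1 = pd.DataFrame({'a': [1, 2]})
df2 = pd.DataFrame({'b': ['x', 'y']})
out = cross_join(df1, df2)
print(out)  # 4 rows: every combination of 'a' and 'b'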
def _split_sample(
split: Callable[[object], bool], X: np.ndarray, y: np.ndarray
) -> Tuple[Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]:
"""
Split X, y sample set in two with a split function
:return: ((X_left, y_left), (X_right, y_right))
"""
if split.type == 'numerical':
left_indexes = X[:, split.attribute] < split.criteria
right_indexes = ~left_indexes
else:
Z = (
pd.Index(pd.unique(split.criteria))
.get_indexer(X[:, split.attribute]))
left_indexes = np.where(Z >= 0)[0]
right_indexes = np.where(Z < 0)[0]
left = X[left_indexes], y[left_indexes]
right = X[right_indexes], y[right_indexes]
return left, right
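The categorical branch relies on Index.get_indexer returning -1 for values that are absent from the index; a small standalone illustration (the arrays are made up):
import numpy as np
import pandas as pd

criteria = pd.Index(pd.unique(np.array(['cat', 'dog'])))
values = np.array(['dog', 'bird', 'cat', 'fish'])
z = criteria.get_indexer(values)
print(z)                    # [ 1 -1  0 -1]
print(np.where(z >= 0)[0])  # rows whose category is in the split criteria -> [0 2]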
def get_dividend(self, order_book_id, adjusted=True):
"""
Fetch dividend records for an instrument.
:param str order_book_id: instrument identifier
:param bool adjusted: whether to return the adjusted dividend records
:return:
"""
def fetchData(adjusted):
if adjusted:
mongo_data = self._adjusted_dividends[order_book_id].find({}, {"_id":0})
else:
mongo_data = self._original_dividends[order_book_id].find({}, {"_id":0})
return mongo_data
result = pd.DataFrame({
'book_closure_date': pd.Index(pd.Timestamp(d['book_closure_date']) for d in fetchData(adjusted)),
'ex_dividend_date': pd.Index(pd.Timestamp(d['ex_dividend_date']) for d in fetchData(adjusted)),
'payable_date': pd.Index(pd.Timestamp(d['payable_date']) for d in fetchData(adjusted)),
'dividend_cash_before_tax': [d['dividend_cash_before_tax'] for d in fetchData(adjusted)],
'round_lot': [d['round_lot'] for d in fetchData(adjusted)]
}, index = pd.Index(pd.Timestamp(d['announcement_date']) for d in fetchData(adjusted)))
return result
def get_yield_curve(self, start_date, end_date, tenor):
d1 = start_date.year * 10000 + start_date.month * 100 + start_date.day
d2 = end_date.year * 10000 + end_date.month * 100 + end_date.day
s = self._dates.searchsorted(d1)
e = self._dates.searchsorted(d2, side='right')
if e == len(self._dates):
e -= 1
if self._dates[e] == d2:
# include end_date
e += 1
if e < s:
return None
df = pd.DataFrame(self._table[s:e])
df.index = pd.Index(pd.Timestamp(str(d)) for d in df['date'])
del df['date']
df.rename(columns=lambda n: n[1:]+n[0], inplace=True)
if tenor is not None:
return df[tenor]
return df
def get_dividend(self, order_book_id, adjusted=True):
"""
Fetch dividend records for an instrument.
:param str order_book_id: instrument identifier
:param bool adjusted: whether to return the adjusted dividend records
:return:
"""
def fetchData(adjusted):
if adjusted:
mongo_data = self._adjusted_dividends[order_book_id].find({}, {"_id":0})
else:
mongo_data = self._original_dividends[order_book_id].find({}, {"_id":0})
return mongo_data
result = pd.DataFrame({
'book_closure_date': pd.Index(pd.Timestamp(d['book_closure_date']) for d in fetchData(adjusted)),
'ex_dividend_date': pd.Index(pd.Timestamp(d['ex_dividend_date']) for d in fetchData(adjusted)),
'payable_date': pd.Index(pd.Timestamp(d['payable_date']) for d in fetchData(adjusted)),
'dividend_cash_before_tax': [d['dividend_cash_before_tax'] for d in fetchData(adjusted)],
'round_lot': [d['round_lot'] for d in fetchData(adjusted)]
}, index = pd.Index(pd.Timestamp(d['announcement_date']) for d in fetchData(adjusted)))
return result
def get_yield_curve(self, start_date, end_date, tenor):
d1 = start_date.year * 10000 + start_date.month * 100 + start_date.day
d2 = end_date.year * 10000 + end_date.month * 100 + end_date.day
s = self._dates.searchsorted(d1)
e = self._dates.searchsorted(d2, side='right')
if e == len(self._dates):
e -= 1
if self._dates[e] == d2:
# include end_date
e += 1
if e < s:
return None
df = pd.DataFrame(self._table[s:e])
df.index = pd.Index(pd.Timestamp(str(d)) for d in df['date'])
del df['date']
df.rename(columns=lambda n: n[1:]+n[0], inplace=True)
if tenor is not None:
return df[tenor]
return df
def fit_behavioral_data():
"""Fit a model for all subjects. """
df = pd.read_pickle('data.pkl')
subjects = df.index.get_level_values('subject').unique()
data = np.empty((subjects.size, 10))
cues = (0, 1)
for i, subject in enumerate(subjects):
print('Fitting model for subject {}'.format(subject))
df_s = df.loc[subject]
for cue in cues:
ml = ML(df_s[df_s['cue']==cue])
r = ml.ml_estimation()
data[i,2*cue:(2*cue+2)] = r.x
data[i,2*cue+4:2*cue+6] = np.sqrt(np.diag(r.hess_inv.todense()))
data[i,cue+8] = r.fun
model = pd.DataFrame(data, pd.Index(subjects, name='subject'),
['alpha_0', 'beta_0', 'alpha_1', 'beta_1',
'se_alpha_0', 'se_beta_0', 'se_alpha_1', 'se_beta_1',
'NLL_0', 'NLL_1'])
return model
def update_table_models(self, visible=None, hidden=None):
if visible is None and hidden is None:
manager = self.Session.get_manager()
for x in list(manager.hidden_columns):
if x not in self.Session.output_object.columns:
manager.hidden_columns.remove(x)
hidden_cols = pd.Index(manager.hidden_columns)
vis_cols = [x for x in self.Session.output_object.columns
if not x in hidden_cols]
to_show = self.Session.output_object[vis_cols]
to_hide = self.Session.output_object[hidden_cols]
else:
to_show = visible
to_hide = hidden
self.table_model = classes.CoqTableModel(
to_show, session=self.Session)
self.hidden_model = classes.CoqHiddenTableModel(
to_hide, session=self.Session)
self.set_columns_widget()
self.table_model.dataChanged.connect(self.change_userdata)
def json_conversion(obj):
"""Encode additional objects to JSON."""
try:
# numpy isn't an explicit dependency of bowtie
# so we can't assume it's available
import numpy as np
if isinstance(obj, (np.ndarray, np.generic)):
return obj.tolist()
except ImportError:
pass
try:
# pandas isn't an explicit dependency of bowtie
# so we can't assume it's available
import pandas as pd
if isinstance(obj, pd.Index):
return obj.tolist()
except ImportError:
pass
if isinstance(obj, (datetime, time, date)):
return obj.isoformat()
raise TypeError('Not sure how to serialize {} of type {}'.format(obj, type(obj)))
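A hedged usage sketch: an encoder like this is typically passed as the default hook to json.dumps; the payload below is illustrative and assumes json_conversion is defined as above.
import json
from datetime import datetime, time, date
import pandas as pd

payload = {'labels': pd.Index(['a', 'b']), 'when': datetime(2020, 1, 1)}
print(json.dumps(payload, default=json_conversion))
# {"labels": ["a", "b"], "when": "2020-01-01T00:00:00"}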
def encoders(obj):
"""Convert Python object to msgpack encodable ones."""
try:
# numpy isn't an explicit dependency of bowtie
# so we can't assume it's available
import numpy as np
if isinstance(obj, (np.ndarray, np.generic)):
# https://docs.scipy.org/doc/numpy/reference/arrays.scalars.html
return obj.tolist()
except ImportError:
pass
try:
# pandas isn't an explicit dependency of bowtie
# so we can't assume it's available
import pandas as pd
if isinstance(obj, pd.Index):
return obj.tolist()
except ImportError:
pass
if isinstance(obj, (datetime, time, date)):
return obj.isoformat()
return obj
def batch_market_order(self, share_counts):
"""Place a batch market order for multiple assets.
Parameters
----------
share_counts : pd.Series[Asset -> int]
Map from asset to number of shares to order for that asset.
Returns
-------
order_ids : pd.Index[str]
Index of ids for newly-created orders.
"""
style = MarketOrder()
order_args = [
(asset, amount, style)
for (asset, amount) in iteritems(share_counts)
if amount
]
return self.blotter.batch_order(order_args)
def test_filter_to_categorical(self):
index = pd.Index(['a', 'b', 'c'], dtype=object)
df = pd.DataFrame({'col1': ['2', '1', '3'],
'col2': ['a', 'b', 'c']},
index=index, dtype=object)
metadata = qiime2.Metadata(df)
obs_df = metadata.filter(column_type='categorical').to_dataframe()
exp_df = pd.DataFrame({'col2': ['a', 'b', 'c']}, index=index)
pdt.assert_frame_equal(obs_df, exp_df)
df = pd.DataFrame({'col1': ['2', '1', '3'],
'col2': ['a', 'b', 'c'],
'col3': ['peanut', 'hotdog', 'gwar']},
index=index, dtype=object)
metadata = qiime2.Metadata(df)
obs_df = metadata.filter(column_type='categorical').to_dataframe()
exp_df = pd.DataFrame({'col2': ['a', 'b', 'c'],
'col3': ['peanut', 'hotdog', 'gwar']},
index=index)
pdt.assert_frame_equal(obs_df, exp_df)
def test_no_columns(self):
fp = pkg_resources.resource_filename(
'qiime2.tests', 'data/metadata/no-columns.tsv')
metadata = qiime2.Metadata.load(fp)
obs_df = metadata.to_dataframe()
exp_index = pd.Index(['a', 'b', 'id'], name='my-index', dtype=object)
exp_df = pd.DataFrame({}, index=exp_index, dtype=object)
self.assertFalse(obs_df.index.empty)
self.assertTrue(obs_df.columns.empty)
pdt.assert_frame_equal(
obs_df, exp_df, check_dtype=True, check_index_type=True,
check_column_type=True, check_frame_type=True, check_names=True,
check_exact=True)
def test_index_and_column_names(self):
md1 = qiime2.Metadata(pd.DataFrame(
{'a': [1, 2]},
index=pd.Index(['id1', 'id2'], name='foo'),
columns=pd.Index(['a'], name='abc')))
md2 = qiime2.Metadata(pd.DataFrame(
{'b': [3, 4]},
index=pd.Index(['id1', 'id2'], name='bar'),
columns=pd.Index(['b'], name='def')))
obs = md1.merge(md2)
exp = qiime2.Metadata(pd.DataFrame(
{'a': [1, 2], 'b': [3, 4]}, index=['id1', 'id2']))
self.assertEqual(obs, exp)
self.assertIsNone(obs._dataframe.index.name)
self.assertIsNone(obs._dataframe.columns.name)
def test_more_complex_expressions(self):
df = pd.DataFrame({'Subject': ['subject-1', 'subject-1', 'subject-2'],
'SampleType': ['gut', 'tongue', 'gut']},
index=pd.Index(['S1', 'S2', 'S3'], name='id'))
metadata = qiime2.Metadata(df)
where = "Subject='subject-1' OR Subject='subject-2'"
actual = metadata.ids(where)
expected = {'S1', 'S2', 'S3'}
self.assertEqual(actual, expected)
where = "Subject='subject-1' AND Subject='subject-2'"
actual = metadata.ids(where)
expected = set()
self.assertEqual(actual, expected)
where = "Subject='subject-1' AND SampleType='gut'"
actual = metadata.ids(where)
expected = {'S1'}
self.assertEqual(actual, expected)
def testMultipleCalculationsRelativeTo(self):
data = pd.DataFrame({"X": (1, 2, 3, 10, 20, 30, 100, 200, 300),
"Y": (0, 1, 2, 3, 4, 5, 6, 7, 8),
"Experiment": ("Control", "Control", "Control", "Exp1",
"Exp1", "Exp1", "Exp2", "Exp2",
"Exp2")})
comparison = comparisons.AbsoluteDifference("Experiment", "Control")
output = core.Analyze(data).relative_to(comparison).calculate(
(metrics.Sum("X"), metrics.Sum("Y"))).run()
correct = pd.DataFrame(
{"sum(X) Absolute Difference": (60 - 6, 600 - 6),
"sum(Y) Absolute Difference": (12 - 3, 21 - 3)},
index=pd.Index(
("Exp1", "Exp2"), name="Experiment"))
self.assertTrue(output.equals(correct))
def testRelativeToJackknife(self):
data = pd.DataFrame({"X": [1, 2, 3, 4, 5, 6, 7, 8, 9],
"Y": [0, 0, 0, 1, 1, 1, 2, 2, 2]})
metric = metrics.Sum("X")
comparison = comparisons.AbsoluteDifference("Y", 0)
se_method = standard_errors.Jackknife()
output = core.Analyze(data).relative_to(comparison).with_standard_errors(
se_method).calculate(metric).run()
rowindex = pd.Index([1, 2], name="Y")
correct = pd.DataFrame(
np.array([[9.0, np.sqrt(5 * np.var([12, 11, 10, 5, 4, 3]))],
[18.0, np.sqrt(5 * np.var([21, 20, 19, 11, 10, 9]))]]),
columns=("sum(X) Absolute Difference",
"sum(X) Absolute Difference Jackknife SE"),
index=rowindex)
self.assertTrue(output.equals(correct))
def testRelativeToJackknifeIncludeBaseline(self):
data = pd.DataFrame({"X": [1, 2, 3, 4, 5, 6, 7, 8, 9],
"Y": [0, 0, 0, 1, 1, 1, 2, 2, 2]})
metric = metrics.Sum("X")
comparison = comparisons.AbsoluteDifference("Y", 0, include_base=True)
se_method = standard_errors.Jackknife()
output = core.Analyze(data).relative_to(comparison).with_standard_errors(
se_method).calculate(metric).run()
rowindex = pd.Index([0, 1, 2], name="Y")
correct = pd.DataFrame(
np.array([[0.0, 0.0],
[9.0, np.sqrt(5 * np.var([12, 11, 10, 5, 4, 3]))],
[18.0, np.sqrt(5 * np.var([21, 20, 19, 11, 10, 9]))]]),
columns=("sum(X) Absolute Difference",
"sum(X) Absolute Difference Jackknife SE"),
index=rowindex)
self.assertTrue(output.equals(correct))
def testRelativeToJackknifeSingleComparisonBaselineFirst(self):
data = pd.DataFrame({"X": [1, 2, 3, 4, 5, 6], "Y": [0, 0, 0, 1, 1, 1]})
metric = metrics.Sum("X")
comparison = comparisons.AbsoluteDifference("Y", 0)
se_method = standard_errors.Jackknife()
output = core.Analyze(data).relative_to(comparison).with_standard_errors(
se_method).calculate(metric).run()
rowindex = pd.Index([1], name="Y")
correct = pd.DataFrame(
np.array([[9.0, np.sqrt(5 * np.var([12, 11, 10, 5, 4, 3]))]]),
columns=("sum(X) Absolute Difference",
"sum(X) Absolute Difference Jackknife SE"),
index=rowindex)
self.assertTrue(output.equals(correct))
def testRelativeToJackknifeSingleComparisonBaselineSecond(self):
data = pd.DataFrame({"X": [1, 2, 3, 4, 5, 6], "Y": [0, 0, 0, 1, 1, 1]})
metric = metrics.Sum("X")
comparison = comparisons.AbsoluteDifference("Y", 1)
se_method = standard_errors.Jackknife()
output = core.Analyze(data).relative_to(comparison).with_standard_errors(
se_method).calculate(metric).run()
rowindex = pd.Index([0], name="Y")
correct = pd.DataFrame(
np.array([[-9.0, np.sqrt(5 * np.var([12, 11, 10, 5, 4, 3]))]]),
columns=("sum(X) Absolute Difference",
"sum(X) Absolute Difference Jackknife SE"),
index=rowindex)
self.assertTrue(output.equals(correct))
def testSplitJackknife(self):
data = pd.DataFrame({"X": np.array([range(11) + [5] * 10]).flatten(),
"Y": np.array([[0] * 11 + [1] * 10]).flatten()})
metric = metrics.Sum("X")
se_method = standard_errors.Jackknife()
output = core.Analyze(data).split_by("Y").with_standard_errors(
se_method).calculate(metric).run()
rowindex = pd.Index([0, 1], name="Y")
correct = pd.DataFrame(
np.array([[55.0, 10.0], [50.0, 0.0]]),
columns=("sum(X)", "sum(X) Jackknife SE"),
index=rowindex)
self.assertTrue(output.equals(correct))
def test_storage_restore_schema_with_primary_key():
data = [
('a',),
('b',),
]
index = pd.Index([1, 2], name='key')
df = pd.DataFrame(data, columns=('value',), index=index)
storage = Storage(dataframes={'data': df})
assert list(storage.read('data')) == [[1, 'a'], [2, 'b']]
assert storage.describe('data') == {
'primaryKey': 'key',
'fields': [
{'name': 'key', 'type': 'integer', 'constraints': {'required': True}},
{'name': 'value', 'type': 'string'},
]
}
def test_dataframe_to_tsv_taxonomy_format(self):
index = pd.Index(['seq1', 'seq2'], name='Feature ID', dtype=object)
columns = ['Taxon', 'Foo', 'Bar']
df = pd.DataFrame([['taxon1', '42', 'foo'], ['taxon2', '43', 'bar']],
index=index, columns=columns, dtype=object)
exp = (
'Feature ID\tTaxon\tFoo\tBar\n'
'seq1\ttaxon1\t42\tfoo\n'
'seq2\ttaxon2\t43\tbar\n'
)
transformer = self.get_transformer(pd.DataFrame, TSVTaxonomyFormat)
obs = transformer(df)
with obs.open() as fh:
self.assertEqual(fh.read(), exp)
def test_series_to_tsv_taxonomy_format(self):
index = pd.Index(['emrakul', 'peanut'], name='Feature ID',
dtype=object)
series = pd.Series(['taxon1', 'taxon2'],
index=index, name='Taxon', dtype=object)
exp = (
'Feature ID\tTaxon\n'
'emrakul\ttaxon1\n'
'peanut\ttaxon2\n'
)
transformer = self.get_transformer(pd.Series, TSVTaxonomyFormat)
obs = transformer(series)
with obs.open() as fh:
self.assertEqual(fh.read(), exp)
def test_tsv_taxonomy_format_to_metadata(self):
_, obs = self.transform_format(TSVTaxonomyFormat, qiime2.Metadata,
os.path.join('taxonomy',
'3-column.tsv'))
index = pd.Index(['seq1', 'seq2'], name='Feature ID', dtype=object)
exp_df = pd.DataFrame([['k__Foo; p__Bar', '-1.0'],
['k__Foo; p__Baz', '-42.0']], index=index,
columns=['Taxon', 'Confidence'], dtype=object)
exp = qiime2.Metadata(exp_df)
self.assertEqual(exp, obs)
# In-depth testing of the `_taxonomy_formats_to_dataframe` helper function,
# which does the heavy lifting for the transformers.
def test_3_columns(self):
index = pd.Index(['seq1', 'seq2'], name='Feature ID', dtype=object)
exp = pd.DataFrame([['k__Foo; p__Bar', '-1.0'],
['k__Foo; p__Baz', '-42.0']], index=index,
columns=['Taxon', 'Confidence'], dtype=object)
# has_header=None (default)
obs = _taxonomy_formats_to_dataframe(
self.get_data_path(os.path.join('taxonomy', '3-column.tsv')))
assert_frame_equal(obs, exp)
# has_header=True
obs = _taxonomy_formats_to_dataframe(
self.get_data_path(os.path.join('taxonomy', '3-column.tsv')),
has_header=True)
assert_frame_equal(obs, exp)
def test_valid_but_messy_file(self):
index = pd.Index(
['SEQUENCE1', 'seq2'], name='Feature ID', dtype=object)
exp = pd.DataFrame([['k__Bar; p__Baz', 'foo'],
['some; taxonomy; for; ya', 'bar baz']],
index=index, columns=['Taxon', 'Extra Column'],
dtype=object)
# has_header=None (default)
obs = _taxonomy_formats_to_dataframe(
self.get_data_path(os.path.join('taxonomy',
'valid-but-messy.tsv')))
assert_frame_equal(obs, exp)
# has_header=True
obs = _taxonomy_formats_to_dataframe(
self.get_data_path(os.path.join('taxonomy',
'valid-but-messy.tsv')),
has_header=True)
assert_frame_equal(obs, exp)
def test_headerless(self):
index = pd.Index(['seq1', 'seq2'], name='Feature ID', dtype=object)
columns = ['Taxon', 'Unnamed Column 1', 'Unnamed Column 2']
exp = pd.DataFrame([['k__Foo; p__Bar', 'some', 'another'],
['k__Foo; p__Baz', 'column', 'column!']],
index=index, columns=columns, dtype=object)
# has_header=None (default)
obs = _taxonomy_formats_to_dataframe(
self.get_data_path(os.path.join('taxonomy',
'headerless.tsv')))
assert_frame_equal(obs, exp)
# has_header=False
obs = _taxonomy_formats_to_dataframe(
self.get_data_path(os.path.join('taxonomy',
'headerless.tsv')),
has_header=False)
assert_frame_equal(obs, exp)
# In-depth testing of the `_dataframe_to_tsv_taxonomy_format` helper function,
# which does the heavy lifting for the transformers.
def find_missing_products():
train = pd.read_csv('/Users/srinath/playground/data-science/BimboInventoryDemand/train.csv')
train_ids = train['Producto_ID'].unique()
test = pd.read_csv('/Users/srinath/playground/data-science/BimboInventoryDemand/test.csv')
test_ids = test['Producto_ID'].unique()
missing_ids = pd.Index(test_ids).difference(pd.Index(train_ids))
print "missing ID count ", len(missing_ids)
missing_ids_df = pd.DataFrame(missing_ids, columns=["Producto_ID"])
missing_ids_df.to_csv('missing_ids.csv', index=False)
entries_with_missing = pd.merge(test, missing_ids_df, on='Producto_ID')
print "Mising entries=", entries_with_missing.shape[0], "percentage=", entries_with_missing.shape[0]*100/test.shape[0]
print "full entries count", test.shape[0]
def at_time(self, time, asof=False):
"""
Select values at particular time of day (e.g. 9:30AM).
Parameters
----------
time : datetime.time or string
Returns
-------
values_at_time : type of caller
"""
try:
indexer = self.index.indexer_at_time(time, asof=asof)
return self.take(indexer, convert=False)
except AttributeError:
raise TypeError('Index must be DatetimeIndex')
def between_time(self, start_time, end_time, include_start=True,
include_end=True):
"""
Select values between particular times of the day (e.g., 9:00-9:30 AM).
Parameters
----------
start_time : datetime.time or string
end_time : datetime.time or string
include_start : boolean, default True
include_end : boolean, default True
Returns
-------
values_between_time : type of caller
"""
try:
indexer = self.index.indexer_between_time(
start_time, end_time, include_start=include_start,
include_end=include_end)
return self.take(indexer, convert=False)
except AttributeError:
raise TypeError('Index must be DatetimeIndex')
def _isnull_old(obj):
"""Detect missing values. Treat None, NaN, INF, -INF as null.
Parameters
----------
arr: ndarray or object value
Returns
-------
boolean ndarray or boolean
"""
if lib.isscalar(obj):
return lib.checknull_old(obj)
# hack (for now) because MI registers as ndarray
elif isinstance(obj, pd.MultiIndex):
raise NotImplementedError("isnull is not defined for MultiIndex")
elif isinstance(obj, (ABCSeries, np.ndarray, pd.Index)):
return _isnull_ndarraylike_old(obj)
elif isinstance(obj, ABCGeneric):
return obj._constructor(obj._data.isnull(func=_isnull_old))
elif isinstance(obj, list) or hasattr(obj, '__array__'):
return _isnull_ndarraylike_old(np.asarray(obj))
else:
return obj is None
def test_period_resample_with_local_timezone_pytz(self):
# GH5430
tm._skip_if_no_pytz()
import pytz
local_timezone = pytz.timezone('America/Los_Angeles')
start = datetime(year=2013, month=11, day=1, hour=0, minute=0,
tzinfo=pytz.utc)
# 1 day later
end = datetime(year=2013, month=11, day=2, hour=0, minute=0,
tzinfo=pytz.utc)
index = pd.date_range(start, end, freq='H')
series = pd.Series(1, index=index)
series = series.tz_convert(local_timezone)
result = series.resample('D', kind='period').mean()
# Create the expected series
# Index is moved back a day with the timezone conversion from UTC to
# Pacific
expected_index = (pd.period_range(start=start, end=end, freq='D') - 1)
expected = pd.Series(1, index=expected_index)
assert_series_equal(result, expected)
def test_period_resample_with_local_timezone_dateutil(self):
# GH5430
tm._skip_if_no_dateutil()
import dateutil
local_timezone = 'dateutil/America/Los_Angeles'
start = datetime(year=2013, month=11, day=1, hour=0, minute=0,
tzinfo=dateutil.tz.tzutc())
# 1 day later
end = datetime(year=2013, month=11, day=2, hour=0, minute=0,
tzinfo=dateutil.tz.tzutc())
index = pd.date_range(start, end, freq='H')
series = pd.Series(1, index=index)
series = series.tz_convert(local_timezone)
result = series.resample('D', kind='period').mean()
# Create the expected series
# Index is moved back a day with the timezone conversion from UTC to
# Pacific
expected_index = (pd.period_range(start=start, end=end, freq='D') - 1)
expected = pd.Series(1, index=expected_index)
assert_series_equal(result, expected)
def test_dayfirst(self):
# GH 5917
arr = ['10/02/2014', '11/02/2014', '12/02/2014']
expected = DatetimeIndex([datetime(2014, 2, 10), datetime(2014, 2, 11),
datetime(2014, 2, 12)])
idx1 = DatetimeIndex(arr, dayfirst=True)
idx2 = DatetimeIndex(np.array(arr), dayfirst=True)
idx3 = to_datetime(arr, dayfirst=True)
idx4 = to_datetime(np.array(arr), dayfirst=True)
idx5 = DatetimeIndex(Index(arr), dayfirst=True)
idx6 = DatetimeIndex(Series(arr), dayfirst=True)
self.assertTrue(expected.equals(idx1))
self.assertTrue(expected.equals(idx2))
self.assertTrue(expected.equals(idx3))
self.assertTrue(expected.equals(idx4))
self.assertTrue(expected.equals(idx5))
self.assertTrue(expected.equals(idx6))
def test_to_datetime_format(self):
values = ['1/1/2000', '1/2/2000', '1/3/2000']
results1 = [Timestamp('20000101'), Timestamp('20000201'),
Timestamp('20000301')]
results2 = [Timestamp('20000101'), Timestamp('20000102'),
Timestamp('20000103')]
for vals, expecteds in [(values, (Index(results1), Index(results2))),
(Series(values),
(Series(results1), Series(results2))),
(values[0], (results1[0], results2[0])),
(values[1], (results1[1], results2[1])),
(values[2], (results1[2], results2[2]))]:
for i, fmt in enumerate(['%d/%m/%Y', '%m/%d/%Y']):
result = to_datetime(vals, format=fmt)
expected = expecteds[i]
if isinstance(expected, Series):
assert_series_equal(result, Series(expected))
elif isinstance(expected, Timestamp):
self.assertEqual(result, expected)
else:
self.assertTrue(result.equals(expected))
def test_asobject_tolist(self):
idx = timedelta_range(start='1 days', periods=4, freq='D', name='idx')
expected_list = [Timedelta('1 days'), Timedelta('2 days'),
Timedelta('3 days'), Timedelta('4 days')]
expected = pd.Index(expected_list, dtype=object, name='idx')
result = idx.asobject
self.assertTrue(isinstance(result, Index))
self.assertEqual(result.dtype, object)
self.assertTrue(result.equals(expected))
self.assertEqual(result.name, expected.name)
self.assertEqual(idx.tolist(), expected_list)
idx = TimedeltaIndex([timedelta(days=1), timedelta(days=2), pd.NaT,
timedelta(days=4)], name='idx')
expected_list = [Timedelta('1 days'), Timedelta('2 days'), pd.NaT,
Timedelta('4 days')]
expected = pd.Index(expected_list, dtype=object, name='idx')
result = idx.asobject
self.assertTrue(isinstance(result, Index))
self.assertEqual(result.dtype, object)
self.assertTrue(result.equals(expected))
self.assertEqual(result.name, expected.name)
self.assertEqual(idx.tolist(), expected_list)
def prior_values_index(self):
index_values = list(
product(
(freq.freq_str for freq in self.unique_frequencies),
# Only store prior values for forward-fillable fields.
self.ffillable_fields,
)
)
if index_values:
return pd.MultiIndex.from_tuples(index_values)
else:
# MultiIndex doesn't gracefully support empty input, so we return
# an empty regular Index if we have no values.
return pd.Index(index_values)
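A quick check of why the fallback exists; in the pandas versions this code targets, building a MultiIndex from an empty list raises, so a flat empty Index is used instead (sketch only):
import pandas as pd

index_values = []
try:
    idx = pd.MultiIndex.from_tuples(index_values)
except (TypeError, ValueError):
    idx = pd.Index(index_values)
print(idx)  # Index([], dtype='object')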
def add_sids(self, to_add):
"""
Add new sids to the container.
"""
self.sids = pd.Index(
sorted(self.sids.union(_ensure_index(to_add))),
)
self._realign_sids()
def drop_sids(self, to_drop):
"""
Remove sids from the container.
"""
self.sids = pd.Index(
sorted(self.sids.difference(_ensure_index(to_drop))),
)
self._realign_sids()
def _ensure_index(x):
if not isinstance(x, pd.Index):
x = pd.Index(sorted(x))
return x
def test_df_of_assets_as_input(self):
algo = TestRegisterTransformAlgorithm(
sim_params=self.sim_params,
env=TradingEnvironment(), # new env without assets
)
df = self.df.copy()
df.columns = pd.Index(map(Equity, df.columns))
algo.run(df)
assert isinstance(algo.sources[0], DataFrameSource)
def index(self):
"""Return dask Index instance"""
name = self._name + '-index'
dsk = {(name, i): (getattr, key, 'index')
for i, key in enumerate(self._keys())}
return Index(merge(dsk, self.dask), name,
self._meta.index, self.divisions)
def _daskify(obj, npartitions=None, chunksize=None):
"""Convert input to a dask-gdf object.
"""
npartitions = npartitions or 1
if isinstance(obj, _Frame):
return obj
elif isinstance(obj, (pd.DataFrame, pd.Series, pd.Index)):
return _daskify(dd.from_pandas(obj, npartitions=npartitions))
elif isinstance(obj, (gd.DataFrame, gd.Series, gd.index.Index)):
return from_pygdf(obj, npartitions=npartitions)
elif isinstance(obj, (dd.DataFrame, dd.Series, dd.Index)):
return from_dask_dataframe(obj)
else:
raise TypeError("type {} is not supported".format(type(obj)))
def concat(objs):
"""Concantenate dask gdf objects
Parameters
----------
objs : sequence of DataFrame, Series, Index
A sequence of objects to be concatenated.
"""
objs = [_daskify(x) for x in objs]
meta = gd.concat(_extract_meta(objs))
name = "concat-" + uuid4().hex
dsk = {}
divisions = [0]
base = 0
lastdiv = 0
for obj in objs:
for k, i in obj._keys():
dsk[name, base + i] = k, i
base += obj.npartitions
divisions.extend([d + lastdiv for d in obj.divisions[1:]])
lastdiv = obj.divisions[-1]
dasks = [o.dask for o in objs]
dsk = merge(dsk, *dasks)
return new_dd_object(dsk, name, meta, divisions)