The following 50 code examples, extracted from open-source Python projects, illustrate how pandas.DataFrame() is used in practice.
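For orientation before the project excerpts, here is a minimal standalone sketch (not drawn from any of the projects below) showing the two most common ways to call the constructor: from a dict of equal-length columns and from a 2-D NumPy array.

import numpy as np
import pandas as pd

# Build a DataFrame from a dict of equal-length columns.
df_from_dict = pd.DataFrame({'a': [1, 2, 3], 'b': [0.1, 0.2, 0.3]})

# Build a DataFrame from a 2-D array, supplying row and column labels.
df_from_array = pd.DataFrame(np.arange(6).reshape(3, 2),
                             index=['r1', 'r2', 'r3'],
                             columns=['x', 'y'])

print(df_from_dict)
print(df_from_array)

The excerpts that follow apply variations of these patterns (dict of columns, 2-D array, list of records, or a scalar broadcast over an index) inside larger projects.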
def aggregate_ohlcv_panel(self, fields, ohlcv_panel, items=None, minor_axis=None):
    """
    Convert an OHLCV Panel into a DataFrame by aggregating each field's
    frame into a Series.
    """
    vals = ohlcv_panel
    if isinstance(ohlcv_panel, pd.Panel):
        vals = ohlcv_panel.values
        items = ohlcv_panel.items
        minor_axis = ohlcv_panel.minor_axis

    data = [
        self.frame_to_series(
            field,
            vals[items.get_loc(field)],
            minor_axis
        )
        for field in fields
    ]
    return np.array(data)
def y_sum_by_time(x_arr, y_arr, top=None):
    df = pd.DataFrame({'Timestamp': pd.to_datetime(x_arr, unit='s'), 'Status': y_arr})
    df['Date'] = df['Timestamp'].apply(lambda x: "%d/%d/%d" % (x.day, x.month, x.year))
    df['Hour'] = df['Timestamp'].apply(lambda x: "%d" % (x.hour))
    df['Weekday'] = df['Timestamp'].apply(lambda x: "%s" % (x.weekday_name))

    times = ['Hour', 'Weekday', 'Date']
    result = {}
    for groupby in times:
        df_group = df.groupby(groupby, as_index=False).agg({'Status': np.sum})
        if top != None and top > 0:
            #df_group = df_group.nlargest(top, 'Status').sort(['Status', 'Hour'],ascending=False)
            idx = df_group.nlargest(top, 'Status') > 0
        else:
            idx = df_group['Status'].max() == df_group['Status']

        result[groupby] = {k: g['Status'].replace(np.nan, 'None').tolist()
                           for k, g in df_group[idx].groupby(groupby)}
    return result
def get_citation_df(args, text):
    """
    Generate citation_df and save it to 'citations.tsv'.
    """
    citation_df = pandas.DataFrame(
        {'string': get_citation_strings(text)}
    )
    if args.citation_tags_path.is_file():
        tag_df = pandas.read_table(args.citation_tags_path)
        tag_df['string'] = '@tag:' + tag_df.tag
        for citation in tag_df.citation:
            is_valid_citation_string('@' + citation)
        citation_df = citation_df.merge(tag_df[['string', 'citation']], how='left')
    else:
        citation_df['citation'] = None
        logging.info(f'missing {args.citation_tags_path} file: no citation tags set')
    citation_df.citation.fillna(citation_df.string.astype(str).str.lstrip('@'), inplace=True)
    citation_df['standard_citation'] = citation_df.citation.map(standardize_citation)
    citation_df['citation_id'] = citation_df.standard_citation.map(get_citation_id)
    citation_df = citation_df.sort_values(['standard_citation', 'citation'])
    citation_df.to_csv(args.citations_path, sep='\t', index=False)
    check_collisions(citation_df)
    check_multiple_citation_strings(citation_df)
    return citation_df
def do_work_pso(item, nclusters, data, LVcsv, Mcsv, scheme, reg, h, maximo, population):
    output = pd.DataFrame(population[item].position)
    output.columns = ['Split']
    dataSplit = pd.concat([data, output], axis=1)

    f1 = []
    results = []
    for i in range(nclusters):
        dataSplited = (dataSplit.loc[dataSplit['Split'] == i]).drop('Split', axis=1)
        dataSplited.index = range(len(dataSplited))

        try:
            results.append(PyLSpm(dataSplited, LVcsv, Mcsv, scheme, reg, 0, 50, HOC='true'))
            resid = results[i].residuals()[3]
            f1.append(resid)
        except:
            f1.append(10000)

    # print((1 / np.sum(f1)))
    return (1 / np.sum(f1))
def do_work_ga(item, nclusters, data, LVcsv, Mcsv, scheme, reg, h, maximo, population):
    output = pd.DataFrame(population[item].genes)
    output.columns = ['Split']
    dataSplit = pd.concat([data, output], axis=1)

    f1 = []
    results = []
    for i in range(nclusters):
        dataSplited = (dataSplit.loc[dataSplit['Split'] == i]).drop('Split', axis=1)
        dataSplited.index = range(len(dataSplited))

        try:
            results.append(PyLSpm(dataSplited, LVcsv, Mcsv, scheme, reg, 0, 50, HOC='true'))
            resid = results[i].residuals()[3]
            f1.append(resid)
        except:
            f1.append(10000)

    return (1 / np.sum(f1))

# Main
def rhoA(self):
    # rhoA
    rhoA = pd.DataFrame(0, index=np.arange(1), columns=self.latent)

    for i in range(self.lenlatent):
        weights = pd.DataFrame(self.outer_weights[self.latent[i]])
        weights = weights[(weights.T != 0).any()]
        result = pd.DataFrame.dot(weights.T, weights)
        result_ = pd.DataFrame.dot(weights, weights.T)

        S = self.data_[self.Variables['measurement'][
            self.Variables['latent'] == self.latent[i]]]
        S = pd.DataFrame.dot(S.T, S) / S.shape[0]
        numerador = (
            np.dot(np.dot(weights.T, (S - np.diag(np.diag(S)))), weights))
        denominador = (
            (np.dot(np.dot(weights.T, (result_ - np.diag(np.diag(result_)))), weights)))
        rhoA_ = ((result)**2) * (numerador / denominador)

        if(np.isnan(rhoA_.values)):
            rhoA[self.latent[i]] = 1
        else:
            rhoA[self.latent[i]] = rhoA_.values

    return rhoA.T
def xloads(self):
    # Xloadings
    A = self.data_.transpose().values
    B = self.fscores.transpose().values
    A_mA = A - A.mean(1)[:, None]
    B_mB = B - B.mean(1)[:, None]

    ssA = (A_mA**2).sum(1)
    ssB = (B_mB**2).sum(1)

    xloads_ = (np.dot(A_mA, B_mB.T) /
               np.sqrt(np.dot(ssA[:, None], ssB[None])))
    xloads = pd.DataFrame(
        xloads_, index=self.manifests, columns=self.latent)

    return xloads
def alpha(self):
    # Cronbach Alpha
    alpha = pd.DataFrame(0, index=np.arange(1), columns=self.latent)

    for i in range(self.lenlatent):
        block = self.data_[self.Variables['measurement']
                           [self.Variables['latent'] == self.latent[i]]]
        p = len(block.columns)

        if(p != 1):
            p_ = len(block)
            correction = np.sqrt((p_ - 1) / p_)
            soma = np.var(np.sum(block, axis=1))
            cor_ = pd.DataFrame.corr(block)

            denominador = soma * correction**2
            numerador = 2 * np.sum(np.tril(cor_) - np.diag(np.diag(cor_)))

            alpha_ = (numerador / denominador) * (p / (p - 1))
            alpha[self.latent[i]] = alpha_
        else:
            alpha[self.latent[i]] = 1

    return alpha.T
def do_work_pso(data, LVcsv, Mcsv, scheme, reg, h, maximo):
    output = pd.DataFrame(population[item].position)
    output.columns = ['Split']
    dataSplit = pd.concat([data, output], axis=1)

    f1 = []
    results = []
    for i in range(nclusters):
        dataSplited = (dataSplit.loc[dataSplit['Split'] == i]).drop('Split', axis=1)
        dataSplited.index = range(len(dataSplited))

        try:
            results.append(PyLSpm(dataSplited, LVcsv, Mcsv, scheme, reg, 0, 50, HOC='true'))
            resid = results[i].residuals()[3]
            f1.append(resid)
        except:
            f1.append(10000)

    print((1 / np.sum(f1)))
    return (1 / np.sum(f1))
def do_work_ga(self, item):
    output = pd.DataFrame(self.population[item].genes)
    output.columns = ['Split']
    dataSplit = pd.concat([self.data, output], axis=1)

    f1 = []
    results = []
    for i in range(self.nclusters):
        dataSplited = (dataSplit.loc[dataSplit['Split'] == i]).drop('Split', axis=1)
        dataSplited.index = range(len(dataSplited))

        try:
            results.append(PyLSpm(dataSplited, self.LVcsv, self.Mcsv, self.scheme,
                                  self.reg, 0, 50, HOC='true'))
            resid = results[i].residuals()[3]
            f1.append(resid)
        except:
            f1.append(10000)

    print((1 / np.sum(f1)))
    return (1 / np.sum(f1))
def do_work_pso(self, item):
    output = pd.DataFrame(self.population[item].position)
    output.columns = ['Split']
    dataSplit = pd.concat([self.data, output], axis=1)

    f1 = []
    results = []
    for i in range(self.nclusters):
        dataSplited = (dataSplit.loc[dataSplit['Split'] == i]).drop('Split', axis=1)
        dataSplited.index = range(len(dataSplited))

        try:
            results.append(PyLSpm(dataSplited, self.LVcsv, self.Mcsv, self.scheme,
                                  self.reg, 0, 50, HOC='true'))
            resid = results[i].residuals()[3]
            f1.append(resid)
        except:
            f1.append(10000)

    print((1 / np.sum(f1)))
    return (1 / np.sum(f1))
def do_work_tabu(self, item):
    output = pd.DataFrame(self.population[item])
    output.columns = ['Split']
    dataSplit = pd.concat([self.data, output], axis=1)

    f1 = []
    results = []
    for i in range(self.nclusters):
        dataSplited = (dataSplit.loc[dataSplit['Split'] == i]).drop('Split', axis=1)
        dataSplited.index = range(len(dataSplited))

        try:
            results.append(PyLSpm(dataSplited, self.LVcsv, self.Mcsv, self.scheme,
                                  self.reg, 0, 50, HOC='true'))
            resid = results[i].residuals()[3]
            f1.append(resid)
        except:
            f1.append(10000)

    cost = (np.sum(f1))
    print(1 / cost)
    return [self.population[item], cost]
def apply(path):
    data = metadata.load(path)
    for service in data["services"]:
        filename = os.path.join(path, service["filename"])
        df = load_timeseries(filename, service)
        print(service)
        df2 = interpolate_missing(df[service["fields"]])
        classes = classify_series(df2)
        preprocessed_series = {}
        for k in classes["other_fields"]:
            # short by one value, because we have to short the other one!
            preprocessed_series[k] = df2[k][1:]
        for k in classes["monotonic_fields"]:
            preprocessed_series[k + "-diff"] = df2[k].diff()[1:]
        newname = service["name"] + "-preprocessed.tsv.gz"
        df3 = pd.DataFrame(preprocessed_series)
        df3.to_csv(os.path.join(path, newname), sep="\t", compression='gzip')
        service["preprocessed_filename"] = newname
        service["preprocessed_fields"] = list(df3.columns)
        service.update(classes)
    metadata.save(path, data)
def centroids(path):
    metadata = load_metadata(path)
    d = {}
    for srv in metadata["services"]:
        name = "%s/%s-cluster-1_1.tsv" % (path, srv["name"])
        df = pd.read_csv(name, sep="\t", index_col='time', parse_dates=True)
        d[srv["name"]] = df.centroid
    df2 = pd.DataFrame(d)
    df2 = df2.fillna(method="bfill", limit=1e9)
    df2 = df2.fillna(method="ffill", limit=1e9)
    fig = df2.plot()
    handles, labels = fig.get_legend_handles_labels()
    fig.grid('on')
    lgd = fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, -0.1))
    plt.savefig("graph.png", bbox_extra_artists=(lgd,), bbox_inches='tight')
    plt.close("all")
def test_pd_outer_join():
    dfs = [
        pd.DataFrame({
            'id': [0, 1, 2, 3],
            'a': ['foo', 'bar', 'baz', np.nan],
            'b': ['panda', 'zebra', np.nan, np.nan],
        }),
        pd.DataFrame({
            'id': [1, 2, 3, 4],
            'b': ['mouse', np.nan, 'tiger', 'egret'],
            'c': ['toe', 'finger', 'nose', np.nan],
        }),
    ]
    expected = pd.DataFrame({
        'id': [0, 1, 2, 3, 4],
        'a': ['foo', 'bar', 'baz', np.nan, np.nan],
        'b': ['panda', 'zebra', np.nan, 'tiger', 'egret'],
        'c': [np.nan, 'toe', 'finger', 'nose', np.nan],
    }).set_index('id')
    actual = pd_outer_join(dfs, on='id')
    print(expected)
    print(actual)
    assert expected.equals(actual)
def read_image(imagery_path):
    # Read image
    dataset = gdal.Open(imagery_path)
    dsmatrix = dataset.ReadAsArray(xoff=0, yoff=0, xsize=dataset.RasterXSize, ysize=dataset.RasterYSize)
    # Get Geographic meta data
    geo_trans_list = dataset.GetGeoTransform()
    proj_str = dataset.GetProjection()
    num_bands = dataset.RasterCount

    # Adapt to one bands or multi-bands
    if num_bands > 1:
        # Unfold array into pandas DataFrame
        rows = dsmatrix.shape[1]
        cols = dsmatrix.shape[2]
        data_array = dsmatrix[:, 0, :]
        for irow in range(1, rows):
            tempmatirx = dsmatrix[:, irow, :]
            data_array = np.hstack((data_array, tempmatirx))
    else:
        # Unfold array into pandas DataFrame
        rows = dsmatrix.shape[0]
        cols = dsmatrix.shape[1]
        data_array = dsmatrix[0, :]
        for irow in range(1, rows):
            tempmatirx = dsmatrix[irow, :]
            data_array = np.hstack((data_array, tempmatirx))

    data_frame = pd.DataFrame(data_array.T)

    return data_frame, rows, cols, geo_trans_list, proj_str, num_bands
def __init__(self, *args, **kwargs):
    '''
    Takes the same arguments as pandas.DataFrame
    https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html

    The data argument should be a list of XSeries objects or a dict of XSeries objects.
    If a dict is passed, keys must be strings and indicate the column names.

    For example, to create an XDataFrame, data should look like
        data = {'col_1': s_1, 'col_2': s_2, ..., 'col_n': s_n}
    where each s_i is an XSeries
    '''
    data = kwargs.get('data')
    if data is None:
        data = args[0]

    data_to_check = []

    if isinstance(data, list):
        data_to_check = data
    elif isinstance(data, dict):
        data_to_check = data.values()

    for d in data_to_check:
        if not isinstance(d, XSeries):
            raise ValueError('All data must be XSeries instances')

    super(XDataFrame, self).__init__(*args, **kwargs)
def to_pandas_dataframe(self):
    '''
    Convert self to pandas.DataFrame if all columns are primitive types.
    See more at XSeries.to_pandas_series
    :return:
    '''
    data_types = self.get_data_types()

    is_all_columns_are_primitive = all(
        _is_class_a_primitive(dt)
        for dt in data_types
    )
    if is_all_columns_are_primitive:
        self.__class__ = pd.DataFrame
    else:
        raise ValueError('Unable to cast to pd.DataFrame. {} is not all primitives.'.format(self.data_types))

    return self
def _create_daily_stats(self, perfs):
    # create daily and cumulative stats dataframe
    daily_perfs = []
    # TODO: the loop here could overwrite expected properties
    # of daily_perf. Could potentially raise or log a
    # warning.
    for perf in perfs:
        if 'daily_perf' in perf:
            perf['daily_perf'].update(
                perf['daily_perf'].pop('recorded_vars')
            )
            perf['daily_perf'].update(perf['cumulative_risk_metrics'])
            daily_perfs.append(perf['daily_perf'])
        else:
            self.risk_report = perf

    daily_dts = [np.datetime64(perf['period_close'], utc=True)
                 for perf in daily_perfs]
    daily_stats = pd.DataFrame(daily_perfs, index=daily_dts)

    return daily_stats
def _pipeline_output(self, pipeline, chunks):
    """
    Internal implementation of `pipeline_output`.
    """
    today = normalize_date(self.get_datetime())
    try:
        data = self._pipeline_cache.unwrap(today)
    except Expired:
        data, valid_until = self._run_pipeline(
            pipeline, today, next(chunks),
        )
        self._pipeline_cache = CachedObject(data, valid_until)

    # Now that we have a cached result, try to return the data for today.
    try:
        return data.loc[today]
    except KeyError:
        # This happens if no assets passed the pipeline screen on a given
        # day.
        return pd.DataFrame(index=[], columns=data.columns)
def frame_from_bardata(self, data, algo_dt):
    """
    Create a DataFrame from the given BarData and algo dt.
    """
    data = data._data
    frame_data = np.empty((len(self.fields), len(self.sids))) * np.nan

    for j, sid in enumerate(self.sids):
        sid_data = data.get(sid)
        if not sid_data:
            continue
        if algo_dt != sid_data['dt']:
            continue
        for i, field in enumerate(self.fields):
            frame_data[i, j] = sid_data.get(field, np.nan)

    return pd.DataFrame(
        frame_data,
        index=self.fields.copy(),
        columns=self.sids.copy(),
    )
def update_dividends(self, new_dividends):
    """
    Update our dividend frame with new dividends.  @new_dividends should be
    a DataFrame with columns containing at least the entries in
    zipline.protocol.DIVIDEND_FIELDS.
    """

    # Mark each new dividend with a unique integer id.  This ensures that
    # we can differentiate dividends whose date/sid fields are otherwise
    # identical.
    new_dividends['id'] = np.arange(
        self._dividend_count,
        self._dividend_count + len(new_dividends),
    )
    self._dividend_count += len(new_dividends)

    self.dividend_frame = sort_values(pd.concat(
        [self.dividend_frame, new_dividends]
    ), ['pay_date', 'ex_date']).set_index('id', drop=False)
def create_test_panel_ohlc_source(sim_params, env):
    start = sim_params.first_open \
        if sim_params else pd.datetime(1990, 1, 3, 0, 0, 0, 0, pytz.utc)
    end = sim_params.last_close \
        if sim_params else pd.datetime(1990, 1, 8, 0, 0, 0, 0, pytz.utc)
    index = env.days_in_range(start, end)
    price = np.arange(0, len(index)) + 100
    high = price * 1.05
    low = price * 0.95
    open_ = price + .1 * (price % 2 - .5)
    volume = np.ones(len(index)) * 1000
    arbitrary = np.ones(len(index))

    df = pd.DataFrame({'price': price,
                       'high': high,
                       'low': low,
                       'open': open_,
                       'volume': volume,
                       'arbitrary': arbitrary},
                      index=index)
    panel = pd.Panel.from_dict({0: df})

    return DataPanelSource(panel), panel
def add_frame(self, tick, frame, minor_axis=None, items=None):
    """
    """
    if self._pos == self.cap:
        self._roll_data()

    if isinstance(frame, pd.DataFrame):
        minor_axis = frame.columns
        items = frame.index

    if set(minor_axis).difference(set(self.minor_axis)) or \
            set(items).difference(set(self.items)):
        self._update_buffer(frame)

    vals = frame.T.astype(self.dtype)
    self.buffer.loc[:, self._pos, :] = vals
    self.date_buf[self._pos] = tick

    self._pos += 1
def getindexdaily(self, code, start, end):
    total = []
    startdate = datetime.datetime.strptime(start, "%Y-%m-%d")
    enddate = datetime.datetime.strptime(end, "%Y-%m-%d")
    series = {"date": [], "open": [], "close": [], "high": [], "low": [], "volume": []}
    for stockdaily in self.index[code].find({"date": {"$gte": startdate, "$lt": enddate}}).sort("date"):
        series["date"].append(stockdaily["date"])
        series["open"].append(stockdaily["open"])
        series["close"].append(stockdaily["close"])
        series["high"].append(stockdaily["high"])
        series["low"].append(stockdaily["low"])
        series["volume"].append(stockdaily["volume"])
    totaldata = zip(series['date'], series['open'], series['close'],
                    series['high'], series['low'], series['volume'])
    df = pd.DataFrame(list(totaldata))
    df.index = df.date
    return df
def read_treasure_from_mongodb(self, start, end):
    startdate = start
    enddate = end
    series = {"Time Period": [], "1month": [], "3month": [], "6month": [],
              "1year": [], "2year": [], "3year": [], "5year": [],
              "7year": [], "10year": [], "20year": [], "30year": []}
    if type(start) is types.StringType:
        startdate = datetime.datetime.strptime(start, "%Y-%m-%d")
    if type(end) is types.StringType:
        enddate = datetime.datetime.strptime(end, "%Y-%m-%d")
    for treasuredaily in self.treasure['treasure'].find({"Time Period": {"$gte": startdate, "$lt": enddate}}).sort("date"):
        series["Time Period"].append(treasuredaily["Time Period"])
        series["1month"].append(treasuredaily["1month"])
        series["3month"].append(treasuredaily["3month"])
        series["6month"].append(treasuredaily["6month"])
        series["1year"].append(treasuredaily["1year"])
        series["2year"].append(treasuredaily["2year"])
        series["3year"].append(treasuredaily["3year"])
        series["5year"].append(treasuredaily["5year"])
        series["7year"].append(treasuredaily["7year"])
        series["10year"].append(treasuredaily["10year"])
        series["20year"].append(treasuredaily["20year"])
        series["30year"].append(treasuredaily["30year"])
    totaldata = zip(series["1month"], series["3month"], series["6month"],
                    series["1year"], series["2year"], series["3year"],
                    series["5year"], series["7year"], series["10year"],
                    series["20year"], series["30year"])
    df = pd.DataFrame(data=list(totaldata), index=series["Time Period"],
                      columns=['1month', '3month', '6month', '1year', '2year', '3year',
                               '5year', '7year', '10year', '20year', '30year'])
    return df.sort_index().tz_localize('UTC')
def __init__(self, constants, dates, sids):
    loaders = {}
    for column, const in iteritems(constants):
        frame = DataFrame(
            const,
            index=dates,
            columns=sids,
            dtype=column.dtype,
        )
        loaders[column] = DataFrameLoader(
            column=column,
            baseline=frame,
            adjustments=None,
        )

    self._loaders = loaders
def test_consume_metadata(self):
    # Test dict consumption
    dict_to_consume = {0: {'symbol': 'PLAY'},
                       1: {'symbol': 'MSFT'}}
    self.env.write_data(equities_data=dict_to_consume)
    finder = self.asset_finder_type(self.env.engine)

    equity = finder.retrieve_asset(0)
    self.assertIsInstance(equity, Equity)
    self.assertEqual('PLAY', equity.symbol)

    # Test dataframe consumption
    df = pd.DataFrame(columns=['asset_name', 'exchange'], index=[0, 1])
    df['asset_name'][0] = "Dave'N'Busters"
    df['exchange'][0] = "NASDAQ"
    df['asset_name'][1] = "Microsoft"
    df['exchange'][1] = "NYSE"
    self.env = TradingEnvironment(load=noop_load)
    self.env.write_data(equities_df=df)
    finder = self.asset_finder_type(self.env.engine)
    self.assertEqual('NASDAQ', finder.retrieve_asset(0).exchange)
    self.assertEqual('Microsoft', finder.retrieve_asset(1).asset_name)
def setUp(self):
    self.env = TradingEnvironment()
    self.days = self.env.trading_days[:5]
    self.panel = pd.Panel({1: pd.DataFrame({
        'price': [1, 1, 2, 4, 8], 'volume': [1e9, 1e9, 1e9, 1e9, 0],
        'type': [DATASOURCE_TYPE.TRADE,
                 DATASOURCE_TYPE.TRADE,
                 DATASOURCE_TYPE.TRADE,
                 DATASOURCE_TYPE.TRADE,
                 DATASOURCE_TYPE.CLOSE_POSITION]},
        index=self.days)
    })
    self.no_close_panel = pd.Panel({1: pd.DataFrame({
        'price': [1, 1, 2, 4, 8], 'volume': [1e9, 1e9, 1e9, 1e9, 1e9],
        'type': [DATASOURCE_TYPE.TRADE,
                 DATASOURCE_TYPE.TRADE,
                 DATASOURCE_TYPE.TRADE,
                 DATASOURCE_TYPE.TRADE,
                 DATASOURCE_TYPE.TRADE]},
        index=self.days)
    })
def test_bfill(self):
    # test ndim=1
    N = 100
    s = pd.Series(np.random.randn(N))
    mask = random.sample(range(N), 10)
    s.iloc[mask] = np.nan

    correct = s.bfill().values
    test = bfill(s.values)
    assert_almost_equal(correct, test)

    # test ndim=2
    df = pd.DataFrame(np.random.randn(N, N))
    df.iloc[mask] = np.nan
    correct = df.bfill().values
    test = bfill(df.values)
    assert_almost_equal(correct, test)
def test_ffill(self):
    # test ndim=1
    N = 100
    s = pd.Series(np.random.randn(N))
    mask = random.sample(range(N), 10)
    s.iloc[mask] = np.nan

    correct = s.ffill().values
    test = ffill(s.values)
    assert_almost_equal(correct, test)

    # test ndim=2
    df = pd.DataFrame(np.random.randn(N, N))
    df.iloc[mask] = np.nan
    correct = df.ffill().values
    test = ffill(df.values)
    assert_almost_equal(correct, test)
def get_expected_next_event_dates(self, dates):
    return pd.DataFrame({
        0: get_values_for_date_ranges(zip_with_dates,
                                      next_dates[0],
                                      next_date_intervals[0],
                                      dates),
        1: get_values_for_date_ranges(zip_with_dates,
                                      next_dates[1],
                                      next_date_intervals[1],
                                      dates),
        2: get_values_for_date_ranges(zip_with_dates,
                                      next_dates[2],
                                      next_date_intervals[2],
                                      dates),
        3: get_values_for_date_ranges(zip_with_dates,
                                      next_dates[3],
                                      next_date_intervals[3],
                                      dates),
        4: zip_with_dates(dates, ['NaT'] * len(dates)),
    }, index=dates)
def get_expected_previous_event_dates(self, dates):
    return pd.DataFrame({
        0: get_values_for_date_ranges(zip_with_dates,
                                      prev_dates[0],
                                      prev_date_intervals[0],
                                      dates),
        1: get_values_for_date_ranges(zip_with_dates,
                                      prev_dates[1],
                                      prev_date_intervals[1],
                                      dates),
        2: get_values_for_date_ranges(zip_with_dates,
                                      prev_dates[2],
                                      prev_date_intervals[2],
                                      dates),
        3: get_values_for_date_ranges(zip_with_dates,
                                      prev_dates[3],
                                      prev_date_intervals[3],
                                      dates),
        4: zip_with_dates(dates, ['NaT'] * len(dates)),
    }, index=dates)
def get_vals_for_dates(zip_date_index_with_vals, vals, date_invervals, dates):
    return pd.DataFrame({
        0: get_values_for_date_ranges(zip_date_index_with_vals,
                                      vals[0],
                                      date_invervals[0],
                                      dates),
        1: get_values_for_date_ranges(zip_date_index_with_vals,
                                      vals[1],
                                      date_invervals[1],
                                      dates),
        2: get_values_for_date_ranges(zip_date_index_with_vals,
                                      vals[2],
                                      date_invervals[2],
                                      dates),
        # Assume the latest of 2 cash values is used if we find out about 2
        # announcements that happened on the same day for the same sid.
        3: get_values_for_date_ranges(zip_date_index_with_vals,
                                      vals[3],
                                      date_invervals[3],
                                      dates),
        4: zip_date_index_with_vals(dates, ['NaN'] * len(dates)),
    }, index=dates)
def test_auto_deltas(self):
    expr = bz.data(
        {'ds': self.df,
         'ds_deltas': pd.DataFrame(columns=self.df.columns)},
        dshape=var * Record((
            ('ds', self.dshape.measure),
            ('ds_deltas', self.dshape.measure),
        )),
    )
    loader = BlazeLoader()
    ds = from_blaze(
        expr.ds,
        loader=loader,
        missing_values=self.missing_values,
    )
    self.assertEqual(len(loader), 1)
    exprdata = loader[ds]
    self.assertTrue(exprdata.expr.isidentical(expr.ds))
    self.assertTrue(exprdata.deltas.isidentical(expr.ds_deltas))
def pipeline_event_loader_args(self, dates):
    _, mapping = super(
        BlazeCashBuybackAuthLoaderTestCase,
        self,
    ).pipeline_event_loader_args(dates)
    return (bz.data(pd.concat(
        pd.DataFrame({
            BUYBACK_ANNOUNCEMENT_FIELD_NAME:
                frame[BUYBACK_ANNOUNCEMENT_FIELD_NAME],
            CASH_FIELD_NAME: frame[CASH_FIELD_NAME],
            TS_FIELD_NAME: frame[TS_FIELD_NAME],
            SID_FIELD_NAME: sid,
        })
        for sid, frame in iteritems(mapping)
    ).reset_index(drop=True)),)
def file_get_iem_data_frame(path):
    """
    Return the IEM samplesheet data as a Pandas DataFrame,
    to perform better slicing operations.
    """
    rows = read_csv_rows(path)
    if not rows_are_iem_samplesheet(rows):
        raise ValueError("Invalid IEM samplesheet format: %s" % path)
    section_gen = rows_iem_section_generator(rows)
    for section in section_gen:
        if section_is_valid_data(section):
            # TODO this appears to be a problem if you have data columns
            # with trailing all-blank entries (see CSI-215 fix)
            df = pd.DataFrame(data=section.rows[1:], columns=section.rows[0])
            # skip tailing rows
            return df[df['Sample_ID'].notnull()]
    raise ValueError("Invalid IEM samplesheet format, no data found: %s" % path)
def gen_csv_paths(data_dir, pref):
    """
    Generate CSV file from image, contour, and segment file paths.

    Args:
        data_dir: BBBC006 data directory path.
        pref: Prefix (either 'train' or 'test')
    """
    filenames = get_png_files(os.path.join(data_dir, 'BBBC006_v1_' + pref))
    contours = get_png_files(os.path.join(data_dir, 'BBBC006_v1_contours_' + pref))
    segments = get_png_files(os.path.join(data_dir, 'BBBC006_v1_segments_' + pref))

    all_files = [filenames, contours, segments]
    pd_arr = pd.DataFrame(all_files).transpose()
    pd_arr.to_csv(pref + '.csv', index=False, header=False)
def kraken_order_book(book_type: str, currency_code: str = 'EUR', coin_code: str = 'XBT'):
    """Kraken specific orderbook retrieval
    """
    import krakenex
    kraken_api = krakenex.API(key=KRAKEN_API_KEY,
                              secret=KRAKEN_PRIVATE_KEY,
                              conn=krakenex.Connection())
    pair = f'X{coin_code}Z{currency_code}'
    orders = kraken_api.query_public('Depth', {'pair': pair})
    df = pd.DataFrame(
        orders['result'][pair][book_type],
        columns=['price', 'volume', 'timestamp'])
    return df
def get_irrelevant_cited_papers(bad_papers, db_cursor, papers_table='papers'):
    """Retrieves the papers cited by the irrelevant papers given in input, from a SQL database.

    Args:
        bad_papers (list of dicts): the list of irrelevant papers, formatted as the output of
            :func:`data_retrieval.list2paper`
        db_cursor (:class:`MySQLdb.cursors.Cursor`): cursor of a SQL database in which there is
            a papers table
        papers_table (string): name of the papers table in the SQL database

    Returns:
        tuple of tuples: the results of the SQL query
    """
    citations = []
    for p in bad_papers:
        for c in p['citations']:
            citations.append([p['index'], c])
    citations_df = pd.DataFrame(citations, columns=['citing', 'cited'])
    cited = citations_df['cited'].unique()
    db_cursor.execute("SELECT id, title, abstract FROM papers p WHERE p.abstract != '' AND p.id IN (" +
                      ','.join(["%s"] * len(cited)) + ")", tuple(cited))
    return db_cursor.fetchall()
def parse_fasta(self):
    self.ref_id = dict()
    self.ref_inf = dict()
    i = 1
    N = 0
    ref_inf = np.empty(shape=[0, 3])
    for seqs in SeqIO.parse(self.ref, 'fasta'):
        seq_id = seqs.id
        self.ref_id[i] = seq_id
        seq = str(seqs.seq.upper())
        seq_len = len(seq)
        self.ref_inf[seq_id] = seq_len
        N += seq.count('N')
        ref_inf = np.append(ref_inf, [[i, seq_id, seq_len]], axis=0)
        i += 1
    self.ref_detail = pd.DataFrame(ref_inf, columns=['Index', 'Contig', 'Length(bp)'])
    self.N = N
def qualification_filter(self):
    """
    Provide information on the unqualified and qualified contigs from the original
    fasta file, using the criterion: >20Kb & >=5 restriction sites inside.
    """
    unqualified = np.empty(shape=[0, 3])
    qualified = np.empty(shape=[0, 4])
    rm_dup = self.RcmapTable[['CMapId', 'ContigLength', 'NumSites']].drop_duplicates()
    for i in self.ref_id.keys():
        index = i
        name = self.ref_id[i]
        length = self.ref_inf[name]
        if i not in self.RcmapTable['CMapId'].unique():
            unqualified = np.append(unqualified, [[index, name, length]], axis=0)
        else:
            Id = rm_dup[rm_dup['CMapId'] == i].index[0]
            sites = rm_dup['NumSites'][Id]
            qualified = np.append(qualified, [[index, name, length, sites]], axis=0)
    self.unqualified = pd.DataFrame(unqualified, columns=['index', 'contig', 'length(bp)'])
    self.qualified = pd.DataFrame(qualified, columns=['index', 'contig', 'length(bp)', 'numSites'])
def test_append():
    np.random.seed(0)
    n = 1000
    df = pd.DataFrame({'x': np.random.randint(0, 5, size=n),
                       'y': np.random.normal(size=n)})

    gdf = gd.DataFrame.from_pandas(df)
    frags = _fragmented_gdf(gdf, nsplit=13)

    # Combine with .append
    head = frags[0]
    tail = frags[1:]

    appended = dgd.from_pygdf(head, npartitions=1)
    for each in tail:
        appended = appended.append(each)

    assert_frame_equal(df, appended.compute().to_pandas())
def test_series_append():
    np.random.seed(0)
    n = 1000
    df = pd.DataFrame({'x': np.random.randint(0, 5, size=n),
                       'y': np.random.normal(size=n)})

    gdf = gd.DataFrame.from_pandas(df)
    frags = _fragmented_gdf(gdf, nsplit=13)
    frags = [df.x for df in frags]

    appending = dgd.from_pygdf(frags[0], npartitions=1)
    for frag in frags[1:]:
        appending = appending.append(frag)

    appended = appending.compute().to_pandas()
    assert isinstance(appended, pd.Series)
    np.testing.assert_array_equal(appended, df.x)
def test_set_index(nelem):
    np.random.seed(0)
    # Use unique index range as the sort may not be stable-ordering
    x = np.arange(nelem)
    np.random.shuffle(x)
    df = pd.DataFrame({'x': x,
                       'y': np.random.randint(0, nelem, size=nelem)})
    ddf = dd.from_pandas(df, npartitions=2)
    dgdf = dgd.from_dask_dataframe(ddf)

    expect = ddf.set_index('x').compute()
    got = dgdf.set_index('x').compute().to_pandas()

    np.testing.assert_array_equal(got.index.values, expect.index.values)
    np.testing.assert_array_equal(got.y.values, expect.y.values)
    assert got.columns == expect.columns
def test_groupby_single_key(keygen):
    np.random.seed(0)
    nelem = 500
    npartitions = 10

    # Generate the keys
    xs = keygen(nelem)
    assert xs.size == nelem

    df = pd.DataFrame({'x': xs,
                       'z': np.random.normal(size=nelem) + 1})
    gdf = gd.DataFrame.from_pandas(df)
    dgf = dgd.from_pygdf(gdf, npartitions=npartitions)

    groups = dgf.groupby(by=['x']).count()
    got = groups.compute().to_pandas()

    # Check against expectation
    expect = df.groupby(by=['x'], as_index=False).count()
    # Check keys
    np.testing.assert_array_equal(got.x, expect.x)
    # Check values
    np.testing.assert_array_equal(got.z, expect.z)
def store_test_predictions(self, prediction_id='_final'):
    """
    Stores the test predictions in a CSV file

    :param prediction_id: A simple id appended to the name of the summary for uniqueness
    :return: None
    """
    # prediction id is usually the step count
    print 'Storing predictions on Test Data...'
    review = []
    true_summary = []
    generated_summary = []
    for i in range(self.test_size):
        if not self.checkpointer.is_output_file_present():
            review.append(self._index2sentence(self.test_review[i]))
            true_summary.append(self._index2sentence(self.true_summary[i]))
        if i < (self.test_batch_size * (self.test_size // self.test_batch_size)):
            generated_summary.append(self._index2sentence(self.predicted_test_summary[i]))
        else:
            generated_summary.append('')

    prediction_nm = 'generated_summary' + prediction_id
    if self.checkpointer.is_output_file_present():
        df = pd.read_csv(self.checkpointer.get_result_location(), header=0)
        df[prediction_nm] = np.array(generated_summary)
    else:
        df = pd.DataFrame()
        df['review'] = np.array(review)
        df['true_summary'] = np.array(true_summary)
        df[prediction_nm] = np.array(generated_summary)
    df.to_csv(self.checkpointer.get_result_location(), index=False)

    print 'Stored the predictions. Moving Forward'
    if prediction_id == '_final':
        print 'All done. Exiting..'
        print 'Exited'
def crawl_for_reviews_and_summary(self, input_file):
    """
    Crawl the input dataset

    :param input_file: The location of the file containing the txt file dataset
    :return: None
    """
    self.raw_data_file = input_file
    self.df = pd.DataFrame()
    self.df['Review'] = self.__crawl_review()
    self.df['Summary'] = self.__crawl_summary()
def pearson(X, y):
    r = []
    p = []
    for c in X.columns:
        r_, p_ = pearsonr(X[c], y)
        r.append(r_)
        p.append(p_)
    dfr = pd.DataFrame(index=range(1, 1 + len(X.columns)))
    dfr['pearson'] = r
    dfr['pearson_p'] = p
    return dfr
def kolmogorov_smirnov(x_train, x_test):
    r = []
    p = []
    for c in x_train.columns:
        r_, p_ = ks_2samp(x_train[c], x_test[c])
        r.append(r_)
        p.append(p_)
    dfks = pd.DataFrame(index=range(1, 1 + len(x_train.columns)))
    dfks['KS'] = r
    dfks['KS_p'] = p
    return dfks