We extracted the following 50 code examples from open-source Python projects to illustrate how to use pandas.notnull().
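Before the project examples, here is a minimal, self-contained sketch (the DataFrame contents are illustrative only, not taken from any of the projects below) showing what pd.notnull() returns for scalars, Series, and DataFrames, and the row-filtering idiom that recurs throughout the examples:

import numpy as np
import pandas as pd

# pd.notnull() is the element-wise complement of pd.isnull():
# it returns True for values that are not NaN / None / NaT.
print(pd.notnull(np.nan))      # False
print(pd.notnull("text"))      # True

s = pd.Series([1.0, np.nan, 3.0])
print(pd.notnull(s).tolist())  # [True, False, True]

# The row-filtering idiom used in many of the examples below:
# keep only the rows whose column 'a' is not null.
df = pd.DataFrame({"a": [1.0, None, 3.0], "b": ["x", "y", None]})
print(df[pd.notnull(df["a"])])
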
def parse_psqs(psqs_results_file):
    """Parse a PSQS result file and return a Pandas DataFrame of the results

    Args:
        psqs_results_file: Path to psqs results file

    Returns:
        Pandas DataFrame: Summary of PSQS results

    """
    # TODO: generalize column names for all results, save as dict instead
    psqs_results = pd.read_csv(psqs_results_file, sep='\t', header=None)
    psqs_results['pdb_file'] = psqs_results[0].apply(lambda x: str(x).strip('./').strip('.pdb'))
    psqs_results = psqs_results.rename(columns={1: 'psqs_local', 2: 'psqs_burial', 3: 'psqs_contact', 4: 'psqs_total'}).drop(0, axis=1)
    psqs_results['u_pdb'] = psqs_results['pdb_file'].apply(lambda x: x.upper() if len(x) == 4 else np.nan)
    psqs_results['i_entry_name'] = psqs_results['pdb_file'].apply(lambda x: x.split('_model1')[0] if len(x) > 4 else np.nan)
    psqs_results = psqs_results[pd.notnull(psqs_results.psqs_total)]

    return psqs_results

def trace_serializer(trace):
    data = OrderedDict([
        ("type", "ARBITRARY_START"),
        ("interpretation", trace.interpretation),
        ("unit", trace.unit),
        ("trace_id", trace.trace_id),
        ("interval", trace.interval),
        ("records", [
            OrderedDict([
                ("start", start.isoformat()),
                ("value", record.value if pd.notnull(record.value) else None),
                ("estimated", bool(record.estimated)),
            ])
            for start, record in trace.data.iterrows()
        ]),
    ])
    return data

def clean_and_write_dataframe_to_csv(data, filename):
    """
    Cleans a dataframe of np.NaNs and saves to file via pandas.to_csv

    :param data: data to write to CSV
    :type data: :class:`pandas.DataFrame`
    :param filename: Path to file to write CSV to. if None, string of data will be returned
    :type filename: str | None
    :return: If the filename is None, returns the string of data. Otherwise returns None.
    :rtype: str | None
    """
    # cleans np.NaN values
    data = data.where((pd.notnull(data)), None)

    # If filename=None, to_csv will return a string
    result = data.to_csv(path_or_buf=filename, encoding='utf-8', dtype=str, index=False,
                         na_rep=None, skipinitialspace=True, quoting=csv.QUOTE_ALL)
    logging.info("Dataframe of shape %s has been stored." % str(data.shape))
    return result

def element_to_bdsim(e):
    """Convert a pandas.Series representation onto a BDSim sequence element."""
    bdsim = ""
    if e.KEYWORD in ['MARKER', 'INSTRUMENT']:
        bdsim = "{}: {};".format(e.name.replace('$', ''), "marker")
    if e.KEYWORD in ['DRIFT', 'QUADRUPOLE', 'RBEND', 'SBEND']:
        bdsim = "{}: {}, l={}*m".format(e.name.replace('$', ''), e.KEYWORD.lower(), e.L)
        if e.get('BENDING_ANGLE') is not None and not np.isnan(e['BENDING_ANGLE']):
            bdsim += f",angle=-{e['BENDING_ANGLE']}"
        elif e.get('ANGLE') is not None and not np.isnan(e['ANGLE']):
            bdsim += f",angle=-{e.get('ANGLE', 0)}"
        else:
            # Angle property not supported by the element or absent
            bdsim += ""
        #if pd.notnull(e['APERTYPE']):
        #    bdsim += ", aperture={}*m".format(str(e['APERTURE']).strip('[]'))
        if pd.notnull(e.get('PLUG')) and pd.notnull(e.get('CIRCUIT')):
            bdsim += ", {}={{{{ {} or '0.0' }}}}".format(e['PLUG'].lower(), e['CIRCUIT'])
        bdsim += ';'
    return bdsim

def element_to_mad(e):
    """Convert a pandas.Series representation onto a MAD-X sequence element."""
    if e.CLASS not in SUPPORTED_CLASSES:
        return ""
    mad = "{}: {}, ".format(e.name, e.CLASS)
    if e.get('BENDING_ANGLE') is not None and not np.isnan(e['BENDING_ANGLE']):
        mad += f"ANGLE={e['BENDING_ANGLE']},"
    elif e.get('ANGLE') is not None and not np.isnan(e['ANGLE']):
        mad += f"ANGLE={e.get('ANGLE', 0)},"
    else:
        # Angle property not supported by the element or absent
        mad += ""
    mad += ', '.join(["{}={}".format(p, e[p]) for p in SUPPORTED_PROPERTIES if pd.notnull(e.get(p, None))])
    if pd.notnull(e['LENGTH']) and e['LENGTH'] != 0.0:
        mad += ", L={}".format(e['LENGTH'])
    if pd.notnull(e.get('APERTYPE', None)):
        mad += ", APERTURE={}".format(str(e['APERTURE']).strip('[]'))
    if pd.notnull(e.get('PLUG')) and pd.notnull(e.get('CIRCUIT')) and pd.isnull(e.get('VALUE')):
        mad += ", {}:={}".format(e['PLUG'], e['CIRCUIT'])
    if pd.notnull(e.get('PLUG')) and pd.notnull(e.get('VALUE')):
        mad += ", {}={}".format(e['PLUG'], e['VALUE'])
    mad += ", AT={}".format(e['AT_CENTER'])
    mad += ";"
    return mad

def get_sec_spt(row):
    """
    Get the secondary spectral type from the information we have.
    Meant to be called as the `apply` method of a pandas DataFrame.
    """
    if pd.notnull(row['Sp2']):
        return row['Sp2']
    elif pd.notnull(row['Sp1']) and pd.notnull(row['mag1']) and pd.notnull(row['mag2']):
        # TODO: Do better than assuming V band!
        band = 'V'
        absmag_prim = MS.GetAbsoluteMagnitude(row['Sp1'], color=band)
        dm = float(row['mag1']) - absmag_prim
        absmag_sec = float(row['mag2']) - dm
        return MS.GetSpectralType_FromAbsMag(absmag_sec, color=band)[0]
    elif pd.notnull(row['Sp1']) and pd.notnull(row['K1']) and pd.notnull(row['K2']):
        mass = MS.Interpolate('mass', row['Sp1'])
        q = float(row['K1']) / float(row['K2'])
        sec_mass = q * mass
        return MS.GetSpectralType('mass', sec_mass)[0]
    else:
        print(row)
        raise ValueError('Must give enough information to figure out the spectral type!')

def series_is_datetime(series: pd.Series, check_num: int = 5, dropna: bool = True):
    """
    Checks random rows in a Series comparing rows that coerce to datetime.

    :param series:
    :param check_num:
    :param dropna:
    :return:
    """
    if dropna:
        series = series.dropna(axis=0)
    got, lost = 0, 0
    size = (check_num if series.index.size > check_num else series.index.size)
    if size > 0:
        checks = np.random.randint(0, high=series.index.size, size=size)
        for x in series[checks].tolist():
            try:
                x = pd.Timestamp(x)
                if pd.notnull(x):
                    got += 1
            except (ValueError, OverflowError):
                lost += 1
    return got > lost

def to_csv(self, filepath='hypothesis/SGD_hypothesis_header.csv'):
    df = pd.DataFrame()
    df = pd.concat([df, pd.DataFrame([['depth', self.depth]])], ignore_index=True)
    df = pd.concat([df, pd.DataFrame([['sizes'] + [self.input_size+1]
                                      + [hidden_size+1 for hidden_size in self.hidden_sizes]
                                      + [self.output_size]])], ignore_index=True)
    for i, weight in enumerate(self.best_weights):
        df = pd.concat([df, pd.DataFrame([['W_{}'.format(i)] + weight.T.flatten().tolist()])],
                       ignore_index=True)

    # Fill nan with None
    df = df.where((pd.notnull(df)), None)

    # Since pd.to_csv converts int to float if there's `None` in the same row,
    # we need to handle this.
    with open(filepath, 'w') as f:
        for row in range(df.shape[0]):
            for col in range(df.shape[1]):
                if (row == 0 and col != 0) or (row == 1 and col != 0):
                    val = int(df[col][row]) if df[col][row] is not None else ''
                else:
                    val = df[col][row] if df[col][row] is not None else ''
                f.writelines('{},'.format(val))
            if row != df.shape[0]-1:
                f.writelines('\n')

def execute_internal(self, context, **kwargs):
    """
    the internal execution process to be implemented
    :param context:
    :param kwargs:
    :return:
    """
    df = pd.read_csv('https://raw.githubusercontent.com/bailaohe/parade/master/assets/movie_metadata.csv')

    # Process projection on the dataset to get our interested attributes
    df = df[['movie_title', 'genres', 'title_year', 'content_rating', 'budget', 'num_voted_users', 'imdb_score']]

    # Filter out records with *NAN* title_year and budget
    df = df[pd.notnull(df['title_year'])]
    df = df[df['budget'] > 0]

    # Extract the genres ROOT
    df['genres_root'] = df['genres'].apply(lambda g: g.split('|')[0])

    return df

def Join_Inputs(df, df_betas, df_ff_params, df_liq_prox):
    # add beta values & set index to datetime from df_diff
    df = pd.merge(df, df_betas, left_on='cusip_id', right_on='cusip_id', left_index=True)
    df['trd_exctn_dt_idx'] = pd.to_datetime(df['trd_exctn_dt'], format='%Y%m%d')
    df.set_index('trd_exctn_dt_idx', inplace=True)

    # Join with fama-french factors on date index
    df_join_ff = df.join(df_ff_params, lsuffix="_m", rsuffix='_b')

    # Drop any rows where dates in df_diff do not appear in fama-french
    df_join_ff = df_join_ff[pd.notnull(df_join_ff['MKT_b'])]

    # Combine liquidity factor L_t
    df_liq_prox_values = df_liq_prox['residual_term']
    df_join_liq = df_join_ff.join(df_liq_prox_values)
    df_join_liq = df_join_liq[pd.notnull(df_join_liq['residual_term'])]

    return df_join_liq

def test_decide_from_contexts_df_null_decisions():
    tree = CLIENT.get_decision_tree(
        AGENT_ID,
        COMPLEX_AGENT_DATA.last_valid_index().value // 10 ** 9)

    test_df = pd.DataFrame(
        [
            ["Jean-Pierre", "+02:00"],
            ["Paul"]
        ],
        columns=["b", "tz"],
        index=pd.date_range("20130201", periods=2, freq="D"))

    df = CLIENT.decide_from_contexts_df(tree, test_df)
    assert_equal(len(df), 2)

    assert pd.isnull(df["a_predicted_value"][0])
    assert pd.notnull(df["error"][0])
    assert pd.notnull(df["a_predicted_value"][1])
    assert pd.isnull(df["error"][1])

def add_operations(self, agent_id, operations):
    if isinstance(operations, pd.DataFrame):
        if not isinstance(operations.index, pd.DatetimeIndex):
            raise CraftAiBadRequestError("Invalid dataframe given, it is not time indexed")

        chunk_size = self.config["operationsChunksSize"]
        for chunk in chunker(operations, chunk_size):
            chunk_operations = [
                {
                    "timestamp": row.name.value // 10 ** 9,  # Timestamp.value returns nanoseconds
                    "context": {
                        col: row[col] for col in operations.columns if pd.notnull(row[col])
                    }
                } for _, row in chunk.iterrows()
            ]
            super(Client, self).add_operations(agent_id, chunk_operations)

        return {
            "message": "Successfully added %i operation(s) to the agent \"%s/%s/%s\" context."
                       % (len(operations), self.config["owner"], self.config["project"], agent_id)
        }
    else:
        return super(Client, self).add_operations(agent_id, operations)

def decide_from_row(tree, columns, row):
    time = Time(
        t=row.name.value // 10 ** 9,  # Timestamp.value returns nanoseconds
        timezone=row.name.tz
    )
    context = {
        col: row[col] for col in columns if pd.notnull(row[col])
    }
    try:
        decision = VanillaInterpreter.decide(tree, [context, time])

        keys, values = zip(*[
            (output + "_" + key, value)
            for output, output_decision in decision["output"].items()
            for key, value in output_decision.items()
        ])

        return pd.Series(data=values, index=keys)
    except CraftAiNullDecisionError as e:
        return pd.Series(data=[e.message], index=["error"])

def _calculate_geographic_nullity(geo_group, x_col, y_col):
    """
    Helper method which calculates the nullity of a DataFrame. Factored out of and used within `geoplot`.
    """
    # Aggregate by point and fetch a list of non-null coordinate pairs, which is returned.
    point_groups = geo_group.groupby([x_col, y_col])
    points = [point for point in point_groups.groups.keys()
              if pd.notnull(point[0]) and pd.notnull(point[1])]
    # Calculate nullities by location, then take their average within the overall feature.
    counts = np.sum(point_groups.count().values, axis=1)
    entries = point_groups.size()
    width = len(geo_group.columns)
    # Remove empty (NaN, NaN) points.
    if len(entries) > 0:  # explicit check to avoid a Runtime Warning
        geographic_nullity = np.average(1 - counts / width / entries)
        return points, geographic_nullity
    else:
        return points, np.nan

def _get_hd_args(path, high_dim_node, annotation):
    """ Create dict with meta tags that belong to a certain high dimensional node. """
    map_file = high_dim_node.sample_mapping
    s = map_file.slice_path(path).iloc[:, 5].unique()
    t = map_file.slice_path(path).iloc[:, 6].unique()

    hd_args = {'hd_sample': ', '.join(s.astype(str)) if pd.notnull(s[0]) else '',
               'hd_tissue': ', '.join(t.astype(str)) if pd.notnull(t[0]) else '',
               'hd_type': Mappings.annotation_data_types.get(high_dim_node.params.datatype),
               }

    if annotation:
        hd_args.update({'pl_marker_type': annotation.marker_type,
                        'pl_genome_build': annotation.params.get('GENOME_RELEASE', ''),
                        'pl_title': annotation.params.get('TITLE', ''),
                        'pl_id': annotation.platform})
    return hd_args

def extract_days(input_delta):
    """
    Helper function to extract the number of days from a time delta.

    Returns:
        - Number of days, if valid time delta
        - np.NaN if time delta is null or invalid

    :param input_delta:
    :return: number of days in time delta
    :rtype: float
    """
    # Attempt to coerce into Pandas time delta
    delta = pd.Timedelta(input_delta)

    # Attempt to extract number of days
    days = np.NaN
    if pd.notnull(delta):
        days = delta.days

    # Return result
    return days

def kama(self, efficiency_ratio_periods=10, ema_fast=2, ema_slow=30, period=20,
         column='adj_close'):
    er = self._efficiency_ratio_computation(
        period=efficiency_ratio_periods, column=column)
    fast_alpha = 2 / (ema_fast + 1)
    slow_alpha = 2 / (ema_slow + 1)
    smoothing_constant = pd.Series(
        (er * (fast_alpha - slow_alpha) + slow_alpha) ** 2,
        name='smoothing_constant')
    sma = pd.Series(self.ohlcv[column].rolling(period).mean(), name='SMA')
    kama = []
    for smooth, ma, price in zip(iter(smoothing_constant.items()),
                                 iter(sma.shift(-1).items()),
                                 iter(self.ohlcv[column].items())):
        try:
            kama.append(kama[-1] + smooth[1] * (price[1] - kama[-1]))
        except:
            if pd.notnull(ma[1]):
                kama.append(ma[1] + smooth[1] * (price[1] - ma[1]))
            else:
                kama.append(None)
    sma['KAMA'] = pd.Series(kama, index=sma.index,
                            name='{} days KAMA Ticker {}'.format(period, self.ticker))
    yield sma['KAMA']

def KAMA(cls, ohlc, er=10, ema_fast=2, ema_slow=30, period=20):
    """Developed by Perry Kaufman, Kaufman's Adaptive Moving Average (KAMA) is a moving average
    designed to account for market noise or volatility. Its main advantage is that it takes into
    consideration not just the direction, but the market volatility as well."""

    er = cls.ER(ohlc, er)
    fast_alpha = 2 / (ema_fast + 1)
    slow_alpha = 2 / (ema_slow + 1)
    sc = pd.Series((er * (fast_alpha - slow_alpha) + slow_alpha)**2,
                   name="smoothing_constant")  ## smoothing constant

    sma = pd.Series(ohlc["close"].rolling(period).mean(), name="SMA")  ## first KAMA is SMA
    kama = []
    # Current KAMA = Prior KAMA + smoothing_constant * (Price - Prior KAMA)
    for s, ma, price in zip(sc.iteritems(), sma.shift().iteritems(), ohlc["close"].iteritems()):
        try:
            kama.append(kama[-1] + s[1] * (price[1] - kama[-1]))
        except:
            if pd.notnull(ma[1]):
                kama.append(ma[1] + s[1] * (price[1] - ma[1]))
            else:
                kama.append(None)

    sma["KAMA"] = pd.Series(kama, index=sma.index,
                            name="{0} period KAMA.".format(period))  ## apply the kama list to existing index
    return sma["KAMA"]

def markGaps(self):
    """Produces dictionary of list of gaps in time series data based on the presence of nan values;
    used for gantt plotting

    :returns: dateranges; a dictionary with station names as keys and lists of begin and end dates as values
    """
    df = self.data
    stations = self.stations
    dateranges = {}
    for station in stations:
        dateranges[station] = []
        first = df.ix[:, station].first_valid_index()
        last = df.ix[:, station].last_valid_index()
        records = df.ix[first:last, station]
        #dateranges[station].append(pd.to_datetime(first))
        for i in range(len(records) - 1):
            if pd.isnull(records[i + 1]) and pd.notnull(records[i]):
                dateranges[station].append(pd.to_datetime(records.index)[i])
            elif pd.isnull(records[i]) and pd.notnull(records[i + 1]):
                dateranges[station].append(pd.to_datetime(records.index)[i])
        dateranges[station].append(pd.to_datetime(last))
    return dateranges

def update_last_known_values(self):
    """
    Store the non-NaN values from our oldest frame in each frequency.
    """
    ffillable = self.ffillable_fields
    if not len(ffillable):
        return

    for frequency in self.unique_frequencies:
        digest_panel = self.digest_panels.get(frequency, None)
        if digest_panel:
            oldest_known_values = digest_panel.oldest_frame(raw=True)
        else:
            oldest_known_values = self.buffer_panel.oldest_frame(raw=True)

        oldest_vals = oldest_known_values
        oldest_columns = self.fields
        for field in ffillable:
            f_idx = oldest_columns.get_loc(field)
            field_vals = oldest_vals[f_idx]
            # isnan would be fast, possible to use?
            non_nan_sids = np.where(pd.notnull(field_vals))
            key = (frequency.freq_str, field)
            key_loc = self.last_known_prior_values.index.get_loc(key)
            self.last_known_prior_values.values[
                key_loc, non_nan_sids
            ] = field_vals[non_nan_sids]

def uniprot_reviewed_checker(uniprot_id):
    """Check if a single UniProt ID is reviewed or not.

    Args:
        uniprot_id:

    Returns:
        bool: If the entry is reviewed

    """
    query_string = 'id:' + uniprot_id

    uni_rev_raw = StringIO(bsup.search(query_string, columns='id,reviewed', frmt='tab'))
    uni_rev_df = pd.read_table(uni_rev_raw, sep='\t', index_col=0)
    uni_rev_df = uni_rev_df.fillna(False)
    uni_rev_df = uni_rev_df[pd.notnull(uni_rev_df.Status)]

    uni_rev_df = uni_rev_df.replace(to_replace="reviewed", value=True)
    uni_rev_df = uni_rev_df.replace(to_replace="unreviewed", value=False)
    uni_rev_dict_adder = uni_rev_df.to_dict()['Status']

    return uni_rev_dict_adder[uniprot_id]

def processData(data):
    df = pd.DataFrame.transpose(pd.read_json(json.dumps(data)))
    df = df.dropna(subset=[key for key in df.keys() if "x_" in key])
    df = df[pd.notnull(df['y_observed'])]
    X = df[[key for key in df.keys() if "x_" in key]].values
    y = df["y_observed"].values
    return X, y

# 5th: initial model

def pre_processData(train_data, file_path):
    # Fill missing Age values with the mean age
    train_data.loc[(train_data.Age.isnull()), 'Age'] = np.mean(train_data.Age)
    # Mark Cabin as 'yes' when present and 'no' when missing
    train_data.loc[(train_data.Cabin.notnull(), 'Cabin')] = 'yes'
    train_data.loc[(train_data.Cabin.isnull(), 'Cabin')] = 'no'

    '''One-hot (0/1) encoding of the categorical features'''
    dummies_cabin = pd.get_dummies(train_data['Cabin'], prefix='Cabin')  # get_dummies encodes categories as 0/1 columns prefixed with 'Cabin'
    dummies_Embarked = pd.get_dummies(train_data['Embarked'], prefix='Embarked')
    dummies_Sex = pd.get_dummies(train_data['Sex'], prefix='Sex')
    dummies_Pclass = pd.get_dummies(train_data['Pclass'], prefix='Pclass')

    train_data = pd.concat([train_data, dummies_cabin, dummies_Embarked, dummies_Pclass, dummies_Sex], axis=1)  # concatenate the dummy columns along axis=1
    train_data.drop(['Pclass', 'Name', 'Sex', 'Embarked', 'Cabin', 'Ticket'], axis=1, inplace=True)  # drop the original categorical columns
    header_string = ','.join(train_data.columns.tolist())  # join the column names into a header string
    np.savetxt(file_path + r'/pre_processData1.csv', train_data, delimiter=',', header=header_string)  # save the intermediate result

    '''Standardize the numeric features (Age and Fare)'''
    scaler = StandardScaler()
    age_scaler = scaler.fit(train_data['Age'])
    train_data['Age'] = age_scaler.fit_transform(train_data['Age'])
    if np.sum(train_data.Fare.isnull()):  # fill missing Fare values with the mean fare
        train_data.loc[(train_data.Fare.isnull(), 'Fare')] = np.mean(train_data.Fare)
    fare_scaler = scaler.fit(train_data['Fare'])
    train_data['Fare'] = fare_scaler.transform(train_data['Fare'])
    header_string = ','.join(train_data.columns.tolist())  # join the column names into a header string
    np.savetxt(file_path + r'/pre_processData_scaled.csv', train_data, delimiter=',', header=header_string)  # save the scaled result
    return train_data

## feature engineering

def generate_tokens(table, key_attr, join_attr, tokenizer):
    table_nonnull = table[pd.notnull(table[join_attr])]
    return dict(zip(table_nonnull[key_attr],
                    table_nonnull[join_attr].apply(tokenizer.tokenize)))

def preprocess_data(path, is_test=False):
    data = pd.read_csv(path, index_col='PassengerId')
    data.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

    if is_test:
        data = data.replace([None], [0])
    else:
        data = data[pd.notnull(data['Age'])]
        data = data[pd.notnull(data['Embarked'])]

    data.replace(["female", "male"], [0, 1], inplace=True)
    data.replace(["Q", "C", "S"], [0, 1, 2], inplace=True)

    if "Survived" in data:
        data = data[pd.notnull(data['Survived'])]

    data_norm = (data - data.mean()) / (data.max() - data.min())
    return data_norm

def plot_facet(self, data, color, **kwargs):
    x = kwargs.get("x")
    y = kwargs.get("y")
    levels_x = kwargs.get("levels_x")
    levels_y = kwargs.get("levels_y")

    #num = []
    #date = []
    #time = data[self._time_column]
    #num = data[self._time_column].apply(self.convert_to_datetime)
    #date = data[self._time_column].apply(self.convert_to_timeseries)
    #if pd.isnull(num).sum() <= pd.isnull(date).sum():
        #data[self._time_column] = num
    #else:
        #data[self._time_column] = date
    #data.dropna(inplace=True)

    #if len(self._groupby) == 2:
        #ct = pd.crosstab(data[self._time_column], data[self._groupby[0]])
        #ct = ct.reindex_axis(self._levels[0], axis=1).fillna(0)
        #ct = ct[pd.notnull(ct.index)]
    #else:
        #ct = pd.crosstab(
            #data[self._time_column],
            #pd.Series([""] * len(self._table[self._time_column]), name=""))

    ## Line plot:
    #self.vmax = max(self.vmax, ct.values.max())
    #ct.plot(ax=plt.gca(), color=self.get_palette())

def plot_facet(self, data, color, **kwargs):
    x = kwargs.get("x")
    y = kwargs.get("y")
    levels_x = kwargs.get("levels_x")
    levels_y = kwargs.get("levels_y")

    #num = []
    #date = []
    #time = data[self._time_column]
    #num = data[self._time_column].apply(self.convert_to_datetime)
    #date = data[self._time_column].apply(self.convert_to_timeseries)
    #if pd.isnull(num).sum() <= pd.isnull(date).sum():
        #data[self._time_column] = num
    #else:
        #data[self._time_column] = date
    #data.dropna(inplace=True)

    #if len(self._groupby) == 2:
        #ct = pd.crosstab(data[self._time_column], data[self._groupby[0]])
        #ct = ct.reindex_axis(self._levels[0], axis=1).fillna(0)
        #ct = ct[pd.notnull(ct.index)]
    #else:
        #ct = pd.crosstab(
            #data[self._time_column],
            #pd.Series([""] * len(self._table[self._time_column]), name=""))

    ## Stacked area plot:
    #if len(self._groupby) == 2:
        #self.vmax = max(self.vmax, ct.apply(sum, axis=1).max())
    #ct.plot(ax=plt.gca(), kind="area", stacked=True, color=self.get_palette(), **kwargs)

def plot_facet(self, data, color, **kwargs):
    x = kwargs.get("x")
    y = kwargs.get("y")
    levels_x = kwargs.get("levels_x")
    levels_y = kwargs.get("levels_y")

    #num = []
    #date = []
    #time = data[self._time_column]
    #num = data[self._time_column].apply(self.convert_to_datetime)
    #date = data[self._time_column].apply(self.convert_to_timeseries)
    #if pd.isnull(num).sum() <= pd.isnull(date).sum():
        #data[self._time_column] = num
    #else:
        #data[self._time_column] = date
    #data.dropna(inplace=True)

    #if len(self._groupby) == 2:
        #ct = pd.crosstab(data[self._time_column], data[self._groupby[0]])
        #ct = ct.reindex_axis(self._levels[0], axis=1).fillna(0)
        #ct = ct[pd.notnull(ct.index)]
    #else:
        #ct = pd.crosstab(
            #data[self._time_column],
            #pd.Series([""] * len(self._table[self._time_column]), name=""))

    ## percentage area plot:

    ## if there is only one grouping variable (the time column),
    ## the cross table produces a Series, not a data frame. It
    ## isn't really very informative to plot it, but we provide
    ## for this special case anyway.
    #if type(ct) == pd.Series:
        #ct = ct.apply(lambda x: 100)
    #else:
        #ct = ct.apply(lambda x: (100 * x) / sum(x), axis=1)

    #ct.plot(kind="area", ax=plt.gca(), stacked=True, color=self.get_palette(), **kwargs)

def _save_series(self, series):
    data = [
        [
            d.strftime(self.cache_date_format),
            t if pd.notnull(t) else None
        ]
        for d, t in series.iteritems()
    ]
    self.json_store.save_json(self._get_cache_key(), data)

def save_series(self, year, series):
    key = self._get_cache_key(year)
    data = [
        [
            d.strftime(self.cache_date_format),
            t if pd.notnull(t) else None
        ]
        for d, t in series.iteritems()
    ]
    self.json_store.save_json(key, data)

def yield_records(self, sorted_records):
    n = len(sorted_records)
    for i, record in enumerate(sorted_records):
        self.validate_record(record)
        start = record["start"]
        value = record["value"]
        estimated = record.get("estimated", False)
        if i < n - 1:  # all except last record
            yield (start, value, estimated)
        else:  # last record
            end = record.get("end", None)
            if end is None:
                # can't use the value of this record, no end date
                yield (start, np.nan, False)
            else:
                self._validate_record_start_end(record, start, end)
                # provide an end date cap
                if pd.notnull(value):
                    yield (start, value, estimated)
                    yield (end, np.nan, False)
                else:
                    yield (start, np.nan, False)

def serialize_input(self, input_data):
    ''' Serialize input data '''
    return OrderedDict([
        (start.isoformat(), OrderedDict([
            ("energy", row.energy if pd.notnull(row.energy) else None),
            ("tempF", row.tempF if pd.notnull(row.tempF) else None),
        ]))
        for start, row in input_data.iterrows()
    ])

def read_and_clean_csv_to_dataframe(filename_or_stream, encoding='utf-8'):
    """
    Reads a utf-8 encoded CSV directly into a pandas dataframe as string values and
    scrubs np.NaN values to Python None

    :param str filename_or_stream: path to CSV
    :return:
    """
    # pulls data in as utf8, all as strings, and without pre whitespace padding
    try:
        data = pd.read_csv(
            filepath_or_buffer=filename_or_stream,
            encoding=encoding,
            dtype=str,
            skipinitialspace=True
        )
    except AttributeError:
        # this is an empty dataframe and pandas crashed because it can't coerce the columns to strings
        # issue and PR to fix is open on pandas core at https://github.com/pydata/pandas/issues/12048
        # slated for 1.8 release
        # so for now just try loading the dataframe without specifying dtype
        data = pd.read_csv(
            filepath_or_buffer=filename_or_stream,
            encoding=encoding,
            skipinitialspace=True
        )
        logging.info('File read via the pandas read_csv methodology.')

    # coerces pandas nulls (of np.NaN type) into python None
    data = data.where((pd.notnull(data)), None)

    # coerces string representations of Python None to a real Python None
    data[data == 'None'] = None
    data[data == ''] = None

    logging.info("Dataframe of shape %s has been retrieved." % str(data.shape))
    return data

def __init__(self, estimates, name_map):
    validate_column_specs(
        estimates,
        name_map
    )

    self.estimates = estimates[
        estimates[EVENT_DATE_FIELD_NAME].notnull() &
        estimates[FISCAL_QUARTER_FIELD_NAME].notnull() &
        estimates[FISCAL_YEAR_FIELD_NAME].notnull()
    ]
    self.estimates[NORMALIZED_QUARTERS] = normalize_quarters(
        self.estimates[FISCAL_YEAR_FIELD_NAME],
        self.estimates[FISCAL_QUARTER_FIELD_NAME],
    )

    self.array_overwrites_dict = {
        datetime64ns_dtype: Datetime641DArrayOverwrite,
        float64_dtype: Float641DArrayOverwrite,
    }
    self.scalar_overwrites_dict = {
        datetime64ns_dtype: Datetime64Overwrite,
        float64_dtype: Float64Overwrite,
    }

    self.name_map = name_map
    self._columns = set(name_map.keys())

def update_dataframe_to_be_none_instead_of_nan_for_api_responses(df):
    df = df.where((pd.notnull(df)), None)
    return df

def get_sorted_response(series):
    if series.dropna().empty:
        return NO_DATA_RESPONSE

    # Do an odd sorted-tuple response because Javascript sorting is an oddly difficult problem
    # sorted_response = [item for item in series.iteritems()]
    sorted_response = []
    for index, value in series.iteritems():
        if not pd.notnull(value):
            value = None
        data_point = (index, value)
        sorted_response.append(data_point)
    return Response(sorted_response)

def test_api_categorization_sort(app, sort_by):
    n_categories = 2
    dsid, lsi_id, _, ds_input = get_features_lsi_cached(app, n_categories=n_categories)
    method = V01 + "/feature-extraction/{}".format(dsid)
    data = app.get_check(method)

    training_set = ds_input['training_set']

    pars = {
        'parent_id': lsi_id,
        'data': training_set,
        'method': 'NearestNeighbor'}

    method = V01 + "/categorization/"
    data = app.post_check(method, json=pars)
    mid = data['id']

    method = V01 + "/categorization/{}/predict".format(mid)
    data = app.get_check(method, json={'batch_id': -1, "sort_by": sort_by})

    res = []
    for row in data['data']:
        res_el = {'document_id': row['document_id']}
        for scores in row['scores']:
            res_el[scores['category']] = scores['score']
        res.append(res_el)
    df = pd.DataFrame(res)
    df = df.set_index('document_id')

    if sort_by in df.columns:
        mask = pd.notnull(df[sort_by])
        assert_array_equal(df[mask].index.values,
                           df[mask].sort_values(sort_by, ascending=False).index.values)

def _prepare_data(self):
    """
    Subset the dataframe to the columns needed for estimation purposes, and add a constant.

    :return: pd.DataFrame
    """
    # Subset the data to the columns used in the model
    data = self.data[self.varlist].copy()
    data = data[pd.notnull(data)].reset_index(drop=True)

    # Mapping each variable name to a unique variable code, and renaming the columns in the data.
    data.rename(columns=self._var_to_symb, inplace=True)

    # Adding a constant to the data.
    data["Cons"] = 1

    if self.options["logit"]:
        endog = data["y"]
        uniques = np.unique(endog)
        if len(uniques) != 2:
            raise ValueError(
                "The dependent variable does not have exactly two distinct outcomes."
                "Please provide another dataset or change the 'logit' option to 0")
        else:
            endog_logit = [0 if i == uniques[0] else 1 for i in endog]
            data["y"] = endog_logit
    return data

def remove_random_nan(pd_obj):
    return pd_obj.where((pd.notnull(pd_obj)), None)

def split_by_component(df):
    df['prim_comp'] = df.Comp.map(lambda s: s[0])
    df['sec_comp'] = df.Comp.map(lambda s: s[-1])
    comps = pd.concat((df[['prim_comp', 'Sp1']], df[['sec_comp', 'Sp2']]))
    prim = comps.loc[comps.prim_comp.notnull()].rename(columns={'Sp1': 'SpT', 'prim_comp': 'comp'})
    sec = comps.loc[comps.sec_comp.notnull()].rename(columns={'Sp2': 'SpT', 'sec_comp': 'comp'})
    return pd.concat((prim, sec))[['comp', 'SpT']].drop_duplicates(subset='comp')

def add_committee():
    df = pandas.DataFrame.from_csv('data/mp-en.csv', header=0, index_col=False)
    df = df.where((pandas.notnull(df)), None)
    MPs = df.to_dict(orient='records')

    for mp in MPs:
        if mp['committee_memberships']:
            committees = [committee.strip()
                          for committee in mp['committee_memberships'].split(',')]
            person_id = utils.hluttaw_to_popitid(mp['identifier__hluttaw'], base_url)
            on_behalf_of_id = utils.org_name_to_popitid(mp['group'], base_url)

            for org in committees:
                payload = {}
                payload['person_id'] = person_id
                payload['organization_id'] = utils.org_name_to_popitid(org, base_url)
                payload['on_behalf_of_id'] = on_behalf_of_id
                payload['role'] = 'Committee Member'
                payload['start_date'] = mp['start_date']

                url = base_url + '/en/memberships'
                r = requests.post(url, headers=headers, json=payload)
                print r.content

def update_my():
    lang = 'my'
    df = pandas.DataFrame.from_csv('data/mp-my.csv', header=1, index_col=False)
    df = df.where((pandas.notnull(df)), None)
    MPs = df.to_dict(orient='records')

    for mp in MPs:
        hluttaw_id = mp['identifier__hluttaw']
        popit_id = utils.hluttaw_to_popitid(hluttaw_id, base_url)
        print hluttaw_id
        print popit_id

        if popit_id:
            url = base_url + "/" + lang + "/persons/" + popit_id

            honorific_prefix = mp['honorific_prefix']
            name = mp['name']
            gender = mp['gender']
            national_identity = mp['national_identity']

            payload = {
                'honorific_prefix': honorific_prefix,
                'name': name,
                'gender': gender,
                'national_identity': national_identity,
            }

            r = requests.put(url, headers=headers, json=payload)
            print r.content

def not_null(x):
    return notnull(x) and str(x).lower() not in NULL_VALUES

def nan_coerce(x):
    v = str(x)
    if pd.notnull(v) is False or v in NAN_LIST:
        return np.nan
    return x

def remove_line_breaks(x):
    x = (str(x) if pd.notnull(x) else '')
    for b in LINE_BREAKS_LIST_RX:
        x = b.sub(" ", x)
    return string_blank_na(x.lstrip().rstrip())

def test_longpanel_series_combo(self):
    wp = tm.makePanel()
    lp = wp.to_frame()

    y = lp.pop('ItemA')
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        model = ols(y=y, x=lp, entity_effects=True, window=20)
    self.assertTrue(notnull(model.beta.values).all())
    tm.assertIsInstance(model, PanelOLS)
    model.summary

def test_count(self):
    f = lambda s: notnull(s).sum()
    self._check_stat_op('count', f, obj=self.panel, has_skipna=False)

def test_transpose_copy(self):
    panel = self.panel.copy()
    result = panel.transpose(2, 0, 1, copy=True)
    expected = panel.swapaxes('items', 'minor')
    expected = expected.swapaxes('major', 'minor')
    assert_panel_equal(result, expected)

    panel.values[0, 1, 1] = np.nan
    self.assertTrue(notnull(result.values[1, 0, 1]))

def test_count(self):
    f = lambda s: notnull(s).sum()
    self._check_stat_op('count', f, obj=self.panel4d, has_skipna=False)

def test_setitem_always_copy(self):
    s = self.frame['A'].copy()
    self.frame['E'] = s

    self.frame['E'][5:10] = nan
    self.assertTrue(notnull(s[5:10]).all())