Python pandas module: notnull() code examples

The following 50 code examples, extracted from open-source Python projects, illustrate how to use pandas.notnull().
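
Before the project examples, here is a minimal usage sketch (written for this page, not taken from any of the projects below): pandas.notnull() returns an element-wise boolean mask that is True wherever a value is present, and that mask is most often used to filter out missing rows, which is the pattern that runs through the examples that follow.

import numpy as np
import pandas as pd

# pd.notnull() is the element-wise complement of pd.isnull():
# True wherever a value is present (i.e. not NaN/None/NaT).
s = pd.Series([1.0, np.nan, 3.0, None])
mask = pd.notnull(s)              # [True, False, True, False]

# Keep only the non-null rows, the most common pattern below.
print(s[mask])

# It also works on DataFrames and on scalars.
df = pd.DataFrame({"a": [1.0, np.nan], "b": ["x", None]})
print(df[pd.notnull(df["a"])])    # rows where column "a" has a value
print(pd.notnull(None))           # False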

Project: ssbio    Author: SBRG    | Project source | File source
def parse_psqs(psqs_results_file):
    """Parse a PSQS result file and returns a Pandas DataFrame of the results

    Args:
        psqs_results_file: Path to psqs results file

    Returns:
        Pandas DataFrame: Summary of PSQS results

    """

    # TODO: generalize column names for all results, save as dict instead

    psqs_results = pd.read_csv(psqs_results_file, sep='\t', header=None)
    psqs_results['pdb_file'] = psqs_results[0].apply(lambda x: str(x).strip('./').strip('.pdb'))
    psqs_results = psqs_results.rename(columns = {1:'psqs_local', 2:'psqs_burial', 3:'psqs_contact', 4:'psqs_total'}).drop(0, axis=1)
    psqs_results['u_pdb'] = psqs_results['pdb_file'].apply(lambda x: x.upper() if len(x)==4 else np.nan)
    psqs_results['i_entry_name'] = psqs_results['pdb_file'].apply(lambda x: x.split('_model1')[0] if len(x)>4 else np.nan)
    psqs_results = psqs_results[pd.notnull(psqs_results.psqs_total)]

    return psqs_results
Project: eemeter    Author: openeemeter    | Project source | File source
def trace_serializer(trace):
    data = OrderedDict([
        ("type", "ARBITRARY_START"),
        ("interpretation", trace.interpretation),
        ("unit", trace.unit),
        ("trace_id", trace.trace_id),
        ("interval", trace.interval),
        ("records", [
            OrderedDict([
                ("start", start.isoformat()),
                ("value", record.value if pd.notnull(record.value) else None),
                ("estimated", bool(record.estimated)),
            ])
            for start, record in trace.data.iterrows()
        ]),
    ])
    return data
Project: fileflow    Author: industrydive    | Project source | File source
def clean_and_write_dataframe_to_csv(data, filename):
    """
    Cleans a dataframe of np.NaNs and saves to file via pandas.to_csv

    :param data: data to write to CSV
    :type data: :class:`pandas.DataFrame`
    :param filename: Path to file to write CSV to. If None, string of data
        will be returned
    :type filename: str | None
    :return: If the filename is None, returns the string of data. Otherwise
        returns None.
    :rtype: str | None
    """
    # cleans np.NaN values
    data = data.where((pd.notnull(data)), None)
    # If filename=None, to_csv will return a string
    result = data.to_csv(path_or_buf=filename, encoding='utf-8', dtype=str, index=False, na_rep=None,
                         skipinitialspace=True, quoting=csv.QUOTE_ALL)
    logging.info("Dataframe of shape %s has been stored." % str(data.shape))

    return result
Project: georges    Author: chernals    | Project source | File source
def element_to_bdsim(e):
    """Convert a pandas.Series representation onto a BDSim sequence element."""
    bdsim = ""
    if e.KEYWORD in ['MARKER', 'INSTRUMENT']:
        bdsim = "{}: {};".format(e.name.replace('$', ''), "marker")
    if e.KEYWORD in ['DRIFT', 'QUADRUPOLE', 'RBEND', 'SBEND']:
        bdsim = "{}: {}, l={}*m".format(e.name.replace('$', ''), e.KEYWORD.lower(), e.L)
        if e.get('BENDING_ANGLE') is not None and not np.isnan(e['BENDING_ANGLE']):
            bdsim += f",angle=-{e['BENDING_ANGLE']}"
        elif e.get('ANGLE') is not None and not np.isnan(e['ANGLE']):
            bdsim += f",angle=-{e.get('ANGLE', 0)}"
        else:
            # Angle property not supported by the element or absent
            bdsim += ""
        #if pd.notnull(e['APERTYPE']):
        #    bdsim += ", aperture={}*m".format(str(e['APERTURE']).strip('[]'))
        if pd.notnull(e.get('PLUG')) and pd.notnull(e.get('CIRCUIT')):
            bdsim += ", {}={{{{ {} or '0.0' }}}}".format(e['PLUG'].lower(), e['CIRCUIT'])
        bdsim += ';'
    return bdsim
Project: georges    Author: chernals    | Project source | File source
def element_to_mad(e):
    """Convert a pandas.Series representation onto a MAD-X sequence element."""
    if e.CLASS not in SUPPORTED_CLASSES:
        return ""
    mad = "{}: {}, ".format(e.name, e.CLASS)
    if e.get('BENDING_ANGLE') is not None and not np.isnan(e['BENDING_ANGLE']):
        mad += f"ANGLE={e['BENDING_ANGLE']},"
    elif e.get('ANGLE') is not None and not np.isnan(e['ANGLE']):
        mad += f"ANGLE={e.get('ANGLE', 0)},"
    else:
        # Angle property not supported by the element or absent
        mad += ""
    mad += ', '.join(["{}={}".format(p, e[p]) for p in SUPPORTED_PROPERTIES if pd.notnull(e.get(p, None))])
    if pd.notnull(e['LENGTH']) and e['LENGTH'] != 0.0:
        mad += ", L={}".format(e['LENGTH'])
    if pd.notnull(e.get('APERTYPE', None)):
        mad += ", APERTURE={}".format(str(e['APERTURE']).strip('[]'))
    if pd.notnull(e.get('PLUG')) and pd.notnull(e.get('CIRCUIT')) and pd.isnull(e.get('VALUE')):
        mad += ", {}:={}".format(e['PLUG'], e['CIRCUIT'])
    if pd.notnull(e.get('PLUG')) and pd.notnull(e.get('VALUE')):
        mad += ", {}={}".format(e['PLUG'], e['VALUE'])
    mad += ", AT={}".format(e['AT_CENTER'])
    mad += ";"
    return mad
Project: gullikson-scripts    Author: kgullikson88    | Project source | File source
def get_sec_spt(row):
    """
    Get the secondary spectral type from the information we have. Meant to be
    called via the `apply` method of a pandas DataFrame.
    """
    if pd.notnull(row['Sp2']):
        return row['Sp2']
    elif pd.notnull(row['Sp1']) and pd.notnull(row['mag1']) and pd.notnull(row['mag2']):
        # TODO: Do better than assuming V band!
        band = 'V'
        absmag_prim = MS.GetAbsoluteMagnitude(row['Sp1'], color=band)
        dm = float(row['mag1']) - absmag_prim
        absmag_sec = float(row['mag2']) - dm
        return MS.GetSpectralType_FromAbsMag(absmag_sec, color=band)[0]
    elif pd.notnull(row['Sp1']) and pd.notnull(row['K1']) and pd.notnull(row['K2']):
        mass = MS.Interpolate('mass', row['Sp1'])
        q = float(row['K1']) / float(row['K2'])
        sec_mass = q * mass
        return MS.GetSpectralType('mass', sec_mass)[0]
    else:
        print(row)
        raise ValueError('Must give enough information to figure out the spectral type!')
Project: zeex    Author: zbarge    | Project source | File source
def series_is_datetime(series: pd.Series, check_num: int=5, dropna: bool=True):
    """
    Checks random rows in a Series, counting how many values coerce to datetime.
    :param series:
    :param check_num:
    :param dropna:
    :return:
    """
    if dropna:
        series = series.dropna(axis=0)
    got, lost = 0, 0
    size = (check_num if series.index.size > check_num else series.index.size)

    if size > 0:
        checks = np.random.randint(0, high=series.index.size, size=size)
        for x in series[checks].tolist():
            try:
                x = pd.Timestamp(x)
                if pd.notnull(x):
                    got += 1
            except (ValueError, OverflowError):
                lost += 1

    return got > lost
Project: NTHU-Machine-Learning    Author: YuChunLOL    | Project source | File source
def to_csv(self, filepath='hypothesis/SGD_hypothesis_header.csv'):
        df = pd.DataFrame()
        df = pd.concat([df, pd.DataFrame([['depth', self.depth]])], ignore_index=True)
        df = pd.concat([df, pd.DataFrame([['sizes'] + [self.input_size+1] \
                                                    + [hidden_size+1 for hidden_size in self.hidden_sizes] \
                                                    + [self.output_size]])], ignore_index=True)
        for i, weight in enumerate(self.best_weights):
            df = pd.concat([df, pd.DataFrame([['W_{}'.format(i)] + weight.T.flatten().tolist()])], ignore_index=True)

        # Fill NaN with None
        df = df.where((pd.notnull(df)), None)

        # Since pd.to_csv converts int to float if there's `None` in the same row,
        # we need to handle this.
        with open(filepath, 'w') as f:
            for row in range(df.shape[0]):
                for col in range(df.shape[1]):
                    if (row == 0 and col != 0) or (row == 1 and col != 0):
                        val = int(df[col][row]) if df[col][row] is not None else ''
                    else:
                        val = df[col][row] if df[col][row] is not None else ''
                    f.writelines('{},'.format(val))
                if row != df.shape[0]-1: f.writelines('\n')
Project: parade    Author: bailaohe    | Project source | File source
def execute_internal(self, context, **kwargs):
        """
        the internal execution process to be implemented
        :param context:
        :param kwargs:
        :return:
        """
        df = pd.read_csv('https://raw.githubusercontent.com/bailaohe/parade/master/assets/movie_metadata.csv')

        # Process projection on the dataset to get our interested attributes
        df = df[['movie_title', 'genres', 'title_year', 'content_rating', 'budget', 'num_voted_users', 'imdb_score']]

        # Filter out records with *NAN* title_year and budget
        df = df[pd.notnull(df['title_year'])]
        df = df[df['budget'] > 0]

        # Extract the genres ROOT
        df['genres_root'] = df['genres'].apply(lambda g: g.split('|')[0])

        return df
Project: MarketMakingProfitability    Author: MiesJansen    | Project source | File source
def Join_Inputs(df, df_betas, df_ff_params, df_liq_prox):
    # add beta values & set index to datetime from df_diff
    df = pd.merge(df, df_betas, left_on='cusip_id', 
                  right_on='cusip_id', left_index=True)

    df['trd_exctn_dt_idx'] = pd.to_datetime(df['trd_exctn_dt'],\
                                        format='%Y%m%d')
    df.set_index('trd_exctn_dt_idx', inplace=True)

    # Join with fama-french factors on date index
    df_join_ff = df.join(df_ff_params, lsuffix="_m", rsuffix='_b')
    # Drop any rows where dates in df_diff do not appear in fama-french
    df_join_ff = df_join_ff[pd.notnull(df_join_ff['MKT_b'])]

    # Combine liquidity factor L_t
    df_liq_prox_values = df_liq_prox['residual_term']
    df_join_liq = df_join_ff.join(df_liq_prox_values)
    df_join_liq = df_join_liq[pd.notnull(df_join_liq['residual_term'])]

    return df_join_liq
Project: craft-ai-client-python    Author: craft-ai    | Project source | File source
def test_decide_from_contexts_df_null_decisions():
  tree = CLIENT.get_decision_tree(AGENT_ID,
                                  COMPLEX_AGENT_DATA.last_valid_index().value // 10 ** 9)

  test_df = pd.DataFrame(
    [
      ["Jean-Pierre", "+02:00"],
      ["Paul"]
    ],
    columns=["b", "tz"],
    index=pd.date_range("20130201", periods=2, freq="D"))

  df = CLIENT.decide_from_contexts_df(tree, test_df)
  assert_equal(len(df), 2)
  assert pd.isnull(df["a_predicted_value"][0])
  assert pd.notnull(df["error"][0])

  assert pd.notnull(df["a_predicted_value"][1])
  assert pd.isnull(df["error"][1])
Project: craft-ai-client-python    Author: craft-ai    | Project source | File source
def add_operations(self, agent_id, operations):
    if isinstance(operations, pd.DataFrame):
      if not isinstance(operations.index, pd.DatetimeIndex):
        raise CraftAiBadRequestError("Invalid dataframe given, it is not time indexed")

      chunk_size = self.config["operationsChunksSize"]

      for chunk in chunker(operations, chunk_size):
        chunk_operations = [
          {
            "timestamp": row.name.value // 10 ** 9, # Timestamp.value returns nanoseconds
            "context": {
              col: row[col] for col in operations.columns if pd.notnull(row[col])
            }
          } for _, row in chunk.iterrows()
        ]
        super(Client, self).add_operations(agent_id, chunk_operations)

      return {
        "message": "Successfully added %i operation(s) to the agent \"%s/%s/%s\" context."
                   % (len(operations), self.config["owner"], self.config["project"], agent_id)
      }
    else:
      return super(Client, self).add_operations(agent_id, operations)
Project: craft-ai-client-python    Author: craft-ai    | Project source | File source
def decide_from_row(tree, columns, row):
  time = Time(
    t=row.name.value // 10 ** 9, # Timestamp.value returns nanoseconds
    timezone=row.name.tz
  )
  context = {
    col: row[col] for col in columns if pd.notnull(row[col])
  }
  try:
    decision = VanillaInterpreter.decide(tree, [context, time])

    keys, values = zip(*[
      (output + "_" + key, value)
      for output, output_decision in decision["output"].items()
      for key, value in output_decision.items()
    ])

    return pd.Series(data=values, index=keys)
  except CraftAiNullDecisionError as e:
    return pd.Series(data=[e.message], index=["error"])
Project: missingno    Author: ResidentMario    | Project source | File source
def _calculate_geographic_nullity(geo_group, x_col, y_col):
    """
    Helper method which calculates the nullity of a DataFrame. Factored out of and used within `geoplot`.
    """
    # Aggregate by point and fetch a list of non-null coordinate pairs, which is returned.
    point_groups = geo_group.groupby([x_col, y_col])
    points = [point for point in point_groups.groups.keys() if pd.notnull(point[0]) and pd.notnull(point[1])]
    # Calculate nullities by location, then take their average within the overall feature.
    counts = np.sum(point_groups.count().values, axis=1)
    entries = point_groups.size()
    width = len(geo_group.columns)
    # Remove empty (NaN, NaN) points.
    if len(entries) > 0:  # explicit check to avoid a Runtime Warning
        geographic_nullity = np.average(1 - counts / width / entries)
        return points, geographic_nullity
    else:
        return points, np.nan
Project: tmtk    Author: thehyve    | Project source | File source
def _get_hd_args(path, high_dim_node, annotation):
    """
    Create dict with meta tags that belong to a certain high dimensional node.
    """
    map_file = high_dim_node.sample_mapping

    s = map_file.slice_path(path).iloc[:, 5].unique()
    t = map_file.slice_path(path).iloc[:, 6].unique()

    hd_args = {'hd_sample': ', '.join(s.astype(str)) if pd.notnull(s[0]) else '',
               'hd_tissue': ', '.join(t.astype(str)) if pd.notnull(t[0]) else '',
               'hd_type': Mappings.annotation_data_types.get(high_dim_node.params.datatype),
               }

    if annotation:
        hd_args.update({'pl_marker_type': annotation.marker_type,
                        'pl_genome_build': annotation.params.get('GENOME_RELEASE', ''),
                        'pl_title': annotation.params.get('TITLE', ''),
                        'pl_id': annotation.platform})
    return hd_args
Project: Uber-DS-Challenge    Author: bjherger    | Project source | File source
def extract_days(input_delta):
    """
    Helper function to extract the number of days from a time delta. Returns:
     - Number of days, if valid time delta
     - np.NaN if time delta is null or invalid
    :param input_delta:
    :return: number of days in time delta
    :rtype: float
    """

    # Attempt to coerce into Pandas time delta
    delta = pd.Timedelta(input_delta)

    # Attempt to extract number of days
    days = np.NaN
    if pd.notnull(delta):
        days = delta.days

    # Return result
    return days
Project: py-investment    Author: kprestel    | Project source | File source
def kama(self, efficiency_ratio_periods=10, ema_fast=2, ema_slow=30,
             period=20, column='adj_close'):
        er = self._efficiency_ratio_computation(
                period=efficiency_ratio_periods, column=column)
        fast_alpha = 2 / (ema_fast + 1)
        slow_alpha = 2 / (ema_slow + 1)
        smoothing_constant = pd.Series(
                (er * (fast_alpha - slow_alpha) + slow_alpha) ** 2,
                name='smoothing_constant')
        sma = pd.Series(self.ohlcv[column].rolling(period).mean(), name='SMA')
        kama = []
        for smooth, ma, price in zip(iter(smoothing_constant.items()),
                                     iter(sma.shift(-1).items()),
                                     iter(self.ohlcv[column].items())):
            try:
                kama.append(kama[-1] + smooth[1] * (price[1] - kama[-1]))
            except:
                if pd.notnull(ma[1]):
                    kama.append(ma[1] + smooth[1] * (price[1] - ma[1]))
                else:
                    kama.append(None)
        sma['KAMA'] = pd.Series(kama, index=sma.index,
                                name='{} days KAMA Ticker {}'.format(period,
                                                                     self.ticker))
        yield sma['KAMA']
Project: finta    Author: peerchemist    | Project source | File source
def KAMA(cls, ohlc, er=10, ema_fast=2, ema_slow=30, period=20):
        """Developed by Perry Kaufman, Kaufman's Adaptive Moving Average (KAMA) is a moving average designed to account for market noise or volatility.
        Its main advantage is that it takes into consideration not just the direction, but the market volatility as well."""

        er = cls.ER(ohlc, er)
        fast_alpha = 2 / (ema_fast + 1)
        slow_alpha = 2 / (ema_slow + 1)
        sc = pd.Series((er * (fast_alpha - slow_alpha) + slow_alpha)**2, name="smoothing_constant") ## smoothing constant

        sma = pd.Series(ohlc["close"].rolling(period).mean(), name="SMA") ## first KAMA is SMA
        kama = []
        # Current KAMA = Prior KAMA + smoothing_constant * (Price - Prior KAMA)
        for s, ma, price in zip(sc.iteritems(), sma.shift().iteritems(), ohlc["close"].iteritems()):
            try:
                kama.append(kama[-1] + s[1] * (price[1] - kama[-1]))
            except:
                if pd.notnull(ma[1]):
                    kama.append(ma[1] + s[1] * (price[1] - ma[1]))
                else:
                    kama.append(None)

        sma["KAMA"] = pd.Series(kama, index=sma.index, name="{0} period KAMA.".format(period)) ## apply the kama list to existing index
        return sma["KAMA"]
Project: WellApplication    Author: inkenbrandt    | Project source | File source
def markGaps(self):
        """Produces dictionary of list of gaps in time series data based on the presence of nan values;
        used for gantt plotting

        :returns: dateranges; a dictionary with station names as keys and lists of begin and end dates as values
        """
        df = self.data
        stations = self.stations

        dateranges = {}
        for station in stations:
            dateranges[station] = []
            first = df.ix[:, station].first_valid_index()
            last = df.ix[:, station].last_valid_index()
            records = df.ix[first:last, station]
            #dateranges[station].append(pd.to_datetime(first))
            for i in range(len(records) - 1):
                if pd.isnull(records[i + 1]) and pd.notnull(records[i]):
                    dateranges[station].append(pd.to_datetime(records.index)[i])
                elif pd.isnull(records[i]) and pd.notnull(records[i + 1]):
                    dateranges[station].append(pd.to_datetime(records.index)[i])
            dateranges[station].append(pd.to_datetime(last))
        return dateranges
Project: zipline-chinese    Author: zhanghan1990    | Project source | File source
def update_last_known_values(self):
        """
        Store the non-NaN values from our oldest frame in each frequency.
        """
        ffillable = self.ffillable_fields
        if not len(ffillable):
            return

        for frequency in self.unique_frequencies:
            digest_panel = self.digest_panels.get(frequency, None)
            if digest_panel:
                oldest_known_values = digest_panel.oldest_frame(raw=True)
            else:
                oldest_known_values = self.buffer_panel.oldest_frame(raw=True)

            oldest_vals = oldest_known_values
            oldest_columns = self.fields
            for field in ffillable:
                f_idx = oldest_columns.get_loc(field)
                field_vals = oldest_vals[f_idx]
                # isnan would be fast, possible to use?
                non_nan_sids = np.where(pd.notnull(field_vals))
                key = (frequency.freq_str, field)
                key_loc = self.last_known_prior_values.index.get_loc(key)
                self.last_known_prior_values.values[
                    key_loc, non_nan_sids
                ] = field_vals[non_nan_sids]
Project: ssbio    Author: SBRG    | Project source | File source
def uniprot_reviewed_checker(uniprot_id):
    """Check if a single UniProt ID is reviewed or not.

    Args:
        uniprot_id:

    Returns:
        bool: If the entry is reviewed

    """

    query_string = 'id:' + uniprot_id

    uni_rev_raw = StringIO(bsup.search(query_string, columns='id,reviewed', frmt='tab'))
    uni_rev_df = pd.read_table(uni_rev_raw, sep='\t', index_col=0)
    uni_rev_df = uni_rev_df.fillna(False)
    uni_rev_df = uni_rev_df[pd.notnull(uni_rev_df.Status)]

    uni_rev_df = uni_rev_df.replace(to_replace="reviewed", value=True)
    uni_rev_df = uni_rev_df.replace(to_replace="unreviewed", value=False)
    uni_rev_dict_adder = uni_rev_df.to_dict()['Status']

    return uni_rev_dict_adder[uniprot_id]
Project: berlin-devfest-2016-backend    Author: giansegato    | Project source | File source
def processData(data):
    df = pd.DataFrame.transpose(pd.read_json(json.dumps(data)))
    df = df.dropna(subset = [key for key in df.keys() if "x_" in key])
    df = df[pd.notnull(df['y_observed'])]

    X = df[[key for key in df.keys() if "x_" in key]].values
    y = df["y_observed"].values

    return X, y

# 5th: initial model
Project: Kaggle    Author: lawlite19    | Project source | File source
def pre_processData(train_data,file_path):
    train_data.loc[(train_data.Age.isnull()), 'Age' ] = np.mean(train_data.Age)  # fill missing Age values with the mean age
    train_data.loc[(train_data.Cabin.notnull(),'Cabin')] = 'yes' # set Cabin to 'yes' where it has a value
    train_data.loc[(train_data.Cabin.isnull(),'Cabin')] = 'no'
    '''0/1 dummy (one-hot) encoding'''
    dummies_cabin = pd.get_dummies(train_data['Cabin'],prefix='Cabin')  # get_dummies creates 0/1 indicator columns, prefixed here with 'Cabin'
    dummies_Embarked = pd.get_dummies(train_data['Embarked'], prefix='Embarked')
    dummies_Sex = pd.get_dummies(train_data['Sex'], prefix='Sex')
    dummies_Pclass = pd.get_dummies(train_data['Pclass'],prefix='Pclass')
    train_data = pd.concat([train_data,dummies_cabin,dummies_Embarked,dummies_Pclass,dummies_Sex], axis=1)  # concatenate the dataframes column-wise (axis=1)
    train_data.drop(['Pclass','Name','Sex','Embarked','Cabin','Ticket'],axis=1,inplace=True)   # drop the original columns that have been encoded
    header_string = ','.join(train_data.columns.tolist())  # join the column names into a comma-separated header string
    np.savetxt(file_path+r'/pre_processData1.csv', train_data, delimiter=',',header=header_string)  # save the pre-processed data to CSV
    '''Standardize the numeric features (Age and Fare)'''
    scaler = StandardScaler()
    age_scaler = scaler.fit(train_data['Age'])
    train_data['Age'] = age_scaler.fit_transform(train_data['Age'])
    if np.sum(train_data.Fare.isnull()):  # if Fare has missing values, fill them with the mean fare
        train_data.loc[(train_data.Fare.isnull(),'Fare')]=np.mean(train_data.Fare)
    fare_scaler = scaler.fit(train_data['Fare'])
    train_data['Fare'] = fare_scaler.transform(train_data['Fare'])
    header_string = ','.join(train_data.columns.tolist())  # join the column names into a comma-separated header string
    np.savetxt(file_path+r'/pre_processData_scaled.csv', train_data, delimiter=',',header=header_string)  # save the scaled data to CSV
    return train_data






## feature engineering
Project: py_stringsimjoin    Author: anhaidgroup    | Project source | File source
def generate_tokens(table, key_attr, join_attr, tokenizer):
    table_nonnull = table[pd.notnull(table[join_attr])]
    return dict(zip(table_nonnull[key_attr],
                    table_nonnull[join_attr].apply(tokenizer.tokenize)))
Project: tensorflow    Author: KirovVerst    | Project source | File source
def preprocess_data(path, is_test=False):
    data = pd.read_csv(path, index_col='PassengerId')
    data.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
    if is_test:
        data = data.replace([None], [0])
    else:
        data = data[pd.notnull(data['Age'])]
        data = data[pd.notnull(data['Embarked'])]
    data.replace(["female", "male"], [0, 1], inplace=True)
    data.replace(["Q", "C", "S"], [0, 1, 2], inplace=True)
    if "Survived" in data:
        data = data[pd.notnull(data['Survived'])]
    data_norm = (data - data.mean()) / (data.max() - data.min())
    return data_norm
Project: coquery    Author: gkunter    | Project source | File source
def plot_facet(self, data, color, **kwargs):
        x = kwargs.get("x")
        y = kwargs.get("y")
        levels_x = kwargs.get("levels_x")
        levels_y = kwargs.get("levels_y")

        #num = []
        #date = []
        #time = data[self._time_column]
        #num = data[self._time_column].apply(self.convert_to_datetime)
        #date = data[self._time_column].apply(self.convert_to_timeseries)
        #if pd.isnull(num).sum() <= pd.isnull(date).sum():
            #data[self._time_column] = num
        #else:
            #data[self._time_column] = date

        #data.dropna(inplace=True)
        #if len(self._groupby) == 2:
            #ct = pd.crosstab(data[self._time_column], data[self._groupby[0]])
            #ct = ct.reindex_axis(self._levels[0], axis=1).fillna(0)
            #ct = ct[pd.notnull(ct.index)]
        #else:
            #ct = pd.crosstab(
                #data[self._time_column],
                #pd.Series([""] * len(self._table[self._time_column]), name=""))

        ## Line plot:
        #self.vmax = max(self.vmax, ct.values.max())
        #ct.plot(ax=plt.gca(), color=self.get_palette())
Project: coquery    Author: gkunter    | Project source | File source
def plot_facet(self, data, color, **kwargs):
        x = kwargs.get("x")
        y = kwargs.get("y")
        levels_x = kwargs.get("levels_x")
        levels_y = kwargs.get("levels_y")

        #num = []
        #date = []
        #time = data[self._time_column]
        #num = data[self._time_column].apply(self.convert_to_datetime)
        #date = data[self._time_column].apply(self.convert_to_timeseries)
        #if pd.isnull(num).sum() <= pd.isnull(date).sum():
            #data[self._time_column] = num
        #else:
            #data[self._time_column] = date

        #data.dropna(inplace=True)
        #if len(self._groupby) == 2:
            #ct = pd.crosstab(data[self._time_column], data[self._groupby[0]])
            #ct = ct.reindex_axis(self._levels[0], axis=1).fillna(0)
            #ct = ct[pd.notnull(ct.index)]
        #else:
            #ct = pd.crosstab(
                #data[self._time_column],
                #pd.Series([""] * len(self._table[self._time_column]), name=""))

        ## Stacked area plot:
        #if len(self._groupby) == 2:
            #self.vmax = max(self.vmax, ct.apply(sum, axis=1).max())
        #ct.plot(ax=plt.gca(), kind="area", stacked=True, color=self.get_palette(), **kwargs)
Project: coquery    Author: gkunter    | Project source | File source
def plot_facet(self, data, color, **kwargs):
        x = kwargs.get("x")
        y = kwargs.get("y")
        levels_x = kwargs.get("levels_x")
        levels_y = kwargs.get("levels_y")

        #num = []
        #date = []
        #time = data[self._time_column]
        #num = data[self._time_column].apply(self.convert_to_datetime)
        #date = data[self._time_column].apply(self.convert_to_timeseries)
        #if pd.isnull(num).sum() <= pd.isnull(date).sum():
            #data[self._time_column] = num
        #else:
            #data[self._time_column] = date

        #data.dropna(inplace=True)
        #if len(self._groupby) == 2:
            #ct = pd.crosstab(data[self._time_column], data[self._groupby[0]])
            #ct = ct.reindex_axis(self._levels[0], axis=1).fillna(0)
            #ct = ct[pd.notnull(ct.index)]
        #else:
            #ct = pd.crosstab(
                #data[self._time_column],
                #pd.Series([""] * len(self._table[self._time_column]), name=""))

        ## percentage area plot:
        ## if there is only one grouping variable (the time column),
        ## the cross table produces a Series, not a data frame. It
        ## isn't really very informative to plot it, but we provide
        ## for this special case anyway_
        #if type(ct) == pd.Series:
            #ct = ct.apply(lambda x: 100)
        #else:
            #ct = ct.apply(lambda x: (100 * x) / sum(x), axis=1)
        #ct.plot(kind="area", ax=plt.gca(), stacked=True, color=self.get_palette(), **kwargs)
Project: eemeter    Author: openeemeter    | Project source | File source
def _save_series(self, series):
        data = [
            [
                d.strftime(self.cache_date_format), t
                if pd.notnull(t) else None
            ]
            for d, t in series.iteritems()
        ]
        self.json_store.save_json(self._get_cache_key(), data)
Project: eemeter    Author: openeemeter    | Project source | File source
def save_series(self, year, series):
        key = self._get_cache_key(year)
        data = [
            [
                d.strftime(self.cache_date_format), t
                if pd.notnull(t) else None
            ]
            for d, t in series.iteritems()
        ]
        self.json_store.save_json(key, data)
Project: eemeter    Author: openeemeter    | Project source | File source
def yield_records(self, sorted_records):

        n = len(sorted_records)
        for i, record in enumerate(sorted_records):

            self.validate_record(record)

            start = record["start"]
            value = record["value"]
            estimated = record.get("estimated", False)

            if i < n - 1:  # all except last record
                yield (start, value, estimated)
            else:  # last record
                end = record.get("end", None)
                if end is None:
                    # can't use the value of this record, no end date
                    yield (start, np.nan, False)
                else:

                    self._validate_record_start_end(record, start, end)

                    # provide an end date cap
                    if pd.notnull(value):
                        yield (start, value, estimated)
                        yield (end, np.nan, False)
                    else:
                        yield (start, np.nan, False)
Project: eemeter    Author: openeemeter    | Project source | File source
def serialize_input(self, input_data):
        ''' Serialize input data
        '''
        return OrderedDict([
            (start.isoformat(), OrderedDict([
                ("energy", row.energy if pd.notnull(row.energy) else None),
                ("tempF", row.tempF if pd.notnull(row.tempF) else None),
            ]))
            for start, row in input_data.iterrows()
        ])
Project: fileflow    Author: industrydive    | Project source | File source
def read_and_clean_csv_to_dataframe(filename_or_stream, encoding='utf-8'):
    """
    Reads a utf-8 encoded CSV directly into a pandas dataframe as string values and scrubs np.NaN values to Python None

    :param str filename_or_stream: path to CSV
    :return:
    """
    # pulls data in as utf-8, all as strings, and without leading whitespace padding
    try:
        data = pd.read_csv(
            filepath_or_buffer=filename_or_stream,
            encoding=encoding,
            dtype=str,
            skipinitialspace=True
        )
    except AttributeError:
        # this is an empty dataframe and pandas crashed because it can't coerce the columns to strings
        # issue and PR to fix is open on pandas core at https://github.com/pydata/pandas/issues/12048
        # slated for 1.8 release
        # so for now just try loading the dataframe without specifying dtype
        data = pd.read_csv(
            filepath_or_buffer=filename_or_stream,
            encoding=encoding,
            skipinitialspace=True
        )
    logging.info('File read via the pandas read_csv methodology.')

    # coerces pandas nulls (of np.NaN type) into python None
    data = data.where((pd.notnull(data)), None)

    # coerces string representations of Python None to a real Python None
    data[data == 'None'] = None
    data[data == ''] = None
    logging.info("Dataframe of shape %s has been retrieved." % str(data.shape))

    return data
Project: catalyst    Author: enigmampc    | Project source | File source
def __init__(self,
                 estimates,
                 name_map):
        validate_column_specs(
            estimates,
            name_map
        )

        self.estimates = estimates[
            estimates[EVENT_DATE_FIELD_NAME].notnull() &
            estimates[FISCAL_QUARTER_FIELD_NAME].notnull() &
            estimates[FISCAL_YEAR_FIELD_NAME].notnull()
        ]
        self.estimates[NORMALIZED_QUARTERS] = normalize_quarters(
            self.estimates[FISCAL_YEAR_FIELD_NAME],
            self.estimates[FISCAL_QUARTER_FIELD_NAME],
        )

        self.array_overwrites_dict = {
            datetime64ns_dtype: Datetime641DArrayOverwrite,
            float64_dtype: Float641DArrayOverwrite,
        }
        self.scalar_overwrites_dict = {
            datetime64ns_dtype: Datetime64Overwrite,
            float64_dtype: Float64Overwrite,
        }

        self.name_map = name_map
        self._columns = set(name_map.keys())
Project: betterself    Author: jeffshek    | Project source | File source
def update_dataframe_to_be_none_instead_of_nan_for_api_responses(df):
    df = df.where((pd.notnull(df)), None)
    return df
Project: betterself    Author: jeffshek    | Project source | File source
def get_sorted_response(series):
    if series.dropna().empty:
        return NO_DATA_RESPONSE

    # Do an odd sorted tuple response because Javascript sorting is an oddly difficult problem
    # sorted_response = [item for item in series.iteritems()]
    sorted_response = []
    for index, value in series.iteritems():
        if not pd.notnull(value):
            value = None

        data_point = (index, value)
        sorted_response.append(data_point)

    return Response(sorted_response)
Project: FreeDiscovery    Author: FreeDiscovery    | Project source | File source
def test_api_categorization_sort(app, sort_by):
    n_categories = 2
    dsid, lsi_id, _, ds_input = get_features_lsi_cached(app, n_categories=n_categories)
    method = V01 + "/feature-extraction/{}".format(dsid)
    data = app.get_check(method)

    training_set = ds_input['training_set']

    pars = {
          'parent_id': lsi_id,
          'data': training_set,
          'method': 'NearestNeighbor'}

    method = V01 + "/categorization/"
    data = app.post_check(method, json=pars)
    mid = data['id']

    method = V01 + "/categorization/{}/predict".format(mid)

    data = app.get_check(method, json={'batch_id': -1, "sort_by": sort_by})

    res = []
    for row in data['data']:
        res_el = {'document_id': row['document_id']}
        for scores in row['scores']:
            res_el[scores['category']] = scores['score']
        res.append(res_el)

    df = pd.DataFrame(res)
    df = df.set_index('document_id')

    if sort_by in df.columns:
        mask = pd.notnull(df[sort_by])
        assert_array_equal(df[mask].index.values,
                           df[mask].sort_values(sort_by, ascending=False).index.values)
Project: pyprocessmacro    Author: QuentinAndre    | Project source | File source
def _prepare_data(self):
        """
        Subset the dataframe to the columns needed for estimation purposes, and add a constant.
        :return: pd.DataFrame
        """
        # Subset the data to the columns used in the model
        data = self.data[self.varlist].copy()
        data = data[pd.notnull(data)].reset_index(drop=True)

        # Mapping each variable name to a unique variable code, and renaming the columns in the data.
        data.rename(columns=self._var_to_symb, inplace=True)

        # Adding a constant to the data.
        data["Cons"] = 1

        if self.options["logit"]:
            endog = data["y"]
            uniques = np.unique(endog)
            if len(uniques) != 2:
                raise ValueError(
                    "The dependent variable does not have exactly two distinct outcomes."
                    "Please provide another dataset or change the 'logit' option to 0")
            else:
                endog_logit = [0 if i == uniques[0] else 1 for i in endog]
            data["y"] = endog_logit
        return data
Project: stockstats    Author: jealous    | Project source | File source
def remove_random_nan(pd_obj):
        return pd_obj.where((pd.notnull(pd_obj)), None)
Project: gullikson-scripts    Author: kgullikson88    | Project source | File source
def split_by_component(df):
    df['prim_comp'] = df.Comp.map(lambda s: s[0])
    df['sec_comp'] = df.Comp.map(lambda s: s[-1])
    comps = pd.concat((df[['prim_comp', 'Sp1']], df[['sec_comp', 'Sp2']]))
    prim = comps.loc[comps.prim_comp.notnull()].rename(columns={'Sp1': 'SpT', 'prim_comp': 'comp'})
    sec = comps.loc[comps.sec_comp.notnull()].rename(columns={'Sp2': 'SpT', 'sec_comp': 'comp'})
    return pd.concat((prim, sec))[['comp', 'SpT']].drop_duplicates(subset='comp')
Project: popit-scripts    Author: open-hluttaw    | Project source | File source
def add_committee():

    df = pandas.DataFrame.from_csv('data/mp-en.csv', header=0, index_col=False)
    df = df.where((pandas.notnull(df)), None)
    MPs = df.to_dict(orient='records')

    for mp in MPs:
        if mp['committee_memberships']:
            committees = [committee.strip() for committee in mp['committee_memberships'].split(',')]

            person_id = utils.hluttaw_to_popitid(mp['identifier__hluttaw'],
                                            base_url) 
            on_behalf_of_id = utils.org_name_to_popitid(mp['group'],base_url)


            for org in committees:
                payload = {}

                payload['person_id'] = person_id
                payload['organization_id'] = utils.org_name_to_popitid(org,base_url)
                payload['on_behalf_of_id'] = on_behalf_of_id
                payload['role'] = 'Committee Member'
                payload['start_date'] = mp['start_date']

                url = base_url + '/en/memberships'
                r = requests.post(url,headers=headers,json=payload)
                print r.content
Project: popit-scripts    Author: open-hluttaw    | Project source | File source
def update_my():

    lang = 'my'

    df = pandas.DataFrame.from_csv('data/mp-my.csv', header=1, index_col=False)
    df = df.where((pandas.notnull(df)), None)

    MPs = df.to_dict(orient='records')

    for mp in MPs:
        hluttaw_id = mp['identifier__hluttaw']

        popit_id = utils.hluttaw_to_popitid(hluttaw_id, base_url)

        print hluttaw_id
        print popit_id

        if popit_id:
            url = base_url + "/" + lang + "/persons/" + popit_id

            honorific_prefix = mp['honorific_prefix']
            name = mp['name']
            gender = mp['gender']
            national_identity = mp['national_identity']

            payload = { 
                        'honorific_prefix': honorific_prefix,
                        'name': name,
                        'gender': gender,
                        'national_identity': national_identity,
                        }

            r = requests.put(url, headers=headers, json=payload)
            print r.content
Project: zeex    Author: zbarge    | Project source | File source
def not_null(x):
    return notnull(x) and str(x).lower() not in NULL_VALUES
Project: zeex    Author: zbarge    | Project source | File source
def nan_coerce(x):
    v = str(x)
    if pd.notnull(v) is False or v in NAN_LIST:
        return np.nan
    return x
Project: zeex    Author: zbarge    | Project source | File source
def remove_line_breaks(x):
    x = (str(x) if pd.notnull(x) else '')
    for b in LINE_BREAKS_LIST_RX:
        x = b.sub(" ", x)
    return string_blank_na(x.lstrip().rstrip())
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_longpanel_series_combo(self):
        wp = tm.makePanel()
        lp = wp.to_frame()

        y = lp.pop('ItemA')
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            model = ols(y=y, x=lp, entity_effects=True, window=20)
        self.assertTrue(notnull(model.beta.values).all())
        tm.assertIsInstance(model, PanelOLS)
        model.summary
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_count(self):
        f = lambda s: notnull(s).sum()
        self._check_stat_op('count', f, obj=self.panel, has_skipna=False)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_transpose_copy(self):
        panel = self.panel.copy()
        result = panel.transpose(2, 0, 1, copy=True)
        expected = panel.swapaxes('items', 'minor')
        expected = expected.swapaxes('major', 'minor')
        assert_panel_equal(result, expected)

        panel.values[0, 1, 1] = np.nan
        self.assertTrue(notnull(result.values[1, 0, 1]))
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_count(self):
        f = lambda s: notnull(s).sum()
        self._check_stat_op('count', f, obj=self.panel4d, has_skipna=False)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia    | Project source | File source
def test_setitem_always_copy(self):
        s = self.frame['A'].copy()
        self.frame['E'] = s

        self.frame['E'][5:10] = nan
        self.assertTrue(notnull(s[5:10]).all())