def calc_tvd(label_dict, attr):
    """Return the total variation distance between real and sampled labels.

    The joint distribution of the label columns in ``attr`` (the real data)
    is compared with the joint distribution of the rounded samples in
    ``label_dict``.

    Args:
        label_dict: Mapping ``label name -> array of samples``. Each array is
            flattened and rounded before comparison.
        attr: pandas DataFrame of real labels, expected to hold values in
            {0, 1}, with (at least) one column per key of ``label_dict``.
            For example: ``names = zip(*self.graph)[0]`` followed by
            ``calc_tvd(label_dict, attr[names])``.

    Returns:
        ``0.5 * sum(|p_real - p_sampled|)`` over the label combinations that
        occur in the real data.

    Raises:
        ValueError: If ``attr`` contains a negative value.

    Note:
        Sampled combinations that never occur in ``attr`` get no ID, are
        dropped by ``value_counts`` and the sampled distribution is
        renormalised over the remaining rows.
    """
    if np.min(attr.values) < 0:
        raise ValueError(
            'calc_tvd received attr that may not have been in {0,1}')

    # A real list is required here: DataFrame.merge(on=...) treats any object
    # that is not a list/tuple as a single key, so a dict view would break.
    label_names = list(label_dict)
    attr = attr[label_names]

    # Give every distinct label combination in the real data an integer ID.
    combos = attr.drop_duplicates().reset_index(drop=True).reset_index()
    combos = combos.rename(columns={'index': 'ID'})

    real_ids = pd.merge(attr, combos)
    real_pdf = real_ids['ID'].value_counts() / len(attr)

    rounded = {name: np.round(np.asarray(samples).ravel())
               for name, samples in label_dict.items()}
    sampled = pd.DataFrame.from_dict(rounded)
    sampled_ids = pd.merge(sampled, combos, on=label_names, how='left')
    sampled_counts = sampled_ids['ID'].value_counts()
    sampled_pdf = sampled_counts / sampled_counts.sum()

    # IDs missing on either side count as probability 0.
    diff = real_pdf.subtract(sampled_pdf, fill_value=0)
    return 0.5 * diff.abs().sum()
def global_stats(articles: pd.DataFrame):
    """Print global statistics about the article database.

    Reads the ``base_url``, ``word_count``, ``authors``, ``title`` and
    ``text`` columns of ``articles``. A field counts as "missing" when it is
    exactly the empty string.

    Args:
        articles: DataFrame with one row per article.
    """
    print(f'Number of articles: {len(articles):,}')

    # nunique() counts the distinct non-null values, which is what the length
    # of a value_counts() result gives, without the deprecated top-level call.
    num_sources = articles['base_url'].nunique()
    print(f'Number of news sources: {num_sources}')

    mean_wc = articles['word_count'].mean()
    print(f'Global mean word count: {mean_wc:.1f}')

    missing_authors = (articles['authors'] == '').sum()
    print(f'Missing authors: {missing_authors:,}')

    missing_titles = (articles['title'] == '').sum()
    print(f'Missing titles: {missing_titles}')

    missing_texts = (articles['text'] == '').sum()
    print(f'Missing texts: {missing_texts:,}')
def return_mean(datafile, mapping, flag_columns=None):
    """Aggregate the rows of ``datafile`` whose first column is in ``mapping``.

    Every column after the first is converted to float and averaged. When
    ``flag_columns`` is given and more than one region is mapped, the flag
    columns are replaced by their most frequent value instead of their mean.

    Args:
        datafile: DataFrame whose first column holds the region identifier.
        mapping: Collection of identifiers to select (must support ``len``).
        flag_columns: Optional array-like of column labels exposing
            ``.any()`` (e.g. a numpy array or pandas Index). ``None`` means
            there are no flag columns.

    Returns:
        A Series indexed by the value column names.
    """
    # Compute the row selection once and reuse it for means and flags.
    selected = datafile.iloc[:, 0].isin(mapping)
    mapped_regions = datafile[selected]
    mean_values = mapped_regions.iloc[:, 1:].astype(float).mean()

    # The default None used to crash on ``None.any()``.
    has_flags = flag_columns is not None and flag_columns.any()
    if has_flags and len(mapping) > 1:
        # value_counts() sorts by frequency, so index[0] is the most common.
        mean_values[flag_columns] = mapped_regions[flag_columns].apply(
            lambda column: column.value_counts().index[0])
    return mean_values
def normalizedIntradayCountStats(intradayStats, limitCount=5):
    """Return, per column, the share of records taking each distinct value.

    Args:
        intradayStats: DataFrame with one column per minute and one row per
            day (per the original comments); missing records are NaN.
        limitCount: Minimum number of non-null records a column needs.
            Columns with fewer records come back entirely NaN.

    Returns:
        DataFrame indexed by the distinct values, with the same columns as
        ``intradayStats``; each cell is ``count(value) / non-null records``.
    """
    # For each minute, number of days for which we have a valid record.
    notNullCount = intradayStats.count()
    # Ignore minutes with too few records. ``where`` yields NaN (and a float
    # Series) without assigning None into an integer Series.
    notNullCount = notNullCount.where(notNullCount >= limitCount)
    # Count how many times each value appears for each minute.
    valueCount = intradayStats.apply(lambda minute: minute.value_counts())
    # Normalise each minute by its record count.
    return valueCount.div(notNullCount, axis=1)
def classify_user():
    """Cluster the scaled user data with DBSCAN and return a label per user.

    The frame returned by ``get_scaled_user()`` is transposed before fitting,
    so each of its columns is clustered as one sample.

    Returns:
        A Series indexed by the columns of the scaled frame holding the
        cluster label of each column, with label -1 rewritten to 2.
        NOTE(review): -1 is presumably DBSCAN's "noise" label; confirm
        against the clustering library in use.
    """
    scaled = get_scaled_user()
    clustering = DBSCAN(eps=90, min_samples=50, metric='manhattan').fit(scaled.T)
    # The former bare ``pd.value_counts(labels)`` statement discarded its
    # result, so it has been removed.
    types = pd.DataFrame(clustering.labels_, index=scaled.columns)[0]
    types[types == -1] = 2
    return types
def word_count(string):
    """Return a ``{word: occurrences}`` dict for ``string``.

    Words are obtained with ``str.split()`` (any run of whitespace separates
    them). An empty or all-whitespace string yields an empty dict.
    """
    words = string.split()
    # Build the Series explicitly: handing a plain list to the top-level
    # pd.value_counts is deprecated. dtype=object keeps the keys as plain str
    # and avoids dtype inference on an empty list.
    return pd.Series(words, dtype=object).value_counts().to_dict()
def get_entity_features(self):
    """Build a per-entity feature table from ``self.df``.

    The result has one row per value of the ``self.entity`` column and
    combines:

    * ``<entity>_count``: how often the entity occurs in ``self.df``;
    * ``unique_cities`` / ``unique_states``: number of distinct non-null
      ``city_wikidata_id`` / ``state_wikidata_id`` values per entity
      (0 when there are none);
    * ``rate_count``, ``rate_mean``, ``rate_std``, ``rate_median``: taken
      from ``self.calculate_entity_rate_features``.

    Returns:
        The outer merge of the count/location frame with the rate frame.
    """
    # First we will calculate the rates, so drop all rows with a NaN rate.
    rate_df = self.df.dropna(subset=['rate'])

    # Calculate the rates by hour and delete the old rate column.
    rate_df = (
        rate_df
        .merge(mean_hourly_rate_df(rate_df), left_on=['_id'], right_on=['_id'])
        .drop('rate', axis=1)
        .drop_duplicates()
    )

    # Now get the stats we want for rate.
    rate_df = self.calculate_entity_rate_features(rate_df)

    # Get a count of the entities. Build the frame explicitly instead of
    # renaming the output of value_counts().reset_index(), whose column names
    # differ between pandas versions.
    entity_counts = self.df[self.entity].value_counts()
    df = pd.DataFrame({
        self.entity: entity_counts.index,
        self.entity + '_count': entity_counts.values,
    })

    # Get counts of unique locations.
    location_columns = [('city_wikidata_id', 'unique_cities'),
                        ('state_wikidata_id', 'unique_states')]
    for loc_col, unique_loc_col in location_columns:
        unique_loc_df = (
            self.df.loc[:, [self.entity, loc_col]]
            .dropna()
            .drop_duplicates()
            .groupby(self.entity)
            .count()
            .reset_index()
            .rename(columns={loc_col: unique_loc_col})
        )
        df = df.merge(unique_loc_df,
                      how='left',
                      left_on=self.entity,
                      right_on=self.entity)
        # Assign the column itself: ``df.loc[:, col] = ...`` writes into the
        # existing float column and would silently drop the int cast.
        df[unique_loc_col] = df[unique_loc_col].fillna(0).astype(int)

    # Reset the index on our rate dataframe and rename the columns.
    rate_df = rate_df.reset_index(level=0)
    rate_df.columns = [self.entity, 'rate_count', 'rate_mean', 'rate_std', 'rate_median']

    # Lastly merge the two dataframes.
    return df.merge(rate_df, how='outer')

    # Save this code as we may use it later:
    # df['incall_count'] = df['index'].apply(lambda x: self.get_incall_count(x))
    # df['outcall_count'] = df['index'].apply(lambda x: self.get_outcall_count(x))
def test_plottingOnIntradayStats(self):
    """Smoke-test the sleep-value heatmap on intraday stats of two fixtures.

    Loads two CSV fixtures from the ``unittest`` resource folder, generates
    intraday stats from them, counts the occurrences of each value per
    column, and passes the counts to ``mplot.plotSleepValueHeatmap``.
    """
    import os

    # os.path.join keeps the fixture paths valid on every platform instead of
    # hard-coding Windows separators.
    fixture_dir = os.path.join(RESOURCE_PATH, "unittest")
    data1 = utils.loadIntradayData(os.path.join(fixture_dir, "test_sleep_basic01.csv"))
    data2 = utils.loadIntradayData(os.path.join(fixture_dir, "test_sleep_basic02.csv"))

    stats = sleepStats.generateStatsFrom([data1, data2],
                                         sleepStats.STATS_NAME_INTRADAY)

    data = stats.apply(lambda column: column.value_counts())
    mplot.plotSleepValueHeatmap(data, sleepValue=1)
def describe_data(data, info=False, describe=False, value_counts=None, unique=None,
                  univariate_feature_selection=None, description=None):
    """Print a configurable set of diagnostics for a DataFrame.

    Args:
        data: DataFrame to inspect.
        info: When true, print ``data.info()``.
        describe: When true, print ``data.describe()``.
        value_counts: Optional iterable of column names whose value counts
            are printed.
        unique: Optional iterable of column names whose unique values are
            printed.
        univariate_feature_selection: Optional ``(predictors, target)`` pair;
            when given, ``SelectKBest(f_classif, k="all")`` is fitted and the
            features are printed in ascending order of ``-log10(p-value)``
            (non-finite/NaN scores sort as 0).
        description: Optional header line printed first.
    """
    # Data diagnostics
    if description is not None:
        print("\n" + description)

    # Info
    if info:
        print("\nInfo:")
        # info() prints its report itself and returns None, so wrapping it in
        # print() would emit a stray "None" line.
        data.info()

    # Description
    if describe:
        print("\nDescribe:")
        print(data.describe())

    # Value counts
    if value_counts is not None:
        for feature in value_counts:
            print(f"\nValue Counts [{feature}]")
            print(data[feature].value_counts())

    # Unique values
    if unique is not None:
        for feature in unique:
            print(f"\nUnique [{feature}]")
            print(data[feature].unique())

    # Univariate feature selection
    if univariate_feature_selection is not None:
        # Extract predictors and target
        predictors = univariate_feature_selection[0]
        target = univariate_feature_selection[1]

        # Perform feature selection
        selector = SelectKBest(f_classif, k="all")
        selector.fit(data[predictors], data[target])

        # Get the raw p-values for each feature, and transform them into scores
        scores = -np.log10(selector.pvalues_)
        print("\nUnivariate Feature Selection:")
        ranked = sorted(zip(predictors, scores),
                        key=lambda pair: pair[1] if pd.notnull(pair[1]) else 0)
        for feature, imp in ranked:
            print(feature, imp)
def pre_process(self, drop=True, title_to_onehot=True, norm_fare=True):
    """Load the passenger CSV at ``self.file_name`` and engineer features.

    Steps, in order:

    1. Read the CSV, encoding ``male``/``female`` cells as 0/1.
    2. Fill missing ``Age`` and ``Fare`` with the column median.
    3. Add ``FamilySize = SibSp + Parch + 1``.
    4. Encode ``Embarked`` as S=0, C=1, Q=2 (missing -> 0).
    5. Optionally divide ``Fare`` by the mean fare of its embarkation port.
    6. Extract the title from ``Name`` and map it to an integer code in
       1..10; titles without a code fall back to 1.
    7. Optionally drop the raw columns and one-hot encode ``Title`` through
       ``self.encode``.

    Args:
        drop: Drop ``Name``, ``SibSp``, ``Parch``, ``Ticket``, ``Cabin`` and
            ``Embarked`` once the features are built.
        title_to_onehot: Call ``self.encode(data, 'Title', [1..10])`` and
            then drop the ``Title`` column.
        norm_fare: Apply the per-port fare normalisation (the result is
            sorted by ``PassengerId``).

    Returns:
        The processed DataFrame.
    """
    # Some titles are very rare and share a code with a more common title.
    title_mapping = {"Dona": 1, "Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4,
                     "Dr": 5, "Rev": 6, "Major": 7, "Col": 7, "Mlle": 8,
                     "Mme": 8, "Don": 9, "Lady": 10, "Countess": 10,
                     "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2}

    def get_title(name):
        # The title is the word that directly precedes the first period.
        match = re.search(r' ([A-Za-z]+)\.', name)
        return match.group(1) if match else ""

    def normalize_fare(data):
        parts = []
        for embarked in (0, 1, 2):
            # Copy the slice so the division never writes into a view of
            # the caller's frame.
            part = data[data.Embarked == embarked].copy()
            part['Fare'] = part['Fare'] / part['Fare'].values.mean()
            parts.append(part)
        # DataFrame.sort() no longer exists; sort_values() replaces it.
        return pd.concat(parts).sort_values('PassengerId')

    data = pd.read_csv(self.file_name).replace('male', 0).replace('female', 1)

    # Assign the filled columns back: fillna(inplace=True) on a column
    # selection is not guaranteed to update the parent frame.
    data['Age'] = data['Age'].fillna(data['Age'].median())
    data['Fare'] = data['Fare'].fillna(data['Fare'].median())
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
    data['Embarked'] = (
        data['Embarked'].replace('S', 0).replace('C', 1).replace('Q', 2).fillna(0)
    )

    if norm_fare:
        data = normalize_fare(data)

    # Get all the titles and print how often each one occurs.
    titles = data["Name"].apply(get_title)
    print(titles.value_counts())

    # Map each title to its integer code. map() leaves unknown titles as NaN,
    # which makes the fallback to code 1 effective (the previous in-place
    # substitution left them as strings, so fillna never applied).
    data['Title'] = titles.map(title_mapping).fillna(1).astype(int)

    if drop:
        data = data.drop(['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked'], axis=1)
        print(data.keys())

    if title_to_onehot:
        self.encode(data, 'Title', list(range(1, 11)))
        data = data.drop(['Title'], axis=1)

    return data