The following 18 code examples, extracted from open source Python projects, illustrate how to use pandas.DataFrame().
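For reference, pandas.DataFrame() itself can be built from a dict of columns or from a NumPy array with explicit labels. A minimal sketch (with made-up values) before the project examples:

import numpy as np
import pandas as pd

# from a dict of equal-length columns
df = pd.DataFrame({'countrycode': ['US', 'BR', 'RU'],
                   'stroke_number': [3, 5, 2]})

# from a NumPy array, with explicit row and column labels
arr = np.zeros((3, 2))
df2 = pd.DataFrame(arr, index=df.index, columns=['x', 'y'])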
def _df_initial_fixer(df, word, sample=60000):
    '''
    function:
        - randomly select "sample" rows (images) from the df dataframe and
          delete features that are not used in ensemble method modeling
    input:
        df = dataframe. output of 1_feature_engineering_func. [pd.DataFrame]
        word = name of topic, e.g. "cat" [str]
        sample = number of samples you want to extract from df [int]
    output:
        new data frame!
    '''
    print "total number of images for df_{}: {}".format(word, len(df))
    random_index = np.random.choice(list(df.index), sample, replace=False)
    df = df.loc[list(random_index)]
    df_test = df.drop(['drawing','key_id','timestamp','recognized','X','Y','time',\
                       'X_per_stroke','Y_per_stroke','time_per_stroke',\
                       'total_time_of_stroke','dp_per_stroke','dp_percent_per_stroke',\
                       'direction'], axis=1)
    return df_test
def _df_initial_fixer_cc(df, word):
    '''
    prepares training and test X and Y for xgboost test for countrycode classifier
    function:
        - delete features that are not used in ensemble method modeling
    input:
        df = dataframe. output of 1_feature_engineering_func. [pd.DataFrame]
        word = name of topic, e.g. "cat" [str]
    output:
        new data frame!
    '''
    df_test = df.drop(['drawing','key_id','timestamp','recognized','X','Y','time',\
                       'X_per_stroke','Y_per_stroke','time_per_stroke',\
                       'total_time_of_stroke','dp_per_stroke','dp_percent_per_stroke',\
                       'direction'], axis=1)
    return df_test
def _country_initial_fixer(df, country, limit):
    '''
    Function:
        extracts data by country and randomly selects up to "limit" rows from that subset
    Input:
        df = dataframe (should contain a 'countrycode' feature) [dataframe]
        country = two-capital-letter country code [string]
        limit = max number of rows (data) you want to take into the new data frame
    Output:
        dataframe containing data from the selected country (# of rows <= limit)
    note: uses random.seed(32113)
    '''
    if df[df['countrycode']==country].count()[0] > limit:
        df_c = df[df['countrycode']==country]
        random_c = np.random.choice(list(df_c.index), limit, replace=False)
        df_c = df_c.loc[list(random_c)]
    else:
        df_c = df[df['countrycode']==country]
    return df_c
def postProcessingDatFile(fname, objName=None, root='./'):
    if objName!=None:
        dataFolder = postProcessingFolder(objName, root=root)
        timeNames = timeFolder(root=dataFolder)
    else:
        dataFolder = addslash(root)
        timeNames = []
    if len(timeNames)==0: timeNames=['']  # at least check the current folder
    keyName = os.path.basename(rmslash(fname))
    keyName = os.path.splitext(keyName)[0]
    datFiles = []
    for subdir in timeNames:
        found = filesOnly(sorted(glob.glob(dataFolder + subdir + "/" + keyName + "*.dat")))
        for f in found:
            datFiles.append(f)
    return datFiles

# concat dataframe and optionally merge xAxis
# When overlap, either keep 'last', 'first', or 'False'
# list_of_data must be of type pandas.DataFrame
def feature_eng_pt3(df_cf):
    '''
    function:
        - feature engineering pt3
          need to run this after feature_eng_pt2 since pt4 and pt5 use features
          created in this function.
        - creates the following feature:
          direction = direction of stroke (from first XY point to last XY point)
                      in radians (0 to 6.28...) [float]
    input:
        df_cf = output dataframe from feature_eng_pt2
    output:
        dataframe with the above feature and filtering

    The way I approached this is by finding the first and last x,y locations for
    each stroke; I then calculated delta x (dx) and delta y (dy). From there, I
    calculated the direction of the stroke in radians using my user defined
    function "_radian_direction".
    '''
    direction = {}
    for index in df_cf.index:
        dx = [float(df_cf.drawing[index][stroke][1][-1] - df_cf.drawing[index][stroke][1][0]) \
              for stroke in xrange(df_cf.stroke_number[index])]
        dy = [float(df_cf.drawing[index][stroke][0][-1] - df_cf.drawing[index][stroke][0][0]) \
              for stroke in xrange(df_cf.stroke_number[index])]
        dx = np.array(dx)
        dy = np.array(dy)
        dx[dx==0] = 0.000001
        vecrad_direction = np.vectorize(_radian_direction)
        direction[index] = vecrad_direction(dy, dx)
    df_cf['direction'] = pd.Series(direction)
    return df_cf
def load_json(filename):
    '''
    Function:
        - opens a json file and stores the information in a pandas dataframe
        - also prints out an aggregated df with counts of pictures by countrycode
    Input:
        1. filename/path ex: ./data/filename.json
    Output:
        1. new dataframe containing json info
    '''
    df = pd.read_json(filename, lines=True)
    test = df.groupby(df['countrycode']).count()
    print test.sort(columns='drawing', ascending=False).head(15)
    return df
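Note that DataFrame.sort(columns=...) comes from an older pandas API; on current pandas the same per-country summary can be written with sort_values. A small sketch (the file path is illustrative):

import pandas as pd

# each line of the file is one JSON record, e.g.
# {"countrycode": "US", "drawing": [...], "recognized": true}
df = pd.read_json('./data/cat.json', lines=True)
print(df.groupby('countrycode').size().sort_values(ascending=False).head(15))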
def pic_viewer(df_cf, _id):
    '''
    Function:
        - If X and Y columns exist in your dataframe, you can use this function
          to view the drawing with a specific id.
        - run this after running CNN_feat_eng_pt1 or feature_eng_pt2
    Input:
        1. dataframe df_cf
        2. object id _id
    Output:
        1. scatter plot of x and y
    '''
    plt.scatter(df_cf.X[_id], df_cf.Y[_id])
    plt.gca().invert_yaxis()
def random_sort(df, prng=None):
    """Randomly shuffle a DataFrame.

    NOTE: if the training data is not randomly shuffled, then
    supervised learning may find artifacts related to the order
    of the data.

    Parameters
    ----------
    df : pd.DataFrame
        dataframe with feature information
    prng : np.random.RandomState, optional
        pseudo-random number generator; a new one is created if not provided

    Returns
    -------
    df : pd.DataFrame
        Randomly shuffled data frame
    """
    # get new random state if not specified
    if prng is None:
        prng = np.random.RandomState()

    # get random order
    random_indices = prng.choice(df.index.values,  # sample from 'genes'
                                 len(df),          # number of samples
                                 replace=False)    # sample without replacement

    # change order of df
    random_df = df.ix[random_indices].copy()
    return random_df
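A short usage sketch with made-up data (assuming a pandas version old enough to still provide the .ix indexer used above); a fixed RandomState seed makes the shuffle reproducible:

import numpy as np
import pandas as pd

df = pd.DataFrame({'gene': ['TP53', 'KRAS', 'EGFR'], 'score': [0.9, 0.4, 0.7]})
prng = np.random.RandomState(42)   # fixed seed => same row order every run
shuffled = random_sort(df, prng)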
def process_mutational_features(mydf):
    """Performs feature processing pipeline.

    Parameters
    ----------
    mydf : pd.DataFrame
        data frame containing the desired raw data for computation of
        features for classifier

    Returns
    -------
    proc_feat_df : pd.DataFrame
        dataframe consisting of features for classification
    """
    # rename columns to ensure compatibility with previously written code
    mydf = mydf.rename(columns={'Protein_Change': 'AminoAcid',
                                'DNA_Change': 'Nucleotide'})

    # process features
    feat_list = fmat.generate_feature_matrix(mydf, 2)
    headers = feat_list.pop(0)  # remove header row
    feat_df = pd.DataFrame(feat_list, columns=headers)  # convert to data frame
    proc_feat_df = normalize_mutational_features(feat_df, 0)
    miss_ent_df = pentropy.missense_position_entropy(mydf[['Gene', 'AminoAcid']])
    # mut_ent_df = pentropy.mutation_position_entropy(mydf[['Gene', 'AminoAcid']])

    # incorporate entropy features
    #proc_feat_df['mutation position entropy'] = mut_ent_df['mutation position entropy']
    #proc_feat_df['pct of uniform mutation entropy'] = mut_ent_df['pct of uniform mutation entropy']
    proc_feat_df['missense position entropy'] = miss_ent_df['missense position entropy']
    proc_feat_df['pct of uniform missense entropy'] = miss_ent_df['pct of uniform missense entropy']
    return proc_feat_df
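The pd.DataFrame(feat_list, columns=headers) step above follows a common pattern: pop a header row off a list of rows and use it as column labels. A standalone sketch with made-up values:

import pandas as pd

feat_list = [['gene', 'missense', 'silent'],  # header row
             ['TP53', 12, 3],
             ['KRAS', 8, 1]]
headers = feat_list.pop(0)                    # remove header row
feat_df = pd.DataFrame(feat_list, columns=headers)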
def pandadf2adeldict(df):
    '''
    Converts a pandas dataframe into a dictionary of numpy vectors.
    '''
    d = df.to_dict()
    return dict((k, np.array([v for v in dv.itervalues()])) for k, dv in d.iteritems())
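The itervalues/iteritems calls above are Python 2 only; a sketch of the same conversion under Python 3 (the name pandadf2adeldict_py3 is mine, not from the project):

import numpy as np
import pandas as pd

def pandadf2adeldict_py3(df):
    # df.to_dict() returns {column: {index: value}}; collect each column into a numpy vector
    return {k: np.array(list(dv.values())) for k, dv in df.to_dict().items()}

d = pandadf2adeldict_py3(pd.DataFrame({'a': [1, 2], 'b': [3, 4]}))
# d == {'a': array([1, 2]), 'b': array([3, 4])}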
def compute_transcription_factor_activity(self, allow_self_interactions_for_duplicate_prior_columns = True):
    # Find TFs that have non-zero columns in the priors matrix
    non_zero_tfs = self.prior.columns[(self.prior != 0).any(axis=0)].tolist()
    # Delete tfs that have neither prior information nor expression
    delete_tfs = set(self.prior.columns).difference(self.prior.index).difference(non_zero_tfs)

    # Raise warnings
    if len(delete_tfs) > 0:
        message = " ".join([str(len(delete_tfs)).capitalize(),
                            "transcription factors are removed because no expression or prior information exists."])
        warnings.warn(message)
        self.prior = self.prior.drop(delete_tfs, axis = 1)

    # Create activity dataframe with values set by default to the transcription factor's expression
    activity = pd.DataFrame(self.expression_matrix.loc[self.prior.columns,:].values,
                            index = self.prior.columns,
                            columns = self.expression_matrix.columns)

    # Find all non-zero TFs that are duplicates of any other non-zero tfs
    is_duplicated = self.prior[non_zero_tfs].transpose().duplicated(keep=False)
    duplicates = is_duplicated[is_duplicated].index.tolist()

    # Find non-zero TFs that are also present in target gene list
    self_interacting_tfs = set(non_zero_tfs).intersection(self.prior.index)

    # If this flag is set to true, don't count duplicates as self-interacting when setting the diag to zero
    if allow_self_interactions_for_duplicate_prior_columns:
        self_interacting_tfs = self_interacting_tfs.difference(duplicates)

    # Set the diagonal of the matrix subset of self-interacting tfs to zero
    subset = self.prior.loc[self_interacting_tfs, self_interacting_tfs].values
    np.fill_diagonal(subset, 0)
    self.prior.at[self_interacting_tfs, self_interacting_tfs] = subset

    # Set the activity of non-zero tfs to the pseudoinverse of the prior matrix times the expression
    if non_zero_tfs:
        activity.loc[non_zero_tfs,:] = np.matrix(linalg.pinv2(self.prior[non_zero_tfs])) * np.matrix(self.expression_matrix_halftau)

    return activity
def usage():
    print '''
# template for loading openfoam data into pandas.DataFrame
import sys
sys.path.append("/home/soseng/OpenFOAM/bv/foamBazar/pythonScripts/")
import fsData as fs
from matplotlib import pyplot as plt

if __name__ == "__main__":
    log = fs.loadLogData("-p res -w init,Ux,Uy,Uz", logfiles=['log.run','fsLog'])
    mot = fs.loadMotionInfo("motionInfo", root='./')
    vbm = fs.loadInternalLoads("vbm", root='./', fnames=['my','fz','acc'])
'''
    pass
def setmetadata(data, label=None, info=None, module=None, args=None):
    data.fsData = deepcopy(FSDATA)
    data.fsData['label'] = label
    data.fsData['info'] = info if info!=None else 'last update: ' + datetime.date.today().strftime("%I:%M%p %B %d, %Y")
    data.fsData['module'] = module
    data.fsData['args'] = args if args!=None else { 'lastUpdate':datetime.datetime.utcnow().strftime('%Y%m%d%H%M%S') }

# load log data given a list of logfiles/folders
# data will be merged and returned as a pandas.DataFrame
# cmd: is fsPlot.py command line arguments
# e.g.: loadLogData('-p res', logfiles=['log.run0','log.run1',''])
# When overlap, either keep 'last', 'first', or 'False'
def feature_engineering_ensemble(df, category, sample=60000, purpose='word',\
                                 countries = ['US','BR','RU','KR']):
    '''
    function:
        - aggregates multiple user defined functions to create a dataframe for
          ensemble method modeling. it also prints out how long it takes to run.
        - processes the google quickdraw raw data dataframe; after this
          processing, the dataframe contains 404 features.
        - the output of this function will be used for ensemble method modeling.
    input:
        - df = dataframe that was converted from the raw_data json file
        - category = used to name the output pickle file
        - sample = number of datapoints included in the final dataframe
          (used only when purpose = 'word')
        - purpose = 'word' or 'country'. prepares data for different purposes:
          'word' for image recognition, 'country' for country prediction
        - countries = list of country codes used in country prediction
    output:
        - pickled dataframe that will be used for ensemble methods (404 features)
          filename: "./data/MY_feature_{}.pkl".format(category)
    '''
    start_time = time.time()

    # runs feature_eng_pt1 through pt5
    df_test1 = feature_eng_pt1(df)
    df_test2 = feature_eng_pt2(df_test1)
    df_test3 = feature_eng_pt3(df_test2)
    df_subset = feature_eng_pt4(df_test3)
    df_subset2 = feature_eng_pt5(df_test3)
    df_final = pd.concat([df_test3, df_subset, df_subset2], axis=1)  # prepares final dataframe

    # If purpose = 'word', randomly select 'sample' datapoints from df_final
    if purpose == 'word':
        df_final.index = xrange(len(df_final))
        random_ind = np.random.choice(list(df_final.index), sample, replace=False)
        df_final = df_final.loc[list(random_ind)]
    # If purpose = 'country', collect all datapoints from the selected countries
    elif purpose == 'country':
        df_final = df_final[(df_final['countrycode']==countries[0])|\
                            (df_final['countrycode']==countries[1])|\
                            (df_final['countrycode']==countries[2])|\
                            (df_final['countrycode']==countries[3])]
        df_final.index = df_final['key_id']

    df_final.to_pickle("./data/MY_feature_{}.pkl".format(category))
    print("--- %s seconds ---" % (time.time() - start_time))
def feature_engineering_CNN(df, category, sample=60000, purpose='word', countries = ['US','BR','RU','KR']):
    '''
    function:
        - aggregates 2 user defined functions that prepare the dataframe for CNN modeling.
        - it also prints out how long it takes to run.
    input:
        - df = dataframe that was converted from the raw_data json file
        - category = used to name the output pickle file
        - sample = number of datapoints included in the final dataframe
          (used only when purpose = 'word')
        - purpose = 'word' or 'country'. prepares data for different purposes:
          'word' for image recognition, 'country' for country prediction
        - countries = list of country codes used in country prediction
    output:
        - pickled dataframe that will be used for CNN modeling (1176 features)
        - each row represents a 42 by 28 pixel image
          file name: "./data/{}.pkl".format(category)
    '''
    start_time = time.time()

    # runs CNN feature engineering functions
    df_1 = CNN_feat_eng_pt1(df)
    df_2 = CNN_feat_eng_pt2(df_1)

    # If purpose = 'word', randomly select 'sample' datapoints from df_2
    if purpose == 'word':
        df_2.index = xrange(len(df_2))
        random_ind = np.random.choice(list(df_2.index), sample, replace=False)
        df_2 = df_2.loc[list(random_ind)]
    # If purpose = 'country', collect all datapoints from the selected countries
    elif purpose == 'country':
        df_2 = df_2[(df_2['countrycode']==countries[0])|(df_2['countrycode']==countries[1])|\
                    (df_2['countrycode']==countries[2])|(df_2['countrycode']==countries[3])]
        df_2.index = df_2['key_id']

    df_2.to_pickle("./data/{}.pkl".format(category))
    print("--- %s seconds ---" % (time.time() - start_time))
    return df_2

##############################################################################
# functions for feature engineering for ensemble methods                     #
##############################################################################
def feature_eng_pt1(df_cf):
    '''
    function:
        - feature engineering pt1
          need to run this first since pt2 to pt5 use features created in this function.
        - creates the following features:
          stroke_number = total stroke number of an image [int]
          final_time = time of the last datapoint for an image
                       (how long it took the user to draw) [int]
          recognized = changed True/False response to boolean (1 is true, 0 is false) [int]
        - Filtering applied:
          1: filtered out data where recognized == 0. Having unrecognized images
             in the dataset may reduce prediction accuracy.
          2: filtered out data where stroke_number is greater than 15.
             After analysis, most pics were drawn under 15 strokes. I'm suspecting
             that if stroke numbers are above 20 or 30, users might be using a
             graphic tablet. In this project, I tried to exclude those images above
             15 strokes, so that I keep all images that are drawn in a similar
             environment.
          3: filtered out data where final time is greater than 20000.
             I do not know how this happens, but some images have time values that
             are more than 20000. Quickdraw asks users to draw in 20 sec, so I am a
             bit puzzled how these users drew for more than 20000 ms.
    input:
        df = dataframe created from the Google quickdraw raw data json file
    output:
        dataframe with the additional features mentioned above
    '''
    # create feature "stroke_number"
    df_cf['stroke_number'] = df_cf['drawing'].str.len()

    # create feature "final_time"
    df_cf['final_time'] = [df_cf.loc[index,'drawing']\
                           [df_cf.stroke_number[index]-1][2][-1] for index in df_cf.index]

    # setting boolean and changing the recognized feature to 1 and 0
    b_loon = {True: 1, False: 0}
    df_cf['recognized'] = df_cf['recognized'].map(b_loon)

    # filter data by stroke number, recognized and final time features
    df_cf = df_cf[(df_cf['recognized']==1) & (df_cf['stroke_number'] <= 15)]
    df_cf = df_cf[(df_cf['final_time']<=20000)]
    return df_cf
def feature_eng_pt4(df_cf):
    '''
    function:
        - feature engineering pt4
          creates a new dataframe that needs to be combined with the output
          dataframe of feature_eng_pt3
        - it creates 5 features per stroke, for the first 15 strokes of an image
        - creates the following features:
          datapoint_percentage_stroke'i' = # of data points in stroke i divided by
              the total number of data points of an image [float]
              * do not confuse with the dp_percent_per_stroke column I previously
                made. dp_percent_per_stroke is a list;
                datapoint_percentage_stroke'i' is a float!
          direction_stroke'i' = direction of stroke 'i' [float]
          time_stroke'i' = total time spent on stroke 'i' [int]
          datapoints_stroke'i' = number of data points in stroke i [int]
          switch_stroke'i' = boolean indicating whether stroke 'i' exists in an image
              0: stroke exists   1: stroke does not exist [int]
    input:
        df_cf = output dataframe from feature_eng_pt3
    output:
        new dataframe with 75 features (5 * 15 features)
    '''
    ar = np.zeros((len(df_cf), 75))
    c = 0
    for index_ in df_cf.index:
        stroke = (df_cf.stroke_number[index_])
        ar[c][:stroke] = np.array(df_cf['dp_percent_per_stroke'][index_])
        ar[c][15:15+stroke] = np.array(df_cf['direction'][index_])
        ar[c][30:30+stroke] = np.array(df_cf['total_time_of_stroke'][index_])
        ar[c][45:45+stroke] = np.array(df_cf['dp_per_stroke'][index_])
        ar[c][60:75] = np.array([0]*stroke + [1]*(15-stroke))
        c += 1

    subset = pd.DataFrame(ar)
    subset.index = df_cf.index
    for num in xrange(15):
        subset = subset.rename(columns={num: "datapoint_percentage_stroke{}".format(num)})
    for num in xrange(15, 30):
        subset = subset.rename(columns={num: "direction_stroke{}".format(num-15)})
    for num in xrange(30, 45):
        subset = subset.rename(columns={num: "time_stroke{}".format(num-30)})
    for num in xrange(45, 60):
        subset = subset.rename(columns={num: "datapoint_stroke{}".format(num-45)})
    for num in xrange(60, 75):
        subset = subset.rename(columns={num: "switch_stroke{}".format(num-60)})
    return subset
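In isolation, the DataFrame construction used above builds an unlabeled frame from a NumPy array (columns default to the integers 0..n-1) and then renames columns. A tiny sketch with a hypothetical 2x4 array standing in for the 75-column block:

import numpy as np
import pandas as pd

ar = np.zeros((2, 4))
subset = pd.DataFrame(ar)                      # columns are the integers 0..3
subset = subset.rename(columns={0: "datapoint_percentage_stroke0",
                                1: "direction_stroke0"})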
def extract_data_from_file(filename, filepath, good_line_pattern, good_cols=None, labels=None):
    '''YG Develop Oct 17, 2018
       Extract data from a file
    Input:
        filename: str, filename of the data
        filepath: str, path of the data
        good_line_pattern: str, data will be extracted below this good_line_pattern
        good_cols: list of integers, good indexes of cols
        labels: the labels of the good_cols
        #save: False, if True will save the data into a csv file with filename appending csv ??
    Return:
        a pds.DataFrame
    Example:
        filepath = '/XF11ID/analysis/2017_3/lwiegart/Link_files/Exports/'
        filename = 'ANPES2 15-10-17 16-31-11-84Exported.txt'
        good_cols = [1, 2, 4, 6, 8, 10]
        labels = ['time', 'temperature', 'force', 'distance', 'stress', 'strain']
        good_line_pattern = "Index\tX\tY\tX\tY\tX\tY"
        df = extract_data_from_file(filename, filepath, good_line_pattern, good_cols, labels)
    '''
    import pandas as pds
    with open(filepath + filename, 'r') as fin:
        p = fin.readlines()
        di = 1e20
        for i, line in enumerate(p):
            if good_line_pattern in line:
                di = i
            if i == di+1:
                els = line.split()
                if good_cols is None:
                    data = np.array(els, dtype=float)
                else:
                    data = np.array([els[j] for j in good_cols], dtype=float)
            elif i > di:
                try:
                    els = line.split()
                    if good_cols is None:
                        temp = np.array(els, dtype=float)
                    else:
                        temp = np.array([els[j] for j in good_cols], dtype=float)
                    data = np.vstack((data, temp))
                except:
                    pass
    if labels is None:
        labels = np.arange(data.shape[1])
    df = pds.DataFrame(data, index=np.arange(data.shape[0]), columns=labels)
    return df