Python pandas module: pandas.DataFrame() example source code
The following 18 code examples, extracted from open-source Python projects, illustrate how pandas.DataFrame() is used.
def _df_initial_fixer(df, word, sample=60000):
'''
function:
- randomly selects "sample" rows (images) from the dataframe df
and deletes features that are not used in ensemble method modeling
input:
df = dataframe; output of 1_feature_engineering_func [pd.DataFrame]
word = name of the topic, e.g. "cat" [str]
sample = number of samples to extract from df [int]
output:
new dataframe
'''
print "total number of images for df_{}: {}".format(word, len(df))
random_index = np.random.choice(list(df.index), sample, replace=False)
df = df.loc[list(random_index)]
df_test = df.drop(['drawing','key_id','timestamp','recognized','X','Y','time',\
'X_per_stroke','Y_per_stroke','time_per_stroke',\
'total_time_of_stroke','dp_per_stroke','dp_percent_per_stroke',\
'direction'], axis=1)
return df_test
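# A minimal usage sketch with an assumed toy frame that only carries the columns the
# fixer drops plus one modeling feature (an illustration, not from the original project):
import numpy as np
import pandas as pd

cols = ['drawing', 'key_id', 'timestamp', 'recognized', 'X', 'Y', 'time',
        'X_per_stroke', 'Y_per_stroke', 'time_per_stroke',
        'total_time_of_stroke', 'dp_per_stroke', 'dp_percent_per_stroke',
        'direction', 'stroke_number']
toy = pd.DataFrame(np.zeros((100, len(cols))), columns=cols)
df_model = _df_initial_fixer(toy, 'cat', sample=50)  # 50 random rows, only 'stroke_number' kept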
def _df_initial_fixer_cc(df, word):
'''
prepares the training/test X and y for the xgboost countrycode classifier
function:
- deletes features that are not used in ensemble method modeling
input:
df = dataframe; output of 1_feature_engineering_func [pd.DataFrame]
word = name of the topic, e.g. "cat" [str]
output:
new dataframe
'''
df_test = df.drop(['drawing','key_id','timestamp','recognized','X','Y','time',\
'X_per_stroke','Y_per_stroke','time_per_stroke',\
'total_time_of_stroke','dp_per_stroke','dp_percent_per_stroke',\
'direction'], axis=1)
return df_test
def _country_initial_fixer(df,country,limit):
'''
Function:
extracts data for one country and randomly selects at most "limit" rows from it
Input:
df = dataframe (must contain a 'countrycode' feature) [DataFrame]
country = two-capital-letter country code, e.g. "US" [str]
limit = maximum number of rows to keep in the new dataframe
Output:
dataframe containing data from the selected country (number of rows <= limit)
note: uses random.seed(32113)
'''
if df[df['countrycode']==country].count()[0] > limit:
df_c = df[df['countrycode']==country]
random_c = np.random.choice(list(df_c.index), limit, replace=False)
df_c = df_c.loc[list(random_c)]
else:
df_c = df[df['countrycode']==country]
return df_c
def postProcessingDatFile(fname, objName=None, root='./'):
if objName!=None:
dataFolder = postProcessingFolder(objName, root=root)
timeNames = timeFolder(root=dataFolder)
else:
dataFolder = addslash(root)
timeNames = []
if len(timeNames)==0: timeNames=[''] # at least check the current folder
keyName = os.path.basename(rmslash(fname))
keyName = os.path.splitext(keyName)[0]
datFiles = []
for subdir in timeNames:
found = filesOnly(sorted(glob.glob(dataFolder + subdir + "/" + keyName + "*.dat")))
for f in found: datFiles.append(f)
return datFiles
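# Several helpers used above (postProcessingFolder, timeFolder, addslash, rmslash,
# filesOnly) are not included in this excerpt; minimal sketches of plausible
# implementations for the three simple path helpers (assumptions, not the project's code):
import os

def addslash(path):
    # ensure a folder path ends with exactly one '/'
    return path.rstrip('/') + '/'

def rmslash(path):
    # strip any trailing '/'
    return path.rstrip('/')

def filesOnly(paths):
    # keep only entries that are regular files (drop directories)
    return [p for p in paths if os.path.isfile(p)]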
# concat dataframes and optionally merge on the xAxis
# When rows overlap, keep 'last', 'first', or False (keep everything)
# list_of_data must contain objects of type pandas.DataFrame
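# The concat routine described by the comments above is not part of this excerpt;
# a minimal sketch, assuming the frames share an x-axis index and overlaps are
# resolved with pandas' duplicate-index handling (an assumption, not the original code):
import pandas as pd

def concat_dataframes(list_of_data, keep='last'):
    # concatenate the frames, then drop overlapping index entries
    merged = pd.concat(list_of_data)
    if keep in ('first', 'last'):
        merged = merged[~merged.index.duplicated(keep=keep)]
    return merged.sort_index()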
def feature_eng_pt3(df_cf):
'''
function:
- feature engineering pt3
needs to run after feature_eng_pt2, since pt4 and pt5
use features created in this function.
- Creates the following feature:
direction = direction of each stroke (from its first XY point to its last XY point)
in radians (0 to 2*pi) [float]
input:
df_cf = output dataframe from feature_eng_pt2
output:
dataframe with the above feature added
approach: find the first and last x,y locations of each stroke,
compute delta x (dx) and delta y (dy),
then compute the stroke direction in radians with the user-defined function "_radian_direction"
'''
direction = {}
for index in df_cf.index:
dx = [float(df_cf.drawing[index][stroke][1][-1] - df_cf.drawing[index][stroke][1][0]) \
for stroke in xrange(df_cf.stroke_number[index])]
dy = [float(df_cf.drawing[index][stroke][0][-1] - df_cf.drawing[index][stroke][0][0]) \
for stroke in xrange(df_cf.stroke_number[index])]
dx = np.array(dx)
dy = np.array(dy)
dx[dx==0] = 0.000001
vecrad_direction = np.vectorize(_radian_direction)
direction[index] = vecrad_direction(dy,dx)
df_cf['direction'] = pd.Series(direction)
return df_cf
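# _radian_direction is referenced above but not defined in this excerpt; a minimal
# sketch, assuming it maps (dy, dx) to an angle in radians in [0, 2*pi)
# (the dx==0 guard above hints that the original may use dy/dx with arctan instead):
import numpy as np

def _radian_direction(dy, dx):
    # arctan2 returns angles in (-pi, pi]; shift negatives into [0, 2*pi)
    return np.arctan2(dy, dx) % (2 * np.pi)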
def load_json(filename):
'''
Function:
- opens a json file and stores the information in a pandas dataframe
- also prints an aggregated df with picture counts per countrycode
Input:
1. filename/path, e.g. ./data/filename.json
Output:
1. new dataframe containing the json info
'''
df = pd.read_json(filename, lines=True)
test = df.groupby(df['countrycode']).count()
print test.sort(columns='drawing',ascending=False).head(15)
return df
def pic_viewer(df_cf, _id):
'''
Function:
- If X and Y columns exist in your dataframe, use this function
to view the drawing with a specific id.
- run this after CNN_feat_eng_pt1 or feature_eng_pt2
Input:
1. dataframe df_cf
2. object id _id
Output:
1. scatter plot of X and Y
'''
plt.scatter(df_cf.X[_id],df_cf.Y[_id])
plt.gca().invert_yaxis()
def random_sort(df, prng=None):
"""Randomly shuffle a DataFrame.
NOTE: if the training data is not randomly shuffled, then
supervised learning may find artifacts related to the order
of the data.
Parameters
----------
df : pd.DataFrame
dataframe with feature information
prng : np.random.RandomState, optional
pseudo-random number generator used for the shuffle; a fresh RandomState is created if None
Returns
-------
df : pd.DataFrame
Randomly shuffled data frame
"""
# get new random state if not specified
if prng is None:
prng = np.random.RandomState()
# get random order
random_indices = prng.choice(df.index.values, # sample from 'genes'
len(df), # number of samples
replace=False) # sample without replacement
# change order of df
random_df = df.ix[random_indices].copy()
return random_df
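# A minimal usage sketch with assumed toy data: pass a seeded RandomState for a
# reproducible shuffle.
import numpy as np
import pandas as pd

toy = pd.DataFrame({'feat': [1, 2, 3, 4]}, index=['g1', 'g2', 'g3', 'g4'])
shuffled = random_sort(toy, prng=np.random.RandomState(0))  # same rows, random order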
def process_mutational_features(mydf):
"""Performs feature processing pipeline.
Parameters
----------
mydf : pd.DataFrame
data frame containing the desired raw data for computation of
features for classifier
Returns
-------
proc_feat_df: pd.DataFrame
dataframe consisting of features for classification
"""
# rename columns to ensure compatibility with previously
# written code
mydf = mydf.rename(columns={'Protein_Change': 'AminoAcid',
'DNA_Change': 'Nucleotide'})
# process features
feat_list = fmat.generate_feature_matrix(mydf, 2)
headers = feat_list.pop(0) # remove header row
feat_df = pd.DataFrame(feat_list, columns=headers) # convert to data frame
proc_feat_df = normalize_mutational_features(feat_df, 0)
miss_ent_df = pentropy.missense_position_entropy(mydf[['Gene', 'AminoAcid']])
# mut_ent_df = pentropy.mutation_position_entropy(mydf[['Gene', 'AminoAcid']])
# incorporate entropy features
#proc_feat_df['mutation position entropy'] = mut_ent_df['mutation position entropy']
#proc_feat_df['pct of uniform mutation entropy'] = mut_ent_df['pct of uniform mutation entropy']
proc_feat_df['missense position entropy'] = miss_ent_df['missense position entropy']
proc_feat_df['pct of uniform missense entropy'] = miss_ent_df['pct of uniform missense entropy']
return proc_feat_df
def pandadf2adeldict(df):
''' converts a pandas dataframe into a dictionary of numpy vectors '''
d = df.to_dict()
return dict((k,np.array([v for v in dv.itervalues()])) for k, dv in d.iteritems())
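# A minimal usage sketch with assumed toy data (not from the original project):
import pandas as pd

toy = pd.DataFrame({'a': [1, 2, 3], 'b': [4.0, 5.0, 6.0]})
vecs = pandadf2adeldict(toy)  # {'a': numpy array of column a, 'b': numpy array of column b}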
def compute_transcription_factor_activity(self, allow_self_interactions_for_duplicate_prior_columns = True):
# Find TFs that have non-zero columns in the priors matrix
non_zero_tfs = self.prior.columns[(self.prior != 0).any(axis=0)].tolist()
# Delete tfs that have neither prior information nor expression
delete_tfs = set(self.prior.columns).difference(self.prior.index).difference(non_zero_tfs)
# Raise warnings
if len(delete_tfs) > 0:
message = " ".join([str(len(delete_tfs)).capitalize(),
"transcription factors are removed because no expression or prior information exists."])
warnings.warn(message)
self.prior = self.prior.drop(delete_tfs, axis = 1)
# Create activity dataframe with values set by default to the transcription factor's expression
activity = pd.DataFrame(self.expression_matrix.loc[self.prior.columns,:].values,
index = self.prior.columns,
columns = self.expression_matrix.columns)
# Find all non-zero TFs that are duplicates of any other non-zero tfs
is_duplicated = self.prior[non_zero_tfs].transpose().duplicated(keep=False)
duplicates = is_duplicated[is_duplicated].index.tolist()
# Find non-zero TFs that are also present in target gene list
self_interacting_tfs = set(non_zero_tfs).intersection(self.prior.index)
# If this flag is set to true, don't count duplicates as self-interacting when setting the diag to zero
if allow_self_interactions_for_duplicate_prior_columns:
self_interacting_tfs = self_interacting_tfs.difference(duplicates)
# Set the diagonal of the matrix subset of self-interacting tfs to zero
subset = self.prior.loc[self_interacting_tfs, self_interacting_tfs].values
np.fill_diagonal(subset, 0)
self.prior.at[self_interacting_tfs, self_interacting_tfs] = subset
# Set the activity of non-zero tfs to the pseudoinverse of the prior matrix times the expression
if non_zero_tfs:
activity.loc[non_zero_tfs,:] = np.matrix(linalg.pinv2(self.prior[non_zero_tfs])) * np.matrix(self.expression_matrix_halftau)
return activity
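# The core of the activity estimate above is a least-squares solve: with prior P
# (genes x TFs) and expression X (genes x samples), activity A = pinv(P) . X.
# A toy illustration with assumed values (not from the original project):
import numpy as np

P = np.array([[1., 0.], [0., 1.], [1., 1.]])  # 3 genes x 2 TFs
X = np.array([[2., 4.], [1., 3.], [3., 7.]])  # 3 genes x 2 samples
A = np.dot(np.linalg.pinv(P), X)              # 2 TFs x 2 samples; best A with P.dot(A) ~ X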
def usage():
print '''
# template for loading openfoam data into a pandas.DataFrame
import sys
sys.path.append("/home/soseng/OpenFOAM/bv/foamBazar/pythonScripts/")
import fsData as fs
from matplotlib import pyplot as plt
if __name__ == "__main__":
log = fs.loadLogData("-p res -w init,Ux,Uy,Uz", logfiles=['log.run','fsLog'])
mot = fs.loadMotionInfo("motionInfo", root='./')
vbm = fs.loadInternalLoads("vbm", root='./', fnames=['my','fz','acc'])
'''
pass
def setmetadata(data, label=None, info=None, module=None, args=None):
data.fsData = deepcopy(FSDATA)
data.fsData['label'] = label
data.fsData['info'] = info if info!=None else 'last update: ' + datetime.date.today().strftime("%I:%M%p %B %d, %Y")
data.fsData['module'] = module
data.fsData['args'] = args if args!=None else {
'lastUpdate':datetime.datetime.utcnow().strftime('%Y%m%d%H%M%S')
}
# load log data given a list of logfiles/folders
# data will be merged and returned as a pandas.DataFrame
# cmd: is fsPlot.py command line arguments
# e.g.: loadLogData('-p res', logfiles=['log.run0','log.run1',''])
# When overlap, either keep 'last', 'first', or 'False'
def feature_engineering_ensemble(df,category,sample=60000,purpose='word',\
countries = ['US','BR','RU','KR']):
'''
function:
- aggregates multiple user-defined functions to create the dataframe for ensemble method modeling.
- also prints how long it takes to run
- processes the raw Google Quick, Draw! dataframe
- after this processing, the dataframe contains 404 features
- the output of this function is used for ensemble method modeling.
input:
- df = dataframe converted from the raw_data json file
- category = used to name the output pickle file
- sample = number of datapoints included in the final dataframe (used only when purpose = 'word')
- purpose = 'word' or 'country'; prepares data for different purposes:
'word' for image recognition, 'country' for country prediction
- countries = list of country codes used in country prediction
output:
- pickled dataframe that will be used for ensemble methods (404 features)
filename: "./data/MY_feature_{}.pkl".format(category)
'''
start_time = time.time()
#runs feature_eng_pt1 through pt5.
df_test1 = feature_eng_pt1(df)
df_test2 = feature_eng_pt2(df_test1)
df_test3 = feature_eng_pt3(df_test2)
df_subset = feature_eng_pt4(df_test3)
df_subset2 = feature_eng_pt5(df_test3)
df_final = pd.concat([df_test3,df_subset,df_subset2], axis=1)
# prepares final dataframe
#If purpose = 'word' it will randomly select 'sample' number of datapoints from df_final
if purpose == 'word':
df_final.index = xrange(len(df_final))
random_ind = np.random.choice(list(df_final.index), sample, replace=False)
df_final = df_final.loc[list(random_ind)]
#If purpose = 'country', it will collect all datapoints from the selected countries.
elif purpose == 'country':
df_final = df_final[(df_final['countrycode']==countries[0])|\
(df_final['countrycode']==countries[1])|\
(df_final['countrycode']==countries[2])|(df_final['countrycode']==countries[3])]
df_final.index = df_final['key_id']
df_final.to_pickle("./data/MY_feature_{}.pkl".format(category))
print("--- %s seconds ---" % (time.time() - start_time))
def feature_engineering_CNN(df,category,sample=60000,purpose='word',countries = ['US','BR','RU','KR']):
'''
function:
- aggregates 2 user-defined functions that prepare the dataframe for CNN modeling.
- also prints how long it takes to run.
input:
- df = dataframe converted from the raw_data json file
- category = used to name the output pickle file
- sample = number of datapoints included in the final dataframe (used only when purpose = 'word')
- purpose = 'word' or 'country'; prepares data for different purposes:
'word' for image recognition, 'country' for country prediction
- countries = list of country codes used in country prediction
output:
- pickled dataframe that will be used for CNN modeling (1176 features)
- each row represents a 42-by-28-pixel image
file name: "./data/{}.pkl".format(category)
'''
start_time = time.time()
#runs CNN feature engineering functions
df_1 = CNN_feat_eng_pt1(df)
df_2 = CNN_feat_eng_pt2(df_1)
#If purpose = 'word' it will randomly select 'sample' number of datapoints from df_final
if purpose == 'word':
df_2.index = xrange(len(df_2))
random_ind = np.random.choice(list(df_2.index), sample, replace=False)
df_2 = df_2.loc[list(random_ind)]
#If purpose = 'country', it will collect all datapoints from the selected countries.
elif purpose == 'country':
df_2 = df_2[(df_2['countrycode']==countries[0])|(df_2['countrycode']==countries[1])|\
(df_2['countrycode']==countries[2])|(df_2['countrycode']==countries[3])]
df_2.index = df_2['key_id']
df_2.to_pickle("./data/{}.pkl".format(category))
print("--- %s seconds ---" % (time.time() - start_time))
return df_2
##############################################################################
# functions for feature engineering for ensemble methods #
##############################################################################
def feature_eng_pt1(df_cf):
'''
function:
- feature engineering pt1
needs to run first, since pt2 to pt5 use features created
in this function.
- creates the following features:
stroke_number = total number of strokes in an image [int]
final_time = time of the last datapoint of an image (how long the user took to draw) [int]
recognized = True/False response changed to an integer flag
(1 is true, 0 is false) [int]
- Filtering applied:
1: filtered out data where recognized == 0.
Having unrecognized images in the dataset may reduce prediction accuracy.
2: filtered out data where stroke_number is greater than 15.
After analysis, most pictures were drawn with fewer than 15 strokes.
If the stroke count is above 20 or 30, the user may have been using a graphics tablet,
so images above 15 strokes are excluded
to keep only images drawn in a similar environment.
3: filtered out data where final_time is greater than 20000.
Some images have time values above 20000 ms even though Quick, Draw!
asks users to draw within 20 seconds, so those drawings are dropped.
input:
df = dataframe created from the Google Quick, Draw! raw data json file
output:
dataframe with the additional features mentioned above
'''
# create feature "stroke_number"
df_cf['stroke_number']=df_cf['drawing'].str.len()
#create feature "final_time"
df_cf['final_time'] = [df_cf.loc[index,'drawing']\
[df_cf.stroke_number[index]-1][2][-1] for index in df_cf.index]
#setting boolean and changing recognized features to 1 and 0.
b_loon = {True: 1, False:0}
df_cf['recognized'] = df_cf['recognized'].map(b_loon)
#filtered data by stroke number, recognized and final time features
df_cf = df_cf[(df_cf['recognized']==1) & (df_cf['stroke_number'] <= 15)]
df_cf = df_cf[(df_cf['final_time']<=20000)]
return df_cf
def feature_eng_pt4(df_cf):
'''
function:
- feature engineering pt4
creates a new dataframe that needs to be combined with the output dataframe
of feature_eng_pt3
- it creates 5 features per stroke.
- this function creates these 5 features for the first 15 strokes of an image
- Creates the following features:
datapoint_percentage_stroke'i' = number of data points in stroke i divided by
the total number of data points of an image [float]
* not to be confused with the dp_percent_per_stroke column created earlier:
dp_percent_per_stroke is a list, datapoint_percentage_stroke'i' is a float.
direction_stroke'i' = direction of stroke 'i' [float]
time_stroke'i' = total time spent on stroke 'i' [int]
datapoints_stroke'i' = number of data points in stroke 'i' [int]
switch_stroke'i' = flag indicating whether stroke 'i' exists in an image
(0: stroke exists, 1: stroke does not exist) [int]
input:
df_cf = output dataframe from feature_eng_pt3
output:
new dataframe with 75 features (5 * 15 features)
'''
ar = np.zeros((len(df_cf),75))
c = 0
for index_ in df_cf.index:
stroke = (df_cf.stroke_number[index_])
ar[c][:stroke] = np.array(df_cf['dp_percent_per_stroke'][index_])
ar[c][15:15+stroke] = np.array(df_cf['direction'][index_])
ar[c][30:30+stroke] = np.array(df_cf['total_time_of_stroke'][index_])
ar[c][45:45+stroke] = np.array(df_cf['dp_per_stroke'][index_])
ar[c][60:75] = np.array([0]*stroke+[1]*(15-stroke))
c += 1
subset = pd.DataFrame(ar)
subset.index = df_cf.index
for num in xrange(15):
subset = subset.rename(columns={num:"datapoint_percentage_stroke{}".format(num)})
for num in xrange(15,30):
subset = subset.rename(columns={num:"direction_stroke{}".format(num-15)})
for num in xrange(30,45):
subset = subset.rename(columns={num:"time_stroke{}".format(num-30)})
for num in xrange(45,60):
subset = subset.rename(columns={num:"datapoint_stroke{}".format(num-45)})
for num in xrange(60,75):
subset = subset.rename(columns={num:"switch_stroke{}".format(num-60)})
return subset
def extract_data_from_file( filename, filepath, good_line_pattern, good_cols=None, labels=None,):
'''YG, developed Oct 17, 2018
Extract data from a file
Input:
filename: str, filename of the data
filepath: str, path of the data
good_line_pattern: str, data will be extracted below this good_line_pattern
good_cols: list of integers, indices of the columns to keep
labels: the labels of the good_cols
#save: False, if True will save the data into a csv file with filename appending csv ??
Return:
a pds.DataFrame
Example:
filepath = '/XF11ID/analysis/2017_3/lwiegart/Link_files/Exports/'
filename = 'ANPES2 15-10-17 16-31-11-84Exported.txt'
good_cols = [ 1,2,4,6,8,10 ]
labels = [ 'time', 'temperature', 'force', 'distance', 'stress', 'strain' ]
good_line_pattern = "Index\tX\tY\tX\tY\tX\tY"
df = extract_data_from_file( filename, filepath, good_line_pattern, good_cols, labels)
'''
import pandas as pds
with open( filepath + filename, 'r' ) as fin:
p=fin.readlines()
di = 1e20
for i, line in enumerate(p):
if good_line_pattern in line:
di = i
if i == di+1:
els = line.split()
if good_cols is None:
data = np.array( els, dtype=float )
else:
data = np.array( [els[j] for j in good_cols], dtype=float )
elif i > di:
try:
els = line.split()
if good_cols is None:
temp = np.array( els, dtype=float )
else:
temp= np.array( [els[j] for j in good_cols], dtype=float )
data=np.vstack( (data,temp))
except:
pass
if labels is None:
labels = np.arange(data.shape[1])
df = pds.DataFrame( data, index= np.arange(data.shape[0]), columns= labels )
return df