The following 50 code examples, extracted from open-source Python projects, illustrate how to use pandas.read_excel().
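Before the extracted examples, here is a minimal sketch of the call they all build on (the file and sheet names here are placeholders, not from any of the projects below):

import pandas as pd

# Read one sheet of a workbook into a DataFrame.
# "report.xlsx" and "Sheet1" are hypothetical names.
df = pd.read_excel("report.xlsx", sheet_name="Sheet1")
print(df.head())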
def getExcelData(self):
    """
    Get data from 'hsi_futures.xlsx'.

    Date | Open | High | Low | Close | SMAVG5 | SMAVG10 | SMAVG15 | Volume | VolumeSMAVG5

    :return: data table
    """
    df = pd.DataFrame()
    xl = pd.ExcelFile("../dataManager/hsi_futures.xlsx")
    # print xl.sheet_names
    sheets = xl.sheet_names
    for sheet in sheets:
        df = df.append(pd.read_excel("../dataManager/hsi_futures.xlsx", sheet))
    df['Date'] = pd.to_datetime(df['Date'])
    df.sort_values("Date", ascending=True, inplace=True)
    data = df.reset_index(drop=True)  # renumber rows 0..n-1
    return data
def get_hs300s():
    """
    Get the current HS300 (CSI 300) constituents and their index weights.

    Return
    --------
    DataFrame
        code  : stock code
        name  : stock name
        date  : date
        weight: index weight
    """
    from tushare.stock.fundamental import get_stock_basics
    try:
        wt = pd.read_excel(ct.HS300_CLASSIFY_URL_FTP % (ct.P_TYPE['ftp'], ct.DOMAINS['idxip'],
                                                        ct.PAGES['hs300w']), parse_cols=[0, 3, 6])
        wt.columns = ct.FOR_CLASSIFY_W_COLS
        wt['code'] = wt['code'].map(lambda x: str(x).zfill(6))
        df = get_stock_basics()[['name']]
        df = df.reset_index()
        return pd.merge(df, wt)
    except Exception as er:
        print(str(er))
def get_sz50s():
    """
    Get the SZ50 (SSE 50) constituents.

    Return
    --------
    DataFrame
        code: stock code
        name: stock name
    """
    try:
        df = pd.read_excel(ct.HS300_CLASSIFY_URL_FTP % (ct.P_TYPE['ftp'], ct.DOMAINS['idxip'],
                                                        ct.PAGES['sz50b']), parse_cols=[0, 1])
        df.columns = ct.FOR_CLASSIFY_B_COLS
        df['code'] = df['code'].map(lambda x: str(x).zfill(6))
        return df
    except Exception as er:
        print(str(er))
def get_zz500s():
    """
    Get the ZZ500 (CSI 500) constituents.

    Return
    --------
    DataFrame
        code: stock code
        name: stock name
    """
    from tushare.stock.fundamental import get_stock_basics
    try:
        # df = pd.read_excel(ct.HS300_CLASSIFY_URL_FTP % (ct.P_TYPE['ftp'], ct.DOMAINS['idxip'],
        #                                                 ct.PAGES['zz500b']), parse_cols=[0, 1])
        # df.columns = ct.FOR_CLASSIFY_B_COLS
        # df['code'] = df['code'].map(lambda x: str(x).zfill(6))
        wt = pd.read_excel(ct.HS300_CLASSIFY_URL_FTP % (ct.P_TYPE['ftp'], ct.DOMAINS['idxip'],
                                                        ct.PAGES['zz500wt']), parse_cols=[0, 3, 6])
        wt.columns = ct.FOR_CLASSIFY_W_COLS
        wt['code'] = wt['code'].map(lambda x: str(x).zfill(6))
        df = get_stock_basics()[['name']]
        df = df.reset_index()
        return pd.merge(df, wt)
    except Exception as er:
        print(str(er))
def search():
    # The original Chinese column names and labels were lost to encoding
    # and are kept as placeholders below.
    df = pd.read_excel("huatai2.xls")
    input_m = 0.0
    output_m = 0.0
    for index, row in df.iterrows():
        if row[u'??'] == u'??':
            each_input = row[u'?????']
            print(u"??", each_input)
            input_m = input_m + each_input
        if row[u'??'] == u'??':
            each_output = row[u'?????']
            print(u"??", each_output)
            output_m = output_m + each_output
    print("Summary is %f" % (input_m - output_m))
def replace_test():
    df = pd.read_excel("huatai2.xls")
    s1 = pd.Series(['a', 'b', 'c', 'd', 'e'])
    s2 = pd.Series(['1', '2', '3', '4', '5'])
    s3 = s1.replace(1, 'k')
    print(df)
    # The original Chinese labels in the replacement lists were lost to encoding.
    df.replace(['20160722', u'????', 2431.0, u'????', 13.00, 300.0, 3891.10, 3905.71, u'??'],
               ['20160722', '0', '0', '0', 0, 0, 0, 0, '0'], inplace=True)
    # df.replace(['20160722'], ['20160725', '0', '0', '0', 0, 0, 0, 0, '0'], inplace=True)
    print(df)
def load_articles(self):
    """
    Loads the DataFrame with all the articles.

    Return: DataFrame with the title, content, tags and author of all articles
    """
    path = os.path.join(self.ip_file_path, self.ip_file_name)
    # Load articles in a DataFrame
    self.df = pd.read_excel(path, na_values=['NA'], parse_cols="A,B,C")
    # self.df = self.df[['Sno', 'title', 'content_text']]  # slice to remove redundant columns
    print(self.df)
    logging.debug("Number of articles: {0} and no of columns are {1} \n".format(len(self.df), self.df.shape))
def load_articles(self):
    """
    Loads the DataFrame with all the articles.

    Return: DataFrame with the title, content, tags and author of all articles
    """
    file_path = '/Users/shwetanknagar/Downloads/Personal/Project Eventstreet/Boconni Project'
    file_name = os.path.basename("TestSet300_User_Ratings.xlsx")
    path = os.path.join(file_path, file_name)
    # Load articles in a DataFrame
    self.df = pd.read_excel(path, na_values=['NA'], parse_cols="A,B,C")
    # self.df = self.df[['Sno', 'title', 'content_text']]  # slice to remove redundant columns
    print(self.df)
    logging.debug("Number of articles: {0} and no of columns are {1} \n".format(len(self.df), self.df.shape))
def test_load_xlsx(self):
    url = 'http://test.com/the.xlsx'
    self.url_pval.set_value(url)
    self.url_pval.save()

    xlsx_bytes = open(mock_xslx_path, "rb").read()
    xlsx_table = pd.read_excel(mock_xslx_path)

    # success case
    with requests_mock.Mocker() as m:
        m.get(url, content=xlsx_bytes,
              headers={'content-type': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'})
        self.press_fetch_button()
        response = self.get_render()
        self.assertEqual(response.content, make_render_json(xlsx_table))

    # malformed file should put module in error state
    with requests_mock.Mocker() as m:
        m.get(url, content=b"there's just no way this is xlsx",
              headers={'content-type': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'})
        self.press_fetch_button()
        self.wfmodule.refresh_from_db()
        self.assertEqual(self.wfmodule.status, WfModule.ERROR)
def main():
    df = pd.read_excel('../data/Tariffs.xlsx')
    df.loc[df['Tariff'] == 'Low', 'Tariff'] = 0.0399
    df.loc[df['Tariff'] == 'Normal', 'Tariff'] = 0.1176
    df.loc[df['Tariff'] == 'High', 'Tariff'] = 0.6720

    ets = ExtractTimeSeries(datetime_col='TariffDateTime', yt_col='Tariff')
    df = ets.transform(df)

    day = pd.to_datetime('2013-12-27').date()
    next_day = day + timedelta(days=1)
    df_out = df.query('index >= @day and index < @next_day')
    df_out.columns = ['Tariff (UK Pounds)']

    print_process('Saving Post-Processed Data')
    path_to_price = '../clean_data/price_data_London.csv'
    df_out.to_csv(path_to_price)
    print('Tariff data saved into: {}'.format(path_to_price))
    print()
def _load_table(self, filepath):
    """
    Load table from file system.

    :param str filepath: Path to table in CSV, TSV, XLSX or Pandas pickle format.
    :return: Pandas table
    :rtype: pandas.core.frame.DataFrame
    """
    _, ext = os.path.splitext(filepath.lower())
    if ext == '.tsv':
        return pd.read_table(filepath, **self.kwargs)
    if ext == '.csv':
        return pd.read_csv(filepath, **self.kwargs)
    if ext == '.xlsx':
        return pd.read_excel(filepath, **self.kwargs)
    return pd.read_pickle(filepath, **self.kwargs)
def read_excel(self, file):
    # TODO: add iterator and return columns
    excel_tab = pd.read_excel(file, dtype=str)
    columns = excel_tab.columns

    def make_gen(excel_tab, chunksize):
        # Slice the already-loaded frame into successive chunks.
        cursor = 0
        chunk = excel_tab.iloc[:chunksize]
        while chunk.shape[0]:
            yield chunk
            cursor += chunksize
            chunk = excel_tab.iloc[cursor:cursor + chunksize]

    tab = make_gen(excel_tab, self.CHUNKSIZE)
    tab = (self._clean_header(tab_part) for tab_part in tab)
    return tab, None, None, self._clean_column_names(columns)
def GetAllTodayData(self):
    # Fetch today's quotes for every stock and cache them to an Excel file;
    # if the file already exists, read it back instead.
    filename = self.today + '_all_.xls'
    filename = os.path.join(self.path, filename)
    if not os.path.exists(filename):
        self.df_today_all = ts.get_today_all()
        # Drop suspended stocks (zero turnover ratio).
        self.df_today_all.drop(self.df_today_all[self.df_today_all['turnoverratio'] == 0].index,
                               inplace=True)
        print(self.df_today_all)
        self.df_today_all.to_excel(filename, sheet_name='All')
    else:
        self.df_today_all = pd.read_excel(filename, sheet_name='All')
        print("File existed")
def count_up_down(filename):
    total = []
    df = pd.read_excel(filename)
    count = len(df[(df['changepercent'] >= -10.2) & (df['changepercent'] < -9)])
    total.append(count)
    for i in range(-9, 9, 1):
        count = len(df[(df['changepercent'] >= i * 1.00) & (df['changepercent'] < (i + 1) * 1.00)])
        total.append(count)
    count = len(df[(df['changepercent'] >= 9)])
    total.append(count)
    print(total)
    df_figure = pd.Series(total, index=range(-10, 10))
    print(df_figure)
    fg = df_figure.plot(kind='bar', table=True)
    plt.show()
def read_res(file):
    # Read model data from xls files.
    Size, R, xr, xl, xc, yu, yd, yc = [], [], [], [], [], [], [], []  # areas, Fij's, dept coordinates
    out = pd.read_excel(file, sheetname="Out")       # model results
    Rout = pd.read_excel(file, sheetname="R")        # Fij's
    Sizeout = pd.read_excel(file, sheetname="Size")  # departments' wanted sizes
    Wout = pd.read_excel(file, sheetname="W")        # w1 and w2
    w1 = float(Wout['w1'][0])
    w2 = 1.0 - w1
    totx = float(out['totx'][0])  # total length in x axis
    toty = float(out['toty'][0])  # total length in y axis
    for d in range(len(Sizeout)):
        # Insert result data into Python lists.
        R.append([])
        Size.append(float(Sizeout['Area'][d]))
        xr.append(float(out['Xr'][d]))
        xl.append(float(out['Xl'][d]))
        xc.append((float(out['Xl'][d]) + float(out['Xr'][d])) / 2)
        yu.append(float(out['Yu'][d]))
        yd.append(float(out['Yd'][d]))
        yc.append((float(out['Yu'][d]) + float(out['Yd'][d])) / 2)
        for i in range(len(Rout)):
            R[d].append(float(Rout.iloc[d, i]))
    return Size, R, totx, toty, xr, xl, xc, yu, yd, yc, w1, w2
def groups(ofname):
    df = pandas.read_excel('GC-VTPR.xlsx', sheetname='Groups')
    entries = []
    for i, row in df.iterrows():
        entry = {
            "Q_k": row['Qk'],
            "R_k": row['Rk'],
            "maingroup_name": row["main group name"],
            "mgi": row['main group index'],
            "sgi": row['sub group index'],
            "subgroup_name": row["sub group name"]
        }
        entries.append(entry)
    with open(ofname, 'w') as fp:
        json.dump(entries, fp, indent=2, sort_keys=True)
def interaction_parameters(ofname):
    df = pandas.read_excel('GC-VTPR.xlsx', sheetname='InteractionParameters')
    df = df.fillna(0.0)
    entries = []
    for i, row in df.iterrows():
        entry = {
            "a_ij": row['aij / K'],
            "a_ji": row['aji / K'],
            "b_ij": row['bij'],
            "b_ji": row['bji'],
            "c_ij": row['cij / K-1'],
            "c_ji": row['cji / K-1'],
            "mgi1": row['i'],
            "mgi2": row['j']
        }
        entries.append(entry)
    with open(ofname, 'w') as fp:
        json.dump(entries, fp, indent=2, sort_keys=True)
def _load_powerplant():
    """
    Attribute information: features consist of hourly average ambient variables
    - temperature (t) in the range 1.81 C to 37.11 C,
    - ambient pressure (ap) in the range 992.89-1033.30 millibar,
    - relative humidity (rh) in the range 25.56% to 100.16%
    - exhaust vacuum (v) in the range 25.36-81.56 cm Hg
    - net hourly electrical energy output (ep) 420.26-495.76 mw
    The averages are taken from various sensors located around the plant that
    record the ambient variables every second. The variables are given without
    normalization.
    """
    data_file = os.path.join(data_dir, 'power-plant/Folds5x2_pp.xlsx')
    data = pd.read_excel(data_file)
    x = data.values[:, :-1]
    y = data.values[:, -1]
    return x, y
def _fetch_data(self, dataset, query=None):
    files = [(y, m) for y in query['years'] for m in query['months']]
    frames = []

    # Download and clean every monthly Excel file
    for file in files:
        year, month = file
        url = self.BASE_URL.format(year=year, month=MONTHS[month])
        frame = self._clean_data(pd.read_excel(url), year, month)
        frames.append(frame)

    # Yield individual rows of type Result from the dataframe
    raw_data = pd.concat(frames)
    for i, row in raw_data.iterrows():
        val = row.pop('value')
        yield Result(val, json.loads(row.to_json()))
def get_exceldf(self, basename, *args, **kwds):
    """
    Return test data DataFrame. Test data path is defined by
    pandas.util.testing.get_data_path().

    Parameters
    ----------
    basename : str
        File base name, excluding file extension.

    Returns
    -------
    df : DataFrame
    """
    pth = os.path.join(self.dirpath, basename + self.ext)
    return read_excel(pth, *args, **kwds)
def test_read_one_empty_col_no_header(self):
    df = pd.DataFrame(
        [["", 1, 100], ["", 2, 200], ["", 3, 300], ["", 4, 400]]
    )
    with ensure_clean(self.ext) as path:
        df.to_excel(path, 'no_header', index=False, header=False)
        actual_header_none = read_excel(
            path, 'no_header', parse_cols=[0], header=None
        )
        actual_header_zero = read_excel(
            path, 'no_header', parse_cols=[0], header=0
        )
    expected = DataFrame()
    tm.assert_frame_equal(actual_header_none, expected)
    tm.assert_frame_equal(actual_header_zero, expected)
def test_read_from_file_url(self):
    # FILE
    if sys.version_info[:2] < (2, 6):
        raise nose.SkipTest("file:// not supported with Python < 2.6")

    localtable = os.path.join(self.dirpath, 'test1' + self.ext)
    local_table = read_excel(localtable)

    try:
        url_table = read_excel('file://localhost/' + localtable)
    except URLError:
        # fails on some systems
        import platform
        raise nose.SkipTest("failing on %s" % ' '.join(platform.uname()).strip())

    tm.assert_frame_equal(url_table, local_table)
def test_read_excel_skiprows_list(self):
    # GH 4903
    actual = pd.read_excel(os.path.join(self.dirpath,
                                        'testskiprows' + self.ext),
                           'skiprows_list', skiprows=[0, 2])
    expected = DataFrame([[1, 2.5, pd.Timestamp('2015-01-01'), True],
                          [2, 3.5, pd.Timestamp('2015-01-02'), False],
                          [3, 4.5, pd.Timestamp('2015-01-03'), False],
                          [4, 5.5, pd.Timestamp('2015-01-04'), True]],
                         columns=['a', 'b', 'c', 'd'])
    tm.assert_frame_equal(actual, expected)

    actual = pd.read_excel(os.path.join(self.dirpath,
                                        'testskiprows' + self.ext),
                           'skiprows_list', skiprows=np.array([0, 2]))
    tm.assert_frame_equal(actual, expected)
def test_read_excel_squeeze(self):
    # GH 12157
    f = os.path.join(self.dirpath, 'test_squeeze' + self.ext)

    actual = pd.read_excel(f, 'two_columns', index_col=0, squeeze=True)
    expected = pd.Series([2, 3, 4], [4, 5, 6], name='b')
    expected.index.name = 'a'
    tm.assert_series_equal(actual, expected)

    actual = pd.read_excel(f, 'two_columns', squeeze=True)
    expected = pd.DataFrame({'a': [4, 5, 6], 'b': [2, 3, 4]})
    tm.assert_frame_equal(actual, expected)

    actual = pd.read_excel(f, 'one_column', squeeze=True)
    expected = pd.Series([1, 2, 3], name='a')
    tm.assert_series_equal(actual, expected)
def test_int_types(self):
    _skip_if_no_xlrd()

    for np_type in (np.int8, np.int16, np.int32, np.int64):
        with ensure_clean(self.ext) as path:
            # Test np.int values read come back as int (rather than float
            # which is Excel's format).
            frame = DataFrame(np.random.randint(-10, 10, size=(10, 2)),
                              dtype=np_type)
            frame.to_excel(path, 'test1')
            reader = ExcelFile(path)
            recons = read_excel(reader, 'test1')
            int_frame = frame.astype(np.int64)
            tm.assert_frame_equal(int_frame, recons)
            recons2 = read_excel(path, 'test1')
            tm.assert_frame_equal(int_frame, recons2)

            # test with convert_float=False comes back as float
            float_frame = frame.astype(float)
            recons = read_excel(path, 'test1', convert_float=False)
            tm.assert_frame_equal(recons, float_frame,
                                  check_index_type=False,
                                  check_column_type=False)
def test_sheets(self):
    _skip_if_no_xlrd()

    with ensure_clean(self.ext) as path:
        self.frame['A'][:5] = nan

        self.frame.to_excel(path, 'test1')
        self.frame.to_excel(path, 'test1', columns=['A', 'B'])
        self.frame.to_excel(path, 'test1', header=False)
        self.frame.to_excel(path, 'test1', index=False)

        # Test writing to separate sheets
        writer = ExcelWriter(path)
        self.frame.to_excel(writer, 'test1')
        self.tsframe.to_excel(writer, 'test2')
        writer.save()
        reader = ExcelFile(path)
        recons = read_excel(reader, 'test1', index_col=0)
        tm.assert_frame_equal(self.frame, recons)
        recons = read_excel(reader, 'test2', index_col=0)
        tm.assert_frame_equal(self.tsframe, recons)
        np.testing.assert_equal(2, len(reader.sheet_names))
        np.testing.assert_equal('test1', reader.sheet_names[0])
        np.testing.assert_equal('test2', reader.sheet_names[1])
def test_colaliases(self):
    _skip_if_no_xlrd()

    with ensure_clean(self.ext) as path:
        self.frame['A'][:5] = nan

        self.frame.to_excel(path, 'test1')
        self.frame.to_excel(path, 'test1', columns=['A', 'B'])
        self.frame.to_excel(path, 'test1', header=False)
        self.frame.to_excel(path, 'test1', index=False)

        # column aliases
        col_aliases = Index(['AA', 'X', 'Y', 'Z'])
        self.frame2.to_excel(path, 'test1', header=col_aliases)
        reader = ExcelFile(path)
        rs = read_excel(reader, 'test1', index_col=0)
        xp = self.frame2.copy()
        xp.columns = col_aliases
        tm.assert_frame_equal(xp, rs)
def test_to_excel_multiindex(self):
    _skip_if_no_xlrd()

    frame = self.frame
    arrays = np.arange(len(frame.index) * 2).reshape(2, -1)
    new_index = MultiIndex.from_arrays(arrays, names=['first', 'second'])
    frame.index = new_index

    with ensure_clean(self.ext) as path:
        frame.to_excel(path, 'test1', header=False)
        frame.to_excel(path, 'test1', columns=['A', 'B'])

        # round trip
        frame.to_excel(path, 'test1', merge_cells=self.merge_cells)
        reader = ExcelFile(path)
        df = read_excel(reader, 'test1', index_col=[0, 1],
                        parse_dates=False)
        tm.assert_frame_equal(frame, df)

    # Test for Issue 11328. If column indices are integers, make
    # sure they are handled correctly for either setting of
    # merge_cells
def test_to_excel_multiindex_dates(self):
    _skip_if_no_xlrd()

    # try multiindex with dates
    tsframe = self.tsframe.copy()
    new_index = [tsframe.index, np.arange(len(tsframe.index))]
    tsframe.index = MultiIndex.from_arrays(new_index)

    with ensure_clean(self.ext) as path:
        tsframe.index.names = ['time', 'foo']
        tsframe.to_excel(path, 'test1', merge_cells=self.merge_cells)
        reader = ExcelFile(path)
        recons = read_excel(reader, 'test1', index_col=[0, 1])

        tm.assert_frame_equal(tsframe, recons)
        self.assertEqual(recons.index.names, ('time', 'foo'))
def test_to_excel_multiindex_no_write_index(self):
    _skip_if_no_xlrd()

    # Test writing and re-reading a MI without the index. GH 5616.

    # Initial non-MI frame.
    frame1 = DataFrame({'a': [10, 20], 'b': [30, 40], 'c': [50, 60]})

    # Add a MI.
    frame2 = frame1.copy()
    multi_index = MultiIndex.from_tuples([(70, 80), (90, 100)])
    frame2.index = multi_index

    with ensure_clean(self.ext) as path:
        # Write out to Excel without the index.
        frame2.to_excel(path, 'test1', index=False)

        # Read it back in.
        reader = ExcelFile(path)
        frame3 = read_excel(reader, 'test1')

        # Test that it is the same as the initial frame.
        tm.assert_frame_equal(frame1, frame3)
def test_datetimes(self):
    # Test writing and reading datetimes. For issue #9139. (xref #9185)
    _skip_if_no_xlrd()

    datetimes = [datetime(2013, 1, 13, 1, 2, 3),
                 datetime(2013, 1, 13, 2, 45, 56),
                 datetime(2013, 1, 13, 4, 29, 49),
                 datetime(2013, 1, 13, 6, 13, 42),
                 datetime(2013, 1, 13, 7, 57, 35),
                 datetime(2013, 1, 13, 9, 41, 28),
                 datetime(2013, 1, 13, 11, 25, 21),
                 datetime(2013, 1, 13, 13, 9, 14),
                 datetime(2013, 1, 13, 14, 53, 7),
                 datetime(2013, 1, 13, 16, 37, 0),
                 datetime(2013, 1, 13, 18, 20, 52)]

    with ensure_clean(self.ext) as path:
        write_frame = DataFrame.from_items([('A', datetimes)])
        write_frame.to_excel(path, 'Sheet1')
        read_frame = read_excel(path, 'Sheet1', header=0)

        tm.assert_series_equal(write_frame['A'], read_frame['A'])

    # GH7074
def pingan_trust():
    result = get_total(start_date=START_DATE, end_date='2016-05-10')
    # issuers = my_db.getCompanyList()
    # The original Chinese file and column names below were lost to encoding.
    issuers = pd.read_excel('../peace/??????.xlsx', sheetname=[0], header=0)[0]
    issuers.columns = ['name']
    focus = issuers.merge(result, on='name', how='left')
    focus = focus.sort_values('??', axis=0, ascending=False)
    import time
    time_str = time.strftime('%Y%m%d', time.localtime(time.time()))
    focus['rptDate'] = time_str
    insert_into_db(focus)
    report = focus.dropna(axis=0, how='any', thresh=3)
    report.to_excel("shixin_zhixing_bank.xlsx")
def getModle():
    fixedEvaluation = pd.read_excel(MODLE_FILE_NAME, sheetname=[0], header=0, skiprows=[0])
    industryTbl = pd.read_excel(MODLE_FILE_NAME, sheetname=[1], index_col=2, parse_cols="B:L", header=3)
    trendTbl = pd.read_excel(MODLE_FILE_NAME, sheetname=[2], header=2, skiprows=[0])
    fluctuationTbl = pd.read_excel(MODLE_FILE_NAME, sheetname=[3], header=2, skiprows=[0])
    fixedScoreTble = pd.read_excel(MODLE_FILE_NAME, sheetname=[4], header=0, skiprows=[0])
    df = pd.read_excel(DATA_FILE, sheetname=[1], header=0, index_col=0, verbose=True)
    df[1].head().index
    df[1].head().columns
    df[1].head().describe()
    # The original Chinese column labels were lost to encoding.
    df[1].head().loc[:, ['????', '?????']]
    for i in range(df[1].head().iloc[1].count()):
        print(df[1].head().iloc[1][i])
    head = df[1].head()
    head.values[0][1:40].reshape(13, 3)
def get_history_bar(field_names, start_date, end_date, **kwargs):
    # The original Chinese sheet name was lost to encoding.
    field_info = pd.read_excel(argInfoWB, sheetname='????', engine='xlrd')
    if not isinstance(field_names, list):
        field_names = [field_names]
    _l = []
    w.start()
    for fieldName in field_names:
        # Look up the Wind field name and arguments for this factor.
        field_name = field_info[field_info['FactorName'] == fieldName]['FieldName'].iat[0]
        args = field_info[field_info['FactorName'] == fieldName]['Args'].iat[0]
        params = _parse_args(args, **kwargs)
        all_days = data_api.tc.get_trade_days(start_date, end_date)
        all_ids = data_api.get_history_ashare(all_days).index.levels[1].unique()
        data = w.wsd(list(map(tradecode_to_windcode, all_ids)),
                     field_name, start_date, end_date, params)
        _l.append(_bar_to_dataframe(data))
    data = pd.concat(_l, axis=1)
    w.close()
    return data
def read_annotated_files(dirname):
    messages = []
    labels = np.zeros(0)
    filenames = glob.glob(os.path.join(dirname, '*.xls*'))
    for filename in filenames:
        print('Reading %s' % filename, end='. ', flush=True)
        df = pd.read_excel(filename)
        print("Found %d new samples" % df[df.LABEL.notnull()].shape[0])
        labels = np.hstack((labels, np.array(df[df.LABEL.notnull()].LABEL.tolist(), dtype=int)))
        messages += df[df.LABEL.notnull()].text.tolist()
    return messages, labels
def setUp(self):
    trans = {
        'Linear': ['A'],
        'Random': ['B'],
        'Wave': ['C', 'D']}

    system_name = 'Simple'
    file_name = join(simpleexampledir, 'simple.xlsx')
    df = pd.read_excel(file_name)

    self.pm = pecos.monitoring.PerformanceMonitoring()
    self.pm.add_dataframe(df, system_name)
    self.pm.add_translation_dictionary(trans, system_name)
    self.pm.check_timestamp(900)
    clock_time = self.pm.get_clock_time()
    time_filter = (clock_time > 3 * 3600) & (clock_time < 21 * 3600)
    self.pm.add_time_filter(time_filter)
def unet_cross_val(data_dir, out_dir, mapping, splits, unet_conf):
    # Load spreadsheet
    with pd.ExcelFile(mapping) as xls:
        df = pd.read_excel(xls, 'Sheet1').set_index('index')
        df['class'] = df['class'].map({'preplus': 'pre-plus', 'normal': 'normal', 'plus': 'plus'})

    img_dir = join(data_dir, 'images')
    seg_dir = join(data_dir, 'manual_segmentations')
    mask_dir = join(data_dir, 'masks')

    # Check whether all images exist
    check_images_exist(df, img_dir, seg_dir, mask_dir)

    # Now split into training and testing
    CVFile = sio.loadmat(splits)

    # Combining No and Pre-Plus
    trainPrePIndex = CVFile['trainPrePIndex'][0]
    testPrePIndex = CVFile['testPrePIndex'][0]
    prep_dir = make_sub_dir(out_dir, 'trainTestPreP')

    print("Generating splits for combined Pre-Plus and Plus")
    generate_splits(trainPrePIndex, testPrePIndex, df, img_dir, mask_dir, seg_dir, prep_dir)

    # Train models
    train_and_test(prep_dir, unet_conf, processes=1)
def pca_augmentation(data_h5, excel_path):
    f = h5py.File(data_h5, 'r')

    df1 = pd.read_excel(excel_path, sheetname=0, header=1)
    df1 = df1.rename(columns=lambda x: x.strip()).set_index('Image')  # strip whitespace
    df2 = pd.read_excel(excel_path, sheetname=1, header=1)
    df2 = df2.rename(columns=lambda x: x.strip()).set_index('Image')  # strip whitespace
    df = pd.concat([df1, df2])

    X = preprocess_data(f)
    X_mean = np.mean(X, axis=0)
    X = X - X_mean

    # PCA
    pca = PCA().fit(X)
def setDataFrameFromFile(self, filepath, **kwargs):
    """
    Sets the model's dataFrame by reading a file.
    Accepted file formats:
        - .xlsx (sheet1 is read unless specified in kwargs)
        - .csv (comma separated unless specified in kwargs)
        - .txt (any separator)

    :param filepath: (str)
        The path to the file to be read.
    :param kwargs:
        pandas.read_csv(**kwargs) or pandas.read_excel(**kwargs)

    :return: None
    """
    df = superReadFile(filepath, **kwargs)
    self.setDataFrame(df, filePath=filepath)
def get_dataset_rows(meta):
    """
    If filename is a xlsx file, returns a list of lists of dicts;
    else, returns a list of dicts.

    Relies on get_dataset_local_filename() and meta['slug'].
    Right now wishing I had made this all OOP...
    """
    srcpath = meta['local_filepath']
    if meta['filetype'] == 'workbook':
        import pandas as pd
        sheetindices = list(range(len(meta['gsheet']['sheets'])))
        dfs = pd.read_excel(str(srcpath), sheetname=sheetindices)
        return [x.to_dict('records') for x in dfs.values()]
    else:
        # assume CSV
        return list(DictReader(srcpath.open('r')))
def initGraph(self, name):
    # file reading using pandas
    df = pd.read_excel(name, sheetname='Sheet1')
    dim = df['Dimension']
    x = df['X']
    y = df['Y']
    self.numCities = len(dim)

    # set and fill the adjMatrix
    self.adjMatrix = [[1.0 for i in range(self.numCities)]
                      for j in range(self.numCities)]
    for i in range(self.numCities):
        for j in range(self.numCities):
            # fill the adjMatrix with euclidean distances between city coordinates
            self.adjMatrix[i][j] = self.calEdge(x[i], x[j], y[i], y[j])
def extract_lstm_test(dictionary, file_name, tag_num=CLASS_NUM, col_tag=0, col_content=1, length=MAX_LENGTH):
    contents = pd.read_excel(file_name, header=None)
    cw = lambda x: [word.encode('utf-8') for word in jieba.cut(x)
                    if word not in stopwords and word.strip() != ''
                    and word.encode('utf-8') in dictionary.index]
    contents['words'] = contents[col_content].apply(cw)
    get_sent = lambda x: list(dictionary['id'][x])
    contents['sent'] = contents['words'].apply(get_sent)

    # Pad every sequence to the same fixed length.
    print("Pad sequences (samples x time)")
    contents['sent'] = list(sequence.pad_sequences(contents['sent'], maxlen=length))
    x = np.array(list(contents['sent']))

    # One-hot encode the labels.
    y = np.zeros((len(list(contents[col_tag])), tag_num))
    for i in range(len(list(contents[col_tag]))):
        for j in range(tag_num):
            if contents[col_tag][i] == j:
                y[i][j] = 1
    return x, y
def extract_dictionary_feature(file_name, col_tag=0, col_content=1):
    # Load the sentiment vocabularies.
    adv = codecs.open('./data/vocabulary/adv.txt', 'rb', encoding='utf-8').read().split('\n')
    inverse = codecs.open('./data/vocabulary/inverse.txt', 'rb', encoding='utf-8').read().split('\n')
    negdict = codecs.open('./data/vocabulary/negdict.txt', 'rb', encoding='utf-8').read().split('\n')
    posdict = codecs.open('./data/vocabulary/posdict.txt', 'rb', encoding='utf-8').read().split('\n')

    contents = pd.read_excel(file_name, header=None)
    print('cut words...')
    cw = lambda x: [pair for pair in psg.lcut(x) if pair.word not in stopwords]
    contents['pairs'] = contents[col_content].apply(cw)
    matrix = reviews2matrix(list(contents['pairs']), posdict, negdict, inverse, adv)
    x = matrix2vec(matrix)
    y = list(contents[col_tag])
    return x, y
def read():
    df = pd.read_excel("jjs1.xlsx")
    data = list(df.iloc[:, 7])
    prices = []
    for item in data:
        if not str(item) == "nan":
            prices.append(int(item))
    print(prices[0:100])
    matplotlib.style.use('ggplot')  # use the ggplot style
    ts = pd.Series(prices[0:100], index=pd.date_range('1/1/2000', periods=100))
    plt.figure()
    df.plot.hist(alpha=0.5)
    plt.legend()
    plt.show()
def loadfile():
    neg = pd.read_excel(datadir + '/neg.xls', header=None)
    pos = pd.read_excel(datadir + '/pos.xls', header=None)

    cw = lambda x: list(jieba.cut(x))
    pos['words'] = pos[0].apply(cw)
    neg['words'] = neg[0].apply(cw)

    # use 1 for positive sentiment, 0 for negative
    y = np.concatenate((np.ones(len(pos)), np.zeros(len(neg))))

    x_train, x_test, y_train, y_test = train_test_split(
        np.concatenate((pos['words'], neg['words'])), y, test_size=0.2)

    np.save(modeldir + '/y_train.npy', y_train)
    np.save(modeldir + '/y_test.npy', y_test)
    return x_train, x_test
def load(self, pth):
    with open(pth, 'rb') as fh:  # Excel files must be opened in binary mode
        data = read_excel(fh, sheetname='data')
    print(data.as_matrix(columns=data.columns[1:]))
    return data.as_matrix()

# todo: pandas formats - http://pandas.pydata.org/pandas-docs/stable/io.html
#   hdf5
#   sql
# todo: hdf5 - http://stackoverflow.com/a/9619713/723090
# todo: bloscpack http://stackoverflow.com/a/22225337/723090
# todo: pytables
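Many of the examples above were written against older pandas releases: the sheetname keyword was renamed to sheet_name in pandas 0.21, and parse_cols was replaced by usecols around the same time, with the old names later removed. A minimal sketch of the modern equivalents (the file name is a placeholder, not taken from any project above):

import pandas as pd

# sheet_name and usecols replace the older sheetname and parse_cols
# keywords seen in several examples above; "report.xlsx" is hypothetical.
df = pd.read_excel("report.xlsx", sheet_name=0, usecols=[0, 1, 2])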