The following 28 code examples, extracted from open-source Python projects, illustrate how to use pandas.ExcelFile().
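Before the project examples, here is a minimal sketch of the typical pattern, shown for orientation only; the workbook name example.xlsx and the sheet name Sheet1 are placeholders, not files from any of the projects below:

import pandas as pd

# Open the workbook once; the ExcelFile object reads the file structure a
# single time and exposes the available sheets.
xls = pd.ExcelFile('example.xlsx')   # placeholder file name
print(xls.sheet_names)               # list the sheets in the workbook

# Parse an individual sheet into a DataFrame; this is the call most of the
# examples below build on.
df = xls.parse('Sheet1')             # placeholder sheet name

Parsing through an ExcelFile is useful when several sheets of the same workbook are needed, because the file is opened only once instead of once per pd.read_excel() call.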
def load_file(file_name, sheet_name_list):
    '''
    load xlsx file into a dictionary indexed by sheet names
    :param string file_name: name of file
    :param [string] sheet_name_list: names of selected sheets in the xlsx file
    :return: {string: DataFrame} raw_data: {name of sheet: pure data retrieved
        from xlsx, with columns and index 0, 1, 2, ...}
    '''
    print('loading file...')
    cut_head = 2
    file = pd.ExcelFile(file_name)
    raw_data = {}
    # iterate over every sheet and retrieve useful data into raw_data
    for i in range(len(sheet_name_list)):
        print('parsing sheet', sheet_name_list[i])
        # parse a sheet from the whole file into a DataFrame with headers cut off
        temp = file.parse(sheet_name_list[i]).iloc[cut_head:, :]
        # temp.dtype is object here because the data read in contains strings;
        # convert it to float (np.float is deprecated, the builtin works)
        temp = temp.astype(float)
        # reset index and columns to 0, 1, 2, ...
        temp.columns = range(temp.shape[1])
        temp.index = range(temp.shape[0])
        temp.fillna(0, inplace=True)
        raw_data[sheet_name_list[i]] = temp
    return raw_data
def read_data(file):
    """Read data about the system from the excel file and assign it to different parameters

    Args:
        file: excel file, which stores all data about the considered system

    Returns:
        cst: constants, which describe the system
        srs: parameters, which are variable with time
        U: list of possible decisions
        states: values, which set the number of states and characterize all possible ones
    """
    xls = pd.ExcelFile(file)
    states = xls.parse('DP-States', index_col=[0])
    cst = xls.parse('Constants', index_col=[0])['Value']
    srs = xls.parse('Time-Series', index_col=[0])
    U = xls.parse('DP-Decisions', index_col=[0])['Decisions'].values
    return cst, srs, U, states

###################################
# for 2 states - temperature and heat-storage
###################################
def getExcelData(self):
    """
    get data from 'hsi_futures.xlsx'
    Date | Open | High | Low | Close | SMAVG5 | SMAVG10 | SMAVG15 | Volume | VolumeSMAVG5
    :return: data table
    """
    xl = pd.ExcelFile("../dataManager/hsi_futures.xlsx")
    # print(xl.sheet_names)
    sheets = xl.sheet_names
    # DataFrame.append was removed in pandas 2.0; concatenate the sheets instead
    df = pd.concat([xl.parse(sheet) for sheet in sheets])
    df['Date'] = pd.to_datetime(df['Date'])
    df.sort_values("Date", ascending=True, inplace=True)
    data = df.reset_index(drop=True)  # re-number rows 0, 1, 2, ...
    return data
def get_clinical_template(study):
    """Try to detect the clinical template file in the source dir and open it with pandas."""
    clinical_templates = [template for template in study.excel_files
                          if "clin" in template.lower() and "~$" not in template]
    clinical_template = None
    if len(clinical_templates) == 1:
        clinical_template_name = clinical_templates[0]
        clinical_template = pd.ExcelFile(clinical_template_name, comment="#")
        print("[INFO] Clinical data template detected: " + clinical_template_name)
    elif len(clinical_templates) == 0:
        study.clinical_template_present = False
        print("[WARNING] No clinical data template could be detected. Assuming only high-dimensional templates " +
              "are available.")
    else:
        print("[ERROR] Too many clinical data templates were found. " +
              "Make sure only one file has 'clinical' in its name.")
        Validity.list_length(clinical_templates, expected=1)
    return clinical_template
def add_general_study_metadata(study, study_metadata_template_path):
    """Read the data from general study level metadata template and write to tags file."""
    metadata = pd.ExcelFile(study_metadata_template_path, comment="#")
    if len(metadata.sheet_names) > 1:
        print("[WARNING] Multiple sheets detected in general study metadata template. Assuming first sheet.")
    df = metadata.parse(0, header=None)
    tag_index = 10
    for __, row in df.iterrows():
        data = row[row.first_valid_index():].dropna().tolist()
        if len(data) == 2:
            tag = data[0]
            value = data[1]
            study.all_metadata.add(("\\", tag, value, tag_index))
            tag_index += 1
    study.write_metadata()
def read_excel(filename):
    """Read input Excel file and return dict of DataFrames for each sheet.

    Read an Excel spreadsheet with geographic input data.

    Args:
        filename: filename to an Excel spreadsheet with 'Vertex' and 'Edge'

    Returns:
        dict of 2 pandas DataFrames
    """
    with pd.ExcelFile(filename) as xls:
        vertex = xls.parse('Vertex').set_index('Vertex')
        edge = xls.parse('Edge').set_index(['Edge', 'Vertex1', 'Vertex2'])
    data = {
        'Vertex': vertex,
        'Edge': edge}
    return data
def _compare_xlsx(self, file1, file2, rtol=1e-02, atol=1e-03):
    # print("requested compare: {} and {}".format(file1, file2))
    xl1 = pd.ExcelFile(file1)
    xl2 = pd.ExcelFile(file2)
    self.assertEqual(xl1.sheet_names, xl2.sheet_names)
    for sheet in xl1.sheet_names:
        # print("Processing sheet {}".format(sheet))
        df1 = xl1.parse(sheet)
        df2 = xl2.parse(sheet)
        columns1 = list(df1)
        columns2 = list(df2)
        self.assertEqual(len(columns1), len(columns2))
        arr1 = df1.values
        arr2 = df2.values
        self.assertEqual(arr1.shape, arr2.shape)
        for x, y in np.ndindex(arr1.shape):
            v1 = arr1[x, y]
            v2 = arr2[x, y]
            # print("{}: ({}, {}): {} vs {}".format(sheet, x, y, v1, v2))
            if isinstance(v1, six.string_types) or isinstance(v2, six.string_types):
                self.assertEqual(v1, v2)
            else:
                npt.assert_allclose(v1, v2, rtol=rtol, atol=atol)
def parse_xls(fpath):
    '''
    load the xls data.
    export: a data frame (in pandas)
    '''
    book = xlrd.open_workbook(fpath)
    sheet_names = book.sheet_names()
    print("sheet_names:", sheet_names)
    xl_sheet = book.sheet_by_name(sheet_names[0])
    title_row = xl_sheet.row_values(0)  # title row
    print(title_row)

    xls_file = pd.ExcelFile(fpath)
    sht1 = xls_file.sheet_names[0]
    df = xls_file.parse(sht1)
    print(df.iloc[0])  # .ix was removed from pandas; use .iloc for positional access
    print(df.iloc[1])
    print(df[3:])
def load_supervision_authorities(self, *args, **options):
    excel_file = pd.ExcelFile(options['filename'])
    state_names = excel_file.sheet_names
    for state_name in state_names:
        state = State.objects.get(name=state_name)
        df = excel_file.parse(state_name)
        for _, row in df.iterrows():
            try:
                email = stringify(row['email'])
                if email:
                    email = email.splitlines()[0]
                    email = EMAIL_RE.search(email)
                    email = email.group(0).strip() if email is not None else ''
                    email = email.lower()
                authority, created = SupervisionAuthority.objects.update_or_create(
                    state=state, name=stringify(row['name']),
                    defaults=dict(
                        address=stringify(row['address']),
                        contact=stringify(row['contact']),
                        email=email,
                        url=stringify(row['url']),
                        report_url=stringify(row.get(u'Verfügbare Berichte', ''))
                    )
                )
                if created:
                    print(authority)
            except Exception:
                print(row['name'])
                raise
def assign_brandenburg(self, *args, **options):
    brandenburg_state = State.objects.get(name='Brandenburg')
    excel_file = pd.ExcelFile(options['filename'])
    df = excel_file.parse('Brandenburg')

    assigned_auths = defaultdict(list)
    locations = {}
    for _, row in df.iterrows():
        auth = SupervisionAuthority.objects.get(state=brandenburg_state, name=row['name'])
        locations[auth] = GEOSGeometry('POINT(%f %f)' % (row['lng'], row['lat']), srid=4326)
        assigned_districts = row[u'Landkreis-Zuständigkeit'].splitlines()
        for district_name in assigned_districts:
            districts = District.objects.filter(part_of=brandenburg_state, name=district_name)
            if len(districts) != 1:
                print(district_name)
                print(districts)
            else:
                assigned_auths[districts[0]].append(auth)

    for nursinghome in NursingHome.objects.filter(supervision_authority__isnull=True,
                                                  state=brandenburg_state):
        district = District.objects.get(geom__covers=nursinghome.geo)
        auths = assigned_auths[district]
        if len(auths) == 1:
            nursinghome.supervision_authority = auths[0]
            nursinghome.save()
        else:
            min_distance = None
            best_auth = None
            for auth, point in locations.items():
                if auth not in auths:
                    continue
                dist = NursingHome.objects.filter(pk=nursinghome.pk).annotate(distance=Distance('geo', point))
                dist = dist[0].distance.m
                if min_distance is None or dist < min_distance:
                    min_distance = dist
                    best_auth = auth
            nursinghome.supervision_authority = best_auth
            nursinghome.save()
def assign_rheinlandpfalz(self, *args, **options):
    rp_state = State.objects.get(name='Rheinland-Pfalz')
    excel_file = pd.ExcelFile(options['filename'])
    df = excel_file.parse('Rheinland-Pfalz')

    assigned = defaultdict(list)
    for _, row in df.iterrows():
        auth = SupervisionAuthority.objects.get(state=rp_state, name=row['name'])
        district_names = row[u'Landkreis-Zuständigkeit'].splitlines()
        for district_name in district_names:
            only = None
            if '|' in district_name:
                district_name, only = district_name.split('|')
                only = only.split(',')
            districts = District.objects.filter(part_of=rp_state, name=district_name)
            if len(districts) == 0:
                districts = District.objects.filter(part_of=rp_state, name__contains=district_name)
            if len(districts) == 0:
                districts = District.objects.filter(part_of=rp_state,
                                                    name__contains=district_name.split()[0])
            if len(districts) == 0:
                districts = District.objects.filter(part_of=rp_state,
                                                    name__istartswith=re.sub(r'\W', '', district_name))
            if len(districts) > 1:
                if 'Kreis' in district_name:
                    districts = districts.filter(kind_detail__contains='Landkreis')
                if 'Stadt' in district_name:
                    districts = districts.filter(kind_detail__contains='Stadt')
            if len(districts) != 1:
                print(districts)
                print(u'District not one: %s' % district_name)
                continue
            assigned[auth].append((districts[0], only))

    for auth, district_list in assigned.items():
        for district, only in district_list:
            if only is None:
                NursingHome.objects.filter(state=rp_state, district=district,
                                           supervision_authority__isnull=True).update(supervision_authority=auth)
                continue
            for muni_name in only:
                muni_name = muni_name.strip()
                munis = Municipality.objects.filter(part_of=district, name__contains=muni_name)
                if len(munis) > 1:
                    munis = Municipality.objects.filter(part_of=district, name=muni_name)
                if len(munis) != 1:
                    print('Did not find %s' % muni_name)
                    continue
                muni = munis[0]
                NursingHome.objects.filter(state=rp_state, district=district,
                                           supervision_authority__isnull=True,
                                           geo__coveredby=muni.geom).update(supervision_authority=auth)
def read_data(file):
    """Read data about the system from the excel file and assign it to different parameters

    Args:
        file: excel file, which stores all data about the considered system

    Returns:
        cst: constants, which describe the system
        srs: parameters, which are variable with time
        U: list of possible decisions
        states: values, which set the number of states and characterize all possible ones
    """
    xls = pd.ExcelFile(file)
    states = xls.parse('DP-States', index_col=[0])
    cst = xls.parse('Constants', index_col=[0])['Value']
    srs = xls.parse('Time-Series', index_col=[0])
    U = xls.parse('DP-Decisions', index_col=[0])['Decisions'].values
    return cst, srs, U, states

###################################
# for 2 states
###################################
def read_data(file):
    xls = pd.ExcelFile(file)
    cst = xls.parse('Constants', index_col=[0])['Value']
    srs = xls.parse('Time-Series', index_col=[0])
    U = xls.parse('DP-Decisions', index_col=[0])['Decisions'].values
    states = xls.parse('DP-States', index_col=[0])
    return cst, srs, U, states
def read_data(file):
    """Read data about the system from the excel file and assign it to different parameters

    Args:
        file: excel file, which stores all data about the considered system

    Returns:
        cst: constants, which describe the system
        srs: parameters, which are variable with time
        U: list of possible decisions
        states: values, which set the number of states and characterize all possible ones
    """
    xls = pd.ExcelFile(file)
    states = xls.parse('DP-States', index_col=[0])
    cst = xls.parse('Constants', index_col=[0])['Value']
    srs = xls.parse('Time-Series', index_col=[0])
    U = xls.parse('DP-Decisions', index_col=[0])['Decisions'].values
    return cst, srs, U, states

###################################
# for 1 state - temperature
###################################
def get_sanitized_dataframe(self, date_column='Date'):
    # ExcelFile does not handle file_paths very well, use native Python open
    opened_file = open(self.file_path, 'rb')
    excel_file = pd.ExcelFile(opened_file)
    dataframe = excel_file.parse(self.sheet)

    # Sanitize so the inputs are correct and remove fluke days
    dataframe = self._sanitize_sheet(dataframe)
    dataframe = self._set_dataframe_index(dataframe, date_column)
    return dataframe
def loadFile(self, filename):
    if filename.endswith('.xls') or filename.endswith('.xlsx'):
        df = pd.ExcelFile(filename)
        sheetnames = df.sheet_names
        dialog = ExcelFileDialog(filename, sheetnames, self)
        dialog.accepted.connect(self.loadExcel)
        dialog.show()
    else:
        table = QTableView()
        table.setAlternatingRowColors(True)
        model = DataFrameTableModel(filename=filename)
        model.trackDataChange.connect(self.trackChanges)
        table.setModel(model)
        ### Set some variables ###
        table.headers = table.horizontalHeader()
        table.vHeaders = table.verticalHeader()
        #### Set context menu for table headers ####
        table.headers.setContextMenuPolicy(Qt.CustomContextMenu)
        table.headers.customContextMenuRequested.connect(self.headerMenu)
        table.vHeaders.setContextMenuPolicy(Qt.CustomContextMenu)
        table.vHeaders.customContextMenuRequested.connect(self.vHeaderMenu)
        if model._filename.endswith('.csv'):
            df = pd.read_csv(model._filename)
            model.setDataFrame(df)
            self.tableTabWidget.addTab(table, QFileInfo(model._filename).fileName())
            self.tableTabWidget.setCurrentWidget(table)
        if model._filename.endswith('.txt'):
            delim = str(self.parseDelimiter(model._filename))
            if delim == ' ':
                df = pd.read_csv(model._filename, delim_whitespace=True)
            else:
                df = pd.read_csv(model._filename, sep=delim)
            model.setDataFrame(df)
            self.tableTabWidget.addTab(table, QFileInfo(model._filename).fileName())
            self.tableTabWidget.setCurrentWidget(table)
def read_excel(fname, header=None):
    """Read excel into dict.

    Args:
        fname: name of excel file
        header: the Finland files do not have a header

    Output:
        dictionary containing the data
    """
    xls = ExcelFile(fname)
    if header:
        usecols = [1]  # 'parse_cols' was renamed to 'usecols' in newer pandas
    else:
        usecols = None
    df = xls.parse(xls.sheet_names[0], skiprows=1, usecols=usecols)

    # Fix keys
    temp = df.to_dict()
    for key in list(temp):  # copy the keys; the dict is mutated inside the loop
        new_key = key.replace(" - ", "_")
        temp[new_key] = temp.pop(key)

    # Stupid hack for Finland
    if header:
        temp[header] = temp.pop(list(temp)[0])  # dict keys are not indexable in Python 3
    return temp
def importData(file):
    try:
        fileData = pd.ExcelFile(file)
    except Exception:
        try:
            fileData = pd.read_csv(file)
        except Exception:
            print("Please provide an excel or csv file")
            fileData = None  # avoid a NameError on the return below when both readers fail
    return fileData

# Load for not writing all the time
def setUp(self):
    self.data = {
        "DC_PEC": "import pandas as pd; from urllib.request import urlretrieve; urlretrieve('https://s3.amazonaws.com/assets.datacamp.com/production/course_998/datasets/battledeath.xlsx', 'battledeath.xlsx')",
        "DC_SOLUTION": "xl = pd.ExcelFile('battledeath.xlsx')",
        "DC_SCT": "test_object('xl')"
    }
def test_step_1(self):
    self.data["DC_CODE"] = "xl = pd.ExcelFile('battledeath.xlsx')"
    sct_payload = helper.run(self.data)
    self.assertTrue(sct_payload['correct'])
def test_pass_1(self):
    self.data = {
        "DC_PEC": "import pandas as pd; from urllib.request import urlretrieve; urlretrieve('https://s3.amazonaws.com/assets.datacamp.com/production/course_998/datasets/battledeath.xlsx', 'battledeath.xlsx'); from urllib.request import urlretrieve; urlretrieve('https://s3.amazonaws.com/assets.datacamp.com/production/course_998/datasets/battledeath.xlsx', 'battledeath2.xlsx')",
        "DC_SOLUTION": "xl = pd.ExcelFile('battledeath.xlsx')",
        "DC_CODE": "xl = pd.ExcelFile('battledeath2.xlsx')",
        "DC_SCT": '''
def my_converter(x):
    return(x.sheet_names)
set_converter(key = "pandas.io.excel.ExcelFile", fundef = my_converter)
test_object('xl')
'''
    }
    sct_payload = helper.run(self.data)
    self.assertTrue(sct_payload['correct'])
def unet_cross_val(data_dir, out_dir, mapping, splits, unet_conf):
    # Load spreadsheet
    with pd.ExcelFile(mapping) as xls:
        df = pd.read_excel(xls, 'Sheet1').set_index('index')

    df['class'] = df['class'].map({'preplus': 'pre-plus', 'normal': 'normal', 'plus': 'plus'})

    img_dir = join(data_dir, 'images')
    seg_dir = join(data_dir, 'manual_segmentations')
    mask_dir = join(data_dir, 'masks')

    # Check whether all images exist
    check_images_exist(df, img_dir, seg_dir, mask_dir)

    # Now split into training and testing
    CVFile = sio.loadmat(splits)

    # # Combining Pre-Plus and Plus
    # trainPlusIndex = CVFile['trainPlusIndex'][0]
    # testPlusIndex = CVFile['testPlusIndex'][0]
    #
    # plus_dir = make_sub_dir(out_dir, 'trainTestPlus')
    # print("Generating splits for combined No and Pre-Plus")
    # generate_splits(trainPlusIndex, testPlusIndex, df, img_dir, mask_dir, seg_dir, plus_dir)

    # Combining No and Pre-Plus
    trainPrePIndex = CVFile['trainPrePIndex'][0]
    testPrePIndex = CVFile['testPrePIndex'][0]

    prep_dir = make_sub_dir(out_dir, 'trainTestPreP')
    print("Generating splits for combined Pre-Plus and Plus")
    generate_splits(trainPrePIndex, testPrePIndex, df, img_dir, mask_dir, seg_dir, prep_dir)

    # Train models
    train_and_test(prep_dir, unet_conf, processes=1)
    # train_and_test(plus_dir, unet_conf, processes=2)
def read_hd_file_template(self, source_dir, hd_template):
    """Try to read the specified template file and send to sheet loading method."""
    template_path = os.path.join(source_dir, hd_template)
    try:
        hd_template_workbook = pd.ExcelFile(template_path, comment="#", dtype=object)
    except FileNotFoundError:
        raise Validity.TemplateException("Could not find high-dim template file at: {0}".format(template_path))
    # except XLRDError:
    #     raise Validity.TemplateException(
    #         "High-dim template file at: {0} is not a valid xlsx file.".format(template_path))
    self._load_sheets(hd_template_workbook)
def load_data(fname='SeatTest_New.xlsx'):
    """
    Load the xlsx using pandas.

    :param fname: string location of the file to load
    :return: pandas object
    """
    return pd.ExcelFile(fname)
def parse_statistics(logfile):
    xl = pd.ExcelFile(logfile)
    df = xl.parse("Sheet")
    df = df.sort_values(by='Line Numbers')
    writer = pd.ExcelWriter(logfile)
    df.to_excel(writer, sheet_name='Sheet', index=False)
    writer.close()  # ExcelWriter.save() was removed in pandas 2.0; close() writes the file

    wb = openpyxl.load_workbook(logfile)
    ws = wb.active
    row_count = ws.max_row
    column_count = ws.max_column

    chart = ScatterChart()
    chart.title = "Time upload domain names"
    chart.style = 13
    chart.x_axis.title = "Line numbers"
    chart.y_axis.title = "Time, sec"

    xvalues = Reference(ws, min_col=1, min_row=2, max_row=row_count)
    color_choice = ['3F888F', 'D24D57']
    for i in range(2, column_count + 1):
        values = Reference(ws, min_col=i, min_row=1, max_row=row_count)
        series = Series(values, xvalues, title_from_data=True)
        series.marker.symbol = "diamond"
        series.graphicalProperties.line.solidFill = color_choice[i - 2]
        series.marker.graphicalProperties.line.solidFill = color_choice[i - 2]
        series.marker.graphicalProperties.solidFill = color_choice[i - 2]
        series.graphicalProperties.line.width = 20000
        chart.series.append(series)

    chart.legend.legendPos = 'b'
    ws.add_chart(chart)
    wb.save(logfile)
def loadExcel(self, options):
    names = options['sheets']
    filename = options['file']
    openEach = options['openEach']
    df = pd.ExcelFile(filename)
    if not openEach:
        newTab = QTabWidget()
        newTab.setTabsClosable(True)
        newTab.currentChanged.connect(self.tabChanged)
        self.connect(newTab, SIGNAL("tabCloseRequested(int)"), self.fileCloseInternalTab)
        for i in range(len(names)):
            table = QTableView()
            table.setAlternatingRowColors(True)
            model = DataFrameTableModel(filename=filename)
            model.trackDataChange.connect(self.trackChanges)
            table.setModel(model)
            ### Set some variables ###
            table.headers = table.horizontalHeader()
            table.vHeaders = table.verticalHeader()
            #### Set context menu for table headers ####
            table.headers.setContextMenuPolicy(Qt.CustomContextMenu)
            table.headers.customContextMenuRequested.connect(self.headerMenu)
            table.vHeaders.setContextMenuPolicy(Qt.CustomContextMenu)
            table.vHeaders.customContextMenuRequested.connect(self.vHeaderMenu)
            df2 = df.parse(sheet_name=names[i])  # 'sheetname' was renamed to 'sheet_name' in pandas
            model.setDataFrame(df2)
            newTab.addTab(table, names[i])
        newTab.setCurrentIndex(0)
        self.tableTabWidget.addTab(newTab, QFileInfo(filename).fileName())
        self.tableTabWidget.setCurrentWidget(newTab)
    else:
        for i in range(len(names)):
            table = QTableView()
            table.setAlternatingRowColors(True)
            model = DataFrameTableModel(filename=names[i])
            model.trackDataChange.connect(self.trackChanges)
            table.setModel(model)
            ### Set some variables ###
            table.headers = table.horizontalHeader()
            table.vHeaders = table.verticalHeader()
            #### Set context menu for table headers ####
            table.headers.setContextMenuPolicy(Qt.CustomContextMenu)
            table.headers.customContextMenuRequested.connect(self.headerMenu)
            table.vHeaders.setContextMenuPolicy(Qt.CustomContextMenu)
            table.vHeaders.customContextMenuRequested.connect(self.vHeaderMenu)
            df2 = df.parse(sheet_name=names[i])
            model.setDataFrame(df2)
            self.tableTabWidget.addTab(table, names[i])
            self.tableTabWidget.setCurrentWidget(table)
def metadata_file(path, root, tables=None):
    '''
    Returns the metadata for a file. There are 3 types of file formats:

    1. Archives (7z, zip, rar, tar) / compressed (xz, bzip2, gzip). Decompress and process
    2. Database (sqlite3, hdf5, xls, xlsx). Process each table/sheet as a sub-dataset
    3. Data (csv, json). Process directly
    '''
    tree = Meta()
    format = guess_format(path)
    if format is not None:
        tree.format = format
    if format == 'dir':
        tree.datasets = Datasets()
        for base, dirs, files in os.walk(path):
            for filename in files:
                source = os.path.join(base, filename)
                name = os.path.relpath(source, path)
                tree.datasets[name] = submeta = Meta(name=name, source=source)
                try:
                    submeta.update(metadata_file(source, root, tables))
                except Exception as e:
                    submeta['error'] = str(e)
                    logging.exception('Unable to get metadata for %s', source)
    elif format in {'7z', 'zip', 'rar', 'tar', 'xz', 'gz', 'bz2'}:
        tree.datasets = Datasets()
        for name, source in unzip_files(path, root, format):
            tree.datasets[name] = submeta = Meta(name=name)
            try:
                submeta.update(metadata_file(source, root, tables))
            except Exception as e:
                submeta['error'] = str(e)
                logging.exception('Unable to get metadata for %s', source)
    elif format == 'sqlite3':
        tree.update(metadata_sql('sqlite:///' + path, tables))
    elif format in {'hdf5', 'xls', 'xlsx'}:
        if format == 'hdf5':
            store = pd.HDFStore(path)
            table_list = store.keys()
            store.close()
        else:
            xls = pd.ExcelFile(path)
            table_list = xls.sheet_names
            format = 'xlsx'
        tree.datasets = Datasets()
        for table in table_list:
            if tables is None or table in tables:
                tree.datasets[table] = Meta([
                    ('name', table),
                    ('format', 'table'),
                    ('command', [format, path, table]),
                ])
    elif format == 'csv':
        tree.command = ['csv', path]
    elif format == 'json':
        tree.command = ['json', path]
    return tree