Python pandas module: ExcelFile() example source code
The following 24 code examples, extracted from open-source Python projects, illustrate how to use pandas.ExcelFile().
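All of the examples share one core pattern: open the workbook once with pd.ExcelFile, inspect its sheet_names, and parse individual sheets into DataFrames. A minimal sketch of that pattern follows; the file name 'report.xlsx' is a placeholder.

import pandas as pd

# 'report.xlsx' is a placeholder file name
xl = pd.ExcelFile('report.xlsx')
print(xl.sheet_names)             # list of sheet names in the workbook
# parsing reuses the already-open workbook, so the file is not re-read per sheet
df = xl.parse(xl.sheet_names[0])
print(df.head())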
def load_file(file_name, sheet_name_list):
    '''
    Load an xlsx file into a dictionary indexed by sheet name.
    :param string file_name: name of the file
    :param [string] sheet_name_list: names of the selected sheets in the xlsx file
    :return: {string: DataFrame} raw_data: {sheet name: raw data retrieved from the xlsx,
        with columns and index relabeled 0, 1, 2, ...}
    '''
    print('loading file...')
cut_head = 2
file = pd.ExcelFile(file_name)
raw_data = {}
    # iterate over every sheet and collect the useful data into raw_data
    for sheet_name in sheet_name_list:
        print('parsing sheet', sheet_name)
        # parse a sheet from the workbook into a DataFrame with the header rows cut off
        temp = file.parse(sheet_name).iloc[cut_head:, :]
        # the parsed data has dtype object because it contains strings; convert it to float
        temp = temp.astype(float)
        # relabel index and columns as 0, 1, 2, ...
        temp.columns = range(temp.shape[1])
        temp.index = range(temp.shape[0])
        temp.fillna(0, inplace=True)
        raw_data[sheet_name] = temp
return raw_data
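A usage sketch for load_file; the workbook name and sheet names are hypothetical, and each sheet is assumed to hold numeric data below two header rows, as the function expects.

# hypothetical workbook and sheet names
raw_data = load_file('measurements.xlsx', ['run1', 'run2'])
print(raw_data['run1'].shape)    # DataFrame of floats, relabeled 0, 1, 2, ...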
def read_data(file):
"""Read data about the system from the excel file and assign
it to different parameters
Args:
file: excel file, which stores all data about considered system
Returns:
cst: constants, which describe the system
srs: parameters, which are variable with time
U: list of possible decisions
states: values, which set number of states and characterize
all possible ones
"""
xls = pd.ExcelFile(file)
    states = xls.parse('DP-States', index_col=[0])
    cst = xls.parse('Constants', index_col=[0])['Value']
    srs = xls.parse('Time-Series', index_col=[0])
    U = xls.parse('DP-Decisions', index_col=[0])['Decisions'].values
    return cst, srs, U, states
###################################
#for 2 states - temperature and heat-storage
###################################
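A sketch of how read_data might be called; 'system.xlsx' is a placeholder, and the workbook must contain the four sheets the function parses.

# 'system.xlsx' is a placeholder; the workbook needs the sheets
# DP-States, Constants, Time-Series and DP-Decisions
cst, srs, U, states = read_data('system.xlsx')
print(cst.index.tolist())        # names of the constants
print(U)                         # array of possible decisions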
def getExcelData(self):
"""
get data from 'hsi_futures.xlsx'
Date | Open | High | Low | Close | SMAVG5 | SMAVG10 | SMAVG15 | Volume | VolumeSMAVG5
:return: data table
"""
    xl = pd.ExcelFile("../dataManager/hsi_futures.xlsx")
    # print(xl.sheet_names)
    # parse each sheet from the already-open workbook and stack them
    frames = [xl.parse(sheet) for sheet in xl.sheet_names]
    df = pd.concat(frames, ignore_index=True)
    df['Date'] = pd.to_datetime(df['Date'])
    df.sort_values("Date", ascending=True, inplace=True)
    data = df.reset_index(drop=True)
return data
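getExcelData parses every sheet and stacks them into one DataFrame. A common alternative, not used by the project above, is read_excel with sheet_name=None, which returns a dict of DataFrames keyed by sheet name:

import pandas as pd

# sheet_name=None returns {sheet name: DataFrame} for the whole workbook
sheets = pd.read_excel("../dataManager/hsi_futures.xlsx", sheet_name=None)
df = pd.concat(sheets.values(), ignore_index=True)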
def get_clinical_template(study):
"""Try to detect the clinical template file in the source dir and open it with pandas."""
clinical_templates = [template for template in study.excel_files if "clin" in template.lower() and
"~$" not in template]
clinical_template = None
if len(clinical_templates) == 1:
clinical_template_name = clinical_templates[0]
clinical_template = pd.ExcelFile(clinical_template_name, comment="#")
print("[INFO] Clinical data template detected: " + clinical_template_name)
elif len(clinical_templates) == 0:
study.clinical_template_present = False
print("[WARNING] No clinical data template could be detected. Assuming only high-dimensional templates " +
"are available.")
else:
print("[ERROR] Too many clinical data templates were found. " +
"Make sure only one file has 'clinical' in its name.")
Validity.list_length(clinical_templates, expected=1)
return clinical_template
def add_general_study_metadata(study, study_metadata_template_path):
"""Read the data from general study level metadata template and write to tags file."""
metadata = pd.ExcelFile(study_metadata_template_path, comment="#")
if len(metadata.sheet_names) > 1:
print("[WARNING] Multiple sheets detected in general study metadata template. Assuming first sheet.")
df = metadata.parse(0, header=None)
tag_index = 10
for __, row in df.iterrows():
data = row[row.first_valid_index():].dropna().tolist()
if len(data) == 2:
tag = data[0]
value = data[1]
study.all_metadata.add(("\\", tag, value, tag_index))
tag_index += 1
study.write_metadata()
def read_excel(filename):
"""Read input Excel file and return dict of DataFrames for each sheet.
Read an Excel spreadsheet with geographic input data.
Args:
        filename: path to an Excel spreadsheet with 'Vertex' and 'Edge' sheets
Returns:
dict of 2 pandas DataFrames
"""
with pd.ExcelFile(filename) as xls:
vertex = xls.parse('Vertex').set_index('Vertex')
edge = xls.parse('Edge').set_index(['Edge', 'Vertex1', 'Vertex2'])
data = {
'Vertex': vertex,
'Edge': edge}
return data
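read_excel above opens the workbook in a with block, so the underlying file handle is closed when the block exits. A usage sketch with a placeholder file name:

# 'network.xlsx' is a placeholder; the workbook needs 'Vertex' and 'Edge' sheets
data = read_excel('network.xlsx')
print(data['Vertex'].head())
print(data['Edge'].head())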
def _compare_xlsx(self, file1, file2, rtol=1e-02, atol=1e-03):
# print("requested compare: {} and {}".format(file1, file2))
xl1 = pd.ExcelFile(file1)
xl2 = pd.ExcelFile(file2)
self.assertEqual(xl1.sheet_names, xl2.sheet_names)
for sheet in xl1.sheet_names:
# print("Prrocessing sheet {}".format(sheet))
df1 = xl1.parse(sheet)
df2 = xl2.parse(sheet)
columns1 = list(df1)
columns2 = list(df2)
self.assertEqual(len(columns1), len(columns2))
arr1 = df1.values
arr2 = df2.values
self.assertEqual(arr1.shape, arr2.shape)
for x, y in np.ndindex(arr1.shape):
v1 = arr1[x, y]
v2 = arr2[x, y]
# print("{}: ({}, {}): {} vs {}".format(sheet, x, y, v1, v2))
if isinstance(v1, six.string_types) or isinstance(v2, six.string_types):
self.assertEqual(v1, v2)
else:
npt.assert_allclose(v1, v2, rtol=rtol, atol=atol)
def parse_xls(fpath):
    '''
    Load the xls data.
    :return: a pandas DataFrame
    '''
book = xlrd.open_workbook(fpath)
sheet_names = book.sheet_names()
print("sheet_names:", sheet_names)
xl_sheet = book.sheet_by_name(sheet_names[0])
title_row = xl_sheet.row_values(0) # title row
print(title_row)
xls_file = pd.ExcelFile(fpath)
sht1 = xls_file.sheet_names[0]
df = xls_file.parse(sht1)
    print(df.iloc[0])
    print(df.iloc[1])
print(df[3:])
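parse_xls opens the workbook twice, once with xlrd and once with pandas. The same inspection can be done with pandas alone; a sketch with a placeholder path:

import pandas as pd

xls_file = pd.ExcelFile('data.xls')          # placeholder path
print("sheet_names:", xls_file.sheet_names)  # the same list xlrd reported
df = xls_file.parse(xls_file.sheet_names[0])
print(df.columns.tolist())                   # the title row becomes the header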
def load_supervision_authorities(self, *args, **options):
excel_file = pd.ExcelFile(options['filename'])
state_names = excel_file.sheet_names
for state_name in state_names:
state = State.objects.get(name=state_name)
df = excel_file.parse(state_name)
for _, row in df.iterrows():
try:
email = stringify(row['email'])
if email:
email = email.splitlines()[0]
email = EMAIL_RE.search(email)
email = email.group(0).strip() if email is not None else ''
email = email.lower()
authority, created = SupervisionAuthority.objects.update_or_create(
state=state, name=stringify(row['name']), defaults=dict(
address=stringify(row['address']),
contact=stringify(row['contact']),
email=email,
url=stringify(row['url']),
report_url=stringify(row.get(u'Verfügbare Berichte', ''))
)
)
if created:
print(authority)
except Exception:
print(row['name'])
raise
def assign_brandenburg(self, *args, **options):
brandenburg_state = State.objects.get(name='Brandenburg')
excel_file = pd.ExcelFile(options['filename'])
df = excel_file.parse('Brandenburg')
assigned_auths = defaultdict(list)
locations = {}
for _, row in df.iterrows():
auth = SupervisionAuthority.objects.get(state=brandenburg_state, name=row['name'])
locations[auth] = GEOSGeometry('POINT(%f %f)' % (row['lng'], row['lat']), srid=4326)
assigned_districts = row[u'Landkreis-Zuständigkeit'].splitlines()
for district_name in assigned_districts:
districts = District.objects.filter(part_of=brandenburg_state, name=district_name)
if len(districts) != 1:
print(district_name)
print(districts)
else:
assigned_auths[districts[0]].append(auth)
for nursinghome in NursingHome.objects.filter(supervision_authority__isnull=True,
state=brandenburg_state):
district = District.objects.get(geom__covers=nursinghome.geo)
auths = assigned_auths[district]
if len(auths) == 1:
nursinghome.supervision_authority = auths[0]
nursinghome.save()
else:
min_distance = None
best_auth = None
for auth, point in locations.items():
if auth not in auths:
continue
dist = NursingHome.objects.filter(pk=nursinghome.pk
).annotate(distance=Distance('geo', point))
dist = dist[0].distance.m
if min_distance is None or dist < min_distance:
min_distance = dist
best_auth = auth
nursinghome.supervision_authority = best_auth
nursinghome.save()
def assign_rheinlandpfalz(self, *args, **options):
rp_state = State.objects.get(name='Rheinland-Pfalz')
excel_file = pd.ExcelFile(options['filename'])
df = excel_file.parse('Rheinland-Pfalz')
assigned = defaultdict(list)
for _, row in df.iterrows():
auth = SupervisionAuthority.objects.get(state=rp_state, name=row['name'])
district_names = row[u'Landkreis-Zuständigkeit'].splitlines()
for district_name in district_names:
only = None
if '|' in district_name:
district_name, only = district_name.split('|')
only = only.split(',')
districts = District.objects.filter(part_of=rp_state, name=district_name)
if len(districts) == 0:
districts = District.objects.filter(part_of=rp_state, name__contains=district_name)
if len(districts) == 0:
districts = District.objects.filter(part_of=rp_state, name__contains=district_name.split()[0])
if len(districts) == 0:
                districts = District.objects.filter(part_of=rp_state, name__istartswith=re.sub(r'\W', '', district_name))
if len(districts) > 1:
if 'Kreis' in district_name:
districts = districts.filter(kind_detail__contains='Landkreis')
if 'Stadt' in district_name:
districts = districts.filter(kind_detail__contains='Stadt')
if len(districts) != 1:
print(districts)
print(u'District not one: %s' % district_name)
continue
assigned[auth].append((districts[0], only))
for auth, district_list in assigned.items():
for district, only in district_list:
if only is None:
NursingHome.objects.filter(state=rp_state, district=district, supervision_authority__isnull=True).update(supervision_authority=auth)
continue
for muni_name in only:
muni_name = muni_name.strip()
munis = Municipality.objects.filter(part_of=district, name__contains=muni_name)
if len(munis) > 1:
munis = Municipality.objects.filter(part_of=district, name=muni_name)
if len(munis) != 1:
print('Did not find %s' % muni_name)
continue
muni = munis[0]
NursingHome.objects.filter(state=rp_state, district=district, supervision_authority__isnull=True, geo__coveredby=muni.geom).update(supervision_authority=auth)
def get_sanitized_dataframe(self, date_column='Date'):
    # ExcelFile does not handle file paths very well, use native Python open;
    # the with block ensures the file handle is closed again
    with open(self.file_path, 'rb') as opened_file:
        excel_file = pd.ExcelFile(opened_file)
        dataframe = excel_file.parse(self.sheet)
# Sanitize so the inputs are correct and remove fluke days
dataframe = self._sanitize_sheet(dataframe)
dataframe = self._set_dataframe_index(dataframe, date_column)
return dataframe
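pd.ExcelFile accepts open binary file objects as well as paths, which is what the method above relies on. A minimal standalone sketch of that form, with a placeholder path:

import pandas as pd

with open('daily_log.xlsx', 'rb') as fh:  # placeholder path
    excel_file = pd.ExcelFile(fh)         # ExcelFile also accepts file objects
    dataframe = excel_file.parse(0)       # first sheet by position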
def loadFile(self, filename):
if filename.endswith('.xls') or filename.endswith('.xlsx'):
        xl = pd.ExcelFile(filename)
        sheetnames = xl.sheet_names
        dialog = ExcelFileDialog(filename, sheetnames, self)
        dialog.accepted.connect(self.loadExcel)
        dialog.show()
else:
table = QTableView()
table.setAlternatingRowColors(True)
model=DataFrameTableModel(filename=filename)
model.trackDataChange.connect(self.trackChanges)
table.setModel(model)
### Set some variables ###
table.headers = table.horizontalHeader()
table.vHeaders=table.verticalHeader()
#### Set context menu for table headers ####
table.headers.setContextMenuPolicy(Qt.CustomContextMenu)
table.headers.customContextMenuRequested.connect(self.headerMenu)
table.vHeaders.setContextMenuPolicy(Qt.CustomContextMenu)
table.vHeaders.customContextMenuRequested.connect(self.vHeaderMenu)
if model._filename.endswith('.csv'):
df=pd.read_csv(model._filename)
model.setDataFrame(df)
self.tableTabWidget.addTab(table, QFileInfo(model._filename).fileName())
self.tableTabWidget.setCurrentWidget(table)
if model._filename.endswith('.txt'):
delim = str(self.parseDelimiter(model._filename))
if delim == ' ':
df=pd.read_csv(model._filename,delim_whitespace = True)
else:
df=pd.read_csv(model._filename,sep=delim)
model.setDataFrame(df)
self.tableTabWidget.addTab(table, QFileInfo(model._filename).fileName())
self.tableTabWidget.setCurrentWidget(table)
def read_excel(fname, header=None):
"""Read excel into dict.
Args:
fname: name of excel file
        header: the Finland files do not have a header
Output:
dictionary containing the data
"""
xls = ExcelFile(fname)
    if header:
        usecols = [1]
    else:
        usecols = None
    df = xls.parse(xls.sheet_names[0], skiprows=1,
                   usecols=usecols)
# Fix keys
temp = df.to_dict()
    for key in list(temp):
        new_key = key.replace(" - ", "_")
        temp[new_key] = temp.pop(key)
    # Stupid hack for Finland
    if header:
        temp[header] = temp.pop(next(iter(temp)))
return temp
def importData(file):
    try:
        fileData = pd.ExcelFile(file)
    except Exception:
        try:
            fileData = pd.read_csv(file)
        except Exception:
            print("Please provide an excel or csv file")
            return None
return fileData
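A usage sketch for importData; note that it returns a pandas.ExcelFile for Excel input but a DataFrame for CSV input, so the caller has to handle both cases. The file name is a placeholder.

import pandas as pd

data = importData('records.xlsx')         # placeholder path
if isinstance(data, pd.ExcelFile):
    df = data.parse(data.sheet_names[0])  # Excel input: parse a sheet first
else:
    df = data                             # CSV input: already a DataFrame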
#Load for not writing all the time
def setUp(self):
self.data = {
"DC_PEC": "import pandas as pd; from urllib.request import urlretrieve; urlretrieve('https://s3.amazonaws.com/assets.datacamp.com/production/course_998/datasets/battledeath.xlsx', 'battledeath.xlsx')",
"DC_SOLUTION": "xl = pd.ExcelFile('battledeath.xlsx')",
"DC_SCT": "test_object('xl')"
}
def test_step_1(self):
self.data["DC_CODE"] = "xl = pd.ExcelFile('battledeath.xlsx')"
sct_payload = helper.run(self.data)
self.assertTrue(sct_payload['correct'])
def test_pass_1(self):
self.data = {
"DC_PEC": "import pandas as pd; from urllib.request import urlretrieve; urlretrieve('https://s3.amazonaws.com/assets.datacamp.com/production/course_998/datasets/battledeath.xlsx', 'battledeath.xlsx'); from urllib.request import urlretrieve; urlretrieve('https://s3.amazonaws.com/assets.datacamp.com/production/course_998/datasets/battledeath.xlsx', 'battledeath2.xlsx')",
"DC_SOLUTION": "xl = pd.ExcelFile('battledeath.xlsx')",
"DC_CODE": "xl = pd.ExcelFile('battledeath2.xlsx')",
"DC_SCT": '''
def my_converter(x):
return(x.sheet_names)
set_converter(key = "pandas.io.excel.ExcelFile", fundef = my_converter)
test_object('xl')
'''
}
sct_payload = helper.run(self.data)
self.assertTrue(sct_payload['correct'])
def unet_cross_val(data_dir, out_dir, mapping, splits, unet_conf):
# Load spreadsheet
with pd.ExcelFile(mapping) as xls:
df = pd.read_excel(xls, 'Sheet1').set_index('index')
df['class'] = df['class'].map({'preplus': 'pre-plus', 'normal': 'normal', 'plus': 'plus'})
img_dir = join(data_dir, 'images')
seg_dir = join(data_dir, 'manual_segmentations')
mask_dir = join(data_dir, 'masks')
# Check whether all images exist
check_images_exist(df, img_dir, seg_dir, mask_dir)
# Now split into training and testing
CVFile = sio.loadmat(splits)
# # Combining Pre-Plus and Plus
# trainPlusIndex = CVFile['trainPlusIndex'][0]
# testPlusIndex = CVFile['testPlusIndex'][0]
#
# plus_dir = make_sub_dir(out_dir, 'trainTestPlus')
# print "Generating splits for combined No and Pre-Plus"
# generate_splits(trainPlusIndex, testPlusIndex, df, img_dir, mask_dir, seg_dir, plus_dir)
# Combining No and Pre-Plus
trainPrePIndex = CVFile['trainPrePIndex'][0]
testPrePIndex = CVFile['testPrePIndex'][0]
prep_dir = make_sub_dir(out_dir, 'trainTestPreP')
print "Generating splits for combined Pre-Plus and Plus"
generate_splits(trainPrePIndex, testPrePIndex, df, img_dir, mask_dir, seg_dir, prep_dir)
# Train models
train_and_test(prep_dir, unet_conf, processes=1)
# train_and_test(plus_dir, unet_conf, processes=2)
def read_hd_file_template(self, source_dir, hd_template):
"""Try to read the specified template file and send to sheet loading method."""
template_path = os.path.join(source_dir, hd_template)
try:
hd_template_workbook = pd.ExcelFile(template_path, comment="#", dtype=object)
except FileNotFoundError:
raise Validity.TemplateException("Could not find high-dim template file at: {0}".format(template_path))
# except XLRDError:
# raise Validity.TemplateException(
# "High-dim template file at: {0} is not a valid xlsx file.".format(template_path))
self._load_sheets(hd_template_workbook)
def load_data(fname='SeatTest_New.xlsx'):
"""
Load the xlsx using pandas.
:param fname: string location of the file to load
    :return: pandas.ExcelFile object
"""
return pd.ExcelFile(fname)
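load_data returns the ExcelFile object itself, so the caller still parses sheets; a short usage sketch:

xl = load_data()                  # uses the default file name from the signature
df = xl.parse(xl.sheet_names[0])  # pick the first sheet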
def parse_statistics(logfile):
xl = pd.ExcelFile(logfile)
df = xl.parse("Sheet")
df = df.sort_values(by='Line Numbers')
    with pd.ExcelWriter(logfile) as writer:
        df.to_excel(writer, sheet_name='Sheet', index=False)
wb = openpyxl.load_workbook(logfile)
ws = wb.active
row_count = ws.max_row
column_count = ws.max_column
chart = ScatterChart()
chart.title = "Time upload domain names"
chart.style = 13
chart.x_axis.title = "Line numbers"
chart.y_axis.title = "Time, sec"
xvalues = Reference(ws, min_col=1, min_row=2, max_row=row_count)
color_choice = ['3F888F', 'D24D57']
for i in range(2, column_count + 1):
values = Reference(ws, min_col=i, min_row=1, max_row=row_count)
series = Series(values, xvalues, title_from_data=True)
series.marker.symbol = "diamond"
series.graphicalProperties.line.solidFill = color_choice[i-2]
series.marker.graphicalProperties.line.solidFill = color_choice[i-2]
series.marker.graphicalProperties.solidFill = color_choice[i-2]
series.graphicalProperties.line.width = 20000
chart.series.append(series)
chart.legend.legendPos = 'b'
ws.add_chart(chart)
wb.save(logfile)
def loadExcel(self,options):
names = options['sheets']
filename = options['file']
openEach = options['openEach']
df=pd.ExcelFile(filename)
if not openEach:
newTab = QTabWidget()
newTab.setTabsClosable(True)
newTab.currentChanged.connect(self.tabChanged)
self.connect(newTab,SIGNAL("tabCloseRequested(int)"),
self.fileCloseInternalTab)
for i in range(len(names)):
table = QTableView()
table.setAlternatingRowColors(True)
model=DataFrameTableModel(filename=filename)
model.trackDataChange.connect(self.trackChanges)
table.setModel(model)
### Set some variables ###
table.headers = table.horizontalHeader()
table.vHeaders=table.verticalHeader()
#### Set context menu for table headers ####
table.headers.setContextMenuPolicy(Qt.CustomContextMenu)
table.headers.customContextMenuRequested.connect(self.headerMenu)
table.vHeaders.setContextMenuPolicy(Qt.CustomContextMenu)
table.vHeaders.customContextMenuRequested.connect(self.vHeaderMenu)
            df2 = df.parse(sheet_name=names[i])
model.setDataFrame(df2)
newTab.addTab(table,names[i])
newTab.setCurrentIndex(0)
self.tableTabWidget.addTab(newTab,QFileInfo(filename).fileName())
self.tableTabWidget.setCurrentWidget(newTab)
else:
for i in range(len(names)):
table = QTableView()
table.setAlternatingRowColors(True)
model=DataFrameTableModel(filename=names[i])
model.trackDataChange.connect(self.trackChanges)
table.setModel(model)
### Set some variables ###
table.headers = table.horizontalHeader()
table.vHeaders=table.verticalHeader()
#### Set context menu for table headers ####
table.headers.setContextMenuPolicy(Qt.CustomContextMenu)
table.headers.customContextMenuRequested.connect(self.headerMenu)
table.vHeaders.setContextMenuPolicy(Qt.CustomContextMenu)
table.vHeaders.customContextMenuRequested.connect(self.vHeaderMenu)
            df2 = df.parse(sheet_name=names[i])
model.setDataFrame(df2)
self.tableTabWidget.addTab(table,names[i])
self.tableTabWidget.setCurrentWidget(table)
def metadata_file(path, root, tables=None):
'''
Returns the metadata for a file. There are 3 types of file formats:
1. Archives (7z, zip, rar, tar) / compressed (xz, bzip2, gzip). Decompress and process
2. Database (sqlite3, hdf5, xls, xlsx). Process each table/sheet as a sub-dataset
3. Data (csv, json). Process directly
'''
tree = Meta()
format = guess_format(path)
if format is not None:
tree.format = format
if format == 'dir':
tree.datasets = Datasets()
for base, dirs, files in os.walk(path):
for filename in files:
source = os.path.join(base, filename)
name = os.path.relpath(source, path)
tree.datasets[name] = submeta = Meta(name=name, source=source)
try:
submeta.update(metadata_file(source, root, tables))
except Exception as e:
submeta['error'] = str(e)
logging.exception('Unable to get metadata for %s', source)
elif format in {'7z', 'zip', 'rar', 'tar', 'xz', 'gz', 'bz2'}:
tree.datasets = Datasets()
for name, source in unzip_files(path, root, format):
tree.datasets[name] = submeta = Meta(name=name)
try:
submeta.update(metadata_file(source, root, tables))
except Exception as e:
submeta['error'] = str(e)
logging.exception('Unable to get metadata for %s', source)
elif format == 'sqlite3':
tree.update(metadata_sql('sqlite:///' + path, tables))
elif format in {'hdf5', 'xls', 'xlsx'}:
if format == 'hdf5':
store = pd.HDFStore(path)
table_list = store.keys()
store.close()
else:
xls = pd.ExcelFile(path)
table_list = xls.sheet_names
format = 'xlsx'
tree.datasets = Datasets()
for table in table_list:
if tables is None or table in tables:
tree.datasets[table] = Meta([
('name', table),
('format', 'table'),
('command', [format, path, table])
])
elif format == 'csv':
tree.command = ['csv', path]
elif format == 'json':
tree.command = ['json', path]
return tree
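In the xls/xlsx branch above, metadata_file only needs the sheet list, not the sheet contents; each sheet name becomes a sub-dataset. A standalone sketch of that step, with a placeholder path:

import pandas as pd

xls = pd.ExcelFile('survey.xlsx')  # placeholder path
for table in xls.sheet_names:      # one sub-dataset per sheet
    print('table:', table)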