The following code examples, extracted from open-source Python projects, illustrate how to use pandas.read_html().
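Before the project examples, here is a minimal, self-contained sketch of the pattern they all share: pandas.read_html() parses every matching <table> it finds and returns a list of DataFrames, so the result must be indexed even when only one table is expected. The HTML snippet and column names below are invented for illustration and do not come from any of the projects that follow.

from io import StringIO

import pandas as pd

# A tiny, made-up HTML document containing a single table.
html = """
<table>
  <tr><th>Currency</th><th>Rate</th></tr>
  <tr><td>EUR</td><td>19.85</td></tr>
  <tr><td>USD</td><td>18.20</td></tr>
</table>
"""

# read_html() returns a list of DataFrames, one per matching <table>.
# header=0 promotes the first row to column labels; index_col=0 uses the
# first column ("Currency") as the index. An HTML parser backend such as
# lxml, bs4, or html5lib must be installed for the call to work.
tables = pd.read_html(StringIO(html), header=0, index_col=0)
df = tables[0]
print(df.loc['EUR', 'Rate'])   # -> 19.85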
def get_forex_buy_quote(currency_code: str = 'EUR', source: str = 'FNB', order_type: str = 'buy'):
    """Get latest forex from FNB website"""
    if source == 'FNB':
        tables = pd.read_html(
            'https://www.fnb.co.za/Controller?nav=rates.forex.list.ForexRatesList',
            index_col=1, header=0, match=currency_code)

        df = tables[0]
        types = {
            'buy': 'Bank Selling Rate',
            'sell': 'Bank Buying Rate',
        }
        exchange_rate = df.loc[currency_code, types[order_type]]

        return Decimal("%.4f" % float(exchange_rate))
def _sz_hz(date='', retry_count=3, pause=0.001):
    for _ in range(retry_count):
        time.sleep(pause)
        ct._write_console()
        try:
            request = Request(rv.MAR_SZ_HZ_URL%(ct.P_TYPE['http'], ct.DOMAINS['szse'],
                                                ct.PAGES['szsefc'], date))
            lines = urlopen(request, timeout=10).read()
            if len(lines) <= 200:
                return pd.DataFrame()
            df = pd.read_html(lines, skiprows=[0])[0]
            df.columns = rv.MAR_SZ_HZ_COLS
            df['opDate'] = date
        except Exception as e:
            print(e)
        else:
            return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)
def _fetch_documentation(version, base_url="https://spark.apache.org/docs"):
    doc_urls = [
        "{base_url}/{version}/configuration.html",
        "{base_url}/{version}/sql-programming-guide.html",
        "{base_url}/{version}/monitoring.html",
        "{base_url}/{version}/spark-standalone.html",
        "{base_url}/{version}/running-on-mesos.html",
        "{base_url}/{version}/running-on-yarn.html",
    ]

    for url in doc_urls:
        doc_url = url.format(version=version, base_url=base_url)
        print("Loading spark properties from %s" % doc_url)
        dfs = pd.read_html(doc_url, header=0)
        desired_cols = ["Property Name", "Default", "Meaning"]
        for df in dfs:
            if ("Property Name" in df) and ('Default' in df):
                for pn, default, desc in df[desired_cols].itertuples(index=False):
                    if type(default) == numpy.bool_:
                        default = bool(default)
                    yield pn, default, desc
def get_financial_statements(code):
    url = "http://companyinfo.stock.naver.com/v1/company/ajax/cF1001.aspx?cmp_cd=%s&fin_typ=0&freq_typ=Y" % (code)
    html = requests.get(url).text
    html = html.replace('<th class="bg r01c02 endLine line-bottom"colspan="8">??</th>', "")
    html = html.replace("<span class='span-sub'>(IFRS??)</span>", "")
    html = html.replace("<span class='span-sub'>(IFRS??)</span>", "")
    html = html.replace('\t', '')
    html = html.replace('\n', '')
    html = html.replace('\r', '')
    html = html.replace('2011/12', '2011')
    html = html.replace('2012/03', '2011')
    html = html.replace('2012/12', '2012')
    html = html.replace('2013/03', '2012')
    html = html.replace('2013/12', '2013')
    html = html.replace('2014/03', '2013')
    html = html.replace('2014/12', '2014')
    html = html.replace('2015/03', '2014')
    html = html.replace('2015/12', '2015')

    df_list = pd.read_html(html, index_col='??????')
    df = df_list[0]
    return df
def get_stats(self, url):
    """
    Extracts statistics from URL

    Args:
        url (str): basketball-reference.com box score

    Returns:
        stats (pd.DataFrame): DataFrame of statistics from game
    """
    response = urllib2.urlopen(url)
    html = response.read()
    stat_html = html.replace('<!--', "")
    stat_html = stat_html.replace('-->', "")
    stats = pd.read_html(stat_html)
    return stats[-5]
def price_history_frame(mkt_id, year, month):
    """Returns price history as a DataFrame"""
    url = _build_url('pricehistory/PriceHistory_GetData.cfm')
    data = dict(Market_ID=mkt_id, Month='{:02d}'.format(month), Year=year)
    response = requests.post(url=url, data=data)
    index_cols = [iem.DATE, iem.CONTRACT]
    kwargs = dict(header=0, parse_dates=[iem.DATE], index_col=index_cols)
    try:
        dfs = pd.read_html(response.text, **kwargs)
    except ValueError:
        dfs = [pd.DataFrame()]
    # Expect a singleton list
    assert len(dfs) == 1
    # Remove duplicates, if any
    df = dfs[0]
    if len(df.index.unique()) != len(df.index):
        df = df.groupby(level=df.index.names).first()
    return df
def history_dates(mkt_id):
    url = _build_url('pricehistory/pricehistory_selectcontract.cfm')
    response = requests.get(url=url, params={'Market_ID': mkt_id})
    dfs = pd.read_html(response.text, index_col=0)
    # Expect a singleton list
    assert len(dfs) == 1
    df = dfs[0]
    mon_str = df.ix['Month:'][1]
    months = [dt.datetime.strptime(s[:3], '%b').month for s in mon_str.split()]
    year_str = df.ix['Year'][1]
    years = [int(s) for s in year_str.split()]
    return itertools.product(years, months)
def run(query):
    r = requests.get('https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population')
    soup = BeautifulSoup(r.content)

    tbl = soup.find_all('table')[3]
    df = pd.read_html(str(tbl))[0]

    df.columns = df.iloc[0]

    cities = df['City'].tolist()

    for city in cities:
        i = city.find('[')
        if i != -1:
            city = city[0:i]
        city = city + ' ' + query
        print(city)
        populate.query_and_post(city)
        time.sleep(1)
def __query_new_stocks(self):
    DATA_URL = 'http://vip.stock.finance.sina.com.cn/corp/view/vRPD_NewStockIssue.php?page=1&cngem=0&orderBy=NetDate&orderType=desc'
    html = lxml.html.parse(DATA_URL)
    res = html.xpath('//table[@id=\"NewStockTable\"]/tr')
    if six.PY2:
        sarr = [etree.tostring(node) for node in res]
    else:
        sarr = [etree.tostring(node).decode('utf-8') for node in res]
    sarr = ''.join(sarr)
    sarr = sarr.replace('<font color="red">*</font>', '')
    sarr = '<table>%s</table>' % sarr
    df = pd.read_html(StringIO(sarr), skiprows=[0, 1])[0]
    df = df.select(lambda x: x in [0, 1, 2, 3, 7], axis=1)
    df.columns = ['code', 'xcode', 'name', 'ipo_date', 'price']
    df['code'] = df['code'].map(lambda x: str(x).zfill(6))
    df['xcode'] = df['xcode'].map(lambda x: str(x).zfill(6))
    return df
def generate_sched_table(team, year, max_rows=20):
    df = pd.DataFrame(teams)
    filter_team = df.loc[df["TeamAlt"] == team]
    filter_team['ESPNID'] = "http://www.espn.com/college-football/team/fpi/_/id/" \
                            + filter_team.ESPNID.map(str) + "/year/" + str(year)
    link = filter_team.tail(1)['ESPNID'].values[0]
    sched_dataframe = pd.read_html(link, header=1)[4]
    sched_dataframe.columns = ['Date', 'Opponent', 'Result/Proj', 'Opp FPI', 'Game Rating']
    return html.Table(
        # Header1
        [html.Tr([
            html.Th(html.H6([team + ' ' + str(year) + ' ' + 'Schedule']),
                    colSpan=5, style=dict(textAlign="center")),
        ])] +
        # Header2
        [html.Tr([html.Td(col) for col in sched_dataframe.columns],
                 style=dict(fontWeight="bold"))] +
        # Body
        [html.Tr([
            html.Td(sched_dataframe.iloc[i][col]) for col in sched_dataframe.columns
        ]) for i in range(min(len(sched_dataframe), max_rows))]
    )
def _dist_cotent(year, pageNo, retry_count, pause):
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            if pageNo > 0:
                ct._write_console()
            html = lxml.html.parse(rv.DP_163_URL%(ct.P_TYPE['http'], ct.DOMAINS['163'],
                                                  ct.PAGES['163dp'], year, pageNo))
            res = html.xpath('//div[@class=\"fn_rp_list\"]/table')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            df = pd.read_html(sarr, skiprows=[0])[0]
            df = df.drop(df.columns[0], axis=1)
            df.columns = rv.DP_163_COLS
            df['divi'] = df['plan'].map(_fun_divi)
            df['shares'] = df['plan'].map(_fun_into)
            df = df.drop('plan', axis=1)
            df['code'] = df['code'].astype(object)
            df['code'] = df['code'].map(lambda x : str(x).zfill(6))
            pages = []
            if pageNo == 0:
                page = html.xpath('//div[@class=\"mod_pages\"]/a')
                if len(page) > 1:
                    asr = page[len(page)-2]
                    pages = asr.xpath('text()')
        except Exception as e:
            print(e)
        else:
            if pageNo == 0:
                return df, pages[0] if len(pages) > 0 else 0
            else:
                return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)
def _get_forecast_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        html = lxml.html.parse(ct.FORECAST_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                                ct.PAGES['fd'], year, quarter, pageNo,
                                                ct.PAGE_NUM[1]))
        res = html.xpath("//table[@class=\"list_table\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = sarr.replace('--', '0')
        sarr = '<table>%s</table>'%sarr
        df = pd.read_html(sarr)[0]
        df = df.drop([4, 5, 8], axis=1)
        df.columns = ct.FORECAST_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
        if len(nextPage) > 0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_forecast_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except Exception as e:
        print(e)
def _newstocks(data, pageNo, retry_count, pause):
    for _ in range(retry_count):
        time.sleep(pause)
        ct._write_console()
        try:
            html = lxml.html.parse(rv.NEW_STOCKS_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                                      ct.PAGES['newstock'], pageNo))
            res = html.xpath('//table[@id=\"NewStockTable\"]/tr')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = sarr.replace('<font color="red">*</font>', '')
            sarr = '<table>%s</table>'%sarr
            df = pd.read_html(StringIO(sarr), skiprows=[0, 1])[0]
            df = df.drop([df.columns[idx] for idx in [1, 12, 13, 14]], axis=1)
            df.columns = rv.NEW_STOCKS_COLS
            df['code'] = df['code'].map(lambda x : str(x).zfill(6))
            res = html.xpath('//table[@class=\"table2\"]/tr[1]/td[1]/a/text()')
            tag = '???' if ct.PY3 else unicode('???', 'utf-8')
            hasNext = True if tag in res else False
            data = data.append(df, ignore_index=True)
            pageNo += 1
            if hasNext:
                data = _newstocks(data, pageNo, retry_count, pause)
        except Exception as ex:
            print(ex)
        else:
            return data
def _parse_fq_data(url, index, retry_count, pause):
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(url)
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath('//table[@id=\"FundHoldSharesTable\"]')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            df = pd.read_html(sarr, skiprows=[0, 1])[0]
            if len(df) == 0:
                return pd.DataFrame()
            if index:
                df.columns = ct.HIST_FQ_COLS[0:7]
            else:
                df.columns = ct.HIST_FQ_COLS
            if df['date'].dtypes == np.object:
                df['date'] = df['date'].astype(np.datetime64)
            df = df.drop_duplicates('date')
        except Exception as e:
            print(e)
        else:
            return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)
def _cap_tops(last=5, pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(rv.LHB_SINA_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                               rv.LHB_KINDS[0], ct.PAGES['fd'],
                                               last, pageNo))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath("//table[@id=\"dataTable\"]/tr")
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>'%sarr
            df = pd.read_html(sarr)[0]
            df.columns = rv.LHB_GGTJ_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
            if len(nextPage) > 0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _cap_tops(last, pageNo, retry_count, pause, dataArr)
            else:
                return dataArr
        except Exception as e:
            print(e)
def _broker_tops(last=5, pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(rv.LHB_SINA_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                               rv.LHB_KINDS[1], ct.PAGES['fd'],
                                               last, pageNo))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath("//table[@id=\"dataTable\"]/tr")
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>'%sarr
            df = pd.read_html(sarr)[0]
            df.columns = rv.LHB_YYTJ_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
            if len(nextPage) > 0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _broker_tops(last, pageNo, retry_count, pause, dataArr)
            else:
                return dataArr
        except Exception as e:
            print(e)
def _inst_tops(last=5, pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(rv.LHB_SINA_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                               rv.LHB_KINDS[2], ct.PAGES['fd'],
                                               last, pageNo))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath("//table[@id=\"dataTable\"]/tr")
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>'%sarr
            df = pd.read_html(sarr)[0]
            df = df.drop([2, 3], axis=1)
            df.columns = rv.LHB_JGZZ_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
            if len(nextPage) > 0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _inst_tops(last, pageNo, retry_count, pause, dataArr)
            else:
                return dataArr
        except Exception as e:
            print(e)
def _inst_detail(pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(rv.LHB_SINA_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                               rv.LHB_KINDS[3], ct.PAGES['fd'],
                                               '', pageNo))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath("//table[@id=\"dataTable\"]/tr")
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>'%sarr
            df = pd.read_html(sarr)[0]
            df.columns = rv.LHB_JGMX_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
            if len(nextPage) > 0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _inst_detail(pageNo, retry_count, pause, dataArr)
            else:
                return dataArr
        except Exception as e:
            print(e)
def _get_profit_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        request = Request(ct.PROFIT_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                         ct.PAGES['fd'], year, quarter, pageNo,
                                         ct.PAGE_NUM[1]))
        text = urlopen(request, timeout=10).read()
        text = text.decode('GBK')
        text = text.replace('--', '')
        html = lxml.html.parse(StringIO(text))
        res = html.xpath("//table[@class=\"list_table\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = '<table>%s</table>'%sarr
        df = pd.read_html(sarr)[0]
        df.columns = ct.PROFIT_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
        if len(nextPage) > 0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_profit_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except:
        pass
def _get_operation_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        request = Request(ct.OPERATION_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                            ct.PAGES['fd'], year, quarter, pageNo,
                                            ct.PAGE_NUM[1]))
        text = urlopen(request, timeout=10).read()
        text = text.decode('GBK')
        text = text.replace('--', '')
        html = lxml.html.parse(StringIO(text))
        res = html.xpath("//table[@class=\"list_table\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = '<table>%s</table>'%sarr
        df = pd.read_html(sarr)[0]
        df.columns = ct.OPERATION_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
        if len(nextPage) > 0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_operation_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except Exception as e:
        print(e)
def _get_growth_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        request = Request(ct.GROWTH_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                         ct.PAGES['fd'], year, quarter, pageNo,
                                         ct.PAGE_NUM[1]))
        text = urlopen(request, timeout=10).read()
        text = text.decode('GBK')
        text = text.replace('--', '')
        html = lxml.html.parse(StringIO(text))
        res = html.xpath("//table[@class=\"list_table\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = '<table>%s</table>'%sarr
        df = pd.read_html(sarr)[0]
        df.columns = ct.GROWTH_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
        if len(nextPage) > 0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_growth_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except Exception as e:
        print(e)
def _get_debtpaying_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        request = Request(ct.DEBTPAYING_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                             ct.PAGES['fd'], year, quarter, pageNo,
                                             ct.PAGE_NUM[1]))
        text = urlopen(request, timeout=10).read()
        text = text.decode('GBK')
        html = lxml.html.parse(StringIO(text))
        res = html.xpath("//table[@class=\"list_table\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = '<table>%s</table>'%sarr
        df = pd.read_html(sarr)[0]
        df.columns = ct.DEBTPAYING_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
        if len(nextPage) > 0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_debtpaying_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except Exception as e:
        print(e)
def _get_cashflow_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        request = Request(ct.CASHFLOW_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                           ct.PAGES['fd'], year, quarter, pageNo,
                                           ct.PAGE_NUM[1]))
        text = urlopen(request, timeout=10).read()
        text = text.decode('GBK')
        text = text.replace('--', '')
        html = lxml.html.parse(StringIO(text))
        res = html.xpath("//table[@class=\"list_table\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = '<table>%s</table>'%sarr
        df = pd.read_html(sarr)[0]
        df.columns = ct.CASHFLOW_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
        if len(nextPage) > 0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_cashflow_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except Exception as e:
        print(e)
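Many of the examples above (the Sina/tushare-style fetchers) share one idiom: table rows are pulled out of the page with an lxml XPath query, serialized, re-wrapped in a bare <table> element, and only then handed to pandas.read_html(). The sketch below reproduces that idiom in isolation; the HTML fragment, table id, and column names are invented for illustration and are not taken from any of those projects.

from io import StringIO

import pandas as pd
from lxml import etree, html as lhtml

# Invented page fragment standing in for a larger page; only the rows of
# one specific table are wanted.
page = """
<html><body>
  <table id="dataTable">
    <tr><th>code</th><th>name</th><th>price</th></tr>
    <tr><td>000001</td><td>Foo</td><td>10.5</td></tr>
    <tr><td>000002</td><td>Bar</td><td>7.8</td></tr>
  </table>
</body></html>
"""

doc = lhtml.fromstring(page)
rows = doc.xpath('//table[@id="dataTable"]/tr')

# Serialize just the selected rows and wrap them in a fresh <table> so that
# read_html() sees a well-formed, standalone table fragment.
sarr = ''.join(etree.tostring(node, encoding='unicode') for node in rows)
sarr = '<table>%s</table>' % sarr

df = pd.read_html(StringIO(sarr), header=0)[0]
print(df)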
def _parse_fq_data(url, index, retry_count, pause):
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(url)
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath('//table[@id=\"FundHoldSharesTable\"]')
            sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            if sarr == '':
                return None
            df = pd.read_html(sarr, skiprows=[0, 1])[0]
            if len(df) == 0:
                return pd.DataFrame()
            if index:
                df.columns = ct.HIST_FQ_COLS[0:7]
            else:
                df.columns = ct.HIST_FQ_COLS
            if df['date'].dtypes == np.object:
                df['date'] = df['date'].astype(np.datetime64)
            df = df.drop_duplicates('date')
        except ValueError as e:
            # ????????????
            return None
        except Exception as e:
            print(e)
        else:
            return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)
def checkStockNameChanges():
    """Handles the renaming of stocks in the DB that have been renamed.
    It will switch both the name and the symbol for every table in the database.
    Uses PANDAS to capture html tables from the NASDAQ listings containing symbol changes.
    Returns the changes in a PANDAS frame, if any.
    If problem scraping NASDAQ, returns error msg.
    If code error, returns a tuple of False, error.
    """
    try:
        path = 'http://www.nasdaq.com/markets/stocks/symbol-change-history.aspx?page='
        ticker_changes = pd.DataFrame()
        for i in np.arange(100):  # set the number high enough to catch all pages
            page = str(i+1)
            full_path = ''.join([path, page])
            # note: index could change in future if html is restructured
            symbol_changes = pd.read_html(full_path, header=0)[3]
            # concat all of the changes together
            if 'No records found.' not in symbol_changes.iloc[0][0]:
                ticker_changes = pd.concat([ticker_changes, symbol_changes], ignore_index=True)
            else:
                break  # drop out of loop if there's nothing left to capture
        ticker_changes.rename(columns={'Old Symbol': 'Old',
                                       'New Symbol': 'New',
                                       'Effective Date': 'Date'}, inplace=True)
        # check returned value
        assert isinstance(ticker_changes, pd.DataFrame), \
            "Expected stock name changes to return pandas DataFrame. Got %r instead" % type(ticker_changes)
        return ticker_changes
    except Exception as e:
        return False, e
def get_coins():
    coins_db = OrderedDict()
    print(crayons.yellow('Scraping CoinMarketCap...'))
    r = session.get(url)
    html = pq(pq(r.content)('table')[0]).html()
    df = pandas.read_html("<table>{}</table>".format(html))
    df = pandas.concat(df)
    btc_value = float(df.to_dict()['Price'][0][1:].replace(',', ''))
    for row in df.itertuples():
        rank = int(row[1])
        name = ' '.join(row[2].split()[1:])
        ticker = row[3].lower()
        try:
            usd = float(row[5][1:].replace(',', ''))
        except ValueError:
            usd = 0
        finally:
            pass
        btc = convert_to_decimal(usd / btc_value)
        coins_db.update({ticker: {'rank': rank, 'name': name, 'ticker': ticker,
                                  'usd': usd, 'btc': btc}})
    return coins_db
def read_html_table(self, url, **kwargs):
    try:
        self.get_status(url)
    except:
        return None
    # set some default values (some are already default values for read_table)
    kwargs.update({'encoding': kwargs.get('encoding') or None})
    # run pandas...
    df = pd.read_html(url, **kwargs)
    return df
def parse_rep_vote_history(self, response):
    # Some reps did not vote during a session. Test for "Vote data is unavailable";
    # in that case we capture the base information about the rep for later matching.
    if "Vote data is unavailable" in response.css("#mainBody::text").extract()[3]:
        cur_url = response.url
        session_id, chamber, rep_id = self.get_session_chamber_rep_id(cur_url)
        url = cur_url
        self.rep_info.append([rep_id, session_id, chamber])
    # Otherwise, we process the body of text.
    else:
        title = response.xpath("""//*[@id="title"]/text()""").extract_first()
        rep_title, rep_short_name, rep_district = self.get_name_district(title)
        # Fetch the main table - they use nested tables, so have to use a direct reference.
        table_rows = response.css('#mainBody > table').extract()[0]
        # Parse the html table and select relevant info for the vote.
        pd_table = pd.read_html(table_rows, header=0, match="Doc.",
                                attrs={'cellspacing': 0})[0][['RCS\xa0#', 'Doc.', 'Vote', 'Result']]
        # Get session and chamber id from URL and assign to each row
        cur_url = response.url
        session_id, chamber, rep_id = self.get_session_chamber_rep_id(cur_url)
        pd_table['session_id'] = session_id
        pd_table['chamber'] = chamber
        pd_table['rep_id'] = rep_id
        pd_table['rep_title'] = rep_title
        pd_table['rep_short_name'] = rep_short_name
        pd_table['district'] = rep_district
        # Reorder columns
        pd_table = pd_table.reindex_axis(['session_id', 'chamber', 'rep_id', 'rep_short_name',
                                          'rep_title', 'district', 'RCS\xa0#', 'Doc.',
                                          'Vote', 'Result'], axis=1)
        return pd_table.to_dict(orient='records')
def fetch_price(country_code='CA-AB', session=None):
    """Requests the last known power price of a given country

    Arguments:
    country_code (optional) -- used in case a parser is able to fetch multiple countries
    session (optional)      -- request session passed in order to re-use an existing session

    Return:
    A dictionary in the form:
    {
      'countryCode': 'FR',
      'currency': EUR,
      'datetime': '2017-01-01T00:00:00Z',
      'price': 0.0,
      'source': 'mysource.com'
    }
    """
    r = session or requests.session()
    url = 'http://ets.aeso.ca/ets_web/ip/Market/Reports/SMPriceReportServlet?contentType=html/'
    response = r.get(url)
    df_prices = pd.read_html(response.text, match='Price', index_col=0, header=0)
    prices = df_prices[1]

    data = {}
    for rowIndex, row in prices.iterrows():
        price = row['Price ($)']
        if isfloat(price):
            hours = int(rowIndex.split(' ')[1]) - 1
            data[rowIndex] = {
                'datetime': arrow.get(rowIndex, 'MM/DD/YYYY').replace(hours=hours, tzinfo=ab_timezone).datetime,
                'countryCode': country_code,
                'currency': 'CAD',
                'source': 'ets.aeso.ca',
                'price': float(price),
            }
    return [data[k] for k in sorted(data.keys())]
def fetch_exchange(country_code1='CA-AB', country_code2='CA-BC', session=None):
    """Requests the last known power exchange (in MW) between two countries

    Arguments:
    country_code (optional) -- used in case a parser is able to fetch multiple countries
    session (optional)      -- request session passed in order to re-use an existing session

    Return:
    A dictionary in the form:
    {
      'sortedCountryCodes': 'DK->NO',
      'datetime': '2017-01-01T00:00:00Z',
      'netFlow': 0.0,
      'source': 'mysource.com'
    }
    """
    r = session or requests.session()
    url = 'http://ets.aeso.ca/ets_web/ip/Market/Reports/CSDReportServlet'
    response = r.get(url)
    df_exchanges = pd.read_html(response.text, match='INTERCHANGE', skiprows=0, index_col=0)
    flows = {
        'CA-AB->CA-BC': df_exchanges[1][1]['British Columbia'],
        'CA-AB->CA-SK': df_exchanges[1][1]['Saskatchewan'],
        'CA-AB->US': df_exchanges[1][1]['Montana']
    }
    sortedCountryCodes = '->'.join(sorted([country_code1, country_code2]))
    if sortedCountryCodes not in flows:
        raise NotImplementedError('This exchange pair is not implemented')

    return {
        'datetime': arrow.now(tz=ab_timezone).datetime,
        'sortedCountryCodes': sortedCountryCodes,
        'netFlow': float(flows[sortedCountryCodes]),
        'source': 'ets.aeso.ca'
    }
def fetch_production(country_code='CR', session=None):
    # Do not use existing session as some amount of cache is taking place
    r = requests.session()
    url = 'https://appcenter.grupoice.com/CenceWeb/CencePosdespachoNacional.jsf'
    response = r.get(url)
    df_yesterday = pd.read_html(response.text, skiprows=1, index_col=0, header=0)[0]

    soup = BeautifulSoup(response.text, 'html.parser')
    yesterday_date = soup.select('#formPosdespacho:pickFechaInputDate')[0]['value']
    jsf_view_state = soup.select('#javax.faces.ViewState')[0]['value']

    yesterday = arrow.get(yesterday_date, 'DD/MM/YYYY', tzinfo=TIMEZONE)
    today = yesterday.shift(days=+1)

    data = [
        ('formPosdespacho', 'formPosdespacho'),
        ('formPosdespacho:pickFechaInputDate', today.format(DATE_FORMAT)),
        ('formPosdespacho:pickFechaInputCurrentDate', today.format(MONTH_FORMAT)),
        ('formPosdespacho:j_id35.x', ''),
        ('formPosdespacho:j_id35.y', ''),
        ('javax.faces.ViewState', jsf_view_state),
    ]
    response = r.post(url, cookies={}, data=data)
    df_today = pd.read_html(response.text, skiprows=1, index_col=0)[0]

    ydata = df_to_data(country_code, yesterday, df_yesterday)
    tdata = df_to_data(country_code, today, df_today)
    production = ydata + tdata
    unknown_plants()

    return production
def convert_wiki_to_table(wiki_text, n_table=0):
    html_text = pypandoc.convert(wiki_text, 'html', 'mediawiki')
    tables = pandas.read_html(html_text)
    return tables[n_table]
def getSignleStockShortInfo(stock):
    df = pd.DataFrame()
    url = "http://shortsqueeze.com/?symbol=" + stock + "&submit=Short+Quote%E2%84%A2"
    repeat_times = 3
    downloadFailed = True
    for _ in range(repeat_times):
        try:
            response = requests.get(url, timeout=15)
            downloadFailed = False
            break
        except Exception as e:
            print("exception in get stock:" + stock, str(e))
            continue
    if downloadFailed:
        return "", df

    try:
        tables = pd.read_html(response.text, attrs={'cellpadding': '3', 'width': '100%'})
    except Exception as e:
        print("exception in parse stock:" + stock, str(e))
        return "", df

    for table in tables:
        if df.empty:
            df = table
        else:
            df = pd.concat([df, table])
    df.reset_index(drop=True, inplace=True)
    #print(df)

    soup = BeautifulSoup(response.text, 'lxml')
    dateString = soup.find('span', {"style": "color:#999999;font-family: verdana, arial, helvetica;font-size:10px"}).get_text()
    date = datetime.datetime.strptime(dateString, '%A %B %d, %Y')
    return date, df.T
def __init__(self, code, year):
    self._geo = get_geo(code, year)
    self.url = url_resolver(code, year, self._geo['region_code'],
                            self._geo['department_code'])
    tables = pd.read_html(self.url, header=0, encoding='utf8',
                          decimal=',', thousands=' ')
    self._parse(tables)
def read_quote_frames(mkt_conf):
    url = _market_quote_url(mkt_conf)
    response = requests.get(url=url)
    dfs = pd.read_html(response.text, index_col=0, header=0, na_values=['---'])
    # Data outside of the HTML tables
    table_headers = _table_headers(response.text)
    market_names = [_market_name(s) for s in table_headers]
    timestamps = [_timestamp(s) for s in table_headers]
    # Modify data frames
    mod_dfs = [_modify_frame(df, ts) for df, ts in zip(dfs, timestamps)]
    return OrderedDict((nm, df) for nm, df in zip(market_names, mod_dfs))
def main():
    date_cols = [iem.ORDER_DATE, iem.EXPIRATION]
    kwargs = dict(index_col=iem.ORDER_DATE, parse_dates=date_cols)
    dfs = pd.read_html(table_text, **kwargs)
    df = dfs[0]
    oid_df = pd.DataFrame()
    cxl_o = iem.CANCEL_ORDER
    df[cxl_o] = df[cxl_o].combine_first(oid_df[cxl_o])
def _frame(response, **kwargs):
    print(response.text)
    dfs = pd.read_html(response.text, **kwargs)
    # Expect a singleton list
    assert len(dfs) == 1
    return dfs[0]
def parse_detail_page(b):
    prop = {'raw_address': '', 'bedrooms': -1, 'bathrooms': -1, "size_units": 'I',
            'building_size': -1, 'price': -1, 'car_spaces': -1, 'listing_type': 'F',
            'features': []}
    other_fields = ['Age', 'Association', 'Basement', 'Cooling', 'Fireplaces', 'Garages',
                    'Heating', 'Pool', 'Sewer', 'Taxes (Year)', 'Water']
    # TODO: use the extended fields, add them to the list of properties
    tables = b.findAll('table', {'class': 'cell'})
    if len(tables) > 0:
        prop['listing_timestamp'] = datetime.datetime.now()
        addr_rows = b.findAll('td', {'class': 'addr'})
        addr = ' '.join(map(lambda x: x.getText(), addr_rows))
        t = tables[0]
        df = pd.read_html(str(t))[0]
        data = dict(zip(df[0], df[1]))
        prop['raw_address'] = addr
        prop['bedrooms'] = int(data['Bedrooms'])
        prop['bathrooms'] = float(data['Full Baths'] + '.' + data['Partial Baths'])
        if data.has_key('Interior Sq Ft'):
            prop['building_size'] = int(data['Interior Sq Ft'])
        prop['price'] = float(data['Asking Price'].replace('$', '').replace(',', ''))
        if data.has_key('Parking'):
            try:
                prop['car_spaces'] = float(data['Parking'].replace('Cars', '').replace('Car', '').replace(' ', ''))
            except ValueError:
                prop['car_spaces'] = -1
        return [prop]
    else:
        return None
def parse_detail_page(content):
    prop = {'raw_address': '', 'bedrooms': -1, 'bathrooms': -1, "size_units": 'I',
            'building_size': -1, 'price': -1, 'car_spaces': -1, 'listing_type': 'F',
            'features': []}
    other_fields = ['Age', 'Association', 'Basement', 'Cooling', 'Fireplaces', 'Garages',
                    'Heating', 'Pool', 'Sewer', 'Taxes (Year)', 'Water']
    # TODO: use the extended fields
    b = soup.BeautifulSoup(content)
    tables = b.findAll('table', {'class': 'cell'})
    if len(tables) > 0:
        prop['listing_timestamp'] = datetime.datetime.now()
        addr_rows = b.findAll('td', {'class': 'addr'})
        addr = ' '.join(map(lambda x: x.getText(), addr_rows))
        t = tables[0]
        df = pd.read_html(str(t))[0]
        data = dict(zip(df[0], df[1]))
        prop['raw_address'] = addr
        prop['bedrooms'] = int(data['Bedrooms'])
        prop['bathrooms'] = float(data['Full Baths'] + '.' + data['Partial Baths'])
        if data.has_key('Interior Sq Ft'):
            prop['building_size'] = int(data['Interior Sq Ft'])
        prop['price'] = float(data['Asking Price'].replace('$', '').replace(',', ''))
        if data.has_key('Parking'):
            try:
                prop['car_spaces'] = float(data['Parking'].replace('Cars', '').replace('Car', '').replace(' ', ''))
            except ValueError:
                prop['car_spaces'] = -1
        #for of in other_fields:
        #    if data.has_key(of):
        #        prop['features'].append({of: data[of]})
        return [prop]
    else:
        return None
def new_stocks():
    url = 'http://vip.stock.finance.sina.com.cn/corp/view/vRPD_NewStockIssue.php?page=1&cngem=0&orderBy=NetDate&orderType=desc'
    request = requests.get(url)
    doc = lxml.html.soupparser.fromstring(request.content, features='html.parser')
    table = doc.cssselect('table#NewStockTable')[0]
    table.remove(table.cssselect('thead')[0])
    table_html = lxml.html.etree.tostring(table).decode('utf-8')
    df = pd.read_html(table_html, skiprows=[0, 1])[0]
    df = df.select(lambda x: x in [0, 1, 2, 3, 7], axis=1)
    df.columns = ['code', 'xcode', 'name', 'ipo_date', 'price']
    df['code'] = df['code'].map(lambda x: str(x).zfill(6))
    df['xcode'] = df['xcode'].map(lambda x: str(x).zfill(6))
    return df