The following 50 code examples, extracted from open source Python projects, illustrate how to use pandas.read_sql().
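The examples below are quoted from real projects, so they rely on each project's own helpers, globals, and connections. As a quick orientation, here is a minimal, self-contained sketch of the common calling patterns of pandas.read_sql() against a throwaway SQLite database; the table, column, and value names are invented purely for illustration.

import sqlite3
import pandas as pd

# Build a small in-memory SQLite database just for demonstration.
conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE prices (symbol TEXT, close REAL, trade_date TEXT)")
conn.executemany(
    "INSERT INTO prices VALUES (?, ?, ?)",
    [("AAPL", 189.9, "2024-01-02"), ("MSFT", 370.9, "2024-01-02")],
)
conn.commit()

# 1) Read a whole query result into a DataFrame; parse_dates converts the text column.
df = pd.read_sql("SELECT * FROM prices", conn, parse_dates=["trade_date"])

# 2) Parameterised query: params keeps values out of the SQL string.
aapl = pd.read_sql("SELECT close FROM prices WHERE symbol = ?", conn, params=("AAPL",))

# 3) chunksize returns an iterator of DataFrames instead of a single frame.
for chunk in pd.read_sql("SELECT * FROM prices", conn, chunksize=1):
    print(len(chunk))

conn.close()

The same calls work unchanged with a SQLAlchemy engine: pass the engine (or a connection obtained from it) as con, which is what many of the examples below do.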
def __init__(self, code, sql=False):
    if int(code[0]) in (0, 3):
        name = "sz" + code
    if int(code[0]) == 6:
        name = "sh" + code
    if sql:
        self.sharedf = ShareClass().GetDayData(code)
    else:
        conn = create_engine('mysql://' + config.user + ':' + config.password + '@'
                             + config.ip + '/daydata?charset=utf8')
        x = 'select * from ' + name + '_tencent;'  # SQL statement
        self.sharedf = pandas.read_sql(x, con=conn)
    self.name = name
def loadData(self):
    # Load the toilet collection data to pandas
    collects = pd.read_sql('SELECT * FROM premodeling.toiletcollection',
                           self.conn, coerce_float=True, params=None)
    pprint.pprint(collects.keys())
    collects = collects[['ToiletID', 'ToiletExID', 'Collection_Date', 'Area',
                         'Feces_kg_day', 'year', 'month']]
    pprint.pprint(collects.keys())

    # Load the density data to pandas
    density = pd.read_sql('SELECT * FROM premodeling.toiletdensity',
                          self.conn, coerce_float=True, params=None)
    pprint.pprint(density.keys())

    # Return the data
    self.collects = collects
    self.density = density
    return(collects, density)
def as_dataframe(self, model_group_ids):
    """Return model-group-id subset of table as dataframe

    Args:
        model_group_ids (list) the desired model group ids

    Returns: (pandas.DataFrame) The data from the table corresponding to
        those model group ids
    """
    return pd.read_sql(
        'select * from {} where model_group_id in ({})'.format(
            self.distance_table,
            str_in_sql(model_group_ids)
        ),
        self.db_engine
    )
def simple_moving_average(self, period=50, column='adj_close'):
    table_name = 'sma_test'
    # stmt = text('SELECT * FROM sma_test WHERE asset_id = :asset_id')
    # stmt.bindparams(asset_id=self.id)
    # try:
    #     # TODO: parse dates
    #     df = pd.read_sql(sql, con=conn, params={
    #         'asset_id': self.id
    #     })
    # except OperationalError:
    #     logger.exception('error in query')
    sma_ts = pd.Series(
        self.ohlcv[column].rolling(center=False, window=period,
                                   min_periods=period - 1).mean()).dropna()
    print('creating')
    print(sma_ts)
    return sma_ts
    # return sma_ts
    # else:
    #     print('found')
    #     print(df)
    #     return df
def standardize_variable_names(table, RULES):
    """
    Script to standardize the variable names in the tables
    PARAM DataFrame table: A table returned from pd.read_sql
    PARAM list[tuples]: A list of tuples with string replacements, i.e., (string, replacement)
    RET table
    """
    variableNames = list(table.columns.values)
    standardizedNames = {}  # Pandas renames columns with a dictionary object
    for v in variableNames:
        f = v
        for r in RULES:
            f = re.sub(r[0], r[1], f)
        print('%s to %s' % (v, f))
        standardizedNames[v] = f
    table = table.rename(columns=standardizedNames)
    return table
def export_data_unresolved():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_work_view = db.get_work_view()
    connection = db_work_view._db_connection

    df_clickstream = pn.read_csv(
        '/home/ddimitrov/data/enwiki201608_unresolved_redirects/2016_08_clickstream_unresolved.tsv',
        sep='\t', error_bad_lines=False)

    df_clickstream['prev'] = df_clickstream['prev'].str.replace('_', ' ')
    df_clickstream['curr'] = df_clickstream['curr'].str.replace('_', ' ')
    df_clickstream['curr_unresolved'] = df_clickstream['curr_unresolved'].str.replace('_', ' ')

    df_redirects_candidates = pn.read_sql('select * from redirects_candidates_sample', connection)

    sample_unresoleved = pn.merge(df_redirects_candidates, df_clickstream, how='left',
                                  left_on=['source_article_name', 'target_article_name'],
                                  right_on=['prev', 'curr_unresolved'])

    sample_unresoleved['n'].fillna(0, inplace=True)

    sample_unresoleved.to_csv(
        '/home/ddimitrov/data/enwiki201608_unresolved_redirects/data_unresolved.tsv',
        sep='\t', encoding="utf-8")
def pickle_correlations_zeros_january():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()

    print('read')
    df = pd.read_sql('select source_article_id, target_article_id from link_features', conn)
    print('loaded links')
    df2 = pd.read_sql('select prev_id, curr_id, counts from clickstream_derived_en_201501 where link_type_derived= "internal-link";', conn)
    print('loaded counts')
    result = pd.merge(df, df2, how='left',
                      left_on=['source_article_id', 'target_article_id'],
                      right_on=['prev_id', 'curr_id'])
    print('merged counts')
    print(result)
    article_counts = result.groupby(by=["target_article_id"])['counts'].sum().reset_index()
    article_counts['counts'].fillna(0.0, inplace=True)
    print(article_counts)
    print('write to file')
    article_counts[["target_article_id", "counts"]].to_csv(TMP + 'january_article_counts.tsv',
                                                           sep='\t', index=False)
def _limit_and_df(self, query, limit, as_df=False):
    """adds a limit (limit==None := no limit) to any query and allows a return as pandas.DataFrame

    :param bool as_df: if set to True, results are returned as a pandas.DataFrame
    :param `sqlalchemy.orm.query.Query` query: SQL Alchemy query
    :param int or tuple[int] limit: maximum number of results
    :return: query result of pyuniprot.manager.models.XY objects
    """
    if limit:
        if isinstance(limit, int):
            query = query.limit(limit)

        if isinstance(limit, Iterable) and len(limit) == 2 and [int, int] == [type(x) for x in limit]:
            page, page_size = limit
            query = query.limit(page_size)
            query = query.offset(page * page_size)

    if as_df:
        results = read_sql(query.statement, self.engine)
    else:
        results = query.all()

    return results
def get_metrics(self):
    """Pull requested evaluation metrics for each model meeting the criteria
    specified in __init__.

    :returns: evaluation metrics for all models in the batches
    :rtype: pandas.DataFrame
    """
    metrics_query = """
        SELECT DISTINCT
            metrics.metric || '_' || metrics.parameter as p_metric,
            metrics.value,
            metrics.unique_timestamp
        FROM output.metrics as metrics
        WHERE metric || '_' || parameter in {0}
        AND unique_timestamp in {1};
    """.format(tuple(self.most_important_metrics),
               tuple(self.models.index.astype(str).values))
    return(pd.read_sql(metrics_query, self.engine))
def get_features(self):
    """ Pull features from the database.

    :returns: features
    :rtype: pandas DataFrame
    """
    query = """
        SELECT *
        FROM {0}
        WHERE {1} is not null;
    """.format(self.model['test_table_name'][0],
               self.model['labelling'].astype(str)[0])
    return(pd.read_sql(query, self.engine))
def get_mh_appt_dates(self, people):
    """ Get the dates of mental health appointments for people labeled
    correctly by the model.

    :param people: list of ids to search for
    :type people: tuple
    :returns: mental health appointment data
    :rtype: pandas DataFrame
    """
    mh_services_query = """
        select dedupe_id, svc_date
        from clean.jocomentalhealthservices
        where svc_date < '{0}'
        and dedupe_id in {1}
    """.format(self.features['labeling_end_date'].unique()[0], people)
    return(pd.read_sql(mh_services_query, self.engine).set_index('dedupe_id'))
def get_y_values(self):
    """ Return a dataframe containing the true classes and predicted
    probabilities for each case, sorted by descending probability.

    :returns: dataframe of true classes and predicted probabilities
    :rtype: pandas DataFrame
    """
    y_query = """
        SELECT label as y_true, prediction_prob as scores
        FROM output.predictions
        WHERE unique_timestamp = '{0}';
    """.format(self.model_id)
    y_values = pd.read_sql(y_query, con).sort_values(by='scores', ascending=False)
    return y_values
def tables_exist(fake_today, prediction_window, feature_timestamp):
    table_list = pd.read_sql('''
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema = 'feature_tables'
        AND table_name LIKE 'features_train_{}_{}_at_{}'
        ORDER BY table_name desc
        LIMIT 1;
    '''.format(fake_today, prediction_window, feature_timestamp), con)
    if len(table_list) > 0:
        return True
    else:
        return False
def load_table(self, train_or_test, feature_timestamp):
    # get feature table name
    if feature_timestamp == '%':
        feature_timestamp = pd.read_sql('''
            SELECT split_part(table_name, '_at_', 2)
            FROM information_schema.tables
            WHERE table_schema = 'feature_tables'
            ORDER BY 1 desc
            LIMIT 1;
        ''', con).iat[0, 0]
    feature_table_name = ('{}."features_{}_{}_{}_at_{}"').format(
        config_db['feature_schema'], train_or_test, self.fake_today,
        self.prediction_window, feature_timestamp)

    # load table
    print('loading {}'.format(feature_table_name))
    query = ('SELECT * FROM {}').format(feature_table_name)
    full_feature_table = pd.read_sql(query, con=con)
    return full_feature_table, feature_table_name
def by_scenario(self, name):
    """
    """
    ormclass = self._mapped[name]
    query = self.session.query(ormclass).filter(
        ormclass.scn_name == self.scn_name)

    if self.version:
        query = query.filter(ormclass.version == self.version)

    # TODO: Better handled in db
    if name == 'Transformer':
        name = 'Trafo'

    df = pd.read_sql(query.statement,
                     self.session.bind,
                     index_col=name.lower() + '_id')

    if 'source' in df:
        df.source = df.source.map(self.id_to_source())

    return df
def is_part_of_speech(self, pos):
    if hasattr(self.resource, QUERY_ITEM_POS):
        current_token = tokens.COCAToken(pos, replace=False)
        rc_feature = getattr(self.resource, QUERY_ITEM_POS)
        _, table, _ = self.resource.split_resource_feature(rc_feature)
        S = "SELECT {} FROM {} WHERE {} {} '{}' LIMIT 1".format(
            getattr(self.resource, "{}_id".format(table)),
            getattr(self.resource, "{}_table".format(table)),
            getattr(self.resource, rc_feature),
            self.resource.get_operator(current_token),
            pos)
        engine = self.resource.get_engine()
        df = pd.read_sql(S.replace("%", "%%"), engine)
        engine.dispose()
        return len(df.index) > 0
    else:
        return False
def get_unique(self):
    if not self.db_name:
        return
    sql = sqlhelper.sql_url(options.cfg.current_server, self.db_name)
    if self._uniques:
        S = "SELECT DISTINCT {} FROM {}".format(self.column, self.table)
        self.df = pd.read_sql(S, sqlalchemy.create_engine(sql))
        self.df = self.df.sort_values(self.column, ascending=True)
    else:
        S = "SELECT {} FROM {}".format(self.column, self.table)
        self.df = pd.read_sql(S, sqlalchemy.create_engine(sql))
    items = (self.df[self.column].apply(utf8)
                                 .apply(QtWidgets.QTableWidgetItem))
    self.ui.tableWidget.setRowCount(len(items))
    self.ui.tableWidget.setColumnCount(1)
    for row, item in enumerate(items):
        self.ui.tableWidget.setItem(row, 0, item)
def read_sql(self, sql, chunksize=None, return_generator=True):
    '''
    Run a SQL query and return the result as a DataFrame.

    :param sql: SQL statement
    :param chunksize: int, number of rows to read per chunk; None reads everything at once
    :param return_generator: bool; if False and chunksize is an int, the chunked results
        are concatenated into a single DataFrame before being returned
    :return:
    '''
    if chunksize is not None and chunksize <= 0:
        chunksize = None
    result = pd.read_sql(sql, self.engine, chunksize=chunksize)
    if return_generator:
        return result
    else:
        if chunksize is None:
            return result
        else:
            result = list(result)
            if len(result) == 0:
                return pd.DataFrame()
            else:
                result = pd.concat(result, axis=0)
                return result
def getData(con):
    query = 'SELECT {} FROM {}'.format(
        ','.join(['"{}"'.format(x) for x in DOWNLOAD_COLS]),
        DOWNLOAD_TABLE)
    if WHERE:
        query += ' WHERE {}'.format(WHERE)
    if LIMIT:
        query += ' LIMIT {}'.format(LIMIT)
    print(query)
    df = pd.read_sql(sql=query, con=con)
    return df
def getData(con, lo):
    query = DOWNLOAD_QUERY.format(
        ','.join(['"{}"'.format(x) for x in DOWNLOAD_COLS]),
        DOWNLOAD_TABLE, LELIMIT, lo)
    print(query)
    df = pd.read_sql(sql=query, con=con)
    df.columns = ['canonical_smiles']
    # fingerDf = computeDescriptors(df)
    fingerDf = computeMACCS(df)
    mergedDf = pd.concat([df, fingerDf], axis=1)
    return mergedDf
def retrieve_data(self, date, world_or_region=WORLD, table_name='trending_places'):
    """
    Fetch records saved in a database.

    Parameters
    ----------
    date
    world_or_region
    table_name

    Returns
    -------
    pandas data frame
    """
    with self.con:
        return pd.read_sql(FETCH_QUERY % (table_name, date, world_or_region),
                           self.con, parse_dates=['last_day'])
def _update(stock, conn):
    try:
        print("update ----- :", stock)
        query = "select * from '%s' order by date" % stock
        df = pd.read_sql(query, conn)
        df = df.set_index('date')
        print("sql saved:", df.tail(1), df.iloc[-1], df.iloc[-1].name)
        if dt.now().weekday() == 5:
            today = str(pd.Timestamp(dt.now()) - pd.Timedelta(days=1))[:10]
        elif dt.now().weekday() == 6:
            today = str(pd.Timestamp(dt.now()) - pd.Timedelta(days=2))[:10]
        else:
            today = str(pd.Timestamp(dt.now()))[:10]
        print("today:", today)
        if today != df.iloc[-1].name[:10]:
            df = ts.get_h_data(stock, start=df.iloc[-1].name[:10], retry_count=5, pause=1)
            print("read from tu:", df.head(1))
            df[['open', 'high', 'close', 'low', 'volume']].to_sql(stock, conn, if_exists='append')
            import time
            time.sleep(10)
    except Exception as arg:
        print("exceptionu:", stock, arg)
        errorlist.append(stock)
def _clean(stock, conn):
    try:
        print("clean ------ :", stock)
        query = "select * from '%s' order by date" % stock
        df = pd.read_sql(query, conn)
        print("before", df.tail(5))
        cur = conn.cursor()
        query = "delete from '%s' where rowid not in (select max(rowid) from '%s' group by date)" % (stock, stock)
        cur.execute(query)
        conn.commit()
        query = "select * from '%s' order by date" % stock
        df = pd.read_sql(query, conn)
        print("after", df.tail(5))
    except Exception as arg:
        print("exceptionc:", stock, arg)
        raise SystemExit(-1)
def RNNPredict(mask, trigger_date=None, source='History.db'):
    class RNNPredict(CustomFactor):
        inputs = []
        window_length = 1

        def compute(self, today, assets, out, *inputs):
            if trigger_date != None and today != pd.Timestamp(trigger_date, tz='UTC'):
                # Only compute this factor on the trigger date; on other days
                # compute() returns without filling `out`.
                return
            if os.path.splitext(source)[1] == '.db':
                # check_same_thread=False lets the connection be used from the
                # thread that runs the factor
                conn = sqlite3.connect(source, check_same_thread=False)
                query = "select * from predict where date >= '%s' order by date limit 1 " % str(today)[:19]
                df = pd.read_sql(query, conn)
                df = df.set_index('date')
                conn.close()
            elif os.path.splitext(source)[1] == '.csv':
                df = pd.read_csv("predict.csv", index_col=0, parse_dates=True)
                df = df[df.index >= pd.Timestamp(str(today))]
                print(today, df)
            else:
                raise ValueError
            new_index = [sid(asset).symbol + "_return" for asset in assets]
            df = df.reindex(columns=new_index)
            out[:] = df.iloc[0].values
            print("RNNpredict:", today, out)

    return RNNPredict(mask=mask)
def mysql_into_df(self):
    # first extract all the table names
    # connect = self.db_connection
    df_tables = pd.read_sql('show tables;', self.db_connection)
    # store the tables in a dictionary
    d = {}
    col_name = df_tables.columns[0]
    # load each individual table into the dictionary
    for table in df_tables[col_name]:
        key = table
        value = pd.read_sql('SELECT * FROM ' + table + ';', self.db_connection)
        d[key] = value
    self.dictTables = d
    # Load the qualification table separately to infer the timestamp
def get_lowest(self, code, date, current_date):
    '''
    Find the lowest price of a stock between the start of a given year and current_date.

    :param code: stock code
    :param date: year (string); the search starts from Jan 1 of this year
    :return: (datetime of the low, lowest price), or (None, None) on failure
    '''
    date = date + '-01-01'
    cmd = 'select * from `{}` where datetime > \'{}\' and datetime <\'{}\''.format(code, date, current_date)
    try:
        df = pd.read_sql(cmd, history_engine, index_col='index')
    except Exception as e:
        print(e)
        return None, None
    # print(df.dtypes)
    # no data for this stock in the period
    if len(df) < 1:
        return None, None
    df['low'] = df['low'].astype('float64')
    idx = df['low'].idxmin()
    min_date = df.loc[idx]
    return min_date['datetime'], min_date['low']
def standard_sessions(start_date, end_date):
    """Return the datetimes corresponding to the trading sessions in a specified
    time period during which the stock market was officially open.

    Parameters
    ----------
    start_date: Datetime object.
        The datetime indicating the beginning of the trading time period.
    end_date (optional): Datetime object.
        The datetime indicating the ending of the trading time period.
    """
    qry = """SELECT p.datetime
             FROM prices AS p
             JOIN symbols as s
             ON p.symbol_id = s.id
             WHERE s.symbol='^GSPC'
             AND p.datetime >= '{}'
             AND p.datetime <= '{}'
          """.format(start_date, end_date)
    return pd.read_sql(qry, conn)
def query_sql(db_path, table_name):
    """
    Uses pandas to pull data from our sqlite database

    args
        db_path (str) : location of the database
        table_name (str) : name of the table we want

    returns
        data (pd.DataFrame)
    """
    print('Pulling data for table {} from {}'.format(table_name, db_path))
    # connect to our database
    conn = sqlite3.connect(db_path)
    # pull data by selecting the entire table
    data = pd.read_sql(sql='SELECT * from ' + str(table_name), con=conn)
    data.set_index('index', drop=True, inplace=True)
    # close the connection
    conn.close()
    return data
def matches():
    singles = pd.read_sql('select * from game where deleted = 0', con=engine)
    doubles = pd.read_sql('select * from doubles_game where deleted = 0', con=engine)
    tz = pytz.timezone('America/New_York')
    for frame in [singles, doubles]:
        frame['timestamp'] = frame['timestamp'].apply(datetime.utcfromtimestamp)
        frame['timestamp'] = frame['timestamp'].apply(datetime.replace, tzinfo=tz)
        frame['timestamp'] = frame['timestamp'].dt.strftime('%Y-%m-%d %H:%M:%S %Z')
    singles = singles.to_dict('records')
    doubles = doubles.to_dict('records')
    return render_template('gamelog.html', singles_games=singles, doubles_games=doubles)
def read(self, *fields, **conditionals):
    """
    Pull raw data into a dataframe. If no conditions are passed, pull the
    whole table. Otherwise, filter based on the conditions specified
    (currently only equality).
    """
    # Parse the list of fields to return
    field_query = self._parse_fields(*fields)
    # Parse the conditions
    conditionals_query = self._parse_conditionals(**conditionals)
    query = "SELECT {} FROM {} {}".format(field_query, self.table, conditionals_query)
    if "DISTINCT" not in query:
        query += " ORDER BY {}".format(self.timestamp_field)
    return pd.read_sql(query, self.db)
def test_write(self):
    """Test the tracking of an event."""
    tracker = pawprint.Tracker(db=db, table=table, schema={"id": "INT"})
    tracker.create_table()

    # Check the table's empty
    assert pd.io.sql.execute("SELECT COUNT(*) FROM {}".format(table), db).fetchall() == [(0,)]

    # Add some data and check if the row count increases by one
    tracker.write(id=1337)
    assert pd.io.sql.execute("SELECT COUNT(*) FROM {}".format(table), db).fetchall() == [(1,)]

    # Pull the data and ensure it's correct
    data = pd.read_sql("SELECT * FROM {}".format(table), db)
    assert isinstance(data, pd.DataFrame)
    assert len(data.columns) == 1
    assert data.columns[0] == "id"
    assert data.id[0] == 1337
def issues_with_close(self, repoid):
    """
    How long on average each week it takes to close an issue

    :param repoid: The id of the project in the projects table. Use repoid() to get this.
    :return: DataFrame with issues/day
    """
    issuesSQL = s.sql.text("""
        SELECT issues.id as "id",
               issues.created_at as "date",
               DATEDIFF(closed.created_at, issues.created_at) AS "days_to_close"
        FROM issues
        JOIN
            (SELECT * FROM issue_events
             WHERE issue_events.action = "closed") closed
        ON issues.id = closed.issue_id
        WHERE issues.repo_id = :repoid""")
    return pd.read_sql(issuesSQL, self.db, params={"repoid": str(repoid)})
def committer_locations(self, repoid):
    """
    Return committers and their locations

    @todo: Group by country code instead of users, needs the new schema

    :param repoid: The id of the project in the projects table.
    :return: DataFrame with users and locations sorted by commits
    """
    rawContributionsSQL = s.sql.text("""
        SELECT users.login, users.location, COUNT(*) AS "commits"
        FROM commits
        JOIN project_commits
        ON commits.id = project_commits.commit_id
        JOIN users
        ON users.id = commits.author_id
        WHERE project_commits.project_id = :repoid
        AND LENGTH(users.location) > 1
        GROUP BY users.id
        ORDER BY commits DESC
    """)
    return pd.read_sql(rawContributionsSQL, self.db, params={"repoid": str(repoid)})
def issue_response_time(self, repoid):
    """
    How long it takes for issues to be responded to by people who have
    commits associated with the project

    :param repoid: The id of the project in the projects table.
    :return: DataFrame with the issues' id, the date it was opened, and the date it was first responded to
    """
    issuesSQL = s.sql.text("""
        SELECT issues.created_at AS "created_at",
               MIN(issue_comments.created_at) AS "responded_at"
        FROM issues
        JOIN issue_comments
        ON issue_comments.issue_id = issues.id
        WHERE issue_comments.user_id IN
            (SELECT users.id
             FROM users
             JOIN commits
             WHERE commits.author_id = users.id
             AND commits.project_id = :repoid)
        AND issues.repo_id = :repoid
        GROUP BY issues.id
    """)
    return pd.read_sql(issuesSQL, self.db, params={"repoid": str(repoid)})
def get_trip_stop_coordinates(self, trip_I):
    """ Get coordinates for a given trip_I

    Parameters
    ----------
    trip_I : int
        the integer id of the trip

    Returns
    -------
    stop_coords : pandas.DataFrame
        with columns "lats" and "lons"
    """
    query = """SELECT lat, lon
                FROM stop_times
                JOIN stops
                USING(stop_I)
                WHERE trip_I={trip_I}
                ORDER BY stop_times.seq""".format(trip_I=trip_I)
    stop_coords = pd.read_sql(query, self.conn)
    return stop_coords
def remove_dangling_shapes(db_conn):
    """
    Remove dangling entries from the shapes directory.

    Parameters
    ----------
    db_conn: sqlite3.Connection
        connection to the GTFS object
    """
    db_conn.execute(DELETE_SHAPES_NOT_REFERENCED_IN_TRIPS_SQL)
    SELECT_MIN_MAX_SHAPE_BREAKS_BY_TRIP_I_SQL = \
        "SELECT trips.trip_I, shape_id, min(shape_break) as min_shape_break, max(shape_break) as max_shape_break FROM trips, stop_times WHERE trips.trip_I=stop_times.trip_I GROUP BY trips.trip_I"
    trip_min_max_shape_seqs = pandas.read_sql(SELECT_MIN_MAX_SHAPE_BREAKS_BY_TRIP_I_SQL, db_conn)

    rows = []
    for row in trip_min_max_shape_seqs.itertuples():
        shape_id, min_shape_break, max_shape_break = row.shape_id, row.min_shape_break, row.max_shape_break
        if min_shape_break is None or max_shape_break is None:
            min_shape_break = float('-inf')
            max_shape_break = float('-inf')
        rows.append((shape_id, min_shape_break, max_shape_break))
    DELETE_SQL_BASE = "DELETE FROM shapes WHERE shape_id=? AND (seq<? OR seq>?)"
    db_conn.executemany(DELETE_SQL_BASE, rows)
    remove_dangling_shapes_references(db_conn)
def read_sql(sql, con, filePath, index_col=None, coerce_float=True, params=None,
             parse_dates=None, columns=None, chunksize=None):
    """
    Read SQL query or database table into a DataFrameModel.
    Provide a filePath argument in addition to the *args/**kwargs from
    pandas.read_sql and get a DataFrameModel.

    NOTE: The chunksize option is overridden to None always (for now).

    Reference:
    http://pandas.pydata.org/pandas-docs/version/0.18.1/generated/pandas.read_sql.html
    pandas.read_sql(sql, con, index_col=None, coerce_float=True,
                    params=None, parse_dates=None, columns=None, chunksize=None)

    :return: DataFrameModel
    """
    # TODO: Decide if chunksize is worth keeping and how to handle?
    df = pandas.read_sql(sql, con, index_col, coerce_float, params,
                         parse_dates, columns, chunksize=None)
    return DataFrameModel(df, filePath=filePath)
def _limit_and_df(self, query, limit, as_df=False):
    """adds a limit (limit==None := no limit) to any query and allows a return as pandas.DataFrame

    :param bool as_df: if set to True, results are returned as a pandas.DataFrame
    :param `sqlalchemy.orm.query.Query` query: SQL Alchemy query
    :param int limit: maximum number of results
    :return: query result of pyctd.manager.models.XY objects
    """
    if limit:
        query = query.limit(limit)

    if as_df:
        results = read_sql(query.statement, self.engine)
    else:
        results = query.all()

    return results
def sqlite2observations(filename='observations.db'):
    """
    Restore a database of observations.
    """
    con = db.connect(filename)
    df = pd.read_sql('select * from observations;', con)
    blank = empty_observation()
    result = df.values
    final_result = np.empty(result.shape[0], dtype=blank.dtype)
    # XXX-ugh, there has to be a better way.
    for i, key in enumerate(blank.dtype.names):
        final_result[key] = result[:, i + 1]

    to_convert = ['RA', 'dec', 'alt', 'az', 'rotSkyPos', 'moonAlt', 'sunAlt']

    for key in to_convert:
        final_result[key] = np.radians(final_result[key])

    return final_result
def fill(self):
    self.init_db(self.engine)
    df = pd.read_sql("select * from fundamental", self.engine).sort_values(['report_date', 'quarter'])
    df['trade_date'] = df['report_date'] = pd.to_datetime(df['report_date'])
    with click.progressbar(df.groupby('code'),
                           label='writing data',
                           item_show_func=lambda x: x[0] if x else None) as bar:
        bar.is_hidden = False
        for stock, group in bar:
            group = group.drop_duplicates(subset='trade_date', keep="last").set_index('trade_date')
            sessions = pd.date_range(group.index[0], group.index[-1])
            d = group.reindex(sessions, copy=False).fillna(method='pad')
            d.to_sql('full', self.engine, if_exists='append', index_label='trade_date')
def ReadSqlData(self, name, db):
    conn = create_engine('mysql://' + config.user + ':' + config.password + '@'
                         + config.ip + '/' + db + '?charset=utf8')
    x = 'select * from ' + name + ';'  # SQL statement
    return pandas.read_sql(x, con=conn)
def load_world(self, world_type):
    """For open world validation, we must keep track of which onion service
    a trace came from. However for closed world validation, we can select
    traces without consideration of which site they belong to.

    :returns: a pandas DataFrame df containing the dataset
    """
    select_hs_urls = ', t3.hs_url' if world_type == 'open' else ''
    labeled_query = ('select t1.*, t3.is_sd {} '
                     'from features.frontpage_features t1 '
                     'inner join raw.frontpage_examples t2 '
                     'on t1.exampleid = t2.exampleid '
                     'inner join raw.hs_history t3 '
                     'on t3.hsid = t2.hsid').format(select_hs_urls)
    df = pd.read_sql(labeled_query, self.engine)
    return df
def get_exampleids(self):
    """Get list of exampleids"""
    query = "SELECT DISTINCT exampleid FROM raw.frontpage_traces"
    df = pd.read_sql(query, self.engine)
    return df.exampleid.values
def get_ordered_trace_cells(self, exampleid):
    """Get trace for a given exampleid"""
    df = pd.read_sql("""SELECT ingoing, t_trace
                        FROM raw.frontpage_traces
                        WHERE exampleid={}
                        ORDER BY t_trace""".format(exampleid),
                     self.engine)
    return df
def update_live_data(download=False):
    def live_summary(live_setting_id):
        group_dict = {1: "μ's", 2: 'Aqours'}
        attr_dict = {1: 'Smile', 2: 'Pure', 3: 'Cool'}
        diff_dict = {1: 'Easy', 2: 'Normal', 3: 'Hard', 4: 'Expert', 6: 'Master'}
        setting = df_live_setting.loc[live_setting_id]
        track_info = df_live_track.loc[setting['live_track_id']]
        live_info = {
            'cover': cover_path(setting['live_icon_asset']),
            'name': track_info['name'],
            'group': group_dict[track_info['member_category']],
            'attr': attr_dict[setting['attribute_icon_id']],
            'note_number': int(setting['s_rank_combo']),
            'diff_level': diff_dict[setting['difficulty']],
            'diff_star': int(setting['stage_level']),
            'file_dir': live_path(setting['notes_setting_asset'])
        }
        return live_info

    print('Downloading latest live.db_')
    opener = urllib.request.URLopener()
    opener.addheader('User-Agent', 'whatever')
    opener.retrieve(live_db_download_url, live_db_dir)
    print('Generating basic live stats')
    conn = sqlite3.connect(live_db_dir)
    df_live_track = pd.read_sql('SELECT * FROM live_track_m', con=conn, index_col='live_track_id')
    df_live_setting = pd.read_sql('SELECT * FROM live_setting_m', con=conn, index_col='live_setting_id')
    # live_data = [live_summary(live_setting_id) for live_setting_id, row in df_live_setting.iterrows() if row['difficulty'] != 5]
    live_data = [live_summary(live_setting_id) for live_setting_id, row in df_live_setting.iterrows()
                 if row['difficulty'] != 5 and live_setting_id != 10779]
    with open(live_archive_dir, 'w') as fp:
        json.dump(live_data, fp)
    print('Basic live data has been saved in', live_archive_dir)
def demand_daily_data(db, rows=[], feature='', function='lag',
                      unique=['ToiletID', 'Collection_Date'], conditions=None):
    """
    A function to generate by-day variables for a feature

    Args:
        DICT DB          Connection object (see grab_collections_data)
        LIST ROWS        List of rows
        STR FEATURE      A feature name to create daily records for
        STR FUNCTION     Apply either the LAG or LEAVE function (in the future, maybe some other functions)
        LIST UNIQUE      List of unique identifiers
        STR CONDITIONS   Apply the conditions string (see grab_collections_data)
    Returns:
        DF DAILY_DATA    Pandas data frame of daily variables
    """
    # Reprocess the unique list to account for capitalization
    unique = ','.join(['"%s"' % (uu) for uu in unique])

    # Construct the sql statement using window functions (e.g., OVER and LAG/LEAVE)
    statement = 'SELECT %s' % (unique)
    for rr in rows:
        statement += ', %s("%s", %i, NULL) OVER(order by %s) as "%s_%s%i" ' % (
            function, feature, rr, unique, feature, function, rr)

    # Complete the statement
    statement += "FROM %s.%s %s ORDER BY %s" % (db['database'], db['table'], conditions, unique)

    # Execute the statement
    daily_data = pd.read_sql(statement, con=db['connection'], coerce_float=True, params=None)

    # Return the lagged/leave data
    return(daily_data)
def grab_from_features_and_labels(db, fold, config):
    """
    A function that subsets the features df and labels df stored in Postgres
    into train and test features and labels, based on the fold info
    (train start, train end, test start, test end).

    Args
        DICT FOLD  start and end date for both train and test set, in the
                   format {"train": (start, end), "test": (start, end)}
    Returns
        df features train
        df labels train
        df features test
        df labels test
    """
    RESPONSE_RENAMER = {'response_f': 'response', 'response_u': 'response'}

    dataset = pd.read_sql(
        'select * from modeling.dataset where (("Collection_Date" >= ' + "'" +
        fold['train_start'].strftime('%Y-%m-%d') + "'" +
        ') and ("Collection_Date" <= ' + "'" +
        fold['test_end'].strftime('%Y-%m-%d') + "'" + '))',
        db['connection'], coerce_float=True, params=None)
    toilet_routes = pd.read_sql('select * from modeling.toilet_route',
                                db['connection'], coerce_float=True, params=None)

    # TODO: Fix this...
    dataset = dataset.fillna(0)  # A hack to make it run for now...

    # Drop the toilets that do not have contiguous data.
    # Note that missing collections are filled with NaN'd rows, so if a toilet is not contiguous,
    # it must mean that it appeared or disappeared during the fold period -> ignore it.
    toilet_groups = dataset.groupby(config['cols']['toiletname'])
    toilets = dataset[config['cols']['toiletname']].unique()
    number_of_days = max(toilet_groups.size())
    contiguous_toilets = [t for t in toilets if (toilet_groups.size()[t] == number_of_days)]
    dataset = dataset.loc[dataset[config['cols']['toiletname']].isin(contiguous_toilets)]

    # Sort for the purposes of later functions...
    dataset = dataset.sort_values(by=['Collection_Date', 'ToiletID'])

    features_train = dataset.loc[
        (dataset['Collection_Date'] >= fold["train_start"]) &
        (dataset['Collection_Date'] <= fold["train_end"])].drop(
            ['response_f', 'response_u',
             config['Xy']['response_f']['variable'],
             config['Xy']['response_u']['variable']], axis=1)
    features_test = dataset.loc[
        (dataset['Collection_Date'] >= fold["test_start"]) &
        (dataset['Collection_Date'] <= fold["test_end"])].drop(
            ['response_f', 'response_u',
             config['Xy']['response_f']['variable'],
             config['Xy']['response_u']['variable']], axis=1)
    labels_train_u = dataset.loc[
        (dataset['Collection_Date'] >= fold["train_start"]) &
        (dataset['Collection_Date'] <= fold["train_end"]),
        ['response_u', 'Collection_Date', 'ToiletID']].rename(columns=RESPONSE_RENAMER)
    labels_train_f = dataset.loc[
        (dataset['Collection_Date'] >= fold["train_start"]) &
        (dataset['Collection_Date'] <= fold["train_end"]),
        ['response_f', 'Collection_Date', 'ToiletID']].rename(columns=RESPONSE_RENAMER)
    labels_test_f = dataset.loc[
        (dataset['Collection_Date'] >= fold["test_start"]) &
        (dataset['Collection_Date'] <= fold["test_end"]),
        ['response_f', 'Collection_Date', 'ToiletID']].rename(columns=RESPONSE_RENAMER)
    labels_test_u = dataset.loc[
        (dataset['Collection_Date'] >= fold["test_start"]) &
        (dataset['Collection_Date'] <= fold["test_end"]),
        ['response_u', 'Collection_Date', 'ToiletID']].rename(columns=RESPONSE_RENAMER)

    return(features_train, labels_train_f, labels_train_u,
           features_test, labels_test_f, labels_test_u, toilet_routes)
def pickle_vis_data_pandas():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()

    df = pd.read_sql('select source_article_id, target_article_id, target_y_coord_1920_1080, target_x_coord_1920_1080, visual_region from link_features', conn)
    print(len(df))

    no_dup = df.sort_values(['source_article_id', 'target_y_coord_1920_1080', 'target_x_coord_1920_1080']).groupby(["source_article_id", "target_article_id"]).first()
    print(len(no_dup))

    feature = no_dup.loc[no_dup['visual_region'] == 'lead']
    print(len(feature))
    feature.reset_index(inplace=True)

    feature = no_dup.loc[no_dup['visual_region'] == 'infobox']
    print(len(feature))
    feature.reset_index(inplace=True)
    feature[['source_article_id', 'target_article_id']].to_csv('/home/ddimitrov/tmp/infobox.tsv', sep='\t', index=False)

    feature = no_dup.loc[no_dup['visual_region'] == 'navbox']
    print(len(feature))
    feature.reset_index(inplace=True)
    feature[['source_article_id', 'target_article_id']].to_csv('/home/ddimitrov/tmp/navbox.tsv', sep='\t', index=False)

    feature = no_dup.loc[no_dup['visual_region'] == 'left-body']
    print(len(feature))
    feature.reset_index(inplace=True)
    feature[['source_article_id', 'target_article_id']].to_csv('/home/ddimitrov/tmp/left-body.tsv', sep='\t', index=False)

    feature = no_dup.loc[no_dup['visual_region'] == 'body']
    print(len(feature))
    feature.reset_index(inplace=True)
    feature[['source_article_id', 'target_article_id']].to_csv('/home/ddimitrov/tmp/body.tsv', sep='\t', index=False)
def get_redirecsfromXML(self, dump_date):
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    df = pd.read_sql('select * from redirects', conn)
    return df.set_index('source_article_name')['target_article_name'].to_dict()
def pickle_correlations_zeros():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()

    print('read')
    df = pd.read_sql('select source_article_id, target_article_id, IFNULL(counts, 0) as counts from link_features group by source_article_id, target_article_id', conn)
    print('group')
    article_counts = df.groupby(by=["target_article_id"])['counts'].sum().reset_index()
    print('write to file')
    article_counts[["target_article_id", "counts"]].to_csv(TMP + 'article_counts.tsv', sep='\t', index=False)