We extracted the following 50 code examples from Python open source projects to illustrate how to use pandas.read_csv().
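Before the project examples, here is a minimal, self-contained sketch of a typical pandas.read_csv() call. The file name data.csv and its columns are hypothetical; the keyword arguments shown are standard read_csv parameters.

import pandas as pd

# Minimal sketch (hypothetical file): read a CSV file into a DataFrame.
df = pd.read_csv(
    "data.csv",            # path, URL, or file-like object
    sep=",",               # field delimiter (default ",")
    header=0,              # row number to use as column names
    index_col=0,           # use the first column as the index
    parse_dates=True,      # try to parse the index as dates
    na_values=["NA", ""],  # additional strings to treat as NaN
)
print(df.head())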
def calc_word_sim(model, eval_file):
    df = pd.read_csv(eval_file, sep=',', header=0)  # eval dataset
    col1, col2, score = df.columns.values
    model_vocab = model.vocab.keys()
    ground = []
    sys = []
    for idx, row in df.iterrows():
        if row[col1] in model_vocab and row[col2] in model_vocab:
            ground.append(float(row[score]))
            sys.append(model.similarity(row[col1], row[col2]))
    # compute Spearman's rank correlation coefficient
    # (https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient)
    print(sys)
    # import pdb; pdb.set_trace()
    corr, p_val = stats.spearmanr(sys, ground)
    logger.info("# of pairs found: %s / %s" % (len(ground), len(df)))
    logger.info("correlation: %s" % corr)
    return corr, p_val
def draw(path, srv):
    filename = os.path.join(path, srv["preprocessed_filename"])
    df = pd.read_csv(filename, sep="\t", index_col='time', parse_dates=True)
    bins = defaultdict(list)
    for i, col in enumerate(df.columns):
        serie = df[col].dropna()
        if pd.algos.is_monotonic_float64(serie.values, False)[0]:
            serie = serie.diff()[1:]
        p_value = adfuller(serie, autolag='AIC')[1]
        if math.isnan(p_value):
            continue
        nearest = 0.05 * round(p_value / 0.05)
        bins[nearest].append(serie)
    for bin, members in bins.items():
        series = [serie.name for serie in members]
        if len(members) <= 10:
            columns = series
        else:
            columns = random.sample(series, 10)
        subset = df[columns]
        name = "%s_adf_confidence_%.2f.png" % (srv["name"], bin)
        print(name)
        axes = subset.plot(subplots=True)
        plt.savefig(os.path.join(path, name))
        plt.close("all")
def test_addepar2frame(self):
    r = {'meta': {'columns': [{'key': 'node_id', 'display_name': 'Entity ID', 'output_type': 'Word'},
                              {'key': '_custom_13_custodian_name_166730', 'display_name': '15. Custodian Name', 'output_type': 'Word'},
                              {'key': '_custom_15_reference_currency_165485', 'display_name': '17. Reference Currency', 'output_type': 'Currency'},
                              {'key': '_custom_16_lwm_risk_profile_114480', 'display_name': '18. LWM Risk Profile', 'output_type': 'Word'},
                              {'key': '_custom_23_lwm_aum_type_293536', 'display_name': '23. LWM - AUM Type', 'output_type': 'Word'},
                              {'key': 'inception_event_date', 'display_name': 'Inception Date', 'output_type': 'Date'}],
                  'groupings': [{'key': 'top_level_owner', 'display_name': 'Top Level Owner'}]},
         'data': {'type': 'portfolio_views',
                  'attributes': {'total': {'name': 'Total',
                                           'columns': {'_custom_15_reference_currency_165485': None,
                                                       'inception_event_date': '2013-12-31',
                                                       '_custom_23_lwm_aum_type_293536': None,
                                                       '_custom_16_lwm_risk_profile_114480': None,
                                                       '_custom_13_custodian_name_166730': None,
                                                       'node_id': None},
                                           'children': [{'entity_id': 1146188, 'name': 'A', 'grouping': 'top_level_owner',
                                                         'columns': {'_custom_15_reference_currency_165485': 'CHF',
                                                                     'inception_event_date': '2016-10-31',
                                                                     '_custom_23_lwm_aum_type_293536': 'LWM Consolidation Only',
                                                                     '_custom_16_lwm_risk_profile_114480': 'Balanced',
                                                                     '_custom_13_custodian_name_166730': 'X',
                                                                     'node_id': 1146188},
                                                         'children': []},
                                                        {'entity_id': 1231399, 'name': 'B', 'grouping': 'top_level_owner',
                                                         'columns': {'_custom_15_reference_currency_165485': 'CHF',
                                                                     'inception_event_date': '2016-09-21',
                                                                     '_custom_23_lwm_aum_type_293536': 'LWM Consolidation Only',
                                                                     '_custom_16_lwm_risk_profile_114480': 'Balanced',
                                                                     '_custom_13_custodian_name_166730': 'Y',
                                                                     'node_id': 1231399},
                                                         'children': []},
                                                        {'entity_id': 1511499, 'name': 'C', 'grouping': 'top_level_owner',
                                                         'columns': {'_custom_15_reference_currency_165485': 'CHF',
                                                                     'inception_event_date': '2017-03-31',
                                                                     '_custom_23_lwm_aum_type_293536': 'LWM Consolidation Only',
                                                                     '_custom_16_lwm_risk_profile_114480': 'Conservative',
                                                                     '_custom_13_custodian_name_166730': 'Z',
                                                                     'node_id': 1511499},
                                                         'children': []}]}},
                  'links': {'self': '/v1/portfolio_views/null'}}}
    pdt.assert_frame_equal(addepar2frame(r),
                           pd.read_csv("/pyaddepar/test/resources/frame.csv", parse_dates=True),
                           check_dtype=False)
def generate_vocabulary(self, review_summary_file):
    """
    :param review_summary_file:
    :return:
    """
    self.rev_sum_pair = pd.read_csv(review_summary_file, header=0).values
    for review, summary in self.rev_sum_pair:
        rev_lst = wordpunct_tokenize(review)
        sum_lst = wordpunct_tokenize(summary)
        self.__add_list_to_dict(rev_lst)
        self.__add_list_to_dict(sum_lst)
    # Now store the "" empty string as the last word of the vocabulary
    self.map[""] = len(self.map)
    self.revmap[len(self.map)] = ""
def plot_csv(stock_data, symbol):
    """
    params:
        - stock_data(list) : list of dict objects containing stock data
        - symbol(str) : stock symbol; also used as the output file name
    """
    try:
        df = pd.read_csv('{}.csv'.format(symbol))
    except:
        write_to_csv(stock_data, symbol)
        df = pd.read_csv('{}.csv'.format(symbol))
    p1 = figure(x_axis_type="datetime", title="Stock Closing Price")
    p1.grid.grid_line_alpha = 0.3
    p1.xaxis.axis_label = 'Date'
    p1.yaxis.axis_label = 'Price'
    p1.line(datetime(list(df['date'])), list(df['close']), color='#A6CEE3', legend=symbol)
    output_file("{}.html".format(symbol), title="Stock Closing Prices")
    show(p1)  # open a browser
def fix_columns(df):
    """
    Changes DataFrame in-place
    """
    # Convert all string columns to str to avoid a PerformanceWarning
    for col in _STRING_COLUMNS:
        if col not in df:
            continue
        df[col].fillna('', inplace=True)
        df[col] = df[col].astype('str')
        # Empty strings have been set to NaN by read_csv. Replacing
        # by the empty string avoids problems with groupby, which
        # ignores NaN values.

    # Columns that have any NaN values in them cannot be converted to
    # int due to a numpy limitation.
    for col in _INTEGER_COLUMNS:
        if col not in df.columns:
            continue
        if all(df[col].notnull()):
            df[col] = df[col].astype(int)
def main(args):
    if args.minimum_frequency is None:
        minimum_frequency = max((len(args.tables) + 1) // 2, 2)
    else:
        minimum_frequency = args.minimum_frequency
    logger.info('Minimum frequency set to %s', minimum_frequency)

    # Read in tables
    tables = []
    for path in args.tables:
        table = pd.read_csv(path, sep='\t')
        table = table[table.database_diff >= args.minimum_db_diff]
        table = table.dropna()
        tables.append(table)
        if len(table) == 0:
            logger.warn('Table read from %r is empty after filtering out sequences with database diff >= %s.',
                        path, args.minimum_db_diff)

    # Count V sequence occurrences
    counter = Counter()
    for table in tables:
        counter.update(set(table.consensus))

    # Find most frequent occurrences and print result
    print('count', 'gene', 'database_diff', 'sequence', 'names', sep='\t')
    for sequence, frequency in counter.most_common():
        if frequency < minimum_frequency:
            break
        names = []
        gene = None
        for table in tables:
            matching_rows = table[table.consensus == sequence]
            if matching_rows.empty:
                continue
            names.extend(matching_rows.name)
            if gene is None:
                row = matching_rows.iloc[0]
                gene = row.gene
                database_diff = row.database_diff
                # shm = row['V_SHM']
        print(frequency, gene, database_diff, sequence, *names, sep='\t')
def count_full_text_occurrences(candidates, table_path, other_gene, other_errors, merge, min_count):
    # Use only records that have a chance of reaching the required min_count
    records = {info.sequence: info for info in candidates if info.max_count >= min_count}

    # Count full-text occurrences in the genomic_sequence, circumventing
    # inaccurate IgBLAST alignment boundaries
    # TODO limit the search to the gene region (especially for D genes)
    # Speed up search by looking for most common sequences first
    search_order = sorted(records, key=lambda s: records[s].max_count, reverse=True)
    cols = [other_gene, 'V_errors', 'J_errors', 'CDR3_nt', 'genomic_sequence']
    for chunk in pd.read_csv(table_path, usecols=cols, chunksize=10000, sep='\t'):
        chunk = chunk[chunk[other_errors] == 0]
        for row in chunk.itertuples():
            for needle in search_order:
                if needle in row.genomic_sequence:
                    record = records[needle]
                    record.count += 1
                    record.other_genes.add(getattr(row, other_gene))
                    record.cdr3s.add(row.CDR3_nt)
                    if merge:
                        break
    return records.values()
def main(args):
    n = 0
    first = True
    written = 0
    stats = FilteringStatistics()
    for chunk in pd.read_csv(args.table, chunksize=10000, sep='\t'):
        fix_columns(chunk)
        n += len(chunk)
        filtered, chunk_stats = filtered_table(chunk,
                                               v_gene_coverage=args.v_coverage,
                                               j_gene_coverage=args.j_coverage,
                                               v_gene_evalue=args.v_evalue)
        stats += chunk_stats
        print(filtered.to_csv(sep='\t', index=False, header=first), end='')
        first = False
        written += len(filtered)

    logger.info('%s rows in input table', stats.n)
    logger.info('%s rows have both V and J assignment', stats.vjassigned)
    logger.info('%s of those do not have a stop codon', stats.stop)
    logger.info('%s of those have an E-value of at most %s', stats.v_evalue, args.v_evalue)
    logger.info('%s of those cover the V gene by at least %s%%', stats.v_coverage, args.v_coverage)
    logger.info('%s of those cover the J gene by at least %s%%', stats.j_coverage, args.j_coverage)
    logger.info('%d rows written', written)
def get_treasury_data(start_date, end_date):
    return pd.read_csv(
        "http://www.federalreserve.gov/datadownload/Output.aspx"
        "?rel=H15"
        "&series=bf17364827e38702b42a58cf8eaa3f78"
        "&lastObs="
        "&from="  # An unbounded query is ~2x faster than specifying dates.
        "&to="
        "&filetype=csv"
        "&label=omit"
        "&layout=seriescolumn"
        "&type=package",
        skiprows=1,  # First row is a useless header.
        parse_dates=['Time Period'],
        na_values=['ND'],  # Presumably this stands for "No Data".
        index_col=0,
    ).loc[
        start_date:end_date
    ].dropna(
        how='all'
    ).rename(
        columns=parse_treasury_csv_column
    ).tz_localize('UTC') * 0.01  # Convert from 2.57% to 0.0257.
def storageindex(self):
    # get the file list
    onlyfiles = [f for f in listdir(self.indexdata) if isfile(join(self.indexdata, f))]
    # read each file using pandas
    for f in onlyfiles:
        df = pd.read_csv(self.indexdata + "/" + f)
        s = f.split('.')
        name = s[0][2:8]
        records = json.loads(df.T.to_json()).values()
        for row in records:
            row['date'] = datetime.datetime.strptime(row['date'], "%Y-%m-%d")
        print(name)
        self.index[name].insert_many(records)

# store the stock pool into the database
def load_names_data():
    fp = os.path.join(tempfile.gettempdir(), ZIP_NAME)
    if not os.path.exists(fp):
        r = requests.get(URL_NAMES)
        with open(fp, 'wb') as f:
            f.write(r.content)
    post = collections.OrderedDict()
    with zipfile.ZipFile(fp) as zf:
        # get ZipInfo instances
        for zi in sorted(zf.infolist(), key=lambda zi: zi.filename):
            fn = zi.filename
            if fn.startswith('yob'):
                year = int(fn[3:7])
                df = pd.read_csv(
                    zf.open(zi),
                    header=None,
                    names=('name', 'gender', 'count'))
                df['year'] = year
                post[year] = df
    df = pd.concat(post.values())
    df.set_index('name', inplace=True, drop=True)
    return df
def read_data(fname):
    """ Read football-data.co.uk csv """
    data = (
        pd.read_csv(fname)
        .rename(columns={
            'HomeTeam': 'home_team',
            'AwayTeam': 'away_team',
            'FTHG': 'home_goals',
            'FTAG': 'away_goals',
        })
        .loc[lambda df: ~pd.isnull(df['home_goals'])]  # Remove future games
    )
    team_map = stan_map(pd.concat([data['home_team'], data['away_team']]))
    data['home_team_id'] = data['home_team'].replace(team_map)
    data['away_team_id'] = data['away_team'].replace(team_map)
    for col in ('home_goals', 'away_goals'):
        data[col] = [int(c) for c in data[col]]
    return data, team_map
def cluster_map_sheet_pre():
    print("------ load cluster_map data ----------")
    cluster_map_sheet_path = os.path.join(LOAD_DATA_DIR, CONCRETE_DIR, CLUSTER_MAP_SHEET_DIR)
    print("load data from: ", cluster_map_sheet_path)
    save_path = os.path.join(SAVE_DATA_DIR, CONCRETE_DIR, CLUSTER_MAP_SHEET_DIR)
    print("save data to: ", save_path)
    file = "cluster_map"
    cluster_sheet = os.path.join(cluster_map_sheet_path, file)
    data = pd.read_csv(cluster_sheet, header=None)
    data.columns = ["raw"]
    data["district_hash"] = data["raw"].map(lambda x: x.split("\t")[0])
    data["district_map"] = data["raw"].map(lambda x: x.split("\t")[1])
    del data["raw"]
    save_df_to_file(data, save_path, file)

# handle the order_info sheet
def create_hash_district_map_dict():
    file = "cluster_map.csv"
    district_hash_map_path = os.path.join(DATA_DIR, CONCRETE_DIR, CLUSTER_MAP_SHEET_DIR, file)
    hash_data = pd.read_csv(district_hash_map_path)
    ## convert the dataframe into a dict
    hash_map_rule = dict(zip(hash_data.district_hash, hash_data.district_map))
    # print(type(hash_map_rule))
    saved_file = "cluster_map.pickle"
    map_save_file = os.path.join(DATA_DIR, CONCRETE_DIR, CLUSTER_MAP_SHEET_DIR, saved_file)
    ## save into the same dir as the source file
    with open(map_save_file, "wb") as f:
        pickle.dump(hash_map_rule, f)
    # print(hash_map_rule)

# map the district features in the input data_frame into value
def test_prepare_dataset(self, fetch, chamber_of_deputies):
    """
    * Rename columns.
    * Make `document_type` a category column.
    * Rename values for `category`.
    * Create `is_party_expense` column.
    """
    dataset = self.subject.dataset
    self.assertTrue(set(ADAPTER_COLUMNS.keys()).issubset(set(dataset.columns)))

    document_types = ['bill_of_sale', 'simple_receipt', 'expense_made_abroad']
    self.assertEqual(document_types, dataset['document_type'].cat.categories.tolist())

    fixture = pd.read_csv(os.path.join(self.fixtures_path, 'reimbursements.xz'))
    meal_rows = fixture \
        .query('subquota_description == "Congressperson meal"').index
    self.assertEqual(['Meal'], dataset.loc[meal_rows, 'category'].unique().tolist())

    party_expense_rows = fixture[fixture['congressperson_id'].isnull()].index
    self.assertEqual([True],
                     dataset.loc[party_expense_rows, 'is_party_expense'].unique().tolist())
def _load_sets(self):
    print("Loading datasets")
    train_patients = pd.read_csv("data/stage1/" + "stage1_labels.csv")

    for idx, row in train_patients.iterrows():
        if self._check_sample_exists(row['id']):
            self._test_set.append(row['id'])

    for idx, row in train_patients.iterrows():
        if self._check_sample_exists(row['id']):
            self._train_set.append([row['id'], row['cancer']])

    # Create permutation for random loading
    self.shuffle()
    print("Loading datasets: Done!")
def _load_sets(self):
    print("Loading datasets")
    train_patients = pd.read_csv(os.path.join(self._directory, "stage1_labels.csv"))
    test_patients = pd.read_csv(os.path.join(self._directory, "stage1_sample_submission.csv"))

    for idx, row in test_patients.iterrows():
        self._test_set.append(row['id'])

    for idx, row in train_patients.iterrows():
        self._train_set.append([row['id'], row['cancer']])

    # Create permutation for random loading
    self.shuffle()
    print("Loading datasets: Done!")
def doctable(ctx):
    df = pd.read_csv('./docs/flight-options.csv')

    # open an existing document
    doc = docx.Document('./docs/style-reference.docx')

    as_int = partial(format_decimal, format='#')
    as_usd = partial(format_currency, currency='USD')

    s = doc.sections[0]
    width = s.page_width - s.left_margin - s.right_margin

    doc.add_picture('./docs/diagrams_002.png', width=width)

    formatters = {
        'ticket_price': as_usd,
        'total_hours': as_int,
        'trip': as_int,
        'airline': partial(shorten_long_name, width=20),
        'selected': compose({0: 'No', 1: 'Yes'}.get, int),
    }

    add_table(df, doc, table_style='Plain Table 3', formatters=formatters)

    # save the doc
    doc.save('./docs/test.docx')
def eval(flags):
    name = flags.pred_path
    yp = pd.read_csv(name)
    classes = len([i for i in yp.columns.values if 'class' in i])
    yp = yp[['class%d' % i for i in range(1, classes + 1)]].values
    myDB = personalDB(flags, name="full")
    if "stage1" in name:
        y = myDB.data['test_variants_filter']['Class'] - 1
    else:
        myDB.get_split()
        va = myDB.split[flags.fold][1]
        y = np.argmax(myDB.y[va], axis=1)
    if np.max(y) > classes:
        y = np.argmax(to4c(onehot_encode(y)), axis=1)
    score = cross_entropy(y, yp)
    print(name, score, '\n')
def eval(name, clip=False, bar=0.9):
    base = pd.read_csv('../input/stage1_solution_filtered.csv')
    base['Class'] = np.argmax(base[['class%d' % i for i in range(1, 10)]].values, axis=1)
    sub = pd.read_csv(name)
    # sub = pd.merge(sub, base[['ID', 'Class']], on="ID", how='right')
    # print(sub.head())
    y = base['Class'].values
    yp = sub[['class%d' % i for i in range(1, 10)]].values
    if clip:
        yp = np.clip(yp, (1.0 - bar) / 8, bar)
        yp = yp / np.sum(yp, axis=1).reshape([yp.shape[0], 1])
    print(name, cross_entropy(y, yp), multiclass_log_loss(y, yp))
    for i in range(9):
        y1 = y[y == i]
        yp1 = yp[y == i]
        print(i, y1.shape, cross_entropy(y1, yp1), multiclass_log_loss(y1, yp1))
def post(self):
    if self.flags.task == "test_cnn_stage1":
        docs = self.DB.clean_doc['test_text_filter']
    elif self.flags.task == "test_cnn_stage2":
        docs = self.DB.clean_doc['stage2_test_text']
    else:
        self.mDB.get_split()
        docs = self.mDB.split[self.flags.fold][1]
    nrows = len(docs)
    p = np.zeros([nrows, 9])
    for i in range(self.flags.epochs):
        if i == 0:
            skiprows = None
        else:
            skiprows = nrows * i
        p = p + (pd.read_csv(self.flags.pred_path, header=None, nrows=nrows, skiprows=skiprows).values)
    p = p / self.flags.epochs
    if '_cv' in self.flags.task:
        from utils.np_utils.utils import cross_entropy
        y = np.argmax(self.mDB.y, axis=1)
        print("cross entropy", cross_entropy(y[self.mDB.split[self.flags.fold][1]], p))
    s = pd.DataFrame(p, columns=['class%d' % i for i in range(1, 10)])
    s['ID'] = np.arange(nrows) + 1
    s.to_csv(self.flags.pred_path.replace(".csv", "_sub.csv"), index=False, float_format="%.5f")
def post_cv(flags):
    import re
    import os
    path = flags.data_path
    files = [i for i in os.listdir(path) if len(re.findall('cv_[0-9].csv', i))]
    s = []
    for name in files:
        s.append(pd.read_csv("%s/%s" % (path, name)))
    s = pd.concat(s, axis=0)
    print(s.head())
    classes = len([i for i in s.columns.values if 'class' in i])
    from utils.np_utils.utils import cross_entropy
    yp = s[['class%d' % i for i in range(1, classes + 1)]].values
    y = s['real'].values
    print(cross_entropy(y, yp))
    s.to_csv("%s/cv.csv" % path, index=False)
def replace(s, n):
    seen = pd.read_csv(s)
    unseen = pd.read_csv(n)
    te = pd.read_csv('../input/stage2_test_variants.csv')
    tr = pd.read_csv('../input/training_variants')
    unseen = pd.merge(unseen, te, on='ID', how='right')
    seen = pd.merge(seen, te, on='ID', how='right')
    mask = seen.Gene.isin(tr.Gene)
    cols = ['class%d' % i for i in range(1, 10)]
    seen.loc[~mask, cols] = 0
    mask = unseen.Gene.isin(tr.Gene)
    unseen.loc[mask, cols] = 0
    assert (unseen['ID'] == seen['ID']).all()
    seen[cols] = seen[cols] + unseen[cols]
    seen[cols + ['ID']].to_csv('mix.csv', index=False)
def test2():
    s1 = pd.read_csv('../input/test_variants')
    s3 = pd.read_csv('../input/test_variants_filter')
    s1 = pd.merge(s1, s3[['ID', 'Class']], on='ID', how='left').fillna(1)
    s2 = pd.read_csv('../input/stage2_test_variants.csv')
    s1 = pd.merge(s1, s2, on=["Gene", "Variation"], how='inner')
    s1['ID'] = s1['ID_y']
    s2 = pd.merge(s1[['ID', 'Class']], s2, on='ID', how='right').fillna(1)
    yp = onehot_encode(s2['Class'].values - 1)
    for i in range(1, 10):
        s2['class%d' % i] = yp[:, i - 1]
    cols = ['class%d' % i for i in range(1, 10)]
    mask = s2['ID'].isin(s1['ID_y'])
    s2.loc[~mask, cols] = 0.1
    s2['ID'] = s2['ID'].astype(int)
    cols = ['ID'] + ['class%d' % i for i in range(1, 10)]
    s2[cols].to_csv('sub.csv', index=False)
def x_label(feature_path, pred=False):
    X_list = []
    for each in feature_path:
        X = pd.read_csv(feature_paths.format(str(each)))
        X_list.append(X)
    X = pd.DataFrame(pd.concat(X_list, axis=0)).reset_index().drop('index', axis=1)
    if not pred:
        y = X[power_consumption].tolist()
        X = X.drop([record_date, user_id, power_consumption], axis=1)
        columns = X.columns
        X = X.values
        return X, y, columns
    else:
        X = X.drop([record_date, user_id], axis=1)
        columns = X.columns
        X = X.values
        return X, columns
def neighbors():
    """
    Read the neighbors for each country.
    """
    neighbors_csv = pd.read_csv(csv_path("mledoze-countries.csv"), sep=';', usecols=[4, 17])
    neighbors_csv.columns = ["Code", "neighbors"]
    neighbors_csv["neighbors"] = neighbors_csv["neighbors"].str.split(',')
    for row in neighbors_csv.loc[neighbors_csv.neighbors.isnull(), 'neighbors'].index:
        neighbors_csv.at[row, 'neighbors'] = []
    # Island nations are a weird exception
    neighbors_csv.loc[neighbors_csv.Code == "MDG", "neighbors"] = [["MOZ", "ZAF", "TZA"]]
    neighbors_csv.loc[neighbors_csv.Code == "TWN", "neighbors"] = [["CHN", "PHL"]]
    neighbors_csv.loc[neighbors_csv.Code == "AUS", "neighbors"] = [["NZL"]]
    neighbors_csv.loc[neighbors_csv.Code == "NZL", "neighbors"] = [["AUS"]]
    neighbors_csv.loc[neighbors_csv.Code == "JPN", "neighbors"] = [["TWN", "KOR", "PHL"]]
    neighbors_csv.loc[neighbors_csv.Code == "PHL", "neighbors"] = [["TWN", "KOR", "JPN"]]
    neighbors_csv.loc[neighbors_csv.Code == "PRI", "neighbors"] = [["DOM"]]
    neighbors_csv.loc[neighbors_csv.Code == "SGP", "neighbors"] = [["MYS", "IDN"]]
    neighbors_csv.loc[neighbors_csv.Code == "JAM", "neighbors"] = [["CUB", "DOM"]]
    return neighbors_csv
def loadFile(fileName):
    # checkFileName validates the input and returns -1 for an invalid file name
    outputFileName = checkFileName(fileName)
    if outputFileName != -1:
        df = pandas.read_csv(outputFileName)
        content = df["Content"]
        title = df["Title"]
        company = df["Company"]
        print(company)
        print("csv File Load Success")
    else:
        print("Error csv File")

# checkFileName: returns -1 if the file name does not match an existing CSV file,
# otherwise returns the name of the CSV file to load
def loadFile(fileName, analyzeValue):
    # checkFileName validates the input and returns -1 for an invalid file name
    outputFileName = checkFileName(fileName)
    if outputFileName != -1:
        df = pandas.read_csv(outputFileName)
        content = df["Content"]
        title = df["Title"]
        company = df["Company"]
        print("csv File Load Success")
        if analyzeValue == 1:
            # analyze(title)
            analyze(content)
    else:
        print("Error csv File")
def filter_data(csv_file, start_day=28, end_day=90, interest=780, state=None, **kwargs):
    f = pd.read_csv(csv_file)
    f['sub_title'] = f['sub_title'].fillna('')
    candidate = []
    filter = Filter()
    filter.install_rule(lambda v: v['period'] <= datetime.timedelta(days=20) and v['benefit'] > 6,
                        ok_stop=True, weight=5)
    filter.install_rule(lambda v: v['benefit'] >= 8 and v['period'] < datetime.timedelta(days=230))
    filter.install_rule(lambda v: not v['sub_title'].startswith('????'))
    for row in f.iterrows():
        idx, v = row
        money = money2float(v['money'])
        period = period2timedelta(v['period'])
        # remove percent sign (%)
        benefit = float(v['expected_benefit'][:-1])
        item = {
            'title': v['title'],
            'sub_title': v['sub_title'],
            'money': money,
            'period': period,
            'benefit': benefit,
        }
        if filter.check(item):
            candidate.append(item)
    return candidate
def filter_data(csv_file, **kwargs):
    f = pd.read_csv(csv_file)
    candidate = []
    filter = Filter()
    filter.install_rule(lambda v: not v['title'].startswith('test'))
    for row in f.iterrows():
        idx, v = row
        item = {
            'title': v['title'],
        }
        if filter.check(item):
            candidate.append(item)
    return candidate

# If len(candidate) > 0, the result will be sent to Slack; the text is stored as slack_txt_file
def parse_psqs(psqs_results_file):
    """Parse a PSQS result file and return a Pandas DataFrame of the results

    Args:
        psqs_results_file: Path to psqs results file

    Returns:
        Pandas DataFrame: Summary of PSQS results

    """
    # TODO: generalize column names for all results, save as dict instead
    psqs_results = pd.read_csv(psqs_results_file, sep='\t', header=None)
    psqs_results['pdb_file'] = psqs_results[0].apply(lambda x: str(x).strip('./').strip('.pdb'))
    psqs_results = psqs_results.rename(columns={1: 'psqs_local', 2: 'psqs_burial', 3: 'psqs_contact',
                                                4: 'psqs_total'}).drop(0, axis=1)
    psqs_results['u_pdb'] = psqs_results['pdb_file'].apply(lambda x: x.upper() if len(x) == 4 else np.nan)
    psqs_results['i_entry_name'] = psqs_results['pdb_file'].apply(lambda x: x.split('_model1')[0] if len(x) > 4 else np.nan)
    psqs_results = psqs_results[pd.notnull(psqs_results.psqs_total)]
    return psqs_results
def LoadFromTextFile(InputDir):
    ## raw data
    TrainData = pd.read_csv('%s/train_2016_v2.csv' % InputDir, parse_dates=['transactiondate'], header=0)
    TestData = pd.read_csv('%s/sample_submission.csv' % InputDir, header=0)
    TestData['parcelid'] = TestData['ParcelId']
    TestData.drop('ParcelId', axis=1, inplace=True)
    PropertyData = pd.read_csv('%s/properties_2016.csv' % InputDir, header=0)
    for c, dtype in zip(PropertyData.columns, PropertyData.dtypes):
        if dtype == np.float64:
            PropertyData[c] = PropertyData[c].astype(np.float32)

    ## join dynamic data with static data
    TrainData = pd.merge(TrainData, PropertyData, how='left', on='parcelid')
    TestData = pd.merge(TestData, PropertyData, how='left', on='parcelid')

    return TrainData, TestData

## class method, save data with pkl format
def get_microbe_taxids(force_download=False):
    """
    Download the latest bacterial genome assembly summary from the NCBI genome ftp site
    and generate a pd.DataFrame of relevant data for strain items based on taxids of the
    bacterial reference genomes.

    :return: pandas dataframe of bacteria reference genome data
    """
    if force_download or not os.path.exists("reference_genomes.csv"):
        assembly = urllib.request.urlretrieve("ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt")
        df = pd.read_csv(assembly[0], sep="\t", dtype=object, skiprows=1, header=0)
        df = df[df['refseq_category'].isin(['reference genome', 'representative genome'])]
        all_tax_wdid = id_mapper('P685')
        df['wdid'] = df['taxid'].apply(lambda x: all_tax_wdid.get(x, None))
        df = df.rename(columns={'# assembly_accession': 'assembly_accession'})
        df.to_csv('reference_genomes.csv', sep="\t")
        df.taxid = df.taxid.astype(int)
        return df
    else:
        # use predownloaded and parsed flatfile
        df = pd.read_csv("reference_genomes.csv", sep="\t", dtype=object, index_col=0)
        df.taxid = df.taxid.astype(int)
        return df
def get_assembly_report(self, taxid):
    if self.ass_sum is None:
        self.get_assembly_summaries()
    df = self.ass_sum.query("taxid == {} & refseq_category == 'reference genome'".format(taxid))
    if len(df) == 0:
        # try "representative genome" (needed for mouse and rat)
        df = self.ass_sum.query("taxid == {} & refseq_category == 'representative genome'".format(taxid))
    if len(df) != 1:
        raise ValueError("unknown reference: {}".format(df))
    print(df)
    ftp_path = list(df.ftp_path)[0]
    assembly = os.path.split(ftp_path)[1]
    url = os.path.join(ftp_path, assembly + "_assembly_report.txt")
    print(url)
    # read the column names from the file
    table = request.urlopen(request.Request(url)).read().decode()
    names = [x for x in table.split("\n") if x.startswith("#")][-1].strip().replace("# ", "").split("\t")
    self.chr_df[taxid] = pd.read_csv(StringIO(table), sep="\t", names=names, comment='#')
    self.chr_df[taxid] = self.chr_df[taxid].rename(columns={
        'Sequence-Name': 'SequenceName',
        'Sequence-Role': 'SequenceRole',
        'Assigned-Molecule': 'AssignedMolecule',
        'Assigned-Molecule-Location/Type': 'AssignedMoleculeLocationType',
        'GenBank-Accn': 'GenBankAccn',
        'RefSeq-Accn': 'RefSeqAccn',
        'UCSC-style-name': 'UCSCstylename'})
    # print(self.chr_df[taxid].query("SequenceRole == 'assembled-molecule'"))
def load_bbox(data_dir):
    bbox_path = os.path.join(data_dir, 'CUB_200_2011/bounding_boxes.txt')
    df_bounding_boxes = pd.read_csv(bbox_path,
                                    delim_whitespace=True,
                                    header=None).astype(int)
    #
    filepath = os.path.join(data_dir, 'CUB_200_2011/images.txt')
    df_filenames = pd.read_csv(filepath, delim_whitespace=True, header=None)
    filenames = df_filenames[1].tolist()
    print('Total filenames: ', len(filenames), filenames[0])
    #
    filename_bbox = {img_file[:-4]: [] for img_file in filenames}
    numImgs = len(filenames)
    for i in range(0, numImgs):
        # bbox = [x-left, y-top, width, height]
        bbox = df_bounding_boxes.iloc[i][1:].tolist()
        key = filenames[i][:-4]
        filename_bbox[key] = bbox
    #
    return filename_bbox
def get_sample_item_file(wav_file_names_sample, item_file, output):
    """
    From a sampled dataset, get an item file for running an ABX task

    Parameters
    ----------
    item_file : string, path to the item file of the whole dataset; a text file
        containing at least the columns #filename, onset, offset, #phoneme and
        context, plus side information such as image ID
    output : string, path where the sample item file will be stored
    """
    wav_names = []
    temp = np.load(wav_file_names_sample)
    for s in temp:
        wav_names.append(s.split(".")[0])

    df = pd.read_csv(item_file, sep="\t", index_col="#filename")
    df_sample = df.loc[wav_names]
    df_sample.to_csv(output, sep="\t", header=True, index=False)
    return df_sample
def meansOfMeans(datafile):
    df = pd.read_csv(datafile, delimiter=",")
    df = df.loc[df["swapsEager"] > 0]
    grouped = df.groupby("words", as_index=True)
    idx = grouped.groups.keys()
    all_means = grouped.mean()
    mean_of_means = all_means.mean()
    std_of_means = all_means.std()
    # Print in LaTeX format:
    print("& Average number of swaps & Average jump size \\\\")
    print("\hline")
    for laziness in ("Eager", "Lazy", "Lazier"):
        print("{} & {}({}) & {}({})\\\\".format(
            laziness,
            mean_of_means["swaps%s" % laziness],
            std_of_means["swaps%s" % laziness],
            mean_of_means["avgAltBlockSize%s" % laziness],
            std_of_means["avgAltBlockSize%s" % laziness]))
def read_sm_csv(csv_fname):
    """
    Parse the SuperMAG CSV format data record *csv_fname*. For each
    station, store the information in a pandas :class:`DataFrame`.
    Return a mapping between the station identifier and data frame.
    """
    df = PD.read_csv(csv_fname,
                     header=0,
                     parse_dates=[0],
                     date_parser=lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'),
                     index_col=0)
    df_map = {name: group for name, group in df.groupby('IAGA')}
    for df in df_map.values():
        del df['IAGA']
        df.rename(columns={'N': 'B_N', 'E': 'B_E', 'Z': 'B_Z'}, inplace=True)
    return df_map
def crossed_data():
    '''
    Random effects:
        10 subjects, 12 items, 5 sites
        Subjects crossed with items, nested in sites
        Items crossed with sites

    Fixed effects:
        A continuous predictor, a numeric dummy, and a three-level category
        (levels a, b, c)

    Structure:
        Subjects nested in dummy (e.g., gender), crossed with threecats
        Items crossed with dummy, nested in threecats
        Sites partially crossed with dummy (4/5 see a single dummy, 1/5 sees both dummies)
        Sites crossed with threecats
    '''
    from os.path import dirname, join
    data_dir = join(dirname(__file__), 'data')
    data = pd.read_csv(join(data_dir, 'crossed_random.csv'))
    return data
def calcu_all_stocks_3year_average_profit(year):
    # calculate the 3-year average profit for all stocks
    # (Chinese file names and column labels were garbled in the source and are kept as-is)
    path = os.path.join(current_folder, '????%s.csv' % today)
    if not os.path.exists(path):
        data = ts.get_stock_basics()
        lie = ['??', '??', '??', '???', '????', '???', '???(?)', '????', '????', '???',
               '?????', '????', '????', '???', '????', '????', '?????', '????(%)',
               '????(%)', '???(%)', '????(%)', '????']
        data.columns = lie
        data.index.names = ['??']
        data.to_csv(path, encoding='utf-8')
    data = pd.read_csv(path, encoding='utf-8', index_col=0)
    # print(data)
    data['????'] = 0
    for index, row in data.iterrows():
        try:
            data.loc[index, '????'] = calcu_3year_average_profit('%06d' % index, year)
        except Exception as e:
            print(e)
            data.loc[index, '????'] = 0
            print('??%s' % index)
    data.to_csv(os.path.join(current_folder, '3????????????%s.csv' % today), encoding='utf-8')
def save_csv_as_dataframe(request):
    print("Save CSV as DataFrame")
    if request.POST:
        # Get CSV URL from post; default to None if not provided
        csv_url = request.POST.get('csv_url', None)
        if csv_url:
            csv_data = pd.read_csv(csv_url)
            print(csv_data)
            # Create Data Frame instance
            data = Data()
            # Add CSV Data to data_frame field
            data.data_frame = csv_data
            data.source_url = csv_url
            # Save Data Frame
            data.save()
def store_test_predictions(self, prediction_id='_final'):
    """
    Stores the test predictions in a CSV file

    :param prediction_id: A simple id appended to the name of the summary for uniqueness
    :return: None
    """
    # prediction id is usually the step count
    print('Storing predictions on Test Data...')
    review = []
    true_summary = []
    generated_summary = []
    for i in range(self.test_size):
        if not self.checkpointer.is_output_file_present():
            review.append(self._index2sentence(self.test_review[i]))
            true_summary.append(self._index2sentence(self.true_summary[i]))
        if i < (self.test_batch_size * (self.test_size // self.test_batch_size)):
            generated_summary.append(self._index2sentence(self.predicted_test_summary[i]))
        else:
            generated_summary.append('')

    prediction_nm = 'generated_summary' + prediction_id
    if self.checkpointer.is_output_file_present():
        df = pd.read_csv(self.checkpointer.get_result_location(), header=0)
        df[prediction_nm] = np.array(generated_summary)
    else:
        df = pd.DataFrame()
        df['review'] = np.array(review)
        df['true_summary'] = np.array(true_summary)
        df[prediction_nm] = np.array(generated_summary)
    df.to_csv(self.checkpointer.get_result_location(), index=False)
    print('Stored the predictions. Moving Forward')
    if prediction_id == '_final':
        print('All done. Exiting..')
        print('Exited')
def load_result(self, result_file):
    """
    :param result_file:
    :return:
    """
    self.result = pd.read_csv(result_file, header=0)
    self.__scrape_reference()
    self.__scrape_all_hypotheses()
def training_set(self):
    return pd.read_csv(resource_filename('numerai.data', self.train_file_name))
def test_set(self):
    return pd.read_csv(resource_filename('numerai.data', self.test_file_name))
def sorted_training_set(self):
    return pd.read_csv(resource_filename('numerai.data', self.sorted_file_name))
def _reader(self):
    if not self.does_exist():
        return
    dateparse = lambda dates: pd.datetime.strptime(dates, '%Y-%m-%d %H:%M:%S.%f')
    df = pd.read_csv(self.data_file,
                     parse_dates=['timestamp'],
                     index_col='timestamp',
                     date_parser=dateparse)
    return df
def get_orders(self):
    '''
    get order context information
    '''
    orders = pd.read_csv(self.raw_data_dir + 'orders.csv')
    orders = orders.fillna(0.0)
    orders['days'] = orders.groupby(['user_id'])['days_since_prior_order'].cumsum()
    orders['days_last'] = orders.groupby(['user_id'])['days'].transform(max)
    orders['days_up_to_last'] = orders['days_last'] - orders['days']
    del orders['days_last']
    del orders['days']
    return orders