Python pandas module: read_json() example source code
The following 49 code examples, extracted from open-source Python projects, illustrate how to use pandas.read_json().
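Before the project examples, here is a minimal, self-contained sketch of the basic round trip between DataFrame.to_json() and pandas.read_json(); the sample DataFrame and the 'split' orient are illustrative choices, not taken from any of the projects below (recent pandas releases prefer passing a file path or buffer rather than a raw JSON string, but the string form is what most of these examples use).

import pandas as pd

# Small illustrative DataFrame (hypothetical data, not from any project below).
df = pd.DataFrame({'symbol': ['AAA', 'BBB'], 'price': [1.5, 2.5]})

# Round-trip through a JSON string; the orient must match on both sides.
json_str = df.to_json(orient='split')
restored = pd.read_json(json_str, orient='split')
print(restored)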
def evaluate(input_path, n_jobs):
aud, ann = zip(*crema.utils.get_ann_audio(input_path))
test_idx = set(pd.read_json('index_test.json')['id'])
# drop anything not in the test set
ann = [ann_i for ann_i in ann if crema.utils.base(ann_i) in test_idx]
aud = [aud_i for aud_i in aud if crema.utils.base(aud_i) in test_idx]
stream = tqdm(zip(ann, aud), desc='Evaluating test set', total=len(ann))
results = Parallel(n_jobs=n_jobs)(delayed(track_eval)(ann_i, aud_i)
for ann_i, aud_i in stream)
df = pd.DataFrame.from_dict(dict(results), orient='index')
print('Results')
print('-------')
print(df.describe())
df.to_json(os.path.join(OUTPUT_PATH, 'test_scores.json'))
def get_stats(self):
import pandas as pd
filenames, mode = self._get_files("*.json")
if mode == "pe":
df1 = pd.read_json(filenames[0])
df2 = pd.read_json(filenames[1])
df = pd.concat([df1, df2])
# Should have been sorted !
df.index = ['R1', 'R2']
else:
df = pd.read_json(filenames[0])
df.index = ['R1']
df = df[["A", "C", "G", "T", "N", "n_reads", "mean quality", "GC content",
"average read length", "total bases"]]
for this in "ACGTN":
df[this] /= df["total bases"]
df[this] *= 100
return df
def fetch_raw_metadata_frame(self, api_key, page_number):
if page_number > 1:
return pd.DataFrame([])
raw = pd.read_json(
self._format_metadata_url(
api_key,
page_number,
),
orient='index',
)
raw = raw.sort_index().reset_index()
raw.rename(
columns={'index': 'symbol'},
inplace=True,
)
raw = raw[raw['isFrozen'] == 0]
return raw
def read_stories_without_tags():
stories = list()
current_date = START_DATE
while current_date <= END_DATE:
file_in = open("./TopStories/%s.json" % current_date.isoformat(), 'r')
raw_data = json.loads(str(file_in.read()))
file_in.close()
for raw_story in raw_data['stories']:
story = dict()
story['top_date'] = current_date.isoformat()
story['story_id'] = raw_story['story_id']
story['author'] = raw_story['author']
story['published_date'] = raw_story['published_date']
story['recommends'] = raw_story['recommends']
story['responses'] = raw_story['responses']
story['tags_count'] = len(raw_story['tags'])
stories.append(story)
print(current_date.isoformat())
current_date = current_date + datetime.timedelta(days=1)
return pd.read_json(json.dumps(stories))
def read_stories_by_tags():
tags = list()
current_date = START_DATE
while current_date <= END_DATE:
file_in = open("./TopStories/%s.json" % current_date.isoformat(), 'r')
raw_data = json.loads(str(file_in.read()))
file_in.close()
for raw_story in raw_data['stories']:
for raw_tag in raw_story['tags']:
tag = dict()
tag['top_date'] = current_date.isoformat()
tag['story_id'] = raw_story['story_id']
tag['author'] = raw_story['author']
tag['published_date'] = raw_story['published_date']
tag['recommends'] = raw_story['recommends']
tag['responses'] = raw_story['responses']
tag['name'] = raw_tag['name']
tag['post_count'] = raw_tag['postCount']
tag['follower_count'] = raw_tag['metadata']['followerCount']
tags.append(tag)
print(current_date.isoformat())
current_date = current_date + datetime.timedelta(days=1)
return pd.read_json(json.dumps(tags))
def test_frame_from_json_bad_data(self):
self.assertRaises(ValueError, read_json, StringIO('{"key":b:a:d}'))
# too few indices
json = StringIO('{"columns":["A","B"],'
'"index":["2","3"],'
'"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}')
self.assertRaises(ValueError, read_json, json,
orient="split")
# too many columns
json = StringIO('{"columns":["A","B","C"],'
'"index":["1","2","3"],'
'"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}')
self.assertRaises(AssertionError, read_json, json,
orient="split")
# bad key
json = StringIO('{"badkey":["A","B"],'
'"index":["2","3"],'
'"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}')
with tm.assertRaisesRegexp(ValueError, r"unexpected key\(s\): badkey"):
read_json(json, orient="split")
def test_v12_compat(self):
df = DataFrame(
[[1.56808523, 0.65727391, 1.81021139, -0.17251653],
[-0.2550111, -0.08072427, -0.03202878, -0.17581665],
[1.51493992, 0.11805825, 1.629455, -1.31506612],
[-0.02765498, 0.44679743, 0.33192641, -0.27885413],
[0.05951614, -2.69652057, 1.28163262, 0.34703478]],
columns=['A', 'B', 'C', 'D'],
index=pd.date_range('2000-01-03', '2000-01-07'))
df['date'] = pd.Timestamp('19920106 18:21:32.12')
df.ix[3, 'date'] = pd.Timestamp('20130101')
df['modified'] = df['date']
df.ix[1, 'modified'] = pd.NaT
v12_json = os.path.join(self.dirpath, 'tsframe_v012.json')
df_unser = pd.read_json(v12_json)
assert_frame_equal(df, df_unser)
df_iso = df.drop(['modified'], axis=1)
v12_iso_json = os.path.join(self.dirpath, 'tsframe_iso_v012.json')
df_unser_iso = pd.read_json(v12_iso_json)
assert_frame_equal(df_iso, df_unser_iso)
def test_date_format_frame(self):
df = self.tsframe.copy()
def test_w_date(date, date_unit=None):
df['date'] = Timestamp(date)
df.ix[1, 'date'] = pd.NaT
df.ix[5, 'date'] = pd.NaT
if date_unit:
json = df.to_json(date_format='iso', date_unit=date_unit)
else:
json = df.to_json(date_format='iso')
result = read_json(json)
assert_frame_equal(result, df)
test_w_date('20130101 20:43:42.123')
test_w_date('20130101 20:43:42', date_unit='s')
test_w_date('20130101 20:43:42.123', date_unit='ms')
test_w_date('20130101 20:43:42.123456', date_unit='us')
test_w_date('20130101 20:43:42.123456789', date_unit='ns')
self.assertRaises(ValueError, df.to_json, date_format='iso',
date_unit='foo')
def test_date_format_series(self):
def test_w_date(date, date_unit=None):
ts = Series(Timestamp(date), index=self.ts.index)
ts.ix[1] = pd.NaT
ts.ix[5] = pd.NaT
if date_unit:
json = ts.to_json(date_format='iso', date_unit=date_unit)
else:
json = ts.to_json(date_format='iso')
result = read_json(json, typ='series')
assert_series_equal(result, ts)
test_w_date('20130101 20:43:42.123')
test_w_date('20130101 20:43:42', date_unit='s')
test_w_date('20130101 20:43:42.123', date_unit='ms')
test_w_date('20130101 20:43:42.123456', date_unit='us')
test_w_date('20130101 20:43:42.123456789', date_unit='ns')
ts = Series(Timestamp('20130101 20:43:42.123'), index=self.ts.index)
self.assertRaises(ValueError, ts.to_json, date_format='iso',
date_unit='foo')
def test_date_unit(self):
df = self.tsframe.copy()
df['date'] = Timestamp('20130101 20:43:42')
df.ix[1, 'date'] = Timestamp('19710101 20:43:42')
df.ix[2, 'date'] = Timestamp('21460101 20:43:42')
df.ix[4, 'date'] = pd.NaT
for unit in ('s', 'ms', 'us', 'ns'):
json = df.to_json(date_format='epoch', date_unit=unit)
# force date unit
result = read_json(json, date_unit=unit)
assert_frame_equal(result, df)
# detect date unit
result = read_json(json, date_unit=None)
assert_frame_equal(result, df)
def test_weird_nested_json(self):
# this used to core dump the parser
s = r'''{
"status": "success",
"data": {
"posts": [
{
"id": 1,
"title": "A blog post",
"body": "Some useful content"
},
{
"id": 2,
"title": "Another blog post",
"body": "More content"
}
]
}
}'''
read_json(s)
def test_misc_example(self):
# parsing unordered input fails
result = read_json('[{"a": 1, "b": 2}, {"b":2, "a" :1}]', numpy=True)
expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
error_msg = """DataFrame\\.index are different
DataFrame\\.index values are different \\(100\\.0 %\\)
\\[left\\]: Index\\(\\[u?'a', u?'b'\\], dtype='object'\\)
\\[right\\]: RangeIndex\\(start=0, stop=2, step=1\\)"""
with tm.assertRaisesRegexp(AssertionError, error_msg):
assert_frame_equal(result, expected, check_index_type=False)
result = read_json('[{"a": 1, "b": 2}, {"b":2, "a" :1}]')
expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
assert_frame_equal(result, expected)
def _getUserStrategy(self, downloadStrategyInterval=60):
"""??????????????
downloadStrategyInterval: int default=60 ?
return: df"""
k = "SignForWebUser_preLoadTime"
preLoadTime = myredis.get_obj(k)
if preLoadTime is None:
preLoadTime = datetime.datetime(2015, 10, 19, 15, 33, 47, 53000) # fallback timestamp so the first call always triggers a download
# re-download once the interval has elapsed
if (agl.curTime() - preLoadTime).total_seconds() > downloadStrategyInterval:
url = "http://stocksign.sinaapp.com/query?cmd=query_strategy"
result = Http().get(url)
df_source = pd.read_json(result)
df_source.columns = ['id', 'user_id', 'title', 'code']
preLoadTime = agl.curTime()
myredis.set_obj(k, preLoadTime)
myredis.set_obj('mysource', df_source)
else:
df_source = myredis.get_obj('mysource')
if df_source is None:
df_source = pd.DataFrame([])
return df_source
def read_scraped_jason(filename):
df = pd.read_json(filename)
for column in df.columns:
df[column] = df[column].apply(unlist)
# gets only first 10 characters of date: year/month/day
df['date'] = df['date'].apply(lambda x: x[:10])
df['date'] = pd.to_datetime(df['date'])
# remove duplicate posts, if any
df = df.drop_duplicates(subset = ['keywords'])
# sorts dataframe by post date
df = df.sort_values(by='date')
df = df.drop('body', 1)
df = df.drop('title', 1)
df['keywords'].replace('', np.nan, inplace=True)
df = df.dropna()
return df
def extract_features_from_json():
input_path = '../../data/20_5_from_2008/'
df_list = []
for json_file in os.listdir(input_path):
train_data = pd.read_json(os.path.join(input_path, json_file), orient='columns')
train_data.dropna(inplace=True)
train_data.sort_index(ascending=False, inplace=True)
train_data.index = range(len(train_data))
if len(train_data) > 0:
data_norm(train_data)
values = train_data['real_up_after_240'].tolist()
codes = train_data['code'].tolist()
train_data.drop(['datetime', 'code', 'real_up_after_240'], axis=1, inplace=True)
features = train_data.values.tolist()
with open('../../data/20_5_from_2008/data', 'a') as f:
for ix in xrange(len(codes)):
if np.inf not in features[ix] and -np.inf not in features[ix]:
f.write('%s;0 %s;1 %f\n' % (codes[ix][2:], ' '.join([str(x) for x in features[ix]]), values[ix]))
def loadStepsData(dumpDir):
"""
Load steps data from a dump produced with the official Fitbit API.
Check the README file for further info on the scraping process and saved format.
:param dumpDir: the folder where the data has been dumped
:return: a list of dataframes, one for each day, containing the intraday steps data
"""
def loadFun(jsonData):
intradayData = jsonData['activities-steps-intraday']['dataset']
date = jsonData['activities-steps'][0]['dateTime']
if not intradayData:
return None
df = pd.read_json(json.dumps(intradayData))
df['datetime'] = pd.to_datetime(date + ' ' + df['time'])
df.drop('time', inplace=True, axis=1)
return df
return _loadData(dumpDir, 'steps', loadFun)
def get_orders_frame(self, state=None, kind='auth'):
q = self.get_orders(state, kind)
tj = json.dumps(q['orders'])
df = pd.read_json(tj, convert_dates=['creation_timestamp', 'expiration_timestamp'])
df.index = df.creation_timestamp
return df
def processData(data):
df = pd.DataFrame.transpose(pd.read_json(json.dumps(data)))
df = df.dropna(subset = [key for key in df.keys() if "x_" in key])
df = df[pd.notnull(df['y_observed'])]
X = df[[key for key in df.keys() if "x_" in key]].values
y = df["y_observed"].values
return X, y
# 5th: initial model
def read_json_file_into_pandas_df(self, filename, index_col=False):
json_string = self.read_json_file(filename=filename)
return pd.read_json(json_string)
def get_bg_dataframe(id_str):
"""
Function to convert the json file to a pandas dataframe.
It takes in the string of the id and looks for the devicestatus.json file.
All data should be stored such that in the directory where main.py lies,
there is a directory called "data". Inside this directory,
there is another directory with just the ID Number. Inside this data folder lies the
devicestatus.json file, which contains the data. If the file is not in the path given,
it raises an IOError. The path should look like the following example:
./data/12345678/devicestatus.json
Input: id_str ID number as a string
Output: bg_df Pandas dataframe of all of the data from ./data/[id_str]/devicestatus.json
Usage: bg_df = get_bg_dataframe("12345678")
"""
try:
file_location = "./data/" + id_str + "/devicestatus.json"
bg_df = pd.read_json(file_location) #Opens the data file and reads in the data into a dataFrame
except:
raise IOError(file_location + " is not a valid file.")
print
print("{} total entries.".format(len(bg_df)))
return bg_df
#Function to find the indices for the given start and end date strings
def file_search(filename, verbose):
"""Search for filename. Returns dirname of the filename's path, and the full path.
170107 add cache. If the db is not found, create an empty pandas df
and populate this df with append later. If the filename is not in the db
run g/locate. Then, save the found path to the db (using pandas, via df, to json)"""
# cache
if os.path.isfile(JSON_DB):
df = pd.read_json(JSON_DB, orient='records')
#filename = 'x.pse'
pathdf = df[df['fn'] == filename]['path']
if not pathdf.empty:
path = pathdf.to_string(index=False)
logger.info('find file [from the db]:' + filename)
return os.path.dirname(path), path
else:
df = pd.DataFrame()
# if filename is not found in the db
logger.info('find file:' + filename)
if platform.system() == "Linux":
out = commands.getoutput('locate ' + filename)
if platform.system() == "Darwin":
out = commands.getoutput('glocate ' + filename)
first_hit = out.split('\n')[0]
logger.info('# of hits ' + str(len(out.split('\n'))) + " " + out.replace('\n',', '))
if not first_hit:
logger.info('not found')
else:
logger.info('hit ' + first_hit)
# update cache
dffile = pd.DataFrame([[filename, first_hit],], columns=['fn', 'path'])
df = df.append(dffile, ignore_index=True)
# save to json
df.to_json(JSON_DB, orient='records')
##
return os.path.dirname(first_hit), first_hit
def get_holiday_json(self):
"""
Load holiday.json as a pandas Series sorted by index.
:return: pandas Series of holidays
"""
path = os.path.join(pwd, 'holiday.json')
return pd.read_json(path, typ="series").sort_index()
def pd_json_to_df(self, data_json, sorted_by_key="Date", in_ascending=True):
import pandas as pd
new_df = pd.read_json(data_json).sort_values(by=sorted_by_key, ascending=in_ascending)
return new_df
# end of pd_json_to_df
def get_raw(filename):
with open(filename) as infile:
raw = infile.read()
# the next line needs rewriting as soon as the zenodo-dump conforms to 'records'-format
# [{k:v}, {k:v},...]
rawfacts = pd.read_json('[%s]' % ','.join(raw.splitlines()), orient='records')
return rawfacts
### functions for ingesting from CProject
### functions for preprocessing
def load_json(filename):
'''
Function:
- opens the json file and stores the information in a pandas dataframe
- also prints an aggregated df with counts of pictures by countrycode
Input:
1. filename/path ex: ./data/filename.json
Output:
1. new dataframe containing json info
'''
df = pd.read_json(filename, lines=True)
test = df.groupby(df['countrycode']).count()
print test.sort(columns='drawing',ascending=False).head(15)
return df
def read_data(project_path):
print "Reading data..."
train = pd.read_json(project_path + "/data/train.json")
test = pd.read_json(project_path + "/data/test.json")
print "Train size:", len(train.id)
print "Test size:", len(test.id)
return train, test
def read_json_file_into_pandas_df(self, filename):
return pd.read_json(os.path.join(self.src_dir, filename), dtype=np.int8)
def read_json_file_into_pandas_df(self, filename):
json_string = self.read_json_file(filename=filename)
return pd.read_json(json_string, dtype=np.int8)
def apiResults(locationInfo):
query = ("https://data.seattle.gov/resource/pu5n-trf4.json?$limit={}&$where=within_circle(incident_location,{},{},{})"
.format(locationInfo['limit'],
locationInfo['latitude'],
locationInfo['longitude'],
locationInfo['radius']))
return pd.read_json(query)
def fetch_raw_symbol_frame(self,
api_key,
symbol,
calendar,
start_date,
end_date,
frequency):
# TODO: replace this with direct exchange call
# The end date and frequency should be used to
# calculate the number of bars
if(frequency == 'minute'):
pc = PoloniexCurator()
raw = pc.onemin_to_dataframe(symbol, start_date, end_date)
else:
raw = pd.read_json(
self._format_data_url(
api_key,
symbol,
start_date,
end_date,
frequency,
),
orient='records',
)
raw.set_index('date', inplace=True)
# BcolzDailyBarReader introduces a 1/1000 factor in the way
# pricing is stored on disk, which we compensate here to get
# the right pricing amounts
# ref: data/us_equity_pricing.py
scale = 1
raw.loc[:, 'open'] /= scale
raw.loc[:, 'high'] /= scale
raw.loc[:, 'low'] /= scale
raw.loc[:, 'close'] /= scale
raw.loc[:, 'volume'] *= scale
return raw
def run_query(query, cache_key, expire=3600, dialect='legacy'):
memcached_client = memcached_discovery.get_client()
if memcached_client is None:
return _run(query, dialect=dialect)
else:
json = memcached_client.get(cache_key)
if json is not None:
df = pd.read_json(json, orient='records')
else:
df = _run(query, dialect=dialect)
memcached_client.set(cache_key, df.to_json(orient='records'), expire=expire)
return df
def main():
start_time = time.time()
args = parse_args()
logger.setLevel(getattr(logging, args.verbosity.upper()))
logger.info("Started")
build_constants()
df = pd.read_json(path_or_buf=DATA_PATH, orient='records', encoding="UTF8")
logger.debug("Loaded {} rows into df".format(len(df)))
df = utils.get_data_subset.crop(df, None, None)
df = utils.get_data_subset.filter_rows_by_string(df,
[TARGET_COL],
['Rock',
'Hip Hop'])
df = utils.clean_data.execute_cleaners(df)
df = utils.normalize_data.normalize_genres(df, TARGET_COL)
X, y = utils.get_data_subset.get_x_y(df, SAMPLE_COL, TARGET_COL)
clf = model_pipeline.get_pipeline(SAMPLE_COL)
utils.persistence.dump(DF_DUMP_NAME, df)
utils.persistence.dump(CLF_DUMP_NAME, clf)
if args.train:
train_and_test.train_and_dump(X, y, clf)
elif args.test:
train_and_test.test_using_kfold(X, y, clf)
logger.info("Finished in {0:.2f} seconds".format(time.time() - start_time))
def handle_dotio_url(wf_module, url, split_url, num_rows):
"""
Processes the response for any request to enigma.io. Here, we assume that the API key is provided,
because, at least at first glance (or two or three), there don't seem to be any provisions for
accessing dataset endpoints without an API key.
"""
if num_rows > 500:
wf_module.set_error("You can request a maximum of 500 rows.")
return
if "/limit/" not in url:
if url.endswith('/'):
url += "limit/{}".format(num_rows)
else:
url += "/limit/{}".format(num_rows)
response = requests.get(url)
if response.status_code != 200:
error = json.loads(response.text)
if "message" in error:
message = error["message"]
else:
message = error["info"]["message"]
if "additional" in error["info"]:
message += ": " + error["info"]["additional"]["message"]
wf_module.set_error("Unable to retrieve data from Enigma. Received {} status, with message {}"
.format(response.status_code, message))
return
try:
json_text = json.loads(response.text)
table = pd.read_json(json.dumps(json_text['result']))
return table
except Exception as ex: # Generic exceptions suck, but is it the most pragmatic/all-encompassing here?
wf_module.set_error("Unable to process request: {}".format(str(ex)))
return
def _from_json(self, value, obj=None):
if value is not None:
df = pd.read_json(json.dumps(value), orient="split")
else:
df = pd.DataFrame()
return df
def _from_json(self, value, obj=None):
if value is not None:
df = pd.read_json(json.dumps(value), orient="split")
else:
df = pd.DataFrame()
return df
def pd_json_to_df(self, data_json, sorted_by_key="Date", in_ascending=True):
import pandas as pd
new_df = pd.read_json(data_json).sort_values(by=sorted_by_key, ascending=in_ascending)
return new_df
# end of pd_json_to_df
def read_posts():
posts = list()
file_in = open('./post_list.txt', 'r')
post_list = str(file_in.read()).split(' ')
file_in.close()
num = 0
for post_id in post_list:
if not post_id:
continue
if not os.path.exists('./data/Posts/%s.json' % post_id):
continue
try:
file_in = open('./data/Posts/%s.json' % post_id, 'r')
raw_data = json.loads(str(file_in.read()))
file_in.close()
post = dict()
post['post_id'] = post_id
post['published_date'] = raw_data['published_date']
post['recommends'] = raw_data['recommends']
post['responses'] = raw_data['responses']
posts.append(post)
except:
continue
num += 1
print(post_id)
print(num)
return pd.read_json(json.dumps(posts))
def read_posts():
posts = list()
file_in = open('./post_list.txt', 'r')
post_list = str(file_in.read()).split(' ')
file_in.close()
num = 0
for post_id in post_list:
if not post_id:
continue
if not os.path.exists('./data/Posts/%s.json' % post_id):
continue
try:
file_in = open('./data/Posts/%s.json' % post_id, 'r')
raw_data = json.loads(str(file_in.read()))
file_in.close()
for tag in raw_data['tags']:
post = dict()
post['post_id'] = post_id
post['published_date'] = raw_data['published_date']
post['recommends'] = raw_data['recommends']
post['responses'] = raw_data['responses']
post['tag'] = tag['name']
posts.append(post)
print(post)
except:
continue
num += 1
print(post_id)
print(num)
return pd.read_json(json.dumps(posts))
def read_users():
users = list()
file_in = open('./username_list.txt', 'r')
username_list = str(file_in.read()).split(' ')
file_in.close()
num = 0
for username in username_list:
if not username:
continue
if not os.path.exists('./data/Users/%s.json' % username):
continue
try:
file_in = open('./data/Users/%s.json' % username, 'r')
raw_data = json.loads(str(file_in.read()))
file_in.close()
user = dict()
user['username'] = username
user['reg_date'] = datetime.date.fromtimestamp(raw_data['profile']['user']['createdAt']/1000.0).isoformat()
if not raw_data['profile']['user']['lastPostCreatedAt']:
raw_data['profile']['user']['lastPostCreatedAt'] = raw_data['profile']['user']['createdAt']
user['last_post_date'] = datetime.date.fromtimestamp(raw_data['profile']['user']['lastPostCreatedAt']/1000.0).isoformat()
user['posts_count'] = raw_data['profile']['numberOfPostsPublished']
user['following_count'] = raw_data['profile']['user']['socialStats']['usersFollowedCount']
user['followers_count'] = raw_data['profile']['user']['socialStats']['usersFollowedByCount']
users.append(user)
except:
continue
num += 1
print(username)
print(num)
return pd.read_json(json.dumps(users))
def data_received(self, data):
updateOZ_event.data=pd.read_json(data.decode())
updateOZ_event.set()
def handle_OZServer(loop):
reader, writer = yield from asyncio.open_connection('127.0.0.1', 2222,loop=loop)
symbolList=list()
while True:
if updateOZ_event.is_set():
print('In Server send')
updateOZ_event.clear()
for element in updateOZ_event.data :
writer.write(('Add_'+ element+'_End').encode())
writer.write('Send'.encode())
outputbuffer = StringIO()
condition = True
while condition:
data = yield from reader.read(1024)
message=data.decode()
if message.find('!ENDMSG!') != -1:
message = message.replace('!ENDMSG!', '')
condition = False
print('End found')
outputbuffer.write(message)
outputbuffer.seek(0)
DF=pd.read_json(outputbuffer)
#print(DF)
yield from updateOZ_queue.put(DF)
yield None
writer.close()
reader.close()
def _load_data(filename, columns=None):
data = pd.read_json(filename, lines=True)
data = data.sort_values('validation_mrr', ascending=False)
mrr_cols = ['validation_mrr', 'test_mrr']
if columns is None:
columns = [x for x in data.columns if
(x not in mrr_cols and x != 'hash')]
cols = data.columns
cols = mrr_cols + columns
return data[cols]
def _load_data(filename, columns=None):
data = pd.read_json(filename, lines=True)
data = data.sort_values('validation_mrr', ascending=False)
mrr_cols = ['validation_mrr', 'test_mrr']
if columns is None:
columns = [x for x in data.columns if
(x not in mrr_cols and x != 'hash')]
cols = data.columns
cols = mrr_cols + columns
return data[cols]
def test_frame_double_encoded_labels(self):
df = DataFrame([['a', 'b'], ['c', 'd']],
index=['index " 1', 'index / 2'],
columns=['a \\ b', 'y / z'])
assert_frame_equal(df, read_json(df.to_json(orient='split'),
orient='split'))
assert_frame_equal(df, read_json(df.to_json(orient='columns'),
orient='columns'))
assert_frame_equal(df, read_json(df.to_json(orient='index'),
orient='index'))
df_unser = read_json(df.to_json(orient='records'), orient='records')
assert_index_equal(df.columns, df_unser.columns)
np.testing.assert_equal(df.values, df_unser.values)
def test_frame_non_unique_index(self):
df = DataFrame([['a', 'b'], ['c', 'd']], index=[1, 1],
columns=['x', 'y'])
self.assertRaises(ValueError, df.to_json, orient='index')
self.assertRaises(ValueError, df.to_json, orient='columns')
assert_frame_equal(df, read_json(df.to_json(orient='split'),
orient='split'))
unser = read_json(df.to_json(orient='records'), orient='records')
self.assertTrue(df.columns.equals(unser.columns))
np.testing.assert_equal(df.values, unser.values)
unser = read_json(df.to_json(orient='values'), orient='values')
np.testing.assert_equal(df.values, unser.values)
def test_frame_non_unique_columns(self):
df = DataFrame([['a', 'b'], ['c', 'd']], index=[1, 2],
columns=['x', 'x'])
self.assertRaises(ValueError, df.to_json, orient='index')
self.assertRaises(ValueError, df.to_json, orient='columns')
self.assertRaises(ValueError, df.to_json, orient='records')
assert_frame_equal(df, read_json(df.to_json(orient='split'),
orient='split', dtype=False))
unser = read_json(df.to_json(orient='values'), orient='values')
np.testing.assert_equal(df.values, unser.values)
# GH4377; duplicate columns not processing correctly
df = DataFrame([['a', 'b'], ['c', 'd']], index=[
1, 2], columns=['x', 'y'])
result = read_json(df.to_json(orient='split'), orient='split')
assert_frame_equal(result, df)
def _check(df):
result = read_json(df.to_json(orient='split'), orient='split',
convert_dates=['x'])
assert_frame_equal(result, df)
for o in [[['a', 'b'], ['c', 'd']],
[[1.5, 2.5], [3.5, 4.5]],
[[1, 2.5], [3, 4.5]],
[[Timestamp('20130101'), 3.5],
[Timestamp('20130102'), 4.5]]]:
_check(DataFrame(o, index=[1, 2], columns=['x', 'x']))
def test_frame_from_json_nones(self):
df = DataFrame([[1, 2], [4, 5, 6]])
unser = read_json(df.to_json())
self.assertTrue(np.isnan(unser[2][0]))
df = DataFrame([['1', '2'], ['4', '5', '6']])
unser = read_json(df.to_json())
self.assertTrue(np.isnan(unser[2][0]))
unser = read_json(df.to_json(), dtype=False)
self.assertTrue(unser[2][0] is None)
unser = read_json(df.to_json(), convert_axes=False, dtype=False)
self.assertTrue(unser['2']['0'] is None)
unser = read_json(df.to_json(), numpy=False)
self.assertTrue(np.isnan(unser[2][0]))
unser = read_json(df.to_json(), numpy=False, dtype=False)
self.assertTrue(unser[2][0] is None)
unser = read_json(df.to_json(), numpy=False,
convert_axes=False, dtype=False)
self.assertTrue(unser['2']['0'] is None)
# infinities get mapped to nulls which get mapped to NaNs during
# deserialisation
df = DataFrame([[1, 2], [4, 5, 6]])
df.loc[0, 2] = np.inf
unser = read_json(df.to_json())
self.assertTrue(np.isnan(unser[2][0]))
unser = read_json(df.to_json(), dtype=False)
self.assertTrue(np.isnan(unser[2][0]))
df.loc[0, 2] = np.NINF
unser = read_json(df.to_json())
self.assertTrue(np.isnan(unser[2][0]))
unser = read_json(df.to_json(), dtype=False)
self.assertTrue(np.isnan(unser[2][0]))
def test_frame_empty_mixedtype(self):
# mixed type
df = DataFrame(columns=['jim', 'joe'])
df['joe'] = df['joe'].astype('i8')
self.assertTrue(df._is_mixed_type)
assert_frame_equal(read_json(df.to_json(), dtype=dict(df.dtypes)), df,
check_index_type=False)
def test_frame_mixedtype_orient(self): # GH10289
vals = [[10, 1, 'foo', .1, .01],
[20, 2, 'bar', .2, .02],
[30, 3, 'baz', .3, .03],
[40, 4, 'qux', .4, .04]]
df = DataFrame(vals, index=list('abcd'),
columns=['1st', '2nd', '3rd', '4th', '5th'])
self.assertTrue(df._is_mixed_type)
right = df.copy()
for orient in ['split', 'index', 'columns']:
inp = df.to_json(orient=orient)
left = read_json(inp, orient=orient, convert_axes=False)
assert_frame_equal(left, right)
right.index = np.arange(len(df))
inp = df.to_json(orient='records')
left = read_json(inp, orient='records', convert_axes=False)
assert_frame_equal(left, right)
right.columns = np.arange(df.shape[1])
inp = df.to_json(orient='values')
left = read_json(inp, orient='values', convert_axes=False)
assert_frame_equal(left, right)