Python numpy module: array() code examples
The following 50 code examples, extracted from open-source Python projects, illustrate how to use numpy.array().
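Before the project-specific snippets, a minimal, generic sketch of the numpy.array() call itself (plain NumPy, nothing taken from the projects below):

import numpy as np

# Build arrays from Python sequences; the dtype is inferred unless given explicitly.
a = np.array([1, 2, 3])                    # shape (3,), integer dtype
b = np.array([[1.0, 2.0], [3.0, 4.0]])     # shape (2, 2), float64
c = np.array([1, 2, 3], dtype=np.float32)  # force a specific dtype
print(a.shape, b.shape, c.dtype)           # (3,) (2, 2) float32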
def min_side(_, pos):
"""
Given an object's pixel positions, return the minimum side length of its
bounding box
:param _: pixel values (unused)
:param pos: pixel position (1-D)
:return: minimum bounding box side length
"""
xs = np.array([i // SSIZE for i in pos])  # integer row indices
ys = np.array([i % SSIZE for i in pos])   # integer column indices
minx = np.amin(xs)
miny = np.amin(ys)
maxx = np.amax(xs)
maxy = np.amax(ys)
ct1 = compute_line(np.array([minx, miny]), np.array([minx, maxy]))
ct2 = compute_line(np.array([minx, miny]), np.array([maxx, miny]))
return min(ct1, ct2)
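A quick, self-contained illustration of the flat-index decoding used above (SSIZE and the sample indices are invented for this sketch; np.divmod matches the // and % pair):

import numpy as np

SSIZE = 4                        # hypothetical side length of the square image
pos = np.array([0, 5, 10, 15])   # flat pixel indices
xs, ys = np.divmod(pos, SSIZE)   # row and column indices
print(xs, ys)                    # [0 1 2 3] [0 1 2 3]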
def _cascade_evaluation(self, X_test, y_test):
""" Evaluate the accuracy of the cascade using X and y.
:param X_test: np.array
Array containing the test input samples.
Must be of the same shape as training data.
:param y_test: np.array
Test target values.
:return: float
the cascade accuracy.
"""
casc_pred_prob = np.mean(self.cascade_forest(X_test), axis=0)
casc_pred = np.argmax(casc_pred_prob, axis=1)
casc_accuracy = accuracy_score(y_true=y_test, y_pred=casc_pred)
print('Layer validation accuracy = {}'.format(casc_accuracy))
return casc_accuracy
def _create_feat_arr(self, X, prf_crf_pred):
""" Concatenate the original feature vector with the predicition probabilities
of a cascade layer.
:param X: np.array
Array containing the input samples.
Must be of shape [n_samples, data] where data is a 1D array.
:param prf_crf_pred: list
Prediction probabilities by a cascade layer for X.
:return: np.array
Concatenation of X and the predicted probabilities.
To be used for the next layer in a cascade forest.
"""
swap_pred = np.swapaxes(prf_crf_pred, 0, 1)
add_feat = swap_pred.reshape([np.shape(X)[0], -1])
feat_arr = np.concatenate([add_feat, X], axis=1)
return feat_arr
def shuffleBlock(self,cells,d,tlx,tly,cols,rows,width,height):
if tlx+cols < width and tly+rows < height:
temp = []
for row in range( rows):
for col in range( cols):
temp.append(d[cells[tlx+col][tly+row]])
temp = np.array(temp)
oldState = temp.copy()
np.random.shuffle(temp)
i = 0
for row in range( rows):
for col in range( cols):
d[cells[tlx+col][tly+row]] = temp[i]
i+=1
return oldState
else:
return []
def train(self, dataset, train_split=0.8, dense_size=32, learning_rate=0.001, batch_size=32, epochs=50, activation='relu'):
self.__load_dataset(dataset, train_split)
train_x = np.array(self.__train_data[:, 0].tolist())
train_y = to_categorical(self.__train_data[:, 1], 2)
test_x = np.array(self.__test_data[:, 0].tolist())
test_y = to_categorical(self.__test_data[:, 1], 2)
print(train_x.shape)
self.__model = Sequential()
self.__model.add(Dense(dense_size, input_dim=train_x.shape[1], activation=activation, kernel_initializer='glorot_uniform'))
self.__model.add(Dense(train_y.shape[1], activation='softmax', kernel_initializer='glorot_uniform'))
self.__model.compile(optimizer=Adam(lr=learning_rate), loss='categorical_crossentropy', metrics=['categorical_accuracy'])
self.__model.fit(train_x, train_y, batch_size=batch_size, epochs=epochs, validation_data=(test_x, test_y), verbose=2)
def normalize_array (solution, prediction):
''' Use min and max of solution as scaling factors to normalize prediction,
then threshold it to [0, 1]. Binarize solution to {0, 1}.
This allows applying classification scores to all cases.
In principle, this should not do anything to properly formatted
classification inputs and outputs.'''
# Binarize solution
sol=np.ravel(solution) # convert to 1-d array
maxi = np.nanmax(list(filter(lambda x: x != float('inf'), sol))) # max, ignoring NaN and Inf
mini = np.nanmin(list(filter(lambda x: x != float('-inf'), sol))) # min, ignoring NaN and -Inf
if maxi == mini:
print('Warning, cannot normalize')
return [solution, prediction]
diff = maxi - mini
mid = (maxi + mini)/2.
new_solution = np.copy(solution)
new_solution[solution>=mid] = 1
new_solution[solution<mid] = 0
# Normalize and threshold predictions (takes effect only if solution not in {0, 1})
new_prediction = (np.copy(prediction) - float(mini))/float(diff)
new_prediction[new_prediction>1] = 1 # and if predictions exceed the bounds [0, 1]
new_prediction[new_prediction<0] = 0
# Make probabilities smoother
#new_prediction = np.power(new_prediction, (1./10))
return [new_solution, new_prediction]
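A short usage sketch of normalize_array above (input values invented; assumes numpy is imported as np and the function is in scope):

import numpy as np

solution = np.array([0.0, 0.2, 0.8, 1.0])
prediction = np.array([-0.5, 0.3, 0.7, 1.5])
new_sol, new_pred = normalize_array(solution, prediction)
print(new_sol)   # [0. 0. 1. 1.]      binarized around the midpoint 0.5
print(new_pred)  # [0.  0.3 0.7 1. ]  scaled by the solution's min/max, clipped to [0, 1]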
def mvmean(R, axis=0):
''' Moving average to avoid rounding errors. A bit slow, but...
Computes the mean along the given axis, except if this is a vector, in which case the mean is returned.
Does NOT flatten.'''
if len(R.shape)==0: return R
average = lambda x: reduce(lambda i, j: (0, (j[0]/(j[0]+1.))*i[1]+(1./(j[0]+1))*j[1]), enumerate(x))[1]
R=np.array(R)
if len(R.shape)==1: return average(R)
if axis==1:
return np.array([average(row) for row in R])
else:
return np.array([average(row) for row in R.transpose()])
# ======= All metrics used for scoring in the challenge ========
### REGRESSION METRICS (work on raw solution and prediction)
# These can be computed on all solutions and predictions (classification included)
def data_binary_sparse (filename, nbr_features):
''' This function takes as an argument a file representing a binary sparse matrix:
each row of the file lists the (1-based) indices of the features equal to 1.
It converts the file into a scipy CSR sparse matrix and returns that matrix. '''
data = data_converter.file_to_array (filename)
nbr_samples = len(data)
dok_sparse = dok_matrix ((nbr_samples, nbr_features)) # the construction is easier w/ dok_sparse
print ("Converting {} to dok sparse matrix".format(filename))
for row in range (nbr_samples):
for feature in data[row]:
dok_sparse[row, int(feature)-1] = 1
print ("Converting {} to csr sparse matrix".format(filename))
return dok_sparse.tocsr()
# ================ Copy results from input to output ==========================
def generate_one_summary(self, review):
"""
Create summary for one review using Encoder Decoder Seq2Seq model
:param review: The input review
:return: Output Summary of the model
"""
review = review.T
review = [np.array([int(x)]) for x in review]
feed_dict_rev = {self.enc_inp[t]: review[t] for t in range(self.seq_length)}
feed_dict_rev.update({self.labels[t]: review[t] for t in range(self.seq_length)})
summary = self.sess.run(self.dec_outputs_tst, feed_dict_rev)
summary = [logits_t.argmax(axis=1) for logits_t in summary]
summary = [x[0] for x in summary]
return summary
def __crawl_review(self):
"""
Crawl review
:return: review [numpy array]
"""
review_list = []
print('Crawling Reviews....')
num_lines = 0
with open(self.raw_data_file) as infile:
for line in infile:
if line.startswith('review/text'):
if num_lines >= self.num_reviews:
break
num_lines += 1
_,review = line.split('/text: ')
review_list.append(review)
return np.array(review_list)
def __crawl_summary(self):
"""
Crawl summary
:return: summary [numpy array]
"""
summary_list = []
print('Crawling Summary....')
num_lines = 0
with open(self.raw_data_file) as infile:
for line in infile:
if line.startswith('review/summary'):
if num_lines >= self.num_reviews:
break
num_lines += 1
_,summary = line.split('/summary: ')
summary_list.append(summary)
return np.array(summary_list)
def reshape_array(array, newsize, pixcombine='sum'):
"""
Reshape an array to a given size using either the sum, mean or median of the pixels binned
Note that the old array dimensions have to be multiples of the new array dimensions
--- INPUT ---
array Array to reshape (combine pixels)
newsize New size of array
pixcombine The method to combine the pixels with. Choices are sum, mean and median
"""
sh = newsize[0],array.shape[0]//newsize[0],newsize[1],array.shape[1]//newsize[1]
if pixcombine == 'sum':
reshapedarray = array.reshape(sh).sum(-1).sum(1)
elif pixcombine == 'mean':
reshapedarray = array.reshape(sh).mean(-1).mean(1)
elif pixcombine == 'median':
reshapedarray = np.median(array.reshape(sh), axis=(-1, 1))  # median over each pixel block
return reshapedarray
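The sum-binning branch above boils down to this reshape trick, shown here self-contained on a 4x4 array (values arbitrary):

import numpy as np

arr = np.arange(16).reshape(4, 4)
newsize = (2, 2)
sh = newsize[0], arr.shape[0] // newsize[0], newsize[1], arr.shape[1] // newsize[1]
print(arr.reshape(sh).sum(-1).sum(1))
# [[10 18]
#  [42 50]]   each entry is the sum of one 2x2 block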
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
def test(path_test, input_size, hidden_size, batch_size, save_dir, model_name, maxlen):
db = read_data(path_test)
X = create_sequences(db[:-maxlen], win_size=maxlen, step=maxlen)
X = np.reshape(X, (X.shape[0], X.shape[1], input_size))
# build the model: 1 layer LSTM
print('Build model...')
model = Sequential()
model.add(LSTM(hidden_size, return_sequences=False, input_shape=(maxlen, input_size)))
model.add(Dense(maxlen))
model.load_weights(save_dir + model_name)
model.compile(loss='mse', optimizer='adam')
prediction = model.predict(X, batch_size, verbose=1)
prediction = prediction.flatten()
# prediction_container = np.array(prediction).flatten()
Y = db[maxlen:]
plt.plot(prediction, label='prediction')
plt.plot(Y, label='true')
plt.legend()
plt.show()
def word_list_to_embedding(words, embeddings, embedding_dimension=50):
'''
:param words: an n x (2*window_size + 1) matrix from data_to_mat
:param embeddings: an embedding dictionary where keys are strings and values
are embeddings; the output from embeddings_to_dict
:param embedding_dimension: the dimension of the values in embeddings; in this
assignment, embedding_dimension=50
:return: an n x ((2*window_size + 1)*embedding_dimension) matrix where each entry of the
words matrix is replaced with its embedding
'''
m, n = words.shape
words = words.reshape((-1))
return np.array([embeddings[w] for w in words], dtype=np.float32).reshape(m, n*embedding_dimension)
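The core of the helper above, on a toy two-word vocabulary (the embeddings and words are invented; embedding_dimension=2 here instead of 50):

import numpy as np

embeddings = {'hello': np.array([0.1, 0.2]), 'world': np.array([0.3, 0.4])}
words = np.array([['hello', 'world', 'hello']])   # n=1 row, window width 3
m, n = words.shape
out = np.array([embeddings[w] for w in words.reshape(-1)], dtype=np.float32).reshape(m, n * 2)
print(out.shape)  # (1, 6): each word replaced by its 2-d embedding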
#
# End Twitter Helper Functions
#
def __init__(self, N, L, comm, precision,
communication="Alltoall",
padsize=1.5,
threads=1,
planner_effort=defaultdict(lambda: "FFTW_MEASURE")):
R2C.__init__(self, N, L, comm, precision,
communication=communication,
padsize=padsize, threads=threads,
planner_effort=planner_effort)
# Reuse all shapes from r2c transform R2C simply by resizing the final complex z-dimension:
self.Nf = N[2]
self.Nfp = int(self.padsize*self.N[2]) # Independent complex wavenumbers in z-direction for padded array
# Rename since there's no real space
self.original_shape_padded = self.real_shape_padded
self.original_shape = self.real_shape
self.transformed_shape = self.complex_shape
self.original_local_slice = self.real_local_slice
self.transformed_local_slice = self.complex_local_slice
self.ks = (fftfreq(N[2])*N[2]).astype(int)
def bag_of_tokens(config, labels, label_lengths):
if config.train_output_embeddings:
with tf.variable_scope('embed', reuse=True):
output_embeddings = tf.get_variable('output_embedding')
else:
output_embeddings = tf.constant(config.output_embedding_matrix)
#everything_label_placeholder = tf.placeholder(shape=(None, config.max_length,), dtype=tf.int32)
#everything_label_length_placeholder = tf.placeholder(shape=(None,), dtype=tf.int32)
labels = tf.constant(np.array(labels))
embedded_output = tf.gather(output_embeddings, labels)
print('embedded_output before', embedded_output)
#mask = tf.sequence_mask(label_lengths, maxlen=config.max_length, dtype=tf.float32)
# note: this multiplication will broadcast the mask along all elements of the depth dimension
# (which is why we run the expand_dims to choose how to broadcast)
#embedded_output = embedded_output * tf.expand_dims(mask, axis=2)
#print('embedded_output after', embedded_output)
return tf.reduce_sum(embedded_output, axis=1)
def _compute_process_and_covariance_matrices(self, dt):
"""Computes the transition and covariance matrix of the process model and measurement model.
Args:
dt (float): Timestep of the discrete transition.
Returns:
F (numpy.ndarray): Transition matrix.
Q (numpy.ndarray): Process covariance matrix.
R (numpy.ndarray): Measurement covariance matrix.
"""
F = np.array(np.bmat([[np.eye(3), dt * np.eye(3)], [np.zeros((3, 3)), np.eye(3)]]))
self.process_matrix = F
q_p = self.process_covariance_position
q_v = self.process_covariance_velocity
Q = np.diag([q_p, q_p, q_p, q_v, q_v, q_v]) ** 2 * dt
r = self.measurement_covariance
R = r * np.eye(4)
self.process_covariance = Q
self.measurement_covariance = R
return F, Q, R
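For reference, the block structure of the transition matrix built above, evaluated self-contained for an arbitrary timestep (np.block is the modern equivalent of np.bmat):

import numpy as np

dt = 0.1
F = np.block([[np.eye(3), dt * np.eye(3)],
              [np.zeros((3, 3)), np.eye(3)]])
print(F.shape)  # (6, 6): position rows pick up dt times the velocity components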
def sample(self, sample_size=20, text=None):
"""Sample the documents."""
p = 1
if text != None:
try:
x, word_idxs = self.reader.get(text)
except Exception as e:
print(e)
return
else:
x, word_idxs = self.reader.random()
print(" [*] Text: %s" % " ".join([self.reader.idx2word[word_idx] for word_idx in word_idxs]))
cur_ps = self.sess.run(self.p_x_i, feed_dict={self.x: x})
word_idxs = np.array(cur_ps).argsort()[-sample_size:][::-1]
ps = cur_ps[word_idxs]
for idx, (cur_p, word_idx) in enumerate(zip(ps, word_idxs)):
print(" [%d] %-20s: %.8f" % (idx+1, self.reader.idx2word[word_idx], cur_p))
p *= cur_p
print(" [*] perp : %8.f" % -np.log(p))
def plot_nucleotide_diversity(ax, fqlists, invert=False):
'''
Create a FastQC-like "?Per base sequence content" plot
Plot fraction of nucleotides per position
zip will stop when shortest read is exhausted
'''
if invert:
fqlists = [list(reversed(read)) for read in fqlists]
numreads = len(fqlists)
sns.set_style("darkgrid")
l_A, = ax.plot(
np.array([pos.count('A') / numreads for pos in zip(*fqlists)]), 'green', label='A')
l_T, = ax.plot(
np.array([pos.count('T') / numreads for pos in zip(*fqlists)]), 'red', label='T')
l_G, = ax.plot(
np.array([pos.count('G') / numreads for pos in zip(*fqlists)]), 'black', label='G')
l_C, = ax.plot(
np.array([pos.count('C') / numreads for pos in zip(*fqlists)]), 'blue', label='C')
if invert:
ax.set_xticklabels(-1 * ax.get_xticks().astype(int))
return [l_A, l_T, l_G, l_C]
def plot_qual(ax, quallist, invert=False):
'''
Create a FastQC-like "?Per base sequence quality?" plot
Plot average quality per position
zip will stop when shortest read is exhausted
'''
sns.set_style("darkgrid")
if invert:
l_Q, = ax.plot(np.array([np.mean(position) for position in zip(
*[list(reversed(read)) for read in quallist])]), 'orange', label="Quality")
ax.set_xlabel('Position in read from end')
ax.set_xticklabels(-1 * ax.get_xticks().astype(int))
else:
l_Q, = ax.plot(np.array([np.mean(position)
for position in zip(*quallist)]), 'orange', label="Quality")
ax.set_xlabel('Position in read from start')
return l_Q
def d_x2(self, factors=None):
"""Creates a sparse matrix for computing the second derivative with respect to x multiplied
by factors given for every point. Uses central difference quotient.
Args:
factors: Factor for each point to be applied after derivation.
Returns:
Sparse matrix that computes the second derivatives of the field components.
"""
# use ones as factors if none are specified
if factors is None:
factors = np.array(1).repeat(self.num_points)
return sp.dia_matrix((np.array([factors, -2*factors, factors]), [-1, 0, 1]),
shape=(self.num_points, self.num_points))
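What the dia_matrix call above produces, shown dense on a 5-point grid with unit factors (purely illustrative):

import numpy as np
import scipy.sparse as sp

n = 5
factors = np.ones(n)
D2 = sp.dia_matrix((np.array([factors, -2 * factors, factors]), [-1, 0, 1]),
                   shape=(n, n))
print(D2.toarray())  # tridiagonal [1, -2, 1] central second-difference stencil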
def plot_region(self, region):
"""Shows the given region in the field plot.
Args:
region: Region to be plotted.
"""
if type(region) == reg.PointRegion:
self.axes.plot(np.ones(2) * region.point_coordinates / self._x_axis_factor,
np.array([-1, 1]) * self.scale, color='black')
elif type(region) == reg.LineRegion:
self.axes.plot(np.ones(2) * region.line_coordinates[0] / self._x_axis_factor,
np.array([-1, 1]) * self.scale, color='black')
self.axes.plot(np.ones(2) * region.line_coordinates[1] / self._x_axis_factor,
np.array([-1, 1]) * self.scale, color='black')
else:
raise TypeError('Unknown type in region list: {}'.format(type(region)))
def test_accuracy_full_batch(tokens, features, mini_batch_size, word_attn, sent_attn, th=0.5):
p = []
l = []
cnt = 0
g = gen_minibatch1(tokens, features, mini_batch_size, False)
for token, feature in g:
if cnt % 100 == 0:
print(cnt)
cnt +=1
# print token.size()
# y_pred = get_predictions(token, word_attn, sent_attn)
# print y_pred
y_pred = get_predictions(token, feature, word_attn, sent_attn)
# print y_pred
# _, y_pred = torch.max(y_pred, 1)
# y_pred = y_pred[:, 1]
# print y_pred
p.append(np.ndarray.flatten(y_pred.data.cpu().numpy()))
p = [item for sublist in p for item in sublist]
p = np.array(p)
return p
def test_accuracy_full_batch(tokens, features, mini_batch_size, word_attn, sent_attn, th=0.5):
p = []
l = []
cnt = 0
g = gen_minibatch1(tokens, features, mini_batch_size, False)
for token, feature in g:
if cnt % 100 == 0:
print(cnt)
cnt +=1
# print token.size()
# y_pred = get_predictions(token, word_attn, sent_attn)
# print y_pred
y_pred = get_predictions(token, feature, word_attn, sent_attn)
# print y_pred
# _, y_pred = torch.max(y_pred, 1)
# y_pred = y_pred[:, 1]
# print y_pred
p.append(np.ndarray.flatten(y_pred.data.cpu().numpy()))
p = [item for sublist in p for item in sublist]
p = np.array(p)
return p
def _ncc_c(x, y):
"""
>>> _ncc_c([1,2,3,4], [1,2,3,4])
array([ 0.13333333, 0.36666667, 0.66666667, 1. , 0.66666667,
0.36666667, 0.13333333])
>>> _ncc_c([1,1,1], [1,1,1])
array([ 0.33333333, 0.66666667, 1. , 0.66666667, 0.33333333])
>>> _ncc_c([1,2,3], [-1,-1,-1])
array([-0.15430335, -0.46291005, -0.9258201 , -0.77151675, -0.46291005])
"""
den = np.array(norm(x) * norm(y))
den[den == 0] = np.inf
x_len = len(x)
fft_size = 1<<(2*x_len-1).bit_length()
cc = ifft(fft(x, fft_size) * np.conj(fft(y, fft_size)))
cc = np.concatenate((cc[-(x_len-1):], cc[:x_len]))
return np.real(cc) / den
def layout_tree(correlation):
"""Layout tree for visualization with e.g. matplotlib.
Args:
correlation: A [V, V]-shaped numpy array of latent correlations.
Returns:
A [V, 3]-shaped numpy array of spectral positions of vertices.
"""
assert len(correlation.shape) == 2
assert correlation.shape[0] == correlation.shape[1]
assert correlation.dtype == np.float32
laplacian = -correlation
np.fill_diagonal(laplacian, 0)
np.fill_diagonal(laplacian, -laplacian.sum(axis=0))
evals, evects = scipy.linalg.eigh(laplacian, eigvals=[1, 2, 3])
assert np.all(evals > 0)
assert evects.shape[1] == 3
return evects
def __init__(self, N, V, tree_prior, config):
"""Initialize a model with an empty subsample.
Args:
N (int): Number of rows in the dataset.
V (int): Number of columns (features) in the dataset.
tree_prior: A [K]-shaped numpy array of prior edge log odds, where
K is the number of edges in the complete graph on V vertices.
config: A global config dict.
"""
assert isinstance(N, int)
assert isinstance(V, int)
assert isinstance(tree_prior, np.ndarray)
assert isinstance(config, dict)
K = V * (V - 1) // 2 # Number of edges in complete graph.
assert V <= 32768, 'Invalid # features > 32768: {}'.format(V)
assert tree_prior.shape == (K, )
assert tree_prior.dtype == np.float32
self._config = config.copy()
self._num_rows = N
self._tree_prior = tree_prior
self._tree = TreeStructure(V)
assert self._tree.num_vertices == V
self._program = make_propagation_program(self._tree.tree_grid)
self._added_rows = set()
def sample_tree(self):
"""Samples a random tree.
Returns:
A pair (edges, edge_logits), where:
edges: A list of (vertex, vertex) pairs.
edge_logits: A [K]-shaped numpy array of edge logits.
"""
logger.info('TreeCatTrainer.sample_tree given %d rows',
len(self._added_rows))
SERIES.sample_tree_num_rows.append(len(self._added_rows))
complete_grid = self._tree.complete_grid
edge_logits = self.compute_edge_logits()
assert edge_logits.shape[0] == complete_grid.shape[1]
assert edge_logits.dtype == np.float32
edges = self.get_edges()
edges = sample_tree(complete_grid, edge_logits, edges)
return edges, edge_logits
def compute_edge_logits(self):
"""Compute non-normalized logprob of all V(V-1)/2 candidate edges.
This is used for sampling and estimating the latent tree.
"""
V, E, K, M = self._VEKM
vert_logits = logprob_dc(self._vert_ss, self._vert_prior, axis=1)
if len(self._added_rows) == V:
assignments = self._assignments
else:
assignments = self._assignments[sorted(self._added_rows), :]
assignments = np.array(assignments, order='F')
parallel = self._config['learning_parallel']
result = treecat_compute_edge_logits(M, self._tree.complete_grid,
self._gammaln_table, assignments,
vert_logits, parallel)
result += self._tree_prior
return result
def train(self):
"""Train a TreeCat model using subsample-annealed MCMC.
Returns:
A trained model as a dictionary with keys:
config: A global config dict.
tree: A TreeStructure instance with the learned latent
structure.
edge_logits: A [K]-shaped array of all edge logits.
suffstats: Sufficient statistics of features, vertices, and
edges and a ragged_index for the features array.
assignments: An [N, V]-shaped numpy array of latent cluster
ids for each cell in the dataset, where N is the number of
data rows and V is the number of features.
"""
model = TreeTrainer.train(self)
model['assignments'] = self._assignments
model['suffstats'] = {
'ragged_index': self._table.ragged_index,
'vert_ss': self._vert_ss,
'edge_ss': self._edge_ss,
'feat_ss': self._feat_ss,
'meas_ss': self._meas_ss,
}
return model
def __init__(self, data, tree_prior, config):
"""Initialize a model with an empty subsample.
Args:
data: An [N, V]-shaped numpy array of real-valued data.
tree_prior: A [K]-shaped numpy array of prior edge log odds, where
K is the number of edges in the complete graph on V vertices.
config: A global config dict.
"""
assert isinstance(data, np.ndarray)
data = np.asarray(data, np.float32)
assert len(data.shape) == 2
N, V = data.shape
D = config['model_latent_dim']
E = V - 1 # Number of edges in the tree.
TreeTrainer.__init__(self, N, V, tree_prior, config)
self._data = data
self._latent = np.zeros([N, V, D], np.float32)
# This is symmetric positive definite.
self._vert_ss = np.zeros([V, D, D], np.float32)
# This is arbitrary (not necessarily symmetric).
self._edge_ss = np.zeros([E, D, D], np.float32)
# This represents (count, mean, covariance).
self._feat_ss = np.zeros([V, D, 1 + 1 + D], np.float32)
def train(self):
"""Train a TreeGauss model using subsample-annealed MCMC.
Returns:
A trained model as a dictionary with keys:
config: A global config dict.
tree: A TreeStructure instance with the learned latent
structure.
edge_logits: A [K]-shaped array of all edge logits.
suffstats: Sufficient statistics of features and vertices.
latent: An [N, V, M]-shaped numpy array of latent states, where
N is the number of data rows, V is the number of features,
and M is the dimension of each latent variable.
"""
model = TreeTrainer.train(self)
model['latent'] = self._latent
model['suffstats'] = {
'vert_ss': self._vert_ss,
'edge_ss': self._edge_ss,
'feat_ss': self._feat_ss,
}
return model
def train_ensemble(table, tree_prior, config):
"""Train a TreeCat ensemble model using subsample-annealed MCMC.
The ensemble size is controlled by config['model_ensemble_size'].
Let N be the number of data rows and V be the number of features.
Args:
table: A Table instance holding N rows of V features of data.
tree_prior: A [K]-shaped numpy array of prior edge log odds, where
K is the number of edges in the complete graph on V vertices.
config: A global config dict.
Returns:
A trained model as a dictionary with keys:
tree: A TreeStructure instance with the learned latent structure.
suffstats: Sufficient statistics of features, vertices, and edges.
assignments: An [N, V] numpy array of latent cluster ids for each
cell in the dataset.
"""
tasks = []
for sub_seed in range(config['model_ensemble_size']):
sub_config = config.copy()
sub_config['seed'] += sub_seed
tasks.append((table, tree_prior, sub_config))
return parallel_map(_train_model, tasks)
def test_server_logprob_normalized(N, V, C, M):
model = generate_fake_model(N, V, C, M)
config = TINY_CONFIG.copy()
config['model_num_clusters'] = M
model['config'] = config
server = TreeCatServer(model)
# The total probability of all categorical rows should be 1.
ragged_index = model['suffstats']['ragged_index']
factors = []
for v in range(V):
C = ragged_index[v + 1] - ragged_index[v]
factors.append([one_hot(c, C) for c in range(C)])
data = np.array(
[np.concatenate(columns) for columns in itertools.product(*factors)],
dtype=np.int8)
logprobs = server.logprob(data)
logtotal = np.logaddexp.reduce(logprobs)
assert logtotal == pytest.approx(0.0, abs=1e-5)
def observed_perplexity(self, counts):
"""Compute perplexity = exp(entropy) of observed variables.
Perplexity is an information theoretic measure of the number of
clusters or latent classes. Perplexity is a real number in the range
[1, M], where M is model_num_clusters.
Args:
counts: A [V]-shaped array of multinomial counts.
Returns:
A [V]-shaped numpy array of perplexity.
"""
V, E, M, R = self._VEMR
if counts is not None:
counts = np.ones(V, dtype=np.int8)
assert counts.shape == (V, )
assert counts.dtype == np.int8
assert np.all(counts > 0)
observed_entropy = np.empty(V, dtype=np.float32)
for v in range(V):
beg, end = self._ragged_index[v:v + 2]
probs = np.dot(self._feat_cond[beg:end, :], self._vert_probs[v, :])
observed_entropy[v] = multinomial_entropy(probs, counts[v])
return np.exp(observed_entropy)
def observed_perplexity(self, counts):
"""Compute perplexity = exp(entropy) of observed variables.
Perplexity is an information theoretic measure of the number of
clusters or observed classes. Perplexity is a real number in the range
[1, dim[v]], where dim[v] is the number of categories in an observed
categorical variable or 2 for an ordinal variable.
Args:
counts: A [V]-shaped array of multinomial counts.
Returns:
A [V]-shaped numpy array of perplexity.
"""
result = self._ensemble[0].observed_perplexity(counts)
for server in self._ensemble[1:]:
result += server.observed_perplexity(counts)
result /= len(self._ensemble)
return result
def latent_correlation(self):
"""Compute correlation matrix among latent features.
This computes the generalization of Pearson's correlation to discrete
data. Let I(X;Y) be the mutual information. Then define correlation as
rho(X,Y) = sqrt(1 - exp(-2 I(X;Y)))
Returns:
A [V, V]-shaped numpy array of feature-feature correlations.
"""
result = self._ensemble[0].latent_correlation()
for server in self._ensemble[1:]:
result += server.latent_correlation()
result /= len(self._ensemble)
return result
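The mutual-information-to-correlation mapping quoted in the docstring above, as a one-line check (the value of I is arbitrary):

import numpy as np

I = 0.5                                # mutual information in nats
rho = np.sqrt(1.0 - np.exp(-2.0 * I))  # rho(X,Y) = sqrt(1 - exp(-2 I(X;Y)))
print(round(rho, 4))                   # 0.7951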
def logprob(self, rows, evidence=None):
"""Compute non-normalized log probabilies of many rows of data.
If evidence is specified, compute conditional log probability;
otherwise compute unconditional log probability.
Args:
data: A list of rows of data, where each row is a sparse dict
mapping feature name to feature value.
evidence: An optional row of conditioning data, as a sparse dict
mapping feature name to feature value.
Returns:
An [len(rows)]-shaped numpy array of log probabilities.
"""
data = import_rows(self._schema, rows)
if evidence is None:
return self._server.logprob(data)
else:
ragged_evidence = import_rows(self._schema, [evidence])
return (self._server.logprob(data + ragged_evidence) -
self._server.logprob(data + evidence))
def sample(self, N, evidence=None):
"""Draw N samples from the posterior distribution.
Args:
N: The number of samples to draw.
evidence: An optional single row of conditioning data, as a sparse
dict mapping feature name to feature value.
Returns:
An [N, R]-shaped numpy array of sampled multinomial data.
"""
if evidence is None:
data = None
else:
data = import_rows(self._schema, [evidence])[0]
ragged_samples = self._server.sample(N, self._counts, data)
return export_rows(self._schema, ragged_samples)
def fit(self, X, y):
""" Training the gcForest on input data X and associated target y.
:param X: np.array
Array containing the input samples.
Must be of shape [n_samples, data] where data is a 1D array.
:param y: np.array
1D array containing the target values.
Must be of shape [n_samples]
"""
if np.shape(X)[0] != len(y):
raise ValueError('Sizes of y and X do not match.')
mgs_X = self.mg_scanning(X, y)
_ = self.cascade_forest(mgs_X, y)
def contest(self, b, g, r):
""" Search for biased BGR values
Finds closest neuron (min dist) and updates self.freq
finds best neuron (min dist-self.bias) and returns position
for frequently chosen neurons, self.freq[i] is high and self.bias[i] is negative
self.bias[i] = self.GAMMA*((1/self.NETSIZE)-self.freq[i])"""
i, j = self.SPECIALS, self.NETSIZE
dists = abs(self.network[i:j] - np.array([b,g,r])).sum(1)
bestpos = i + np.argmin(dists)
biasdists = dists - self.bias[i:j]
bestbiaspos = i + np.argmin(biasdists)
self.freq[i:j] *= (1-self.BETA)
self.bias[i:j] += self.BETAGAMMA * self.freq[i:j]
self.freq[bestpos] += self.BETA
self.bias[bestpos] -= self.BETAGAMMA
return bestbiaspos
def rasterMaskToGrid( rasterMask ):
grid = []
mask = rasterMask['mask']
for y in range(rasterMask['height']):
for x in range(rasterMask['width']):
if mask[y,x]==0:
grid.append([x,y])
grid = np.array(grid, dtype=float)  # np.float alias is removed in modern NumPy
if not (rasterMask is None) and rasterMask['hex'] is True:
f = math.sqrt(3.0)/2.0
offset = -0.5
if np.argmin(rasterMask['mask'][0]) > np.argmin(rasterMask['mask'][1]):
offset = 0.5
for i in range(len(grid)):
if (grid[i][1]%2.0==0.0):
grid[i][0]-=offset
grid[i][1] *= f
return grid
def match_matrix(event: Event):
"""Returns a numpy participation matrix for the qualification matches in this event, used for calculating OPR.
Each row in the matrix corresponds to a single alliance in a match, meaning that there will be two rows (one for
red, one for blue) per match. Each column represents a single team, ordered by team number. If a team participated
on a certain alliance, the value at that row and column would be 1, otherwise, it would be 0. For example, an
event with teams 1-7 that featured a match that pitted teams 1, 3, and 5 against 2, 4, and 6 would have a match
matrix that looks like this (sans labels):
#1 #2 #3 #4 #5 #6 #7
qm1_red 1 0 1 0 1 0 0
qm1_blue 0 1 0 1 0 1 0
"""
match_list = []
for match in filter(lambda match: match['comp_level'] == 'qm', event.matches):
matchRow = []
for team in event.teams:
matchRow.append(1 if team['key'] in match['alliances']['red']['teams'] else 0)
match_list.append(matchRow)
matchRow = []
for team in event.teams:
matchRow.append(1 if team['key'] in match['alliances']['blue']['teams'] else 0)
match_list.append(matchRow)
mat = numpy.array(match_list)
sum_matches = numpy.sum(mat, axis=0)
avg_team_matches = sum(sum_matches) / float(len(sum_matches))
return mat[:, numpy.apply_along_axis(numpy.count_nonzero, 0, mat) > avg_team_matches - 2]
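A hedged sketch of how such a participation matrix is typically used for OPR: solve the least-squares system A @ opr ≈ alliance_scores (the matrix mirrors the docstring's toy match; the scores are invented):

import numpy as np

A = np.array([[1, 0, 1, 0, 1, 0, 0],   # qm1 red:  teams 1, 3, 5
              [0, 1, 0, 1, 0, 1, 0]])  # qm1 blue: teams 2, 4, 6
scores = np.array([30.0, 24.0])        # made-up alliance scores
opr, *_ = np.linalg.lstsq(A, scores, rcond=None)
print(opr.round(2))  # minimum-norm least-squares contribution per team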
def get_img(data_path):
# Getting image array from path:
img = imread(data_path)
img = imresize(img, (64, 64))
return img
def get_dataset(dataset_path='Data/Train_Data'):
# Getting all data from data path:
try:
X = np.load('Data/npy_train_data/X.npy')
Y = np.load('Data/npy_train_data/Y.npy')
except:
inputs_path = dataset_path+'/input'
images = listdir(inputs_path) # Getting images
X = []
Y = []
for img in images:
img_path = inputs_path+'/'+img
x_img = get_img(img_path).astype('float32').reshape(64, 64, 3)
x_img /= 255.
y_img = get_img(img_path.replace('input/', 'mask/mask_')).astype('float32').reshape(64, 64, 1)
y_img /= 255.
X.append(x_img)
Y.append(y_img)
X = np.array(X)
Y = np.array(Y)
# Create dataset:
if not os.path.exists('Data/npy_train_data/'):
os.makedirs('Data/npy_train_data/')
np.save('Data/npy_train_data/X.npy', X)
np.save('Data/npy_train_data/Y.npy', Y)
X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.1, random_state=42)
return X, X_test, Y, Y_test
def read_groundtruth():
ret = []
with open(
os.path.join(
os.path.abspath(os.path.dirname(__file__)),
'groundtruth.txt'), 'rb') as lines:
for line in lines:
ret.append(line[:-2])
return np.array(ret)
def extract_digits(self, image):
"""
Extract digits from a binary image representing a sudoku
:param image: binary image/sudoku
:return: array of digits and their probabilities
"""
prob = np.zeros(4, dtype=np.float32)
digits = np.zeros((4, 9, 9), dtype=object)
for i in range(4):
labeled, features = label(image, structure=CROSS)
objs = find_objects(labeled)
for obj in objs:
roi = image[obj]
# center of bounding box
cy = (obj[0].stop + obj[0].start) / 2
cx = (obj[1].stop + obj[1].start) / 2
dists = cdist([[cy, cx]], CENTROIDS, 'euclidean')
pos = np.argmin(dists)
cy, cx = pos % 9, pos // 9  # integer division keeps cx usable as an index
# 28x28 image, center relative to sudoku
prediction = self.classifier.classify(morph(roi))
if digits[i, cy, cx] is 0:
# Newly found digit
digits[i, cy, cx] = prediction
prob[i] += prediction[0, 0]
elif prediction[0, 0] > digits[i, cy, cx][0, 0]:
# Overlapping! (noise), choose the most probable prediction
prob[i] -= digits[i, cy, cx][0, 0]
digits[i, cy, cx] = prediction
prob[i] += prediction[0, 0]
image = np.rot90(image)
logging.info(prob)
return digits[np.argmax(prob)]
def diagonal(_, pos):
"""
Given an object's pixel positions, return the diagonal length of its
bounding box
:param _: pixel values (unused)
:param pos: pixel position (1-D)
:return: diagonal of bounding box
"""
xs = np.array([i // SSIZE for i in pos])  # integer row indices
ys = np.array([i % SSIZE for i in pos])   # integer column indices
minx = np.amin(xs)
miny = np.amin(ys)
maxx = np.amax(xs)
maxy = np.amax(ys)
return compute_line(np.array([minx, miny]), np.array([maxx, maxy]))