The following 36 code examples, extracted from open-source Python projects, illustrate how to use keras.backend.softmax().
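Before the extracted examples, here is a minimal, self-contained sketch of what keras.backend.softmax() computes: it exponentiates the inputs and normalizes them along the last axis so each row becomes a probability distribution. The toy logits and the TensorFlow-backed Keras setup are assumptions for illustration only, not part of any example below.

```python
import numpy as np
from keras import backend as K

# Toy logits (illustrative values): two samples, three classes each.
logits = K.variable(np.array([[1.0, 2.0, 3.0],
                              [0.0, 0.0, 0.0]]))

probs = K.eval(K.softmax(logits))  # softmax over the last axis
print(probs)               # rows are positive and sum to 1
print(probs.sum(axis=-1))  # -> [1. 1.]
```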
def call(self, x, mask=None):
    mean = super(IntraAttention, self).call(x, mask)
    # x: (batch_size, input_length, input_dim)
    # mean: (batch_size, input_dim)
    ones = K.expand_dims(K.mean(K.ones_like(x), axis=(0, 2)), dim=0)  # (1, input_length)
    # (batch_size, input_length, input_dim)
    tiled_mean = K.permute_dimensions(K.dot(K.expand_dims(mean), ones), (0, 2, 1))
    if mask is not None:
        if K.ndim(mask) > K.ndim(x):
            # Assuming this is because of the bug in Bidirectional. Temporary fix follows.
            # TODO: Fix Bidirectional.
            mask = K.any(mask, axis=(-2, -1))
        if K.ndim(mask) < K.ndim(x):
            mask = K.expand_dims(mask)
        x = switch(mask, x, K.zeros_like(x))
    # (batch_size, input_length, proj_dim)
    projected_combination = K.tanh(K.dot(x, self.vector_projector) + K.dot(tiled_mean, self.mean_projector))
    scores = K.dot(projected_combination, self.scorer)  # (batch_size, input_length)
    weights = K.softmax(scores)  # (batch_size, input_length)
    attended_x = K.sum(K.expand_dims(weights) * x, axis=1)  # (batch_size, input_dim)
    return attended_x
def step(self, inputs, states):
    h_tm1 = states[0]  # previous memory
    #B_U = states[1]  # dropout matrices for recurrent units
    #B_W = states[2]
    h_tm1a = K.dot(h_tm1, self.Wa)
    eij = K.dot(K.tanh(h_tm1a + K.dot(inputs[:, :self.h_dim], self.Ua)), self.Va)
    eijs = K.repeat_elements(eij, self.h_dim, axis=1)
    #alphaij = K.softmax(eijs)  # batchsize * lenh h batchsize * lenh * ndim
    #ci = K.permute_dimensions(K.permute_dimensions(self.h, [2,0,1]) * alphaij, [1,2,0])
    #cisum = K.sum(ci, axis=1)
    cisum = eijs*inputs[:, :self.h_dim]
    #print(K.shape(cisum), cisum.shape, ci.shape, self.h.shape, alphaij.shape, x.shape)

    zr = K.sigmoid(K.dot(inputs[:, self.h_dim:], self.Wzr) + K.dot(h_tm1, self.Uzr) + K.dot(cisum, self.Czr))
    zi = zr[:, :self.units]
    ri = zr[:, self.units: 2 * self.units]
    si_ = K.tanh(K.dot(inputs[:, self.h_dim:], self.W) + K.dot(ri*h_tm1, self.U) + K.dot(cisum, self.C))
    si = (1-zi) * h_tm1 + zi * si_
    return si, [si]  #h_tm1, [h_tm1]
def _softmax(x, dim):
    """Computes softmax along a specified dim. Keras currently lacks this
    feature.
    """
    if K.backend() == 'tensorflow':
        import tensorflow as tf
        return tf.nn.softmax(x, dim)
    elif K.backend() == 'cntk':
        import cntk
        return cntk.softmax(x, dim)
    elif K.backend() == 'theano':
        # Theano cannot softmax along an arbitrary dim.
        # So, we will shuffle `dim` to -1 and un-shuffle after softmax.
        perm = np.arange(K.ndim(x))
        perm[dim], perm[-1] = perm[-1], perm[dim]
        x_perm = K.permute_dimensions(x, perm)
        output = K.softmax(x_perm)

        # Permute back: the swap permutation is its own inverse.
        output = K.permute_dimensions(output, perm)
        return output
    else:
        raise ValueError("Backend '{}' not supported".format(K.backend()))
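As a usage sketch for the idea above (softmax along an arbitrary axis), the snippet below builds the same permute / softmax / permute-back pattern from documented keras.backend ops only, without the backend dispatch. The helper name `softmax_along` and the random input are illustrative assumptions, not part of the example.

```python
import numpy as np
from keras import backend as K

def softmax_along(x, axis):
    # Illustrative helper (not from the example above): move `axis` last,
    # apply K.softmax, then move it back. The swap pattern is its own inverse.
    perm = list(range(K.ndim(x)))
    perm[axis], perm[-1] = perm[-1], perm[axis]
    out = K.softmax(K.permute_dimensions(x, perm))
    return K.permute_dimensions(out, perm)

x = K.variable(np.random.rand(2, 4, 3))
probs = K.eval(softmax_along(x, axis=1))
print(probs.sum(axis=1))  # every slice along axis 1 sums to 1
```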
def call(self, x, mask=None):
    y = K.dot(x, self.att_W)
    if not self.activation:
        if K.backend() == 'theano':
            weights = K.theano.tensor.tensordot(self.att_v, y, axes=[0, 2])
        elif K.backend() == 'tensorflow':
            weights = K.tensorflow.python.ops.math_ops.tensordot(self.att_v, y, axes=[0, 2])
    elif self.activation == 'tanh':
        if K.backend() == 'theano':
            weights = K.theano.tensor.tensordot(self.att_v, K.tanh(y), axes=[0, 2])
        elif K.backend() == 'tensorflow':
            weights = K.tensorflow.python.ops.math_ops.tensordot(self.att_v, K.tanh(y), axes=[0, 2])
    weights = K.softmax(weights)
    out = x * K.permute_dimensions(K.repeat(weights, x.shape[2]), [0, 2, 1])
    if self.op == 'attsum':
        out = out.sum(axis=1)
    elif self.op == 'attmean':
        out = out.sum(axis=1) / mask.sum(axis=1, keepdims=True)
    return K.cast(out, K.floatx())
def content_addressing(memory_t, key_vector_t, key_strength_t):
    '''
    Focusing by content.
    :param memory_t: external memory.
    :param key_vector_t: key vector.
    :param key_strength_t: the strength of key.
    :return:
    '''
    # print("content addressing:")
    # print(">>memory_t")
    # print(key_vector_t)
    # print(">>key_vector_t")
    # print(key_vector_t)
    # print(">>key_strength_t")
    # print(key_strength_t)
    _weight_content_t = \
        key_strength_t * cosine_similarity_group(key_vector_t, memory_t)
    weight_content_t = softmax(_weight_content_t)
    # print("_weight_content_t")
    # print(_weight_content_t)
    return weight_content_t
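The example relies on a project-specific `cosine_similarity_group` helper. The sketch below reproduces the same content-addressing idea (key strength times cosine similarity, then softmax) with plain keras.backend ops; the memory size, key, and strength value are made-up for illustration.

```python
import numpy as np
from keras import backend as K

memory = K.variable(np.random.rand(8, 16))  # (slots, width), illustrative
key = K.variable(np.random.rand(1, 16))     # (1, width)
beta = 5.0                                  # key strength

# Cosine similarity between the key and every memory slot -> (1, slots)
dots = K.dot(key, K.transpose(memory))
norms = K.sqrt(K.sum(key * key)) * K.sqrt(K.sum(memory * memory, axis=1))
similarity = dots / (norms + K.epsilon())

weights = K.softmax(beta * similarity)  # content-based addressing weights
print(K.eval(K.sum(weights)))           # -> 1.0
```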
def _get_weight_vector(self, M, w_tm1, k, beta, g, s, gamma):
    # M = tf.Print(M, [M, w_tm1, k], message='get weights beg1: ')
    # M = tf.Print(M, [beta, g, s, gamma], message='get weights beg2: ')
    # Content addressing, see Chapter 3.3.1:
    num = beta * _cosine_distance(M, k)
    w_c = K.softmax(num)  # It turns out that equation (5) is just softmax.
    # Location addressing, see Chapter 3.3.2:
    # Equation 7:
    w_g = (g * w_c) + (1-g)*w_tm1
    # C_s is the circular convolution
    #C_w = K.sum((self.C[None, :, :, :] * w_g[:, None, None, :]),axis=3)
    # Equation 8:
    # TODO: Explain
    C_s = K.sum(K.repeat_elements(self.C[None, :, :, :], self.batch_size, axis=0) * s[:,:,None,None], axis=1)
    w_tilda = K.batch_dot(C_s, w_g)
    # Equation 9:
    w_out = _renorm(w_tilda ** gamma)
    return w_out
def register(self, info_tensor, param_tensor):
    self.info_tensor = info_tensor  #(128,1)
    if self.stddev_fix:
        self.param_tensor = param_tensor
        mean = K.clip(param_tensor[:, 0].dimshuffle(0, 'x'), self.min, self.max)
        std = 1.0
    else:
        self.param_tensor = param_tensor  # 2
        mean = K.clip(param_tensor[:, 0].dimshuffle(0, 'x'), self.min, self.max)
        # std = K.maximum( param_tensor[:, 1].dimshuffle(0, 'x'), 0)
        std = K.sigmoid( param_tensor[:, 1].dimshuffle(0, 'x') )
    e = (info_tensor-mean)/(std + K.epsilon())
    self.log_Q_c_given_x = \
        K.sum(-0.5*np.log(2*np.pi) -K.log(std+K.epsilon()) -0.5*(e**2), axis=1) * self.lmbd
    # m = Sequential([ Activation('softmax', input_shape=(self.n,)), Lambda(lambda x: K.log(x), lambda x: x) ])
    return K.reshape(self.log_Q_c_given_x, (-1, 1))
def __init__(self, vocab_size, sequence_size, setting=None,
             checkpoint_path="", temperature=10, tying=False):
    super().__init__(vocab_size, sequence_size, setting, checkpoint_path)
    self.temperature = temperature
    self.tying = tying
    self.gamma = self.setting.gamma

    if tying:
        self.model.pop()  # remove activation
        self.model.pop()  # remove projection (use self embedding)
        self.model.add(Lambda(lambda x: K.dot(x, K.transpose(self.embedding.embeddings))))
        self.model.add(Activation("softmax"))
def augmented_loss(self, y_true, y_pred):
    _y_pred = Activation("softmax")(y_pred)
    loss = K.categorical_crossentropy(_y_pred, y_true)

    # y is (batch x seq x vocab)
    y_indexes = K.argmax(y_true, axis=2)  # turn one hot to index. (batch x seq)
    y_vectors = self.embedding(y_indexes)  # lookup the vector (batch x seq x vector_length)

    #v_length = self.setting.vector_length
    #y_vectors = K.reshape(y_vectors, (-1, v_length))
    #y_t = K.map_fn(lambda v: K.dot(self.embedding.embeddings, K.reshape(v, (-1, 1))), y_vectors)
    #y_t = K.squeeze(y_t, axis=2)  # unknown but necessary operation
    #y_t = K.reshape(y_t, (-1, self.sequence_size, self.vocab_size))

    # vector x embedding dot products (batch x seq x vocab)
    y_t = tf.tensordot(y_vectors, K.transpose(self.embedding.embeddings), 1)
    y_t = K.reshape(y_t, (-1, self.sequence_size, self.vocab_size))  # explicitly set shape
    y_t = K.softmax(y_t / self.temperature)
    _y_pred_t = Activation("softmax")(y_pred / self.temperature)
    aug_loss = kullback_leibler_divergence(y_t, _y_pred_t)
    loss += (self.gamma * self.temperature) * aug_loss

    return loss
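The loss above divides logits by a temperature before the softmax to obtain softened targets for the KL term. A standalone sketch of that effect, with made-up logits and the same temperature value of 10:

```python
import numpy as np
from keras import backend as K

logits = K.variable(np.array([[2.0, 1.0, 0.1]]))  # illustrative logits
temperature = 10.0                                # plays the role of self.temperature

sharp = K.eval(K.softmax(logits))               # ordinary softmax: fairly peaked
soft = K.eval(K.softmax(logits / temperature))  # softened targets: close to uniform
print(sharp)
print(soft)
```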
def mean_log_Gaussian_like(y_true, parameters):
    """Mean Log Gaussian Likelihood distribution
    Note: The 'c' variable is obtained as global variable
    """

    #Note: The output size will be (c + 2) * m = 6
    c = 1  #The number of outputs we want to predict
    m = 2  #The number of distributions we want to use in the mixture
    components = K.reshape(parameters, [-1, c + 2, m])
    mu = components[:, :c, :]
    sigma = components[:, c, :]
    alpha = components[:, c + 1, :]
    alpha = K.softmax(K.clip(alpha, 1e-8, 1.))

    exponent = K.log(alpha) - .5 * float(c) * K.log(2 * np.pi) \
        - float(c) * K.log(sigma) \
        - K.sum((K.expand_dims(y_true, 2) - mu)**2, axis=1)/(2*(sigma)**2)

    log_gauss = log_sum_exp(exponent, axis=1)
    res = - K.mean(log_gauss)
    return res
def mean_log_LaPlace_like(y_true, parameters):
    """Mean Log Laplace Likelihood distribution
    Note: The 'c' variable is obtained as global variable
    """

    #Note: The output size will be (c + 2) * m = 6
    c = 1  #The number of outputs we want to predict
    m = 2  #The number of distributions we want to use in the mixture
    components = K.reshape(parameters, [-1, c + 2, m])
    mu = components[:, :c, :]
    sigma = components[:, c, :]
    alpha = components[:, c + 1, :]
    alpha = K.softmax(K.clip(alpha, 1e-2, 1.))

    exponent = K.log(alpha) - float(c) * K.log(2 * sigma) \
        - K.sum(K.abs(K.expand_dims(y_true, 2) - mu), axis=1)/(sigma)

    log_gauss = log_sum_exp(exponent, axis=1)
    res = - K.mean(log_gauss)
    return res
def summarize_memory(o_t, mem_tm1):
    '''
    This method selects the relevant parts of the memory given the read output and summarizes the
    memory. Implements Equations 2-3 or 8-11 in the paper.
    '''
    # Selecting relevant memory slots, Equation 2
    z_t = K.softmax(K.sum(K.expand_dims(o_t, dim=1) * mem_tm1, axis=2))  # (batch_size, input_length)
    # Summarizing memory, Equation 3
    m_rt = K.sum(K.expand_dims(z_t, dim=2) * mem_tm1, axis=1)  # (batch_size, output_dim)
    return z_t, m_rt
def call(self, logits):
    u = K.random_uniform(K.shape(logits), 0, 1)
    gumbel = - K.log(-K.log(u + 1e-20) + 1e-20)
    return K.in_train_phase(
        K.softmax( ( logits + gumbel ) / self.tau ),
        K.softmax( ( logits + gumbel ) / self.min ))
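A minimal sketch of the Gumbel-softmax sampling used in this call, outside of any layer. The logits and the temperature value are assumptions; the construction (uniform noise, double log, temperature-scaled softmax) mirrors the lines above.

```python
import numpy as np
from keras import backend as K

def gumbel_softmax_sample(logits, tau):
    # Add Gumbel noise to the logits, then apply a temperature-scaled softmax.
    u = K.random_uniform(K.shape(logits), 0, 1)
    gumbel = -K.log(-K.log(u + 1e-20) + 1e-20)
    return K.softmax((logits + gumbel) / tau)

logits = K.variable(np.log([[0.7, 0.2, 0.1]]))  # illustrative class probabilities
sample = K.eval(gumbel_softmax_sample(logits, tau=0.5))
print(sample)               # close to a one-hot vector at low tau
print(sample.sum(axis=-1))  # -> [1.]
```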
def loss(self):
    logits = self.logits
    q = K.softmax(logits)
    log_q = K.log(q + 1e-20)
    return - K.mean(q * (log_q - K.log(1.0/K.int_shape(logits)[-1])),
                    axis=tuple(range(1, len(K.int_shape(logits)))))
def softmax(x):
    # print("x")
    # print(x)
    _softmax = K.softmax(x)
    # print("softmax(x)")
    # print(_softmax)
    return _softmax
def call(self, x, mask=None):
    output_mu = K.dot(x, self.W_mu)
    output_sigma = K.dot(x, self.W_sigma)
    output_pi = K.dot(x, self.W_pi)
    if self.bias:
        output_mu += self.b_mu
        output_sigma += self.b_sigma
        output_pi += self.b_pi
    return K.concatenate([output_mu,
                          K.exp(output_sigma),
                          K.softmax(output_pi)],
                         axis=-1)
def create_entropy_loss(policy_t, beta):
    def entropy_loss_func(p_t):
        log_p_t = tf.nn.log_softmax(p_t)
        sigm_p_t = K.softmax(p_t)
        entropy_t = beta * K.sum(sigm_p_t * log_p_t, axis=-1, keepdims=True)
        return entropy_t

    entropy_loss_t = Lambda(entropy_loss_func, name="entropy_loss",
                            output_shape=(1,))(policy_t)
    return entropy_loss_t
def simple_context(self, X, mask):
    """
    Simple context calculation layer logic
    X = (batch_size, time_steps, units)
    time_steps are nothing but number of words in our case.
    """
    # segregate heading and desc
    desc, head = X[:, :max_len_desc, :], X[:, max_len_desc:, :]
    # segregate activation and context part
    head_activations, head_words = head[:, :, :activation_rnn_size], head[:, :, activation_rnn_size:]
    desc_activations, desc_words = desc[:, :, :activation_rnn_size], desc[:, :, activation_rnn_size:]

    # p=(batch_size, length_desc_words, rnn_units)
    # q=(batch_size, length_headline_words, rnn_units)
    # K.dot(p,q) = (batch_size, length_desc_words, length_headline_words)
    activation_energies = K.batch_dot(head_activations, desc_activations, axes=(2, 2))

    # make sure we don't use description words that are masked out
    activation_energies = activation_energies + -1e20 * K.expand_dims(
        1. - K.cast(mask[:, :max_len_desc], 'float32'), 1)

    # for every head word compute weights for every desc word
    activation_energies = K.reshape(activation_energies, (-1, max_len_desc))
    activation_weights = K.softmax(activation_energies)
    activation_weights = K.reshape(activation_weights, (-1, max_len_head, max_len_desc))

    # for every head word compute weighted average of desc words
    desc_avg_word = K.batch_dot(activation_weights, desc_words, axes=(2, 1))
    return K.concatenate((desc_avg_word, head_words))
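The heart of simple_context is a batched dot product between two sets of activations, a softmax over the resulting energies, and a weighted average. The standalone sketch below shows that pattern with made-up shapes (a single query vector attending over a short sequence); none of the names come from the example.

```python
import numpy as np
from keras import backend as K

query = K.variable(np.random.rand(2, 1, 8))     # (batch, 1, units), illustrative
sequence = K.variable(np.random.rand(2, 5, 8))  # (batch, steps, units)

energies = K.batch_dot(query, sequence, axes=(2, 2))   # (batch, 1, steps)
weights = K.softmax(energies)                          # softmax over the steps
context = K.batch_dot(weights, sequence, axes=(2, 1))  # (batch, 1, units)

print(K.eval(K.sum(weights, axis=-1)))  # each attention row sums to 1
```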
def neural_network(self, X):
    """pi, mu, sigma = NN(x; theta)"""
    X_image = tf.reshape(X, [-1, IMAGE_ROWS, IMAGE_COLS, 1])

    conv1 = Convolution2D(32, 5, 5, border_mode='same',
                          activation=K.relu,
                          W_regularizer=l2(0.01),
                          input_shape=(IMAGE_ROWS, IMAGE_COLS, 1))(X_image)
    pool1 = MaxPooling2D(pool_size=(2, 2), border_mode='same')(conv1)
    conv2 = Convolution2D(64, 5, 5, border_mode='same',
                          activation=K.relu,
                          W_regularizer=l2(0.01))(pool1)
    pool2 = MaxPooling2D(pool_size=(2, 2), border_mode='same')(conv2)
    pool2_flat = tf.reshape(pool2, [-1, IMAGE_ROWS//4 * IMAGE_COLS//4 * 64])
    hidden1 = Dense(1024, W_regularizer=l2(0.01), activation=K.relu)(pool2_flat)
    hidden2 = Dense(64, W_regularizer=l2(0.01), activation=K.relu)(hidden1)

    self.mus = Dense(self.K)(hidden2)
    self.sigmas = Dense(self.K, activation=K.softplus)(hidden2)
    self.pi = Dense(self.K, activation=K.softmax)(hidden2)
def step(self, x, states):
    h_tm1, c_tm1, y_tm1, B, U, H = states
    s = K.dot(c_tm1, self.W_h) + self.b_h
    s = K.repeat(s, self.input_length)
    energy = time_distributed_dense(s + H, self.W_a, self.b_a)
    energy = K.squeeze(energy, 2)
    alpha = K.softmax(energy)
    alpha = K.repeat(alpha, self.input_dim)
    alpha = K.permute_dimensions(alpha, (0, 2, 1))
    weighted_H = H * alpha
    v = K.sum(weighted_H, axis=1)
    y, new_states = super(AttentionDecoder, self).step(v, states[:-1])
    return y, new_states
def register(self, info_tensor, param_tensor):
    self.info_tensor = info_tensor
    self.param_tensor = param_tensor
    self.log_Q_c_given_x = \
        K.sum(K.log(K.softmax(param_tensor)+K.epsilon()) * info_tensor, axis=1) * self.lmbd
    # m = Sequential([ Activation('softmax', input_shape=(self.n,)), Lambda(lambda x: K.log(x), lambda x: x) ])
    return K.reshape(self.log_Q_c_given_x, (-1, 1))
def call(self, x, mask=None):
    y = K.dot(x, self.att_W)
    if not self.activation:
        weights = K.theano.tensor.tensordot(self.att_v, y, axes=[0, 2])
    elif self.activation == 'tanh':
        weights = K.theano.tensor.tensordot(self.att_v, K.tanh(y), axes=[0, 2])
    weights = K.softmax(weights)
    out = x * K.permute_dimensions(K.repeat(weights, x.shape[2]), [0, 2, 1])
    if self.op == 'attsum':
        out = out.sum(axis=1)
    elif self.op == 'attmean':
        out = out.sum(axis=1) / mask.sum(axis=1, keepdims=True)
    return K.cast(out, K.floatx())
def call(self, x, mask=None):
    ch_idx = self.channel_index
    l_idx = K.ndim(x) - 1  # last index
    x = K.permute_dimensions(
        x, tuple(i for i in range(K.ndim(x)) if i != ch_idx) + (ch_idx,))
    sh = K.shape(x)
    x = K.reshape(x, (-1, sh[-1]))
    x = K.softmax(x)
    x = K.reshape(x, sh)
    # Wrap the ranges in list() so the concatenation also works on Python 3.
    x = K.permute_dimensions(
        x, tuple(list(range(ch_idx)) + [l_idx] + list(range(ch_idx, l_idx))))
    return x

# Works TH and TF
def tf_test_error_rate(model, x, X_test, y_test):
    """
    Compute test error.
    """
    assert len(X_test) == len(y_test)

    # Predictions for the test set
    eval_prediction = K.softmax(model(x))

    predictions = batch_eval([x], [eval_prediction], [X_test])[0]

    return error_rate(predictions, y_test)
def perplexity(cls, y_true, y_pred):
    _y_pred = Activation("softmax")(y_pred)
    return super(AugmentedModel, cls).perplexity(y_true, _y_pred)
def compute_softmax(logits, temp):
    z = logits + sampling_gumbel( K.shape(logits) )
    return K.softmax( z / temp )
def call(self, x, mask=None):
    # x: (batch_size, input_length, input_dim) where input_length = head_size + 2
    head_encoding = x[:, :-2, :]  # (batch_size, head_size, input_dim)
    prep_encoding = x[:, -2, :]  # (batch_size, input_dim)
    child_encoding = x[:, -1, :]  # (batch_size, input_dim)
    if self.composition_type == 'HPCD':
        # TODO: The following line may not work with TF.
        # (batch_size, head_size, input_dim, 1) * (1, head_size, input_dim, proj_dim)
        head_proj_prod = K.expand_dims(head_encoding) * K.expand_dims(self.dist_proj_head, dim=0)
        head_projection = K.sum(head_proj_prod, axis=2)  # (batch_size, head_size, proj_dim)
    else:
        head_projection = K.dot(head_encoding, self.proj_head)  # (batch_size, head_size, proj_dim)
    prep_projection = K.expand_dims(K.dot(prep_encoding, self.proj_prep), dim=1)  # (batch_size, 1, proj_dim)
    child_projection = K.expand_dims(K.dot(child_encoding, self.proj_child), dim=1)  # (batch_size, 1, proj_dim)
    #(batch_size, head_size, proj_dim)
    if self.composition_type == 'HPCT':
        composed_projection = K.tanh(head_projection + prep_projection + child_projection)
    elif self.composition_type == 'HPC' or self.composition_type == "HPCD":
        prep_child_projection = K.tanh(prep_projection + child_projection)  # (batch_size, 1, proj_dim)
        composed_projection = K.tanh(head_projection + prep_child_projection)
    else:
        # Composition type in HC
        composed_projection = K.tanh(head_projection + child_projection)
    for hidden_layer in self.hidden_layers:
        composed_projection = K.tanh(K.dot(composed_projection, hidden_layer))  # (batch_size, head_size, proj_dim)
    # (batch_size, head_size)
    head_word_scores = K.squeeze(K.dot(composed_projection, self.scorer), axis=-1)
    if mask is None:
        attachment_probabilities = K.softmax(head_word_scores)  # (batch_size, head_size)
    else:
        if K.ndim(mask) > 2:
            # This means this layer came after a Bidirectional layer. Keras has this bug which
            # concatenates input masks instead of output masks.
            # TODO: Fix Bidirectional instead.
            mask = K.any(mask, axis=(-2, -1))
        # We need to do a masked softmax.
        exp_scores = K.exp(head_word_scores)  # (batch_size, head_size)
        head_mask = mask[:, :-2]  # (batch_size, head_size)
        # (batch_size, head_size)
        masked_exp_scores = switch(head_mask, exp_scores, K.zeros_like(head_encoding[:, :, 0]))
        # (batch_size, 1). Adding epsilon to avoid division by 0. But epsilon is float64.
        exp_sum = K.cast(K.expand_dims(K.sum(masked_exp_scores, axis=1) + K.epsilon()), 'float32')
        attachment_probabilities = masked_exp_scores / exp_sum  # (batch_size, head_size)
    return attachment_probabilities
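When a mask is present, the layer above computes the softmax by hand: exponentiate, zero out the masked positions, and renormalize. A compact sketch of that masked softmax with made-up scores and a 0/1 padding mask:

```python
import numpy as np
from keras import backend as K

scores = K.variable(np.random.rand(2, 4))      # illustrative scores
mask = K.variable(np.array([[1., 1., 1., 0.],  # 1 = real token, 0 = padding
                            [1., 1., 0., 0.]]))

exp_scores = K.exp(scores) * mask              # zero out masked positions
probs = exp_scores / (K.sum(exp_scores, axis=1, keepdims=True) + K.epsilon())

print(K.eval(K.sum(probs, axis=1)))  # each row sums (approximately) to 1
```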
def call(self, x, mask=None):
    # x[0]: (batch_size, input_length, input_dim)
    # x[1]: (batch_size, 1) indices of prepositions
    # Optional: x[2]: (batch_size, input_length - 2)
    assert isinstance(x, list) or isinstance(x, tuple)
    encoded_sentence = x[0]
    prep_indices = K.squeeze(x[1], axis=-1)  #(batch_size,)
    batch_indices = K.arange(K.shape(encoded_sentence)[0])  # (batch_size,)
    if self.with_attachment_probs:
        # We're essentially doing K.argmax(x[2]) here, but argmax is not differentiable!
        head_probs = x[2]
        head_probs_padding = K.zeros_like(x[2])[:, :2]  # (batch_size, 2)
        # (batch_size, input_length)
        padded_head_probs = K.concatenate([head_probs, head_probs_padding])
        # (batch_size, 1)
        max_head_probs = K.expand_dims(K.max(padded_head_probs, axis=1))
        # (batch_size, input_length, 1)
        max_head_prob_indices = K.expand_dims(K.equal(padded_head_probs, max_head_probs))
        # (batch_size, input_length, input_dim)
        masked_head_encoding = K.switch(max_head_prob_indices, encoded_sentence, K.zeros_like(encoded_sentence))
        # (batch_size, input_dim)
        head_encoding = K.sum(masked_head_encoding, axis=1)
    else:
        head_indices = prep_indices - 1  # (batch_size,)
        head_encoding = encoded_sentence[batch_indices, head_indices, :]  # (batch_size, input_dim)
    prep_encoding = encoded_sentence[batch_indices, prep_indices, :]  # (batch_size, input_dim)
    child_encoding = encoded_sentence[batch_indices, prep_indices+1, :]  # (batch_size, input_dim)
    '''
    prep_indices = x[1]
    sentence_mask = mask[0]
    if sentence_mask is not None:
        if K.ndim(sentence_mask) > 2:
            # This means this layer came after a Bidirectional layer. Keras has this bug which
            # concatenates input masks instead of output masks.
            # TODO: Fix Bidirectional instead.
            sentence_mask = K.any(sentence_mask, axis=(-2, -1))
    head_encoding, prep_encoding, child_encoding = self.get_split_averages(encoded_sentence, sentence_mask,
                                                                           prep_indices)
    '''
    head_projection = K.dot(head_encoding, self.proj_head)  # (batch_size, proj_dim)
    prep_projection = K.dot(prep_encoding, self.proj_prep)  # (batch_size, proj_dim)
    child_projection = K.dot(child_encoding, self.proj_child)  # (batch_size, proj_dim)
    #(batch_size, proj_dim)
    if self.composition_type == 'HPCT':
        composed_projection = K.tanh(head_projection + prep_projection + child_projection)
    elif self.composition_type == 'HPC':
        prep_child_projection = K.tanh(prep_projection + child_projection)  # (batch_size, proj_dim)
        composed_projection = K.tanh(head_projection + prep_child_projection)
    else:
        # Composition type in HC
        composed_projection = K.tanh(head_projection + child_projection)
    for hidden_layer in self.hidden_layers:
        composed_projection = K.tanh(K.dot(composed_projection, hidden_layer))  # (batch_size, proj_dim)
    # (batch_size, num_classes)
    class_scores = K.dot(composed_projection, self.scorer)
    label_probabilities = K.softmax(class_scores)
    return label_probabilities
def step(self, inputs, states):
    h_tm1 = states[0]  # previous memory
    #B_U = states[1]  # dropout matrices for recurrent units
    #B_W = states[2]
    h_tm1a = K.dot(h_tm1, self.Wa)
    eij = K.dot(K.tanh(K.repeat(h_tm1a, K.shape(self.h)[1]) + self.ha), self.Va)
    eijs = K.squeeze(eij, -1)
    alphaij = K.softmax(eijs)  # batchsize * lenh h batchsize * lenh * ndim
    ci = K.permute_dimensions(K.permute_dimensions(self.h, [2,0,1]) * alphaij, [1,2,0])
    cisum = K.sum(ci, axis=1)
    #print(K.shape(cisum), cisum.shape, ci.shape, self.h.shape, alphaij.shape, x.shape)

    zr = K.sigmoid(K.dot(inputs, self.Wzr) + K.dot(h_tm1, self.Uzr) + K.dot(cisum, self.Czr))
    zi = zr[:, :self.units]
    ri = zr[:, self.units: 2 * self.units]
    si_ = K.tanh(K.dot(inputs, self.W) + K.dot(ri*h_tm1, self.U) + K.dot(cisum, self.C))
    si = (1-zi) * h_tm1 + zi * si_
    return si, [si]  #h_tm1, [h_tm1]
    '''if self.consume_less == 'gpu':
        matrix_x = K.dot(x * B_W[0], self.W) + self.b
        matrix_inner = K.dot(h_tm1 * B_U[0], self.U[:, :2 * self.units])

        x_z = matrix_x[:, :self.units]
        x_r = matrix_x[:, self.units: 2 * self.units]
        inner_z = matrix_inner[:, :self.units]
        inner_r = matrix_inner[:, self.units: 2 * self.units]

        z = self.inner_activation(x_z + inner_z)
        r = self.inner_activation(x_r + inner_r)

        x_h = matrix_x[:, 2 * self.units:]
        inner_h = K.dot(r * h_tm1 * B_U[0], self.U[:, 2 * self.units:])
        hh = self.activation(x_h + inner_h)
    else:
        if self.consume_less == 'cpu':
            x_z = x[:, :self.units]
            x_r = x[:, self.units: 2 * self.units]
            x_h = x[:, 2 * self.units:]
        elif self.consume_less == 'mem':
            x_z = K.dot(x * B_W[0], self.W_z) + self.b_z
            x_r = K.dot(x * B_W[1], self.W_r) + self.b_r
            x_h = K.dot(x * B_W[2], self.W_h) + self.b_h
        else:
            raise ValueError('Unknown `consume_less` mode.')
        z = self.inner_activation(x_z + K.dot(h_tm1 * B_U[0], self.U_z))
        r = self.inner_activation(x_r + K.dot(h_tm1 * B_U[1], self.U_r))

        hh = self.activation(x_h + K.dot(r * h_tm1 * B_U[2], self.U_h))
    h = z * h_tm1 + (1 - z) * hh
    return h, [h]'''
def step(self, inputs, states):
    h_tm1 = states[0]  # previous memory
    #B_U = states[1]  # dropout matrices for recurrent units
    #B_W = states[2]
    h_tm1a = K.dot(h_tm1, self.Wa)
    eij = K.dot(K.tanh(h_tm1a + self.ha), self.Va)
    eijs = K.repeat_elements(eij, self.h_dim, axis=1)
    #alphaij = K.softmax(eijs)  # batchsize * lenh h batchsize * lenh * ndim
    #ci = K.permute_dimensions(K.permute_dimensions(self.h, [2,0,1]) * alphaij, [1,2,0])
    #cisum = K.sum(ci, axis=1)
    cisum = eijs*self.h
    #print(K.shape(cisum), cisum.shape, ci.shape, self.h.shape, alphaij.shape, x.shape)

    zr = K.sigmoid(K.dot(inputs, self.Wzr) + K.dot(h_tm1, self.Uzr) + K.dot(cisum, self.Czr))
    zi = zr[:, :self.units]
    ri = zr[:, self.units: 2 * self.units]
    si_ = K.tanh(K.dot(inputs, self.W) + K.dot(ri*h_tm1, self.U) + K.dot(cisum, self.C))
    si = (1-zi) * h_tm1 + zi * si_
    return si, [si]  #h_tm1, [h_tm1]
    '''if self.consume_less == 'gpu':
        matrix_x = K.dot(x * B_W[0], self.W) + self.b
        matrix_inner = K.dot(h_tm1 * B_U[0], self.U[:, :2 * self.units])

        x_z = matrix_x[:, :self.units]
        x_r = matrix_x[:, self.units: 2 * self.units]
        inner_z = matrix_inner[:, :self.units]
        inner_r = matrix_inner[:, self.units: 2 * self.units]

        z = self.inner_activation(x_z + inner_z)
        r = self.inner_activation(x_r + inner_r)

        x_h = matrix_x[:, 2 * self.units:]
        inner_h = K.dot(r * h_tm1 * B_U[0], self.U[:, 2 * self.units:])
        hh = self.activation(x_h + inner_h)
    else:
        if self.consume_less == 'cpu':
            x_z = x[:, :self.units]
            x_r = x[:, self.units: 2 * self.units]
            x_h = x[:, 2 * self.units:]
        elif self.consume_less == 'mem':
            x_z = K.dot(x * B_W[0], self.W_z) + self.b_z
            x_r = K.dot(x * B_W[1], self.W_r) + self.b_r
            x_h = K.dot(x * B_W[2], self.W_h) + self.b_h
        else:
            raise ValueError('Unknown `consume_less` mode.')
        z = self.inner_activation(x_z + K.dot(h_tm1 * B_U[0], self.U_z))
        r = self.inner_activation(x_r + K.dot(h_tm1 * B_U[1], self.U_r))

        hh = self.activation(x_h + K.dot(r * h_tm1 * B_U[2], self.U_h))
    h = z * h_tm1 + (1 - z) * hh
    return h, [h]'''
def create_model(self,):
    """
    RNN model creation
    Layers include Embedding Layer, 3 LSTM stacked,
    Simple Context layer (manually defined),
    Time Distributed Layer
    """
    length_vocab, embedding_size = self.word2vec.shape
    print ("shape of word2vec matrix ", self.word2vec.shape)

    model = Sequential()

    # TODO: look at mask zero flag
    model.add(
        Embedding(
            length_vocab, embedding_size,
            input_length=max_length,
            weights=[self.word2vec], mask_zero=True,
            name='embedding_layer'
        )
    )

    for i in range(rnn_layers):
        lstm = LSTM(rnn_size, return_sequences=True,
                    name='lstm_layer_%d' % (i + 1)
                    )
        model.add(lstm)
        # No drop out added !

    model.add(Lambda(self.simple_context,
                     mask=lambda inputs, mask: mask[:, max_len_desc:],
                     output_shape=self.output_shape_simple_context_layer,
                     name='simple_context_layer'))

    vocab_size = self.word2vec.shape[0]
    model.add(TimeDistributed(Dense(vocab_size,
                                    name='time_distributed_layer')))

    model.add(Activation('softmax', name='activation_layer'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    K.set_value(model.optimizer.lr, np.float32(learning_rate))
    print (model.summary())
    return model
def _split_and_apply_activations(self, controller_output):
    """ This takes the controller output, splits it into ntm_output, read and write addressing data.
        It returns a triple of ntm_output, controller_instructions_read, controller_instructions_write.
        ntm_output is a tensor, controller_instructions_read and controller_instructions_write are lists containing
        the addressing instruction (k, beta, g, shift, gamma) and, in the case of write, also the writing instructions,
        consisting of an erase and an add vector.

        As it is necessary for stable results,
        k and add_vector are activated via tanh, erase_vector via sigmoid (this is critical!),
        shift via softmax,
        gamma is sigmoided, inverted and clipped (probably not ideal),
        g is sigmoided,
        beta is linear (probably not ideal!) """

    # splitting
    ntm_output, controller_instructions_read, controller_instructions_write = tf.split(
                controller_output,
                np.asarray([self.output_dim,
                            self.read_heads * self.controller_read_head_emitting_dim,
                            self.write_heads * self.controller_write_head_emitting_dim]),
                axis=1)

    controller_instructions_read = tf.split(controller_instructions_read, self.read_heads, axis=1)
    controller_instructions_write = tf.split(controller_instructions_write, self.write_heads, axis=1)

    controller_instructions_read = [
        tf.split(single_head_data, np.asarray([self.m_depth, 1, 1, 3, 1]), axis=1) for
        single_head_data in controller_instructions_read]

    controller_instructions_write = [
        tf.split(single_head_data, np.asarray([self.m_depth, 1, 1, 3, 1, self.m_depth, self.m_depth]), axis=1) for
        single_head_data in controller_instructions_write]

    #activation
    ntm_output = self.activation(ntm_output)
    controller_instructions_read = [
        (tanh(k), hard_sigmoid(beta)+0.5, sigmoid(g), softmax(shift), 1 + 9*sigmoid(gamma)) for
        (k, beta, g, shift, gamma) in controller_instructions_read]

    controller_instructions_write = [
        (tanh(k), hard_sigmoid(beta)+0.5, sigmoid(g), softmax(shift), 1 + 9*sigmoid(gamma),
         hard_sigmoid(erase_vector), tanh(add_vector)) for
        (k, beta, g, shift, gamma, erase_vector, add_vector) in controller_instructions_write]

    return (ntm_output, controller_instructions_read, controller_instructions_write)
def __init__(self, output_dim, hidden_dim, output_length, depth=1, bidirectional=True, dropout=0.1, **kwargs):
    if bidirectional and hidden_dim % 2 != 0:
        raise Exception("hidden_dim for AttentionSeq2seq should be even (Because of bidirectional RNN).")
    super(AttentionSeq2seq, self).__init__()
    if type(depth) not in [list, tuple]:
        depth = (depth, depth)
    if 'batch_input_shape' in kwargs:
        shape = kwargs['batch_input_shape']
        del kwargs['batch_input_shape']
    elif 'input_shape' in kwargs:
        shape = (None,) + tuple(kwargs['input_shape'])
        del kwargs['input_shape']
    elif 'input_dim' in kwargs:
        if 'input_length' in kwargs:
            input_length = kwargs['input_length']
        else:
            input_length = None
        shape = (None, input_length, kwargs['input_dim'])
        del kwargs['input_dim']
    self.add(Layer(batch_input_shape=shape))
    if bidirectional:
        self.add(Bidirectional(LSTMEncoder(output_dim=int(hidden_dim / 2), state_input=False, return_sequences=True, **kwargs)))
    else:
        self.add(LSTMEncoder(output_dim=hidden_dim, state_input=False, return_sequences=True, **kwargs))
    for i in range(0, depth[0] - 1):
        self.add(Dropout(dropout))
        if bidirectional:
            self.add(Bidirectional(LSTMEncoder(output_dim=int(hidden_dim / 2), state_input=False, return_sequences=True, **kwargs)))
        else:
            self.add(LSTMEncoder(output_dim=hidden_dim, state_input=False, return_sequences=True, **kwargs))
    encoder = self.layers[-1]
    self.add(Dropout(dropout))
    self.add(TimeDistributed(Dense(hidden_dim if depth[1] > 1 else output_dim)))
    decoder = AttentionDecoder(hidden_dim=hidden_dim, output_length=output_length, state_input=False, **kwargs)
    self.add(Dropout(dropout))
    self.add(decoder)
    for i in range(0, depth[1] - 1):
        self.add(Dropout(dropout))
        self.add(LSTMEncoder(output_dim=hidden_dim, state_input=False, return_sequences=True, **kwargs))
    self.add(Dropout(dropout))
    self.add(TimeDistributed(Dense(output_dim, activation='softmax')))
    self.encoder = encoder
    self.decoder = decoder
def yolo_head(feats, anchors, num_classes):
    """Convert final layer features to bounding box parameters.

    Parameters
    ----------
    feats : tensor
        Final convolutional layer features.
    anchors : array-like
        Anchor box widths and heights.
    num_classes : int
        Number of target classes.

    Returns
    -------
    box_xy : tensor
        x, y box predictions adjusted by spatial location in conv layer.
    box_wh : tensor
        w, h box predictions adjusted by anchors and conv spatial resolution.
    box_conf : tensor
        Probability estimate for whether each box contains any object.
    box_class_pred : tensor
        Probability distribution estimate for each box over class labels.
    """
    num_anchors = len(anchors)
    # Reshape to batch, height, width, num_anchors, box_params.
    anchors_tensor = K.reshape(K.variable(anchors), [1, 1, 1, num_anchors, 2])

    # Dynamic implementation of conv dims for fully convolutional model.
    conv_dims = K.shape(feats)[1:3]  # assuming channels last
    # In YOLO the height index is the inner most iteration.
    conv_height_index = K.arange(0, stop=conv_dims[0])
    conv_width_index = K.arange(0, stop=conv_dims[1])
    conv_height_index = K.tile(conv_height_index, [conv_dims[1]])

    conv_width_index = K.tile(K.expand_dims(conv_width_index, 0), [conv_dims[0], 1])
    conv_width_index = K.flatten(K.transpose(conv_width_index))
    conv_index = K.transpose(K.stack([conv_height_index, conv_width_index]))
    conv_index = K.reshape(conv_index, [1, conv_dims[0], conv_dims[1], 1, 2])
    conv_index = K.cast(conv_index, K.dtype(feats))

    feats = K.reshape(feats, [-1, conv_dims[0], conv_dims[1], num_anchors, num_classes + 5])
    conv_dims = K.cast(K.reshape(conv_dims, [1, 1, 1, 1, 2]), K.dtype(feats))

    box_xy = K.sigmoid(feats[..., :2])
    box_wh = K.exp(feats[..., 2:4])
    box_confidence = K.sigmoid(feats[..., 4:5])
    box_class_probs = K.softmax(feats[..., 5:])

    # Adjust predictions to each spatial grid point and anchor size.
    # Note: YOLO iterates over height index before width index.
    box_xy = (box_xy + conv_index) / conv_dims
    box_wh = box_wh * anchors_tensor / conv_dims

    return box_xy, box_wh, box_confidence, box_class_probs