The following code examples, extracted from open-source Python projects, illustrate how to use keras.backend.exp().
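Before the project snippets, here is a minimal, self-contained sketch of the most common pattern in the examples below: combining K.exp with K.max and K.sum to build a numerically stable softmax. The helper name stable_softmax and the test values are our own and do not come from any of the listed projects.

import numpy as np
from keras import backend as K

def stable_softmax(x, axis=-1):
    # subtract the per-row maximum before exponentiating (the "max trick"),
    # so K.exp never overflows even for very large logits
    e = K.exp(x - K.max(x, axis=axis, keepdims=True))
    return e / K.sum(e, axis=axis, keepdims=True)

# quick check: the second row would overflow a naive exp-based softmax
logits = K.constant(np.array([[1.0, 2.0, 3.0], [1000.0, 1001.0, 1002.0]]))
print(K.eval(stable_softmax(logits)))
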
def elementwise_softmax_3d(matrix):
    """ Computes element-wise softmax for 3D arrays (volumes), that is, for a matrix with shape
    (num_samples, dim1, dim2, dim3, num_classes)

    Parameters
    ----------
    matrix : keras.placeholder
        Placeholder for the 3D array whose softmax distribution we want to compute

    Returns
    -------
    keras.placeholder
        Placeholder for a 3D array with the softmax distribution for all classes with shape
        (num_samples, dim1, dim2, dim3, num_classes)
    """
    expon = lambda x: K.exp(x)
    expon_matrix = expon(matrix)
    softmax_matrix = expon_matrix / K.sum(expon_matrix, axis=4, keepdims=True)
    return softmax_matrix

def call(self, x, mask=None):
    eij = dot_product(x, self.W)

    if self.bias:
        eij += self.b

    eij = K.tanh(eij)

    a = K.exp(eij)

    # apply mask after the exp. will be re-normalized next
    if mask is not None:
        # Cast the mask to floatX to avoid float64 upcasting in theano
        a *= K.cast(mask, K.floatx())

    # in some cases especially in the early stages of training the sum may be almost zero
    # and this results in NaN's. A workaround is to add a very small positive number epsilon to the sum.
    # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
    a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

    a = K.expand_dims(a)
    weighted_input = x * a
    return K.sum(weighted_input, axis=1)

def call(self, x, mask=None):
    uit = dot_product(x, self.W)

    if self.bias:
        uit += self.b

    uit = K.tanh(uit)
    ait = K.dot(uit, self.u)

    a = K.exp(ait)

    # apply mask after the exp. will be re-normalized next
    if mask is not None:
        # Cast the mask to floatX to avoid float64 upcasting in theano
        a *= K.cast(mask, K.floatx())

    # in some cases especially in the early stages of training the sum may be almost zero
    # and this results in NaN's. A workaround is to add a very small positive number epsilon to the sum.
    # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
    a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

    a = K.expand_dims(a)
    weighted_input = x * a
    return K.sum(weighted_input, axis=1)

def masked_softmax(tensor, mask, expand=2, axis=1):
    """Masked soft-max using Lambda and merge-multiplication.

    Args:
        tensor: tensor containing scores
        mask: mask for tensor where 1 - means values at this position and 0 - means void, padded, etc..
        expand: axis along which to repeat mask
        axis: axis along which to compute soft-max

    Returns:
        masked soft-max values
    """

    mask = tf.expand_dims(mask, axis=expand)
    exponentiate = Lambda(lambda x: K.exp(x - K.max(x, axis=axis, keepdims=True)))(tensor)
    masked = tf.multiply(exponentiate, mask)
    div = tf.expand_dims(tf.reduce_sum(masked, axis=axis), axis=axis)
    predicted = tf.divide(masked, div)
    return predicted

def vae_loss(self, x, x_decoded_mean):
    xent_loss = K.sum(K.binary_crossentropy(x_decoded_mean, x), axis=-1)
    kl_loss = - 0.5 * K.sum(1 + self.z_log_var - K.square(self.z_mean) - K.exp(self.z_log_var), axis=-1)
    return xent_loss + kl_loss

# def weighted_vae_loss(self, feature_weights):
#     def loss(y_true, y_pred):
#         try:
#             x = K.binary_crossentropy(y_pred, y_true)
#             y = tf.Variable(feature_weights.astype('float32'))
#             # y2 = y_true / K.sum(y_true)
#             # import pdb;pdb.set_trace()
#             xent_loss = K.dot(x, y)
#             kl_loss = - 0.5 * K.sum(1 + self.z_log_var - K.square(self.z_mean) - K.exp(self.z_log_var), axis=-1)
#         except Exception as e:
#             print e
#             import pdb;pdb.set_trace()
#         return xent_loss + kl_loss
#     return loss

def call(self, x, mask=None):
    import theano.tensor as T
    newx = T.sort(x)
    #response = K.reverse(newx, axes=1)
    #response = K.sum(x > 0.5, axis=1) / self.k
    return newx
    #response = K.reshape(newx, [-1, 1])
    #return K.concatenate([1-response, response], axis=self.label)
    #response = K.reshape(x[:, self.axis], (-1, 1))
    #return K.concatenate([1-response, response], axis=self.axis)
    #e = K.exp(x - K.max(x, axis=self.axis, keepdims=True))
    #s = K.sum(e, axis=self.axis, keepdims=True)
    #return e / s

def call(self, x, mask=None):
    newx = K.sort(x)
    #response = K.reverse(newx, axes=1)
    #response = K.sum(x > 0.5, axis=1) / self.k
    return K.concatenate([newx[:, :self.softmink], newx[:, newx.shape[1]-self.softmaxk:]], axis=-1)
    #response = K.reshape(newx, [-1, 1])
    #return K.concatenate([1-response, response], axis=self.label)
    #response = K.reshape(x[:, self.axis], (-1, 1))
    #return K.concatenate([1-response, response], axis=self.axis)
    #e = K.exp(x - K.max(x, axis=self.axis, keepdims=True))
    #s = K.sum(e, axis=self.axis, keepdims=True)
    #return e / s

def _buildEncoder(self, x, latent_rep_size, max_length, epsilon_std=0.01):
    h = Convolution1D(9, 9, activation='relu', name='conv_1')(x)
    h = Convolution1D(9, 9, activation='relu', name='conv_2')(h)
    h = Convolution1D(10, 11, activation='relu', name='conv_3')(h)
    h = Flatten(name='flatten_1')(h)
    h = Dense(435, activation='relu', name='dense_1')(h)

    def sampling(args):
        z_mean_, z_log_var_ = args
        batch_size = K.shape(z_mean_)[0]
        epsilon = K.random_normal(shape=(batch_size, latent_rep_size), mean=0., std=epsilon_std)
        return z_mean_ + K.exp(z_log_var_ / 2) * epsilon

    z_mean = Dense(latent_rep_size, name='z_mean', activation='linear')(h)
    z_log_var = Dense(latent_rep_size, name='z_log_var', activation='linear')(h)

    def vae_loss(x, x_decoded_mean):
        x = K.flatten(x)
        x_decoded_mean = K.flatten(x_decoded_mean)
        xent_loss = max_length * objectives.binary_crossentropy(x, x_decoded_mean)
        kl_loss = - 0.5 * K.mean(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
        return xent_loss + kl_loss

    return (vae_loss, Lambda(sampling, output_shape=(latent_rep_size,), name='lambda')([z_mean, z_log_var]))

def _process_input(self, x):
    """Apply logistic and softmax activations to input tensor
    """
    logistic_activate = lambda x: 1.0 / (1.0 + K.exp(-x))

    (batch, w, h, channels) = x.get_shape()
    x_temp = K.permute_dimensions(x, (3, 0, 1, 2))
    x_t = []
    for i in range(self.num):
        k = self._entry_index(i, 0)
        x_t.extend([
            logistic_activate(K.gather(x_temp, (k, k + 1))),  # 0
            K.gather(x_temp, (k + 2, k + 3))])
        if self.background:
            x_t.append(K.gather(x_temp, (k + 4,)))
        else:
            x_t.append(logistic_activate(K.gather(x_temp, (k + 4,))))

        x_t.append(
            softmax(
                K.gather(x_temp, tuple(range(k + 5, k + self.coords + self.classes + 1))),
                axis=0))
    x_t = K.concatenate(x_t, axis=0)
    return K.permute_dimensions(x_t, (1, 2, 3, 0))

def get_updates(self, params, constraints, loss):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr *= (1. / (1. + self.decay * self.iterations))

    vs = [K.zeros(K.get_variable_shape(p)) for p in params]
    self.weights = [self.iterations] + vs

    for p, g, v in zip(params, grads, vs):
        v_t = v + K.square(g)
        p_t = p - self.lr * g / (v_t + self.xi_2 * K.exp(-self.xi_1 * v_t))
        self.updates.append(K.update(v, v_t))

        new_p = p_t
        # apply constraints
        if p in constraints:
            c = constraints[p]
            new_p = c(new_p)
        self.updates.append(K.update(p, new_p))
    return self.updates

def call(self, inputs):
    stim = inputs[0]
    center = inputs[1]
    centers_x = self.XX[None, :, :, None] - center[:, 0, None, None, None] - self.centers[0][None, None, None, :]
    centers_y = self.YY[None, :, :, None] - center[:, 1, None, None, None] - self.centers[1][None, None, None, :]
    senv = self.stds[None, None, None, :]
    gauss = self.gauss_scale * (K.square(self.dx) / (2 * np.pi * K.square(senv) + K.epsilon())) * K.exp(-(K.square(centers_x) + K.square(centers_y)) / (2.0 * K.square(senv)))
    # gauss = (1 / K.sqrt(2 * np.pi * K.square(senv) + K.epsilon())) * K.exp(-(K.square(centers_x) + K.square(centers_y)) / (2.0 * K.square(senv)))
    # gauss /= K.max(gauss, axis=(1, 2), keepdims=True)
    gauss = K.reshape(gauss, self.kernel_shape)

    if K.backend() == 'theano':
        output = K.sum(stim[..., None] * K.pattern_broadcast(gauss, self.kernel_broadcast), axis=self.filter_axes, keepdims=False)
    else:
        output = K.sum(stim[..., None] * gauss, axis=self.filter_axes, keepdims=False)

    return output

def call(self, x, mask=None):
    # computes a probability distribution over the timesteps
    # uses 'max trick' for numerical stability
    # reshape is done to avoid issue with Tensorflow
    # and 1-dimensional weights
    logits = K.dot(x, self.W)
    x_shape = K.shape(x)
    logits = K.reshape(logits, (x_shape[0], x_shape[1]))
    ai = K.exp(logits - K.max(logits, axis=-1, keepdims=True))

    # masked timesteps have zero weight
    if mask is not None:
        mask = K.cast(mask, K.floatx())
        ai = ai * mask
    att_weights = ai / (K.sum(ai, axis=1, keepdims=True) + K.epsilon())
    weighted_input = x * K.expand_dims(att_weights)
    result = K.sum(weighted_input, axis=1)
    if self.return_attention:
        return [result, att_weights]
    return result

def SP_pixelwise_loss(y_true, y_pred):
    y_true_label = y_true[:, :class_number, :, :]
    y_true_SP_weight = y_true[:, class_number:, :, :]

    y_pred = K.clip(y_pred, -50., 50.)  # prevent overflow
    sample_num_per_class = K.sum(y_true_label, axis=[2, 3], keepdims=True)
    class_ind = K.cast(K.greater(sample_num_per_class, 0.), 'float32')
    avg_sample_num_per_class = K.sum(sample_num_per_class, axis=1, keepdims=True) / K.sum(class_ind, axis=1, keepdims=True)
    sample_weight_per_class = avg_sample_num_per_class / (sample_num_per_class + 0.1)

    exp_pred = K.exp(y_pred - K.max(y_pred, axis=1, keepdims=True))
    y_pred_softmax = exp_pred / K.sum(exp_pred, axis=1, keepdims=True)
    pixel_wise_loss = -K.log(y_pred_softmax) * y_true_label
    pixel_wise_loss = pixel_wise_loss * sample_weight_per_class
    weighter_pixel_wise_loss = K.sum(pixel_wise_loss, axis=1, keepdims=True)

    return K.mean(weighter_pixel_wise_loss * y_true_SP_weight)

# label distribution loss

def layout_loss_hard(y_true, y_pred):

    y_pred = K.clip(y_pred, -50., 50.)  # prevent overflow
    exp_pred = K.exp(y_pred - K.max(y_pred, axis=1, keepdims=True))
    y_pred_softmax = exp_pred / K.sum(exp_pred, axis=1, keepdims=True)

    max_pred_softmax = K.max(y_pred_softmax, axis=1, keepdims=True)
    bin_pred_softmax_a = y_pred_softmax / max_pred_softmax
    bin_pred_softmax = bin_pred_softmax_a ** 6.

    final_pred = K.mean(bin_pred_softmax, axis=[2, 3])
    final_pred = final_pred / (K.sum(final_pred, axis=1, keepdims=True) + K.epsilon())

    y_true_s = K.squeeze(y_true, axis=3)
    y_true_s = K.squeeze(y_true_s, axis=2)
    tier_wise_loss_v = -K.clip(K.log(final_pred), -500, 500) * y_true_s

    return K.mean(K.sum(tier_wise_loss_v, axis=1))

# compile

def _L(x):
    # initialize with zeros
    batch_size = x.shape[0]
    a = T.zeros((batch_size, num_actuators, num_actuators))

    # set diagonal elements
    batch_idx = T.extra_ops.repeat(T.arange(batch_size), num_actuators)
    diag_idx = T.tile(T.arange(num_actuators), batch_size)
    b = T.set_subtensor(a[batch_idx, diag_idx, diag_idx], T.flatten(T.exp(x[:, :num_actuators])))

    # set lower triangle
    cols = np.concatenate([np.array(range(i), dtype=np.uint) for i in xrange(num_actuators)])
    rows = np.concatenate([np.array([i] * i, dtype=np.uint) for i in xrange(num_actuators)])
    cols_idx = T.tile(T.as_tensor_variable(cols), batch_size)
    rows_idx = T.tile(T.as_tensor_variable(rows), batch_size)
    batch_idx = T.extra_ops.repeat(T.arange(batch_size), len(cols))
    c = T.set_subtensor(b[batch_idx, rows_idx, cols_idx], T.flatten(x[:, num_actuators:]))

    return c

def mix_gaussian_loss(x, mu, log_sig, w):
    '''
    Combine the mixture of gaussian distribution and the loss into a single function
    so that we can do the log sum exp trick for numerical stability...
    '''
    if K.backend() == "tensorflow":
        x.set_shape([None, 1])
    gauss = log_norm_pdf(K.repeat_elements(x=x, rep=mu.shape[1], axis=1), mu, log_sig)
    # TODO: get rid of clipping.
    gauss = K.clip(gauss, -40, 40)
    max_gauss = K.maximum((0.), K.max(gauss))
    # log sum exp trick...
    gauss = gauss - max_gauss
    out = K.sum(w * K.exp(gauss), axis=1)
    loss = K.mean(-K.log(out) + max_gauss)
    return loss

def free_energy_gap(self, x_train, x_test):
    """
    Computes the free energy gap between train and test set, F(x_test) - F(x_train).

    In order to avoid overfitting, we cannot directly monitor if the probability of held out data
    is starting to decrease, due to the partition function.

    We can however compute the ratio p(x_train)/p(x_test), because here the partition functions
    cancel out. This ratio should be close to 1; if it is > 1, the model may be overfitting.
    The ratio can be computed as, r = p(x_train)/p(x_test) = exp(-F(x_train) + F(x_test)).

    Alternatively, we compute the free energy gap, gap = F(x_test) - F(x_train), where F(x)
    indicates the mean free energy of test data and a representative subset of training data
    respectively. The gap should be around 0 normally, but when it starts to grow, the model may
    be overfitting. However, even when the gap is growing, the probability of the training data
    may be growing even faster, so the probability of the test data may still be improving.

    See: Hinton, "A Practical Guide to Training Restricted Boltzmann Machines",
    UTML TR 2010-003, 2010, section 6.
    """
    return T.mean(self.free_energy(x_train)) - T.mean(self.free_energy(x_test))

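As a quick, hedged illustration of the relation stated in the docstring above (plain NumPy, with made-up mean free energies): since the partition functions cancel, the probability ratio can be recovered from the gap as r = exp(gap).

import numpy as np

# hypothetical mean free energies of a representative train subset and the test set
f_train = -12.3
f_test = -11.8

gap = f_test - f_train   # F(x_test) - F(x_train), what free_energy_gap() returns
ratio = np.exp(gap)      # r = p(x_train) / p(x_test); values well above 1 hint at overfitting
print(gap, ratio)        # 0.5  ~1.65
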
def _get_attention_and_kappa(self, attended, params, kappa_tm1):
    """
    # Args
        params: the params of this distribution
        attended: the attended sequence (samples, timesteps, features)

    # Returns
        attention tensor (samples, features)
    """
    att_idx = K.constant(np.arange(self.attended_shape[1])[None, :, None])
    alpha, beta, kappa_diff = self.distribution.split_param_types(params)
    kappa = kappa_diff + kappa_tm1

    kappa_ = K.expand_dims(kappa, 1)
    beta_ = K.expand_dims(beta, 1)
    alpha_ = K.expand_dims(alpha, 1)

    attention_w = K.sum(
        alpha_ * K.exp(- beta_ * K.square(kappa_ - att_idx)),
        axis=-1,
        # keepdims=True
    )
    attention_w = K.expand_dims(attention_w, -1)  # TODO remove and keepdims
    attention = K.sum(attention_w * attended, axis=1)

    return attention, kappa

def sampling(args):
    epsilon_std = 1.0

    if len(args) == 2:
        z_mean, z_log_var = args
        epsilon = K.random_normal(shape=K.shape(z_mean), mean=0., stddev=epsilon_std)
        # return z_mean + K.exp(z_log_var / 2) * epsilon
    else:
        z_mean = args[0]
        epsilon = K.random_normal(shape=K.shape(z_mean), mean=0., stddev=epsilon_std)

    return z_mean + K.exp(1.0 / 2) * epsilon

def loss_logcosh(y_true, x):
    """
    This loss implements a logcosh loss with a dummy for the uncertainty.
    It approximates a mean-squared loss for small differences and a linear one for large
    differences, therefore it is conceptually similar to the Huber loss.
    This loss here is scaled such that it starts becoming linear around 4-5 sigma.
    """
    scalefactor_a = 30
    scalefactor_b = 0.4

    from tensorflow import where, greater, abs, zeros_like, exp

    x_pred = x[:, 1:]
    x_sig = x[:, :1]

    def cosh(y):
        return (K.exp(y) + K.exp(-y)) / 2

    return K.mean(0.5 * K.square(x_sig)) + K.mean(scalefactor_a * K.log(cosh(scalefactor_b * (x_pred - y_true))), axis=-1)

def loss_logcosh_noUnc(y_true, x_pred):
    """
    This loss implements a logcosh loss without a dummy for the uncertainty.
    It approximates a mean-squared loss for small differences and a linear one for large
    differences, therefore it is conceptually similar to the Huber loss.
    This loss here is scaled such that it starts becoming linear around 4-5 sigma.
    """
    scalefactor_a = 1.
    scalefactor_b = 3.

    from tensorflow import where, greater, abs, zeros_like, exp

    dxrel = (x_pred - y_true) / (y_true + 0.0001)

    def cosh(x):
        return (K.exp(x) + K.exp(-x)) / 2

    return scalefactor_a * K.mean(K.log(cosh(scalefactor_b * dxrel)), axis=-1)

def call(self, x, mask=None):
    # 1. transform, (None, steps, idim)*(idim, outdim) -> (None, steps, outdim)
    u = self.attn_activation(K.dot(x, self.W_s) + self.B_s)

    # 2. attention sum: {(None, steps, outdim) * (outdim), axis=2} -> (None, steps)
    att = K.sum(u * self.Attention_vec, axis=2)

    # 3. softmax, (None, steps)
    att = K.exp(att)
    att_sum = K.sum(att, axis=1)
    att_sum = att_sum.dimshuffle(0, 'x')
    #att_sum = K.expand_dims(att_sum, 1)
    att = att / att_sum

    # 4. weighted sum
    att = att.dimshuffle(0, 1, 'x')
    #att = K.expand_dims(att, 2)
    va = att * x
    v = K.sum(va, axis=1)

    return v

def gen_cosine_amp(amp=100, period=1000, x0=0, xn=50000, step=1, k=0.0001):
    """Generates an absolute cosine time series with the amplitude exponentially decreasing

    Arguments:
        amp: amplitude of the cosine function
        period: period of the cosine function
        x0: initial x of the time series
        xn: final x of the time series
        step: step of the time series discretization
        k: exponential rate
    """
    cos = np.zeros(((xn - x0) * step, 1, 1))
    for i in range(len(cos)):
        idx = x0 + i * step
        cos[i, 0, 0] = amp * np.cos(2 * np.pi * idx / period)
        cos[i, 0, 0] = cos[i, 0, 0] * np.exp(-k * idx)
    return cos

def on_train_begin(self, logs={}):
    N = self.mi_calculator.miN
    dims = self.mi_calculator.data.shape[1]
    Kdists = K.placeholder(ndim=2)
    Klogvar = K.placeholder(ndim=0)

    lossfunc = K.function([Kdists, Klogvar,], [kde_entropy_from_dists_loo(Kdists, N, dims, K.exp(Klogvar))])
    jacfunc = K.function([Kdists, Klogvar,], K.gradients(kde_entropy_from_dists_loo(Kdists, N, dims, K.exp(Klogvar)), Klogvar))

    def obj(logvar, dists):
        return lossfunc([dists, logvar.flat[0]])[0]

    def jac(logvar, dists):
        return np.atleast_2d(np.array(jacfunc([dists, logvar.flat[0]])))[0]

    self.obj = obj
    self.jac = jac

def softmax(x, axis, mask=None):
    if mask is None:
        mask = K.constant(True)
    mask = K.cast(mask, K.floatx())
    if K.ndim(x) == K.ndim(mask) + 1:
        mask = K.expand_dims(mask)

    m = K.max(x, axis=axis, keepdims=True)
    e = K.exp(x - m) * mask
    s = K.sum(e, axis=axis, keepdims=True)
    s += K.cast(K.cast(s < K.epsilon(), K.floatx()) * K.epsilon(), K.floatx())
    return e / s

def call(self, x, mask=None):
    #print '\nhi in attention'
    #print x._keras_shape
    uit = K.dot(x, self.W)
    #print '\nuit'
    #print uit._keras_shape
    uit += self.bw
    uit = K.tanh(uit)

    ait = K.dot(uit, self.uw)
    a = K.exp(ait)

    # apply mask after the exp. will be re-normalized next
    #print mask
    if mask is not None:
        a *= K.cast(mask, K.floatx())

    a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
    a = K.expand_dims(a)
    #print "in att ", K.shape(a)

    weighted_input = x * a
    #print weighted_input
    ssi = K.sum(weighted_input, axis=1)
    #print "type ", type(ssi)
    #print "in att si ", theano.tensor.shape(ssi)
    #print "hello"
    return [a, ssi]

def sampling(args):
    z_mean, z_log_var = args
    epsilon = K.random_normal_variable(shape=(batch_size, m), mean=0., scale=epsilon_std)
    return z_mean + K.exp(z_log_var / 2) * epsilon

def vae_loss(x, x_hat):
    kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
    xent_loss = n * objectives.binary_crossentropy(x, x_hat)
    mse_loss = n * objectives.mse(x, x_hat)
    if use_loss == 'xent':
        return xent_loss + kl_loss
    elif use_loss == 'mse':
        return mse_loss + kl_loss
    else:
        raise Exception('Unknown loss!')

def sampling(args):
    z_mean, z_log_var = args
    epsilon = K.random_normal(shape=(batch_size, latent_dim), mean=0., std=epsilon_std)
    return z_mean + K.exp(z_log_var / 2) * epsilon

# note that "output_shape" isn't necessary with the TensorFlow backend

def vae_loss(x, x_decoded_mean):
    xent_loss = original_dim * objectives.binary_crossentropy(x, x_decoded_mean)
    kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
    return xent_loss + kl_loss

def sampling(args):
    z_mean, z_log_var = args
    # sample epsilon from N(0, 1)
    epsilon = K.random_normal(shape=(batch_size, latent_dim), mean=0., std=epsilon_std)
    return z_mean + K.exp(z_log_var) * epsilon

# note that "output_shape" isn't necessary with the TensorFlow backend
# so you could write `Lambda(sampling)([z_mean, z_log_var])`

def vae_loss(x, x_decoded_mean):
    # NOTE: binary_crossentropy expects a batch_size by dim
    # for x and x_decoded_mean, so we MUST flatten these!
    x = K.flatten(x)
    x_decoded_mean = K.flatten(x_decoded_mean)
    xent_loss = img_rows * img_cols * objectives.binary_crossentropy(x, x_decoded_mean)
    kl_loss = - 0.5 * K.mean(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
    return xent_loss + kl_loss

# input_shape: (100, 1, 28, 28)
# output_shape: (100, 1, 28, 28)

def _softmax(x):
    """ Softmax that works on ND inputs. """
    e = K.exp(x - K.max(x, axis=-1, keepdims=True))
    s = K.sum(e, axis=-1, keepdims=True)
    return e / s

def sampling(self, args):
    z_mean, z_log_var = args
    epsilon = K.random_normal(shape=(K.shape(z_mean)[0], self.dim[1]), mean=0., stddev=self.epsilon_std)
    return z_mean + K.exp(z_log_var / 2) * epsilon

def bp_mll_loss(y_true, y_pred):

    # get true and false labels
    y_i = K.equal(y_true, K.ones_like(y_true))
    y_i_bar = K.not_equal(y_true, K.ones_like(y_true))

    # cast to float as keras backend has no logical and
    y_i = K.cast(y_i, dtype='float32')
    y_i_bar = K.cast(y_i_bar, dtype='float32')

    # get indices to check
    truth_matrix = pairwise_and(y_i, y_i_bar)

    # calculate all exp'd differences
    sub_matrix = pairwise_sub(y_pred, y_pred)
    exp_matrix = K.exp(-sub_matrix)

    # check which differences to consider and sum them
    sparse_matrix = exp_matrix * truth_matrix
    sums = K.sum(sparse_matrix, axis=[1, 2])

    # get normalizing terms and apply them
    y_i_sizes = K.sum(y_i, axis=1)
    y_i_bar_sizes = K.sum(y_i_bar, axis=1)
    normalizers = y_i_sizes * y_i_bar_sizes
    results = sums / normalizers

    # sum over samples
    return K.sum(results)

# compute pairwise differences between elements of the tensors a and b

def call(self, x, mask=None):
    e = K.exp(x - K.max(x, axis=self.axis, keepdims=True))
    s = K.sum(e, axis=self.axis, keepdims=True)
    return e / s

def call(self, x, mask=None):
    response = K.reshape(x[:, self.axis], (-1, 1))
    return K.concatenate([1-response, response], axis=self.axis)
    #e = K.exp(x - K.max(x, axis=self.axis, keepdims=True))
    #s = K.sum(e, axis=self.axis, keepdims=True)
    #return e / s

def call(self, x, mask=None):
    response = K.max(x, axis=-1, keepdims=True)  # K.reshape(x, (-1, 1))
    return K.concatenate([1-response, response], axis=self.axis)
    #e = K.exp(x - K.max(x, axis=self.axis, keepdims=True))
    #s = K.sum(e, axis=self.axis, keepdims=True)
    #return e / s