The following 50 code examples, extracted from open-source Python projects, illustrate how to use theano.tensor.addbroadcast().
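Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of what T.addbroadcast does: it marks the listed axes of a tensor as broadcastable, so an axis of length 1 can be broadcast against another tensor. The variable names are illustrative only.

import numpy as np
import theano
import theano.tensor as T

x = T.tensor3('x')  # shape (batch, 1, features) at runtime
y = T.tensor3('y')  # shape (batch, n, features) at runtime

# Mark axis 1 of x as broadcastable; Theano checks at runtime that it is length 1.
x_b = T.addbroadcast(x, 1)
f = theano.function([x, y], x_b + y)

out = f(np.ones((2, 1, 4), dtype=theano.config.floatX),
        np.ones((2, 3, 4), dtype=theano.config.floatX))
print(out.shape)  # (2, 3, 4)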
def get_output_for(self, inputs, **kwargs):
    inputs = autocrop(inputs, self.cropping)
    # modify broadcasting pattern.
    if self.broadcastable is not None:
        for n, broadcasting_dim in enumerate(self.broadcastable):
            for dim, broadcasting in enumerate(broadcasting_dim):
                if broadcasting:
                    inputs[n] = T.addbroadcast(inputs[n], dim)
    output = None
    for input in inputs:
        if output is not None:
            output = self.merge_function(output, input)
        else:
            output = input
    return output

# Definition of the network
def conv_pairwise_distance(feature_maps, codebook):
    """
    Calculates the pairwise distances between the feature maps (n_samples, filters, x, y)
    :param feature_maps:
    :param codebook:
    :return:
    """
    x_square = T.sum(feature_maps ** 2, axis=1)  # n_samples, filters, x, y
    x_square = x_square.reshape((x_square.shape[0], 1, x_square.shape[1], x_square.shape[2]))
    x_square = T.addbroadcast(x_square, 1)

    y_square = T.sum(codebook ** 2, axis=1)
    y_square = y_square.reshape((1, y_square.shape[0], y_square.shape[1], y_square.shape[2]))
    y_square = T.addbroadcast(y_square, 0, 2, 3)

    inner_product = T.nnet.conv2d(feature_maps, codebook)
    dist = x_square + y_square - 2 * inner_product
    dist = T.sqrt(T.maximum(dist, 0))
    return dist
def sequence_iteration(self, output, mask, use_dropout=0, dropout_value=0.5):
    dot_product = T.dot(output, self.t_w_out)
    net_o = T.add(dot_product, self.t_b_out)

    ex_net = T.exp(net_o)
    sum_net = T.sum(ex_net, axis=2, keepdims=True)
    softmax_o = ex_net / sum_net

    mask = T.addbroadcast(mask, 2)  # TODO: necessary?
    output = T.mul(mask, softmax_o) + T.mul((1. - mask), 1e-6)

    return output  # result


###### Linear Layer ########################################
def create_updates(self, grads, params, alpha, opt_alg, opt_params):
    # call super-class to generate SGD/ADAM updates
    grad_updates = Model.create_updates(self, grads, params, alpha, opt_alg, opt_params)

    # create updates for centering signal
    # load neural net outputs (probabilities have been precomputed)
    _, _, _, l_cv, c, v = self.network
    log_pxz, log_qz_given_x = self.log_pxz, self.log_qz_given_x
    cv = T.addbroadcast(lasagne.layers.get_output(l_cv), 1)

    # compute learning signals
    l = log_pxz - log_qz_given_x - cv
    l_avg, l_var = l.mean(), l.var()
    c_new = 0.8*c + 0.2*l_avg
    v_new = 0.8*v + 0.2*l_var

    # compute update for centering signal
    cv_updates = {c: c_new, v: v_new}

    return OrderedDict(grad_updates.items() + cv_updates.items())
def create_updates(self, grads, params, alpha, opt_alg, opt_params):
    # call super-class to generate SGD/ADAM updates
    grad_updates = Model.create_updates(self, grads, params, alpha, opt_alg, opt_params)

    # create updates for centering signal
    # load neural net outputs (probabilities have been precomputed)
    l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \
        l_qa_mu, l_qa_logsigma, l_qz_mu, l_qz_logsigma, l_qa, l_qz, l_cv, c, v = self.network
    log_pxz, log_px_given_z, log_pz = self.log_pxz, self.log_px_given_z, self.log_pz
    log_qz_given_x = self.log_qz_given_x
    cv = T.addbroadcast(lasagne.layers.get_output(l_cv), 1)

    # compute learning signals
    l = log_px_given_z + log_pz - log_qz_given_x - cv
    l_avg, l_var = l.mean(), l.var()
    c_new = 0.8*c + 0.2*l_avg
    v_new = 0.8*v + 0.2*l_var

    # compute update for centering signal
    cv_updates = {c: c_new, v: v_new}

    return OrderedDict(grad_updates.items() + cv_updates.items())
def __call__(self, input, input_lm=None, return_list=False):
    # activation function
    if input_lm == None:
        self.h_l, _ = theano.scan(self.step2,
                                  sequences=input.dimshuffle(1, 0, 2),
                                  outputs_info=theano.shared(value=np.zeros((self.batch_size, self.n_hidden),
                                                                            dtype=theano.config.floatX),
                                                             borrow=True))
    else:
        self.h_l, _ = theano.scan(self.step,
                                  sequences=[input.dimshuffle(1, 0, 2),
                                             T.addbroadcast(input_lm.dimshuffle(1, 0, 'x'), -1)],
                                  outputs_info=theano.shared(value=np.zeros((self.batch_size, self.n_hidden),
                                                                            dtype=theano.config.floatX),
                                                             borrow=True))
    self.h_l = self.h_l.dimshuffle(1, 0, 2)
    if return_list == True:
        return self.h_l
    return self.h_l[:, -1, :]
def get_padded_shuffled_mask(self, train, X, pad=0):
    mask = self.get_input_mask(train)
    if mask is None:
        mask = T.ones_like(X.sum(axis=-1))  # is there a better way to do this without a sum?

    # mask is (nb_samples, time)
    mask = T.shape_padright(mask)  # (nb_samples, time, 1)
    mask = T.addbroadcast(mask, -1)  # the new dimension (the '1') is made broadcastable
    # see http://deeplearning.net/software/theano/library/tensor/basic.html#broadcasting-in-theano-vs-numpy
    mask = mask.dimshuffle(1, 0, 2)  # (time, nb_samples, 1)

    if pad > 0:
        # left-pad in time with 0
        padding = alloc_zeros_matrix(pad, mask.shape[1], 1)
        mask = T.concatenate([padding, mask], axis=0)
    return mask.astype('int8')
def __init__(self, incomings, parameters, layer_num,
             W=lasagne.init.Normal(0.01),
             num_features=None,
             **kwargs):
    super(DCNNLayer, self).__init__(incomings, **kwargs)

    self.parameters = parameters

    if num_features is None:
        self.num_features = self.parameters.num_features
    else:
        self.num_features = num_features

    self.W = T.addbroadcast(
        self.add_param(W, (1, parameters.num_hops + 1, self.num_features), name='DCNN_W_%d' % layer_num), 0)

    self.nonlinearity = params.nonlinearity_map[self.parameters.dcnn_nonlinearity]
def __init__(self, incomings, parameters, layer_num,
             W=lasagne.init.Normal(0.01),
             num_features=None,
             **kwargs):
    super(AggregatedDCNNLayer, self).__init__(incomings, **kwargs)

    self.parameters = parameters

    if num_features is None:
        self.num_features = self.parameters.num_features
    else:
        self.num_features = num_features

    self.W = T.addbroadcast(
        self.add_param(W, (self.parameters.num_hops + 1, 1, self.num_features),
                       name='AGGREGATE_DCNN_W_%d' % layer_num), 1)

    self.nonlinearity = params.nonlinearity_map[self.parameters.dcnn_nonlinearity]
def __call__(self, input, input_lm=None, return_list=False):
    # activation function
    if input_lm == None:
        self.h_l, _ = theano.scan(self.step2,
                                  sequences=input.dimshuffle(1, 0, 2),
                                  outputs_info=[theano.shared(value=np.zeros((self.batch_size, self.n_hidden),
                                                                             dtype=theano.config.floatX),
                                                              borrow=True),
                                                theano.shared(value=np.zeros((self.batch_size, self.n_hidden),
                                                                             dtype=theano.config.floatX),
                                                              borrow=True)])
    else:
        self.h_l, _ = theano.scan(self.step,
                                  sequences=[input.dimshuffle(1, 0, 2),
                                             T.addbroadcast(input_lm.dimshuffle(1, 0, 'x'), -1)],
                                  outputs_info=[theano.shared(value=np.zeros((self.batch_size, self.n_hidden),
                                                                             dtype=theano.config.floatX),
                                                              borrow=True),
                                                theano.shared(value=np.zeros((self.batch_size, self.n_hidden),
                                                                             dtype=theano.config.floatX),
                                                              borrow=True)])
    self.h_l = self.h_l[0].dimshuffle(1, 0, 2)
    if return_list == True:
        return self.h_l
    return self.h_l[:, -1, :]
def call(self, x, mask=None):
    ax = 1 if self.is_q else 2

    def _step(v1, v2):
        cosine_score = T.tensordot(v1 / T.sqrt(T.sum(T.sqr(v1), axis=2, keepdims=True) + 1e-6),
                                   (v2) / T.sqrt(T.sum(T.sqr(v2), axis=ax, keepdims=True) + 1e-6),
                                   [[2], [ax]])
        return cosine_score

    l_s = x[0]  # n_b x n_s x n_w_s x D
    l_a = x[1]  # n_b x 4 x n_w_qa x D
    # w_qa = self.layers[2].get_output(train)  # n_b x 4 x n_w_qa x 1
    # w_qa = T.addbroadcast(w_qa, len(self.layers[2].output_shape) - 1)

    # get cosine similarity for ALL word pairs
    output, _ = theano.scan(_step, sequences=[l_s, l_a], outputs_info=None)
    if not self.is_q:
        output = output.dimshuffle(0, 1, 3, 2, 4)  # n_b x n_s x 4 x n_w_s x n_w_qa
    return output
def visualize(self, p0, p=None):
    if p is None:
        p = self.get_prob(*self.get_params())
    p0 = T.addbroadcast(p0, 0)
    return p - p0
def __init__(self, incoming, n_codewords=24, V=lasagne.init.Normal(0.1), gamma=lasagne.init.Constant(0.1),
             eps=0.00001, input_var=None, initializers=None, spatial_level=1, **kwargs):
    """
    Creates a BoF layer

    :param incoming:
    :param n_codewords: number of codewords
    :param V: initializer used for the codebook
    :param gamma: initializer used for the scaling factors
    :param eps: epsilon used to ensure numerical stability
    :param input_var: input_var of the model (used to compile a function that extracts the features fed to the layer)
    :param initializers:
    :param spatial_level: 0 (no spatial segmentation), 1 (first spatial level)
    :param pooling_type: either 'mean' or 'max'
    :param kwargs:
    """
    super(CBoF_Layer, self).__init__(incoming, **kwargs)

    self.n_codewords = n_codewords
    self.spatial_level = spatial_level
    n_filters = self.input_shape[1]
    self.eps = eps

    # Create parameters
    self.V = self.add_param(V, (n_codewords, n_filters, 1, 1), name='V')
    self.gamma = self.add_param(gamma, (1, n_codewords, 1, 1), name='gamma')

    # Make gammas broadcastable
    self.gamma = T.addbroadcast(self.gamma, 0, 2, 3)

    # Compile function used for feature extraction
    if input_var is not None:
        self.features_fn = theano.function([input_var],
                                           lasagne.layers.get_output(incoming, deterministic=True))

    if initializers is not None:
        initializers.append(self.initialize_layer)
def squeeze(self, x, axis):
    '''Remove a 1-dimension from the tensor at index "axis".
    '''
    x = T.addbroadcast(x, axis)
    return T.squeeze(x)
def sequence_iteration(self, output, mask, use_dropout=0, dropout_value=0.5):
    dot_product = T.dot(output, self.t_w_out)

    linear_o = T.add(dot_product, self.t_b_out)

    mask = T.addbroadcast(mask, 2)  # TODO: necessary?
    output = T.mul(mask, linear_o) + T.mul((1. - mask), 1e-6)

    return output  # result


### TEST FUNCTIONS
# TODO: move to a separate file with test functions
def t_forward_step(self, mask, cur_w_in_sig, pre_out_sig, w_hidden_hidden, b_act):
    pre_w_sig = T.dot(pre_out_sig, w_hidden_hidden)
    inner_act = self.activation
    out_sig = inner_act(T.add(cur_w_in_sig, pre_w_sig, b_act))

    mask = T.addbroadcast(mask, 1)
    out_sig_m = mask * out_sig + (1. - mask) * pre_out_sig
    return [out_sig_m]
def t_forward_step(self, mask, cur_w_in_sig, pre_out_sig, pre_cell_sig, w_ig_c, w_fg_c, w_og_c, w_ifco, b_ifco,
                   t_n_out):
    ifco = T.add(T.dot(pre_out_sig, w_ifco), b_ifco)

    inner_act = self.activation
    gate_act = self.sigmoid()

    # Input Gate
    ig_t1 = gate_act(T.add(ifco[:, 0:t_n_out], T.mul(pre_cell_sig, w_ig_c), cur_w_in_sig[:, 0:t_n_out]))
    # Forget Gate
    fg_t1 = gate_act(T.add(ifco[:, 1 * t_n_out:2 * t_n_out], T.mul(pre_cell_sig, w_fg_c),
                           cur_w_in_sig[:, 1 * t_n_out:2 * t_n_out]))
    # Cell State
    cs_t1 = T.add(T.mul(fg_t1, pre_cell_sig), T.mul(ig_t1, inner_act(
        T.add(ifco[:, 2 * t_n_out:3 * t_n_out], cur_w_in_sig[:, 2 * t_n_out:3 * t_n_out]))))

    mask = T.addbroadcast(mask, 1)
    cs_t1 = mask * cs_t1 + (1. - mask) * pre_cell_sig
    # functionality: cs_t1 = T.switch(mask, cs_t1, pre_cell_sig)

    # Output Gate
    og_t1 = gate_act(
        T.add(ifco[:, 3 * t_n_out:4 * t_n_out], T.mul(cs_t1, w_og_c), cur_w_in_sig[:, 3 * t_n_out:4 * t_n_out]))
    # Output LSTM
    out_sig = T.mul(og_t1, inner_act(cs_t1))

    out_sig = mask * out_sig + (1. - mask) * pre_out_sig

    return [out_sig, cs_t1]
def t_forward_step(self, mask, cur_w_in_sig, pre_out_sig, pre_cell_sig, w_ifco, b_ifco, t_n_out):
    ifco = T.add(T.dot(pre_out_sig, w_ifco), b_ifco)

    inner_act = self.activation
    gate_act = self.sigmoid()

    # Input Gate
    ig_t1 = gate_act(T.add(ifco[:, 0:t_n_out], cur_w_in_sig[:, 0:t_n_out]))
    # Forget Gate
    fg_t1 = gate_act(T.add(ifco[:, 1 * t_n_out:2 * t_n_out], cur_w_in_sig[:, 1 * t_n_out:2 * t_n_out]))
    # Cell State
    cs_t1 = T.add(T.mul(fg_t1, pre_cell_sig), T.mul(ig_t1, inner_act(
        T.add(ifco[:, 2 * t_n_out:3 * t_n_out], cur_w_in_sig[:, 2 * t_n_out:3 * t_n_out]))))

    mask = T.addbroadcast(mask, 1)
    cs_t1 = mask * cs_t1 + (1. - mask) * pre_cell_sig
    # functionality: cs_t1 = T.switch(mask, cs_t1, pre_cell_sig)

    # Output Gate
    og_t1 = gate_act(
        T.add(ifco[:, 3 * t_n_out:4 * t_n_out], cur_w_in_sig[:, 3 * t_n_out:4 * t_n_out]))
    # Output LSTM
    out_sig = T.mul(og_t1, inner_act(cs_t1))

    out_sig = mask * out_sig + (1. - mask) * pre_out_sig

    return [out_sig, cs_t1]
def t_forward_step(self, mask, cur_w_in_sig, pre_out_sig, w_hidden_hidden, b_act, ln_s1, ln_b1, ln_s2, ln_b2):
    pre_w_sig = T.dot(pre_out_sig, w_hidden_hidden)
    inner_act = self.activation

    pre_w_sig_ln = self.ln(pre_w_sig, ln_b1, ln_s1)
    cur_w_in_sig_ln = self.ln(cur_w_in_sig, ln_b2, ln_s2)

    out_sig = inner_act(T.add(cur_w_in_sig_ln, pre_w_sig_ln, b_act))

    mask = T.addbroadcast(mask, 1)
    out_sig_m = mask * out_sig + (1. - mask) * pre_out_sig
    return [out_sig_m]
def t_forward_step(self, mask, cur_w_in_sig, pre_out_sig, pre_cell_sig, w_ig_c, w_fg_c, w_og_c, w_ifco, b_ifco,
                   ln_b1, ln_s1, ln_b2, ln_s2, ln_b3, ln_s3, t_n_out):
    cur_w_in_sig_ln = self.ln(cur_w_in_sig, ln_b1, ln_s1)
    pre_w_out_sig = T.dot(pre_out_sig, w_ifco)
    pre_w_out_sig_ln = self.ln(pre_w_out_sig, ln_b2, ln_s2)
    preact = T.add(cur_w_in_sig_ln, pre_w_out_sig_ln, b_ifco)

    inner_act = self.activation  # T.nnet.hard_sigmoid T.tanh
    gate_act = self.sigmoid()  # T.nnet.hard_sigmoid

    # Input Gate
    ig_t1 = gate_act(T.add(preact[:, 0:t_n_out], T.mul(pre_cell_sig, w_ig_c)))
    # Forget Gate
    fg_t1 = gate_act(T.add(preact[:, 1 * t_n_out:2 * t_n_out], T.mul(pre_cell_sig, w_fg_c),))
    # Cell State
    cs_t1 = T.add(T.mul(fg_t1, pre_cell_sig), T.mul(ig_t1, inner_act(
        T.add(preact[:, 2 * t_n_out:3 * t_n_out]))))

    mask = T.addbroadcast(mask, 1)
    cs_t1 = mask * cs_t1 + (1. - mask) * pre_cell_sig
    # functionality: cs_t1 = T.switch(mask, cs_t1, pre_cell_sig)

    cs_t1_ln = self.ln(cs_t1, ln_b3, ln_s3)

    # Output Gate
    og_t1 = gate_act(
        T.add(preact[:, 3 * t_n_out:4 * t_n_out], T.mul(cs_t1_ln, w_og_c)))
    # Output LSTM
    out_sig = T.mul(og_t1, inner_act(cs_t1_ln))

    out_sig = mask * out_sig + (1. - mask) * pre_out_sig

    return [out_sig, cs_t1]
def t_forward_step(self, mask, cur_w_in_sig, pre_out_sig, pre_cell_sig, w_ifco, b_ifco,
                   ln_b1, ln_s1, ln_b2, ln_s2, ln_b3, ln_s3, t_n_out):
    cur_w_in_sig_ln = self.ln(cur_w_in_sig, ln_b1, ln_s1)
    pre_w_out_sig = T.dot(pre_out_sig, w_ifco)
    pre_w_out_sig_ln = self.ln(pre_w_out_sig, ln_b2, ln_s2)
    preact = T.add(cur_w_in_sig_ln, pre_w_out_sig_ln, b_ifco)

    inner_act = self.activation  # T.nnet.hard_sigmoid # T.tanh
    gate_act = self.sigmoid()  # T.nnet.hard_sigmoid # T.nnet.sigmoid

    # Input Gate
    ig_t1 = gate_act(preact[:, 0:t_n_out])
    # Forget Gate
    fg_t1 = gate_act(preact[:, 1 * t_n_out:2 * t_n_out])
    # Cell State
    cs_t1 = T.add(T.mul(fg_t1, pre_cell_sig), T.mul(ig_t1, inner_act(preact[:, 2 * t_n_out:3 * t_n_out])))

    mask = T.addbroadcast(mask, 1)
    cs_t1 = mask * cs_t1 + (1. - mask) * pre_cell_sig

    cs_t1_ln = self.ln(cs_t1, ln_b3, ln_s3)

    # Output Gate
    og_t1 = gate_act(preact[:, 3 * t_n_out:4 * t_n_out])
    # Output LSTM
    out_sig = T.mul(og_t1, inner_act(cs_t1_ln))

    out_sig = mask * out_sig + (1. - mask) * pre_out_sig

    return [out_sig, cs_t1]
def t_forward_step(self, mask, rzup_in_sig, h_pre, b_rzup, u_rz, u_up,
                   ln_b1, ln_s1, ln_b2, ln_s2, ln_b3, ln_s3, t_n_out):
    signal_act = self.activation
    gate_act = self.sigmoid()

    rzup_in_sig_ln = self.ln(rzup_in_sig, ln_b1, ln_s1)
    rzup_b_in_sig_ln = T.add(rzup_in_sig_ln, b_rzup)

    preact = T.dot(h_pre, u_rz)
    preact_ln = self.ln(preact, ln_b2, ln_s2)

    r = gate_act(T.add(rzup_b_in_sig_ln[:, 0:t_n_out], preact_ln[:, 0:t_n_out]))
    z = gate_act(T.add(rzup_b_in_sig_ln[:, t_n_out:2 * t_n_out], preact_ln[:, t_n_out:2 * t_n_out]))

    preactx = T.dot(h_pre, u_up)
    preactx_ln = self.ln(preactx, ln_b3, ln_s3)
    h_pre_r_ln = T.mul(preactx_ln, r)

    h_update = signal_act(T.add(rzup_b_in_sig_ln[:, 2 * t_n_out:3 * t_n_out], h_pre_r_ln))

    h_new = T.add((1. - z) * h_update, z * h_pre)

    mask = T.addbroadcast(mask, 1)
    out_sig = T.add(mask * h_new, (1. - mask) * h_pre)

    return out_sig
def create_gradients(self, loss, deterministic=False):
    # load networks
    l_p_mu, l_q_mu, _, l_cv, c, v = self.network

    # load params
    p_params = lasagne.layers.get_all_params(l_p_mu, trainable=True)
    q_params = lasagne.layers.get_all_params(l_q_mu, trainable=True)
    cv_params = lasagne.layers.get_all_params(l_cv, trainable=True)

    # load neural net outputs (probabilities have been precomputed)
    log_pxz, log_qz_given_x = self.log_pxz, self.log_qz_given_x
    cv = T.addbroadcast(lasagne.layers.get_output(l_cv), 1)

    # compute learning signals
    l = log_pxz - log_qz_given_x - cv
    l_avg, l_var = l.mean(), l.var()
    c_new = 0.8*c + 0.2*l_avg
    v_new = 0.8*v + 0.2*l_var
    l = (l - c_new) / T.maximum(1, T.sqrt(v_new))

    # compute grad wrt p
    p_grads = T.grad(-log_pxz.mean(), p_params)

    # compute grad wrt q
    q_target = T.mean(dg(l) * log_qz_given_x)
    q_grads = T.grad(-0.2*q_target, q_params)  # 5x slower rate for q

    # compute grad of cv net
    cv_target = T.mean(l**2)
    cv_grads = T.grad(cv_target, cv_params)

    # combine and clip gradients
    clip_grad = 1
    max_norm = 5
    grads = p_grads + q_grads + cv_grads
    mgrads = lasagne.updates.total_norm_constraint(grads, max_norm=max_norm)
    cgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]

    return cgrads
def get_layer(self, x_in, op=lambda x: x):
    b_ = T.addbroadcast(self._params['b'], 0)
    ret = op(T.dot(x_in, self._params['U']) + b_)
    return ret
def get_output_for(self, input, deterministic=False, **kwargs):
    if deterministic or self.fixed:
        # use stored mean and std
        mean = self.mean
        std = self.std
    else:
        # use this batch's mean and std
        mean = input.mean(self.axes, keepdims=True)
        # std = input.std(self.axes, keepdims=True)
        std = (input.var(self.axes, keepdims=True) + self.epsilon).sqrt()
        # and update the stored mean and std:
        # we create (memory-aliased) clones of the stored mean and std
        running_mean = theano.clone(self.mean, share_inputs=False)
        running_std = theano.clone(self.std, share_inputs=False)
        # set a default update for them
        running_mean.default_update = ((1 - self.alpha) * running_mean +
                                       self.alpha * mean)
        running_std.default_update = ((1 - self.alpha) * running_std +
                                      self.alpha * std)
        # and include them in the graph so their default updates will be
        # applied (although the expressions will be optimized away later)
        mean += 0 * running_mean
        std += 0 * running_std

    # std += self.epsilon
    mean = T.addbroadcast(mean, *self.axes)
    std = T.addbroadcast(std, *self.axes)
    beta = T.addbroadcast(self.beta, *self.axes)
    gamma = T.addbroadcast(self.gamma, *self.axes)

    # normalized = (input - mean) * (gamma / std) + beta
    normalized = (input - mean) / std
    if self.rescale:
        normalized = normalized * gamma + beta
    return self.nonlinearity(normalized)
def get_output_for(self, input, deterministic=False, **kwargs):
    if deterministic:
        # use stored mean and std
        mean = self.mean
        std = self.std
    else:
        # use this batch's mean and std
        mean = input.mean(self.axes, keepdims=True)
        std = input.std(self.axes, keepdims=True)
        # and update the stored mean and std:
        # we create (memory-aliased) clones of the stored mean and std
        running_mean = theano.clone(self.mean, share_inputs=False)
        running_std = theano.clone(self.std, share_inputs=False)
        # set a default update for them
        running_mean.default_update = ((1 - self.alpha) * running_mean +
                                       self.alpha * mean)
        running_std.default_update = ((1 - self.alpha) * running_std +
                                      self.alpha * std)
        # and include them in the graph so their default updates will be
        # applied (although the expressions will be optimized away later)
        mean += 0 * running_mean
        std += 0 * running_std

    std += self.epsilon
    mean = T.addbroadcast(mean, *self.axes)
    std = T.addbroadcast(std, *self.axes)
    beta = T.addbroadcast(self.beta, *self.axes)
    gamma = T.addbroadcast(self.gamma, *self.axes)

    normalized = (input - mean) * (gamma / std) + beta
    return self.nonlinearity(normalized)
def get_padded_shuffled_mask(self, train, X, pad=0):
    mask = self.get_input_mask(train)
    if mask is None:
        mask = T.ones_like(X.sum(axis=-1))  # is there a better way to do this without a sum?

    # mask is (nb_samples, time)
    mask = T.shape_padright(mask)  # (nb_samples, time, 1)
    mask = T.addbroadcast(mask, -1)  # (time, nb_samples, 1) matrix.
    mask = mask.dimshuffle(1, 0, 2)  # (time, nb_samples, 1)

    if pad > 0:
        # left-pad in time with 0
        padding = alloc_zeros_matrix(pad, mask.shape[1], 1)
        mask = T.concatenate([padding, mask], axis=0)
    return mask.astype('int8')
def squeeze(x, axis):
    '''Remove a 1-dimension from the tensor at index "axis".
    '''
    x = T.addbroadcast(x, axis)
    return T.squeeze(x)
def __call__(self, input, input_lm=None, h0=None):
    batch_size = input.shape[0]
    if h0 == None:
        h0 = T.alloc(np.asarray(0., dtype=theano.config.floatX), batch_size, self.n_hidden)

    if input_lm == None:
        def step(x_t, h_tm_prev):
            x_z = T.dot(x_t, self.W_z) + self.b_z
            x_r = T.dot(x_t, self.W_r) + self.b_r
            x_h = T.dot(x_t, self.W_h) + self.b_h
            z_t = self.inner_activation(x_z + T.dot(h_tm_prev, self.U_z))
            r_t = self.inner_activation(x_r + T.dot(h_tm_prev, self.U_r))
            hh_t = self.activation(x_h + T.dot(r_t * h_tm_prev, self.U_h))
            h_t = (1 - z_t) * hh_t + z_t * h_tm_prev
            return h_t

        self.h_l, _ = theano.scan(step,
                                  sequences=input.dimshuffle(1, 0, 2),
                                  outputs_info=h0)
    else:
        def step(x_t, mask, h_tm_prev):
            x_z = T.dot(x_t, self.W_z) + self.b_z
            x_r = T.dot(x_t, self.W_r) + self.b_r
            x_h = T.dot(x_t, self.W_h) + self.b_h
            z_t = self.inner_activation(x_z + T.dot(h_tm_prev, self.U_z))
            r_t = self.inner_activation(x_r + T.dot(h_tm_prev, self.U_r))
            hh = self.activation(x_h + T.dot(r_t * h_tm_prev, self.U_h))
            h_t = z_t * h_tm_prev + (1 - z_t) * hh
            h_t = mask * h_t + (1 - mask) * h_tm_prev
            return h_t

        self.h_l, _ = theano.scan(step,
                                  sequences=[input.dimshuffle(1, 0, 2),
                                             T.addbroadcast(input_lm.dimshuffle(1, 0, 'x'), -1)],
                                  outputs_info=h0)
    self.h_l = self.h_l.dimshuffle(1, 0, 2)
    return self.h_l[:, -1, :]
def get_output_for(self, input, style=None, **kwargs):
    mean = input.mean(self.axes)
    inv_std = T.inv(T.sqrt(input.var(self.axes) + self.epsilon))
    pattern = [0, 1, 'x', 'x']
    if style == None:
        pattern_params = ['x', 0, 'x', 'x']
        beta = 0 if self.beta is None else self.beta.dimshuffle(pattern_params)
        gamma = 1 if self.gamma is None else self.gamma.dimshuffle(pattern_params)
    else:
        pattern_params = pattern
        beta = 0 if self.beta is None else self.beta[style].dimshuffle(pattern_params)
        gamma = 1 if self.gamma is None else self.gamma[style].dimshuffle(pattern_params)
        # if self.beta is not None:
        #     beta = ifelse(T.eq(style.shape[0], 1), T.addbroadcast(beta, 0), beta)
        # if self.gamma is not None:
        #     gamma = ifelse(T.eq(style.shape[0], 1), T.addbroadcast(gamma, 0), gamma)

    mean = mean.dimshuffle(pattern)
    inv_std = inv_std.dimshuffle(pattern)

    # normalize
    normalized = (input - mean) * (gamma * inv_std) + beta
    return normalized
def get_output_for(self, inputs, **kwargs):
    m = tensor.mean(inputs[0], axis=1, keepdims=True)
    sv = tensor.addbroadcast(inputs[1], 1)
    return inputs[0] + sv - m
def __call__(self, input):
    mean = input.mean(self.axes, keepdims=True)
    std = input.std(self.axes, keepdims=True) + self.epsilon

    # Don't batch-normalise a single data point
    mean = ifelse(T.gt(input.shape[0], 1), mean, T.zeros(mean.shape, dtype=mean.dtype))
    std = ifelse(T.gt(input.shape[0], 1), std, T.ones(std.shape, dtype=std.dtype))

    return (input - mean) * T.addbroadcast((self.gamma / std) + self.beta, *self.axes)
def grad(self, inputs, gout):
    (x,) = inputs
    (gz,) = gout
    if x.dtype not in continuous_dtypes:
        return [x.zeros_like(dtype=theano.config.floatX)]

    if self.structured:
        if self.axis is None:
            r = gz * theano.sparse.sp_ones_like(x)
        elif self.axis == 0:
            r = col_scale(theano.sparse.sp_ones_like(x), gz)
        elif self.axis == 1:
            r = row_scale(theano.sparse.sp_ones_like(x), gz)
        else:
            raise ValueError('Illegal value for self.axis.')
    else:
        o_format = x.format
        x = dense_from_sparse(x)
        if _is_sparse_variable(gz):
            gz = dense_from_sparse(gz)
        if self.axis is None:
            r = tensor.second(x, gz)
        else:
            ones = tensor.ones_like(x)
            if self.axis == 0:
                r = tensor.addbroadcast(gz.dimshuffle('x', 0), 0) * ones
            elif self.axis == 1:
                r = tensor.addbroadcast(gz.dimshuffle(0, 'x'), 1) * ones
            else:
                raise ValueError('Illegal value for self.axis.')
        r = SparseFromDense(o_format)(r)
    return [r]
def test_rebroadcast_rebroadcast(self):
    mode = theano.compile.get_default_mode().including('canonicalize')
    m = T.matrix()
    s = T.addbroadcast(m, 0, 1)
    v = T.unbroadcast(s, 1)
    f = theano.function([m], v, mode=mode)
    f([[76]])
    e = f.maker.fgraph.toposort()
    rebroadcast_nodes = [n for n in e if isinstance(n.op, T.Rebroadcast)]
    assert len(rebroadcast_nodes) == 1
    assert rebroadcast_nodes[0].op.axis == {0: True}
def __init__(self, input_l, input_r, n_in, n_hidden, n_out, activation=T.tanh,
             output_type='real', batch_size=200, input_lm=None, input_rm=None):
    if input_lm == None:
        input_lm = theano.shared(value=np.ones((batch_size, 20), dtype=theano.config.floatX), borrow=True)
    if input_rm == None:
        input_rm = theano.shared(value=np.ones((batch_size, 20), dtype=theano.config.floatX), borrow=True)
    self.activation = activation
    self.output_type = output_type

    # Parameters are reshaped views of theta
    param_idx = 0  # pointer to somewhere along parameter vector

    # recurrent weights as a shared variable
    self.W = theano.shared(ortho_weight(n_hidden), borrow=True, name='W')
    # input to hidden layer weights
    self.W_in = theano.shared(glorot_uniform((n_in, n_hidden)), borrow=True, name='W_in')

    self.h0 = theano.shared(value=np.zeros((batch_size, n_hidden), dtype=theano.config.floatX),
                            borrow=True, name='h0')
    self.bh = theano.shared(value=np.zeros((batch_size, n_hidden), dtype=theano.config.floatX),
                            borrow=True, name='bh')
    # self.by = theano.shared(value=np.zeros((n_out,), dtype=theano.config.floatX), borrow=True, name='by')

    # for convenience
    self.params = [self.W, self.W_in, self.bh]

    # activation function
    def step(x_t, mask, h_tm1):
        h_tm1 = mask * h_tm1
        # h_t = h_tm1 + self.bh
        h_t = T.tanh(T.dot(x_t, self.W_in) +
                     T.dot(h_tm1, self.W) + self.bh)
        # y_t = T.dot(h_t, self.W_out) + self.by
        return h_t

    # a = T.addbroadcast(input_lm.dimshuffle(1, 0), -1)
    self.h_l, _ = theano.scan(step,
                              sequences=[input_l.dimshuffle(1, 0, 2),
                                         T.addbroadcast(input_lm.dimshuffle(1, 0, 'x'), -1)],
                              outputs_info=theano.shared(value=np.zeros((batch_size, n_hidden),
                                                                        dtype=theano.config.floatX),
                                                         borrow=True))
    self.h_r, _ = theano.scan(step,
                              sequences=[input_r.dimshuffle(1, 0, 2),
                                         T.addbroadcast(input_rm.dimshuffle(1, 0, 'x'), -1)],
                              outputs_info=theano.shared(value=np.zeros((batch_size, n_hidden),
                                                                        dtype=theano.config.floatX),
                                                         borrow=True))
    self.h_l = self.h_l.dimshuffle(1, 0, 2)
    self.h_r = self.h_r.dimshuffle(1, 0, 2)
def apply(self, char_seq, sample_matrix, char_aux):
    # Time as first dimension
    embeddings = self.lookup.apply(char_seq)
    gru_out = self.dgru.apply(
        **merge(self.gru_fork.apply(embeddings, as_dict=True),
                {'mask': char_aux}))
    wgru_out = tensor.exp(self.wl.apply(self.bidir_w.apply(embeddings, char_aux)))

    if self.dgru_depth > 1:
        gru_out = gru_out[-1]

    gru_out = tensor.addbroadcast(wgru_out, 2) * gru_out
    sampled_representation = tensor.tanh(tensor.batched_dot(sample_matrix, gru_out.dimshuffle([1, 0, 2])))
    return sampled_representation.dimshuffle([1, 0, 2]), wgru_out
def _outer_substract(self, x, y):
    z = x.dimshuffle(0, 1, 'x')
    z = T.addbroadcast(z, 2)
    return (z - y.T).dimshuffle(0, 2, 1)
def get_output_for(self, inputs):
    A = inputs[0]
    X = inputs[1]

    num_nodes = A.shape[0]

    structural_symbolic_loss = T.addbroadcast(
        T.reshape(
            1 + A + self._symbolic_triangles(A) + self._symbolic_arrows(A),
            [num_nodes, num_nodes, 1]
        ),
        2
    )

    feature_symbolic_loss = (
        (self._outer_substract(X, X) ** 2) * T.addbroadcast(self.W, 0, 1)
    )

    unnormalized_logprobs = T.sum(
        structural_symbolic_loss + feature_symbolic_loss,
        2
    )

    flat_reduction_index = T.argmax(unnormalized_logprobs)

    return self.reduce(A, [
        flat_reduction_index // num_nodes,
        flat_reduction_index % num_nodes
    ])
def __call__(self, input, input_lm=None, return_list=False, Init_input=None, check_gate=False):
    # activation function
    if Init_input == None:
        init = theano.shared(value=np.zeros((self.batch_size, self.n_hidden),
                                            dtype=theano.config.floatX), borrow=True)
    else:
        init = Init_input

    if check_gate:
        self.h_l, _ = theano.scan(self.step3,
                                  sequences=[input.dimshuffle(1, 0, 2),
                                             T.addbroadcast(input_lm.dimshuffle(1, 0, 'x'), -1)],
                                  outputs_info=[init,
                                                theano.shared(value=np.zeros((self.batch_size, self.n_hidden),
                                                                             dtype=theano.config.floatX),
                                                              borrow=True)])
        return [self.h_l[0][:, -1, :], self.h_l[1]]

    if input_lm == None:
        self.h_l, _ = theano.scan(self.step2,
                                  sequences=input.dimshuffle(1, 0, 2),
                                  outputs_info=init)
    else:
        self.h_l, _ = theano.scan(self.step,
                                  sequences=[input.dimshuffle(1, 0, 2),
                                             T.addbroadcast(input_lm.dimshuffle(1, 0, 'x'), -1)],
                                  outputs_info=init)
    self.h_l = self.h_l.dimshuffle(1, 0, 2)
    if return_list == True:
        return self.h_l
    return self.h_l[:, -1, :]
def addbroadcast(x, *axes):
    return T.addbroadcast(x, *axes)


# ===========================================================================
# Predefined data
# ===========================================================================
def call(self, x, mask=None):
    lay0 = x[0]
    lay1 = x[1]

    lay1 = T.addbroadcast(lay1, lay1.ndim - 1)
    return lay0 * lay1
def create_gradients(self, loss, deterministic=False):
    # load networks
    l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \
        l_qa_mu, l_qa_logsigma, l_qz_mu, l_qz_logsigma, l_qa, l_qz, l_cv, c, v = self.network

    # load params
    p_params = lasagne.layers.get_all_params(
        # [l_px_mu], trainable=True)
        [l_px_mu, l_pa_mu, l_pa_logsigma], trainable=True)
    qa_params = lasagne.layers.get_all_params(l_qa_mu, trainable=True)
    qz_params = lasagne.layers.get_all_params(l_qz, trainable=True)
    cv_params = lasagne.layers.get_all_params(l_cv, trainable=True)

    # load neural net outputs (probabilities have been precomputed)
    log_pxz, log_px_given_z, log_pz = self.log_pxz, self.log_px_given_z, self.log_pz
    log_qza_given_x = self.log_qza_given_x
    log_qz_given_x = self.log_qz_given_x
    log_qz_given_x_dgz = self.log_qz_given_x_dgz
    cv = T.addbroadcast(lasagne.layers.get_output(l_cv), 1)

    # compute learning signals
    l0 = log_px_given_z + log_pz - log_qz_given_x  # - cv  # NOTE: this didn't have q(a)
    l_avg, l_var = l0.mean(), l0.var()
    c_new = 0.8*c + 0.2*l_avg
    v_new = 0.8*v + 0.2*l_var
    l = (l0 - c_new) / T.maximum(1, T.sqrt(v_new))
    l_target = (l0 - c_new) / T.maximum(1, T.sqrt(v_new))
    # l_target = log_px_given_z + log_pz - log_qz_given_x

    # compute grad wrt p
    p_grads = T.grad(-log_pxz.mean(), p_params)

    # compute grad wrt q_a
    elbo = T.mean(log_pxz - log_qza_given_x)
    qa_grads = T.grad(-elbo, qa_params)

    # compute grad wrt q_z
    qz_target = T.mean(dg(l_target) * log_qz_given_x_dgz)
    qz_grads = T.grad(-0.2*qz_target, qz_params)  # 5x slower rate for q
    # qz_grads = T.grad(-0.2*T.mean(l0), qz_params)  # 5x slower rate for q
    # qz_grads = T.grad(-0.2*elbo, qz_params)  # 5x slower rate for q

    # compute grad of cv net
    cv_target = T.mean(l0**2)
    # cv_grads = [0.2*g for g in T.grad(cv_target, cv_params)]

    # combine and clip gradients
    clip_grad = 1
    max_norm = 5
    # grads = p_grads + qa_grads + qz_grads + cv_grads
    grads = p_grads + qa_grads + qz_grads  # + cv_grads
    mgrads = lasagne.updates.total_norm_constraint(grads, max_norm=max_norm)
    cgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]

    return cgrads
def __call__(self, input, input_lm=None, h0=None, c0=None):
    batch_size = input_lm.shape[0]
    if h0 == None:
        h0 = T.alloc(np.asarray(0., dtype=theano.config.floatX), batch_size, self.n_hidden)
    if c0 == None:
        c0 = T.alloc(np.asarray(0., dtype=theano.config.floatX), batch_size, self.n_hidden)

    if input_lm == None:
        def step(x_t, h_tm_prev, c_tm_prev):
            x_i = T.dot(x_t, self.W_i) + self.b_i
            x_f = T.dot(x_t, self.W_f) + self.b_f
            x_c = T.dot(x_t, self.W_c) + self.b_c
            x_o = T.dot(x_t, self.W_o) + self.b_o

            i_t = self.inner_activation(x_i + T.dot(h_tm_prev, self.U_i))
            f_t = self.inner_activation(x_f + T.dot(h_tm_prev, self.U_f))
            c_t = f_t * c_tm_prev + i_t * self.activation(x_c + T.dot(h_tm_prev, self.U_c))  # internal memory
            o_t = self.inner_activation(x_o + T.dot(h_tm_prev, self.U_o))
            h_t = o_t * self.activation(c_t)  # actual hidden state
            return [h_t, c_t]

        self.h_1, _ = theano.scan(step,
                                  sequences=input.dimshuffle(1, 0, 2),
                                  outputs_info=[h0, c0])
    else:
        def step(x_t, mask, h_tm_prev, c_tm_prev):
            x_i = T.dot(x_t, self.W_i) + self.b_i
            x_f = T.dot(x_t, self.W_f) + self.b_f
            x_c = T.dot(x_t, self.W_c) + self.b_c
            x_o = T.dot(x_t, self.W_o) + self.b_o

            i_t = self.inner_activation(x_i + T.dot(h_tm_prev, self.U_i))
            f_t = self.inner_activation(x_f + T.dot(h_tm_prev, self.U_f))
            c_t = f_t * c_tm_prev + i_t * self.activation(x_c + T.dot(h_tm_prev, self.U_c))  # internal memory
            o_t = self.inner_activation(x_o + T.dot(h_tm_prev, self.U_o))
            h_t = o_t * self.activation(c_t)  # actual hidden state

            h_t = mask * h_t + (1 - mask) * h_tm_prev
            c_t = mask * c_t + (1 - mask) * c_tm_prev
            return [h_t, c_t]

        self.h_1, _ = theano.scan(step,
                                  sequences=[input.dimshuffle(1, 0, 2),
                                             T.addbroadcast(input_lm.dimshuffle(1, 0, 'x'), -1)],
                                  outputs_info=[h0, c0])

    self.h_1 = self.h_1[0].dimshuffle(1, 0, 2)
    return self.h_1[:, -1, :]
def test_dnn_batchnorm_train():
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    if dnn.version(raises=False) < 5000:
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()

    for mode in ('per-activation', 'spatial'):
        for vartype in (T.ftensor5, T.ftensor4, T.ftensor3, T.fmatrix, T.fvector):
            x, scale, bias = (vartype(n) for n in ('x', 'scale', 'bias'))
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used

            # forward pass
            out, x_mean, x_invstd = dnn.dnn_batch_normalization_train(
                x, scale, bias, mode, eps)
            # reference forward pass
            if mode == 'per-activation':
                axes = (0,)
            elif mode == 'spatial':
                axes = (0,) + tuple(range(2, ndim))
            x_mean2 = x.mean(axis=axes, keepdims=True)
            x_invstd2 = T.inv(T.sqrt(x.var(axis=axes, keepdims=True) + eps))
            scale2 = T.addbroadcast(scale, *axes)
            bias2 = T.addbroadcast(bias, *axes)
            out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2
            # backward pass
            dy = vartype('dy')
            grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
            # reference backward pass
            grads2 = T.grad(None, wrt=[x, scale, bias], known_grads={out2: dy})
            # compile
            f = theano.function([x, scale, bias, dy],
                                [out, x_mean, x_invstd, out2, x_mean2, x_invstd2] +
                                grads + grads2, mode=mode_with_gpu)
            # run
            for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)):
                data_shape = data_shape[:ndim]
                param_shape = tuple(1 if d in axes else s
                                    for d, s in enumerate(data_shape))
                X = 4 + 3 * numpy.random.randn(*data_shape).astype('float32')
                Dy = -1 + 2 * numpy.random.randn(*data_shape).astype('float32')
                Scale = numpy.random.randn(*param_shape).astype('float32')
                Bias = numpy.random.randn(*param_shape).astype('float32')
                outputs = f(X, Scale, Bias, Dy)
                # compare outputs
                utt.assert_allclose(outputs[0], outputs[0 + 3])  # out
                utt.assert_allclose(outputs[1], outputs[1 + 3])  # mean
                utt.assert_allclose(outputs[2], outputs[2 + 3])  # invstd
                # compare gradients
                utt.assert_allclose(outputs[6], outputs[6 + 3], atol=1e-4)  # dx
                utt.assert_allclose(outputs[7], outputs[7 + 3], rtol=2e-4, atol=1e-4)  # dscale
                utt.assert_allclose(outputs[8], outputs[8 + 3])  # dbias