def get_sampling_model_and_input(exp_config):
    """Assemble the sampling graph used for minimum-risk training.

    An encoder and a decoder (the latter created with
    ``loss_function='min_risk'``) are built from the sizes stored in
    ``exp_config``; the decoder's generation graph is then wrapped in a
    ``Model``.

    Parameters
    ----------
    exp_config : mapping
        Read keys: ``src_vocab_size``, ``enc_embed``, ``enc_nhids``,
        ``trg_vocab_size``, ``dec_embed`` and ``dec_nhids``.

    Returns
    -------
    tuple
        ``(sampling_model, sampling_input, encoder, decoder)``.
    """
    # Bricks
    src_vocab = exp_config['src_vocab_size']
    enc_embed = exp_config['enc_embed']
    enc_hidden = exp_config['enc_nhids']
    encoder = BidirectionalEncoder(src_vocab, enc_embed, enc_hidden)

    trg_vocab = exp_config['trg_vocab_size']
    dec_embed = exp_config['dec_embed']
    dec_hidden = exp_config['dec_nhids']
    # The decoder is told the representation is twice the encoder size
    # (presumably forward + backward states concatenated -- TODO confirm).
    decoder = Decoder(trg_vocab, dec_embed, dec_hidden,
                      exp_config['enc_nhids'] * 2,
                      loss_function='min_risk')

    # Symbolic input
    logger.info('Creating theano variables')
    sampling_input = tensor.lmatrix('source')

    # Generation graph; the source mask is all ones
    logger.info("Building sampling model")
    all_ones_mask = tensor.ones(sampling_input.shape)
    sampling_representation = encoder.apply(sampling_input, all_ones_mask)
    generated = decoder.generate(sampling_input, sampling_representation)

    # The Model lets callers obtain a theano function from the sampling graph
    logger.info("Creating Sampling Model...")
    sampling_model = Model(generated)

    return sampling_model, sampling_input, encoder, decoder
def test_adv1_inc_sub_notlastdim_2didx(self):
    """set/inc_subtensor with a matrix index on the second axis.

    1-dimensional advanced indexing over a dimension that is not the
    first (outer-most) one must work when the index itself is a matrix.
    """
    sym_m = matrix('m')
    sym_idx = lmatrix('i')

    zeroed = set_subtensor(sym_m[:, sym_idx], 0)
    bumped = inc_subtensor(sym_m[:, sym_idx], 1)
    compiled = theano.function([sym_m, sym_idx], [zeroed, bumped])

    data = rand(5, 7)
    indices = randint_ranged(min=0, max=6, shape=(4, 2))

    expected_set = data.copy()
    expected_inc = data.copy()

    got_set, got_inc = compiled(data, indices)

    # Build the references one index at a time so that repeated indices
    # are incremented once per occurrence.
    for col in indices.ravel():
        expected_set[:, col] = 0
        expected_inc[:, col] += 1

    assert numpy.allclose(got_set, expected_set), (got_set, expected_set)
    assert numpy.allclose(got_inc, expected_inc), (got_inc, expected_inc)
def test_blocksparse_inplace_gemv_opt():
    """The last node of a sparse_block_dot graph is in-place unless FAST_COMPILE.

    Also checks that the stack trace is kept on the expected gemv op.
    """
    bias = tensor.fmatrix()
    weights = tensor.ftensor4()
    hidden = tensor.ftensor3()
    in_idx = tensor.lmatrix()
    out_idx = tensor.lmatrix()

    out = sparse_block_dot(weights, hidden, in_idx, bias, out_idx)
    fn = theano.function([weights, hidden, in_idx, bias, out_idx], out)

    fast_compile = theano.config.mode == "FAST_COMPILE"
    final_op = fn.maker.fgraph.toposort()[-1].op

    if fast_compile:
        # No in-place optimisation is expected in this mode.
        assert not final_op.inplace
        assert check_stack_trace(fn, ops_to_check=[sparse_block_gemv])
    else:
        assert final_op.inplace
        assert check_stack_trace(fn, ops_to_check=[sparse_block_gemv_inplace])
def test_blocksparse_inplace_outer_opt():
    """The last node of the W-gradient graph is in-place unless FAST_COMPILE.

    The compiled function returns both the sparse_block_dot output and the
    gradient of its sum with respect to ``W``.
    """
    bias = tensor.fmatrix()
    weights = tensor.ftensor4()
    hidden = tensor.ftensor3()
    in_idx = tensor.lmatrix()
    out_idx = tensor.lmatrix()

    out = sparse_block_dot(weights, hidden, in_idx, bias, out_idx)
    outputs = [out, tensor.grad(out.sum(), wrt=weights)]
    fn = theano.function([weights, hidden, in_idx, bias, out_idx], outputs)

    fast_compile = theano.config.mode == "FAST_COMPILE"
    final_op = fn.maker.fgraph.toposort()[-1].op

    if fast_compile:
        # No in-place optimisation is expected in this mode.
        assert not final_op.inplace
        assert check_stack_trace(fn, ops_to_check=sparse_block_outer)
    else:
        assert final_op.inplace
        assert check_stack_trace(fn, ops_to_check=sparse_block_outer_inplace)
def __init__(self, rng, cost, errors, params, method, learning_rate=0.01, max_norm=9):
    """Store the training graph pieces and build the parameter updates.

    Parameters
    ----------
    rng, cost, errors, params
        Stored unchanged on the instance.
    method : str
        Update rule. Only ``'adagrad'`` is handled in the visible code;
        for any other value ``self.updates`` is not set here.
    learning_rate, max_norm
        Forwarded to ``get_adagrad_updates``.
    """
    self.rng = rng
    self.cost = cost
    self.errors = errors
    self.params = params

    # Symbolic mini-batch placeholders
    self.batch_x = T.lmatrix('batch_x')
    self.batch_y = T.ivector('batch_y')

    if method == 'adagrad':
        self.updates = get_adagrad_updates(
            cost,
            params,
            learning_rate=learning_rate,
            max_norm=max_norm,
            _eps=1e-6,
        )
def arch_memnet_lexical(self):
    '''
    Memory network in which each memory slot is a lexical item.

    Returns a ``(probs, inputs, params)`` tuple: the softmax output over
    the vocabulary, a dict of the symbolic input variables keyed by name,
    and the list of parameters collected from the layers built here.
    '''
    contexts = T.ltensor3('contexts')
    querys = T.lmatrix('querys')
    yvs = T.lvector('yvs')
    hop = 1
    params = []

    # Embed the question and reshape to (batchsize, sen_maxlen, hidden_dim).
    question_layer = Embed(self.vocab_size, self.hidden_dim)
    q = T.reshape(question_layer(querys.flatten()),
                  (self.batchsize, self.sen_maxlen, self.hidden_dim))
    if self.kwargs.get('position_encoding'):
        lmat = position_encoding(self.sen_maxlen, self.hidden_dim).dimshuffle('x', 0, 1)
        # Parenthesised form prints the same text on Python 2 and is also
        # valid on Python 3 (the bare print statement is not).
        print('[memory network] use PE')
        q = q * lmat
    u = mean(q, axis=1)
    params.extend(question_layer.params)

    # One memory layer per hop; each hop adds the layer output to the state.
    for _ in range(hop):
        mem_layer = MemoryLayer(self.batchsize, self.mem_size, self.unit_size,
                                self.vocab_size, self.hidden_dim, **self.kwargs)
        params.extend(mem_layer.params)
        o = mem_layer(contexts, u)
        u = u + o

    linear = LinearLayer(self.hidden_dim, self.vocab_size)
    params.extend(linear.params)
    probs = softmax(linear(u))

    inputs = {
        'contexts': contexts,
        'querys': querys,
        'yvs': yvs,
        'cvs': T.lmatrix('cvs')
    }
    return (probs, inputs, params)
def arch_lstmq(self, param_b=2):
    """Build the LSTM-over-memories architecture.

    The question and every memory unit are embedded, weighted by a
    position encoding and averaged; an ``LSTMq`` layer then consumes the
    memories together with the question vector.

    ``param_b`` is accepted but not used by the visible code.

    Returns a ``(probs, inputs, params)`` tuple.
    """
    contexts = T.ltensor3('contexts')
    querys = T.lmatrix('querys')
    yvs = T.lvector('yvs')

    params = []

    # Question: embed -> (batchsize, sen_maxlen, hidden_dim) -> PE -> mean
    question_layer = Embed(self.vocab_size, self.hidden_dim)
    params.extend(question_layer.params)
    question_shape = (self.batchsize, self.sen_maxlen, self.hidden_dim)
    q = T.reshape(question_layer(querys.flatten()), question_shape)
    question_pe = position_encoding(self.sen_maxlen, self.hidden_dim)
    q = q * question_pe.dimshuffle('x', 0, 1)
    u = mean(q, axis=1)

    # Memories: embed -> (batchsize, mem_size, unit_size, hidden_dim)
    # -> PE -> mean over the unit axis
    embed_layer = Embed(self.vocab_size, self.hidden_dim)
    params.extend(embed_layer.params)
    memory_pe = position_encoding(self.unit_size, self.hidden_dim)
    lmat = memory_pe.dimshuffle('x', 'x', 0, 1)
    memory_shape = (self.batchsize, self.mem_size, self.unit_size, self.hidden_dim)
    m = T.reshape(embed_layer(contexts.flatten()), memory_shape)
    m = mean(m * lmat, axis=2)

    # The LSTM receives the memories with the first two axes swapped.
    lstm = LSTMq(self.batchsize, self.hidden_dim)
    params.extend(lstm.params)
    o = lstm(m.dimshuffle(1, 0, 2), u)

    linear = LinearLayer(self.hidden_dim, self.vocab_size)
    params.extend(linear.params)
    probs = softmax(linear(o))

    inputs = {
        'contexts': contexts,
        'querys': querys,
        'yvs': yvs,
        'cvs': T.lmatrix('cvs')
    }
    return (probs, inputs, params)
def arch_memnet_selfsup(self):
    '''
    Memory network with self supervision.

    Returns a ``(probs, inputs, params)`` tuple, where ``probs`` comes
    from the memory layer's ``get_probs`` and ``params`` holds the
    question embedding parameters.
    '''
    contexts = T.ltensor3('contexts')
    querys = T.lmatrix('querys')
    yvs = T.lmatrix('yvs')

    params = []

    # Embed the question and reshape to (batchsize, sen_maxlen, hidden_dim).
    question_layer = Embed(self.vocab_size, self.hidden_dim)
    q = T.reshape(question_layer(querys.flatten()),
                  (self.batchsize, self.sen_maxlen, self.hidden_dim))
    if self.kwargs.get('position_encoding'):
        lmat = position_encoding(self.sen_maxlen, self.hidden_dim).dimshuffle('x', 0, 1)
        # Parenthesised form prints the same text on Python 2 and is also
        # valid on Python 3 (the bare print statement is not).
        print('[memory network] use PE')
        q = q * lmat
    u = mean(q, axis=1)
    params.extend(question_layer.params)

    mem_layer = MemoryLayer(self.batchsize, self.mem_size, self.unit_size,
                            self.vocab_size, self.hidden_dim, **self.kwargs)
    # NOTE(review): mem_layer.params are not added to `params` here, unlike
    # arch_memnet_lexical -- confirm whether that is intentional.
    probs = mem_layer.get_probs(contexts, u).dimshuffle(0, 2)

    inputs = {
        'contexts': contexts,
        'querys': querys,
        'yvs': yvs,
        'cvs': T.lmatrix('cvs')
    }
    return (probs, inputs, params)
def get_sampling_model_and_input(exp_config):
    """Assemble the prefix-conditioned sampling graph for min-risk training.

    Parameters
    ----------
    exp_config : mapping
        Read keys: ``src_vocab_size``, ``enc_embed``, ``enc_nhids``,
        ``trg_vocab_size``, ``dec_embed`` and ``dec_nhids``.

    Returns
    -------
    tuple
        ``(sampling_model, sampling_source_input, sampling_prefix_input,
        encoder, decoder, generated)``.
    """
    # Bricks
    src_vocab = exp_config['src_vocab_size']
    enc_embed = exp_config['enc_embed']
    enc_hidden = exp_config['enc_nhids']
    encoder = BidirectionalEncoder(src_vocab, enc_embed, enc_hidden)

    # Note: the 'min_risk' kwarg tells the decoder which sequence_generator
    # and cost_function to use
    trg_vocab = exp_config['trg_vocab_size']
    dec_embed = exp_config['dec_embed']
    dec_hidden = exp_config['dec_nhids']
    decoder = NMTPrefixDecoder(trg_vocab, dec_embed, dec_hidden,
                               exp_config['enc_nhids'] * 2,
                               loss_function='min_risk')

    # rename to match baseline NMT systems
    decoder.name = 'decoder'

    # Symbolic inputs
    logger.info('Creating theano variables')
    sampling_source_input = tensor.lmatrix('source')
    sampling_prefix_input = tensor.lmatrix('prefix')

    # Generation graph; the source mask is all ones
    logger.info("Building sampling model")
    all_ones_mask = tensor.ones(sampling_source_input.shape)
    sampling_source_representation = encoder.apply(sampling_source_input,
                                                   all_ones_mask)

    # WORKING: get the costs (logprobs) from here
    # WORKING: make a theano variable for the costs, pass it to expected_cost
    # WORKING: how _exactly_ are the costs computed?
    # return (next_states + [next_outputs] +
    #         list(next_glimpses.values()) + [next_costs])
    generated = decoder.generate(sampling_source_input,
                                 sampling_source_representation,
                                 target_prefix=sampling_prefix_input)

    # The Model lets callers obtain a theano function from the sampling graph
    logger.info("Creating Sampling Model...")
    sampling_model = Model(generated)

    # TODO: update clients with sampling_context_input
    return (sampling_model, sampling_source_input, sampling_prefix_input,
            encoder, decoder, generated)
def create_model(encoder, decoder, smoothing_constant=0.005):
    """Build the expected-cost graph for prefix-conditioned min-risk training.

    Parameters
    ----------
    encoder, decoder
        Bricks providing ``encoder.apply`` and ``decoder.expected_cost``.
    smoothing_constant : float
        Forwarded unchanged to ``decoder.expected_cost``.

    Returns
    -------
    The value returned by ``decoder.expected_cost``.
    """
    logger.info('Creating theano variables')

    # Source side
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')

    # Sampled hypotheses
    samples = tensor.lmatrix('samples')
    samples_mask = tensor.matrix('samples_mask')

    # Target prefix
    target_prefix = tensor.lmatrix('target_prefix')
    prefix_mask = tensor.matrix('target_prefix_mask')

    # scores is (batch, samples)
    # We don't need a scores mask because there should be the same number of
    # scores for each instance; num samples is a hyperparameter of the model
    scores = tensor.matrix('scores')

    # This is the part that is different for the MinimumRiskSequenceGenerator
    representation = encoder.apply(source_sentence, source_sentence_mask)
    cost = decoder.expected_cost(
        representation,
        source_sentence_mask,
        samples,
        samples_mask,
        scores,
        prefix_outputs=target_prefix,
        prefix_mask=prefix_mask,
        smoothing_constant=smoothing_constant,
    )

    return cost
# def main(model, cost, config, tr_stream, dev_stream, use_bokeh=False):
def load_params_and_get_beam_search(exp_config):
    """Build an NMT model, load saved parameters and return a beam search.

    Parameters
    ----------
    exp_config : mapping
        Read keys: ``src_vocab_size``, ``enc_embed``, ``enc_nhids``,
        ``trg_vocab_size``, ``dec_embed``, ``dec_nhids`` and
        ``saved_parameters`` (read unconditionally). ``brick_delimiter``
        is optional and defaults to ``None``.

    Returns
    -------
    tuple
        ``(beam_search, sampling_input)``.
    """
    # Bricks
    src_vocab = exp_config['src_vocab_size']
    enc_embed = exp_config['enc_embed']
    enc_hidden = exp_config['enc_nhids']
    encoder = BidirectionalEncoder(src_vocab, enc_embed, enc_hidden)

    trg_vocab = exp_config['trg_vocab_size']
    dec_embed = exp_config['dec_embed']
    dec_hidden = exp_config['dec_nhids']
    decoder = Decoder(trg_vocab, dec_embed, dec_hidden,
                      exp_config['enc_nhids'] * 2)

    # Symbolic input
    logger.info('Creating theano variables')
    sampling_input = tensor.lmatrix('source')

    # Generation graph; the source mask is all ones
    logger.info("Building sampling model")
    all_ones_mask = tensor.ones(sampling_input.shape)
    sampling_representation = encoder.apply(sampling_input, all_ones_mask)
    generated = decoder.generate(sampling_input, sampling_representation)

    # Pick the generator's "outputs" variable out of the graph rooted at
    # generated[1] (next_outputs); the second match is the one used.
    outputs_filter = VariableFilter(bricks=[decoder.sequence_generator],
                                    name="outputs")
    matches = outputs_filter(ComputationGraph(generated[1]))
    _, samples = matches
    beam_search = BeamSearch(samples=samples)

    # Set the parameters
    logger.info("Creating Model...")
    model = Model(generated)

    saved_path = exp_config['saved_parameters']
    logger.info("Loading parameters from model: {}".format(saved_path))

    # load the parameter values from the file named by `saved_parameters`
    param_values = LoadNMT.load_parameter_values(
        exp_config['saved_parameters'],
        brick_delimiter=exp_config.get('brick_delimiter', None))
    LoadNMT.set_model_parameters(model, param_values)

    return beam_search, sampling_input
def create_model(encoder, decoder):
    """Build the expected-cost graph for minimum-risk training.

    Parameters
    ----------
    encoder, decoder
        Bricks providing ``encoder.apply`` and ``decoder.expected_cost``.

    Returns
    -------
    The value returned by ``decoder.expected_cost``.
    """
    logger.info('Creating theano variables')

    # Source side
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')

    # target_samples = tensor.tensor3('samples').astype('int64')
    # target_samples_mask = tensor.tensor3('target_samples_mask').astype('int64')
    samples = tensor.lmatrix('samples')
    samples_mask = tensor.matrix('samples_mask')

    # scores is (batch, samples)
    # We don't need a scores mask because there should be the same number of
    # scores for each instance; num samples is a hyperparameter of the model
    scores = tensor.matrix('scores')

    # the name is important to make sure pre-trained params get loaded correctly
    # decoder.name = 'decoder'

    # This is the part that is different for the MinimumRiskSequenceGenerator
    representation = encoder.apply(source_sentence, source_sentence_mask)
    cost = decoder.expected_cost(
        representation,
        source_sentence_mask,
        samples,
        samples_mask,
        scores,
    )

    return cost
def test_correct_solution(self):
    """lstsq returns a solution ``m[0]`` such that ``A . m[0] == B``.

    The Theano function is compiled once; the original compiled the very
    same graph twice, which only added compilation time.
    """
    x = tensor.lmatrix()
    y = tensor.lmatrix()
    z = tensor.lscalar()
    b = theano.tensor.nlinalg.lstsq()(x, y, z)
    f = function([x, y, z], b)

    lhs_matrix = numpy.asarray([[2, 1], [3, 4]])
    rhs_matrix = numpy.asarray([[17, 20], [43, 50]])
    # Third lstsq input (presumably numpy.linalg.lstsq's rcond -- TODO confirm)
    scalar_arg = numpy.asarray(1)

    m = f(lhs_matrix, rhs_matrix, scalar_arg)
    self.assertTrue(numpy.allclose(rhs_matrix, numpy.dot(lhs_matrix, m[0])))
def test_adv1_inc_sub_notlastdim_1_2dval_broadcast(self):
    """set/inc_subtensor on a non-leading axis with broadcast values.

    1-dimensional advanced indexing over a dimension that is not the
    first (outer-most) one, incrementing/setting with values that need
    broadcasting. Both a vector and a matrix index are exercised.
    """
    sym_m = matrix('m')

    # (symbolic index, shape of the index values, shape of the set/inc value)
    cases = zip((lvector('i'), lmatrix('i')),
                ((4,), (4, 2)),
                ((3, 1), (3, 1, 1)))

    # Disable the warning emitted for that case; restored in `finally`.
    saved_warn = config.warn.inc_set_subtensor1
    try:
        config.warn.inc_set_subtensor1 = False

        for sym_idx, idx_shape, val_shape in cases:
            selected = sym_m[:, sym_idx]
            zeroed = set_subtensor(selected, numpy.zeros(val_shape))
            bumped = inc_subtensor(selected, numpy.ones(val_shape))
            compiled = theano.function([sym_m, sym_idx], [zeroed, bumped])

            data = rand(3, 5)
            indices = randint_ranged(min=0, max=4, shape=idx_shape)
            expected_set = data.copy()
            expected_inc = data.copy()

            got_set, got_inc = compiled(data, indices)

            # One index at a time, so repeated indices count each time.
            for col in indices.ravel():
                expected_set[:, col] = 0
                expected_inc[:, col] += 1

            assert numpy.allclose(got_set, expected_set), (got_set, expected_set)
            assert numpy.allclose(got_inc, expected_inc), (got_inc, expected_inc)
    finally:
        config.warn.inc_set_subtensor1 = saved_warn
def test_adv1_inc_sub_notlastdim_1_2dval_no_broadcast(self):
    """set/inc_subtensor on a non-leading axis with full-shape values.

    1-dimensional advanced indexing over a dimension that is not the
    first (outer-most) one, incrementing/setting without broadcasting.
    Both a vector and a matrix index are exercised.
    """
    sym_m = matrix('m')

    # (symbolic index, shape of the index values, shape of the set/inc value)
    cases = zip((lvector('i'), lmatrix('i')),
                ((4,), (4, 2)),
                ((3, 4), (3, 4, 2)))

    # Disable the warning emitted for that case; restored in `finally`.
    saved_warn = config.warn.inc_set_subtensor1
    try:
        config.warn.inc_set_subtensor1 = False

        for sym_idx, idx_shape, val_shape in cases:
            selected = sym_m[:, sym_idx]
            zeroed = set_subtensor(selected, numpy.zeros(val_shape))
            bumped = inc_subtensor(selected, numpy.ones(val_shape))
            compiled = theano.function([sym_m, sym_idx], [zeroed, bumped])

            data = rand(3, 5)
            indices = randint_ranged(min=0, max=4, shape=idx_shape)
            expected_set = data.copy()
            expected_inc = data.copy()

            got_set, got_inc = compiled(data, indices)

            # We have to explicitly loop over all individual indices,
            # not as a list or array: numpy only increments the indexed
            # elements once even if the indices are repeated.
            for col in indices.ravel():
                expected_set[:, col] = 0
                expected_inc[:, col] += 1

            assert numpy.allclose(got_set, expected_set), (got_set, expected_set)
            assert numpy.allclose(got_inc, expected_inc), (got_inc, expected_inc)
    finally:
        config.warn.inc_set_subtensor1 = saved_warn
def setUp(self):
    """Create the symbolic variables shared by the tests of this case."""
    # Values to be indexed: scalar, vector, matrix, 3- and 4-d tensors.
    self.s = iscalar()
    self.v = fvector()
    self.m = dmatrix()
    self.t = ctensor3()
    self.ft4 = ftensor4()

    # Index variables.
    self.ix1 = lvector()  # advanced 1d query
    self.ix12 = lvector()
    self.ix2 = lmatrix()
    self.ixr = lrow()
def test_multMatVect():
    """DotModulo must agree with two independent matVecModM calls.

    The compiled function is fed through its storage cells directly and
    its single output is compared half by half with the references.
    """
    sym_A1 = tensor.lmatrix('A1')
    sym_s1 = tensor.ivector('s1')
    sym_m1 = tensor.iscalar('m1')
    sym_A2 = tensor.lmatrix('A2')
    sym_s2 = tensor.ivector('s2')
    sym_m2 = tensor.iscalar('m2')

    sym_inputs = [sym_A1, sym_s1, sym_m1, sym_A2, sym_s2, sym_m2]
    graph = rng_mrg.DotModulo()(*sym_inputs)
    compiled = theano.function(sym_inputs, graph)

    # Random test values (drawn in the same order as the inputs).
    i32max = numpy.iinfo(numpy.int32).max
    A1_val = numpy.random.randint(0, i32max, (3, 3)).astype('int64')
    s1_val = numpy.random.randint(0, i32max, 3).astype('int32')
    m1_val = numpy.asarray(numpy.random.randint(i32max), dtype="int32")
    A2_val = numpy.random.randint(0, i32max, (3, 3)).astype('int64')
    s2_val = numpy.random.randint(0, i32max, 3).astype('int32')
    m2_val = numpy.asarray(numpy.random.randint(i32max), dtype="int32")

    # Write the values straight into the input storage cells.
    values = (A1_val, s1_val, m1_val, A2_val, s2_val, m2_val)
    for position, value in enumerate(values):
        compiled.input_storage[position].storage[0] = value

    expected_first = rng_mrg.matVecModM(A1_val, s1_val, m1_val)
    expected_second = rng_mrg.matVecModM(A2_val, s2_val, m2_val)

    compiled.fn()
    result = compiled.output_storage[0].value

    assert numpy.allclose(expected_first, result[:3])
    assert numpy.allclose(expected_second, result[3:])
def multMatVect(v, A, m1, B, m2):
    # TODO : need description for parameter and return
    """
    Multiply the first half of v by A with a modulo of m1 and the second half
    by B with a modulo of m2.

    ``v[:3]`` is paired with ``A``/``m1`` and ``v[3:]`` with ``B``/``m2``.
    The compiled function is cached on ``multMatVect.dot_modulo`` and is
    built on the first call, when that attribute is ``None``.

    Notes
    -----
    The parameters of dot_modulo are passed implicitly because passing them
    explicitly takes more time than running the function's C-code.

    """
    compiled = multMatVect.dot_modulo
    if compiled is None:
        sym_inputs = [
            tensor.lmatrix('A'),
            tensor.ivector('s'),
            tensor.iscalar('m'),
            tensor.lmatrix('A2'),
            tensor.ivector('s2'),
            tensor.iscalar('m2'),
        ]
        graph = DotModulo()(*sym_inputs)
        compiled = function(sym_inputs, graph, profile=False)
        multMatVect.dot_modulo = compiled

    # Writing into the storage cells and calling `fn` directly bypasses
    # the Theano call overhead.
    values = (A, v[:3], m1, B, v[3:], m2)
    for position, value in enumerate(values):
        compiled.input_storage[position].storage[0] = value
    compiled.fn()

    return compiled.output_storage[0].storage[0]
def test_blocksparse_gpu_outer_opt():
    """Exactly one GpuSparseBlockOuter node appears in the GPU gradient graph."""
    bias = tensor.fmatrix()
    weights = tensor.ftensor4()
    hidden = tensor.ftensor3()
    in_idx = tensor.lmatrix()
    out_idx = tensor.lmatrix()

    out = sparse_block_dot(weights, hidden, in_idx, bias, out_idx)
    outputs = [out, tensor.grad(out.sum(), wrt=weights)]

    fn = theano.function([weights, hidden, in_idx, bias, out_idx],
                         outputs,
                         mode=mode_with_gpu)

    outer_nodes = [node for node in fn.maker.fgraph.apply_nodes
                   if isinstance(node.op, GpuSparseBlockOuter)]
    assert len(outer_nodes) == 1
def __init__(self, data, config, fast_predict=False):
    """Create the layers and wire the symbolic tagging graph.

    Parameters
    ----------
    data
        Provides ``embedding_shapes``, ``embeddings``, ``word_dict``,
        ``label_dict`` and ``unk_id``.
    config
        Provides ``lstm_cell``, ``lstm_hidden_size``, ``num_lstm_layers``,
        ``max_grad_norm``, ``input_dropout_prob``, ``per_layer_dropout``
        and ``recurrent_dropout_prob``.
    fast_predict : bool
        Forwarded to every recurrent layer.
    """
    # Hyper-parameters and vocabulary information
    self.embedding_shapes = data.embedding_shapes
    self.lstm_type = config.lstm_cell
    self.lstm_hidden_size = int(config.lstm_hidden_size)
    self.num_lstm_layers = int(config.num_lstm_layers)
    self.max_grad_norm = float(config.max_grad_norm)

    self.vocab_size = data.word_dict.size()
    self.label_space_size = data.label_dict.size()
    self.unk_id = data.unk_id

    # Initialize layers and parameters
    self.embedding_layer = EmbeddingLayer(data.embedding_shapes, data.embeddings)
    self.params = list(self.embedding_layer.params)

    self.rnn_layers = [None] * self.num_lstm_layers
    for depth in range(self.num_lstm_layers):
        # Only the first layer reads the embeddings directly.
        if depth == 0:
            input_dim = self.embedding_layer.output_size
        else:
            input_dim = self.lstm_hidden_size

        # Input dropout on every layer only when per_layer_dropout is set;
        # otherwise on the first layer alone.
        if config.per_layer_dropout or depth == 0:
            input_dropout = config.input_dropout_prob
        else:
            input_dropout = 0.0
        recurrent_dropout = config.recurrent_dropout_prob

        layer_class = get_rnn_layer(self.lstm_type)
        layer = layer_class(input_dim,
                            self.lstm_hidden_size,
                            input_dropout_prob=input_dropout,
                            recurrent_dropout_prob=recurrent_dropout,
                            fast_predict=fast_predict,
                            prefix='lstm_{}'.format(depth))
        self.rnn_layers[depth] = layer
        print(layer)
        self.params.extend(layer.params)

    self.softmax_layer = SoftmaxLayer(self.lstm_hidden_size, self.label_space_size)
    self.params.extend(self.softmax_layer.params)

    # Build model
    # Shape of x: [seq_len, batch_size, num_features]
    self.x0 = tensor.ltensor3('x')
    self.y0 = tensor.lmatrix('y')
    self.mask0 = tensor.matrix('mask', dtype=floatX)
    self.is_train = tensor.bscalar('is_train')

    # Swap the first two axes of every input.
    self.x = self.x0.dimshuffle(1, 0, 2)
    self.y = self.y0.dimshuffle(1, 0)
    self.mask = self.mask0.dimshuffle(1, 0)

    self.inputs = [None] * (self.num_lstm_layers + 1)
    self.inputs[0] = self.embedding_layer.connect(self.x)
    self.rev_mask = self.mask[::-1]

    # Each layer's output is reversed before feeding the next layer, and
    # the mask alternates with it: even layers get the mask, odd layers
    # the reversed mask.
    for depth, rnn in enumerate(self.rnn_layers):
        layer_mask = self.mask if depth % 2 == 0 else self.rev_mask
        outputs = rnn.connect(self.inputs[depth], layer_mask, self.is_train)
        self.inputs[depth + 1] = outputs[::-1]

    self.scores, self.pred = self.softmax_layer.connect(self.inputs[-1])
    # Reshape the predictions to the first two dims of x, then swap them.
    pred_shape = [self.x.shape[0], self.x.shape[1]]
    self.pred0 = self.pred.reshape(pred_shape).dimshuffle(1, 0)
def Xtest_blocksparse_grad_merge(self):
    """The learning-rate update on W is merged into GpuSparseBlockOuter.

    A function compiled with the merge optimizations and one compiled
    without them must leave the same value in the shared ``W``.
    """
    bias = tensor.fmatrix()
    hidden = tensor.ftensor3()
    in_idx = tensor.lmatrix()
    out_idx = tensor.lmatrix()

    W_val, h_val, iIdx_val, b_val, oIdx_val = self.gemv_data()
    W = gpuarray_shared_constructor(W_val, context=test_ctx_name)

    out = gpu_sparse_block_gemv(bias.take(out_idx, axis=0), W, hidden,
                                in_idx, out_idx)
    grad_W = theano.grad(out.sum(), W)

    step = numpy.asarray(0.05, dtype='float32')
    new_W = W - step * grad_W

    sym_inputs = [hidden, in_idx, bias, out_idx]
    run_args = (h_val, iIdx_val, b_val, oIdx_val)

    merged_fn = theano.function(sym_inputs, updates=[(W, new_W)],
                                mode=mode_with_gpu)

    # Make sure the lr update was merged.
    merged_op = merged_fn.maker.fgraph.outputs[0].owner.op
    assert isinstance(merged_op, GpuSparseBlockOuter)

    # Exclude the merge optimizations.
    unmerged_mode = mode_with_gpu.excluding(
        'local_merge_blocksparse_alpha').excluding(
        'local_merge_blocksparse_output')

    unmerged_fn = theano.function(sym_inputs, updates=[(W, new_W)],
                                  mode=unmerged_mode)

    # Make sure the lr update is not merged.
    unmerged_op = unmerged_fn.maker.fgraph.outputs[0].owner.op
    assert not isinstance(unmerged_op, GpuSparseBlockOuter)

    unmerged_fn(*run_args)
    W_ref = W.get_value()

    # reset the var
    W.set_value(W_val)
    merged_fn(*run_args)
    W_opt = W.get_value()

    utt.assert_allclose(W_ref, W_opt)
def Xtest_blocksparse_grad_merge(self):
    """The learning-rate update on W is merged into GpuSparseBlockOuter.

    A function compiled with the merge optimizations and one compiled
    without them must leave the same value in the shared ``W``.
    """
    bias = tensor.fmatrix()
    hidden = tensor.ftensor3()
    in_idx = tensor.lmatrix()
    out_idx = tensor.lmatrix()

    W_val, h_val, iIdx_val, b_val, oIdx_val = self.gemv_data()
    W = float32_shared_constructor(W_val)

    out = gpu_sparse_block_gemv(bias.take(out_idx, axis=0), W, hidden,
                                in_idx, out_idx)
    grad_W = theano.grad(out.sum(), W)

    step = numpy.asarray(0.05, dtype='float32')
    new_W = W - step * grad_W

    sym_inputs = [hidden, in_idx, bias, out_idx]
    run_args = (h_val, iIdx_val, b_val, oIdx_val)

    merged_fn = theano.function(sym_inputs, updates=[(W, new_W)],
                                mode=mode_with_gpu)

    # Make sure the lr update was merged.
    merged_op = merged_fn.maker.fgraph.outputs[0].owner.op
    assert isinstance(merged_op, GpuSparseBlockOuter)

    # Exclude the merge optimizations.
    unmerged_mode = mode_with_gpu.excluding(
        'local_merge_blocksparse_alpha').excluding(
        'local_merge_blocksparse_output')

    unmerged_fn = theano.function(sym_inputs, updates=[(W, new_W)],
                                  mode=unmerged_mode)

    # Make sure the lr update is not merged.
    unmerged_op = unmerged_fn.maker.fgraph.outputs[0].owner.op
    assert not isinstance(unmerged_op, GpuSparseBlockOuter)

    unmerged_fn(*run_args)
    W_ref = W.get_value()

    # reset the var
    W.set_value(W_val)
    merged_fn(*run_args)
    W_opt = W.get_value()

    utt.assert_allclose(W_ref, W_opt)