我们从Python开源项目中,提取了以下36个代码示例,用于说明如何使用theano.tensor.ftensor3()。
def test_get_output_for(self):
    """Compile the key/value attention layer and check its output shape.

    Keys, values and the mask share the leading (32, 13) dimensions while
    the queries have 17 steps; the compiled output is expected to have
    shape (32, 17, 5), i.e. the query steps paired with the values' width.
    """
    n_batch, n_memory, n_queries = 32, 13, 17
    key_dim, value_dim, query_dim = 3, 5, 7

    # Symbolic inputs, listed in the order the compiled function takes them.
    sym_keys = T.ftensor3()
    sym_values = T.ftensor3()
    sym_mask = T.fmatrix()
    sym_queries = T.ftensor3()

    incoming = [
        L.InputLayer((None, None, key_dim), input_var=sym_keys),
        L.InputLayer((None, None, value_dim), input_var=sym_values),
        L.InputLayer((None, None), input_var=sym_mask),
        L.InputLayer((None, None, query_dim), input_var=sym_queries),
    ]
    layer = BahdanauKeyValueAttentionLayer(incoming, 9)
    compiled = theano.function(
        [sym_keys, sym_values, sym_mask, sym_queries],
        L.get_output(layer),
        on_unused_input='warn')

    def random_float32(*shape):
        return np.random.rand(*shape).astype(np.float32)

    keys = random_float32(n_batch, n_memory, key_dim)
    values = random_float32(n_batch, n_memory, value_dim)
    mask = random_float32(n_batch, n_memory)
    queries = random_float32(n_batch, n_queries, query_dim)

    attended = compiled(keys, values, mask, queries)
    self.assertEqual((n_batch, n_queries, value_dim), attended.shape)
def setup(self, bottom, top):
    """Compile the Theano function that computes the Dice index.

    ``bottom`` must hold exactly two inputs (per the error message: the
    softmax result and the ground truth).  The rank of ``bottom[0].data``
    selects the symbolic input type: rank 4 uses matrices, rank 5 uses
    3-tensors.  The compiled function is stored in ``self.f`` and
    ``top[0]`` is reshaped to a single element.

    Raises ``Exception`` for a wrong number of inputs or an unsupported rank.
    """
    # check input pair
    if len(bottom) != 2:
        raise Exception("Need two inputs to compute the dice. the result of the softmax and the ground truth.")

    rank = len(bottom[0].data.shape)
    if rank not in (4, 5):
        raise Exception('DiceIndexLayer only supports 2D or 3D data at the moment.')

    make_input = T.fmatrix if rank == 4 else T.ftensor3
    self.prediction = make_input()
    self.ground_truth = make_input()

    overlap = T.sum(self.prediction * self.ground_truth)
    total = T.sum(self.prediction) + T.sum(self.ground_truth)
    # The small constant keeps the ratio defined when both inputs sum to zero.
    dice = 2 * overlap / (total + 0.00001)

    self.f = theano.function([self.prediction, self.ground_truth], dice)
    top[0].reshape(1)
def __init__(self, config):
    """Build the encoder, attention and image-projection parts from ``config``.

    Required keys: ``max_norm``, ``lr``, ``size_vocab``, ``size`` and
    ``size_target``.  Every other key read below is optional and falls
    back to the default given at its lookup.
    """
    # Must stay the first statement: locals() is captured while it only
    # holds `self` and `config`.
    autoassign(locals())

    option = config.get
    self.margin_size = option('margin_size', 0.2)
    self.updater = util.Adam(max_norm=config['max_norm'], lr=config['lr'])

    vocab_size = config['size_vocab']
    hidden_size = config['size']
    encoder_options = dict(
        filter_length=option('filter_length', 6),
        filter_size=option('filter_size', 1024),
        stride=option('stride', 3),
        depth=option('depth', 1),
        recur_depth=option('recur_depth', 1),
        drop_i=option('drop_i', 0.75),
        drop_s=option('drop_s', 0.25),
        residual=option('residual', False),
        seed=option('seed', 1),
    )
    self.Encode = Encoder(vocab_size, hidden_size, **encoder_options)
    self.Attn = Attention(hidden_size, size=option('size_attn', 512))
    self.ImgEncoder = Dense(config['size_target'], hidden_size)

    # Symbolic placeholders: one float 3-tensor input and a float matrix target.
    self.inputs = [T.ftensor3()]
    self.target = T.fmatrix()
def set_model(self):
    """Create the symbolic inputs and build ``self.model``.

    Sizes are taken from ``self.init_emb`` (input width), ``self.argv``
    (hidden size and regularisation) and ``self.arg_dict`` (output size).
    """
    options = self.argv

    # Network variables
    inputs = T.ftensor3()
    labels = T.imatrix()

    # Collected before the banner is printed, as in the original ordering.
    model_sizes = dict(
        n_in=self.init_emb.shape[1],
        n_h=options.hidden,
        n_y=self.arg_dict.size(),
        reg=options.reg,
    )

    # Build a model
    say('\n\nMODEL: Unit: %s Opt: %s' % (options.unit, options.opt))
    self.model = Model(argv=options, x=inputs, y=labels, **model_sizes)
def test_frag_queue():
    """Run ``QueueManager.queue_transform`` on a small fixed example.

    Compiles the transform together with the gradients of the sum of
    ``res[:, :, 1]`` with respect to both inputs, prints the four results
    and returns them as ``(peek, res, grad_strengths, grad_vects)``.
    """
    strengths_sym = T.fmatrix()
    vects_sym = T.ftensor3()
    peek_sym, res_sym = QueueManager.queue_transform(strengths_sym, vects_sym, True)

    objective = T.sum(res_sym[:, :, 1])
    d_strengths, d_vects = theano.gradient.grad(objective, [strengths_sym, vects_sym])

    compiled = theano.function(
        [strengths_sym, vects_sym],
        [peek_sym, res_sym, d_strengths, d_vects],
        allow_input_downcast=True)

    # Two identical batch rows of strengths, and a stacked identity matrix.
    row = [0.3, 0.3, 0.2, 0.6, 0.3, 0.7, 0.2, 1]
    strengths = np.array([row, list(row)], np.float32)
    vects = np.tile(np.eye(8, dtype=np.float32), (2, 1, 1))

    peek, res, grad_s, grad_v = compiled(strengths, vects)
    for item in (peek, res, grad_s, grad_v):
        print(item)
    return peek, res, grad_s, grad_v
def test_Strides3D(self):
    """Check GPU cumsum on strided views of a 3D float32 array.

    For every axis, every combination of normal, stepped and reversed
    slicing is applied both symbolically (inside the graph) and to the
    numpy input, and the results are compared with ``np.cumsum``.
    """
    x = T.ftensor3('x')
    stride_kinds = [
        slice(None, None, None),  # Normal strides
        slice(None, None, 2),     # Stepped strides
        slice(None, None, -1),    # Negative strides
    ]

    for axis in [0, 1, 2, None, -1, -2, -3]:
        a = np.random.random((42, 30, 25)).astype("float32")
        cumsum_function = theano.function([x], cumsum(x, axis=axis), mode=self.mode)

        # Cartesian product of all slicings to test.
        for slicing in itertools.product(stride_kinds, repeat=x.ndim):
            f = theano.function([x], cumsum(x[slicing], axis=axis), mode=self.mode)
            # The compiled graph must contain at least one GpuCumsum node.
            assert any(isinstance(node.op, GpuCumsum)
                       for node in f.maker.fgraph.toposort())

            expected = np.cumsum(a[slicing], axis=axis)
            utt.assert_allclose(expected, f(a))
            utt.assert_allclose(np.cumsum(a[slicing], axis=axis),
                                cumsum_function(a[slicing]))
def test_blocksparse_inplace_outer_opt():
    """Check whether the final outer op is in-place for the active mode.

    In ``FAST_COMPILE`` mode the last node of the compiled graph must not
    be in-place; in any other mode it must be.  The stack trace check is
    run against the matching op in each case.
    """
    bias = tensor.fmatrix()
    weights = tensor.ftensor4()
    hidden = tensor.ftensor3()
    in_idx = tensor.lmatrix()
    out_idx = tensor.lmatrix()

    out = sparse_block_dot(weights, hidden, in_idx, bias, out_idx)
    fn = theano.function(
        [weights, hidden, in_idx, bias, out_idx],
        [out, tensor.grad(out.sum(), wrt=weights)])

    last_op = fn.maker.fgraph.toposort()[-1].op
    if theano.config.mode == "FAST_COMPILE":
        assert not last_op.inplace
        expected_op = sparse_block_outer
    else:
        assert last_op.inplace
        expected_op = sparse_block_outer_inplace
    assert check_stack_trace(fn, ops_to_check=expected_op)
def test_sparseblockdot(self):
    """
    Check sparse_block_dot against the numpy sparseblockgemv reference.
    """
    bias = tensor.fmatrix()
    weights = tensor.ftensor4()
    hidden = tensor.ftensor3()
    in_idx = tensor.imatrix()
    out_idx = tensor.imatrix()

    graph = sparse_block_dot(weights, hidden, in_idx, bias, out_idx)
    fn = theano.function([weights, hidden, in_idx, bias, out_idx], graph,
                         mode=self.mode)

    (W_val, h_val, iIdx_val,
     b_val, oIdx_val) = BlockSparse_Gemv_and_Outer.gemv_data()

    actual = fn(W_val, h_val, iIdx_val, b_val, oIdx_val)
    # The reference takes the bias rows already gathered by output index.
    gathered_bias = b_val.take(oIdx_val, axis=0)
    expected = BlockSparse_Gemv_and_Outer.gemv_numpy(
        gathered_bias, W_val, h_val, iIdx_val, oIdx_val)

    utt.assert_allclose(expected, actual)
def test_sparseblockgemv(self):
    """
    Check the op under test (``self.gemv_op``) against the numpy reference.
    """
    bias = tensor.fmatrix()
    weights = tensor.ftensor4()
    hidden = tensor.ftensor3()
    in_idx = tensor.imatrix()
    out_idx = tensor.imatrix()

    # The op receives the bias rows already selected by output index.
    graph = self.gemv_op(bias.take(out_idx, axis=0), weights, hidden,
                         in_idx, out_idx)
    fn = theano.function([weights, hidden, in_idx, bias, out_idx], graph,
                         mode=self.mode)

    (W_val, h_val, iIdx_val,
     b_val, oIdx_val) = BlockSparse_Gemv_and_Outer.gemv_data()

    actual = fn(W_val, h_val, iIdx_val, b_val, oIdx_val)
    expected = BlockSparse_Gemv_and_Outer.gemv_numpy(
        b_val.take(oIdx_val, axis=0), W_val, h_val, iIdx_val, oIdx_val)

    utt.assert_allclose(expected, actual)
def test_sparseblockgemv_grad_shape(self):
    """Check that gradients of the gemv op have the shapes of their inputs.

    Only shapes are verified: the gradients with respect to ``b``, ``W``
    and ``h`` must match the shapes of the corresponding test values.
    """
    bias = tensor.fmatrix()
    weights = tensor.ftensor4()
    hidden = tensor.ftensor3()
    in_idx = tensor.imatrix()
    out_idx = tensor.imatrix()

    out = self.gemv_op(bias.take(out_idx, axis=0), weights, hidden,
                       in_idx, out_idx)
    gradients = theano.grad(out.sum(), [bias, weights, hidden])
    fn = theano.function([weights, hidden, in_idx, bias, out_idx], gradients,
                         mode=self.mode)

    (W_val, h_val, iIdx_val,
     b_val, oIdx_val) = BlockSparse_Gemv_and_Outer.gemv_data()

    # just make sure that it runs correctly and all the shapes are ok.
    b_grad, W_grad, h_grad = fn(W_val, h_val, iIdx_val, b_val, oIdx_val)

    assert b_grad.shape == b_val.shape
    assert h_grad.shape == h_val.shape
    assert W_grad.shape == W_val.shape
def test_bugFunctioProvidesIntermediateNodesAsInputs(self):
    """Regression test: an intermediate node used as a function input.

    The bug showed up as a failure to compile, so the test passes as soon
    as ``theano.function`` returns.
    """
    # This is a bug recently reported by Ilya; made CPU friendly.
    sequence = tensor.ftensor3('INPUT')
    param = tensor.fmatrix('PARAM')
    # The report used gpu_from_host(param); adding a constant has the same
    # effect (the value becomes an intermediate node) and also works on CPU.
    # Using the plain fmatrix directly did not trigger the problem.
    shifted = param + 2

    def one_step(v, W):
        # Using W inside the step is what triggered the failure;
        # `v + 1` alone compiled fine.
        return v + 1 + W.sum()

    per_step, updates = theano.scan(
        fn=one_step,
        sequences=sequence,
        outputs_info=[None],
        non_sequences=[shifted])
    total = per_step.sum() + shifted.sum()

    # Note that the intermediate node `shifted` is listed as an input.
    f = theano.function([sequence, shifted], total)
def test_batched_dot_errors(self):
    """Check that batched_dot raises RuntimeError on incompatible shapes."""

    def fail(a_shp, b_shp):
        # Compile and run batched_dot on random inputs of the given shapes.
        a = numpy.random.randn(*a_shp).astype(numpy.float32)
        b = numpy.random.randn(*b_shp).astype(numpy.float32)

        x = tensor.ftensor3()
        y = tensor.ftensor3()
        compiled = theano.function([x, y], batched_dot(x, y),
                                   mode=mode_with_gpu)
        compiled(a, b)

    incompatible = [
        ((5, 4, 3), (6, 3, 2)),  # Different batch size
        ((5, 4, 3), (5, 2, 2)),  # Shape mismatch
    ]
    for a_shp, b_shp in incompatible:
        self.assertRaises(RuntimeError, fail, a_shp, b_shp)
def setup(self, bottom, top): # check input pair if len(bottom) != 2: raise Exception("Need two inputs to compute the dice. the result of the softmax and the ground truth.") try : params = eval(self.param_str) if "param1" in params : self.ignore_label = int(params["param1"]) except : pass if len(bottom[0].data.shape)==4 : self.prediction = T.fmatrix() self.ground_truth = T.fmatrix() elif len(bottom[0].data.shape)==5 : self.prediction = T.ftensor3() self.ground_truth = T.ftensor3() else: raise Exception('DiceIndexLayer only supports 2D or 3D data at the moment.') intersection = T.sum(self.prediction * self.ground_truth) denominator = T.sum(self.prediction) + T.sum(self.ground_truth) dice = 1 - 2 * intersection / (denominator + 0.00001) self.f = theano.function([self.prediction, self.ground_truth], dice) grad = T.grad(dice, wrt=self.prediction) self.g = theano.function([self.prediction, self.ground_truth], grad)
def run(self):
    """Compile and run every registered network on random input.

    ``self.dict_of_test`` maps a network object (exposing ``prefix`` and
    ``fprop``) to a sequence whose first element is the input shape.  The
    number of dimensions of that shape (2 to 5) selects the symbolic
    input type.  The output shape of each network is printed.

    When ``self.mode`` equals ``'no_crash'``, an exception raised while
    running the compiled function is reported and the loop continues;
    otherwise it propagates.
    """
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print("Starting tests...")
    print("")
    for feedforward, test_info in self.dict_of_test.items():
        input_shape = test_info[0]
        rank = len(input_shape)
        if rank == 5:
            dtensor5 = T.TensorType('float32', (False,) * 5)
            x = dtensor5('x')
        elif rank == 4:
            x = T.ftensor4('x')
        elif rank == 3:
            x = T.ftensor3('x')
        elif rank == 2:
            x = T.fmatrix('x')

        print("Testing " + feedforward.prefix)
        out = feedforward.fprop(x)
        f = theano.function([x], out)
        npx = np.random.random(input_shape).astype(np.float32)

        # Compare by value: `is` on a string literal tests object identity,
        # which only works by accident of interning.
        if self.mode == 'no_crash':
            try:
                out_shape = f(npx).shape
                print(out_shape)
            except Exception:
                print("Error encountered in this network")
        else:
            out_shape = f(npx).shape
            print(out_shape)
        print("")

    print("Finished")
    print("")
def make_node(self, x, x2, x3, x4, x5):
    """Create the Apply node for this op: five inputs, four outputs.

    The first two output types depend on ``prm.att_doc`` (one extra
    dimension when set) and, for the first output, on ``prm.compute_emb``
    (integer type when set, float otherwise).  The last two outputs are
    always a float matrix and an int vector.
    """
    # check that the theano version has support for __props__.
    # The single-underscore name below looks like a typo, but it is the
    # way used here to detect a theano version recent enough for __props__.
    assert hasattr(self, '_props'), "Your version of theano is too old to support __props__."

    inputs = [tensor.as_tensor_variable(v) for v in (x, x2, x3, x4, x5)]

    if prm.att_doc:
        first_ctor = tensor.itensor4 if prm.compute_emb else tensor.ftensor4
        second_ctor = tensor.ftensor3
    else:
        first_ctor = tensor.itensor3 if prm.compute_emb else tensor.ftensor3
        second_ctor = tensor.fmatrix

    outputs = [
        first_ctor().type(),
        second_ctor().type(),
        tensor.fmatrix().type(),
        tensor.ivector().type(),
    ]
    return theano.Apply(self, inputs, outputs)
def setup_decode(self):
    """Compile ``self.decode_fun`` and ``self.decode_visualize_fun``.

    Both functions take ``(chord_roots, chord_types, feat_strengths,
    feat_vects)``.  ``decode_fun`` returns the chosen outputs only;
    ``decode_visualize_fun`` additionally returns the probabilities, the
    individual probabilities and the transformed features.  NaN guarding
    is enabled when ``self.nanguard`` is set.
    """
    chord_types = T.btensor3()     # dimensions: (batch, time, 12)
    chord_roots = T.imatrix()      # dimensions: (batch, time)
    feat_strengths = T.fmatrix()   # dimensions: (batch, time)
    feat_vects = T.ftensor3()      # dimensions: (batch, time, feature_size)
    n_batch, n_time = chord_roots.shape

    features = QueueManager.queue_transform(feat_strengths, feat_vects)

    specs = []
    for lstmstack, encoding in zip(self.dec_lstmstacks, self.encodings):
        start_pos = T.alloc(
            np.array(encoding.STARTING_POSITION, np.int32), n_batch)
        specs.append(lstmstack.prepare_sample_scan(
            start_pos=start_pos,
            start_out=T.tile(encoding.initial_encoded_form(), (n_batch, 1)),
            timestep=T.tile(T.arange(n_time), (n_batch, 1)),
            cur_chord_type=chord_types,
            cur_chord_root=chord_roots,
            cur_feature=features,
            deterministic_dropout=True))

    updates, all_chosen, all_probs, indiv_probs = helper_generate_from_spec(
        specs, self.dec_lstmstacks, self.encodings, self.srng,
        n_batch, n_time, self.bounds)

    def compile_mode():
        # A fresh mode object per compiled function, or None for the default.
        if self.nanguard:
            return NanGuardMode(nan_is_error=True, inf_is_error=True,
                                big_is_error=True)
        return None

    self.decode_fun = theano.function(
        inputs=[chord_roots, chord_types, feat_strengths, feat_vects],
        updates=updates,
        outputs=all_chosen,
        allow_input_downcast=True,
        mode=compile_mode())

    visualize_outputs = [all_chosen, all_probs] + indiv_probs + [features]
    self.decode_visualize_fun = theano.function(
        inputs=[chord_roots, chord_types, feat_strengths, feat_vects],
        updates=updates,
        outputs=visualize_outputs,
        allow_input_downcast=True,
        mode=compile_mode())
def _setup_vars(self, sparse_input):
    '''Create the symbolic input variables of this recurrent network.

    Parameters
    ----------
    sparse_input : bool
        Must be false; sparse inputs are rejected for recurrent models.

    Returns
    -------
    vars : list of theano variables
        ``[src, src_mask, dst, labels, weights]`` when the network is
        weighted, otherwise ``[src, dst]``.
    '''
    _warn_dimshuffle()

    assert not sparse_input, 'Theanets does not support sparse recurrent models!'

    # Each variable is stored on the instance under its own theano name.
    # The source mask is a TT.matrix rather than an integer matrix.
    declarations = (
        ('src', TT.ftensor3),
        ('src_mask', TT.matrix),
        ('dst', TT.ftensor3),
        ('labels', TT.imatrix),
        ('weights', TT.matrix),
    )
    for name, constructor in declarations:
        setattr(self, name, constructor(name))

    if not self.weighted:
        return [self.src, self.dst]
    return [self.src, self.src_mask, self.dst, self.labels, self.weights]
def test_sparseblockgemvF(self):
    """
    Run the gemv op with W in Fortran order (which can happen in the
    gradient of some graphs) and compare with the numpy reference.
    """
    bias = tensor.fmatrix()
    weights = tensor.ftensor4()
    hidden = tensor.ftensor3()
    in_idx = tensor.imatrix()
    out_idx = tensor.imatrix()

    gathered_bias = bias.take(out_idx, axis=0)
    # Swap the last two axes inside the graph; the value fed at run time
    # is swapped as well, so the op sees the original layout through a
    # transposed view.
    swap_last_two = tensor.DimShuffle((False, False, False, False),
                                      (0, 1, 3, 2))
    weights_view = swap_last_two(tensor.as_tensor_variable(weights))
    graph = self.gemv_op(gathered_bias, weights_view, hidden, in_idx, out_idx)

    fn = theano.function([weights, hidden, in_idx, bias, out_idx], graph,
                         mode=self.mode)

    (W_val, h_val, iIdx_val,
     b_val, oIdx_val) = BlockSparse_Gemv_and_Outer.gemv_data()

    actual = fn(numpy.swapaxes(W_val, 2, 3), h_val, iIdx_val, b_val, oIdx_val)
    expected = BlockSparse_Gemv_and_Outer.gemv_numpy(
        b_val.take(oIdx_val, axis=0), W_val, h_val, iIdx_val, oIdx_val)

    utt.assert_allclose(expected, actual)
def test_dot_infershape(self):
    """Run ``_compile_and_check`` on a ``sparse_block_dot`` graph."""
    bias = tensor.fmatrix()
    weights = tensor.ftensor4()
    hidden = tensor.ftensor3()
    in_idx = tensor.imatrix()
    out_idx = tensor.imatrix()

    graph_inputs = [weights, hidden, in_idx, bias, out_idx]
    graph_outputs = [sparse_block_dot(weights, hidden, in_idx, bias, out_idx)]
    self._compile_and_check(graph_inputs, graph_outputs,
                            self.gemv_data(), self.gemv_class)
def test_gemv_infershape(self):
    """Run ``_compile_and_check`` on a graph built from ``self.gemv_op``."""
    bias = tensor.fmatrix()
    weights = tensor.ftensor4()
    hidden = tensor.ftensor3()
    in_idx = tensor.imatrix()
    out_idx = tensor.imatrix()

    graph_inputs = [weights, hidden, in_idx, bias, out_idx]
    # The op takes the bias rows gathered by output index.
    graph_outputs = [self.gemv_op(bias.take(out_idx, axis=0), weights,
                                  hidden, in_idx, out_idx)]
    self._compile_and_check(graph_inputs, graph_outputs,
                            self.gemv_data(), self.gemv_class)
def test_outer_infershape(self):
    """Run ``_compile_and_check`` on a graph built from ``self.outer_op``."""
    accumulator = tensor.ftensor4()
    left = tensor.ftensor3()
    right = tensor.ftensor3()
    left_idx = tensor.imatrix()
    right_idx = tensor.imatrix()

    graph_inputs = [accumulator, left, right, left_idx, right_idx]
    graph_outputs = [self.outer_op(accumulator, left, right,
                                   left_idx, right_idx)]
    self._compile_and_check(graph_inputs, graph_outputs,
                            self.outer_data(), self.outer_class)
def test_memory_reuse_gpudimshuffle(self):
    """Check scan's memory pre-allocation with a GpuDimshuffle output.

    An optimization in GpuDimshuffle can interfere with pre-allocation by
    making scan wrongly believe that a pre-allocated region has been
    used when it has not.  The scan is run on all-ones input and both
    flattened outputs are compared with known values.
    """

    def inner_fn(seq1, recurrent_out):
        temp = seq1 + recurrent_out.sum()
        shuffled = temp.dimshuffle(1, 0)
        carried = temp.sum() + recurrent_out
        return shuffled, carried

    sequence = theano.tensor.ftensor3()
    initial = theano.tensor.ftensor3()

    # The first output is not recurrent; the second starts from `initial`.
    results, _ = theano.scan(inner_fn,
                             sequences=[sequence],
                             outputs_info=[None, initial],
                             mode=self.mode_with_gpu)

    flat_first = results[0].flatten()
    flat_second = results[1].flatten()
    fct = theano.function([sequence, initial], [flat_first, flat_second],
                          mode=self.mode_with_gpu)

    output = fct(numpy.ones((2, 1, 1), dtype="float32"),
                 numpy.ones((1, 1, 1), dtype="float32"))
    expected_output = (numpy.array([2, 4], dtype="float32"),
                       numpy.array([3, 7], dtype="float32"))
    utt.assert_allclose(output, expected_output)
def test_blocksparse_gpu_outer_opt():
    """Check that the gradient graph holds exactly one GpuSparseBlockOuter."""
    bias = tensor.fmatrix()
    weights = tensor.ftensor4()
    hidden = tensor.ftensor3()
    in_idx = tensor.lmatrix()
    out_idx = tensor.lmatrix()

    out = sparse_block_dot(weights, hidden, in_idx, bias, out_idx)
    fn = theano.function(
        [weights, hidden, in_idx, bias, out_idx],
        [out, tensor.grad(out.sum(), wrt=weights)],
        mode=mode_with_gpu)

    outer_nodes = [node for node in fn.maker.fgraph.apply_nodes
                   if isinstance(node.op, GpuSparseBlockOuter)]
    assert len(outer_nodes) == 1
def __init__(self, computeGradient = True):
    """Create the symbolic CTC cost and, optionally, gradient variables.

    ``self.costs`` is always created as a float vector.
    ``self.gradients`` (a float 3-tensor) is only created when
    ``computeGradient`` is true.
    """
    super(CpuCtc, self).__init__()
    self.computeGradient = computeGradient
    self.costs = T.fvector(name="ctc_cost")
    if not self.computeGradient:
        return
    self.gradients = T.ftensor3(name="ctc_grad")
def name_model():
    """Build an LSTM model over ``CHARKEY`` with train and sample functions.

    Returns
    -------
    (layer1, layer2, train_fn, gen_fn)
        The LSTM layer, the output layer, a function taking a float
        3-tensor of training data and returning the loss while applying
        Adam updates, and a function taking a length and returning that
        many sampled steps.
    """
    LSTM_SIZE = 300
    n_symbols = len(CHARKEY)

    layer1 = LSTM(n_symbols, LSTM_SIZE, activation=T.tanh)
    layer2 = Layer(LSTM_SIZE, n_symbols, activation=lambda x: x)
    params = layer1.params + [layer1.initial_hidden_state] + layer2.params

    def _advance(last_out, last_state):
        # One step through both layers: new LSTM state and pre-softmax output.
        new_state = layer1.activate(last_out, last_state)
        hidden = layer1.postprocess_activation(new_state)
        return new_state, layer2.activate(hidden)

    ################# Train #################
    train_data = T.ftensor3()
    n_batch = train_data.shape[0]
    # The input is the target shifted one step right, starting with zeros.
    leading_zeros = T.zeros([n_batch, 1, n_symbols])
    train_input = T.concatenate([leading_zeros, train_data[:, :-1, :]], 1)

    def _scan_train(last_out, last_state):
        new_state, pre_softmax = _advance(last_out, last_state)
        return T.nnet.softmax(pre_softmax), new_state

    # Scan iterates over the first axis, so time is moved to the front.
    (scan_outputs, scan_states), _ = theano.scan(
        _scan_train,
        sequences=[train_input.dimshuffle([1, 0, 2])],
        outputs_info=[None, initial_state(layer1, n_batch)])

    flat_predictions = scan_outputs.dimshuffle([1, 0, 2]).reshape([-1, n_symbols])
    flat_targets = train_data.reshape([-1, n_symbols])
    crossentropy = T.nnet.categorical_crossentropy(flat_predictions, flat_targets)
    loss = T.sum(crossentropy) / T.cast(n_batch, 'float32')

    train_fn = theano.function([train_data], loss, updates=Adam(loss, params))

    ################# Eval #################
    length = T.iscalar()
    srng = MRG_RandomStreams(np.random.randint(1, 1024))

    def _scan_gen(last_out, last_state):
        new_state, pre_softmax = _advance(last_out, last_state)
        probabilities = T.nnet.softmax(T.shape_padleft(pre_softmax))
        drawn = srng.multinomial(n=1, pvals=probabilities)[0, :]
        return T.cast(drawn, 'float32'), new_state

    generation_start = [np.zeros([n_symbols], np.float32),
                        layer1.initial_hidden_state]
    (generated, generated_states), updates = theano.scan(
        _scan_gen, n_steps=length, outputs_info=generation_start)

    gen_fn = theano.function([length], generated, updates=updates)

    return layer1, layer2, train_fn, gen_fn
def test_GpuCumsum3D(self):
    """Compare GPU cumsum with ``np.cumsum`` on 3D float32 inputs.

    For each accumulation axis the compiled function is checked on every
    prefix length up to 1025 along that axis, on sizes exceeding one
    thread block, on sizes exceeding the grid limit along the other two
    axes, and on a size large enough to need the recursive code path.
    """
    block_max_size = self.max_threads_dim0 * 2

    x = T.ftensor3('x')
    for shape_axis, axis in zip([0, 1, 2, 0, 2, 1, 0],
                                [0, 1, 2, None, -1, -2, -3]):
        f = theano.function([x], cumsum(x, axis=axis), mode=self.mode)
        assert [n for n in f.maker.fgraph.toposort()
                if isinstance(n.op, GpuCumsum)]

        # Extensive testing for the first 1025 sizes
        a_shape = [5, 5, 5]
        a_shape[shape_axis] = 1025
        a = np.random.rand(*a_shape).astype("float32")
        slices = [slice(None), slice(None), slice(None)]
        for i in xrange(a.shape[shape_axis]):
            slices[shape_axis] = slice(i)
            # Index with a tuple: indexing an array with a *list* of
            # slices is deprecated in NumPy and no longer means
            # "one slice per axis" in recent releases.
            index = tuple(slices)
            fa = f(a[index])
            npa = np.cumsum(a[index], axis=axis)
            utt.assert_allclose(npa, fa)

        # Use multiple GPU threadblocks (along accumulation axis)
        a_shape = [2, 2, 2]
        a_shape[shape_axis] = block_max_size + 2
        a = np.random.random(a_shape).astype("float32")
        utt.assert_allclose(np.cumsum(a, axis=axis), f(a))

        # Use multiple GPU gridblocks (not along accumulation axis)
        a_shape = [5, 5, 5]
        a_shape[(shape_axis + 1) % 3] = self.max_grid_size1 + 1
        a = np.random.random(a_shape).astype("float32")
        if axis is None:
            # Avoid floating point error
            a = np.sign(a - 0.5).astype("float32")
        utt.assert_allclose(np.cumsum(a, axis=axis), f(a))

        a_shape = [5, 5, 5]
        a_shape[(shape_axis + 2) % 3] = self.max_grid_size1 + 1
        a = np.random.random(a_shape).astype("float32")
        if axis is None:
            # Avoid floating point error
            a = np.sign(a - 0.5).astype("float32")
        utt.assert_allclose(np.cumsum(a, axis=axis), f(a))

        # Use recursive cumsum (along accumulation axis)
        a_shape = [3, 3, 3]
        a_shape[shape_axis] = block_max_size * (block_max_size + 1) + 2
        a = np.random.random(a_shape).astype("float32")
        a = np.sign(a - 0.5).astype("float32")  # Avoid floating point error
        utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
def Xtest_blocksparse_grad_merge(self):
    """Check that merging the learning-rate update leaves the result unchanged.

    The same update of the shared weights is compiled twice: once with
    the default GPU mode, where the update must be merged into a
    GpuSparseBlockOuter, and once with the merge optimizations excluded.
    Both functions are run from the same starting weights and the
    resulting weights are compared.
    """
    bias = tensor.fmatrix()
    hidden = tensor.ftensor3()
    in_idx = tensor.lmatrix()
    out_idx = tensor.lmatrix()

    W_val, h_val, iIdx_val, b_val, oIdx_val = self.gemv_data()
    weights = gpuarray_shared_constructor(W_val, context=test_ctx_name)

    out = gpu_sparse_block_gemv(bias.take(out_idx, axis=0), weights, hidden,
                                in_idx, out_idx)
    weight_grad = theano.grad(out.sum(), weights)

    lr = numpy.asarray(0.05, dtype='float32')
    new_weights = weights - lr * weight_grad

    merged_fn = theano.function([hidden, in_idx, bias, out_idx],
                                updates=[(weights, new_weights)],
                                mode=mode_with_gpu)

    # Make sure the lr update was merged.
    assert isinstance(merged_fn.maker.fgraph.outputs[0].owner.op,
                      GpuSparseBlockOuter)

    # Exclude the merge optimizations.
    unmerged_mode = mode_with_gpu.excluding(
        'local_merge_blocksparse_alpha').excluding(
        'local_merge_blocksparse_output')
    unmerged_fn = theano.function([hidden, in_idx, bias, out_idx],
                                  updates=[(weights, new_weights)],
                                  mode=unmerged_mode)

    # Make sure the lr update is not merged.
    assert not isinstance(unmerged_fn.maker.fgraph.outputs[0].owner.op,
                          GpuSparseBlockOuter)

    unmerged_fn(h_val, iIdx_val, b_val, oIdx_val)
    W_ref = weights.get_value()

    # reset the var
    weights.set_value(W_val)
    merged_fn(h_val, iIdx_val, b_val, oIdx_val)
    W_opt = weights.get_value()

    utt.assert_allclose(W_ref, W_opt)
def test_dnn_batchnorm_train():
    """Compare cuDNN training batch normalization with a Theano reference.

    For both modes and every tensor rank from vector to 5D, the forward
    outputs (result, mean, inverse std) and the gradients with respect
    to x, scale and bias are compared between the cuDNN op and an
    equivalent graph built from basic operations.  Skipped when cuDNN is
    unavailable or older than v5.
    """
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    if dnn.version(raises=False) < 5000:
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()

    tensor_types = (T.ftensor5, T.ftensor4, T.ftensor3, T.fmatrix, T.fvector)
    shapes = ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5))

    for mode in ('per-activation', 'spatial'):
        for vartype in tensor_types:
            x = vartype('x')
            scale = vartype('scale')
            bias = vartype('bias')
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used

            # forward pass
            out, x_mean, x_invstd = dnn.dnn_batch_normalization_train(
                x, scale, bias, mode, eps)

            # reference forward pass
            if mode == 'per-activation':
                axes = (0,)
            elif mode == 'spatial':
                axes = (0,) + tuple(range(2, ndim))
            ref_mean = x.mean(axis=axes, keepdims=True)
            ref_invstd = T.inv(T.sqrt(x.var(axis=axes, keepdims=True) + eps))
            ref_scale = T.addbroadcast(scale, *axes)
            ref_bias = T.addbroadcast(bias, *axes)
            ref_out = (x - ref_mean) * (ref_scale * ref_invstd) + ref_bias

            # backward pass, then reference backward pass
            dy = vartype('dy')
            wrt = [x, scale, bias]
            grads = T.grad(None, wrt=wrt, known_grads={out: dy})
            ref_grads = T.grad(None, wrt=wrt, known_grads={ref_out: dy})

            # compile
            all_outputs = ([out, x_mean, x_invstd,
                            ref_out, ref_mean, ref_invstd] +
                           grads + ref_grads)
            f = theano.function([x, scale, bias, dy], all_outputs,
                                mode=mode_with_gpu)

            # run
            for full_shape in shapes:
                data_shape = full_shape[:ndim]
                # Parameters have size 1 along every normalized axis.
                param_shape = tuple(1 if d in axes else s
                                    for d, s in enumerate(data_shape))
                X = 4 + 3 * numpy.random.randn(*data_shape).astype('float32')
                Dy = -1 + 2 * numpy.random.randn(*data_shape).astype('float32')
                Scale = numpy.random.randn(*param_shape).astype('float32')
                Bias = numpy.random.randn(*param_shape).astype('float32')

                (got_out, got_mean, got_invstd,
                 want_out, want_mean, want_invstd,
                 got_dx, got_dscale, got_dbias,
                 want_dx, want_dscale, want_dbias) = f(X, Scale, Bias, Dy)

                # compare outputs
                utt.assert_allclose(got_out, want_out)
                utt.assert_allclose(got_mean, want_mean)
                utt.assert_allclose(got_invstd, want_invstd)
                # compare gradients
                utt.assert_allclose(got_dx, want_dx, atol=1e-4)
                utt.assert_allclose(got_dscale, want_dscale,
                                    rtol=2e-4, atol=1e-4)
                utt.assert_allclose(got_dbias, want_dbias)
def test_batchnorm_inference():
    """Compare cuDNN inference batch normalization with a Theano reference.

    For both modes and every tensor rank from vector to 5D, the output
    and the gradients with respect to x, scale, bias, mean and var are
    compared between the cuDNN op and an equivalent graph built from
    basic operations.  Skipped when cuDNN is unavailable or older than v5.
    """
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    if dnn.version(raises=False) < 5000:
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()

    tensor_types = (T.ftensor5, T.ftensor4, T.ftensor3, T.fmatrix, T.fvector)
    shapes = ((10, 20, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5))

    for mode in ('per-activation', 'spatial'):
        for vartype in tensor_types:
            x = vartype('x')
            scale = vartype('scale')
            bias = vartype('bias')
            mean = vartype('mean')
            var = vartype('var')
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used

            # forward pass
            out = dnn.dnn_batch_normalization_test(x, scale, bias, mean,
                                                   var, mode, eps)

            # reference forward pass
            if mode == 'per-activation':
                axes = (0,)
            elif mode == 'spatial':
                axes = (0,) + tuple(range(2, ndim))
            ref_scale = T.addbroadcast(scale, *axes)
            ref_bias = T.addbroadcast(bias, *axes)
            ref_mean = T.addbroadcast(mean, *axes)
            ref_var = T.addbroadcast(var, *axes)
            ref_out = ((x - ref_mean) * (ref_scale / T.sqrt(ref_var + eps)) +
                       ref_bias)

            # backward pass, then reference backward pass
            dy = vartype('dy')
            wrt = [x, scale, bias, mean, var]
            grads = T.grad(None, wrt=wrt, known_grads={out: dy})
            ref_grads = T.grad(None, wrt=wrt, known_grads={ref_out: dy})

            # compile
            f = theano.function([x, scale, bias, mean, var, dy],
                                [out, ref_out] + grads + ref_grads,
                                mode=mode_with_gpu)

            # run
            for full_shape in shapes:
                data_shape = full_shape[:ndim]
                # Parameters have size 1 along every normalized axis.
                param_shape = tuple(1 if d in axes else s
                                    for d, s in enumerate(data_shape))
                X = 4 + 3 * numpy.random.randn(*data_shape).astype('float32')
                Dy = -1 + 2 * numpy.random.randn(*data_shape).astype('float32')
                Scale = numpy.random.randn(*param_shape).astype('float32')
                Bias = numpy.random.randn(*param_shape).astype('float32')
                Mean = numpy.random.randn(*param_shape).astype('float32')
                Var = numpy.random.rand(*param_shape).astype('float32')

                (got_out, want_out,
                 got_dx, got_dscale, got_dbias, got_dmean, got_dvar,
                 want_dx, want_dscale, want_dbias, want_dmean,
                 want_dvar) = f(X, Scale, Bias, Mean, Var, Dy)

                # compare outputs
                utt.assert_allclose(got_out, want_out)
                # compare gradients
                utt.assert_allclose(got_dx, want_dx, atol=4e-5)
                utt.assert_allclose(got_dscale, want_dscale, atol=4e-5)
                utt.assert_allclose(got_dbias, want_dbias)
                utt.assert_allclose(got_dmean, want_dmean)
                utt.assert_allclose(got_dvar, want_dvar,
                                    rtol=2e-3, atol=4e-5)
def test_GpuCumsum3D(self):
    """Compare GPU cumsum with ``np.cumsum`` on 3D float32 inputs.

    For each accumulation axis the compiled function is checked on every
    prefix length up to 1025 along that axis, on sizes exceeding one
    thread block, on sizes exceeding the grid limit along each of the
    other two axes, and on a size large enough to need the recursive
    code path.
    """
    block_max_size = self.max_threads_dim0 * 2

    x = T.ftensor3('x')
    for shape_axis, axis in zip([0, 1, 2, 0, 2, 1, 0],
                                [0, 1, 2, None, -1, -2, -3]):
        f = theano.function([x], cumsum(x, axis=axis), mode=self.mode)
        assert [n for n in f.maker.fgraph.toposort()
                if isinstance(n.op, GpuCumsum)]

        # Extensive testing for the first 1025 sizes
        a_shape = [5, 5, 5]
        a_shape[shape_axis] = 1025
        a = np.random.rand(*a_shape).astype("float32")
        slices = [slice(None), slice(None), slice(None)]
        for i in xrange(a.shape[shape_axis]):
            slices[shape_axis] = slice(i)
            # A tuple selects one slice per axis; a list of slices used
            # as an index is deprecated in NumPy and is no longer
            # interpreted that way in recent releases.
            prefix = a[tuple(slices)]
            fa = f(prefix)
            npa = np.cumsum(prefix, axis=axis)
            utt.assert_allclose(npa, fa)

        # Use multiple GPU threadblocks (along accumulation axis)
        a_shape = [2, 2, 2]
        a_shape[shape_axis] = block_max_size + 2
        a = np.random.random(a_shape).astype("float32")
        utt.assert_allclose(np.cumsum(a, axis=axis), f(a))

        # Use multiple GPU gridblocks (not along accumulation axis):
        # once for each of the two remaining axes.
        for offset in (1, 2):
            a_shape = [5, 5, 5]
            a_shape[(shape_axis + offset) % 3] = self.max_grid_size1 + 1
            a = np.random.random(a_shape).astype("float32")
            if axis is None:
                # Avoid floating point error
                a = np.sign(a - 0.5).astype("float32")
            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))

        # Use recursive cumsum (along accumulation axis)
        a_shape = [3, 3, 3]
        a_shape[shape_axis] = block_max_size * (block_max_size + 1) + 2
        a = np.random.random(a_shape).astype("float32")
        a = np.sign(a - 0.5).astype("float32")  # Avoid floating point error
        utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
def test_batched_dot_correctness(self):
    """Compare GpuBatchedDot with a numpy reference for many shapes.

    Both stream thresholds (0 and 100) are exercised.  Each shape pair is
    run once with host arrays and once with arrays already wrapped as
    ``CudaNdarray``; both results must match the broadcast-multiply-and-
    sum reference computed with numpy.
    """
    # test both implementations
    for threshold in [0, 100]:
        batched_dot = GpuBatchedDot(stream_threshold=threshold)

        # Named `check` rather than `cmp` to avoid shadowing the builtin.
        def check(a_shp, b_shp):
            a = numpy.random.randn(*a_shp).astype(numpy.float32)
            b = numpy.random.randn(*b_shp).astype(numpy.float32)

            x = tensor.ftensor3()
            y = tensor.ftensor3()

            f = theano.function([x, y], batched_dot(x, y),
                                mode=mode_with_gpu)

            z0 = numpy.asarray(f(a, b))

            ga = cuda_ndarray.CudaNdarray(a)
            gb = cuda_ndarray.CudaNdarray(b)

            # Computed once; the original repeated these two statements
            # verbatim without using the first result.
            z1 = numpy.asarray(f(ga, gb))
            z_test = numpy.sum(
                a[:, :, :, None] * b[:, None, :, :], axis=-2)

            unittest_tools.assert_allclose(z0, z_test)
            unittest_tools.assert_allclose(z1, z_test)

        check((5, 4, 3), (5, 3, 2))
        check((5, 3, 3), (5, 3, 3))
        check((5, 2, 6), (5, 6, 3))

        # Test dimensions of 0
        check((0, 2, 6), (0, 6, 3))
        check((5, 0, 3), (5, 3, 2))
        check((5, 4, 0), (5, 0, 2))
        check((5, 4, 3), (5, 3, 0))
        check((0, 0, 0), (0, 0, 0))

        # Test dimensions of 1
        check((1, 2, 6), (1, 6, 3))
        check((5, 1, 3), (5, 3, 2))
        check((5, 4, 1), (5, 1, 2))
        check((5, 4, 3), (5, 3, 1))
def Xtest_blocksparse_grad_merge(self):
    """Check that merging the learning-rate update leaves the result unchanged.

    The same update of the shared weights is compiled with the default
    GPU mode (where it must be merged into a GpuSparseBlockOuter) and
    with the merge optimizations excluded.  Both are run from identical
    starting weights and the final weights are compared.
    """
    bias = tensor.fmatrix()
    hidden = tensor.ftensor3()
    in_idx = tensor.lmatrix()
    out_idx = tensor.lmatrix()
    fn_inputs = [hidden, in_idx, bias, out_idx]

    W_val, h_val, iIdx_val, b_val, oIdx_val = self.gemv_data()
    run_args = (h_val, iIdx_val, b_val, oIdx_val)
    weights = float32_shared_constructor(W_val)

    out = gpu_sparse_block_gemv(bias.take(out_idx, axis=0), weights, hidden,
                                in_idx, out_idx)
    weight_grad = theano.grad(out.sum(), weights)

    lr = numpy.asarray(0.05, dtype='float32')
    stepped = weights - lr * weight_grad

    with_merge = theano.function(list(fn_inputs),
                                 updates=[(weights, stepped)],
                                 mode=mode_with_gpu)

    # Make sure the lr update was merged.
    merged_op = with_merge.maker.fgraph.outputs[0].owner.op
    assert isinstance(merged_op, GpuSparseBlockOuter)

    # Exclude the merge optimizations.
    mode = mode_with_gpu.excluding('local_merge_blocksparse_alpha')
    mode = mode.excluding('local_merge_blocksparse_output')
    without_merge = theano.function(list(fn_inputs),
                                    updates=[(weights, stepped)],
                                    mode=mode)

    # Make sure the lr update is not merged.
    plain_op = without_merge.maker.fgraph.outputs[0].owner.op
    assert not isinstance(plain_op, GpuSparseBlockOuter)

    without_merge(*run_args)
    W_ref = weights.get_value()

    # reset the var
    weights.set_value(W_val)
    with_merge(*run_args)
    W_opt = weights.get_value()

    utt.assert_allclose(W_ref, W_opt)
def test_batchnorm_train():
    """Compare cuDNN training batch normalization with a Theano reference.

    For both modes and every tensor rank from vector to 5D, the forward
    outputs (result, mean, inverse std) and the gradients with respect
    to x, scale and bias are compared between ``cuda.dnn`` and an
    equivalent graph built from basic operations.  Skipped when cuDNN is
    unavailable or older than v5.
    """
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()

    # Tolerances for the (dx, dscale, dbias) comparisons, in that order.
    gradient_tolerances = (
        dict(atol=1e-4),
        dict(rtol=2e-4, atol=1e-4),
        dict(),
    )

    for mode in ('per-activation', 'spatial'):
        for vartype in (T.ftensor5, T.ftensor4, T.ftensor3,
                        T.fmatrix, T.fvector):
            x, scale, bias = [vartype(name)
                              for name in ('x', 'scale', 'bias')]
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used

            # forward pass
            out, x_mean, x_invstd = cuda.dnn.dnn_batch_normalization_train(
                x, scale, bias, mode, eps)

            # reference forward pass
            if mode == 'per-activation':
                axes = (0,)
            elif mode == 'spatial':
                axes = (0,) + tuple(range(2, ndim))
            x_mean2 = x.mean(axis=axes, keepdims=True)
            x_invstd2 = T.inv(T.sqrt(x.var(axis=axes, keepdims=True) + eps))
            scale2 = T.addbroadcast(scale, *axes)
            bias2 = T.addbroadcast(bias, *axes)
            out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2

            # backward pass
            dy = vartype('dy')
            grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
            # reference backward pass
            grads2 = T.grad(None, wrt=[x, scale, bias],
                            known_grads={out2: dy})

            # compile
            f = theano.function(
                [x, scale, bias, dy],
                [out, x_mean, x_invstd, out2, x_mean2, x_invstd2] +
                grads + grads2,
                mode=mode_with_gpu)

            # run
            for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1),
                               (1, 1, 5, 5, 5)):
                data_shape = data_shape[:ndim]
                # Parameters have size 1 along every normalized axis.
                param_shape = tuple(1 if d in axes else s
                                    for d, s in enumerate(data_shape))
                X = 4 + 3 * numpy.random.randn(*data_shape).astype('float32')
                Dy = -1 + 2 * numpy.random.randn(*data_shape).astype('float32')
                Scale = numpy.random.randn(*param_shape).astype('float32')
                Bias = numpy.random.randn(*param_shape).astype('float32')
                outputs = f(X, Scale, Bias, Dy)

                # compare outputs: out, mean, invstd sit 3 slots before
                # their reference counterparts
                for position in (0, 1, 2):
                    utt.assert_allclose(outputs[position],
                                        outputs[position + 3])
                # compare gradients: dx, dscale, dbias likewise
                for position, tolerance in zip((6, 7, 8),
                                               gradient_tolerances):
                    utt.assert_allclose(outputs[position],
                                        outputs[position + 3],
                                        **tolerance)
def test_batchnorm_inference():
    """Compare cuDNN inference batch normalization with a Theano reference.

    For both modes and every tensor rank from vector to 5D, the output
    and the gradients with respect to x, scale, bias, mean and var are
    compared between ``cuda.dnn`` and an equivalent graph built from
    basic operations.  Skipped when cuDNN is unavailable or older than v5.
    """
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()

    # Tolerances for the (dx, dscale, dbias, dmean, dvar) comparisons.
    gradient_tolerances = (
        dict(atol=4e-5),
        dict(atol=4e-5),
        dict(),
        dict(),
        dict(rtol=2e-3, atol=4e-5),
    )

    for mode in ('per-activation', 'spatial'):
        for vartype in (T.ftensor5, T.ftensor4, T.ftensor3,
                        T.fmatrix, T.fvector):
            x, scale, bias, mean, var = [
                vartype(name)
                for name in ('x', 'scale', 'bias', 'mean', 'var')]
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used

            # forward pass
            out = cuda.dnn.dnn_batch_normalization_test(
                x, scale, bias, mean, var, mode, eps)

            # reference forward pass
            if mode == 'per-activation':
                axes = (0,)
            elif mode == 'spatial':
                axes = (0,) + tuple(range(2, ndim))
            scale2, bias2, mean2, var2 = [
                T.addbroadcast(t, *axes) for t in (scale, bias, mean, var)]
            out2 = (x - mean2) * (scale2 / T.sqrt(var2 + eps)) + bias2

            # backward pass
            dy = vartype('dy')
            grads = T.grad(None, wrt=[x, scale, bias, mean, var],
                           known_grads={out: dy})
            # reference backward pass
            grads2 = T.grad(None, wrt=[x, scale, bias, mean, var],
                            known_grads={out2: dy})

            # compile
            f = theano.function([x, scale, bias, mean, var, dy],
                                [out, out2] + grads + grads2,
                                mode=mode_with_gpu)

            # run
            for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1),
                               (1, 1, 5, 5, 5)):
                data_shape = data_shape[:ndim]
                # Parameters have size 1 along every normalized axis.
                param_shape = tuple(1 if d in axes else s
                                    for d, s in enumerate(data_shape))
                X = 4 + 3 * numpy.random.randn(*data_shape).astype('float32')
                Dy = -1 + 2 * numpy.random.randn(*data_shape).astype('float32')
                Scale = numpy.random.randn(*param_shape).astype('float32')
                Bias = numpy.random.randn(*param_shape).astype('float32')
                Mean = numpy.random.randn(*param_shape).astype('float32')
                Var = numpy.random.rand(*param_shape).astype('float32')
                outputs = f(X, Scale, Bias, Mean, Var, Dy)

                # compare outputs
                utt.assert_allclose(outputs[0], outputs[1])  # out
                # compare gradients: each one sits 5 slots before its
                # reference counterpart
                for position, tolerance in zip(range(2, 7),
                                               gradient_tolerances):
                    utt.assert_allclose(outputs[position],
                                        outputs[position + 5],
                                        **tolerance)
def _init_model(self, in_size, out_size, n_hid=10, learning_rate_sl=0.005,
                learning_rate_rl=0.005, batch_size=32, ment=0.1):
    """Build the policy network and compile its Theano functions.

    The network is a masked GRU over the input sequence followed by a
    dense softmax layer applied at every step.

    Parameters
    ----------
    in_size : size of each input vector (x and y coordinate).
    out_size : number of actions (up, down, right, left).
    n_hid : number of GRU units.
    learning_rate_sl, learning_rate_rl : RMSprop learning rates of the
        ``sl`` and ``rl`` objectives below.
    batch_size : stored on the instance; not used in this method.
    ment : weight of the entropy term in the ``rl`` loss.

    Sets ``network``, ``params``, ``loss``, ``inps`` and the compiled
    functions ``train_fn``, ``obj_fn``, ``act_fn``, ``sl_train_fn`` and
    ``sl_obj_fn``.
    """
    self.in_size = in_size  # x and y coordinate
    self.out_size = out_size  # up, down, right, left
    self.batch_size = batch_size
    self.learning_rate = learning_rate_rl
    self.n_hid = n_hid

    input_var = T.ftensor3('in')
    turn_mask = T.imatrix('tm')
    act_mask = T.itensor3('am')
    reward_var = T.fvector('r')
    pol_in = T.fmatrix('pol-h')  # initial hidden state of the GRU

    # NOTE: the original also built an unused reshape of `input_var`
    # here; it had no effect on the graph and has been dropped.

    # GRU over the (masked) sequence, then a per-step softmax.
    l_mask_in = L.InputLayer(shape=(None, None), input_var=turn_mask)
    l_in = L.InputLayer(shape=(None, None, self.in_size), input_var=input_var)
    l_pol_rnn = L.GRULayer(l_in, n_hid, hid_init=pol_in,
                           mask_input=l_mask_in)  # B x H x D
    pol_out = L.get_output(l_pol_rnn)[:, -1, :]  # hidden state at the last step
    l_den_in = L.ReshapeLayer(
        l_pol_rnn,
        (turn_mask.shape[0] * turn_mask.shape[1], n_hid))  # BH x D
    l_out = L.DenseLayer(l_den_in, self.out_size,
                         nonlinearity=lasagne.nonlinearities.softmax)

    self.network = l_out
    self.params = L.get_all_params(self.network)

    # rl
    probs = L.get_output(self.network)  # BH x A
    out_probs = T.reshape(
        probs,
        (input_var.shape[0], input_var.shape[1], self.out_size))  # B x H x A
    log_probs = T.log(out_probs)
    act_probs = (log_probs * act_mask).sum(axis=2)  # B x H
    ep_probs = (act_probs * turn_mask).sum(axis=1)  # B
    H_probs = -T.sum(T.sum(out_probs * log_probs, axis=2), axis=1)  # B
    self.loss = 0. - T.mean(ep_probs * reward_var + ment * H_probs)

    updates = lasagne.updates.rmsprop(self.loss, self.params,
                                      learning_rate=learning_rate_rl,
                                      epsilon=1e-4)

    self.inps = [input_var, turn_mask, act_mask, reward_var, pol_in]
    self.train_fn = theano.function(self.inps, self.loss, updates=updates)
    self.obj_fn = theano.function(self.inps, self.loss)
    self.act_fn = theano.function([input_var, turn_mask, pol_in],
                                  [out_probs, pol_out])

    # sl: same log-probabilities, without reward weighting or entropy term
    sl_loss = 0. - T.mean(ep_probs)
    sl_updates = lasagne.updates.rmsprop(sl_loss, self.params,
                                         learning_rate=learning_rate_sl,
                                         epsilon=1e-4)

    self.sl_train_fn = theano.function(
        [input_var, turn_mask, act_mask, pol_in], sl_loss,
        updates=sl_updates)
    self.sl_obj_fn = theano.function(
        [input_var, turn_mask, act_mask, pol_in], sl_loss)