The following 50 code examples, extracted from open-source Python projects, illustrate how to use theano.tensor.constant().
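A recurring pattern in the excerpts below is wrapping scalar literals with T.constant (for example, one = T.constant(1)) before combining them with float32 expressions; as several of the in-code comments note, this is done to prevent unintended upcasting to float64. The following is a minimal sketch of that idea, not taken from any of the projects below; it assumes a working Theano install, and the variable names are illustrative only.

import numpy as np
import theano
import theano.tensor as T

# 0-d graph constants; the dtype is inferred from the value unless given explicitly
one = T.constant(1, name='one')
half = T.constant(0.5, dtype=theano.config.floatX)

x = T.vector('x')            # dtype defaults to theano.config.floatX
retain_prob = one - half     # symbolic scalar, as in the dropout examples below
y = x * retain_prob

f = theano.function([x], y)
print(f(np.arange(4, dtype=theano.config.floatX)))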
def get_output_for(self, input, deterministic=False, **kwargs):
    if not isinstance(input, (S.SparseVariable, S.SparseConstant,
                              S.sharedvar.SparseTensorSharedVariable)):
        raise ValueError("Input for this layer must be sparse")

    if deterministic or self.p == 0:
        return input
    else:
        # Using Theano constant to prevent upcasting
        one = T.constant(1, name='one')
        retain_prob = one - self.p
        if self.rescale:
            input = S.mul(input, one / retain_prob)

        input_shape = self.input_shape
        if any(s is None for s in input_shape):
            input_shape = input.shape

        return input * self._srng.binomial(input_shape, p=retain_prob,
                                           dtype=input.dtype)
def l2_decay(self, gamma, layers=None):
    '''L2 decay cost.

    Args:
        gamma (float): l2 decay rate.
        layers (Optional[list]): layer numbers to do l2 decay on.

    Returns:
        T.tensor: L2 cost.

    '''
    if layers is None:
        layers = range(self.n_layers)

    cost = T.constant(0.).astype(floatX)
    for l in layers:
        W = self.__dict__['W%d' % l]
        cost += gamma * (W ** 2).sum()

    return cost
def l2_decay(self, rate):
    rec_l2_cost = T.constant(0.).astype(floatX)
    gen_l2_cost = T.constant(0.).astype(floatX)

    for l in xrange(self.n_layers):
        rec_l2_cost += self.posteriors[l].l2_decay(rate)
        gen_l2_cost += self.conditionals[l].l2_decay(rate)

    rval = OrderedDict(
        rec_l2_cost=rec_l2_cost,
        gen_l2_cost=gen_l2_cost,
        cost=rec_l2_cost + gen_l2_cost
    )

    return rval

# --------------------------------------------------------------------------
def forward(self, inputtensor):
    inputimage = inputtensor[0]
    # print('conv2d.forward.type: {}'.format(inputimage.ndim))
    if self.dc == 0.0:
        pass
    else:
        if 0 < self.dc <= 1:
            _srng = RandomStreams(np.random.randint(1, 2147462579))
            one = T.constant(1)
            retain_prob = one - self.dc
            mask_shape = self.w.shape
            mask = _srng.binomial(mask_shape, p=retain_prob,
                                  dtype=self.w.dtype)
            self.w = self.w * mask
        else:
            raise IndexError

    l3conv = T.nnet.conv2d(inputimage, self.w,
                           border_mode=self.border,
                           subsample=self.subsample)

    if self.need_bias:
        return ((l3conv + self.b.dimshuffle('x', 0, 'x', 'x')), )
    else:
        return (l3conv, )
def forward(self, inputtensor):
    inputimage = inputtensor[0]
    if self.dc == 0.0:
        pass
    else:
        if 0 < self.dc <= 1:
            _srng = RandomStreams(np.random.randint(1, 2147462579))
            one = T.constant(1)
            retain_prob = one - self.dc
            mask_shape = self.w.shape
            mask = _srng.binomial(mask_shape, p=retain_prob,
                                  dtype=self.w.dtype)
            self.w = self.w * mask
        else:
            raise IndexError

    if self.need_bias:
        return ((T.dot(inputimage, self.w) + self.b), )
    else:
        return (T.dot(inputimage, self.w), )
def RmsProp(cost, params, learning_rate=1.0, rho=0.9, epsilon=1e-6):
    updates = OrderedDict()
    grads = T.grad(cost, params)

    # Using theano constant to prevent upcasting of float32
    one = T.constant(1)

    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)
        accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                             broadcastable=param.broadcastable)
        accu_new = rho * accu + (one - rho) * grad ** 2
        updates[accu] = accu_new
        updates[param] = param - (learning_rate * grad /
                                  T.sqrt(accu_new + epsilon))

    return updates
def EGD(cost, params, learning_rate=0.33, constraint=1.0):
    updates = OrderedDict()
    grads = T.grad(cost, params)
    U = T.constant(constraint)

    # first half of params
    rw_pos = T.exp(-learning_rate * U * grads[0])
    rb_pos = T.exp(-learning_rate * U * grads[1])
    # second half
    rw_neg = 1 / rw_pos
    rb_neg = 1 / rb_pos
    rs = [rw_pos, rb_pos, rw_neg, rb_neg]

    partition = (T.sum(params[0] * rs[0]) + T.sum(params[1] * rs[1]) +
                 T.sum(params[2] * rs[2]) + T.sum(params[3] * rs[3]))

    for param, r in zip(params, rs):
        updates[param] = U * param * r / partition

    return updates
def get_output_for(self, input, deterministic=False, **kwargs):
    """
    Parameters
    ----------
    input : tensor
        output from the previous layer
    deterministic : bool
        If true dropout and scaling is disabled, see notes
    """
    if deterministic or self.p == 0:
        return input
    else:
        # Using theano constant to prevent upcasting
        one = T.constant(1)
        retain_prob = one - self.p
        if self.rescale:
            input /= retain_prob

        mask = _srng.binomial(input.shape[:2], p=retain_prob,
                              dtype=theano.config.floatX)
        axes = [0, 1] + (['x'] * (input.ndim - 2))
        mask = mask.dimshuffle(*axes)

        return input * mask
def temporal_padding_mask(mask, kernel_size, padding_size):
    """Pad the middle dimension of a 2D matrix with "padding" zeros
    left and right.

    Apologies for the inane API, but Theano makes this really hard.

    Code from https://github.com/fchollet/keras/blob/master/keras/backend/theano_backend.py

    x: (batch, length)
    """
    mask_shape = mask.shape
    mask_sum = T.sum(mask, axis=1)
    output_length = mask_sum - kernel_size + 2 * padding_size + 1
    max_output_length = mask_shape[1] - kernel_size + 2 * padding_size + 1
    real_output_length = T.maximum(output_length, 1)
    range_base = T.arange(max_output_length)
    range_matrix = T.outer(T.ones((mask_shape[0],)), range_base)
    mask = (range_matrix < real_output_length[:, None]) * T.constant(1.0)
    return mask
def print_graph_linker(print_prog=True):
    if 1:
        imap = {None: '-'}

        def blah(i, node, thunk):
            imap[node] = str(i)
            if print_prog:  # and node.op.__class__ is T.DimShuffle:
                if False and node.op == T.DimShuffle((), ['x', 'x'], inplace=True):
                    print(node.op == T.DimShuffle((), ['x', 'x'],
                                                  inplace=True), end=' ')
                    print(node.inputs[0], type(node.inputs[0]), end=' ')
                    print(node.inputs[0].equals(T.constant(2)), end=' ')
                outputs = node.outputs
                inputs = theano.gof.graph.inputs(outputs)
                print('node ', i, node, end=' ')
                print(':'.join([imap[inp.owner] for inp in node.inputs]))
                # print theano.sandbox.pprint.pp.process_graph(inputs, outputs)

        return theano.sandbox.wraplinker.WrapLinkerMany(
            [theano.gof.OpWiseCLinker()],
            [theano.sandbox.wraplinker.run_all,
             blah
             # ,theano.sandbox.wraplinker.numpy_notall_isfinite
             ])
    else:
        return theano.gof.OpWiseCLinker()
def test_csm_unsorted(self):
    """
    Test support for gradients of unsorted inputs.
    """
    sp_types = {'csc': sp.csc_matrix,
                'csr': sp.csr_matrix}

    for format in ['csr', 'csc', ]:
        for dtype in ['float32', 'float64']:
            x = tensor.tensor(dtype=dtype, broadcastable=(False,))
            y = tensor.ivector()
            z = tensor.ivector()
            s = tensor.ivector()
            # Sparse advanced indexing produces unsorted sparse matrices
            a = sparse_random_inputs(format, (4, 3), out_dtype=dtype,
                                     unsorted_indices=True)[1][0]
            # Make sure it's unsorted
            assert not a.has_sorted_indices

            def my_op(x):
                y = tensor.constant(a.indices)
                z = tensor.constant(a.indptr)
                s = tensor.constant(a.shape)
                return tensor.sum(
                    dense_from_sparse(CSM(format)(x, y, z, s) * a))

            verify_grad_sparse(my_op, [a.data])
def test_constant_folding():
    """
    Test that constant folding get registered at fast_compile

    An error removed that registration during the registration.
    """
    x = tensor.dvector()
    mode = theano.compile.get_mode("FAST_COMPILE").excluding("fusion")
    f = theano.function([x], [x * 2, x + x], mode=mode)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 2

    # Test that we do not crash when constant folding elemwise scalar
    # as they should not generate c code.
    x = tensor.constant(3)
    assert x.ndim == 0
    mode = theano.compile.get_mode("FAST_COMPILE").excluding("fusion")
    f = theano.function([], [x * 2, x + x], mode=mode)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 2
    assert all([isinstance(n.op, DeepCopyOp) for n in topo])
def test_local_add_specialize():
    # test of non-zero dimension
    a = tensor.vector()
    s = tensor.add(tensor.zeros_like(a))
    assert local_add_specialize.transform(s.owner)

    # test of 0-d
    a = tensor.scalar()
    s = tensor.add(tensor.zeros_like(a))
    assert local_add_specialize.transform(s.owner)

    # Test when the 0 input is forcing upcasting
    a = tensor.constant(0, dtype='int64')
    b = tensor.constant(1, dtype='int32')
    s = a + b
    transformed = local_add_specialize.transform(s.owner)
    assert transformed
    assert transformed[0].type == s.type
def test_lt(self):
    for dtype in self.dtypes:
        l = numpy.asarray([0., -1., 1.], dtype=dtype)
        r = numpy.asarray([0., 1., -1.], dtype=dtype)
        for x, y, err in [
            (self.shared(l.astype(dtype)),
             self.shared(r.astype(dtype)), False),
            (l, self.shared(r.astype(dtype)), True),
            (tensor.constant(l), self.shared(r.astype(dtype)), False),
            (self.shared(l.astype(dtype)), r, False),
            (self.shared(l.astype(dtype)), tensor.constant(r), False),
        ]:
            try:
                fn = self.inplace_func([], x < y)
                v = fn()
                self.assertTrue(numpy.all(v == (l < r)), (v, (l < r)))
            except TypeError:
                assert err
def test_le(self):
    for dtype in self.dtypes:
        l = numpy.asarray([0., -1., 1.], dtype=dtype)
        r = numpy.asarray([0., 1., -1.], dtype=dtype)
        for x, y, err in [
            (self.shared(l.astype(dtype)),
             self.shared(r.astype(dtype)), False),
            (l, self.shared(r.astype(dtype)), True),
            (tensor.constant(l), self.shared(r.astype(dtype)), False),
            (self.shared(l.astype(dtype)), r, False),
            (self.shared(l.astype(dtype)), tensor.constant(r), False),
        ]:
            try:
                fn = self.inplace_func([], x <= y)
                v = fn()
                self.assertTrue(numpy.all(v == (l <= r)), (v, (l <= r)))
            except TypeError:
                assert err
def test_eq(self):
    for dtype in self.dtypes:
        l = numpy.asarray([0., -1., 1.], dtype=dtype)
        r = numpy.asarray([0., 1., -1.], dtype=dtype)
        for x, y, err in [
            (self.shared(l.astype(dtype)),
             self.shared(r.astype(dtype)), False),
            (l, self.shared(r.astype(dtype)), True),
            (tensor.constant(l), self.shared(r.astype(dtype)), False),
            (self.shared(l.astype(dtype)), r, False),
            (self.shared(l.astype(dtype)), tensor.constant(r), False),
        ]:
            try:
                fn = self.inplace_func([], eq(x, y))
                v = fn()
                self.assertTrue(numpy.all(v == (l == r)), (v, (l == r)))
            except TypeError:
                assert err
def test_neq(self):
    for dtype in self.dtypes:
        l = numpy.asarray([0., -1., 1.], dtype=dtype)
        r = numpy.asarray([0., 1., -1.], dtype=dtype)
        for x, y, err in [
            (self.shared(l.astype(dtype)),
             self.shared(r.astype(dtype)), False),
            (l, self.shared(r.astype(dtype)), True),
            (tensor.constant(l), self.shared(r.astype(dtype)), False),
            (self.shared(l.astype(dtype)), r, False),
            (self.shared(l.astype(dtype)), tensor.constant(r), False),
        ]:
            try:
                fn = self.inplace_func([], neq(x, y))
                v = fn()
                self.assertTrue(numpy.all(v == (l != r)), (v, (l != r)))
            except TypeError:
                assert err
def test1(self):
    s = scal.constant(56)
    t = as_tensor_variable(s)
    self.assertTrue(t.owner.op is tensor_from_scalar)
    self.assertTrue(t.type.broadcastable == (), t.type.broadcastable)
    self.assertTrue(t.type.ndim == 0, t.type.ndim)
    self.assertTrue(t.type.dtype == s.type.dtype)

    v = eval_outputs([t])

    self.assertTrue(v == 56, v)
    self.assertTrue(isinstance(v, numpy.ndarray))
    self.assertTrue(v.shape == (), v.shape)

    g = grad(t, s)
    self.assertTrue(eval_outputs([g]) == 0.)
def test2(self):
    s = scal.constant(56.)
    t = as_tensor_variable(s)
    self.assertTrue(t.owner.op is tensor_from_scalar)
    self.assertTrue(t.type.broadcastable == (), t.type.broadcastable)
    self.assertTrue(t.type.ndim == 0, t.type.ndim)
    self.assertTrue(t.type.dtype == s.type.dtype)

    v = eval_outputs([t])

    self.assertTrue(v == 56., v)
    self.assertTrue(isinstance(v, numpy.ndarray))
    self.assertTrue(v.shape == (), v.shape)

    g = grad(t, s)
    self.assertTrue(eval_outputs([g]) == 1.)
def test0(self):
    tt = constant(56)  # scal.constant(56)
    ss = scalar_from_tensor(tt)
    self.assertTrue(ss.owner.op is scalar_from_tensor)
    self.assertTrue(ss.type.dtype == tt.type.dtype)

    v = eval_outputs([ss])

    self.assertTrue(v == 56, v)
    if config.cast_policy == 'custom':
        self.assertTrue(isinstance(v, numpy.int16))
    elif config.cast_policy in ('numpy', 'numpy+floatX'):
        self.assertTrue(isinstance(
            v, getattr(numpy, str(numpy.asarray(56).dtype))))
    else:
        raise NotImplementedError(config.cast_policy)
    self.assertTrue(v.shape == (), v.shape)

    tt = lscalar()
    ss = scalar_from_tensor(tt)
    g = ss.owner.op.grad([tt], [ss])
    fff = function([tt], ss)
    v = fff(numpy.asarray(5))
    self.assertTrue(v == 5, v)
    self.assertTrue(isinstance(v, numpy.int64))
    self.assertTrue(v.shape == (), v.shape)
def _test_autocast_numpy():
    """Called from `test_autocast`."""
    assert config.cast_policy == 'numpy'
    # Go through some typical scalar values.

    def ok(z):
        assert tensor.constant(z).dtype == numpy.asarray(z).dtype

    for x in ([2 ** i for i in xrange(63)] +
              [0, L(0), L(1), L(2 ** 63 - 1)] +
              [0., 1., 1.1, 1.5]):
        n_x = numpy.asarray(x)
        # Make sure the data type is the same as the one found by numpy.
        ok(x)
        ok(-x)
        ok(x - 1)
        ok(-x + 1)
        ok(n_x)
def infer_shape(self, node, i_shapes):
    r, shp = node.inputs[0:2]

    # if shp is a constant array of len 0, then it means 'automatic shape'
    unknown_shape = len(getattr(shp, 'data', [0, 1, 2])) == 0

    # if ndim_added == 0 and shape != () then shape
    if self.ndim_added == 0 and not unknown_shape:
        sample_shp = shp
    else:
        # if shape == () then it will depend on args
        # if ndim_added != 0 and shape != () then it will depend on args
        # Use the default infer_shape implementation.
        raise tensor.ShapeError()

    return [None, [sample_shp[i] for i in xrange(node.outputs[1].ndim)]]
def make_node(self, x, index):
    assert isinstance(x.type, TypedListType)
    if not isinstance(index, Variable):
        if isinstance(index, slice):
            index = Constant(SliceType(), index)
            return Apply(self, [x, index], [x.type()])
        else:
            index = T.constant(index, ndim=0, dtype='int64')
            return Apply(self, [x, index], [x.ttype()])
    if isinstance(index.type, SliceType):
        return Apply(self, [x, index], [x.type()])
    elif isinstance(index, T.TensorVariable) and index.ndim == 0:
        assert index.dtype == 'int64'
        return Apply(self, [x, index], [x.ttype()])
    else:
        raise TypeError('Expected scalar or slice as index.')
def test_constant(self):
    orig_compute_test_value = theano.config.compute_test_value
    try:
        theano.config.compute_test_value = 'raise'

        x = T.constant(numpy.random.rand(2, 3), dtype=config.floatX)
        y = theano.shared(numpy.random.rand(3, 6).astype(config.floatX),
                          'y')

        # should work
        z = T.dot(x, y)
        assert hasattr(z.tag, 'test_value')
        f = theano.function([], z)
        assert _allclose(f(), z.tag.test_value)

        # this test should fail
        x = T.constant(numpy.random.rand(2, 4), dtype=config.floatX)
        self.assertRaises(ValueError, T.dot, x, y)
    finally:
        theano.config.compute_test_value = orig_compute_test_value
def test_gpualloc():
    '''
    This test tries to catch the scenario when, due to infer_shape,
    the input of the alloc changes from a tensor scalar to a constant 1.
    In this case the originally constructed broadcastable pattern will
    have a False for that dimension, but the new broadcastable pattern
    inserted by gpualloc will have a True, since it knows the dimension
    is 1 and therefore broadcastable.
    '''
    x = theano.shared(numpy.ones(3, dtype='float32'), 'x')
    m = (x).dimshuffle(['x', 0])
    v = tensor.alloc(1., *m.shape)
    f = theano.function([], v + x,
                        mode=mode_with_gpu.excluding("local_elemwise_alloc"))
    l = f.maker.fgraph.toposort()
    assert numpy.any([isinstance(y.op, cuda.GpuAlloc) for y in l])
def rmsprop_updates(grads, params, learning_rate=1.0, rho=0.9, epsilon=1e-6): """ """ updates = OrderedDict() # Using theano constant to prevent upcasting of float32 one = T.constant(1) for param, grad in zip(params, grads): value = param.get_value(borrow=True) accu = theano.shared(np.zeros(value.shape, dtype=value.dtype), broadcastable=param.broadcastable) accu_new = rho * accu + (one - rho) * grad ** 2 updates[accu] = accu_new try: updates[param] = lasagne.updates.norm_constraint( param - (learning_rate * grad / T.sqrt(accu_new + epsilon)) , MAX_NORM ) except: updates[param] = param - (learning_rate * grad / T.sqrt(accu_new + epsilon)) return updates
def rmsprop_updates(grads, params, learning_rate=1.0, rho=0.9, epsilon=1e-6):
    updates = OrderedDict()

    # Using theano constant to prevent upcasting of float32
    one = T.constant(1)
    c = 0
    for param, grad in zip(params, grads):
        print c
        value = param.get_value(borrow=True)
        accu = theano.shared(numpy.zeros(value.shape, dtype=value.dtype),
                             broadcastable=param.broadcastable)
        accu_new = rho * accu + (one - rho) * grad ** 2
        updates[accu] = accu_new
        mid_up = param - (learning_rate * grad /
                          (T.sqrt(accu_new + epsilon)))
        try:
            updates[param] = lasagne.updates.norm_constraint(mid_up, 40, 0)
        except:
            updates[param] = mid_up
        c += 1

    return updates
def __init__(self, rng, W=None, m=1.0, n_samples=50, shape=None,
             batch_size=1000):
    if W is None:
        W = numpy.asarray(rng.uniform(
            low=-numpy.sqrt(6. / (shape[0] + shape[1])),
            high=numpy.sqrt(6. / (shape[0] + shape[1])),
            size=(shape[0], shape[1])), dtype=theano.config.floatX)

    self.W = theano.shared(value=W, name='Hashtag_emb', borrow=True)
    self.batch_size = batch_size
    self.n_ht = W.shape[0]
    self.m = m
    self.n_samples = n_samples
    self.csrng = CURAND_RandomStreams(123)
    mask = self.csrng.uniform(size=(self.n_samples, 1), low=0.0, high=1.0,
                              dtype=theano.config.floatX)
    self.rfun = theano.function([], mask.argsort(axis=0))

    self.alpha = T.constant(
        1.0 / numpy.arange(start=1, stop=self.n_ht + 1, step=1))
    self.weights = [self.W]
    self.biases = []
def get_updates_rmsprop(self, cost, params, rho=0.9, eps=1e-8):
    lr = self.lr
    print(' - RMSprop: lr = %.2e' % (lr.get_value(borrow=True)))
    one = T.constant(1.)
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        value = p.get_value(borrow=True)
        accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                             broadcastable=p.broadcastable)
        accu_new = rho * accu + (one - rho) * g ** 2
        gradient_scaling = T.sqrt(accu_new + eps)
        g = g / gradient_scaling
        updates.append((accu, accu_new))
        updates.append((p, p - lr * g))
    return updates
def careful_rmsprop(loss_or_grads, params, learning_rate=1.0, rho=0.9,
                    epsilon=1e-6, grad_clipping=1.0e-2):
    """
    RMSProp with gradient clipping.
    :param grad_clipping: maximal norm of the gradient; if the norm of the
        actual gradient exceeds this value it is rescaled.
    :return: updates
    """
    grads = get_or_compute_grads(loss_or_grads, params)
    updates = OrderedDict()
    grads = total_norm_constraint(grads, max_norm=grad_clipping,
                                  epsilon=epsilon)

    # Using theano constant to prevent upcasting of float32
    one = T.constant(1)

    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)
        accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                             broadcastable=param.broadcastable)
        accu_new = rho * accu + (one - rho) * grad ** 2
        updates[accu] = accu_new
        updates[param] = param - (learning_rate * grad /
                                  T.sqrt(accu_new + epsilon))

    return updates
def adadelta(loss, params, learning_rate=1.0, rho=0.95, epsilon=1e-6):
    grad_shared_flat, flat_grad, unflat_grads = flat_unflat_grads(loss, params)
    grad_updates = [(grad_shared_flat, flat_grad)]

    one = T.constant(1)
    param_updates = list()
    for p, g in zip(params, unflat_grads):
        value = p.get_value(borrow=True)
        accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                             broadcastable=p.broadcastable)
        delta_accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                   broadcastable=p.broadcastable)

        accu_new = rho * accu + (one - rho) * g ** 2
        update = g * T.sqrt(delta_accu + epsilon) / T.sqrt(accu_new + epsilon)
        delta_accu_new = rho * delta_accu + (one - rho) * update ** 2

        param_updates += [(accu, accu_new)]
        param_updates += [(p, p - learning_rate * update)]
        param_updates += [(delta_accu, delta_accu_new)]

    return grad_updates, param_updates, grad_shared_flat
def adam(loss, params, learning_rate=0.001, beta1=0.9, beta2=0.999,
         epsilon=1e-8):
    grad_shared_flat, flat_grad, unflat_grads = flat_unflat_grads(loss, params)
    grad_updates = [(grad_shared_flat, flat_grad)]

    t_prev = theano.shared(np.array(0, dtype=theano.config.floatX))
    one = T.constant(1)
    t = t_prev + one
    a_t = learning_rate * T.sqrt(one - beta2 ** t) / (one - beta1 ** t)

    param_updates = list()
    for p, g in zip(params, unflat_grads):
        value = p.get_value(borrow=True)
        m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=p.broadcastable)
        v_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=p.broadcastable)

        m_t = beta1 * m_prev + (one - beta1) * g
        v_t = beta2 * v_prev + (one - beta2) * g ** 2
        step = a_t * m_t / (T.sqrt(v_t) + epsilon)

        param_updates += [(m_prev, m_t), (v_prev, v_t), (p, p - step)]

    param_updates += [(t_prev, t)]
    return grad_updates, param_updates, grad_shared_flat
def adamax(loss, params, learning_rate=0.002, beta1=0.9, beta2=0.999,
           epsilon=1e-8):
    grad_shared_flat, flat_grad, unflat_grads = flat_unflat_grads(loss, params)
    grad_updates = [(grad_shared_flat, flat_grad)]

    t_prev = theano.shared(np.array(0, dtype=theano.config.floatX))
    one = T.constant(1)
    t = t_prev + one
    a_t = learning_rate / (one - beta1 ** t)

    param_updates = list()
    for p, g in zip(params, unflat_grads):
        value = p.get_value(borrow=True)
        m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=p.broadcastable)
        u_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=p.broadcastable)

        m_t = beta1 * m_prev + (one - beta1) * g
        u_t = T.maximum(beta2 * u_prev, abs(g))
        step = a_t * m_t / (u_t + epsilon)

        param_updates += [(m_prev, m_t), (u_prev, u_t), (p, p - step)]

    param_updates += [(t_prev, t)]
    return grad_updates, param_updates, grad_shared_flat
def build_model(model_):
    global fn_predict, fn_record
    global g_ozer, g_mdl

    g_ozer = dict(simple=VanillaSGD, adam=AdamSGD)[OZER]()
    g_ozer.lr = LEARN_RATE

    s_x = T.tensor4('x')
    s_y = T.ivector('y')
    s_pdpo = T.scalar()
    s_out = model_(s_x, s_pdpo)

    s_y_onehot = T.extra_ops.to_one_hot(s_y, len(g_dataset.label_map))
    s_loss = T.mean(-s_y_onehot * T.log(s_out + 1e-3))
    s_accr = T.mean(T.switch(
        T.eq(T.argmax(s_out, axis=1), T.argmax(s_y_onehot, axis=1)), 1, 0))

    no_dropout = [(s_pdpo, T.constant(0., dtype=th.config.floatX))]
    fn_predict = th.function(
        [s_x, s_y],
        {'pred': s_out, 'accr': s_accr, 'loss': s_loss},
        givens=no_dropout, profile=PROFILE)

    rec_fetches = {
        'x': s_x, 'y': s_y,
        'pred': s_out}
    rec_fetches.update(g_mdl.params_di)
    fn_record = th.function(
        [s_x, s_y], rec_fetches, givens=no_dropout, profile=PROFILE)

    g_ozer.compile(
        [s_x, s_y],
        s_loss,
        g_mdl.params_di.values(),
        fetches_={'pred': s_out, 'loss': s_loss, 'accr': s_accr},
        givens_=[(s_pdpo, T.constant(TRAIN_PDPO, dtype=th.config.floatX))],
        profile_=PROFILE)
def get_updates(self, learning_rate, params, grads, lr_scalers): """Compute the parameters' updates. """ t_prev = theano.shared(floatX(0.)) updates = OrderedDict() # Using theano constant to prevent upcasting of float32 one = T.constant(1) t = t_prev + 1 a_t = learning_rate*T.sqrt(one-self.beta2**t)/(one-self.beta1**t) for param, g_t in zip(params, grads): value = param.get_value(borrow=True) m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype), broadcastable=param.broadcastable) v_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype), broadcastable=param.broadcastable) m_t = self.beta1*m_prev + (one-self.beta1)*g_t v_t = self.beta2*v_prev + (one-self.beta2)*g_t**2 step = a_t*m_t/(T.sqrt(v_t) + self.epsilon) updates[m_prev] = m_t updates[v_prev] = v_t new_param = param - step if self.max_colm_norm and param.name in ["W", "w"]: new_param_final = norm_constraint(tensor_var=new_param, max_norm=self.max_norm) else: new_param_final = new_param updates[param] = new_param_final updates[t_prev] = t return updates
def get_updates(self, learning_rate, params, grads, lr_scalers): """Compute the parameters' updates. """ t_prev = theano.shared(floatX(0.)) updates = OrderedDict() # Using theano constant to prevent upcasting of float32 one = T.constant(1) t = t_prev + 1 a_t = learning_rate/(one-self.beta1**t) for param, g_t in zip(params, grads): value = param.get_value(borrow=True) m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype), broadcastable=param.broadcastable) u_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype), broadcastable=param.broadcastable) m_t = self.beta1*m_prev + (one-self.beta1)*g_t u_t = T.maximum(self.beta2*u_prev, abs(g_t)) step = a_t*m_t/(u_t + self.epsilon) updates[m_prev] = m_t updates[u_prev] = u_t new_param = param - step if self.max_colm_norm and param.name in ["W", "w"]: new_param_final = norm_constraint(tensor_var=new_param, max_norm=self.max_norm) else: new_param_final = new_param updates[param] = new_param_final updates[t_prev] = t return updates
def dropout_from_layer(rng, layer_output, p):
    """
    p: float. The probability of dropping a unit.
    """
    srng = theano.tensor.shared_randomstreams.RandomStreams(
        rng.randint(99999))
    one = T.constant(1)
    retain_prob = one - p
    mask = srng.binomial(n=1, p=retain_prob, size=layer_output.shape,
                         dtype=layer_output.dtype)
    output = layer_output * mask

    return output
def __init__(self, rng, input, dropout_rate, rescale):
    """
    rescale: Boolean. Can be only used when applying dropout.
    """
    if rescale:
        one = T.constant(1)
        retain_prob = one - dropout_rate
        input /= retain_prob

    super(DropoutIdentityHiddenLayer, self).__init__(rng=rng, input=input)
    if dropout_rate > 0.:
        self.output = dropout_from_layer(rng, self.output, p=dropout_rate)
def __init__(self, rng, input, n_in, n_out, dropout_rate, rescale, W=None,
             b=None, b_v=0., activation=None):
    """
    rescale: Boolean. Can be only used when applying dropout.
    """
    if rescale:
        one = T.constant(1)
        retain_prob = one - dropout_rate
        input /= retain_prob

    super(DropoutHiddenLayer, self).__init__(
        input=input, n_in=n_in, n_out=n_out, W=W, b=b,
        activation=activation, rng=rng)
    if dropout_rate > 0.:
        self.output = dropout_from_layer(rng, self.output, p=dropout_rate)
def step_infer(self, *params):
    model = self.model

    params = list(params)
    rs = params[:model.n_layers]
    qs = params[model.n_layers:2*model.n_layers]
    y = params[2*model.n_layers]
    params = params[1+2*model.n_layers:]
    prior_params = model.get_prior_params(*params)

    hs = []
    new_qs = []

    for l, (q, r) in enumerate(zip(qs, rs)):
        h = (r <= q[None, :, :]).astype(floatX)
        hs.append(h)

    ys = [y[None, :, :]] + hs[:-1]
    p_ys = [model.p_y_given_h(h, l, *params) for l, h in enumerate(hs)]

    log_ph = -model.prior.step_neg_log_prob(hs[-1], *prior_params)
    log_py_h = T.constant(0.).astype(floatX)
    log_qh = T.constant(0.).astype(floatX)
    for l in xrange(model.n_layers):
        log_py_h += -model.conditionals[l].neg_log_prob(ys[l], p_ys[l])
        log_qh += -model.posteriors[l].neg_log_prob(hs[l], qs[l][None, :, :])

    log_p = log_py_h + log_ph - log_qh
    w_tilde = get_w_tilde(log_p)
    cost = -log_p.mean()

    for q, h in zip(qs, hs):
        q_ = (w_tilde[:, :, None] * h).sum(axis=0)
        new_qs.append(self.inference_rate * q_ + (1 - self.inference_rate) * q)

    return tuple(new_qs) + (cost,)
def params_infer(self):
    return [T.constant(self.momentum).astype(floatX)]
def entropy(self):
    return T.constant(0.).astype(floatX)
def get_L2_weight_cost(self, gamma, layers=None):
    if layers is None:
        layers = range(self.n_layers)

    cost = T.constant(0.).astype(floatX)
    for l in layers:
        W = self.__dict__['W%d' % l]
        cost += gamma * (W ** 2).sum()

    return cost
def __init__(self, ntimes=False, n=TT.constant(0)):
    """
    :type ntimes: bool
    :param ntimes: If the last state needs to be repeated `n` times

    :type n: int, theano constant, None
    :param n: how many times the last state is repeated
    """
    self.ntimes = ntimes
    self.n = n
    super(LastState, self).__init__(0, 0, None)
def const(value):
    return TT.constant(numpy.asarray(value, dtype=theano.config.floatX))
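As a hypothetical usage of the `const` helper above (the call below is illustrative only, not from the original project): because the value is first passed through numpy.asarray with dtype=theano.config.floatX, the resulting constant is a 0-d tensor in the configured float precision, matching the explicit `.astype(floatX)` casts used in several of the other examples.

# Assumed usage sketch; output depends on the local floatX setting
lr = const(0.01)
print(lr.dtype, lr.ndim)   # e.g. 'float32' 0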