The following 29 code examples, extracted from open-source Python projects, illustrate how to use theano.tensor.inv().
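As a quick orientation, here is a minimal sketch of my own (not taken from any of the projects below, assuming only a standard Theano install): theano.tensor.inv computes the elementwise reciprocal 1/x, not a matrix inverse (matrix inversion lives in theano.tensor.nlinalg).

import numpy as np
import theano
import theano.tensor as T

x = T.vector('x')
reciprocal = T.inv(x)          # elementwise 1 / x, equivalent to 1.0 / x
f = theano.function([x], reciprocal)

print(f(np.array([1.0, 2.0, 4.0], dtype=theano.config.floatX)))
# -> [1.   0.5  0.25]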
def normalize_batch_in_training(x, gamma, beta, reduction_axes, epsilon=1e-3):
    """Computes mean and std for batch then apply batch_normalization on batch.
    """
    # TODO remove this if statement when Theano without
    # T.nnet.bn.batch_normalization_train is deprecated
    if not hasattr(T.nnet.bn, 'batch_normalization_train'):
        return _old_normalize_batch_in_training(x, gamma, beta,
                                                reduction_axes, epsilon)

    if gamma is None:
        if beta is None:
            gamma = ones_like(x)
        else:
            gamma = ones_like(beta)
    if beta is None:
        if gamma is None:
            beta = zeros_like(x)
        else:
            beta = zeros_like(gamma)

    normed, mean, stdinv = T.nnet.bn.batch_normalization_train(
        x, gamma, beta, reduction_axes, epsilon)

    return normed, mean, T.inv(stdinv ** 2)
def test_dim1(self):
    """Test the inversion of one permutation (int vector)"""
    p = ivector()
    inv = inverse_permutation(p)
    assert inv.dtype == p.dtype
    f_inverse = function([p], inv)

    # Generate a random permutation
    rng = numpy.random.RandomState(utt.fetch_seed())
    p_val = rng.permutation(10).astype('int32')
    inv_val = f_inverse(p_val)

    # Check that the inverse of the inverse is the original permutation
    assert numpy.all(f_inverse(inv_val) == p_val)

    # Check that permutation(inverse) == inverse(permutation) = identity
    assert numpy.all(p_val[inv_val] == numpy.arange(10))
    assert numpy.all(inv_val[p_val] == numpy.arange(10))
def normalize_batch_in_training(x, gamma, beta, reduction_axes, epsilon=0.0001):
    '''Computes mean and std for batch then apply batch_normalization on batch.
    '''
    dev = theano.config.device
    use_cudnn = (ndim(x) < 5 and reduction_axes == [0, 2, 3] and
                 (dev.startswith('cuda') or dev.startswith('gpu')))
    if use_cudnn:
        broadcast_beta = beta.dimshuffle('x', 0, 'x', 'x')
        broadcast_gamma = gamma.dimshuffle('x', 0, 'x', 'x')
        try:
            normed, mean, stdinv = theano.sandbox.cuda.dnn.dnn_batch_normalization_train(
                x, broadcast_gamma, broadcast_beta, 'spatial', epsilon)
            var = T.inv(stdinv ** 2)
            return normed, T.flatten(mean), T.flatten(var)
        except AttributeError:
            pass

    var = x.var(reduction_axes)
    mean = x.mean(reduction_axes)

    target_shape = []
    for axis in range(ndim(x)):
        if axis in reduction_axes:
            target_shape.append(1)
        else:
            target_shape.append(x.shape[axis])
    target_shape = T.stack(*target_shape)

    broadcast_mean = T.reshape(mean, target_shape)
    broadcast_var = T.reshape(var, target_shape)
    broadcast_beta = T.reshape(beta, target_shape)
    broadcast_gamma = T.reshape(gamma, target_shape)
    normed = batch_normalization(x, broadcast_mean, broadcast_var,
                                 broadcast_beta, broadcast_gamma,
                                 epsilon)
    return normed, mean, var
def normalize_batch_in_training(x, gamma, beta, reduction_axes, epsilon=1e-3):
    """Computes mean and std for batch then apply batch_normalization on batch.
    """
    # TODO remove this if statement when Theano without
    # T.nnet.bn.batch_normalization_train is deprecated
    if not hasattr(T.nnet.bn, 'batch_normalization_train'):
        return _old_normalize_batch_in_training(x, gamma, beta,
                                                reduction_axes, epsilon)

    normed, mean, stdinv = T.nnet.bn.batch_normalization_train(
        x, gamma, beta, reduction_axes, epsilon)

    return normed, mean, T.inv(stdinv ** 2)
def l2_norm_layer(ip):
    # inverse of each sample's L2 norm, reduced over channel and spatial axes
    norm = T.inv(T.sqrt(((ip ** 2).sum(axis=(1, 2, 3)))))
    # reshape so it broadcasts against the 4D input;
    # `batch_size` must be defined in the enclosing scope
    sq = T.reshape(norm, (batch_size, 1, 1, 1))
    op = ip * sq
    return op
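For context, a hypothetical way to exercise a layer like the one above (the tensor4 shape and the batch_size value are assumptions for illustration; the original snippet expects batch_size to already exist in its enclosing scope):

import numpy as np
import theano
import theano.tensor as T

batch_size = 2  # assumed for illustration; not defined in the original snippet

ip = T.tensor4('ip')
norm = T.inv(T.sqrt((ip ** 2).sum(axis=(1, 2, 3))))
op = ip * T.reshape(norm, (batch_size, 1, 1, 1))
normalize = theano.function([ip], op)

x = np.random.rand(batch_size, 3, 4, 4).astype(theano.config.floatX)
y = normalize(x)
print(np.sqrt((y ** 2).sum(axis=(1, 2, 3))))  # each sample now has ~unit L2 norm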
def normalize_batch_in_training(x, gamma, beta, reduction_axes, epsilon=1e-3):
    '''Computes mean and std for batch then apply batch_normalization on batch.
    '''
    # TODO remove this if statement when Theano without
    # T.nnet.bn.batch_normalization_train is deprecated
    if not hasattr(T.nnet.bn, 'batch_normalization_train'):
        return _old_normalize_batch_in_training(x, gamma, beta,
                                                reduction_axes, epsilon)

    normed, mean, stdinv = T.nnet.bn.batch_normalization_train(
        x, gamma, beta, reduction_axes, epsilon)

    return normed, mean, T.inv(stdinv ** 2)
def _old_normalize_batch_in_training(x, gamma, beta, reduction_axes, epsilon=1e-3):
    '''Computes mean and std for batch then apply batch_normalization on batch.
    '''
    dev = theano.config.device
    use_cudnn = (ndim(x) < 5 and reduction_axes == [0, 2, 3] and
                 (dev.startswith('cuda') or dev.startswith('gpu')))
    if use_cudnn:
        broadcast_beta = beta.dimshuffle('x', 0, 'x', 'x')
        broadcast_gamma = gamma.dimshuffle('x', 0, 'x', 'x')
        try:
            normed, mean, stdinv = theano.sandbox.cuda.dnn.dnn_batch_normalization_train(
                x, broadcast_gamma, broadcast_beta, 'spatial', epsilon)
            var = T.inv(stdinv ** 2)
            return normed, T.flatten(mean), T.flatten(var)
        except AttributeError:
            pass

    var = x.var(reduction_axes)
    mean = x.mean(reduction_axes)

    target_shape = []
    for axis in range(ndim(x)):
        if axis in reduction_axes:
            target_shape.append(1)
        else:
            target_shape.append(x.shape[axis])
    target_shape = T.stack(*target_shape)

    broadcast_mean = T.reshape(mean, target_shape)
    broadcast_var = T.reshape(var, target_shape)
    broadcast_beta = T.reshape(beta, target_shape)
    broadcast_gamma = T.reshape(gamma, target_shape)
    normed = batch_normalization(x, broadcast_mean, broadcast_var,
                                 broadcast_beta, broadcast_gamma,
                                 epsilon)
    return normed, mean, var


# TODO remove this if statement when Theano without
# T.nnet.bn.batch_normalization_test is deprecated
def get_output_for(self, input, style=None, **kwargs):
    mean = input.mean(self.axes)
    inv_std = T.inv(T.sqrt(input.var(self.axes) + self.epsilon))
    pattern = [0, 1, 'x', 'x']

    if style == None:
        pattern_params = ['x', 0, 'x', 'x']
        beta = 0 if self.beta is None else self.beta.dimshuffle(pattern_params)
        gamma = 1 if self.gamma is None else self.gamma.dimshuffle(pattern_params)
    else:
        pattern_params = pattern
        beta = 0 if self.beta is None else self.beta[style].dimshuffle(pattern_params)
        gamma = 1 if self.gamma is None else self.gamma[style].dimshuffle(pattern_params)

    # if self.beta is not None:
    #     beta = ifelse(T.eq(style.shape[0], 1), T.addbroadcast(beta, 0), beta)
    # if self.gamma is not None:
    #     gamma = ifelse(T.eq(style.shape[0], 1), T.addbroadcast(gamma, 0), gamma)

    mean = mean.dimshuffle(pattern)
    inv_std = inv_std.dimshuffle(pattern)

    # normalize
    normalized = (input - mean) * (gamma * inv_std) + beta
    return normalized
def test(self):
    """Test optimization for consecutive functional inverses."""

    dx = numpy.random.rand(5, 4).astype("float32")
    self.assert_func_pair_optimized(T.deg2rad, T.rad2deg, dx)
    dx = numpy.random.rand(5, 4).astype("float32") * 180
    self.assert_func_pair_optimized(T.rad2deg, T.deg2rad, dx)

    # Test the other functional inverses
    dx = numpy.random.rand(5, 4).astype("float32")
    self.assert_func_pair_optimized(T.cosh, T.arccosh, dx)
    self.assert_func_pair_optimized(T.arcsinh, T.sinh, dx)
    self.assert_func_pair_optimized(T.arctanh, T.tanh, dx)
    self.assert_func_pair_optimized(T.inv, T.inv, dx)
    self.assert_func_pair_optimized(T.neg, T.neg, dx)
    cx = dx + complex(0, 1) * (dx + 0.01)
    self.assert_func_pair_optimized(T.conj, T.conj, cx, is_complex=True)

    # Test that non-inverse functions are run normally
    self.assert_func_pair_optimized(T.conj, T.neg, cx,
                                    should_copy=False, is_complex=True)

    dx = numpy.random.rand(5, 4).astype("float32") + 0.01
    self.assert_func_pair_optimized(T.rad2deg, T.rad2deg, dx,
                                    should_copy=False)
    self.assert_func_pair_optimized(T.rad2deg, T.cosh, dx,
                                    should_copy=False)
def test_local_log_erfc(self):
    val = [-30, -27, -26, -11, -10, -3, -2, -1, 0, 1, 2, 3, 10,
           11, 26, 27, 28, 30]
    if theano.config.mode in ["DebugMode", "DEBUG_MODE", "FAST_COMPILE"]:
        # Python mode doesn't like inv(0)
        val.remove(0)
    val = numpy.asarray(val, dtype=config.floatX)
    x = T.vector('x')

    # some NaNs will appear in the graph for the log of negative values
    mode = copy.copy(self.mode)
    mode.check_isfinite = False
    mode_fusion = copy.copy(self.mode_fusion)
    mode_fusion.check_isfinite = False

    f = theano.function([x], T.log(T.erfc(x)), mode=mode)
    assert len(f.maker.fgraph.apply_nodes) == 23, len(f.maker.fgraph.apply_nodes)
    assert f.maker.fgraph.outputs[0].dtype == theano.config.floatX
    assert all(numpy.isfinite(f(val)))

    f = theano.function([x], T.log(T.erfc(-x)), mode=mode)
    assert len(f.maker.fgraph.apply_nodes) == 24, len(f.maker.fgraph.apply_nodes)
    assert f.maker.fgraph.outputs[0].dtype == theano.config.floatX
    assert all(numpy.isfinite(f(-val)))

    f = theano.function([x], T.log(T.erfc(x)), mode=mode_fusion)
    assert len(f.maker.fgraph.apply_nodes) == 1, len(f.maker.fgraph.apply_nodes)
    assert f.maker.fgraph.outputs[0].dtype == theano.config.floatX
    assert len(f.maker.fgraph.toposort()[0].fgraph.toposort()[0].op.scalar_op.fgraph.apply_nodes) == 22, \
        len(f.maker.fgraph.toposort()[0].fgraph.toposort()[0].op.scalar_op.fgraph.apply_nodes)

    # TODO: fix this problem
    if theano.config.floatX == "float32" and theano.config.mode in ["DebugMode", "DEBUG_MODE"]:
        raise SkipTest('The python code upcast somewhere internally '
                       'some value of float32 to python float for '
                       'part of its computation. That make that the '
                       'c and python code dont generate the same value. '
                       'You can ignore this error.')
    assert all(numpy.isfinite(f(val)))
def __call__(self, x):
    axes = range(x.ndim)
    axes.remove(self.axis)
    axes = tuple(axes)

    input_mean = x.mean(axes)
    input_inv_std = T.inv(T.sqrt(x.var(axes) + self.epsilon))

    if self.train:
        mean = input_mean
        inv_std = input_inv_std
    else:
        if self.collect:
            mean = self.mean
            inv_std = self.inv_std
        else:
            mean = input_mean
            inv_std = input_inv_std

    self.updates = {}
    if self.train:
        if self.collect:
            self.updates[self.mean] = (1 - self.alpha) * self.mean + self.alpha * input_mean
            self.updates[self.inv_std] = (1 - self.alpha) * self.inv_std + self.alpha * input_inv_std

    # prepare dimshuffle pattern inserting broadcastable axes as needed
    param_axes = iter(range(x.ndim - len(axes)))
    pattern = ['x' if input_axis in axes else next(param_axes)
               for input_axis in range(x.ndim)]

    # apply dimshuffle pattern to all parameters
    beta = self.beta.dimshuffle(pattern)
    gamma = self.gamma.dimshuffle(pattern)
    mean = mean.dimshuffle(pattern)
    inv_std = inv_std.dimshuffle(pattern)

    # normalize
    normalized = (x - mean) * (gamma * inv_std) + beta
    return normalized
def __call__(self, x):
    mean = x.mean(1, keepdims=True)
    inv_std = T.inv(T.sqrt(x.var(1, keepdims=True) + self.epsilon))

    pattern = ['x', 0] + ['x' for _ in xrange(x.ndim - 2)]
    beta = self.beta.dimshuffle(tuple(pattern))
    gamma = self.gamma.dimshuffle(tuple(pattern))

    # normalize
    normalized = (x - mean) * gamma * inv_std + beta
    return normalized
def nll_loss(self, mus, sigmas, corxy, pis, y_true):
    r"""
    Negative log likelihood loss of a 2d y_true coordinate in
    each of the Gaussians with parameters mus, sigmas, corxy, pis.
    Note that the mus, sigmas and corxy are shared between all samples
    and only pis are different for each sample.

    The formula for the negative log likelihood is:
    \mathcal{L}(y \vert x) = - \log\bigg\{\sum_{k=1}^K \pi_k(x)
    \mathcal{N}\big(y \vert \mu_k(x), \Sigma_k(x)\big)\bigg\}

    The size of pis is n_batch x n_components,
    the size of mus is n_batch x n_components x 2,
    the size of sigmas is n_batch x n_components x 2 and
    the size of corxy is n_batch x n_components.
    The size of y_true is batch_size x 2.
    """
    Y = y_true[:, :, np.newaxis]
    diff = Y - mus
    diffprod = T.prod(diff, axis=-2)
    sigmainvs = T.inv(sigmas)
    sigmainvprods = sigmainvs[:, 0, :] * sigmainvs[:, 1, :]
    sigmas2 = sigmas ** 2
    corxy2 = corxy ** 2
    diff2 = diff ** 2
    diffsigma = diff2 * T.inv(sigmas2)
    diffsigmanorm = T.sum(diffsigma, axis=-2)
    z = diffsigmanorm - 2 * corxy * diffprod * sigmainvprods
    oneminuscorxy2inv = T.inv(1.0 - corxy2)
    '''
    expterm = T.exp(-0.5 * z * oneminuscorxy2inv)
    probs = (0.5 / np.pi) * sigmainvprods * T.sqrt(oneminuscorxy2inv) * expterm
    loss = - T.log(T.sum(pis * probs, axis=1))
    loss = T.mean(loss)
    '''
    # logsumexp trick
    exponent = -0.5 * z * oneminuscorxy2inv
    # normalizer = (0.5 / np.pi) * sigmainvprods * T.sqrt(oneminuscorxy2inv)
    # when something is a * exp(x), rewrite it as exp(x + log a)
    new_exponent = (exponent + T.log(0.5 / np.pi) + T.log(sigmainvprods) +
                    T.log(T.sqrt(oneminuscorxy2inv)) + T.log(pis))
    max_exponent = T.max(new_exponent, axis=1, keepdims=True)
    mod_exponent = new_exponent - max_exponent
    gauss_mix = T.sum(T.exp(mod_exponent), axis=1)
    log_gauss = max_exponent + T.log(gauss_mix)
    loss = -T.mean(log_gauss)
    return loss
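For reference, the identity behind the "logsumexp trick" used above (standard math, not taken from the project) is

\log\sum_{k} \pi_k e^{x_k} \;=\; \log\sum_{k} e^{x_k + \log \pi_k} \;=\; m + \log\sum_{k} e^{(x_k + \log \pi_k) - m}, \qquad m = \max_k\,(x_k + \log \pi_k),

which is what the code computes via new_exponent, max_exponent and gauss_mix to avoid underflow in the exponentials.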
def get_output_for(self, input, deterministic=False, **kwargs):
    input_mean = input.mean(self.axes)
    input_std = TT.sqrt(input.var(self.axes) + self.epsilon)

    # Decide whether to use the stored averages or mini-batch statistics
    use_averages = kwargs.get('batch_norm_use_averages',
                              deterministic)
    if use_averages:
        mean = self.mean
        std = self.std
    else:
        mean = input_mean
        std = input_std

    # Decide whether to update the stored averages
    update_averages = kwargs.get('batch_norm_update_averages',
                                 not deterministic)
    if update_averages:
        # Trick: To update the stored statistics, we create memory-aliased
        # clones of the stored statistics:
        running_mean = theano.clone(self.mean, share_inputs=False)
        running_std = theano.clone(self.std, share_inputs=False)
        # set a default update for them:
        running_mean.default_update = ((1 - self.alpha) * running_mean +
                                       self.alpha * input_mean)
        running_std.default_update = ((1 - self.alpha) * running_std +
                                      self.alpha * input_std)
        # and make sure they end up in the graph without participating in
        # the computation (this way their default_update will be collected
        # and applied, but the computation will be optimized away):
        mean += 0 * running_mean
        std += 0 * running_std

    # prepare dimshuffle pattern inserting broadcastable axes as needed
    param_axes = iter(list(range(input.ndim - len(self.axes))))
    pattern = ['x' if input_axis in self.axes else next(param_axes)
               for input_axis in range(input.ndim)]

    # apply dimshuffle pattern to all parameters
    beta = 0 if self.beta is None else self.beta.dimshuffle(pattern)
    gamma = 1 if self.gamma is None else self.gamma.dimshuffle(pattern)
    mean = mean.dimshuffle(pattern)
    std = std.dimshuffle(pattern)

    # normalize
    normalized = (input - mean) * (gamma * TT.inv(std)) + beta
    return normalized
def _old_normalize_batch_in_training(x, gamma, beta, reduction_axes, epsilon=1e-3):
    """Computes mean and std for batch then apply batch_normalization on batch.
    """
    dev = theano.config.device
    use_cudnn = (ndim(x) < 5 and reduction_axes == [0, 2, 3] and
                 (dev.startswith('cuda') or dev.startswith('gpu')))
    if use_cudnn:
        broadcast_beta = beta.dimshuffle('x', 0, 'x', 'x')
        broadcast_gamma = gamma.dimshuffle('x', 0, 'x', 'x')
        try:
            normed, mean, stdinv = theano.sandbox.cuda.dnn.dnn_batch_normalization_train(
                x, broadcast_gamma, broadcast_beta, 'spatial', epsilon)
            normed = theano.tensor.as_tensor_variable(normed)
            mean = theano.tensor.as_tensor_variable(mean)
            stdinv = theano.tensor.as_tensor_variable(stdinv)
            var = T.inv(stdinv ** 2)
            return normed, T.flatten(mean), T.flatten(var)
        except AttributeError:
            pass

    var = x.var(reduction_axes)
    mean = x.mean(reduction_axes)

    target_shape = []
    for axis in range(ndim(x)):
        if axis in reduction_axes:
            target_shape.append(1)
        else:
            target_shape.append(x.shape[axis])
    target_shape = T.stack(*target_shape)

    broadcast_mean = T.reshape(mean, target_shape)
    broadcast_var = T.reshape(var, target_shape)
    broadcast_beta = T.reshape(beta, target_shape)
    broadcast_gamma = T.reshape(gamma, target_shape)
    normed = batch_normalization(x, broadcast_mean, broadcast_var,
                                 broadcast_beta, broadcast_gamma,
                                 epsilon)
    return normed, mean, var


# TODO remove this if statement when Theano without
# T.nnet.bn.batch_normalization_test is deprecated
def get_output_for(self, input, deterministic=False, **kwargs):
    input_mean = input.mean(self.axes)
    input_inv_std = T.inv(T.sqrt(input.var(self.axes) + self.epsilon))

    # Decide whether to use the stored averages or mini-batch statistics
    use_averages = kwargs.get('batch_norm_use_averages',
                              deterministic)
    if use_averages:
        mean = self.mean
        inv_std = self.inv_std
    else:
        mean = input_mean
        inv_std = input_inv_std

    # Decide whether to update the stored averages
    update_averages = kwargs.get('batch_norm_update_averages',
                                 not deterministic)
    if update_averages:
        # Trick: To update the stored statistics, we create memory-aliased
        # clones of the stored statistics:
        running_mean = theano.clone(self.mean, share_inputs=False)
        running_inv_std = theano.clone(self.inv_std, share_inputs=False)
        # set a default update for them:
        running_mean.default_update = ((1 - self.alpha) * running_mean +
                                       self.alpha * input_mean)
        running_inv_std.default_update = ((1 - self.alpha) *
                                          running_inv_std +
                                          self.alpha * input_inv_std)
        # and make sure they end up in the graph without participating in
        # the computation (this way their default_update will be collected
        # and applied, but the computation will be optimized away):
        mean += 0 * running_mean
        inv_std += 0 * running_inv_std

    # prepare dimshuffle pattern inserting broadcastable axes as needed
    param_axes = iter(range(input.ndim - len(self.axes)))
    pattern = ['x' if input_axis in self.axes else next(param_axes)
               for input_axis in range(input.ndim)]

    # apply dimshuffle pattern to all parameters
    beta = 0 if self.beta is None else self.beta.dimshuffle(pattern)
    gamma = 1 if self.gamma is None else self.gamma.dimshuffle(pattern)
    mean = mean.dimshuffle(pattern)
    inv_std = inv_std.dimshuffle(pattern)

    # normalize
    normalized = (input - mean) * (gamma * inv_std) + beta
    return normalized
def get_output_for(self, input, deterministic=False,
                   batch_norm_use_averages=None,
                   batch_norm_update_averages=None):
    input_mean = input.mean(self.axes)
    input_inv_std = T.inv(T.sqrt(input.var(self.axes) + self.epsilon))

    # decide whether to use the stored averages or mini-batch statistics
    if batch_norm_use_averages is None:
        batch_norm_use_averages = deterministic
    use_averages = batch_norm_use_averages

    if use_averages:
        mean = self.mean
        inv_std = self.inv_std
    else:
        mean = input_mean
        inv_std = input_inv_std

    # decide whether to update the stored averages
    if batch_norm_update_averages is None:
        batch_norm_update_averages = not deterministic
    update_averages = batch_norm_update_averages

    if update_averages:
        # Trick: To update the stored statistics, we create memory-aliased
        # clones of the stored statistics.
        running_mean = theano.clone(self.mean, share_inputs=False)
        running_inv_std = theano.clone(self.inv_std, share_inputs=False)
        # set a default update for them
        running_mean.default_update = ((1 - self.alpha) * running_mean +
                                       self.alpha * input_mean)
        running_inv_std.default_update = ((1 - self.alpha) *
                                          running_inv_std +
                                          self.alpha * input_inv_std)
        # and make sure they end up in the graph without participating in
        # the computation (this way their default_update will be collected
        # and applied, but the computation will be optimized away):
        mean += 0 * running_mean
        inv_std += 0 * running_inv_std

    # prepare dimshuffle pattern inserting broadcastable axes as needed
    param_axes = iter(range(input.ndim - len(self.axes)))
    pattern = ['x' if input_axis in self.axes else next(param_axes)
               for input_axis in range(input.ndim)]

    # apply dimshuffle pattern to all parameters
    beta = 0 if self.beta is None else self.beta.dimshuffle(pattern)
    gamma = 1 if self.gamma is None else self.gamma.dimshuffle(pattern)
    mean = mean.dimshuffle(pattern)
    inv_std = inv_std.dimshuffle(pattern)

    # normalize
    normalized = (input - mean) * (gamma * inv_std) + beta
    return normalized
def _old_normalize_batch_in_training(x, gamma, beta, reduction_axes, epsilon=1e-3):
    """Computes mean and std for batch then apply batch_normalization on batch.
    """
    if gamma is None:
        gamma = ones_like(x)
    if beta is None:
        beta = zeros_like(x)

    dev = theano.config.device
    use_cudnn = (ndim(x) < 5 and reduction_axes == [0, 2, 3] and
                 (dev.startswith('cuda') or dev.startswith('gpu')))
    if use_cudnn:
        broadcast_beta = beta.dimshuffle('x', 0, 'x', 'x')
        broadcast_gamma = gamma.dimshuffle('x', 0, 'x', 'x')
        try:
            normed, mean, stdinv = theano.sandbox.cuda.dnn.dnn_batch_normalization_train(
                x, broadcast_gamma, broadcast_beta, 'spatial', epsilon)
            normed = theano.tensor.as_tensor_variable(normed)
            mean = theano.tensor.as_tensor_variable(mean)
            stdinv = theano.tensor.as_tensor_variable(stdinv)
            var = T.inv(stdinv ** 2)
            return normed, T.flatten(mean), T.flatten(var)
        except AttributeError:
            pass

    var = x.var(reduction_axes)
    mean = x.mean(reduction_axes)

    target_shape = []
    for axis in range(ndim(x)):
        if axis in reduction_axes:
            target_shape.append(1)
        else:
            target_shape.append(x.shape[axis])
    target_shape = T.stack(*target_shape)

    broadcast_mean = T.reshape(mean, target_shape)
    broadcast_var = T.reshape(var, target_shape)
    broadcast_beta = T.reshape(beta, target_shape)
    broadcast_gamma = T.reshape(gamma, target_shape)
    normed = batch_normalization(x, broadcast_mean, broadcast_var,
                                 broadcast_beta, broadcast_gamma,
                                 epsilon)
    return normed, mean, var


# TODO remove this if statement when Theano without
# T.nnet.bn.batch_normalization_test is deprecated
def test_dnn_batchnorm_train():
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    if dnn.version(raises=False) < 5000:
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()

    for mode in ('per-activation', 'spatial'):
        for vartype in (T.ftensor5, T.ftensor4, T.ftensor3, T.fmatrix, T.fvector):
            x, scale, bias = (vartype(n) for n in ('x', 'scale', 'bias'))
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used

            # forward pass
            out, x_mean, x_invstd = dnn.dnn_batch_normalization_train(
                x, scale, bias, mode, eps)
            # reference forward pass
            if mode == 'per-activation':
                axes = (0,)
            elif mode == 'spatial':
                axes = (0,) + tuple(range(2, ndim))
            x_mean2 = x.mean(axis=axes, keepdims=True)
            x_invstd2 = T.inv(T.sqrt(x.var(axis=axes, keepdims=True) + eps))
            scale2 = T.addbroadcast(scale, *axes)
            bias2 = T.addbroadcast(bias, *axes)
            out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2
            # backward pass
            dy = vartype('dy')
            grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
            # reference backward pass
            grads2 = T.grad(None, wrt=[x, scale, bias], known_grads={out2: dy})
            # compile
            f = theano.function([x, scale, bias, dy],
                                [out, x_mean, x_invstd, out2, x_mean2, x_invstd2] +
                                grads + grads2, mode=mode_with_gpu)
            # run
            for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)):
                data_shape = data_shape[:ndim]
                param_shape = tuple(1 if d in axes else s
                                    for d, s in enumerate(data_shape))
                X = 4 + 3 * numpy.random.randn(*data_shape).astype('float32')
                Dy = -1 + 2 * numpy.random.randn(*data_shape).astype('float32')
                Scale = numpy.random.randn(*param_shape).astype('float32')
                Bias = numpy.random.randn(*param_shape).astype('float32')
                outputs = f(X, Scale, Bias, Dy)
                # compare outputs
                utt.assert_allclose(outputs[0], outputs[0 + 3])  # out
                utt.assert_allclose(outputs[1], outputs[1 + 3])  # mean
                utt.assert_allclose(outputs[2], outputs[2 + 3])  # invstd
                # compare gradients
                utt.assert_allclose(outputs[6], outputs[6 + 3], atol=1e-4)  # dx
                utt.assert_allclose(outputs[7], outputs[7 + 3], rtol=2e-4, atol=1e-4)  # dscale
                utt.assert_allclose(outputs[8], outputs[8 + 3])  # dbias
def test_local_pow_specialize():
    mode = theano.config.mode
    if mode == 'FAST_COMPILE':
        mode = 'FAST_RUN'
    mode = compile.mode.get_mode(mode)
    mode = mode.excluding('fusion')

    v = T.vector()
    val = numpy.arange(10, dtype=theano.config.floatX)
    val_no0 = numpy.arange(1, 10, dtype=theano.config.floatX)

    f = function([v], v ** 0, mode=mode)
    nodes = [node.op for node in f.maker.fgraph.toposort()]
    assert nodes == [Shape_i(0), T.alloc]
    utt.assert_allclose(f(val), val ** 0)

    f = function([v], v ** 1, mode=mode)
    nodes = [node.op for node in f.maker.fgraph.toposort()]
    assert nodes == [deep_copy_op]
    utt.assert_allclose(f(val), val ** 1)

    f = function([v], v ** (-1), mode=mode)
    nodes = [node.op for node in f.maker.fgraph.toposort()]
    assert nodes == [T.inv]
    utt.assert_allclose(f(val_no0), val_no0 ** (-1))

    f = function([v], v ** 2, mode=mode)
    nodes = [node.op for node in f.maker.fgraph.toposort()]
    assert nodes == [T.sqr]
    utt.assert_allclose(f(val), val ** 2)

    f = function([v], v ** (-2), mode=mode)
    nodes = [node.op for node in f.maker.fgraph.toposort()]
    assert len(nodes) == 2
    assert nodes[0] == T.sqr
    assert isinstance(nodes[1].scalar_op, theano.scalar.basic.Inv)
    # assert nodes == [T.sqr, T.inv]  # Why doesn't this work?
    utt.assert_allclose(f(val_no0), val_no0 ** (-2))

    f = function([v], v ** (.5), mode=mode)
    nodes = [node.op for node in f.maker.fgraph.toposort()]
    assert nodes == [T.sqrt]
    utt.assert_allclose(f(val), val ** (.5))

    f = function([v], v ** (-.5), mode=mode)
    nodes = [node.op for node in f.maker.fgraph.toposort()]
    assert len(nodes) == 2
    assert nodes[0] == T.sqrt
    assert isinstance(nodes[1].scalar_op, theano.scalar.basic.Inv)
    # assert nodes == [T.sqrt, T.inv]  # Why doesn't this work?
    utt.assert_allclose(f(val_no0), val_no0 ** (-.5))
def test_batchnorm_train():
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    if cuda.dnn.version() < (5000, 5000):
        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()

    for mode in ('per-activation', 'spatial'):
        for vartype in (T.ftensor5, T.ftensor4, T.ftensor3, T.fmatrix, T.fvector):
            x, scale, bias = (vartype(n) for n in ('x', 'scale', 'bias'))
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used

            # forward pass
            out, x_mean, x_invstd = cuda.dnn.dnn_batch_normalization_train(
                x, scale, bias, mode, eps)
            # reference forward pass
            if mode == 'per-activation':
                axes = (0,)
            elif mode == 'spatial':
                axes = (0,) + tuple(range(2, ndim))
            x_mean2 = x.mean(axis=axes, keepdims=True)
            x_invstd2 = T.inv(T.sqrt(x.var(axis=axes, keepdims=True) + eps))
            scale2 = T.addbroadcast(scale, *axes)
            bias2 = T.addbroadcast(bias, *axes)
            out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2
            # backward pass
            dy = vartype('dy')
            grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
            # reference backward pass
            grads2 = T.grad(None, wrt=[x, scale, bias], known_grads={out2: dy})
            # compile
            f = theano.function([x, scale, bias, dy],
                                [out, x_mean, x_invstd, out2, x_mean2, x_invstd2] +
                                grads + grads2, mode=mode_with_gpu)
            # run
            for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)):
                data_shape = data_shape[:ndim]
                param_shape = tuple(1 if d in axes else s
                                    for d, s in enumerate(data_shape))
                X = 4 + 3 * numpy.random.randn(*data_shape).astype('float32')
                Dy = -1 + 2 * numpy.random.randn(*data_shape).astype('float32')
                Scale = numpy.random.randn(*param_shape).astype('float32')
                Bias = numpy.random.randn(*param_shape).astype('float32')
                outputs = f(X, Scale, Bias, Dy)
                # compare outputs
                utt.assert_allclose(outputs[0], outputs[0 + 3])  # out
                utt.assert_allclose(outputs[1], outputs[1 + 3])  # mean
                utt.assert_allclose(outputs[2], outputs[2 + 3])  # invstd
                # compare gradients
                utt.assert_allclose(outputs[6], outputs[6 + 3], atol=1e-4)  # dx
                utt.assert_allclose(outputs[7], outputs[7 + 3], rtol=2e-4, atol=1e-4)  # dscale
                utt.assert_allclose(outputs[8], outputs[8 + 3])  # dbias
def get_output(self, input, **kwargs):
    input_mean = input.mean(self.axes)
    input_inv_std = T.inv(T.sqrt(input.var(self.axes) + self.epsilon))
    # input_inv_std = T.inv(T.sqrt(input.var(self.axes)) + 1E-6)

    # Decide whether to use the stored averages or mini-batch statistics
    use_averages = self.deterministic
    if use_averages:
        mean = self.mean
        inv_std = self.inv_std
    else:
        mean = input_mean
        inv_std = input_inv_std

    # Decide whether to update the stored averages
    update_averages = self.update_averages and not use_averages
    if update_averages:
        # Trick: To update the stored statistics, we create memory-aliased
        # clones of the stored statistics:
        running_mean = theano.clone(self.mean, share_inputs=False)
        running_inv_std = theano.clone(self.inv_std, share_inputs=False)
        # set a default update for them:
        running_mean.default_update = ((1 - self.alpha) * running_mean +
                                       self.alpha * input_mean)
        running_inv_std.default_update = ((1 - self.alpha) *
                                          running_inv_std +
                                          self.alpha * input_inv_std)
        # and make sure they end up in the graph without participating in
        # the computation (this way their default_update will be collected
        # and applied, but the computation will be optimized away):
        mean += 0 * running_mean
        inv_std += 0 * running_inv_std

    # prepare dimshuffle pattern inserting broadcastable axes as needed
    param_axes = iter(list(range(input.ndim - len(self.axes))))
    pattern = ['x' if input_axis in self.axes else next(param_axes)
               for input_axis in range(input.ndim)]

    # apply dimshuffle pattern to all parameters
    beta = 0 if self.beta is None else self.beta.dimshuffle(pattern)
    gamma = 1 if self.gamma is None else self.gamma.dimshuffle(pattern)
    mean = mean.dimshuffle(pattern)
    inv_std = inv_std.dimshuffle(pattern)

    # normalize
    normalized = (input - mean) * (gamma * inv_std) + beta
    return normalized
def get_output(self, input, **kwargs):
    input_mean = input.mean(self.axes)
    # input_std = T.inv(T.sqrt(input.var(self.axes) + self.epsilon))
    input_std = T.sqrt(input.var(self.axes) + self.epsilon)

    # Decide whether to use the stored averages or mini-batch statistics
    use_averages = self.deterministic
    if use_averages:
        mean = self.mean
        std = self.std
    else:
        mean = input_mean
        std = input_std

    # Decide whether to update the stored averages
    update_averages = self.update_averages and not use_averages
    if update_averages:
        # Trick: To update the stored statistics, we create memory-aliased
        # clones of the stored statistics:
        running_mean = theano.clone(self.mean, share_inputs=False)
        running_std = theano.clone(self.std, share_inputs=False)
        # set a default update for them:
        running_mean.default_update = ((1 - self.alpha) * running_mean +
                                       self.alpha * input_mean)
        running_std.default_update = ((1 - self.alpha) * running_std +
                                      self.alpha * input_std)
        # and make sure they end up in the graph without participating in
        # the computation (this way their default_update will be collected
        # and applied, but the computation will be optimized away):
        mean += 0 * running_mean
        std += 0 * running_std

    # prepare dimshuffle pattern inserting broadcastable axes as needed
    param_axes = iter(list(range(input.ndim - len(self.axes))))
    pattern = ['x' if input_axis in self.axes else next(param_axes)
               for input_axis in range(input.ndim)]

    # apply dimshuffle pattern to all parameters
    beta = 0 if self.beta is None else self.beta.dimshuffle(pattern)
    gamma = 1 if self.gamma is None else self.gamma.dimshuffle(pattern)
    mean = mean.dimshuffle(pattern)
    std = std.dimshuffle(pattern)

    # normalize
    # normalized = (input - mean) * (gamma * std) + beta
    normalized = batch_normalization(
        input, gamma, beta, mean, std, mode='low_mem')
    return self.activation(normalized)