Merge pull request #808 from SimonKohl/add_instance_norm
Implementation of instance normalization and layer normalization
f0k committed Jun 10, 2018
2 parents 7992faa + 18f7ee0 commit a61b76f
Showing 4 changed files with 500 additions and 0 deletions.
3 changes: 3 additions & 0 deletions docs/modules/layers.rst
@@ -166,6 +166,9 @@
    LocalResponseNormalization2DLayer
    BatchNormLayer
    batch_norm
    StandardizationLayer
    instance_norm
    layer_norm


.. rubric:: :doc:`layers/embedding`
7 changes: 7 additions & 0 deletions docs/modules/layers/normalization.rst
@@ -13,3 +13,10 @@ Normalization layers

.. autofunction:: batch_norm

.. autoclass:: StandardizationLayer
    :members:

.. autofunction:: instance_norm

.. autofunction:: layer_norm

243 changes: 243 additions & 0 deletions lasagne/layers/normalization.py
@@ -50,6 +50,9 @@
"LocalResponseNormalization2DLayer",
"BatchNormLayer",
"batch_norm",
"StandardizationLayer",
"instance_norm",
"layer_norm",
]


@@ -376,3 +379,243 @@ def batch_norm(layer, **kwargs):
        nonlin_name = bn_name and bn_name + '_nonlin'
        layer = NonlinearityLayer(layer, nonlinearity, name=nonlin_name)
    return layer


class StandardizationLayer(Layer):
    """
    Standardize inputs to zero mean and unit variance:

    .. math::
        y_i = \\frac{x_i - \\mu_i}{\\sqrt{\\sigma_i^2 + \\epsilon}}

    The mean :math:`\\mu_i` and variance :math:`\\sigma_i^2` are computed and
    shared across a given set of axes. In contrast to batch normalization,
    these axes usually do not include the batch dimension, so each example is
    normalized independently from other examples in the minibatch, both during
    training and testing.

    The :class:`StandardizationLayer` can be employed to realize instance
    normalization [1]_ and layer normalization [2]_, for both of which
    convenience functions (:func:`instance_norm` and :func:`layer_norm`) are
    available.

    Parameters
    ----------
    incoming : a :class:`Layer` instance or a tuple
        The layer feeding into this layer, or the expected input shape
    axes : 'auto', 'spatial', 'features', int or tuple of int
        The axis or axes to normalize over. If ``'auto'`` (the default),
        two-dimensional inputs are normalized over the last dimension (i.e.,
        this will normalize over units for dense layers), while input tensors
        with more than two dimensions are normalized over all but the first
        two dimensions (i.e., this will normalize over all spatial dimensions
        for convolutional layers). If ``'spatial'``, will normalize over all
        but the first two dimensions. If ``'features'``, will normalize over
        all but the first dimension.
    epsilon : scalar
        Small constant :math:`\\epsilon` added to the variance before taking
        the square root and dividing by it, to avoid numerical problems
    **kwargs
        Any additional keyword arguments are passed to the :class:`Layer`
        superclass.

    See also
    --------
    instance_norm : Convenience function to apply instance normalization
    layer_norm : Convenience function to apply layer normalization to a layer

    References
    ----------
    .. [1] Ulyanov, D., Vedaldi, A., & Lempitsky, V. (2016):
           Instance Normalization: The Missing Ingredient for Fast Stylization.
           https://arxiv.org/abs/1607.08022.
    .. [2] Ba, J., Kiros, J., & Hinton, G. (2016):
           Layer normalization.
           https://arxiv.org/abs/1607.06450.
    """
    def __init__(self, incoming, axes='auto', epsilon=1e-4, **kwargs):
        super(StandardizationLayer, self).__init__(incoming, **kwargs)

        if axes == 'auto':
            # default: normalize across 2nd dimension for 2D inputs
            # and across all but the first two axes for 3D+ inputs
            if len(self.input_shape) == 2:
                axes = (1,)
            else:
                axes = tuple(range(2, len(self.input_shape)))
        elif axes == 'spatial':
            # normalize over spatial dimensions only,
            # separate for each instance in the batch
            axes = tuple(range(2, len(self.input_shape)))
        elif axes == 'features':
            # normalize over features and spatial dimensions,
            # separate for each instance in the batch
            axes = tuple(range(1, len(self.input_shape)))
        elif isinstance(axes, int):
            axes = (axes,)
        self.axes = axes

        self.epsilon = epsilon

    def get_output_for(self, input, **kwargs):
        mean = input.mean(self.axes, keepdims=True)
        std = T.sqrt(input.var(self.axes, keepdims=True) + self.epsilon)
        return (input - mean) / std
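The axes options map directly onto reduction axes: for a 4D convolutional input, ``'spatial'`` reduces over axes (2, 3) and ``'features'`` over axes (1, 2, 3). A minimal standalone sketch, assuming Theano and a Lasagne build that includes this layer (the input shape and sizes below are illustrative assumptions, not taken from the commit):

# Hypothetical check of StandardizationLayer on its own: every example/channel
# slice should come out with roughly zero mean and unit variance.
import numpy as np
import theano
import theano.tensor as T
from lasagne.layers import InputLayer, StandardizationLayer, get_output

x = T.tensor4('x')
l_in = InputLayer((None, 3, 8, 8), input_var=x)
l_std = StandardizationLayer(l_in, axes='spatial')   # reduce over axes (2, 3)
fn = theano.function([x], get_output(l_std))

data = np.random.randn(2, 3, 8, 8).astype(theano.config.floatX)
out = fn(data)
print(out.mean(axis=(2, 3)))   # close to 0 for each example and channel
print(out.std(axis=(2, 3)))    # close to 1 (epsilon keeps it slightly below)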


def instance_norm(layer, learn_scale=True, learn_bias=True, **kwargs):
    """
    Apply instance normalization to an existing layer. This is a convenience
    function modifying an existing layer to include instance normalization: It
    will steal the layer's nonlinearity if there is one (effectively
    introducing the normalization right before the nonlinearity), remove
    the layer's bias if there is one (because it would have no effect), and
    add a :class:`StandardizationLayer` and :class:`NonlinearityLayer` on top.
    Depending on the given arguments, an additional :class:`ScaleLayer` and
    :class:`BiasLayer` will be inserted in between.

    In effect, it will separately standardize each feature map of each input
    example, followed by an optional scale and shift learned per channel,
    followed by the original nonlinearity, as proposed in [1]_.

    Parameters
    ----------
    layer : A :class:`Layer` instance
        The layer to apply the normalization to; note that it will be
        irreversibly modified as specified above
    learn_scale : bool (default: True)
        Whether to add a ScaleLayer after the StandardizationLayer
    learn_bias : bool (default: True)
        Whether to add a BiasLayer after the StandardizationLayer (or the
        optional ScaleLayer)
    **kwargs
        Any additional keyword arguments are passed on to the
        :class:`StandardizationLayer` constructor.

    Returns
    -------
    StandardizationLayer, ScaleLayer, BiasLayer, or NonlinearityLayer instance
        The last layer stacked on top of the given modified `layer` to
        implement instance normalization with optional scaling and shifting.

    Examples
    --------
    Just wrap any layer into a :func:`instance_norm` call on creating it:

    >>> from lasagne.layers import InputLayer, Conv2DLayer, instance_norm
    >>> from lasagne.nonlinearities import rectify
    >>> l1 = InputLayer((10, 3, 28, 28))
    >>> l2 = instance_norm(Conv2DLayer(l1, num_filters=64, filter_size=3,
    ...                                nonlinearity=rectify))

    This introduces instance normalization right before its nonlinearity:

    >>> from lasagne.layers import get_all_layers
    >>> [l.__class__.__name__ for l in get_all_layers(l2)]
    ['InputLayer', 'Conv2DLayer', 'StandardizationLayer', \
     'ScaleLayer', 'BiasLayer', 'NonlinearityLayer']

    References
    ----------
    .. [1] Ulyanov, D., Vedaldi, A., & Lempitsky, V. (2016):
           Instance Normalization: The Missing Ingredient for Fast Stylization.
           https://arxiv.org/abs/1607.08022.
    """
    nonlinearity = getattr(layer, 'nonlinearity', None)
    if nonlinearity is not None:
        layer.nonlinearity = nonlinearities.identity
    if hasattr(layer, 'b') and layer.b is not None:
        del layer.params[layer.b]
        layer.b = None
    in_name = (kwargs.pop('name', None) or
               (getattr(layer, 'name', None) and layer.name + '_in'))
    layer = StandardizationLayer(layer, axes='spatial', name=in_name, **kwargs)
    if learn_scale:
        from .special import ScaleLayer
        scale_name = in_name and in_name + '_scale'
        layer = ScaleLayer(layer, shared_axes='auto', name=scale_name)
    if learn_bias:
        from .special import BiasLayer
        bias_name = in_name and in_name + '_bias'
        layer = BiasLayer(layer, shared_axes='auto', name=bias_name)
    if nonlinearity is not None:
        from .special import NonlinearityLayer
        nonlin_name = in_name and in_name + '_nonlin'
        layer = NonlinearityLayer(layer, nonlinearity, name=nonlin_name)
    return layer
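As a rough usage sketch, the doctest network above can also be compiled and evaluated end to end, assuming Theano and Lasagne are available (the batch size, filter count, and input shape are illustrative assumptions):

# Illustrative only: build the wrapped convolution and run a forward pass.
import numpy as np
import theano
import theano.tensor as T
from lasagne.layers import InputLayer, Conv2DLayer, instance_norm, get_output
from lasagne.nonlinearities import rectify

x = T.tensor4('x')
l1 = InputLayer((None, 3, 28, 28), input_var=x)
l2 = instance_norm(Conv2DLayer(l1, num_filters=64, filter_size=3,
                               nonlinearity=rectify))
fn = theano.function([x], get_output(l2))
out = fn(np.random.randn(10, 3, 28, 28).astype(theano.config.floatX))
print(out.shape)   # (10, 64, 26, 26): each of the 64 maps was standardized
                   # per example before the learned scale, shift and rectifier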


def layer_norm(layer, **kwargs):
    """
    Apply layer normalization to an existing layer. This is a convenience
    function modifying an existing layer to include layer normalization: It
    will steal the layer's nonlinearity if there is one (effectively
    introducing the normalization right before the nonlinearity), remove
    the layer's bias if there is one, and add a :class:`StandardizationLayer`,
    :class:`ScaleLayer`, :class:`BiasLayer`, and :class:`NonlinearityLayer` on
    top.

    In effect, it will standardize each input example across the feature and
    spatial dimensions (if any), followed by a scale and shift learned per
    feature, followed by the original nonlinearity, as proposed in [1]_.

    Parameters
    ----------
    layer : A :class:`Layer` instance
        The layer to apply the normalization to; note that it will be
        irreversibly modified as specified above
    **kwargs
        Any additional keyword arguments are passed on to the
        :class:`StandardizationLayer` constructor.

    Returns
    -------
    BiasLayer or NonlinearityLayer instance
        The last layer stacked on top of the given modified `layer` to
        implement layer normalization with feature-wise scaling and shifting.

    Examples
    --------
    Just wrap any layer into a :func:`layer_norm` call on creating it:

    >>> from lasagne.layers import InputLayer, DenseLayer, layer_norm
    >>> from lasagne.nonlinearities import rectify
    >>> l1 = InputLayer((10, 28))
    >>> l2 = layer_norm(DenseLayer(l1, num_units=64, nonlinearity=rectify))

    This introduces layer normalization right before its nonlinearity:

    >>> from lasagne.layers import get_all_layers
    >>> [l.__class__.__name__ for l in get_all_layers(l2)]
    ['InputLayer', 'DenseLayer', 'StandardizationLayer', \
     'ScaleLayer', 'BiasLayer', 'NonlinearityLayer']

    References
    ----------
    .. [1] Ba, J., Kiros, J., & Hinton, G. (2016):
           Layer normalization.
           https://arxiv.org/abs/1607.06450.
    """
    nonlinearity = getattr(layer, 'nonlinearity', None)
    if nonlinearity is not None:
        layer.nonlinearity = nonlinearities.identity
    ln_name = (kwargs.pop('name', None) or
               (getattr(layer, 'name', None) and layer.name + '_ln'))
    if hasattr(layer, 'b') and layer.b is not None:
        del layer.params[layer.b]
        layer.b = None
    layer = StandardizationLayer(layer, axes='features', name=ln_name,
                                 **kwargs)
    scale_name = ln_name and ln_name + '_scale'
    from .special import ScaleLayer
    layer = ScaleLayer(layer, shared_axes='auto', name=scale_name)
    from .special import BiasLayer
    bias_name = ln_name and ln_name + '_bias'
    layer = BiasLayer(layer, shared_axes='auto', name=bias_name)

    if nonlinearity is not None:
        from .special import NonlinearityLayer
        nonlin_name = ln_name and ln_name + '_nonlin'
        layer = NonlinearityLayer(layer, nonlinearity, name=nonlin_name)
    return layer
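For comparison, a hand-built sketch of the stack that layer_norm assembles around a dense layer (the layer sizes are illustrative assumptions): the bias is dropped from the dense layer, standardization runs over the feature axis, and the learned scale, shift, and original nonlinearity follow on top.

# Sketch of the stack produced by layer_norm(DenseLayer(...)), built manually.
from lasagne.layers import (InputLayer, DenseLayer, StandardizationLayer,
                            ScaleLayer, BiasLayer, NonlinearityLayer)
from lasagne.nonlinearities import rectify, identity

l = InputLayer((10, 28))
l = DenseLayer(l, num_units=64, b=None, nonlinearity=identity)  # bias removed
l = StandardizationLayer(l, axes='features')  # standardize each example
l = ScaleLayer(l, shared_axes='auto')         # learned per-feature scale
l = BiasLayer(l, shared_axes='auto')          # learned per-feature shift
l = NonlinearityLayer(l, rectify)             # original nonlinearity on top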
