Commit 494774d3 by Ting PAN

Optimize training update operators

Summary:
This commit fuses the weight decay and the mixed-precision conversion
into the update kernels to reduce training latency.
1 parent fb47d86f
Showing with 2315 additions and 2003 deletions
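As a rough sketch of what the fusion means for a single parameter tensor: the weight-decay term and the low-precision weight copy are now applied inside the same update pass instead of by separate operators. The function and argument names below only mirror the new kernel parameters (lr, momentum, wd, x, g, m, y, y_copy); this is an illustrative NumPy sketch, not Dragon's API.

import numpy as np

def momentum_sgd_update(x, g, m, y, lr, momentum, wd, y_copy=None):
    """Illustrative sketch of the fused MomentumSGD update (not the real API)."""
    # Weight decay is folded into the gradient inside the kernel.
    gi = g + wd * x if wd > 0 else g
    # Classic momentum accumulation, then the in-place weight update.
    m[:] = momentum * m + gi
    y -= lr * m
    # The mixed-precision copy is written in the same pass.
    if y_copy is not None:
        y_copy[:] = y.astype(y_copy.dtype)

x = np.random.randn(8).astype(np.float32)    # fp32 master weights
g = np.random.randn(8).astype(np.float32)    # gradient
m = np.zeros_like(x)                         # momentum buffer
y, y_half = x.copy(), x.astype(np.float16)   # updated weights and their fp16 copy
momentum_sgd_update(x, g, m, y, lr=0.1, momentum=0.9, wd=1e-4, y_copy=y_half)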
......@@ -418,9 +418,9 @@ class Normalize(Layer):
def __call__(self, bottom):
if len(self.blobs) == 0:
self.build(bottom)
outputs = [normalization_ops.lp_normalize(bottom, **self.norm_args)]
outputs = [normalization_ops.lp_norm(bottom, **self.norm_args)]
outputs += [blob['data'] for blob in self.blobs]
return array_ops.channel_affine(outputs, **self.scale_args)
return math_ops.affine(outputs, **self.scale_args)
class Permute(Layer):
......@@ -591,8 +591,7 @@ class Scale(Layer):
param = layer_param.scale_param
self.axis = param.axis
self.num_axes = param.num_axes
end_axis = -1 if self.num_axes < 1 else self.axis + self.num_axes - 1
self.call_args = {'axis': self.axis, 'end_axis': end_axis}
self.call_args = {'axis': list(range(self.axis, self.axis + self.num_axes))}
self.filler = caffe_pb2.FillerParameter(type='constant', value=1)
self.filler = param.filler if param.HasField('filler') else self.filler
self.bias_filler = param.bias_filler
......@@ -609,7 +608,7 @@ class Scale(Layer):
if len(self.blobs) == 0:
self.build(bottom)
inputs = [bottom] + [blob['data'] for blob in self.blobs]
return array_ops.channel_affine(inputs, **self.call_args)
return math_ops.affine(inputs, **self.call_args)
class Slice(Layer):
......
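For clarity on the Scale layer change above: the old (axis, end_axis) pair used by channel_affine is replaced by an explicit axis list handed to math_ops.affine. A small illustrative mapping, using Caffe's ScaleParameter defaults (axis=1, num_axes=1):

axis, num_axes = 1, 1

# Old convention: a contiguous [axis, end_axis] range.
end_axis = -1 if num_axes < 1 else axis + num_axes - 1      # -> 1

# New convention: the explicit axes handed to math_ops.affine.
call_args = {'axis': list(range(axis, axis + num_axes))}    # -> {'axis': [1]}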
......@@ -16,8 +16,8 @@ from __future__ import print_function
from dragon.core.framework import workspace
from dragon.core.io.kpl_record import KPLRecordDataset
from dragon.core.ops import array_ops
from dragon.core.ops import framework_ops
from dragon.core.ops import normalization_ops
from dragon.utils import vision
from dragon.vm.caffe.core.layer import Layer
......@@ -121,5 +121,5 @@ class Data(Layer):
data._shape = (self.data_args['batch_size'],
None, None, len(self.norm_args['mean']))
label._shape = (self.data_args['batch_size'], None)
data = array_ops.channel_normalize(data, **self.norm_args)
data = normalization_ops.channel_norm(data, **self.norm_args)
return data, label
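A minimal usage sketch of the renamed normalization op, assuming mean/std keyword arguments as suggested by self.norm_args above (the real norm_args may carry additional keys such as the output dtype):

from dragon.core.ops import normalization_ops

def preprocess(data, mean=(104., 117., 123.), std=(1., 1., 1.)):
    """Per-channel normalization, mirroring the Data layer call (illustrative)."""
    return normalization_ops.channel_norm(data, mean=list(mean), std=list(std))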
......@@ -9,6 +9,7 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
# ---[ Compiler flags
if (MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_ENABLE_EXTENDED_ALIGNED_STORAGE")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}
/wd4003 /wd4114
......
......@@ -36,16 +36,6 @@ dragon
`cast(...) <dragon/cast.html>`_
: Cast the data type of input.
`channel_affine(...) <dragon/channel_affine.html>`_
: Apply affine transformation to each channel of input.
`channel_normalize(...) <dragon/channel_normalize.html>`_
: Apply normalization to each channel of input.
`channel_shuffle(...) <dragon/channel_shuffle.html>`_
: Apply group shuffle to each channel of input.
`[Zhang et.al, 2017] <https://arxiv.org/abs/1707.01083>`_.
`concat(...) <dragon/concat.html>`_
: Concatenate the inputs along the given axis.
......@@ -211,9 +201,6 @@ dragon
dragon/boolean_mask
dragon/broadcast_to
dragon/cast
dragon/channel_affine
dragon/channel_normalize
dragon/channel_shuffle
dragon/concat
dragon/constant
dragon/device
......
......@@ -24,6 +24,9 @@ dragon.cuda
`memory_allocated(...) <cuda/memory_allocated.html>`_
: Return the size of memory used by tensors in current workspace.
`set_cublas_flags(...) <cuda/set_cublas_flags.html>`_
: Set the flags of cuBLAS library.
`set_cudnn_flags(...) <cuda/set_cudnn_flags.html>`_
: Set the flags of cuDNN library.
......@@ -44,6 +47,7 @@ dragon.cuda
cuda/get_device_capability
cuda/is_available
cuda/memory_allocated
cuda/set_cublas_flags
cuda/set_cudnn_flags
cuda/set_default_device
cuda/set_device
......
channel_affine
==============
set_cublas_flags
================
.. autofunction:: dragon.channel_affine
.. autofunction:: dragon.cuda.set_cublas_flags
.. raw:: html
<style>
h1:before {
content: "dragon.";
content: "dragon.cuda.";
color: #103d3e;
}
</style>
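A possible usage sketch for the new setter. The allow_tf32 keyword is an assumption inferred from the cublasSetFlags(allow_tf32) binding added later in this commit, not a documented signature:

import dragon

# Assumed keyword; enables TF32 tensor-core math for cuBLAS GEMMs.
dragon.cuda.set_cublas_flags(allow_tf32=True)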
......@@ -12,12 +12,18 @@ dragon.math
`add(...) <math/add.html>`_
: Compute the element-wise addition.
`affine(...) <math/affine.html>`_
: Apply the affine transformation to input.
`argmax(...) <math/argmax.html>`_
: Compute the index of maximum elements along the given axis.
`argmin(...) <math/argmin.html>`_
: Compute the index of minimum elements along the given axis.
`atan2(...) <math/atan2.html>`_
: Compute the element-wise arc-tangent of two arguments.
`ceil(...) <math/ceil.html>`_
: Compute the smallest integer not less than input.
......@@ -81,9 +87,6 @@ dragon.math
`logical_xor(...) <math/logical_xor.html>`_
: Compute the element-wise XOR logical operation.
`lp_normalize(...) <math/lp_normalize.html>`_
: Apply the lp normalization.
`matmul(...) <math/matmul.html>`_
: Compute the matrix multiplication.
......@@ -158,8 +161,10 @@ dragon.math
math/abs
math/add
math/affine
math/argmax
math/argmin
math/atan2
math/ceil
math/clip
math/cos
......@@ -181,7 +186,6 @@ dragon.math
math/logical_not
math/logical_or
math/logical_xor
math/lp_normalize
math/matmul
math/max
math/maximum
......
lp_normalize
============
affine
======
.. autofunction:: dragon.math.lp_normalize
.. autofunction:: dragon.math.affine
.. raw:: html
......
channel_normalize
=================
atan2
=====
.. autofunction:: dragon.channel_normalize
.. autofunction:: dragon.math.atan2
.. raw:: html
<style>
h1:before {
content: "dragon.";
content: "dragon.math.";
color: #103d3e;
}
</style>
......@@ -28,6 +28,13 @@ dragon.nn
`bias_add(...) <nn/bias_add.html>`_
: Add the bias across channels to input.
`channel_norm(...) <nn/channel_norm.html>`_
: Apply the normalization to each channel of input.
`channel_shuffle(...) <nn/channel_shuffle.html>`_
: Apply the group shuffle to each channel of input.
`[Zhang et al., 2017] <https://arxiv.org/abs/1707.01083>`_.
`conv(...) <nn/conv.html>`_
: Apply the n-dimension convolution.
......@@ -107,6 +114,9 @@ dragon.nn
`log_softmax(...) <nn/log_softmax.html>`_
: Compute the composite of logarithm and softmax.
`lp_norm(...) <nn/lp_norm.html>`_
: Apply the lp normalization.
`moments(...) <nn/moments.html>`_
: Compute the mean and variance of input along the given axis.
......@@ -157,6 +167,8 @@ dragon.nn
nn/RNN
nn/batch_norm
nn/bias_add
nn/channel_norm
nn/channel_shuffle
nn/conv
nn/conv_transpose
nn/conv1d
......@@ -180,6 +192,7 @@ dragon.nn
nn/leaky_relu
nn/local_response_norm
nn/log_softmax
nn/lp_norm
nn/moments
nn/pool
nn/pool1d
......
channel_norm
============
.. autofunction:: dragon.nn.channel_norm
.. raw:: html
<style>
h1:before {
content: "dragon.nn.";
color: #103d3e;
}
</style>
channel_shuffle
===============
.. autofunction:: dragon.channel_shuffle
.. autofunction:: dragon.nn.channel_shuffle
.. raw:: html
<style>
h1:before {
content: "dragon.";
content: "dragon.nn.";
color: #103d3e;
}
</style>
lp_norm
=======
.. autofunction:: dragon.nn.lp_norm
.. raw:: html
<style>
h1:before {
content: "dragon.nn.";
color: #103d3e;
}
</style>
......@@ -21,6 +21,9 @@ vm.tensorflow.math
`argmin(...) <math/argmin.html>`_
: Compute the index of minimum elements along the given axis.
`atan2(...) <math/atan2.html>`_
: Compute the element-wise arc-tangent of two arguments.
`ceil(...) <math/ceil.html>`_
: Compute the smallest integer not less than input.
......@@ -134,6 +137,7 @@ vm.tensorflow.math
math/add_n
math/argmax
math/argmin
math/atan2
math/ceil
math/cos
math/cumsum
......
channel_normalize
=================
atan2
=====
.. autofunction:: dragon.vm.torch.channel_normalize
.. autofunction:: dragon.vm.tensorflow.math.atan2
.. raw:: html
<style>
h1:before {
content: "torch.";
content: "tf.math.";
color: #103d3e;
}
</style>
......@@ -51,6 +51,9 @@ vm.torch
`argsort(...) <torch/argsort.html>`_
: Return the index of sorted elements along the given dimension.
`atan2(...) <torch/atan2.html>`_
: Compute the element-wise arc-tangent of two arguments.
`baddbmm(...) <torch/baddbmm.html>`_
: Add input to the result of batched matrix-matrix multiplication.
......@@ -75,12 +78,6 @@ vm.torch
`ceil(...) <torch/ceil.html>`_
: Compute the smallest integer not less than input.
`channel_affine(...) <torch/channel_affine.html>`_
: Apply affine transformation to each channel of input.
`channel_normalize(...) <torch/channel_normalize.html>`_
: Apply normalization to each channel of input.
`chunk(...) <torch/chunk.html>`_
: Split input into a specific number of chunks.
......@@ -345,6 +342,7 @@ vm.torch
torch/argmax
torch/argmin
torch/argsort
torch/atan2
torch/baddbmm
torch/bitwise_and
torch/bitwise_not
......@@ -353,8 +351,6 @@ vm.torch
torch/bmm
torch/cat
torch/ceil
torch/channel_affine
torch/channel_normalize
torch/chunk
torch/clamp
torch/cos
......
......@@ -73,6 +73,10 @@ argsort
#######
.. automethod:: dragon.vm.torch.Tensor.argsort
atan2
#####
.. automethod:: dragon.vm.torch.Tensor.atan2
backward
########
.. automethod:: dragon.vm.torch.Tensor.backward
......@@ -699,6 +703,7 @@ zero\_
.. _torch.argmax(...): argmax.html
.. _torch.argmin(...): argmin.html
.. _torch.argsort(...): argsort.html
.. _torch.atan2(...): atan2.html
.. _torch.baddbmm(...): baddbmm.html
.. _torch.bitwise_and(...): bitwise_and.html
.. _torch.bitwise_not(...): bitwise_not.html
......
channel_affine
==============
atan2
=====
.. autofunction:: dragon.vm.torch.channel_affine
.. autofunction:: dragon.vm.torch.atan2
.. raw:: html
......
......@@ -6,12 +6,16 @@ vm.torch.backends
Modules
-------
`Module cuda <backends/cuda.html>`_
: The CUDA backend module.
`Module cudnn <backends/cudnn.html>`_
: The cuDNN backend module.
.. toctree::
:hidden:
backends/cuda
backends/cudnn
.. raw:: html
......
cuda
====
Properties
----------
matmul.allow_tf32
#################
.. data:: dragon.vm.torch.backends.cuda.matmul.allow_tf32
:annotation: = False
The flag that controls whether to allow the TF32 math type for matmul.
Functions
---------
is_built
########
.. automethod:: dragon.vm.torch.backends.cuda.is_built
.. raw:: html
<style>
h1:before {
content: "torch.backends.";
color: #103d3e;
}
</style>
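A brief sketch of toggling the new backend flag, assuming it is writable at runtime in the same way as PyTorch's torch.backends.cuda.matmul.allow_tf32:

from dragon.vm import torch

print(torch.backends.cuda.is_built())         # True when compiled with CUDA
torch.backends.cuda.matmul.allow_tf32 = True  # allow TF32 math for matmul (default: False)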
......@@ -24,8 +24,8 @@ vm.torch.nn
`class AdaptiveMaxPool3d <nn/AdaptiveMaxPool3d.html>`_
: Apply the 3d adaptive max pooling.
`class AffineChannel <nn/AffineChannel.html>`_
: Apply affine transformation along the channels.
`class Affine <nn/Affine.html>`_
: Apply the affine transformation.
`class AvgPool1d <nn/AvgPool1d.html>`_
: Apply the 1d average pooling.
......@@ -312,7 +312,7 @@ vm.torch.nn
nn/AdaptiveMaxPool1d
nn/AdaptiveMaxPool2d
nn/AdaptiveMaxPool3d
nn/AffineChannel
nn/Affine
nn/AvgPool1d
nn/AvgPool2d
nn/AvgPool3d
......
AffineChannel
=============
Affine
======
.. autoclass:: dragon.vm.torch.nn.AffineChannel
.. autoclass:: dragon.vm.torch.nn.Affine
__init__
--------
.. automethod:: dragon.vm.torch.nn.AffineChannel.__init__
.. automethod:: dragon.vm.torch.nn.Affine.__init__
.. _torch.channel_affine(...): ../channel_affine.html
.. _torch.nn.functional.affine(...): functional/affine.html
.. raw:: html
......
......@@ -24,6 +24,9 @@ vm.torch.nn.functional
`adaptive_max_pool3d(...) <functional/adaptive_max_pool3d.html>`_
: Apply the 3d adaptive max pooling to input.
`affine(...) <functional/affine.html>`_
: Apply the affine transformation to input.
`avg_pool1d(...) <functional/avg_pool1d.html>`_
: Apply the 1d average pooling to input.
......@@ -40,8 +43,11 @@ vm.torch.nn.functional
`binary_cross_entropy_with_logits(...) <functional/binary_cross_entropy_with_logits.html>`_
: Compute the sigmoid cross entropy with contiguous target.
`channel_norm(...) <nn/channel_norm.html>`_
: Apply the normalization to each channel of input.
`channel_shuffle(...) <functional/channel_shuffle.html>`_
: Apply group shuffle to each channel of input.
: Apply the group shuffle to each channel of input.
`[Zhang et al., 2017] <https://arxiv.org/abs/1707.01083>`_.
`conv1d(...) <functional/conv1d.html>`_
......@@ -229,11 +235,13 @@ vm.torch.nn.functional
functional/adaptive_max_pool1d
functional/adaptive_max_pool2d
functional/adaptive_max_pool3d
functional/affine
functional/avg_pool1d
functional/avg_pool2d
functional/avg_pool3d
functional/batch_norm
functional/binary_cross_entropy_with_logits
functional/channel_norm
functional/channel_shuffle
functional/conv1d
functional/conv2d
......
affine
======
.. autofunction:: dragon.vm.torch.nn.functional.affine
.. _torch.nn.affine(...): ../Affine.html
.. raw:: html
<style>
h1:before {
content: "torch.nn.functional.";
color: #103d3e;
}
</style>
channel_norm
============
.. autofunction:: dragon.vm.torch.nn.functional.channel_norm
.. raw:: html
<style>
h1:before {
content: "torch.nn.functional.";
color: #103d3e;
}
</style>
......@@ -56,15 +56,16 @@ class CUDAObjects {
auto& handle = handles[stream_id];
CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST));
CUBLAS_CHECK(cublasSetStream(handle, stream(device_id, stream_id)));
}
auto& handle = handles[stream_id];
#if CUDA_VERSION >= 11000
if (cudnn_allow_tf32_) {
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH));
} else {
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
}
#endif
if (cublas_allow_tf32_) {
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH));
} else {
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
}
return handles[stream_id];
#endif
return handle;
}
/*! \brief Return the specified cudnn handle */
......@@ -150,6 +151,9 @@ class CUDAObjects {
Map<string, ncclComm_t> nccl_comms_[CUDA_MAX_DEVICES];
#endif
/*! \brief The flag that allows cuBLAS TF32 math type or not */
bool cublas_allow_tf32_ = false;
/*! \brief The flag that uses cuDNN or not */
bool cudnn_enabled_ = true;
......
......@@ -20,32 +20,32 @@ namespace dragon {
/*!
* \brief Registry to create class instances.
*/
template <class KeyType, class ObjectType, class... Args>
template <class KeyT, class ClassT, class... Args>
class Registry {
public:
typedef std::function<ObjectType*(Args...)> Creator;
typedef std::function<ClassT*(Args...)> Creator;
/*! \brief Create an instance of specified class */
ObjectType* Create(const KeyType& key, Args... args) {
ClassT* Create(const KeyT& key, Args... args) {
CHECK(registry_.count(key)) << "\nKey(" << key << ") has not registered.";
return registry_[key](args...);
}
/*! \brief Return whether the specified class is registered */
bool Has(const KeyType& key) {
bool Has(const KeyT& key) {
return (registry_.count(key)) != 0;
}
/*! \brief Register a class with the creator */
void Register(const KeyType& key, Creator creator) {
void Register(const KeyT& key, Creator creator) {
CHECK(!registry_.count(key))
<< "\nKey(" << key << ") has already registered.";
registry_[key] = creator;
}
/*! \brief Return the key of registered classes */
vector<KeyType> keys() {
vector<KeyType> ret;
vector<KeyT> keys() {
vector<KeyT> ret;
for (const auto& it : registry_) {
ret.push_back(it.first);
}
......@@ -54,50 +54,49 @@ class Registry {
private:
/*! \brief The registry map */
Map<KeyType, Creator> registry_;
Map<KeyT, Creator> registry_;
};
/*!
* \brief Register creator into the registry.
*/
template <class KeyType, class ObjectType, class... Args>
template <class KeyT, class ClassT, class... Args>
class Registerer {
public:
/*! \brief Constructor with key and creator */
Registerer(
const KeyType& key,
Registry<KeyType, ObjectType, Args...>* registry,
typename Registry<KeyType, ObjectType, Args...>::Creator creator,
const KeyT& key,
Registry<KeyT, ClassT, Args...>* registry,
typename Registry<KeyT, ClassT, Args...>::Creator creator,
const string& help_msg = "") {
registry->Register(key, creator);
}
/*! \brief Return the default creator */
template <class DerivedType>
static ObjectType* DefaultCreator(Args... args) {
return new DerivedType(args...);
template <class DerivedT>
static ClassT* DefaultCreator(Args... args) {
return new DerivedT(args...);
}
};
// Used in *.h files
#define DECLARE_TYPED_REGISTRY(RegistryName, KeyType, ObjectType, ...) \
DRAGON_API Registry<KeyType, ObjectType, ##__VA_ARGS__>* RegistryName(); \
typedef Registerer<KeyType, ObjectType, ##__VA_ARGS__> \
Registerer##RegistryName;
// Used in *.cc files
#define DEFINE_TYPED_REGISTRY(RegistryName, KeyType, ObjectType, ...) \
Registry<KeyType, ObjectType, ##__VA_ARGS__>* RegistryName() { \
static Registry<KeyType, ObjectType, ##__VA_ARGS__>* registry = \
new Registry<KeyType, ObjectType, ##__VA_ARGS__>(); \
return registry; \
// Used in *.h files.
#define DECLARE_TYPED_REGISTRY(RegistryName, KeyT, ClassT, ...) \
DRAGON_API Registry<KeyT, ClassT, ##__VA_ARGS__>* RegistryName(); \
typedef Registerer<KeyT, ClassT, ##__VA_ARGS__> Registerer##RegistryName;
// Used in *.cc files.
#define DEFINE_TYPED_REGISTRY(RegistryName, KeyT, ClassT, ...) \
Registry<KeyT, ClassT, ##__VA_ARGS__>* RegistryName() { \
static Registry<KeyT, ClassT, ##__VA_ARGS__>* registry = \
new Registry<KeyT, ClassT, ##__VA_ARGS__>(); \
return registry; \
}
#define DECLARE_REGISTRY(RegistryName, ObjectType, ...) \
DECLARE_TYPED_REGISTRY(RegistryName, string, ObjectType, ##__VA_ARGS__)
#define DECLARE_REGISTRY(RegistryName, ClassT, ...) \
DECLARE_TYPED_REGISTRY(RegistryName, string, ClassT, ##__VA_ARGS__)
#define DEFINE_REGISTRY(RegistryName, ObjectType, ...) \
DEFINE_TYPED_REGISTRY(RegistryName, string, ObjectType, ##__VA_ARGS__)
#define DEFINE_REGISTRY(RegistryName, ClassT, ...) \
DEFINE_TYPED_REGISTRY(RegistryName, string, ClassT, ##__VA_ARGS__)
#define REGISTER_TYPED_CLASS(RegistryName, key, ...) \
static Registerer##RegistryName ANONYMOUS_VARIABLE(g_##RegistryName)( \
......
#include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
namespace kernels {
namespace {
template <typename T>
void _ChannelAffine(
const int N,
const int S,
const int C,
const T* x,
const T* scale,
const T* bias,
T* y) {
if (S == 1) {
if (bias != nullptr) {
EigenArrayMap<T>(y, C, N) = (ConstEigenArrayMap<T>(x, C, N).colwise() *
ConstEigenVectorArrayMap<T>(scale, C))
.colwise() +
ConstEigenVectorArrayMap<T>(bias, C);
} else {
EigenArrayMap<T>(y, C, N) = ConstEigenArrayMap<T>(x, C, N).colwise() *
ConstEigenVectorArrayMap<T>(scale, C);
}
return;
}
for (int i = 0; i < N; ++i) {
for (int j = 0; j < C; ++j) {
if (bias != nullptr) {
EigenVectorArrayMap<T>(y, S) =
ConstEigenVectorArrayMap<T>(x, S) * scale[j] + bias[j];
} else {
EigenVectorArrayMap<T>(y, S) =
ConstEigenVectorArrayMap<T>(x, S) * scale[j];
}
x += S;
y += S;
}
}
}
} // namespace
/* ------------------- Launcher Separator ------------------- */
template <>
void ChannelAffine<float16, CPUContext>(
const int N,
const int S,
const int C,
const float16* x,
const float16* w,
const float16* b,
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void ChannelAffine<T, CPUContext>( \
const int N, \
const int S, \
const int C, \
const T* x, \
const T* scale, \
const T* bias, \
T* y, \
CPUContext* ctx) { \
_ChannelAffine(N, S, C, x, scale, bias, y); \
}
DEFINE_KERNEL_LAUNCHER(uint8_t);
DEFINE_KERNEL_LAUNCHER(int8_t);
DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels
} // namespace dragon
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
namespace kernels {
namespace {
template <typename T, typename AccT>
__global__ void _ChannelAffine(
const int NxCxS,
const int S,
const int C,
const T* x,
const T* scale,
T* y) {
CUDA_1D_KERNEL_LOOP(i, NxCxS) {
y[i] = convert::To<T>(
convert::To<AccT>(x[i]) *
convert::To<AccT>(__ldg(scale + (i / S) % C)));
}
}
template <typename T, typename AccT>
__global__ void _ChannelAffine(
const int NxCxS,
const int S,
const int C,
const T* x,
const T* scale,
const T* bias,
T* y) {
CUDA_1D_KERNEL_LOOP(i, NxCxS) {
const int j = (i / S) % C;
y[i] = convert::To<T>(
fma(convert::To<AccT>(x[i]),
convert::To<AccT>(__ldg(scale + j)),
convert::To<AccT>(__ldg(bias + j))));
}
}
} // namespace
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void ChannelAffine<T, CUDAContext>( \
const int N, \
const int S, \
const int C, \
const T* x, \
const T* scale, \
const T* bias, \
T* y, \
CUDAContext* ctx) { \
const auto NxCxS = N * C * S; \
if (bias != nullptr) { \
_ChannelAffine<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \
<<<CUDA_BLOCKS(NxCxS), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxCxS, \
S, \
C, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<const math::ScalarType<T>::type*>(scale), \
reinterpret_cast<const math::ScalarType<T>::type*>(bias), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
} else { \
_ChannelAffine<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \
<<<CUDA_BLOCKS(NxCxS), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxCxS, \
S, \
C, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<const math::ScalarType<T>::type*>(scale), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
} \
}
DEFINE_KERNEL_LAUNCHER(uint8_t);
DEFINE_KERNEL_LAUNCHER(int8_t);
DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels
} // namespace dragon
#endif // USE_CUDA
#include "dragon/utils/op_kernels.h"
namespace dragon {
namespace kernels {
namespace {
template <typename T>
void _ChannelShuffle(
const int N,
const int S,
const int G,
const int K,
const T* x,
T* y) {
for (int i = 0; i < N; ++i) {
for (int gi = 0; gi < G; ++gi) {
for (int ki = 0; ki < K; ++ki) {
std::memcpy(
y + ((i * K + ki) * G + gi) * S,
x + ((i * G + gi) * K + ki) * S,
S * sizeof(T));
}
}
}
}
} // namespace
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void ChannelShuffle<T, CPUContext>( \
const int N, \
const int S, \
const int C, \
const int G, \
const T* x, \
T* y, \
CPUContext* ctx) { \
_ChannelShuffle(N, S, G, C / G, x, y); \
}
DEFINE_KERNEL_LAUNCHER(bool);
DEFINE_KERNEL_LAUNCHER(uint8_t);
DEFINE_KERNEL_LAUNCHER(int8_t);
DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels
} // namespace dragon
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
namespace kernels {
namespace {
template <typename T>
__global__ void _ChannelShuffle(
const int NxCxS,
const int S,
const int G,
const int K,
const T* x,
T* y) {
CUDA_1D_KERNEL_LOOP(index, NxCxS) {
const int j = index % S;
const int gi = index / S % G;
const int ki = index / S / G % K;
const int i = index / S / G / K;
y[index] = x[((i * G + gi) * K + ki) * S + j];
}
}
} // namespace
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void ChannelShuffle<T, CUDAContext>( \
const int N, \
const int S, \
const int C, \
const int G, \
const T* x, \
T* y, \
CUDAContext* ctx) { \
const auto NxCxS = N * C * S; \
_ChannelShuffle<<< \
CUDA_BLOCKS(NxCxS), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>(NxCxS, S, G, C / G, x, y); \
}
DEFINE_KERNEL_LAUNCHER(bool);
DEFINE_KERNEL_LAUNCHER(uint8_t);
DEFINE_KERNEL_LAUNCHER(int8_t);
DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels
} // namespace dragon
#endif // USE_CUDA
......@@ -52,14 +52,23 @@ __global__ void _ComputeCounts(
CUDAContext* ctx) { \
math::Copy(dim, x, y, ctx); \
auto policy = thrust::cuda::par.on(ctx->cuda_stream()); \
auto* data = reinterpret_cast<math::ScalarType<T>::type*>(y); \
thrust::device_vector<int> order1(dim), order2(dim); \
thrust::sequence(policy, order1.begin(), order1.end()); \
thrust::sequence(policy, order2.begin(), order2.end()); \
thrust::sort_by_key( \
policy, y, y + dim, order1.begin(), math::LessFunctor<T>()); \
policy, \
data, \
data + dim, \
order1.begin(), \
math::LessFunctor<math::ScalarType<T>::type>()); \
auto last = thrust::unique_by_key( \
policy, y, y + dim, order2.begin(), math::EqualFunctor<T>()); \
int n = num[0] = last.first - y; \
policy, \
data, \
data + dim, \
order2.begin(), \
math::EqualFunctor<math::ScalarType<T>::type>()); \
int n = num[0] = last.first - data; \
if (inverse_index) { \
_RemapInverse<<<CUDA_BLOCKS(n), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
dim, n, order1.data(), order2.data(), inverse_index); \
......
......@@ -8,7 +8,7 @@ namespace kernels {
namespace {
template <typename InputT, typename OutputT>
void _ChannelNormalize(
void _ChannelNorm(
const int axis,
const int num_dims,
const int64_t* x_strides,
......@@ -19,15 +19,14 @@ void _ChannelNormalize(
OutputT* y) {
const auto N = math::utils::Prod(num_dims, y_dims);
vec64_t idx(num_dims, 0);
int64_t xi, wi;
for (int yi = 0; yi < N; ++yi) {
xi = 0;
int64_t xi = 0, wi;
for (int d = num_dims - 1; d >= 0; --d) {
xi += idx[d] * x_strides[d];
if (d == axis) wi = idx[d];
}
y[yi] =
convert::To<OutputT>((convert::To<float>(x[xi]) - mean[wi]) / std[wi]);
const float val = convert::To<float>(x[xi]);
y[yi] = convert::To<OutputT>((val - mean[wi]) / std[wi]);
math::utils::IncreaseIndexInDims(num_dims, y_dims, idx.data());
}
}
......@@ -36,19 +35,19 @@ void _ChannelNormalize(
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(InputT, OutputT) \
template <> \
void ChannelNormalize<InputT, OutputT, CPUContext>( \
const int axis, \
const int num_dims, \
const int64_t* x_strides, \
const int64_t* y_dims, \
const InputT* x, \
const float* mean, \
const float* std, \
OutputT* y, \
CPUContext* ctx) { \
_ChannelNormalize(axis, num_dims, x_strides, y_dims, x, mean, std, y); \
#define DEFINE_KERNEL_LAUNCHER(InputT, OutputT) \
template <> \
void ChannelNorm<InputT, OutputT, CPUContext>( \
const int axis, \
const int num_dims, \
const int64_t* x_strides, \
const int64_t* y_dims, \
const InputT* x, \
const float* mean, \
const float* std, \
OutputT* y, \
CPUContext* ctx) { \
_ChannelNorm(axis, num_dims, x_strides, y_dims, x, mean, std, y); \
}
DEFINE_KERNEL_LAUNCHER(uint8_t, float16);
......
......@@ -11,7 +11,7 @@ namespace kernels {
namespace {
template <typename InputT, typename OutputT, int D>
__global__ void _ChannelNormalize(
__global__ void _ChannelNorm(
const int N,
const int axis,
const int num_dims,
......@@ -38,31 +38,27 @@ __global__ void _ChannelNormalize(
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(InputT, OutputT) \
template <> \
void ChannelNormalize<InputT, OutputT, CUDAContext>( \
const int axis, \
const int num_dims, \
const int64_t* x_strides, \
const int64_t* y_dims, \
const InputT* x, \
const float* mean, \
const float* std, \
OutputT* y, \
CUDAContext* ctx) { \
CUDA_TENSOR_DIMS_CHECK(num_dims); \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_strides, Y_dims; \
const auto N = math::utils::Prod(num_dims, y_dims); \
for (int i = 0; i < num_dims; ++i) { \
X_strides.data[i] = x_strides[i]; \
Y_dims.data[i] = y_dims[i]; \
} \
_ChannelNormalize<<< \
CUDA_BLOCKS(N), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
N, axis, num_dims, X_strides, Y_dims, x, mean, std, y); \
#define DEFINE_KERNEL_LAUNCHER(InputT, OutputT) \
template <> \
void ChannelNorm<InputT, OutputT, CUDAContext>( \
const int axis, \
const int num_dims, \
const int64_t* x_strides, \
const int64_t* y_dims, \
const InputT* x, \
const float* mean, \
const float* std, \
OutputT* y, \
CUDAContext* ctx) { \
CUDA_TENSOR_DIMS_CHECK(num_dims); \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_strides, Y_dims; \
const auto N = math::utils::Prod(num_dims, y_dims); \
for (int i = 0; i < num_dims; ++i) { \
X_strides.data[i] = x_strides[i]; \
Y_dims.data[i] = y_dims[i]; \
} \
_ChannelNorm<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, axis, num_dims, X_strides, Y_dims, x, mean, std, y); \
}
DEFINE_KERNEL_LAUNCHER(uint8_t, float16);
......
......@@ -8,7 +8,7 @@ namespace kernels {
namespace {
template <typename T>
void _L1Normalize(
void _L1Norm(
const int N,
const int S,
const int C,
......@@ -28,7 +28,7 @@ void _L1Normalize(
}
template <typename T>
void _L2Normalize(
void _L2Norm(
const int N,
const int S,
const int C,
......@@ -48,7 +48,7 @@ void _L2Normalize(
}
template <typename T>
void _L1NormalizeGrad(
void _L1NormGrad(
const int N,
const int S,
const int C,
......@@ -73,7 +73,7 @@ void _L1NormalizeGrad(
}
template <typename T>
void _L2NormalizeGrad(
void _L2NormGrad(
const int N,
const int S,
const int C,
......@@ -101,7 +101,7 @@ void _L2NormalizeGrad(
/* ------------------- Launcher Separator ------------------- */
template <>
void L1Normalize<float16, CPUContext>(
void L1Norm<float16, CPUContext>(
const int N,
const int S,
const int C,
......@@ -114,7 +114,7 @@ void L1Normalize<float16, CPUContext>(
}
template <>
void L2Normalize<float16, CPUContext>(
void L2Norm<float16, CPUContext>(
const int N,
const int S,
const int C,
......@@ -127,7 +127,7 @@ void L2Normalize<float16, CPUContext>(
}
template <>
void L1NormalizeGrad<float16, CPUContext>(
void L1NormGrad<float16, CPUContext>(
const int N,
const int S,
const int C,
......@@ -138,10 +138,10 @@ void L1NormalizeGrad<float16, CPUContext>(
float16* dx,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
} // L1NormalizeGrad
} // L1NormGrad
template <>
void L2NormalizeGrad<float16, CPUContext>(
void L2NormGrad<float16, CPUContext>(
const int N,
const int S,
const int C,
......@@ -152,7 +152,7 @@ void L2NormalizeGrad<float16, CPUContext>(
float16* dx,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
} // L2NormalizeGrad
} // L2NormGrad
#define DEFINE_KERNEL_LAUNCHER(name, T) \
template <> \
......@@ -183,14 +183,14 @@ void L2NormalizeGrad<float16, CPUContext>(
_##name<T>(N, S, C, normalizer, eps, dy, x, dx); \
}
DEFINE_KERNEL_LAUNCHER(L1Normalize, float);
DEFINE_KERNEL_LAUNCHER(L1Normalize, double);
DEFINE_KERNEL_LAUNCHER(L2Normalize, float);
DEFINE_KERNEL_LAUNCHER(L2Normalize, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, double);
DEFINE_KERNEL_LAUNCHER(L1Norm, float);
DEFINE_KERNEL_LAUNCHER(L1Norm, double);
DEFINE_KERNEL_LAUNCHER(L2Norm, float);
DEFINE_KERNEL_LAUNCHER(L2Norm, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormGrad, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormGrad, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormGrad, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormGrad, double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -12,7 +12,7 @@ namespace kernels {
namespace {
template <typename T, typename AccT>
__global__ void _L1Normalize(
__global__ void _L1Norm(
const int NxS,
const int S,
const int C,
......@@ -41,7 +41,7 @@ __global__ void _L1Normalize(
}
template <typename T, typename AccT>
__global__ void _L2Normalize(
__global__ void _L2Norm(
const int NxS,
const int S,
const int C,
......@@ -70,7 +70,7 @@ __global__ void _L2Normalize(
}
template <typename T, typename AccT>
__global__ void _L1NormalizeGrad(
__global__ void _L1NormGrad(
const int NxS,
const int S,
const int C,
......@@ -107,7 +107,7 @@ __global__ void _L1NormalizeGrad(
}
template <typename T, typename AccT>
__global__ void _L2NormalizeGrad(
__global__ void _L2NormGrad(
const int NxS,
const int S,
const int C,
......@@ -195,18 +195,18 @@ __global__ void _L2NormalizeGrad(
reinterpret_cast<math::ScalarType<T>::type*>(dx)); \
}
DEFINE_KERNEL_LAUNCHER(L1Normalize, float16, float);
DEFINE_KERNEL_LAUNCHER(L1Normalize, float, float);
DEFINE_KERNEL_LAUNCHER(L1Normalize, double, double);
DEFINE_KERNEL_LAUNCHER(L2Normalize, float16, float);
DEFINE_KERNEL_LAUNCHER(L2Normalize, float, float);
DEFINE_KERNEL_LAUNCHER(L2Normalize, double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, float16, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, float16, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, double, double);
DEFINE_KERNEL_LAUNCHER(L1Norm, float16, float);
DEFINE_KERNEL_LAUNCHER(L1Norm, float, float);
DEFINE_KERNEL_LAUNCHER(L1Norm, double, double);
DEFINE_KERNEL_LAUNCHER(L2Norm, float16, float);
DEFINE_KERNEL_LAUNCHER(L2Norm, float, float);
DEFINE_KERNEL_LAUNCHER(L2Norm, double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormGrad, float16, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormGrad, float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormGrad, double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormGrad, float16, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormGrad, float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormGrad, double, double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
#include "dragon/utils/conversions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
namespace kernels {
template <>
void Adam<float, CPUContext>(
namespace {
template <typename T, typename CopyT>
void _Adam(
const int N,
const float lr,
const float beta1,
const float beta2,
const float eps,
float* g,
float* m,
float* v,
CPUContext* ctx) {
const T lr,
const T beta1,
const T beta2,
const T eps,
const T wd,
const T* x,
const T* g,
T* m,
T* v,
T* y,
CopyT* y_copy) {
for (int i = 0; i < N; ++i) {
float gi = g[i];
float mi = m[i] = m[i] * beta1 + gi * (1 - beta1);
float vi = v[i] = v[i] * beta2 + gi * gi * (1 - beta2);
g[i] = lr * mi / (std::sqrt(vi) + eps);
const T gi = wd > T(0) ? std::fma(wd, x[i], g[i]) : g[i];
const T mi = m[i] = std::fma(beta1, m[i], (T(1) - beta1) * gi);
const T vi = v[i] = std::fma(beta2, v[i], (T(1) - beta2) * gi * gi);
y[i] -= lr * mi / (std::sqrt(vi) + eps);
if (y_copy != nullptr) {
y_copy[i] = convert::To<CopyT>(y[i]);
}
}
}
template <>
void AdamW<float, CPUContext>(
template <typename T, typename CopyT>
void _AdamW(
const int N,
const float lr,
const float beta1,
const float beta2,
const float eps,
const float wd,
const float* x,
float* g,
float* m,
float* v,
CPUContext* ctx) {
const T lr,
const T beta1,
const T beta2,
const T eps,
const T wd,
const T* x,
const T* g,
T* m,
T* v,
T* y,
CopyT* y_copy) {
for (int i = 0; i < N; ++i) {
float gi = g[i];
float mi = m[i] = m[i] * beta1 + gi * (1 - beta1);
float vi = v[i] = v[i] * beta2 + gi * gi * (1 - beta2);
g[i] = lr * mi / (std::sqrt(vi) + eps) + wd * x[i];
const T gi = g[i];
const T mi = m[i] = std::fma(beta1, m[i], (T(1) - beta1) * gi);
const T vi = v[i] = std::fma(beta2, v[i], (T(1) - beta2) * gi * gi);
y[i] -= wd > T(0) ? std::fma(wd, x[i], lr * mi / (std::sqrt(vi) + eps))
: lr * mi / (std::sqrt(vi) + eps);
if (y_copy != nullptr) {
y_copy[i] = convert::To<CopyT>(y[i]);
}
}
}
} // namespace
#define DEFINE_KERNEL_LAUNCHER(name, T, CopyT) \
template <> \
void name<T, CopyT, CPUContext>( \
const int N, \
const float lr, \
const float beta1, \
const float beta2, \
const float eps, \
const float wd, \
const T* x, \
const T* g, \
T* m, \
T* v, \
T* y, \
CopyT* y_copy, \
CPUContext* ctx) { \
_##name( \
N, \
convert::To<T>(lr), \
convert::To<T>(beta1), \
convert::To<T>(beta2), \
convert::To<T>(eps), \
convert::To<T>(wd), \
x, \
g, \
m, \
v, \
y, \
y_copy); \
}
DEFINE_KERNEL_LAUNCHER(Adam, float, float16);
DEFINE_KERNEL_LAUNCHER(Adam, float, float);
DEFINE_KERNEL_LAUNCHER(Adam, double, double);
DEFINE_KERNEL_LAUNCHER(AdamW, float, float16);
DEFINE_KERNEL_LAUNCHER(AdamW, float, float);
DEFINE_KERNEL_LAUNCHER(AdamW, double, double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels
} // namespace dragon
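In equation form, the two CPU kernels above differ only in where the weight decay \lambda (wd) enters; x is the decay reference and y the updated weights (typically the same tensor), and any bias correction is assumed to be folded into lr upstream. Adam couples the decay into the gradient, AdamW decouples it into the step:

Adam:
  \tilde{g}_t = g_t + \lambda x_t, \quad
  m_t = \beta_1 m_{t-1} + (1-\beta_1)\tilde{g}_t, \quad
  v_t = \beta_2 v_{t-1} + (1-\beta_2)\tilde{g}_t^2, \quad
  y_{t+1} = y_t - \mathrm{lr}\,\frac{m_t}{\sqrt{v_t}+\epsilon}

AdamW:
  m_t = \beta_1 m_{t-1} + (1-\beta_1) g_t, \quad
  v_t = \beta_2 v_{t-1} + (1-\beta_2) g_t^2, \quad
  y_{t+1} = y_t - \Big(\mathrm{lr}\,\frac{m_t}{\sqrt{v_t}+\epsilon} + \lambda x_t\Big)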
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......@@ -9,25 +10,32 @@ namespace kernels {
namespace {
template <typename T>
template <typename T, typename CopyT>
__global__ void _Adam(
const int N,
const T lr,
const T beta1,
const T beta2,
const T eps,
T* g,
const T wd,
const T* x,
const T* g,
T* m,
T* v) {
T* v,
T* y,
CopyT* y_copy) {
CUDA_1D_KERNEL_LOOP(i, N) {
T gi = g[i];
T mi = m[i] = m[i] * beta1 + gi * (1 - beta1);
T vi = v[i] = v[i] * beta2 + gi * gi * (1 - beta2);
g[i] = lr * mi / (sqrt(vi) + eps);
const T gi = wd > T(0) ? fma(wd, x[i], g[i]) : g[i];
const T mi = m[i] = fma(beta1, m[i], (T(1) - beta1) * gi);
const T vi = v[i] = fma(beta2, v[i], (T(1) - beta2) * gi * gi);
y[i] -= lr * mi / (sqrt(vi) + eps);
if (y_copy != nullptr) {
y_copy[i] = convert::To<CopyT>(y[i]);
}
}
}
template <typename T>
template <typename T, typename CopyT>
__global__ void _AdamW(
const int N,
const T lr,
......@@ -36,14 +44,20 @@ __global__ void _AdamW(
const T eps,
const T wd,
const T* x,
T* g,
const T* g,
T* m,
T* v) {
T* v,
T* y,
CopyT* y_copy) {
CUDA_1D_KERNEL_LOOP(i, N) {
T gi = g[i];
T mi = m[i] = m[i] * beta1 + gi * (1 - beta1);
T vi = v[i] = v[i] * beta2 + gi * gi * (1 - beta2);
g[i] = lr * mi / (sqrt(vi) + eps) + wd * x[i];
const T gi = g[i];
const T mi = m[i] = fma(beta1, m[i], (T(1) - beta1) * gi);
const T vi = v[i] = fma(beta2, v[i], (T(1) - beta2) * gi * gi);
y[i] -= wd > T(0) ? fma(wd, x[i], lr * mi / (sqrt(vi) + eps))
: lr * mi / (sqrt(vi) + eps);
if (y_copy != nullptr) {
y_copy[i] = convert::To<CopyT>(y[i]);
}
}
}
......@@ -51,37 +65,44 @@ __global__ void _AdamW(
/* ------------------- Launcher Separator ------------------- */
template <>
void Adam<float, CUDAContext>(
const int N,
const float lr,
const float beta1,
const float beta2,
const float eps,
float* g,
float* m,
float* v,
CUDAContext* ctx) {
_Adam<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, lr, beta1, beta2, eps, g, m, v);
}
#define DEFINE_KERNEL_LAUNCHER(name, T, CopyT) \
template <> \
void name<T, CopyT, CUDAContext>( \
const int N, \
const float lr, \
const float beta1, \
const float beta2, \
const float eps, \
const float wd, \
const T* x, \
const T* g, \
T* m, \
T* v, \
T* y, \
CopyT* y_copy, \
CUDAContext* ctx) { \
_##name<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
convert::To<T>(lr), \
convert::To<T>(beta1), \
convert::To<T>(beta2), \
convert::To<T>(eps), \
convert::To<T>(wd), \
x, \
g, \
m, \
v, \
y, \
reinterpret_cast<math::ScalarType<CopyT>::type*>(y_copy)); \
}
template <>
void AdamW<float, CUDAContext>(
const int N,
const float lr,
const float beta1,
const float beta2,
const float eps,
const float wd,
const float* x,
float* g,
float* m,
float* v,
CUDAContext* ctx) {
_AdamW<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, lr, beta1, beta2, eps, wd, x, g, m, v);
}
DEFINE_KERNEL_LAUNCHER(Adam, float, float16);
DEFINE_KERNEL_LAUNCHER(Adam, float, float);
DEFINE_KERNEL_LAUNCHER(Adam, double, double);
DEFINE_KERNEL_LAUNCHER(AdamW, float, float16);
DEFINE_KERNEL_LAUNCHER(AdamW, float, float);
DEFINE_KERNEL_LAUNCHER(AdamW, double, double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels
......
#include "dragon/utils/conversions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
namespace kernels {
template <>
void RMSprop<float, CPUContext>(
namespace {
template <typename T, typename CopyT>
void _RMSprop(
const int N,
const float lr,
const float momentum,
const float decay,
const float eps,
float* g,
float* m,
float* v,
CPUContext* ctx) {
const T lr,
const T momentum,
const T alpha,
const T eps,
const T wd,
const T* x,
const T* g,
T* m,
T* v,
T* y,
CopyT* y_copy) {
for (int i = 0; i < N; ++i) {
float gi = g[i];
float vi = v[i] = decay * v[i] + (1 - decay) * gi * gi;
float mi = m[i] = std::fma(momentum, m[i], gi / (std::sqrt(vi) + eps));
g[i] = lr * mi;
const T gi = wd > T(0) ? std::fma(wd, x[i], g[i]) : g[i];
const T vi = v[i] = std::fma(alpha, v[i], (T(1) - alpha) * gi * gi);
const T mi = m[i] = std::fma(momentum, m[i], gi / (std::sqrt(vi) + eps));
y[i] -= lr * mi;
if (y_copy != nullptr) {
y_copy[i] = convert::To<CopyT>(y[i]);
}
}
}
} // namespace
#define DEFINE_KERNEL_LAUNCHER(name, T, CopyT) \
template <> \
void name<T, CopyT, CPUContext>( \
const int N, \
const float lr, \
const float momentum, \
const float alpha, \
const float eps, \
const float wd, \
const T* x, \
const T* g, \
T* m, \
T* v, \
T* y, \
CopyT* y_copy, \
CPUContext* ctx) { \
_##name( \
N, \
convert::To<T>(lr), \
convert::To<T>(momentum), \
convert::To<T>(alpha), \
convert::To<T>(eps), \
convert::To<T>(wd), \
x, \
g, \
m, \
v, \
y, \
y_copy); \
}
DEFINE_KERNEL_LAUNCHER(RMSprop, float, float16);
DEFINE_KERNEL_LAUNCHER(RMSprop, float, float);
DEFINE_KERNEL_LAUNCHER(RMSprop, double, double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels
} // namespace dragon
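The fused RMSprop kernel above, in equation form (\alpha is the renamed alpha argument, formerly decay; \mu the momentum):

  \tilde{g}_t = g_t + \lambda x_t \;(\text{if } \lambda > 0), \quad
  v_t = \alpha v_{t-1} + (1-\alpha)\tilde{g}_t^2, \quad
  m_t = \mu m_{t-1} + \frac{\tilde{g}_t}{\sqrt{v_t}+\epsilon}, \quad
  y_{t+1} = y_t - \mathrm{lr}\, m_t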
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......@@ -9,21 +10,28 @@ namespace kernels {
namespace {
template <typename T>
template <typename T, typename CopyT>
__global__ void _RMSprop(
const int N,
const T lr,
const T momentum,
const T decay,
const T alpha,
const T eps,
T* g,
const T wd,
const T* x,
const T* g,
T* m,
T* v) {
T* v,
T* y,
CopyT* y_copy) {
CUDA_1D_KERNEL_LOOP(i, N) {
T gi = g[i];
T vi = v[i] = decay * v[i] + (1 - decay) * gi * gi;
T mi = m[i] = fma(momentum, m[i], gi / (sqrt(vi) + eps));
g[i] = lr * mi;
const T gi = wd > T(0) ? fma(wd, x[i], g[i]) : g[i];
const T vi = v[i] = fma(alpha, v[i], (T(1) - alpha) * gi * gi);
const T mi = m[i] = fma(momentum, m[i], gi / (sqrt(vi) + eps));
y[i] -= lr * mi;
if (y_copy != nullptr) {
y_copy[i] = convert::To<CopyT>(y[i]);
}
}
}
......@@ -31,20 +39,41 @@ __global__ void _RMSprop(
/* ------------------- Launcher Separator ------------------- */
template <>
void RMSprop<float, CUDAContext>(
const int N,
const float lr,
const float momentum,
const float decay,
const float eps,
float* g,
float* m,
float* v,
CUDAContext* ctx) {
_RMSprop<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, lr, momentum, decay, eps, g, m, v);
}
#define DEFINE_KERNEL_LAUNCHER(name, T, CopyT) \
template <> \
void name<T, CopyT, CUDAContext>( \
const int N, \
const float lr, \
const float momentum, \
const float alpha, \
const float eps, \
const float wd, \
const T* x, \
const T* g, \
T* m, \
T* v, \
T* y, \
CopyT* y_copy, \
CUDAContext* ctx) { \
_##name<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
convert::To<T>(lr), \
convert::To<T>(momentum), \
convert::To<T>(alpha), \
convert::To<T>(eps), \
convert::To<T>(wd), \
x, \
g, \
m, \
v, \
y, \
reinterpret_cast<math::ScalarType<CopyT>::type*>(y_copy)); \
}
DEFINE_KERNEL_LAUNCHER(RMSprop, float, float16);
DEFINE_KERNEL_LAUNCHER(RMSprop, float, float);
DEFINE_KERNEL_LAUNCHER(RMSprop, double, double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels
......
#include "dragon/utils/conversions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
namespace kernels {
template <>
void MomentumSGD<float, CPUContext>(
namespace {
template <typename T, typename CopyT>
void _MomentumSGD(
const int N,
const float lr,
const float momentum,
float* g,
float* m,
CPUContext* ctx) {
const T lr,
const T momentum,
const T wd,
const T* x,
const T* g,
T* m,
T* y,
CopyT* y_copy) {
for (int i = 0; i < N; ++i) {
float mi = m[i] = std::fma(momentum, m[i], g[i]);
g[i] = lr * mi;
const T gi = wd > T(0) ? std::fma(wd, x[i], g[i]) : g[i];
const T mi = m[i] = std::fma(momentum, m[i], gi);
y[i] -= lr * mi;
if (y_copy != nullptr) {
y_copy[i] = convert::To<CopyT>(y[i]);
}
}
}
template <>
void NesterovSGD<float, CPUContext>(
template <typename T, typename CopyT>
void _NesterovSGD(
const int N,
const float lr,
const float momentum,
float* g,
float* m,
CPUContext* ctx) {
const T lr,
const T momentum,
const T wd,
const T* x,
const T* g,
T* m,
T* y,
CopyT* y_copy) {
for (int i = 0; i < N; ++i) {
float gi = g[i];
float mi = m[i] = std::fma(momentum, m[i], gi);
g[i] = lr * std::fma(momentum, mi, gi);
const T gi = wd > T(0) ? std::fma(wd, x[i], g[i]) : g[i];
const T mi = m[i] = std::fma(momentum, m[i], gi);
y[i] -= lr * std::fma(momentum, mi, gi);
if (y_copy != nullptr) {
y_copy[i] = convert::To<CopyT>(y[i]);
}
}
}
} // namespace
#define DEFINE_KERNEL_LAUNCHER(name, T, CopyT) \
template <> \
void name<T, CopyT, CPUContext>( \
const int N, \
const float lr, \
const float momentum, \
const float wd, \
const T* x, \
const T* g, \
T* m, \
T* y, \
CopyT* y_copy, \
CPUContext* ctx) { \
_##name( \
N, \
convert::To<T>(lr), \
convert::To<T>(momentum), \
convert::To<T>(wd), \
x, \
g, \
m, \
y, \
y_copy); \
}
DEFINE_KERNEL_LAUNCHER(MomentumSGD, float, float16);
DEFINE_KERNEL_LAUNCHER(MomentumSGD, float, float);
DEFINE_KERNEL_LAUNCHER(MomentumSGD, double, double);
DEFINE_KERNEL_LAUNCHER(NesterovSGD, float, float16);
DEFINE_KERNEL_LAUNCHER(NesterovSGD, float, float);
DEFINE_KERNEL_LAUNCHER(NesterovSGD, double, double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels
} // namespace dragon
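For reference, the two fused SGD kernels above implement (with \tilde{g}_t = g_t + \lambda x_t when \lambda > 0 and \mu the momentum):

  MomentumSGD: \; m_t = \mu m_{t-1} + \tilde{g}_t, \quad y_{t+1} = y_t - \mathrm{lr}\, m_t
  NesterovSGD: \; m_t = \mu m_{t-1} + \tilde{g}_t, \quad y_{t+1} = y_t - \mathrm{lr}\,(\mu m_t + \tilde{g}_t)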
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......@@ -9,22 +10,45 @@ namespace kernels {
namespace {
template <typename T>
__global__ void
_MomentumSGD(const int N, const T lr, const T momentum, T* g, T* m) {
template <typename T, typename CopyT>
__global__ void _MomentumSGD(
const int N,
const T lr,
const T momentum,
const T wd,
const T* x,
const T* g,
T* m,
T* y,
CopyT* y_copy) {
CUDA_1D_KERNEL_LOOP(i, N) {
T mi = m[i] = fma(momentum, m[i], g[i]);
g[i] = lr * mi;
const T gi = wd > T(0) ? fma(wd, x[i], g[i]) : g[i];
const T mi = m[i] = fma(momentum, m[i], gi);
y[i] -= lr * mi;
if (y_copy != nullptr) {
y_copy[i] = convert::To<CopyT>(y[i]);
}
}
}
template <typename T>
__global__ void
_NesterovSGD(const int N, const T lr, const T momentum, T* g, T* m) {
template <typename T, typename CopyT>
__global__ void _NesterovSGD(
const int N,
const T lr,
const T momentum,
const T wd,
const T* x,
const T* g,
T* m,
T* y,
CopyT* y_copy) {
CUDA_1D_KERNEL_LOOP(i, N) {
T gi = g[i];
T mi = m[i] = fma(momentum, m[i], gi);
g[i] = lr * fma(momentum, mi, gi);
const T gi = wd > T(0) ? fma(wd, x[i], g[i]) : g[i];
const T mi = m[i] = fma(momentum, m[i], gi);
y[i] -= lr * fma(momentum, mi, gi);
if (y_copy != nullptr) {
y_copy[i] = convert::To<CopyT>(y[i]);
}
}
}
......@@ -32,29 +56,38 @@ _NesterovSGD(const int N, const T lr, const T momentum, T* g, T* m) {
/* ------------------- Launcher Separator ------------------- */
template <>
void MomentumSGD<float, CUDAContext>(
const int N,
const float lr,
const float momentum,
float* g,
float* m,
CUDAContext* ctx) {
_MomentumSGD<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, lr, momentum, g, m);
}
#define DEFINE_KERNEL_LAUNCHER(name, T, CopyT) \
template <> \
void name<T, CopyT, CUDAContext>( \
const int N, \
const float lr, \
const float momentum, \
const float wd, \
const T* x, \
const T* g, \
T* m, \
T* y, \
CopyT* y_copy, \
CUDAContext* ctx) { \
_##name<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
convert::To<T>(lr), \
convert::To<T>(momentum), \
convert::To<T>(wd), \
x, \
g, \
m, \
y, \
reinterpret_cast<math::ScalarType<CopyT>::type*>(y_copy)); \
}
template <>
void NesterovSGD<float, CUDAContext>(
const int N,
const float lr,
const float momentum,
float* g,
float* m,
CUDAContext* ctx) {
_NesterovSGD<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, lr, momentum, g, m);
}
DEFINE_KERNEL_LAUNCHER(MomentumSGD, float, float16);
DEFINE_KERNEL_LAUNCHER(MomentumSGD, float, float);
DEFINE_KERNEL_LAUNCHER(MomentumSGD, double, double);
DEFINE_KERNEL_LAUNCHER(NesterovSGD, float, float16);
DEFINE_KERNEL_LAUNCHER(NesterovSGD, float, float);
DEFINE_KERNEL_LAUNCHER(NesterovSGD, double, double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels
......
......@@ -91,16 +91,24 @@ void RegisterModule_cuda(py::module& m) {
#endif
});
/*! \brief Set the flags of cuBLAS library */
m.def("cublasSetFlags", [](int allow_tf32) {
#ifdef USE_CUDA
auto& ctx = CUDAContext::objects();
if (allow_tf32 >= 0) ctx.cublas_allow_tf32_ = allow_tf32;
#endif
});
/*! \brief Set the flags of cuDNN library */
m.def(
"cudnnSetFlags",
[](bool enabled, bool benchmark, bool deterministic, bool allow_tf32) {
[](int enabled, int benchmark, int deterministic, int allow_tf32) {
#ifdef USE_CUDA
auto& cuda_objects = CUDAContext::objects();
cuda_objects.cudnn_enabled_ = enabled;
cuda_objects.cudnn_deterministic_ = deterministic;
cuda_objects.cudnn_benchmark_ = benchmark;
cuda_objects.cudnn_allow_tf32_ = allow_tf32;
auto& ctx = CUDAContext::objects();
if (enabled >= 0) ctx.cudnn_enabled_ = enabled;
if (benchmark >= 0) ctx.cudnn_benchmark_ = benchmark;
if (deterministic >= 0) ctx.cudnn_deterministic_ = deterministic;
if (allow_tf32 >= 0) ctx.cudnn_allow_tf32_ = allow_tf32;
#endif
});
......
......@@ -132,8 +132,8 @@ PYBIND11_MODULE(libdragon_python, m) {
PRINT(INFO) << GetVerboseDef(def.DebugString(), "graph");
}
}
// Return the graph name may be different from the def
// We will make a unique dummy name on creating the graph
// Return the graph name, which may differ from the def.
// We will make a unique dummy name when creating the graph.
return graph->name();
})
......@@ -175,8 +175,8 @@ PYBIND11_MODULE(libdragon_python, m) {
GraphDef init_graph, pred_graph;
onnx::ONNXBackend onnx_backend;
onnx_backend.Prepare(model_path, &init_graph, &pred_graph);
// Serializing to Python is intractable
// We should apply the initializer immediately
// Serializing to Python is intractable.
// We should apply the initializer immediately.
self->RunGraph(self->CreateGraph(init_graph)->name());
return py::bytes(pred_graph.SerializeAsString());
});
......
......@@ -24,14 +24,14 @@ PythonPluginOp<Context>::PythonPluginOp(const OperatorDef& def, Workspace* ws)
Py_Initialize();
auto* module = PyImport_ImportModule(module_name_.c_str());
CHECK(module) << "\nFailed to import module: " << module;
auto* module_dict = PyModule_GetDict(module);
auto* op_class = PyDict_GetItemString(module_dict, class_name_.c_str());
CHECK(op_class) << "\nFailed to import class: " << class_name_
<< " from module: " << module_name_;
self_ = PyObject_CallObject(op_class, NULL);
// Project inputs and outputs.
// Set inputs and outputs.
inputs_ = PyList_New(InputSize());
outputs_ = PyList_New(OutputSize());
for (int i = 0; i < InputSize(); i++) {
......@@ -41,16 +41,15 @@ PythonPluginOp<Context>::PythonPluginOp(const OperatorDef& def, Workspace* ws)
PyList_SetItem(outputs_, i, PyBytes_FromStdString(Output(i)->name()));
}
// Set: self.kwargs_str
// Attr: "kwargs_str"
PyObject_SetAttr(
self_,
PyBytes_FromRawString("kwargs_str"),
PyBytes_FromStdString(kwargs_str_));
// Method: self.setup(inputs, outputs)
if (PyObject_HasAttr(self_, PyBytes_FromRawString("setup"))) {
CHECK(PyObject_CallMethod(self_, "setup", "OO", inputs_, outputs_))
<< CallMethodHelper("setup");
<< CallMethodHelper("setup"); // Method: setup(inputs, outputs)
}
}
......@@ -67,27 +66,24 @@ string PythonPluginOp<Context>::CallMethodHelper(const string& method_name) {
template <class Context>
void PythonPluginOp<Context>::RunOnDevice() {
// GIL may have been released
// GIL may have been released.
pybind11::gil_scoped_acquire g;
// Atrribute: self.phase
// Attr: phase
PyObject_SetAttr(
self_, PyBytes_FromRawString("phase"), PyBytes_FromStdString(phase()));
// Method: self.reshape(input, outputs)
if (PyObject_HasAttr(self_, PyBytes_FromRawString("reshape"))) {
CHECK(PyObject_CallMethod(self_, "reshape", "OO", inputs_, outputs_))
<< CallMethodHelper("reshape");
<< CallMethodHelper("reshape"); // Method: reshape(input, outputs)
}
// Method: self.run(input, outputs)
// Method: self.forward(input, outputs)
if (PyObject_HasAttr(self_, PyBytes_FromRawString("forward"))) {
CHECK(PyObject_CallMethod(self_, "forward", "OO", inputs_, outputs_))
<< CallMethodHelper("forward");
<< CallMethodHelper("forward"); // Method: run(input, outputs)
} else if (PyObject_HasAttr(self_, PyBytes_FromRawString("run"))) {
CHECK(PyObject_CallMethod(self_, "run", "OO", inputs_, outputs_))
<< CallMethodHelper("run");
<< CallMethodHelper("run"); // Method: forward(input, outputs)
}
}
......
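From the operator code above, a plugin class is expected to optionally expose setup/reshape and a forward (or legacy run) method taking the input/output tensor name lists, and it receives kwargs_str and phase as attributes. A minimal skeleton; the class name and empty bodies are purely illustrative:

class MyPlugin(object):
    """Hypothetical plugin consumed by PythonPluginOp."""

    def setup(self, inputs, outputs):
        # Called once after construction; self.kwargs_str is already set.
        pass

    def reshape(self, inputs, outputs):
        # Optional: infer output shapes before each run; self.phase is set.
        pass

    def forward(self, inputs, outputs):
        # Preferred entry point; the op falls back to run() if absent.
        pass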
......@@ -13,7 +13,6 @@ void ONNXBackend::Prepare(
ModelProto onnx_model;
CHECK(ReadProtoFromBinaryFile(onnx_model_path.c_str(), &onnx_model))
<< "\nFailed to parse the onnx model.";
int opset_version = -1;
for (const auto& imp : onnx_model.opset_import()) {
if ((!imp.has_domain()) || imp.domain().empty()) {
......@@ -31,7 +30,6 @@ void ONNXBackend::Prepare(
std::cout << "Unrecognized operator set " << opset_version << std::endl;
}
}
if (opset_version < 0) {
if (onnx_model.ir_version() >= 0x00000003) {
LOG(FATAL) << "Model with IR version >= 3 "
......@@ -40,7 +38,6 @@ void ONNXBackend::Prepare(
opset_version = 1;
}
}
ONNXToDragon(onnx_model, opset_version, true, init_graph, pred_graph);
}
......@@ -52,22 +49,23 @@ void ONNXBackend::ONNXToDragon(
GraphDef* pred_graph) {
ModelProto init_model = ModelProto();
ModelProto pred_model = onnx_model;
pred_graph->set_name(onnx_model.graph().name());
init_graph->set_name(onnx_model.graph().name() + "/init");
ValueInfoMap graph_value_infos{};
InitializerMap graph_initializer{};
for (const auto& vi : onnx_model.graph().input())
graph_value_infos[vi.name()].CopyFrom(vi);
for (const auto& vi : onnx_model.graph().output())
graph_value_infos[vi.name()].CopyFrom(vi);
for (const auto& vi : onnx_model.graph().value_info())
graph_value_infos[vi.name()].CopyFrom(vi);
// Collect graph inputs.
for (const auto& v : onnx_model.graph().input()) {
graph_value_infos[v.name()].CopyFrom(v);
}
// Collect graph outputs.
for (const auto& v : onnx_model.graph().output()) {
graph_value_infos[v.name()].CopyFrom(v);
}
// Collect graph values.
for (const auto& v : onnx_model.graph().value_info()) {
graph_value_infos[v.name()].CopyFrom(v);
}
// Collect graph initializers.
for (const auto& tensor : onnx_model.graph().initializer()) {
if (include_initializers) {
auto* op_def = init_graph->add_op();
......@@ -76,16 +74,18 @@ void ONNXBackend::ONNXToDragon(
}
graph_initializer[tensor.name()] = &tensor;
}
// Convert to graph defs.
auto converter = [&](const ModelProto& model, GraphDef* graph) mutable {
for (const auto& node : model.graph().node()) {
ValueInfoMap value_infos{};
InitializerMap initializer{};
for (const auto& name : node.input()) {
if (graph_value_infos.count(name))
if (graph_value_infos.count(name)) {
value_infos[name].CopyFrom(graph_value_infos[name]);
if (graph_initializer.count(name))
}
if (graph_initializer.count(name)) {
initializer[name] = graph_initializer[name];
}
}
auto onnx_node = ONNXNode(node);
auto returns = ONNXNodeToOps(
......@@ -98,23 +98,18 @@ void ONNXBackend::ONNXToDragon(
}
}
};
converter(pred_model, pred_graph);
// Set(Initializer) + Set(Placehoders) = Set(Inputs)
// Add external inputs.
Set<string> initializer;
for (const auto& e : onnx_model.graph().initializer()) {
initializer.insert(e.name());
for (const auto& v : onnx_model.graph().initializer()) {
initializer.insert(v.name());
}
// Add External Inputs
for (const auto& e : onnx_model.graph().input()) {
if (initializer.count(e.name()) == 0) {
pred_graph->add_input(e.name());
}
}
// Add External Outputs
// Add external outputs.
for (const auto& e : onnx_model.graph().output()) {
pred_graph->add_output(e.name());
}
......
#include "dragon/operators/array/channel_shuffle_op.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
template <class Context>
template <typename T>
void ChannelShuffleOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0);
GET_OP_AXIS_ARG(axis, X.ndim(), -1);
CHECK_EQ(X.dim(axis) % group_, 0)
<< "\nThe " << X.dim(axis) << " channels "
<< "can not be split into " << group_ << " groups.";
kernels::ChannelShuffle(
X.count(0, axis),
X.count(axis + 1),
X.dim(axis),
group_,
X.template data<T, Context>(),
Y->ReshapeLike(X)->template mutable_data<T, Context>(),
ctx());
}
template <class Context>
void ChannelShuffleOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Generic>::Call(this, Input(0));
}
template <class Context>
template <typename T>
void ChannelShuffleGradientOp<Context>::DoRunWithType() {
auto &dY = Input(0), *dX = Output(0);
GET_OP_AXIS_ARG(axis, dY.ndim(), -1);
kernels::ChannelShuffle(
dY.count(0, axis),
dY.count(axis + 1),
dY.dim(axis),
dY.dim(axis) / group_,
dY.template data<T, Context>(),
dX->ReshapeLike(dY)->template mutable_data<T, Context>(),
ctx());
}
template <class Context>
void ChannelShuffleGradientOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
DEPLOY_CPU_OPERATOR(ChannelShuffle);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(ChannelShuffle);
#endif
DEPLOY_CPU_OPERATOR(ChannelShuffleGradient);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(ChannelShuffleGradient);
#endif
OPERATOR_SCHEMA(ChannelShuffle)
/* X */
.NumInputs(1)
/* Y */
.NumOutputs(1);
OPERATOR_SCHEMA(ChannelShuffleGradient)
/* dY */
.NumInputs(1)
/* dX */
.NumOutputs(1);
REGISTER_GRADIENT(ChannelShuffle, SimpleGradientMaker);
} // namespace dragon
#include "dragon/operators/array/shuffle_op.h"
#include "dragon/utils/math_functions.h"
namespace dragon {
template <class Context>
template <typename T>
void ChannelShuffleOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0);
GET_OP_AXIS_ARG(axis, X.ndim(), -1);
CHECK_EQ(X.dim(axis) % group_, 0)
<< "\nThe " << X.dim(axis) << " channels "
<< "can not be split into " << group_ << " groups.";
auto G = group_, K = X.dim(axis) / group_;
if (def().type() == "ChannelShuffleGradient") std::swap(G, K);
math::Transpose(
4,
vec64_t({X.count(0, axis), G, K, X.count(axis + 1)}).data(),
vec64_t({0, 2, 1, 3}).data(),
X.template data<T, Context>(),
Y->ReshapeLike(X)->template mutable_data<T, Context>(),
ctx());
}
DEPLOY_CPU_OPERATOR(ChannelShuffle);
REGISTER_CPU_OPERATOR(ChannelShuffleGradient, ChannelShuffleOp<CPUContext>);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(ChannelShuffle);
REGISTER_CUDA_OPERATOR(ChannelShuffleGradient, ChannelShuffleOp<CUDAContext>);
#endif
OPERATOR_SCHEMA(ChannelShuffle).NumInputs(1).NumOutputs(1);
OPERATOR_SCHEMA(ChannelShuffleGradient).NumInputs(1).NumOutputs(1);
REGISTER_GRADIENT(ChannelShuffle, SimpleGradientMaker);
} // namespace dragon
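
The rewritten operator expresses the shuffle (and, by swapping G and K, its gradient) as one 4-D transpose instead of a dedicated kernel. A minimal NumPy sketch of the same trick, assuming a dense [outer, channels, inner] layout:

```python
import numpy as np

def channel_shuffle(x, axis, group):
    """Shuffle channels via a [outer, G, K, inner] transpose (sketch)."""
    outer = int(np.prod(x.shape[:axis], dtype=np.int64))
    channels = x.shape[axis]
    inner = int(np.prod(x.shape[axis + 1:], dtype=np.int64))
    assert channels % group == 0, 'channels must be divisible by group'
    g, k = group, channels // group
    return x.reshape(outer, g, k, inner).transpose(0, 2, 1, 3).reshape(x.shape)

x = np.arange(1, 5)
print(channel_shuffle(x, axis=0, group=2))  # [1 3 2 4]
# The gradient swaps G and K; here G == K, so the same call inverts it.
print(channel_shuffle(channel_shuffle(x, 0, 2), 0, 2))  # [1 2 3 4]
```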
......@@ -10,8 +10,8 @@
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_ARRAY_CHANNEL_SHUFFLE_OP_H_
#define DRAGON_OPERATORS_ARRAY_CHANNEL_SHUFFLE_OP_H_
#ifndef DRAGON_OPERATORS_ARRAY_SHUFFLE_OP_H_
#define DRAGON_OPERATORS_ARRAY_SHUFFLE_OP_H_
#include "dragon/core/operator.h"
......@@ -25,7 +25,9 @@ class ChannelShuffleOp final : public Operator<Context> {
group_(OP_SINGLE_ARG(int64_t, "group", 1)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
void RunOnDevice() override {
DispatchHelper<dtypes::Generic>::Call(this, Input(0));
}
template <typename T>
void DoRunWithType();
......@@ -34,22 +36,6 @@ class ChannelShuffleOp final : public Operator<Context> {
int64_t group_;
};
template <class Context>
class ChannelShuffleGradientOp final : public Operator<Context> {
public:
ChannelShuffleGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
group_(OP_SINGLE_ARG(int64_t, "group", 1)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T>
void DoRunWithType();
protected:
int64_t group_;
};
} // namespace dragon
#endif // DRAGON_OPERATORS_ARRAY_CHANNEL_SHUFFLE_OP_H_
#endif // DRAGON_OPERATORS_ARRAY_SHUFFLE_OP_H_
#include "dragon/operators/array/channel_affine_op.h"
#include "dragon/operators/math/affine_op.h"
#include "dragon/core/workspace.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
template <class Context>
template <typename T>
void ChannelAffineOp<Context>::DoRunWithType() {
void AffineOp<Context>::DoRunWithType() {
auto &X = Input(0), &W = Input(1), *Y = Output(0, {0});
GET_OP_AXIS_ARG(axis, X.ndim(), -1);
GET_OP_AXIS_ARG(end_axis, X.ndim(), axis);
vec64_t affine_dims(
{X.dims().begin() + axis, X.dims().begin() + end_axis + 1});
// Compute affine dimensions.
vec64_t affine_dims;
for (auto axis : axes_) {
axis = axis < 0 ? axis + X.ndim() : axis;
CHECK(axis >= 0 && axis < X.ndim())
<< "\nExcepted the axis in [-" << X.ndim() << ", " << X.ndim()
<< "), got " << axis << ".";
affine_dims.push_back(X.dim(axis));
}
CHECK(W.dims() == affine_dims)
<< "\nExcepted the weight shape is " << Tensor::DimString(affine_dims)
<< ", got " << W.DimString() << ".";
......@@ -23,10 +27,11 @@ void ChannelAffineOp<Context>::DoRunWithType() {
<< ", got " << Input(2).DimString() << ".";
}
kernels::ChannelAffine(
X.count(0, axis),
X.count(end_axis + 1),
X.count(axis, end_axis + 1),
math::Affine(
X.ndim(),
X.dims().data(),
axes_.size(),
axes_.data(),
X.template data<T, Context>(),
W.template data<T, Context>(),
InputSize() <= 2 ? nullptr : Input(2).template data<T, Context>(),
......@@ -35,28 +40,30 @@ void ChannelAffineOp<Context>::DoRunWithType() {
}
template <class Context>
void ChannelAffineOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Numerical>::Call(this, Input(0));
}
template <class Context>
template <typename T>
void ChannelAffineGradientOp<Context>::DoRunWithType() {
void AffineGradientOp<Context>::DoRunWithType() {
auto &X = Input(0), &W = Input(1), &dY = Input(2);
auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
GET_OP_AXIS_ARG(axis, X.ndim(), -1);
GET_OP_AXIS_ARG(end_axis, X.ndim(), axis);
vec64_t affine_dims = {X.count(0, axis),
X.count(axis, end_axis + 1),
X.count(end_axis + 1)},
affine_axes = {0, 2};
// Compute reduce axes.
vec64_t reduce_axes;
for (int i = 0; i < X.ndim(); ++i) {
bool keep = true;
for (auto axis : axes_) {
axis = axis < 0 ? axis + X.ndim() : axis;
if (axis == i) keep = false;
}
if (keep) reduce_axes.push_back(i);
}
// Scratch to save the intermediates.
T* data = nullptr;
if (dW->has_name() && X.count() != W.count()) {
data = ctx()->workspace()->template data<T, Context>(X.count());
}
// dW = dY * X
if (dW->has_name()) {
Output(1)->ReshapeLike(Input(1));
auto* x = Input(0).template data<T, Context>();
auto* dw = Output(1)->template mutable_data<T, Context>();
if (X.count() == W.count()) {
math::Mul(
X.count(),
......@@ -65,20 +72,19 @@ void ChannelAffineGradientOp<Context>::DoRunWithType() {
dW->ReshapeLike(W)->template mutable_data<T, Context>(),
ctx());
} else {
T* scratch = ctx()->workspace()->template data<T, Context>(X.count());
math::Mul(
X.count(),
dY.template data<T, Context>(),
X.template data<T, Context>(),
scratch,
data,
ctx());
math::ReduceSum(
3,
affine_dims.data(),
2,
affine_axes.data(),
X.ndim(),
X.dims().data(),
reduce_axes.size(),
reduce_axes.data(),
1.f,
scratch,
data,
dW->ReshapeLike(W)->template mutable_data<T, Context>(),
ctx());
}
......@@ -90,10 +96,10 @@ void ChannelAffineGradientOp<Context>::DoRunWithType() {
dB->ReshapeLike(W)->CopyFrom(dY, ctx());
} else {
math::ReduceSum(
3,
affine_dims.data(),
2,
affine_axes.data(),
X.ndim(),
X.dims().data(),
reduce_axes.size(),
reduce_axes.data(),
1.f,
dY.template data<T, Context>(),
dB->ReshapeLike(W)->template mutable_data<T, Context>(),
......@@ -103,11 +109,11 @@ void ChannelAffineGradientOp<Context>::DoRunWithType() {
// dX = dY * W
if (dX->has_name()) {
Output(0)->ReshapeLike(Input(-1));
kernels::ChannelAffine(
X.count(0, axis),
X.count(end_axis + 1),
X.count(axis, end_axis + 1),
math::Affine(
X.ndim(),
X.dims().data(),
axes_.size(),
axes_.data(),
dY.template data<T, Context>(),
W.template data<T, Context>(),
(const T*)nullptr,
......@@ -116,22 +122,17 @@ void ChannelAffineGradientOp<Context>::DoRunWithType() {
}
}
template <class Context>
void ChannelAffineGradientOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
DEPLOY_CPU_OPERATOR(ChannelAffine);
DEPLOY_CPU_OPERATOR(Affine);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(ChannelAffine);
DEPLOY_CUDA_OPERATOR(Affine);
#endif
DEPLOY_CPU_OPERATOR(ChannelAffineGradient);
DEPLOY_CPU_OPERATOR(AffineGradient);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(ChannelAffineGradient);
DEPLOY_CUDA_OPERATOR(AffineGradient);
#endif
OPERATOR_SCHEMA(ChannelAffine)
OPERATOR_SCHEMA(Affine)
/* X, W, B */
.NumInputs(2, 3)
/* Y */
......@@ -139,7 +140,7 @@ OPERATOR_SCHEMA(ChannelAffine)
/* X => Y */
.AllowInplace({{0, 0}});
OPERATOR_SCHEMA(ChannelAffineGradient)
OPERATOR_SCHEMA(AffineGradient)
/* X, W, dY */
.NumInputs(3)
/* dX, dW, dB */
......@@ -163,6 +164,6 @@ class GradientMaker final : public GradientMakerBase {
} // namespace
REGISTER_GRADIENT(ChannelAffine, GradientMaker);
REGISTER_GRADIENT(Affine, GradientMaker);
} // namespace dragon
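
For the gradient, dW (and dB) is obtained by reducing dY * X (and dY) over every axis not listed in ``axes``, which is what the ``reduce_axes`` loop above collects. A NumPy sketch of the forward pass and the dW reduction, assuming W is shaped like the selected axes of X:

```python
import numpy as np

def affine_forward(x, w, b, axes):
    """y = x * w + b with w, b broadcast along `axes` (sketch)."""
    axes = [a % x.ndim for a in axes]
    shape = [x.shape[a] if a in axes else 1 for a in range(x.ndim)]
    return x * w.reshape(shape) + b.reshape(shape)

def affine_grad_w(x, dy, axes):
    """dW = reduce_sum(dy * x) over the non-affine axes (sketch)."""
    axes = [a % x.ndim for a in axes]
    reduce_axes = tuple(a for a in range(x.ndim) if a not in axes)
    return (dy * x).sum(axis=reduce_axes)

x = np.random.rand(2, 3, 4).astype('float32')
w, b = np.full(3, 2., 'float32'), np.zeros(3, 'float32')
print(affine_forward(x, w, b, axes=[1]).shape)            # (2, 3, 4)
print(affine_grad_w(x, np.ones_like(x), axes=[1]).shape)  # (3,)
```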
......@@ -10,37 +10,49 @@
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_ARRAY_CHANNEL_AFFINE_OP_H_
#define DRAGON_OPERATORS_ARRAY_CHANNEL_AFFINE_OP_H_
#ifndef DRAGON_OPERATORS_MATH_AFFINE_OP_H_
#define DRAGON_OPERATORS_MATH_AFFINE_OP_H_
#include "dragon/core/operator.h"
namespace dragon {
template <class Context>
class ChannelAffineOp final : public Operator<Context> {
class AffineOp final : public Operator<Context> {
public:
SIMPLE_CTOR_DTOR(ChannelAffineOp);
AffineOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws), axes_(OP_REPEATED_ARG(int64_t, "axes")) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T>
void DoRunWithType();
protected:
vec64_t axes_;
};
template <class Context>
class ChannelAffineGradientOp final : public Operator<Context> {
class AffineGradientOp final : public Operator<Context> {
public:
SIMPLE_CTOR_DTOR(ChannelAffineGradientOp);
AffineGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws), axes_(OP_REPEATED_ARG(int64_t, "axes")) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T>
void DoRunWithType();
protected:
vec64_t axes_;
};
} // namespace dragon
#endif // DRAGON_OPERATORS_ARRAY_CHANNEL_AFFINE_OP_H_
#endif // DRAGON_OPERATORS_MATH_AFFINE_OP_H_
......@@ -26,6 +26,7 @@ DISPATCH_WITH_TENSOR_TYPES(IsInf, dtypes::Floating, Input(0));
DISPATCH_WITH_TENSOR_TYPES(IsNaN, dtypes::Floating, Input(0));
DISPATCH_WITH_TENSOR_TYPES(IsFinite, dtypes::Floating, Input(0));
DISPATCH_WITH_TENSOR_TYPES(Pow, dtypes::Floating, Input(0));
DISPATCH_WITH_TENSOR_TYPES(Atan2, dtypes::Floating, Input(0));
DISPATCH_WITH_TENSOR_TYPES(Minimum, dtypes::Numerical, Input(0));
DISPATCH_WITH_TENSOR_TYPES(Maximum, dtypes::Numerical, Input(0));
DISPATCH_WITH_TENSOR_TYPES(BitwiseNot, dtypes::Bitwise, Input(0));
......@@ -120,6 +121,7 @@ DEFINE_INPLACE_UNARY_OP_IMPL(BitwiseNot, T);
}
DEFINE_SIMPLE_BINARY_OP_IMPL(Pow, T);
DEFINE_SIMPLE_BINARY_OP_IMPL(Atan2, T);
DEFINE_SIMPLE_BINARY_OP_IMPL(Minimum, T);
DEFINE_SIMPLE_BINARY_OP_IMPL(Maximum, T);
DEFINE_SIMPLE_BINARY_OP_IMPL(BitwiseAnd, T);
......@@ -152,6 +154,7 @@ DEPLOY_CPU_OPERATOR(IsInf);
DEPLOY_CPU_OPERATOR(IsNaN);
DEPLOY_CPU_OPERATOR(IsFinite);
DEPLOY_CPU_OPERATOR(Pow);
DEPLOY_CPU_OPERATOR(Atan2);
DEPLOY_CPU_OPERATOR(Minimum);
DEPLOY_CPU_OPERATOR(Maximum);
DEPLOY_CPU_OPERATOR(BitwiseNot);
......@@ -186,6 +189,7 @@ DEPLOY_CUDA_OPERATOR(IsInf);
DEPLOY_CUDA_OPERATOR(IsNaN);
DEPLOY_CUDA_OPERATOR(IsFinite);
DEPLOY_CUDA_OPERATOR(Pow);
DEPLOY_CUDA_OPERATOR(Atan2);
DEPLOY_CUDA_OPERATOR(Minimum);
DEPLOY_CUDA_OPERATOR(Maximum);
DEPLOY_CUDA_OPERATOR(BitwiseNot);
......@@ -222,6 +226,7 @@ OPERATOR_SCHEMA(IsNaN).NumInputs(1).NumOutputs(1);
OPERATOR_SCHEMA(IsFinite).NumInputs(1).NumOutputs(1);
OPERATOR_SCHEMA(Not).NumInputs(1).NumOutputs(1);
OPERATOR_SCHEMA(Pow).NumInputs(2).NumOutputs(1);
OPERATOR_SCHEMA(Atan2).NumInputs(2).NumOutputs(1);
OPERATOR_SCHEMA(Minimum).NumInputs(2).NumOutputs(1);
OPERATOR_SCHEMA(Maximum).NumInputs(2).NumOutputs(1);
OPERATOR_SCHEMA(BitwiseAnd)
......@@ -250,6 +255,7 @@ NO_GRADIENT(Round);
NO_GRADIENT(IsInf);
NO_GRADIENT(IsNaN);
NO_GRADIENT(IsFinite);
NO_GRADIENT(Atan2);
NO_GRADIENT(BitwiseNot);
NO_GRADIENT(BitwiseAnd);
NO_GRADIENT(BitwiseOr);
......
......@@ -70,7 +70,7 @@ inline vec32_t CheckOutputAliases(
return available_aliases;
}
// Unary ElementwiseOp
// Unary ElementwiseOp.
DECLARE_ELEMENTWISE_OP(Abs);
DECLARE_ELEMENTWISE_OP(Ceil);
DECLARE_ELEMENTWISE_OP(Cos);
......@@ -101,12 +101,13 @@ DECLARE_ELEMENTWISE_OP(SignGradient);
DECLARE_ELEMENTWISE_OP(SinGradient);
DECLARE_ELEMENTWISE_OP(SqrtGradient);
DECLARE_ELEMENTWISE_OP(SquareGradient);
// Binary ElementwiseOp
// Binary ElementwiseOp.
DECLARE_ELEMENTWISE_OP(Add);
DECLARE_ELEMENTWISE_OP(Sub);
DECLARE_ELEMENTWISE_OP(Mul);
DECLARE_ELEMENTWISE_OP(Div);
DECLARE_ELEMENTWISE_OP(Pow);
DECLARE_ELEMENTWISE_OP(Atan2);
DECLARE_ELEMENTWISE_OP(Minimum);
DECLARE_ELEMENTWISE_OP(Maximum);
DECLARE_ELEMENTWISE_OP(BitwiseAnd);
......@@ -128,7 +129,7 @@ DECLARE_ELEMENTWISE_OP(DivGradient);
DECLARE_ELEMENTWISE_OP(PowGradient);
DECLARE_ELEMENTWISE_OP(MinimumGradient);
DECLARE_ELEMENTWISE_OP(MaximumGradient);
// Trinary ElementwiseOp
// Trinary ElementwiseOp.
DECLARE_ELEMENTWISE_OP(Where);
DECLARE_ELEMENTWISE_OP(WhereGradient);
#undef DECLARE_ELEMENTWISE_OP
......
......@@ -199,11 +199,6 @@ void MatMulOp<Context>::DoRunWithType() {
}
template <class Context>
void MatMulOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <class Context>
template <typename T>
void MatMulGradientOp<Context>::DoRunWithType() {
auto &A = Input(0), &B = Input(1), &dY = Input(2);
......@@ -590,11 +585,6 @@ void MatMulGradientOp<Context>::DoRunWithType() {
}
}
template <class Context>
void MatMulGradientOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
DEPLOY_CPU_OPERATOR(MatMul);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(MatMul);
......
......@@ -23,7 +23,9 @@ class MatMulOp final : public Operator<Context> {
SIMPLE_CTOR_DTOR(MatMulOp);
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T>
void DoRunWithType();
......@@ -35,7 +37,9 @@ class MatMulGradientOp final : public Operator<Context> {
SIMPLE_CTOR_DTOR(MatMulGradientOp);
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T>
void DoRunWithType();
......
#include "dragon/operators/array/channel_normalize_op.h"
#include "dragon/operators/normalization/channel_norm_op.h"
#include "dragon/core/workspace.h"
#include "dragon/utils/op_kernels.h"
......@@ -6,7 +6,7 @@ namespace dragon {
template <class Context>
template <typename InputT, typename OutputT>
void ChannelNormalizeOp<Context>::DoRunWithTypeAndCast() {
void ChannelNormOp<Context>::DoRunWithTypeAndCast() {
auto &X = Input(0), *Y = Output(0);
GET_OP_AXIS_ARG(axis, X.ndim(), -1);
......@@ -30,7 +30,7 @@ void ChannelNormalizeOp<Context>::DoRunWithTypeAndCast() {
<< "\nProviding " << X_mean_.count() << " values to normalize Dimension("
<< Y_dims[axis] << ").";
kernels::ChannelNormalize(
kernels::ChannelNorm(
axis,
num_dims,
X_strides.data(),
......@@ -44,7 +44,7 @@ void ChannelNormalizeOp<Context>::DoRunWithTypeAndCast() {
template <class Context>
template <typename T>
void ChannelNormalizeOp<Context>::DoRunWithType() {
void ChannelNormOp<Context>::DoRunWithType() {
if (data_type() == "float16") {
DoRunWithTypeAndCast<T, float16>();
} else if (data_type() == "float32") {
......@@ -58,21 +58,21 @@ void ChannelNormalizeOp<Context>::DoRunWithType() {
}
template <class Context>
void ChannelNormalizeOp<Context>::RunOnDevice() {
void ChannelNormOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Numerical>::Call(this, Input(0));
}
DEPLOY_CPU_OPERATOR(ChannelNormalize);
DEPLOY_CPU_OPERATOR(ChannelNorm);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(ChannelNormalize);
DEPLOY_CUDA_OPERATOR(ChannelNorm);
#endif
OPERATOR_SCHEMA(ChannelNormalize)
OPERATOR_SCHEMA(ChannelNorm)
/* X */
.NumInputs(1)
/* Y */
.NumOutputs(1);
NO_GRADIENT(ChannelNormalize);
NO_GRADIENT(ChannelNorm);
} // namespace dragon
......@@ -10,17 +10,17 @@
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_ARRAY_CHANNEL_NORMALIZE_OP_H_
#define DRAGON_OPERATORS_ARRAY_CHANNEL_NORMALIZE_OP_H_
#ifndef DRAGON_OPERATORS_NORMALIZATION_CHANNEL_NORM_OP_H_
#define DRAGON_OPERATORS_NORMALIZATION_CHANNEL_NORM_OP_H_
#include "dragon/core/operator.h"
namespace dragon {
template <class Context>
class ChannelNormalizeOp final : public Operator<Context> {
class ChannelNormOp final : public Operator<Context> {
public:
ChannelNormalizeOp(const OperatorDef& def, Workspace* ws)
ChannelNormOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws) {
INITIALIZE_OP_REPEATED_ARG(int64_t, perm);
auto mean = OP_REPEATED_ARG(float, "mean");
......@@ -50,8 +50,8 @@ class ChannelNormalizeOp final : public Operator<Context> {
DECLARE_OP_REPEATED_ARG(int64_t, perm);
};
DEFINE_OP_REPEATED_ARG(int64_t, ChannelNormalizeOp, perm);
DEFINE_OP_REPEATED_ARG(int64_t, ChannelNormOp, perm);
} // namespace dragon
#endif // DRAGON_OPERATORS_ARRAY_CHANNEL_NORMALIZE_OP_H_
#endif // DRAGON_OPERATORS_NORMALIZATION_CHANNEL_NORM_OP_H_
#include "dragon/operators/normalization/lp_normalize_op.h"
#include "dragon/core/workspace.h"
#include "dragon/utils/math_functions.h"
#include "dragon/operators/normalization/lp_norm_op.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
template <class Context>
template <typename T>
void LpNormalizeOp<Context>::DoRunWithType() {
void LpNormOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0);
GET_OP_AXIS_ARG(axis, X.ndim(), -1);
GET_OP_AXIS_ARG(end_axis, X.ndim(), axis);
auto reduce_dim = X.count(axis, end_axis + 1);
// Normalize input with a scaled Lp-norm
if (p_ == 1) {
kernels::L1Normalize(
kernels::L1Norm(
X.count(0, axis),
X.count(end_axis + 1),
reduce_dim,
......@@ -25,7 +22,7 @@ void LpNormalizeOp<Context>::DoRunWithType() {
Y->ReshapeLike(X)->template mutable_data<T, Context>(),
ctx());
} else if (p_ == 2) {
kernels::L2Normalize(
kernels::L2Norm(
X.count(0, axis),
X.count(end_axis + 1),
reduce_dim,
......@@ -40,20 +37,15 @@ void LpNormalizeOp<Context>::DoRunWithType() {
}
template <class Context>
void LpNormalizeOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <class Context>
template <typename T>
void LpNormalizeGradientOp<Context>::DoRunWithType() {
void LpNormGradientOp<Context>::DoRunWithType() {
auto &X = Input(0), &dY = Input(1), *dX = Output(0);
GET_OP_AXIS_ARG(axis, X.ndim(), -1);
GET_OP_AXIS_ARG(end_axis, X.ndim(), axis);
auto reduce_dim = X.count(axis, end_axis + 1);
if (p_ == 1) {
kernels::L1NormalizeGrad(
kernels::L1NormGrad(
X.count(0, axis),
X.count(end_axis + 1),
reduce_dim,
......@@ -64,7 +56,7 @@ void LpNormalizeGradientOp<Context>::DoRunWithType() {
dX->ReshapeLike(X)->template mutable_data<T, Context>(),
ctx());
} else if (p_ == 2) {
kernels::L2NormalizeGrad(
kernels::L2NormGrad(
X.count(0, axis),
X.count(end_axis + 1),
reduce_dim,
......@@ -79,33 +71,28 @@ void LpNormalizeGradientOp<Context>::DoRunWithType() {
}
}
template <class Context>
void LpNormalizeGradientOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
DEPLOY_CPU_OPERATOR(LpNormalize);
DEPLOY_CPU_OPERATOR(LpNorm);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(LpNormalize);
DEPLOY_CUDA_OPERATOR(LpNorm);
#endif
DEPLOY_CPU_OPERATOR(LpNormalizeGradient);
DEPLOY_CPU_OPERATOR(LpNormGradient);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(LpNormalizeGradient);
DEPLOY_CUDA_OPERATOR(LpNormGradient);
#endif
OPERATOR_SCHEMA(LpNormalize)
OPERATOR_SCHEMA(LpNorm)
/* X */
.NumInputs(1)
/* Y */
.NumOutputs(1);
OPERATOR_SCHEMA(LpNormalizeGradient)
OPERATOR_SCHEMA(LpNormGradient)
/* X, dY */
.NumInputs(2)
/* dX */
.NumOutputs(1);
REGISTER_GRADIENT(LpNormalize, GenericGradientMaker);
REGISTER_GRADIENT(LpNorm, GenericGradientMaker);
} // namespace dragon
......@@ -10,24 +10,26 @@
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_NORMALIZATION_LP_NORMALIZE_OP_H_
#define DRAGON_OPERATORS_NORMALIZATION_LP_NORMALIZE_OP_H_
#ifndef DRAGON_OPERATORS_NORMALIZATION_LP_NORM_OP_H_
#define DRAGON_OPERATORS_NORMALIZATION_LP_NORM_OP_H_
#include "dragon/core/operator.h"
namespace dragon {
template <class Context>
class LpNormalizeOp final : public Operator<Context> {
class LpNormOp final : public Operator<Context> {
public:
LpNormalizeOp(const OperatorDef& def, Workspace* ws)
LpNormOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
p_(OP_SINGLE_ARG(int64_t, "p", 2)),
epsilon_(OP_SINGLE_ARG(double, "epsilon", 1e-12)),
reduction_(OP_SINGLE_ARG(string, "reduction", "SUM")) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T>
void DoRunWithType();
......@@ -39,16 +41,18 @@ class LpNormalizeOp final : public Operator<Context> {
};
template <class Context>
class LpNormalizeGradientOp final : public Operator<Context> {
class LpNormGradientOp final : public Operator<Context> {
public:
LpNormalizeGradientOp(const OperatorDef& def, Workspace* ws)
LpNormGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
p_(OP_SINGLE_ARG(int64_t, "p", 2)),
epsilon_(OP_SINGLE_ARG(double, "epsilon", 1e-12)),
reduction_(OP_SINGLE_ARG(string, "reduction", "SUM")) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T>
void DoRunWithType();
......@@ -61,4 +65,4 @@ class LpNormalizeGradientOp final : public Operator<Context> {
} // namespace dragon
#endif // DRAGON_OPERATORS_NORMALIZATION_LP_NORMALIZE_OP_H_
#endif // DRAGON_OPERATORS_NORMALIZATION_LP_NORM_OP_H_
......@@ -5,46 +5,41 @@
namespace dragon {
template <class Context>
void AdamOp<Context>::ComputeUpdate(Tensor* dX, Tensor* /* X */) {
template <typename T, typename CopyT>
void AdamOp<Context>::DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y) {
kernels::Adam(
dX->count(),
lr_ * correction_,
beta1_,
beta2_,
eps_,
dX->template mutable_data<float, Context>(),
Slot("m")->ReshapeLike(*dX)->template mutable_data<float, Context>(),
Slot("v")->ReshapeLike(*dX)->template mutable_data<float, Context>(),
this->weight_decay_,
X->template data<T, Context>(),
dX->template mutable_data<T, Context>(),
GetState("m")->ReshapeLike(*dX)->template mutable_data<T, Context>(),
GetState("v")->ReshapeLike(*dX)->template mutable_data<T, Context>(),
X->template mutable_data<T, Context>(),
Y ? Y->template mutable_data<CopyT, Context>() : (CopyT*)nullptr,
ctx());
}
template <class Context>
void AdamWOp<Context>::ComputeUpdate(Tensor* dX, Tensor* X) {
if (lambda_ > 0.f) {
kernels::AdamW(
dX->count(),
lr_ * correction_,
beta1_,
beta2_,
eps_,
this->lr_ * lambda_,
X->template data<float, Context>(),
dX->template mutable_data<float, Context>(),
Slot("m")->ReshapeLike(*dX)->template mutable_data<float, Context>(),
Slot("v")->ReshapeLike(*dX)->template mutable_data<float, Context>(),
ctx());
} else {
kernels::Adam(
dX->count(),
lr_ * correction_,
beta1_,
beta2_,
eps_,
dX->template mutable_data<float, Context>(),
Slot("m")->ReshapeLike(*dX)->template mutable_data<float, Context>(),
Slot("v")->ReshapeLike(*dX)->template mutable_data<float, Context>(),
ctx());
}
template <typename T, typename CopyT>
void AdamWOp<Context>::DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y) {
kernels::AdamW(
dX->count(),
lr_ * correction_,
beta1_,
beta2_,
eps_,
lr_ * this->weight_decay_,
X->template data<T, Context>(),
dX->template mutable_data<T, Context>(),
GetState("m")->ReshapeLike(*dX)->template mutable_data<T, Context>(),
GetState("v")->ReshapeLike(*dX)->template mutable_data<T, Context>(),
X->template mutable_data<T, Context>(),
Y ? Y->template mutable_data<CopyT, Context>() : (CopyT*)nullptr,
ctx());
}
DEPLOY_CPU_OPERATOR(Adam);
......
......@@ -5,16 +5,21 @@
namespace dragon {
template <class Context>
void RMSpropOp<Context>::ComputeUpdate(Tensor* dX, Tensor* /* X */) {
template <typename T, typename CopyT>
void RMSpropOp<Context>::DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y) {
kernels::RMSprop(
dX->count(),
lr_,
momentum_,
decay_,
alpha_,
eps_,
dX->template mutable_data<float, Context>(),
Slot("m")->ReshapeLike(*dX)->template mutable_data<float, Context>(),
Slot("v")->ReshapeLike(*dX)->template mutable_data<float, Context>(),
this->weight_decay_,
X->template data<T, Context>(),
dX->template mutable_data<T, Context>(),
GetState("m")->ReshapeLike(*dX)->template mutable_data<T, Context>(),
GetState("v")->ReshapeLike(*dX)->template mutable_data<T, Context>(),
X->template mutable_data<T, Context>(),
Y ? Y->template mutable_data<CopyT, Context>() : (CopyT*)nullptr,
ctx());
}
......
......@@ -6,33 +6,44 @@
namespace dragon {
template <class Context>
void MomentumSGDOp<Context>::ComputeUpdate(Tensor* dX, Tensor* /* X */) {
template <typename T, typename CopyT>
void MomentumSGDOp<Context>::DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y) {
kernels::MomentumSGD(
dX->count(),
lr_,
momentum_,
dX->template mutable_data<float, Context>(),
Slot("m")->ReshapeLike(*dX)->template mutable_data<float, Context>(),
this->weight_decay_,
X->template data<T, Context>(),
dX->template mutable_data<T, Context>(),
GetState("m")->ReshapeLike(*dX)->template mutable_data<T, Context>(),
X->template mutable_data<T, Context>(),
Y ? Y->template mutable_data<CopyT, Context>() : (CopyT*)nullptr,
ctx());
}
template <class Context>
void NesterovSGDOp<Context>::ComputeUpdate(Tensor* dX, Tensor* /* X */) {
template <typename T, typename CopyT>
void NesterovSGDOp<Context>::DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y) {
kernels::NesterovSGD(
dX->count(),
lr_,
momentum_,
dX->template mutable_data<float, Context>(),
Slot("m")->ReshapeLike(*dX)->template mutable_data<float, Context>(),
this->weight_decay_,
X->template data<T, Context>(),
dX->template mutable_data<T, Context>(),
GetState("m")->ReshapeLike(*dX)->template mutable_data<T, Context>(),
X->template mutable_data<T, Context>(),
Y ? Y->template mutable_data<CopyT, Context>() : (CopyT*)nullptr,
ctx());
}
template <class Context>
void LARSOp<Context>::ComputeUpdate(Tensor* dX, Tensor* X) {
template <typename T, typename CopyT>
void LARSOp<Context>::DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y) {
float trust_ratio = 0.f;
if (trust_coef_ > 0.f) {
auto* x = X->template data<float, Context>();
auto* dx = dX->template mutable_data<float, Context>();
auto* x = X->template data<T, Context>();
auto* dx = dX->template mutable_data<T, Context>();
float x_norm = std::sqrt(math::Dot(X->count(), x, x, ctx()));
float dx_norm = std::sqrt(math::Dot(dX->count(), dx, dx, ctx()));
if (x_norm > 0.f && dx_norm > 0.f) {
......@@ -43,16 +54,20 @@ void LARSOp<Context>::ComputeUpdate(Tensor* dX, Tensor* X) {
math::Scale(
dX->count(),
trust_ratio,
dX->template data<float, Context>(),
dX->template mutable_data<float, Context>(),
dX->template data<T, Context>(),
dX->template mutable_data<T, Context>(),
ctx());
}
kernels::MomentumSGD(
dX->count(),
lr_,
momentum_,
dX->template mutable_data<float, Context>(),
Slot("m")->ReshapeLike(*dX)->template mutable_data<float, Context>(),
this->weight_decay_,
X->template data<T, Context>(),
dX->template mutable_data<T, Context>(),
GetState("m")->ReshapeLike(*dX)->template mutable_data<T, Context>(),
X->template mutable_data<T, Context>(),
Y ? Y->template mutable_data<CopyT, Context>() : (CopyT*)nullptr,
ctx());
}
......
......@@ -37,17 +37,14 @@ class UpdateOpBase : public Operator<Context> {
void RunOnDevice() override;
template <typename T>
void TransformGrad(Tensor* dX, Tensor* X);
void TransformGrad(Tensor* dX);
virtual void ComputeUpdate(Tensor* dX, Tensor* X) = 0;
template <typename T>
void ApplyUpdate(Tensor* dX, Tensor* X);
virtual void ApplyUpdate(Tensor* dX, Tensor* X, Tensor* Y) = 0;
template <typename T>
T GetHyper(const string& key);
Tensor* Slot(const string& key);
Tensor* GetState(const string& key);
protected:
int weight_index_;
......@@ -55,9 +52,26 @@ class UpdateOpBase : public Operator<Context> {
float clip_norm_, clip_value_;
};
#define USE_UPDATE_FUNCTIONS \
using UpdateOpBase<Context>::GetHyper; \
using UpdateOpBase<Context>::Slot
#define USE_UPDATE_FUNCTIONS \
using UpdateOpBase<Context>::GetHyper; \
using UpdateOpBase<Context>::GetState; \
void ApplyUpdate(Tensor* dX, Tensor* X, Tensor* Y) override { \
if (dX->template IsType<float>()) { \
if (Y == nullptr) { \
DoRunWithType<float, float>(dX, X, Y); \
} else if (Y->template IsType<float16>()) { \
DoRunWithType<float, float16>(dX, X, Y); \
} else { \
LOG(FATAL) << MessageForUnsupported( \
dtypes::to_string(Y->meta()), {"float16", "float32"}); \
} \
} else if (dX->template IsType<double>()) { \
DoRunWithType<double, double>(dX, X, Y); \
} else { \
LOG(FATAL) << MessageForUnsupported( \
dtypes::to_string(dX->meta()), {"float32", "float64"}); \
} \
}
template <class Context>
class MomentumSGDOp final : public UpdateOpBase<Context> {
......@@ -73,7 +87,8 @@ class MomentumSGDOp final : public UpdateOpBase<Context> {
UpdateOpBase<Context>::GetArguments();
}
void ComputeUpdate(Tensor* dX, Tensor* /* X */) override;
template <typename T, typename CopyT>
void DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y);
protected:
float lr_, momentum_;
......@@ -93,7 +108,8 @@ class NesterovSGDOp final : public UpdateOpBase<Context> {
UpdateOpBase<Context>::GetArguments();
}
void ComputeUpdate(Tensor* dX, Tensor* /* X */) override;
template <typename T, typename CopyT>
void DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y);
protected:
float lr_, momentum_;
......@@ -110,15 +126,16 @@ class RMSpropOp final : public UpdateOpBase<Context> {
void GetArguments() override {
lr_ = this->template GetHyper<float>("lr");
momentum_ = this->template GetHyper<float>("momentum");
decay_ = this->template GetHyper<float>("decay");
alpha_ = this->template GetHyper<float>("alpha");
eps_ = this->template GetHyper<float>("eps");
UpdateOpBase<Context>::GetArguments();
}
void ComputeUpdate(Tensor* dX, Tensor* /* X */) override;
template <typename T, typename CopyT>
void DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y);
protected:
float lr_, momentum_, decay_, eps_;
float lr_, momentum_, alpha_, eps_;
};
template <class Context>
......@@ -139,7 +156,8 @@ class AdamOp : public UpdateOpBase<Context> {
UpdateOpBase<Context>::GetArguments();
}
void ComputeUpdate(Tensor* dX, Tensor* /* X */) override;
template <typename T, typename CopyT>
void DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y);
protected:
int64_t t_;
......@@ -163,16 +181,15 @@ class AdamWOp final : public UpdateOpBase<Context> {
t_++;
correction_ = sqrt(1.f - pow(beta2_, t_)) / (1.f - pow(beta1_, t_));
UpdateOpBase<Context>::GetArguments();
lambda_ = this->weight_decay_;
this->weight_decay_ = 0.f;
}
void ComputeUpdate(Tensor* dX, Tensor* X) override;
template <typename T, typename CopyT>
void DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y);
protected:
int64_t t_;
float lr_, beta1_, beta2_;
float eps_, correction_, lambda_;
float eps_, correction_;
};
template <class Context>
......@@ -190,14 +207,13 @@ class LARSOp final : public UpdateOpBase<Context> {
UpdateOpBase<Context>::GetArguments();
}
void ComputeUpdate(Tensor* dX, Tensor* X) override;
template <typename T, typename CopyT>
void DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y);
protected:
float lr_, momentum_, trust_coef_;
};
#undef USE_UPDATE_FUNCTIONS
} // namespace dragon
#endif // DRAGON_OPERATORS_TRAINING_UPDATE_OP_H_
......@@ -13,67 +13,40 @@ T UpdateOpBase<Context>::GetHyper(const string& key) {
}
template <class Context>
Tensor* UpdateOpBase<Context>::Slot(const string& key) {
Tensor* UpdateOpBase<Context>::GetState(const string& key) {
const string& weight_name = Output(weight_index_)->name();
return workspace()->CreateTensor(name() + "/" + weight_name + "/" + key);
}
template <class Context>
template <typename T>
void UpdateOpBase<Context>::TransformGrad(Tensor* dX, Tensor* X) {
// Scale.
void UpdateOpBase<Context>::TransformGrad(Tensor* dX) {
if (grad_scale_ != 1.f) {
auto* dx = dX->template mutable_data<T, Context>();
math::Scale(dX->count(), grad_scale_, dx, dx, ctx());
}
// Clip.
if (clip_norm_ > 0.f) {
auto* dx = dX->template mutable_data<T, Context>();
float grad_norm = std::sqrt(math::Dot(dX->count(), dx, dx, ctx()));
if (grad_norm > clip_norm_) {
math::Scale(dX->count(), clip_norm_ / grad_norm, dx, dx, ctx());
float norm = std::sqrt(math::Dot(dX->count(), dx, dx, ctx()));
if (norm > clip_norm_) {
math::Scale(dX->count(), clip_norm_ / norm, dx, dx, ctx());
}
} else if (clip_value_ > 0.f) {
auto* dx = dX->template mutable_data<T, Context>();
kernels::Clip(dX->count(), -clip_value_, clip_value_, dx, dx, ctx());
}
// Penalty.
if (weight_decay_ > 0.f) {
math::Axpy(
X->count(),
weight_decay_,
X->template data<T, Context>(),
dX->template mutable_data<T, Context>(),
ctx());
}
}
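
With the penalty branch removed, ``TransformGrad`` only scales and clips the gradient; the decay is applied inside the fused update kernels through ``weight_decay_``. A NumPy sketch of the remaining transform, assuming the same scale-then-clip ordering:

```python
import numpy as np

def transform_grad(dx, grad_scale=1., clip_norm=0., clip_value=0.):
    """Scale, then clip by global L2 norm or by value (sketch)."""
    if grad_scale != 1.:
        dx = dx * grad_scale
    if clip_norm > 0.:
        norm = float(np.sqrt(np.dot(dx.ravel(), dx.ravel())))
        if norm > clip_norm:
            dx = dx * (clip_norm / norm)
    elif clip_value > 0.:
        dx = np.clip(dx, -clip_value, clip_value)
    return dx

g = np.array([3., 4.])
print(transform_grad(g, clip_norm=1.))    # [0.6 0.8]
print(transform_grad(g, clip_value=3.5))  # [3.  3.5]
```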
template <class Context>
template <typename T>
void UpdateOpBase<Context>::ApplyUpdate(Tensor* dX, Tensor* X) {
math::Sub(
X->count(),
X->template data<T, Context>(),
dX->template data<T, Context>(),
X->template mutable_data<T, Context>(),
ctx());
}
template <class Context>
void UpdateOpBase<Context>::RunOnDevice() {
GetArguments();
for (int i = 0; i < InputSize(); ++i) {
weight_index_ = i;
auto &dX = Input(i), *X = Output(i);
if (dX.count() == 0 || X->count() == 0) return;
for (weight_index_ = 0; weight_index_ < InputSize(); ++weight_index_) {
auto &dX = Input(weight_index_), *X = Output(weight_index_);
if (dX.count() == 0 || X->count() == 0) continue;
CHECK(dX.dims() == X->dims())
<< "\nWeight and grad should have the same dimensions."
<< "\nGot" << X->DimString() << " and " << dX.DimString();
if (dX.template IsType<float>()) {
TransformGrad<float>(&dX, X);
ComputeUpdate(&dX, X);
ApplyUpdate<float>(&dX, X);
} else if (dX.template IsType<float16>()) {
if (dX.template IsType<float16>()) {
auto* X_master = workspace()->CreateTensor(X->name() + "_master");
auto* X_grad = ctx()->workspace()->CreateTensor("BufferShared");
if (X_master->count() != X->count()) {
......@@ -88,17 +61,17 @@ void UpdateOpBase<Context>::RunOnDevice() {
dX.template data<float16, Context>(),
X_grad->ReshapeLike(dX)->template mutable_data<float, Context>(),
ctx());
TransformGrad<float>(X_grad, X_master);
ComputeUpdate(X_grad, X_master);
ApplyUpdate<float>(X_grad, X_master);
math::Cast(
X->count(),
X_master->template data<float, Context>(),
X->template mutable_data<float16, Context>(),
ctx());
TransformGrad<float>(X_grad);
ApplyUpdate(X_grad, X_master, X);
} else if (dX.template IsType<float>()) {
TransformGrad<float>(&dX);
ApplyUpdate(&dX, X, nullptr);
} else if (dX.template IsType<double>()) {
TransformGrad<double>(&dX);
ApplyUpdate(&dX, X, nullptr);
} else {
LOG(FATAL) << MessageForUnsupported(
dtypes::to_string(dX.meta()), {"float16", "float32"});
dtypes::to_string(dX.meta()), {"float16", "float32", "float64"});
}
}
}
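
For float16 weights, the loop keeps a float32 master copy, casts the gradient up, runs the fused update on the master, and writes the converted float16 weight through the extra ``Y`` output. A NumPy sketch of that flow, using a plain SGD step as a stand-in for the fused kernel:

```python
import numpy as np

def fp16_update_step(w_fp16, g_fp16, w_master, lr=0.1):
    """Update the float32 master copy and mirror it back to float16 (sketch)."""
    g = g_fp16.astype(np.float32)               # cast gradient to float32
    w_master -= lr * g                          # update runs in float32
    w_fp16[...] = w_master.astype(np.float16)   # converted copy (the Y output)
    return w_fp16, w_master

w16 = np.array([1., 2.], dtype=np.float16)
master = w16.astype(np.float32)
g16 = np.array([.5, .5], dtype=np.float16)
w16, master = fp16_update_step(w16, g16, master)
print(master)  # [0.95 1.95]
print(w16)     # float16 mirror of the master weights
```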
......
......@@ -58,9 +58,6 @@ from dragon.core.ops import tensor_ops as _
from dragon.core.ops.array_ops import assign
from dragon.core.ops.array_ops import boolean_mask
from dragon.core.ops.array_ops import broadcast_to
from dragon.core.ops.array_ops import channel_affine
from dragon.core.ops.array_ops import channel_normalize
from dragon.core.ops.array_ops import channel_shuffle
from dragon.core.ops.array_ops import concat
from dragon.core.ops.array_ops import expand_dims
from dragon.core.ops.array_ops import flatten
......
......@@ -21,6 +21,7 @@ from dragon.core.device.cuda import current_device
from dragon.core.device.cuda import get_device_capability
from dragon.core.device.cuda import is_available
from dragon.core.device.cuda import memory_allocated
from dragon.core.device.cuda import set_cublas_flags
from dragon.core.device.cuda import set_cudnn_flags
from dragon.core.device.cuda import set_default_device
from dragon.core.device.cuda import set_device
......
......@@ -17,8 +17,10 @@ from dragon.core.ops.activation_ops import sigmoid
from dragon.core.ops.activation_ops import tanh
from dragon.core.ops.math_ops import abs
from dragon.core.ops.math_ops import add
from dragon.core.ops.math_ops import affine
from dragon.core.ops.math_ops import argmax
from dragon.core.ops.math_ops import argmin
from dragon.core.ops.math_ops import atan2
from dragon.core.ops.math_ops import ceil
from dragon.core.ops.math_ops import clip
from dragon.core.ops.math_ops import cos
......@@ -60,7 +62,6 @@ from dragon.core.ops.math_ops import sqrt
from dragon.core.ops.math_ops import square
from dragon.core.ops.math_ops import sub
from dragon.core.ops.math_ops import sum
from dragon.core.ops.normalization_ops import lp_normalize
from dragon.core.ops.sort_ops import top_k
__all__ = [_s for _s in dir() if not _s.startswith('_')]
......@@ -34,12 +34,15 @@ from dragon.core.ops.activation_ops import relu6
from dragon.core.ops.activation_ops import selu
from dragon.core.ops.activation_ops import silu
from dragon.core.ops.activation_ops import softmax
from dragon.core.ops.array_ops import channel_shuffle
from dragon.core.ops.math_ops import moments
from dragon.core.ops.normalization_ops import batch_norm
from dragon.core.ops.normalization_ops import channel_norm
from dragon.core.ops.normalization_ops import group_norm
from dragon.core.ops.normalization_ops import instance_norm
from dragon.core.ops.normalization_ops import layer_norm
from dragon.core.ops.normalization_ops import local_response_norm
from dragon.core.ops.normalization_ops import lp_norm
from dragon.core.ops.normalization_ops import sync_batch_norm
from dragon.core.ops.vision_ops import bias_add
from dragon.core.ops.vision_ops import conv
......
......@@ -78,16 +78,13 @@ def cast_args(**kwargs):
return {'dtype': kwargs.get('dtype', 'float32')}
@register('ChannelAffine')
def channel_affine_args(**kwargs):
return {
'axis': kwargs.get('axis', -1),
'end_axis': kwargs.get('end_axis', kwargs.get('axis', -1)),
}
@register('Affine')
def affine_args(**kwargs):
return {'axes': kwargs.get('axes', None)}
@register('ChannelNormalize')
def channel_normalize_args(**kwargs):
@register('ChannelNorm')
def channel_norm_args(**kwargs):
return {
'axis': kwargs.get('axis', -1),
'mean': kwargs.get('mean', None),
......@@ -323,8 +320,8 @@ def loss_args(**kwargs):
return {'reduction': kwargs.get('reduction', 'MEAN')}
@register('LpNormalize')
def lp_normalize_args(**kwargs):
@register('LpNorm')
def lp_norm_args(**kwargs):
return {
'p': kwargs.get('p', 2),
'axis': kwargs.get('axis', -1),
......
......@@ -81,6 +81,7 @@ def binary_shape_spec(inputs, outputs):
@register([
'Add',
'Atan2',
'BitwiseAnd',
'BitwiseOr',
'BitwiseXor',
......@@ -403,7 +404,7 @@ def gemm_spec(args, inputs, outputs):
return outputs
@register('ChannelNormalize')
@register('ChannelNorm')
def channel_normalize_spec(args, inputs, outputs):
outputs[0]._dtype = args['dtype']
try:
......
......@@ -62,11 +62,23 @@ def current_device():
return backend.cudaGetDevice()
def set_cublas_flags(allow_tf32=None):
"""Set the flags of cuBLAS library.
Parameters
----------
    allow_tf32 : bool, optional
Allow TF32 tensor core operation or not.
"""
backend.cublasSetFlags(-1 if allow_tf32 is None else allow_tf32)
def set_cudnn_flags(
enabled=True,
benchmark=False,
deterministic=False,
allow_tf32=False,
enabled=None,
benchmark=None,
deterministic=None,
allow_tf32=None,
):
"""Set the flags of cuDNN library.
......@@ -82,7 +94,11 @@ def set_cudnn_flags(
Allow TF32 tensor core operation or not.
"""
backend.cudnnSetFlags(enabled, benchmark, deterministic, allow_tf32)
backend.cudnnSetFlags(
-1 if enabled is None else enabled,
-1 if benchmark is None else benchmark,
-1 if deterministic is None else deterministic,
-1 if allow_tf32 is None else allow_tf32)
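
Passing ``None`` now forwards ``-1`` to the backend, which is read here as "leave that flag unchanged" (an assumption about the binding's convention). Typical usage through the public aliases exported above:

```python
import dragon

# Allow TF32 tensor cores for cuBLAS GEMMs.
dragon.cuda.set_cublas_flags(allow_tf32=True)

# Toggle only the benchmark mode; the other cuDNN flags keep their values.
dragon.cuda.set_cudnn_flags(benchmark=True)
```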
def get_device_capability(device_index=None):
......
......@@ -122,107 +122,16 @@ def broadcast_to(inputs, shape, **kwargs):
return OpLib.add('Expand', **args)
@OpSchema.num_inputs(2, 3)
def channel_affine(inputs, axis=-1, end_axis=None, **kwargs):
r"""Apply affine transformation to each channel of input.
Parameters
----------
inputs : Sequence[dragon.Tensor]
The input, weight and optional bias tensor.
axis : int, optional, default=-1
The first channel axis.
end_axis : int, optional
The last channel axis.
Returns
-------
dragon.Tensor
The output tensor.
"""
outputs = kwargs.pop('outputs', [None])
if context.executing_eagerly():
return OpLib.execute(
'ChannelAffine', inputs, outputs=outputs,
axis=axis, end_axis=end_axis)
return OpLib.add('ChannelAffine', inputs,
axis=axis, end_axis=end_axis, **kwargs)
@OpSchema.num_inputs(1)
@OpSchema.convert_arg('perm')
def channel_normalize(
inputs,
mean,
std,
axis=-1,
dtype='float32',
perm=None,
**kwargs
):
"""Apply normalization to each channel of input.
:attr:`axis` can be negative:
```python
m = s = (1., 1., 1.)
x = dragon.constant([1, 2, 3])
print(dragon.channel_normalize(x, m, s, axis=0)) # [0., 1., 2.]
print(dragon.channel_normalize(x, m, s, axis=-1)) # Equivalent
```
If :attr:`perm` provided, :attr:`axis` is selected from the output layout:
```python
m, s = (1., 2., 3.), (1., 1., 1.)
x = dragon.constant([[1, 2, 3]])
# Provided 3 values to normalize the last axis
# with length 1, only the first value will be taken
print(dragon.channel_normalize(x, m, s, perm=(1, 0))) # [[0.], [1.], [2.]]
```
Parameters
----------
inputs : dragon.Tensor
The input tensor.
mean : Sequence[float], required
The mean to subtract.
std : Sequence[float], required
The standard deviation to divide.
axis : int, optional, default=-1
The channel axis.
dtype : str, optional, default='float32'
The output data type.
perm : Sequence[Union[int, dragon.Tensor]], optional
The output permutation.
Returns
-------
dragon.Tensor
The output tensor.
"""
args = OpSchema.parse_args(locals())
if context.executing_eagerly():
return OpLib.execute(
'ChannelNormalize', inputs,
axis=axis, mean=mean, std=std, dtype=dtype,
ndim=len(args['perm']) if perm is not None else 0,
perm=args['perm'])
return OpLib.add('ChannelNormalize', **args)
@OpSchema.num_inputs(1)
def channel_shuffle(inputs, axis=-1, group=1, **kwargs):
"""Apply group shuffle to each channel of input.
"""Apply the group shuffle to each channel of input.
`[Zhang et.al, 2017] <https://arxiv.org/abs/1707.01083>`_.
Examples:
```python
x = dragon.constant([1, 2, 3, 4])
print(dragon.channel_shuffle(x, group=2)) # [1, 3, 2, 4]
print(dragon.nn.channel_shuffle(x, group=2)) # [1, 3, 2, 4]
```
Parameters
......
......@@ -82,6 +82,30 @@ def add(inputs, **kwargs):
return OpLib.add('Add', inputs, **kwargs)
@OpSchema.num_inputs(2, 3)
def affine(inputs, axis=-1, **kwargs):
"""Apply affine transformation to input.
Parameters
----------
inputs : Sequence[dragon.Tensor]
The input, scale and bias tensor.
axis : Union[int, Sequence[int]], optional, default=-1
The axis to apply.
Returns
-------
dragon.Tensor
The output tensor.
"""
axes = nest.flatten(axis)
outputs = kwargs.pop('outputs', [None])
if context.executing_eagerly():
return OpLib.execute('Affine', inputs, outputs=outputs, axes=axes)
return OpLib.add('Affine', inputs, axes=axes, **kwargs)
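
A short eager-mode sketch of the new API; the constant helpers below are only for illustration, and the key point is that ``axis`` may now be a sequence, with the scale and bias shaped like those axes:

```python
import numpy as np
import dragon

x = dragon.constant(np.ones((2, 3, 4), 'float32'))
w = dragon.constant(np.full((3, 4), 2., 'float32'))
b = dragon.constant(np.full((3, 4), 1., 'float32'))

# y = x * w + b, with w and b broadcast along axes (1, 2).
y = dragon.math.affine([x, w, b], axis=(1, 2))
print(y.shape)  # (2, 3, 4)
```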
@OpSchema.num_inputs(1)
def argmax(inputs, axis=0, keepdims=False, **kwargs):
"""Compute the index of maximum elements along the given axis.
......@@ -149,6 +173,37 @@ def argmin(inputs, axis=0, keepdims=False, **kwargs):
@OpSchema.num_inputs(2)
def atan2(inputs, **kwargs):
r"""Compute the element-wise arc-tangent of two arguments.
.. math:: \text{out} = \text{arctan}(\frac{\text{input1}}{\text{input2}})
Examples:
```python
y = dragon.constant(1)
x = dragon.constant(2)
print(dragon.math.atan2([y, x])) # 0.46364761
```
Parameters
----------
inputs : Sequence[dragon.Tensor]
The input1 and input2 tensor.
Returns
-------
dragon.Tensor
The output tensor.
"""
inputs = constant_ops.remove_scalars(inputs)
if context.executing_eagerly():
return OpLib.execute('Atan2', inputs)
return OpLib.add('Atan2', inputs, **kwargs)
@OpSchema.num_inputs(2)
def bitwise_and(inputs, **kwargs):
r"""Compute the element-wise AND bitwise operation.
......
......@@ -72,6 +72,69 @@ def batch_norm(
return OpLib.add('BatchNorm', **args)
@OpSchema.num_inputs(1)
@OpSchema.convert_arg('perm')
def channel_norm(
inputs,
mean,
std,
axis=-1,
dtype='float32',
perm=None,
**kwargs
):
"""Apply the normalization to each channel of input.
:attr:`axis` can be negative:
```python
m = s = (1., 1., 1.)
x = dragon.constant([1, 2, 3])
print(dragon.nn.channel_norm(x, m, s, axis=0)) # [0., 1., 2.]
print(dragon.nn.channel_norm(x, m, s, axis=-1)) # Equivalent
```
If :attr:`perm` provided, :attr:`axis` is selected from the output layout:
```python
m, s = (1., 2., 3.), (1., 1., 1.)
x = dragon.constant([[1, 2, 3]])
# Provided 3 values to normalize the last axis
# with length 1, only the first value will be taken
print(dragon.nn.channel_norm(x, m, s, perm=(1, 0))) # [[0.], [1.], [2.]]
```
Parameters
----------
inputs : dragon.Tensor
The input tensor.
mean : Sequence[float], required
The mean to subtract.
std : Sequence[float], required
The standard deviation to divide.
axis : int, optional, default=-1
The channel axis.
dtype : str, optional, default='float32'
The output data type.
perm : Sequence[Union[int, dragon.Tensor]], optional
The output permutation.
Returns
-------
dragon.Tensor
The output tensor.
"""
args = OpSchema.parse_args(locals())
if context.executing_eagerly():
return OpLib.execute(
'ChannelNorm', inputs,
axis=axis, mean=mean, std=std, dtype=dtype,
ndim=len(args['perm']) if perm is not None else 0,
perm=args['perm'])
return OpLib.add('ChannelNorm', **args)
@OpSchema.num_inputs(3)
def group_norm(inputs, axis=-1, group=0, epsilon=1e-5, **kwargs):
r"""Apply the group normalization.
......@@ -180,7 +243,7 @@ def layer_norm(inputs, axis=-1, epsilon=1e-5, **kwargs):
@OpSchema.num_inputs(1)
def lp_normalize(
def lp_norm(
inputs,
axis=-1,
end_axis=None,
......@@ -200,15 +263,15 @@ def lp_normalize(
```python
x = dragon.constant([[1, 2, 3], [4, 5, 6]], 'float32')
# A negative axis is the last-k axis
print(dragon.math.lp_normalize(x, 1))
print(dragon.math.lp_normalize(x, -1)) # Equivalent
print(dragon.nn.lp_norm(x, 1))
print(dragon.nn.lp_norm(x, -1)) # Equivalent
```
More than one axis could be specified to reduce:
```python
# Along the continuous axes: [axis, end_axis]
print(dragon.math.lp_normalize(x, axis=0, end_axis=1))
print(dragon.nn.lp_norm(x, axis=0, end_axis=1))
```
Parameters
......@@ -236,9 +299,9 @@ def lp_normalize(
reduction = reduction.upper()
if context.executing_eagerly():
return OpLib.execute(
'LpNormalize', inputs, p=p, axis=axis, end_axis=end_axis,
'LpNorm', inputs, p=p, axis=axis, end_axis=end_axis,
epsilon=epsilon, reduction=reduction)
return OpLib.add('LpNormalize', inputs, p=p, axis=axis, end_axis=end_axis,
return OpLib.add('LpNorm', inputs, p=p, axis=axis, end_axis=end_axis,
epsilon=epsilon, reduction=reduction, **kwargs)
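
For reference, a NumPy sketch of what the L2 branch computes with the default ``reduction='SUM'``; the clipped-norm form is an assumption based on the kernel names above:

```python
import numpy as np

def l2_norm_reference(x, axis=-1, epsilon=1e-12):
    """y = x / max(||x||_2, epsilon) along `axis` (sketch)."""
    norm = np.sqrt(np.sum(np.square(x), axis=axis, keepdims=True))
    return x / np.maximum(norm, epsilon)

x = np.array([[1., 2., 3.], [4., 5., 6.]], 'float32')
print(l2_norm_reference(x, axis=1))  # rows scaled to unit L2 norm
```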
......
......@@ -24,9 +24,11 @@ class Adam(optimizer.Optimizer):
The **Adam** update is defined as:
.. math::
\text{Adam}(g) = \frac{\text{lr} * m_{t}}{\sqrt{v_{t}} + \epsilon} \\
\text{Adam}(g) = \text{lr} * (\frac{\text{correction}* m_{t}}
{\sqrt{v_{t}} + \epsilon}) \\
\quad \\ \text{where}\quad
\begin{cases}
\text{correction} = \sqrt{1 - \beta_{2}^{t}} / (1 - \beta_{1}^{t}) \\
m_{t} = \beta_{1} * m_{t-1} + (1 - \beta_{1}) * g \\
v_{t} = \beta_{2} * v_{t-1} + (1 - \beta_{2}) * g^{2}
\end{cases}
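
A scalar NumPy sketch of the step as now documented, with the bias correction folded into the step size; the fused weight decay is omitted for brevity (AdamW additionally adds ``lr * weight_decay * p``, the :math:`\lambda p` term below):

```python
import numpy as np

def adam_step(p, g, m, v, t, lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-8):
    """One bias-corrected Adam step (sketch)."""
    m = beta1 * m + (1. - beta1) * g
    v = beta2 * v + (1. - beta2) * g * g
    correction = np.sqrt(1. - beta2 ** t) / (1. - beta1 ** t)
    return p - lr * correction * m / (np.sqrt(v) + eps), m, v

p, m, v = 1., 0., 0.
for t in range(1, 4):
    p, m, v = adam_step(p, 0.5, m, v, t)
print(p)  # parameter after three steps on a constant gradient
```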
......@@ -62,12 +64,13 @@ class AdamW(Adam):
The **AdamW** update is defined as:
.. math::
\text{AdamW}(g, p) = \text{lr} * (\frac{m_{t}}{\sqrt{v_{t}} + \epsilon}
+ \lambda p) \\
\text{AdamW}(g, p) = \text{lr} * (\frac{\text{correction} * m_{t}}
{\sqrt{v_{t}} + \epsilon} + \lambda p) \\
\quad \\ \text{where}\quad
\begin{cases}
\text{correction} = \sqrt{1 - \beta_{2}^{t}} / (1 - \beta_{1}^{t}) \\
m_{t} = \beta_{1} * m_{t-1} + (1 - \beta_{1}) * g \\
v_{t} = \beta_{2} * v_{t-1} + (1 - \beta_{2}) * g^{2}
v_{t} = \beta_{2} * v_{t-1} + (1 - \beta_{2}) * g^{2} \\
\end{cases}
"""
......
......@@ -27,13 +27,13 @@ class RMSprop(optimizer.Optimizer):
\text{RMSprop}(g) = \text{lr} * m_{t} \\
\quad \\ \text{where} \quad
\begin{cases}
v_{t} = \text{decay} * v_{t-1} + (1 - \text{decay}) * g^{2} \\
v_{t} = \alpha * v_{t-1} + (1 - \alpha) * g^{2} \\
m_{t} = \text{momentum} * m_{t-1} + \frac{g}{\sqrt{v_{t}} + \epsilon}
\end{cases}
"""
def __init__(self, lr=0.01, momentum=0, decay=0.9, eps=1e-8, **kwargs):
def __init__(self, lr=0.01, momentum=0, alpha=0.9, eps=1e-8, **kwargs):
r"""Create a ``RMSProp`` optimizer.
Parameters
......@@ -42,8 +42,8 @@ class RMSprop(optimizer.Optimizer):
The initial value to :math:`\text{lr}`.
momentum : float, optional, default=0
The initial value to :math:`\text{momentum}`.
decay : float, optional, default=0.9
The initial value to :math:`\text{decay}`.
alpha : float, optional, default=0.9
The initial value to :math:`\alpha`.
eps : float, optional, default=1e-8
The initial value to :math:`\epsilon`.
......@@ -51,5 +51,5 @@ class RMSprop(optimizer.Optimizer):
super(RMSprop, self).__init__(**kwargs)
self._set_hyper('lr', lr)
self._set_hyper('momentum', momentum)
self._set_hyper('decay', decay)
self._set_hyper('alpha', alpha)
self._set_hyper('eps', eps)
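
The ``decay`` hyper-parameter is renamed to ``alpha`` to match the documented update. A scalar sketch of that update for reference:

```python
def rmsprop_step(p, g, m, v, lr=0.01, momentum=0., alpha=0.9, eps=1e-8):
    """One RMSprop step as documented above (sketch)."""
    v = alpha * v + (1. - alpha) * g * g
    m = momentum * m + g / (v ** 0.5 + eps)
    return p - lr * m, m, v

p, m, v = rmsprop_step(1., 0.5, 0., 0.)
print(p, m, v)
```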
......@@ -51,41 +51,6 @@ def cast_exporter(op_def, context):
return node, const_tensors
@export_util.register('ChannelAffine')
def channel_affine_exporter(op_def, context):
node, const_tensors = export_util.translate(**locals())
node.op_type = 'ATen' # Currently not supported in ai.onnx
helper.add_attribute(node, 'op_type', 'ChannelAffine')
for arg in op_def.arg:
if arg.name == 'axis':
helper.add_attribute(node, 'axis', arg.i)
elif arg.name == 'end_axis':
helper.add_attribute(node, 'end_axis', arg.i)
return node, const_tensors
@export_util.register('ChannelNormalize')
def channel_normalize_exporter(op_def, context):
node, const_tensors = export_util.translate(**locals())
node.op_type = 'ATen' # Currently not supported in ai.onnx
helper.add_attribute(node, 'op_type', 'ChannelNormalize')
for arg in op_def.arg:
if arg.name == 'mean':
helper.add_attribute(node, 'mean', arg.floats)
elif arg.name == 'std':
helper.add_attribute(node, 'std', arg.floats)
elif arg.name == 'axis':
helper.add_attribute(node, 'axis', arg.i)
elif arg.name == 'dtype':
helper.add_attribute(node, 'dtype', arg.s)
elif arg.name == 'perm':
helper.add_attribute(node, 'perm', arg.ints)
elif arg.name == 'perm_desc':
values = helper.fetch_argument(op_def, arg, context.ws)
helper.add_attribute(node, 'perm', values)
return node, const_tensors
@export_util.register('Concat')
def concat_exporter(op_def, context):
node, const_tensors = export_util.translate(**locals())
......
......@@ -31,6 +31,17 @@ def add_exporter(op_def, context):
return node, const_tensors
@export_util.register('Affine')
def affine_exporter(op_def, context):
node, const_tensors = export_util.translate(**locals())
node.op_type = 'ATen' # Currently not supported in ai.onnx
helper.add_attribute(node, 'op_type', 'Affine')
for arg in op_def.arg:
if arg.name == 'axes':
helper.add_attribute(node, 'axes', arg.ints)
return node, const_tensors
@export_util.register('Div')
def div_exporter(op_def, context):
node, const_tensors = export_util.translate(**locals())
......
......@@ -32,6 +32,28 @@ def batch_norm_exporter(op_def, context):
return node, const_tensors
@export_util.register('ChannelNorm')
def channel_norm_exporter(op_def, context):
node, const_tensors = export_util.translate(**locals())
node.op_type = 'ATen' # Currently not supported in ai.onnx
helper.add_attribute(node, 'op_type', 'ChannelNorm')
for arg in op_def.arg:
if arg.name == 'mean':
helper.add_attribute(node, 'mean', arg.floats)
elif arg.name == 'std':
helper.add_attribute(node, 'std', arg.floats)
elif arg.name == 'axis':
helper.add_attribute(node, 'axis', arg.i)
elif arg.name == 'dtype':
helper.add_attribute(node, 'dtype', arg.s)
elif arg.name == 'perm':
helper.add_attribute(node, 'perm', arg.ints)
elif arg.name == 'perm_desc':
values = helper.fetch_argument(op_def, arg, context.ws)
helper.add_attribute(node, 'perm', values)
return node, const_tensors
@export_util.register('GroupNorm')
def group_norm_exporter(op_def, context):
node, const_tensors = export_util.translate(**locals())
......@@ -49,8 +71,8 @@ def group_norm_exporter(op_def, context):
return node, const_tensors
@export_util.register('LpNormalize')
def lp_normalize_exporter(op_def, context):
@export_util.register('LpNorm')
def lp_norm_exporter(op_def, context):
node, const_tensors = export_util.translate(**locals())
node.op_type = 'LpNormalization'
axis, end_axis = None, None
......
......@@ -33,9 +33,6 @@ constexpr int CUDA_WARP_SIZE = 32;
/*! \brief The number of cuda threads in a block */
constexpr int CUDA_THREADS = 256;
/*! \brief The maximum number of blocks to use in a default kernel call */
constexpr int CUDA_MAX_BLOCKS = 4096;
/*! \brief The maximum number of devices in a single machine */
constexpr int CUDA_MAX_DEVICES = 16;
......@@ -82,12 +79,15 @@ constexpr int CUDA_TENSOR_MAX_DIMS = 8;
for (size_t j = threadIdx.x; j < m; j += blockDim.x)
inline int CUDA_BLOCKS(const int N) {
return std::max(
std::min((N + CUDA_THREADS - 1) / CUDA_THREADS, CUDA_MAX_BLOCKS), 1);
}
inline int CUDA_2D_BLOCKS(const int N) {
return std::max(std::min(N, CUDA_MAX_BLOCKS), 1);
int device, sm_count, threads_per_sm;
CUDA_CHECK(cudaGetDevice(&device));
CUDA_CHECK(cudaDeviceGetAttribute(
&sm_count, cudaDevAttrMultiProcessorCount, device));
CUDA_CHECK(cudaDeviceGetAttribute(
&threads_per_sm, cudaDevAttrMaxThreadsPerMultiProcessor, device));
const auto num_blocks = (N + CUDA_THREADS - 1) / CUDA_THREADS;
const auto max_blocks = sm_count * threads_per_sm / CUDA_THREADS * 32;
return std::max(1, std::min(num_blocks, max_blocks));
}
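
``CUDA_BLOCKS`` now caps the grid with a per-device occupancy estimate instead of the removed ``CUDA_MAX_BLOCKS`` constant. A Python sketch of the arithmetic, with the device attributes passed in explicitly (the example values are illustrative):

```python
def cuda_blocks(n, threads=256, sm_count=80, threads_per_sm=2048):
    """Grid size capped by an occupancy-derived maximum (sketch)."""
    num_blocks = (n + threads - 1) // threads
    max_blocks = sm_count * threads_per_sm // threads * 32
    return max(1, min(num_blocks, max_blocks))

print(cuda_blocks(1 << 20))  # 4096
print(cuda_blocks(1 << 30))  # capped at 20480 for these attributes
```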
#if CUDA_VERSION_MAX(9, 0)
......
......@@ -84,6 +84,7 @@ DECLARE_ROWWISE_COLWISE_BINARY_FUNC(Sub, T);
DECLARE_ROWWISE_COLWISE_BINARY_FUNC(Mul, T);
DECLARE_ROWWISE_COLWISE_BINARY_FUNC(Div, T);
DECLARE_ROWWISE_COLWISE_BINARY_FUNC(Pow, T);
DECLARE_ROWWISE_COLWISE_BINARY_FUNC(Atan2, T);
DECLARE_ROWWISE_COLWISE_BINARY_FUNC(Minimum, T);
DECLARE_ROWWISE_COLWISE_BINARY_FUNC(Maximum, T);
DECLARE_ROWWISE_COLWISE_BINARY_FUNC(Equal, bool);
......@@ -434,6 +435,7 @@ DEFINE_ROWWISE_COLWISE_BIANRY_FUNC(And, bool, std::logical_and);
DEFINE_ROWWISE_COLWISE_BIANRY_FUNC(Or, bool, std::logical_or);
DEFINE_ROWWISE_COLWISE_BIANRY_FUNC(Xor, bool, math::XorFunctor);
DEFINE_ROWWISE_COLWISE_BIANRY_FUNC(Pow, T, math::PowFunctor);
DEFINE_ROWWISE_COLWISE_BIANRY_FUNC(Atan2, T, math::Atan2Functor);
DEFINE_ROWWISE_COLWISE_BIANRY_FUNC(Minimum, T, math::MinFunctor);
DEFINE_ROWWISE_COLWISE_BIANRY_FUNC(Maximum, T, math::MaxFunctor);
#undef DEFINE_ROWWISE_COLWISE_BIANRY_FUNC
......@@ -469,6 +471,7 @@ DEFINE_BROADCAST_BINARY_FUNC(And, bool, std::logical_and);
DEFINE_BROADCAST_BINARY_FUNC(Or, bool, std::logical_or);
DEFINE_BROADCAST_BINARY_FUNC(Xor, bool, math::XorFunctor);
DEFINE_BROADCAST_BINARY_FUNC(Pow, T, math::PowFunctor);
DEFINE_BROADCAST_BINARY_FUNC(Atan2, T, math::Atan2Functor);
DEFINE_BROADCAST_BINARY_FUNC(Minimum, T, math::MinFunctor);
DEFINE_BROADCAST_BINARY_FUNC(Maximum, T, math::MaxFunctor);
#undef DEFINE_BROADCAST_BINARY_FUNC
......@@ -612,6 +615,9 @@ DEFINE_BINARY_FUNC(Div, float, float);
DEFINE_BINARY_FUNC(Div, double, double);
DEFINE_BINARY_FUNC(Pow, float, float);
DEFINE_BINARY_FUNC(Pow, double, double);
DEFINE_BINARY_FUNC(Atan2, float16, float16);
DEFINE_BINARY_FUNC(Atan2, float, float);
DEFINE_BINARY_FUNC(Atan2, double, double);
DEFINE_BINARY_FUNC(Minimum, uint8_t, uint8_t);
DEFINE_BINARY_FUNC(Minimum, int8_t, int8_t);
DEFINE_BINARY_FUNC(Minimum, int, int);
......
......@@ -388,6 +388,9 @@ DEFINE_BINARY_FUNC(Div, double, double, math::DividesFunctor);
DEFINE_BINARY_FUNC(Pow, float16, float16, math::PowFunctor);
DEFINE_BINARY_FUNC(Pow, float, float, math::PowFunctor);
DEFINE_BINARY_FUNC(Pow, double, double, math::PowFunctor);
DEFINE_BINARY_FUNC(Atan2, float16, float16, math::Atan2Functor);
DEFINE_BINARY_FUNC(Atan2, float, float, math::Atan2Functor);
DEFINE_BINARY_FUNC(Atan2, double, double, math::Atan2Functor);
DEFINE_BINARY_FUNC(Minimum, uint8_t, uint8_t, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, int8_t, int8_t, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, int, int, math::MinFunctor);
......
......@@ -122,6 +122,17 @@ DRAGON_API void Pow(
Context* ctx);
template <typename T, class Context>
DRAGON_API void Atan2(
const int A_ndim,
const int64_t* A_dims,
const int B_ndim,
const int64_t* B_dims,
const T* a,
const T* b,
T* y,
Context* ctx);
template <typename T, class Context>
DRAGON_API void Minimum(
const int A_ndim,
const int64_t* A_dims,
......
......@@ -550,6 +550,9 @@ DEFINE_BINARY_FUNC(Maximum, double, double, max);
_SimpleBinaryFunc(N, Functor<InputT>(), a, b, y); \
}
DEFINE_BINARY_FUNC(Atan2, float16, float16, math::Atan2Functor);
DEFINE_BINARY_FUNC(Atan2, float, float, math::Atan2Functor);
DEFINE_BINARY_FUNC(Atan2, double, double, math::Atan2Functor);
DEFINE_BINARY_FUNC(BitwiseAnd, bool, bool, std::bit_and);
DEFINE_BINARY_FUNC(BitwiseAnd, uint8_t, uint8_t, std::bit_and);
DEFINE_BINARY_FUNC(BitwiseAnd, int8_t, int8_t, std::bit_and);
......
......@@ -342,7 +342,10 @@ _Where(const int N, const T* a, const T* b, const bool* c, T* y) {
DRAGON_API void name<InputT, CUDAContext>( \
const int N, const InputT* x, OutputT* y, CUDAContext* ctx) { \
_SimpleUnaryFunc<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, Functor<InputT>(), x, y); \
N, \
Functor<math::ScalarType<InputT>::type>(), \
reinterpret_cast<const math::ScalarType<InputT>::type*>(x), \
reinterpret_cast<math::ScalarType<OutputT>::type*>(y)); \
}
DEFINE_UNARY_FUNC(BitwiseNot, bool, bool, math::BitNotFunctor);
......@@ -706,6 +709,87 @@ DEFINE_APPLY_MASK_FUNC(float, float);
DEFINE_APPLY_MASK_FUNC(double, double);
#undef DEFINE_APPLY_MASK_FUNC
#define DEFINE_BINARY_FUNC(name, T, Functor) \
template <> \
DRAGON_API void name<T, CUDAContext>( \
const int N, const T* a, const T* b, T* y, CUDAContext* ctx) { \
using ScalarT = typename math::ScalarType<T>::type; \
using ScalarT2 = typename math::ScalarType<T>::type2; \
if ((N & 1) == 0 && sizeof(ScalarT) != sizeof(ScalarT2)) { \
_SimpleBinaryFunc<<< \
CUDA_BLOCKS(N >> 1), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
N >> 1, \
Functor<ScalarT2>(), \
reinterpret_cast<const ScalarT2*>(a), \
reinterpret_cast<const ScalarT2*>(b), \
reinterpret_cast<ScalarT2*>(y)); \
} else { \
_SimpleBinaryFunc<<< \
CUDA_BLOCKS(N), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
N, \
Functor<ScalarT>(), \
reinterpret_cast<const ScalarT*>(a), \
reinterpret_cast<const ScalarT*>(b), \
reinterpret_cast<ScalarT*>(y)); \
} \
}
DEFINE_BINARY_FUNC(Add, uint8_t, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, int8_t, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, int, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, int64_t, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, float16, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, float, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, double, math::PlusFunctor);
DEFINE_BINARY_FUNC(Sub, uint8_t, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, int8_t, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, int, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, int64_t, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, float16, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, float, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, double, math::MinusFunctor);
DEFINE_BINARY_FUNC(Mul, uint8_t, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, int8_t, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, int, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, int64_t, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, float16, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, float, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, double, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Div, uint8_t, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, int8_t, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, int, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, int64_t, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, float16, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, float, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, double, math::DividesFunctor);
DEFINE_BINARY_FUNC(Pow, float16, math::PowFunctor);
DEFINE_BINARY_FUNC(Pow, float, math::PowFunctor);
DEFINE_BINARY_FUNC(Pow, double, math::PowFunctor);
DEFINE_BINARY_FUNC(Atan2, float16, math::Atan2Functor);
DEFINE_BINARY_FUNC(Atan2, float, math::Atan2Functor);
DEFINE_BINARY_FUNC(Atan2, double, math::Atan2Functor);
DEFINE_BINARY_FUNC(Minimum, uint8_t, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, int8_t, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, int, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, int64_t, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, float16, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, float, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, double, math::MinFunctor);
DEFINE_BINARY_FUNC(Maximum, uint8_t, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, int8_t, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, int, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, int64_t, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, float16, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, float, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, double, math::MaxFunctor);
#undef DEFINE_BINARY_FUNC
#define DEFINE_BINARY_FUNC(name, InputT, OutputT, Functor) \
template <> \
DRAGON_API void name<InputT, CUDAContext>( \
......@@ -726,51 +810,6 @@ DEFINE_APPLY_MASK_FUNC(double, double);
reinterpret_cast<math::ScalarType<OutputT>::type*>(y)); \
}
DEFINE_BINARY_FUNC(Add, uint8_t, uint8_t, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, int8_t, int8_t, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, int, int, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, int64_t, int64_t, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, float16, float16, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, float, float, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, double, double, math::PlusFunctor);
DEFINE_BINARY_FUNC(Sub, uint8_t, uint8_t, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, int8_t, int8_t, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, int, int, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, int64_t, int64_t, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, float16, float16, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, float, float, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, double, double, math::MinusFunctor);
DEFINE_BINARY_FUNC(Mul, uint8_t, uint8_t, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, int8_t, int8_t, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, int, int, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, int64_t, int64_t, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, float16, float16, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, float, float, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, double, double, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Div, uint8_t, uint8_t, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, int8_t, int8_t, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, int, int, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, int64_t, int64_t, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, float16, float16, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, float, float, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, double, double, math::DividesFunctor);
DEFINE_BINARY_FUNC(Pow, float16, float16, math::PowFunctor);
DEFINE_BINARY_FUNC(Pow, float, float, math::PowFunctor);
DEFINE_BINARY_FUNC(Pow, double, double, math::PowFunctor);
DEFINE_BINARY_FUNC(Minimum, uint8_t, uint8_t, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, int8_t, int8_t, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, int, int, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, int64_t, int64_t, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, float16, float16, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, float, float, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, double, double, math::MinFunctor);
DEFINE_BINARY_FUNC(Maximum, uint8_t, uint8_t, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, int8_t, int8_t, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, int, int, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, int64_t, int64_t, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, float16, float16, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, float, float, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, double, double, math::MaxFunctor);
DEFINE_BINARY_FUNC(BitwiseAnd, bool, bool, math::BitAndFunctor);
DEFINE_BINARY_FUNC(BitwiseAnd, uint8_t, uint8_t, math::BitAndFunctor);
DEFINE_BINARY_FUNC(BitwiseAnd, int8_t, int8_t, math::BitAndFunctor);
......
......@@ -126,6 +126,9 @@ template <typename T, class Context>
DRAGON_API void Pow(const int N, const T* a, const T* b, T* y, Context* ctx);
template <typename T, class Context>
DRAGON_API void Atan2(const int N, const T* a, const T* b, T* y, Context* ctx);
template <typename T, class Context>
DRAGON_API void
Minimum(const int N, const T* a, const T* b, T* y, Context* ctx);
......
......@@ -16,24 +16,25 @@
#include "dragon/core/types.h"
#include "dragon/utils/conversions.h"
#if defined(__CUDA_ARCH__)
#define HOSTDEVICE_DECL inline __host__ __device__
#else
#define HOSTDEVICE_DECL inline
#endif
namespace dragon {
namespace math {
/*
* Arithmetic Functors */
* Arithmetic Functors
*/
template <typename T>
struct IdentityFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& x) const {
HOSTDEVICE_DECL T operator()(const T& x) const {
return x;
}
#else
inline T operator()(const T& x) const {
return x;
}
#endif
};
template <typename T>
......@@ -76,15 +77,9 @@ struct AbsFunctor<half2> {
template <typename T>
struct SqrFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& x) const {
HOSTDEVICE_DECL T operator()(const T& x) const {
return x * x;
}
#else
inline T operator()(const T& x) const {
return x * x;
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -115,40 +110,16 @@ struct SqrFunctor<half2> {
template <typename T>
struct MaxFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs < rhs ? rhs : lhs;
}
#else
inline T operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL T operator()(const T& lhs, const T& rhs) const {
return lhs < rhs ? rhs : lhs;
}
#endif
};
template <>
struct MaxFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530
return __hlt(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs))
? rhs
: lhs;
#else
return __half2float(*reinterpret_cast<const half*>(&lhs)) <
__half2float(*reinterpret_cast<const half*>(&rhs))
? rhs
: lhs;
#endif
}
#else
inline float16 operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float>(lhs) < convert::To<float>(rhs) ? rhs : lhs;
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -176,40 +147,16 @@ struct MaxFunctor<half2> {
template <typename T>
struct MinFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL T operator()(const T& lhs, const T& rhs) const {
return lhs < rhs ? lhs : rhs;
}
#else
inline T operator()(const T& lhs, const T& rhs) const {
return lhs < rhs ? lhs : rhs;
}
#endif
};
template <>
struct MinFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530
return __hlt(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs))
? lhs
: rhs;
#else
return __half2float(*reinterpret_cast<const half*>(&lhs)) <
__half2float(*reinterpret_cast<const half*>(&rhs))
? lhs
: rhs;
#endif
}
#else
inline float16 operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float>(lhs) < convert::To<float>(rhs) ? lhs : rhs;
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -237,39 +184,17 @@ struct MinFunctor<half2> {
template <typename T>
struct PlusFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs + rhs;
}
#else
inline T operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL T operator()(const T& lhs, const T& rhs) const {
return lhs + rhs;
}
#endif
};
template <>
struct PlusFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530
half ret = __hadd(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs));
#else
half ret = __float2half(
__half2float(*reinterpret_cast<const half*>(&lhs)) +
__half2float(*reinterpret_cast<const half*>(&rhs)));
#endif
return *reinterpret_cast<float16*>(&ret);
}
#else
inline float16 operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float16>(
convert::To<float>(lhs) + convert::To<float>(rhs));
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -300,39 +225,17 @@ struct PlusFunctor<half2> {
template <typename T>
struct MinusFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs - rhs;
}
#else
inline T operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL T operator()(const T& lhs, const T& rhs) const {
return lhs - rhs;
}
#endif
};
template <>
struct MinusFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530
half ret = __hsub(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs));
#else
half ret = __float2half(
__half2float(*reinterpret_cast<const half*>(&lhs)) -
__half2float(*reinterpret_cast<const half*>(&rhs)));
#endif
return *reinterpret_cast<float16*>(&ret);
}
#else
inline float16 operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float16>(
convert::To<float>(lhs) - convert::To<float>(rhs));
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -363,39 +266,17 @@ struct MinusFunctor<half2> {
template <typename T>
struct MultipliesFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs * rhs;
}
#else
inline T operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL T operator()(const T& lhs, const T& rhs) const {
return lhs * rhs;
}
#endif
};
template <>
struct MultipliesFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530
half ret = __hmul(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs));
#else
half ret = __float2half(
__half2float(*reinterpret_cast<const half*>(&lhs)) *
__half2float(*reinterpret_cast<const half*>(&rhs)));
#endif
return *reinterpret_cast<float16*>(&ret);
}
#else
inline float16 operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float16>(
convert::To<float>(lhs) * convert::To<float>(rhs));
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -426,39 +307,17 @@ struct MultipliesFunctor<half2> {
template <typename T>
struct DividesFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL T operator()(const T& lhs, const T& rhs) const {
return lhs / rhs;
}
#else
inline T operator()(const T& lhs, const T& rhs) const {
return lhs / rhs;
}
#endif
};
template <>
struct DividesFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530
half ret = __hdiv(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs));
#else
half ret = __float2half(
__half2float(*reinterpret_cast<const half*>(&lhs)) /
__half2float(*reinterpret_cast<const half*>(&rhs)));
#endif
return *reinterpret_cast<float16*>(&ret);
}
#else
inline float16 operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float16>(
convert::To<float>(lhs) / convert::To<float>(rhs));
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -498,20 +357,10 @@ struct PowFunctor {
template <>
struct PowFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const {
half ret = __float2half(
pow(__half2float(*reinterpret_cast<const half*>(&lhs)),
__half2float(*reinterpret_cast<const half*>(&rhs))));
return *reinterpret_cast<float16*>(&ret);
}
#else
inline float16 operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float16>(
std::pow(convert::To<float>(lhs), convert::To<float>(rhs)));
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -532,34 +381,104 @@ struct PowFunctor<half2> {
};
#endif
/*
* Logical Functors
*/
template <typename T>
struct NotFunctor {
struct Atan2Functor {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& x) const {
return !x;
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return atan2(lhs, rhs);
}
#else
inline bool operator()(const T& x) const {
return !x;
inline T operator()(const T& lhs, const T& rhs) const {
return std::atan2(lhs, rhs);
}
#endif
};
template <>
struct NotFunctor<float16> {
struct Atan2Functor<float16> {
inline float16 operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float16>(
std::atan2(convert::To<float>(lhs), convert::To<float>(rhs)));
}
};
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& x) const {
return !__half2float(*reinterpret_cast<const half*>(&x));
template <>
struct Atan2Functor<half> {
inline __device__ half operator()(const half& lhs, const half& rhs) const {
return __float2half(atan2f(__half2float(lhs), __half2float(rhs)));
}
};
template <>
struct Atan2Functor<half2> {
inline __device__ half2 operator()(const half2& lhs, const half2& rhs) const {
const float2 v1 = __half22float2(lhs);
const float2 v2 = __half22float2(rhs);
return __floats2half2_rn(atan2f(v1.x, v2.x), atan2f(v1.y, v2.y));
}
};
#endif
template <typename T>
struct FMAFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& x, const T& y, const T& z) const {
return fma(x, y, z);
}
#else
inline T operator()(const T& x, const T& y, const T& z) const {
return std::fma(x, y, z);
}
#endif
};
#if defined(__CUDA_ARCH__)
template <>
struct FMAFunctor<half> {
inline __device__ half
operator()(const half& x, const half& y, const half& z) const {
#if __CUDA_ARCH__ >= 530
return __hfma(x, y, z);
#else
return __float2half(
fmaf(__half2float(x), __half2float(y), __half2float(z)));
#endif
}
};
template <>
struct FMAFunctor<half2> {
inline __device__ half2
operator()(const half2& x, const half2& y, const half2& z) const {
#if __CUDA_ARCH__ >= 530
return __hfma2(x, y, z);
#else
const float2 v1 = __half22float2(x);
const float2 v2 = __half22float2(y);
const float2 v3 = __half22float2(z);
return __floats2half2_rn(fmaf(v1.x, v2.x, v3.x), fmaf(v1.y, v2.y, v3.y));
#endif
}
};
#endif
/*
* Logical Functors
*/
template <typename T>
struct NotFunctor {
HOSTDEVICE_DECL bool operator()(const T& x) const {
return !x;
}
};
template <>
struct NotFunctor<float16> {
inline bool operator()(const float16& x) const {
return !convert::To<float>(x);
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -573,30 +492,16 @@ struct NotFunctor<half> {
template <typename T>
struct AndFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs && rhs;
}
#else
inline bool operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL bool operator()(const T& lhs, const T& rhs) const {
return lhs && rhs;
}
#endif
};
template <>
struct AndFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
return __half2float(*reinterpret_cast<const half*>(&lhs)) &&
__half2float(*reinterpret_cast<const half*>(&rhs));
}
#else
inline bool operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float>(lhs) && convert::To<float>(rhs);
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -610,30 +515,16 @@ struct AndFunctor<half> {
template <typename T>
struct OrFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs || rhs;
}
#else
inline bool operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL bool operator()(const T& lhs, const T& rhs) const {
return lhs || rhs;
}
#endif
};
template <>
struct OrFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
return __half2float(*reinterpret_cast<const half*>(&lhs)) ||
__half2float(*reinterpret_cast<const half*>(&rhs));
}
#else
inline bool operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float>(lhs) || convert::To<float>(rhs);
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -647,32 +538,17 @@ struct OrFunctor<half> {
template <typename T>
struct XorFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return convert::To<bool>(lhs) ^ convert::To<bool>(rhs);
}
#else
inline bool operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL bool operator()(const T& lhs, const T& rhs) const {
return convert::To<bool>(lhs) ^ convert::To<bool>(rhs);
}
#endif
};
template <>
struct XorFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
return convert::To<bool>(
__half2float(*reinterpret_cast<const half*>(&lhs))) ^
convert::To<bool>(__half2float(*reinterpret_cast<const half*>(&rhs)));
}
#else
inline bool operator()(const float16& lhs, const float16& rhs) const {
return convert::To<bool>(convert::To<float>(lhs)) ^
convert::To<bool>(convert::To<float>(rhs));
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -691,54 +567,30 @@ struct XorFunctor<half> {
template <typename T>
struct BitNotFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& x) const {
HOSTDEVICE_DECL T operator()(const T& x) const {
return ~x;
}
#else
inline T operator()(const T& x) const {
return ~x;
}
#endif
};
template <typename T>
struct BitAndFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL T operator()(const T& lhs, const T& rhs) const {
return lhs & rhs;
}
#else
inline T operator()(const T& lhs, const T& rhs) const {
return lhs & rhs;
}
#endif
};
template <typename T>
struct BitOrFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs | rhs;
}
#else
inline T operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL T operator()(const T& lhs, const T& rhs) const {
return lhs | rhs;
}
#endif
};
template <typename T>
struct BitXorFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs ^ rhs;
}
#else
inline T operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL T operator()(const T& lhs, const T& rhs) const {
return lhs ^ rhs;
}
#endif
};
/*
......@@ -747,36 +599,16 @@ struct BitXorFunctor {
template <typename T>
struct EqualFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs == rhs;
}
#else
inline bool operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL bool operator()(const T& lhs, const T& rhs) const {
return lhs == rhs;
}
#endif
};
template <>
struct EqualFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
#if __CUDA_ARCH__ >= 530
return __heq(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs));
#else
return __half2float(*reinterpret_cast<const half*>(&lhs)) ==
__half2float(*reinterpret_cast<const half*>(&rhs));
#endif
}
#else
inline bool operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float>(lhs) == convert::To<float>(rhs);
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -794,36 +626,16 @@ struct EqualFunctor<half> {
template <typename T>
struct NotEqualFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL bool operator()(const T& lhs, const T& rhs) const {
return lhs != rhs;
}
#else
inline bool operator()(const T& lhs, const T& rhs) const {
return lhs != rhs;
}
#endif
};
template <>
struct NotEqualFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
#if __CUDA_ARCH__ >= 530
return __hne(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs));
#else
return __half2float(*reinterpret_cast<const half*>(&lhs)) !=
__half2float(*reinterpret_cast<const half*>(&rhs));
#endif
}
#else
inline bool operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float>(lhs) != convert::To<float>(rhs);
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -841,36 +653,16 @@ struct NotEqualFunctor<half> {
template <typename T>
struct GreaterFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs > rhs;
}
#else
inline bool operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL bool operator()(const T& lhs, const T& rhs) const {
return lhs > rhs;
}
#endif
};
template <>
struct GreaterFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
#if __CUDA_ARCH__ >= 530
return __hgt(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs));
#else
return __half2float(*reinterpret_cast<const half*>(&lhs)) >
__half2float(*reinterpret_cast<const half*>(&rhs));
#endif
}
#else
inline bool operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float>(lhs) > convert::To<float>(rhs);
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -888,36 +680,16 @@ struct GreaterFunctor<half> {
template <typename T>
struct LessFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL bool operator()(const T& lhs, const T& rhs) const {
return lhs < rhs;
}
#else
inline bool operator()(const T& lhs, const T& rhs) const {
return lhs < rhs;
}
#endif
};
template <>
struct LessFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
#if __CUDA_ARCH__ >= 530
return __hlt(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs));
#else
return __half2float(*reinterpret_cast<const half*>(&lhs)) <
__half2float(*reinterpret_cast<const half*>(&rhs));
#endif
}
#else
inline bool operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float>(lhs) < convert::To<float>(rhs);
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -935,36 +707,16 @@ struct LessFunctor<half> {
template <typename T>
struct GreaterEqualFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL bool operator()(const T& lhs, const T& rhs) const {
return lhs >= rhs;
}
#else
inline bool operator()(const T& lhs, const T& rhs) const {
return lhs >= rhs;
}
#endif
};
template <>
struct GreaterEqualFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
#if __CUDA_ARCH__ >= 530
return __hge(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs));
#else
return __half2float(*reinterpret_cast<const half*>(&lhs)) >=
__half2float(*reinterpret_cast<const half*>(&rhs));
#endif
}
#else
inline bool operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float>(lhs) >= convert::To<float>(rhs);
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -982,36 +734,16 @@ struct GreaterEqualFunctor<half> {
template <typename T>
struct LessEqualFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs <= rhs;
}
#else
inline bool operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL bool operator()(const T& lhs, const T& rhs) const {
return lhs <= rhs;
}
#endif
};
template <>
struct LessEqualFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
#if __CUDA_ARCH__ >= 530
return __hle(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs));
#else
return __half2float(*reinterpret_cast<const half*>(&lhs)) <
__half2float(*reinterpret_cast<const half*>(&rhs));
#endif
}
#else
inline bool operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float>(lhs) <= convert::To<float>(rhs);
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -1031,4 +763,6 @@ struct LessEqualFunctor<half> {
} // namespace dragon
#undef HOSTDEVICE_DECL
#endif // DRAGON_UTILS_MATH_FUNCTIONAL_H_
......@@ -108,13 +108,11 @@ void _GenericReduce(
} \
if (math::utils::IsRowwiseReduce( \
num_dims, dims, out_dims.data(), &rows, &cols)) { \
_RowwiseReduce##name(rows, cols, scale, x, y); \
return; \
return _RowwiseReduce##name(rows, cols, scale, x, y); \
} \
if (math::utils::IsColwiseReduce( \
num_dims, dims, out_dims.data(), &rows, &cols)) { \
_ColwiseReduce##name(rows, cols, scale, x, y); \
return; \
return _ColwiseReduce##name(rows, cols, scale, x, y); \
} \
vec64_t transpose_axes(num_dims); \
vec64_t transpose_strides(num_dims); \
......
#include "dragon/utils/math/transform.h"
#include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/math/reduce.h"
namespace dragon {
namespace math {
namespace {
template <typename T>
void _AffineChannel(
const int N,
const int C,
const T* x,
const T* scale,
const T* bias,
T* y) {
EigenArrayMap<T> Y(y, C, N);
ConstEigenArrayMap<T> X(x, C, N);
Y = X.colwise() * ConstEigenVectorArrayMap<T>(scale, C);
if (bias != nullptr) {
Y.colwise() += ConstEigenVectorArrayMap<T>(bias, C);
}
}
template <typename T>
void _AffineChannel(
const int N,
const int C,
const int S,
const T* x,
const T* scale,
const T* bias,
T* y) {
const auto CxS = C * S;
for (int i = 0; i < N; ++i) {
EigenArrayMap<T> Y(y + i * CxS, S, C);
ConstEigenArrayMap<T> X(x + i * CxS, S, C);
Y = X.rowwise() * ConstEigenVectorArrayMap<T>(scale, C).transpose();
if (bias != nullptr) {
Y.rowwise() += ConstEigenVectorArrayMap<T>(bias, C).transpose();
}
}
}
template <typename T>
void _AffineImpl(
const int num_dims,
const int64_t* dims,
const int num_axes,
const int64_t* axes,
const T* x,
const T* scale,
const T* bias,
T* y) {
if (num_dims == 2 && num_axes == 1 && axes[0] == 1) {
_AffineChannel(dims[0], dims[1], x, scale, bias, y);
} else if (num_dims == 3 && num_axes == 1 && axes[0] == 1) {
_AffineChannel(dims[0], dims[1], dims[2], x, scale, bias, y);
} else {
LOG(FATAL) << "Unsupported affine dimensions.";
}
}
} // namespace
template <>
void Affine<float16, CPUContext>(
const int num_dims,
const int64_t* dims,
const int num_axes,
const int64_t* axes,
const float16* x,
const float16* scale,
const float16* bias,
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
#define DEFINE_AFFINE_FUNC(T) \
template <> \
void Affine<T, CPUContext>( \
const int num_dims, \
const int64_t* dims, \
const int num_axes, \
const int64_t* axes, \
const T* x, \
const T* scale, \
const T* bias, \
T* y, \
CPUContext* ctx) { \
vec64_t new_dims, new_axes; \
math::utils::CollapseReduceAxes( \
num_dims, dims, num_axes, axes, new_dims, new_axes); \
_AffineImpl( \
new_dims.size(), \
new_dims.data(), \
new_axes.size(), \
new_axes.data(), \
x, \
scale, \
bias, \
y); \
}
DEFINE_AFFINE_FUNC(float);
DEFINE_AFFINE_FUNC(double);
#undef DEFINE_AFFINE_FUNC
} // namespace math
} // namespace dragon
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/math/functional.h"
#include "dragon/utils/math/reduce.h"
#include "dragon/utils/math/transform.h"
#include "dragon/utils/math/types.h"
#include "dragon/utils/math/utils.h"
namespace dragon {
namespace math {
namespace {
template <typename T>
__global__ void _AffineChannel(
const int NxC,
const int C,
const T* x,
const T* scale,
const T* bias,
T* y) {
auto op3 = math::FMAFunctor<T>();
auto op2 = math::MultipliesFunctor<T>();
CUDA_1D_KERNEL_LOOP(i, NxC) {
if (bias != nullptr) {
y[i] = op3(x[i], __ldg(scale + i % C), __ldg(bias + i % C));
} else {
y[i] = op2(x[i], __ldg(scale + i % C));
}
}
}
template <typename T>
__global__ void _AffineChannel(
const int NxCxS,
const int C,
const int S,
const T* x,
const T* scale,
const T* bias,
T* y) {
auto op3 = math::FMAFunctor<T>();
auto op2 = math::MultipliesFunctor<T>();
CUDA_1D_KERNEL_LOOP(i, NxCxS) {
const int j = (i / S) % C;
if (bias != nullptr) {
y[i] = op3(x[i], __ldg(scale + j), __ldg(bias + j));
} else {
y[i] = op2(x[i], __ldg(scale + j));
}
}
}
template <typename T>
void _AffineImpl(
const int num_dims,
const int64_t* dims,
const int num_axes,
const int64_t* axes,
const T* x,
const T* scale,
const T* bias,
T* y,
CUDAContext* ctx) {
const auto N = math::utils::Prod(num_dims, dims);
if (num_dims == 2 && num_axes == 1 && axes[0] == 1) {
_AffineChannel<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, dims[1], x, scale, bias, y);
} else if (num_dims == 3 && num_axes == 1 && axes[0] == 1) {
_AffineChannel<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, dims[1], dims[2], x, scale, bias, y);
} else {
LOG(FATAL) << "Unsupported affine dimensions.";
}
}
} // namespace
#define DEFINE_AFFINE_FUNC(T) \
template <> \
void Affine<T, CUDAContext>( \
const int num_dims, \
const int64_t* dims, \
const int num_axes, \
const int64_t* axes, \
const T* x, \
const T* scale, \
const T* bias, \
T* y, \
CUDAContext* ctx) { \
vec64_t new_dims, new_axes; \
math::utils::CollapseReduceAxes( \
num_dims, dims, num_axes, axes, new_dims, new_axes); \
_AffineImpl( \
new_dims.size(), \
new_dims.data(), \
new_axes.size(), \
new_axes.data(), \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<const math::ScalarType<T>::type*>(scale), \
reinterpret_cast<const math::ScalarType<T>::type*>(bias), \
reinterpret_cast<math::ScalarType<T>::type*>(y), \
ctx); \
}
DEFINE_AFFINE_FUNC(float);
DEFINE_AFFINE_FUNC(float16);
DEFINE_AFFINE_FUNC(double);
#undef DEFINE_AFFINE_FUNC
} // namespace math
} // namespace dragon
#endif // USE_CUDA
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef DRAGON_UTILS_MATH_TRANSFORM_H_
#define DRAGON_UTILS_MATH_TRANSFORM_H_
#include "dragon/core/context.h"
namespace dragon {
namespace math {
template <typename T, class Context>
DRAGON_API void Affine(
const int num_dims,
const int64_t* dims,
const int num_axes,
const int64_t* axes,
const T* x,
const T* scale,
const T* bias,
T* y,
Context* ctx);
} // namespace math
} // namespace dragon
#endif // DRAGON_UTILS_MATH_TRANSFORM_H_
......@@ -141,8 +141,7 @@ void _TransposeImpl(
CUDAContext* ctx) {
auto aligned_size = sizeof(T);
if (axes.back() == D - 1) {
const auto N = math::utils::Prod(D, dims.data());
aligned_size = utils::GetAlignedSize<T, 16>(N, x, y);
aligned_size = utils::GetAlignedSize<T, 16>(dims[D - 1], x, y);
}
SimpleArray<int, D> X_dims, X_strides, Y_dims;
for (int i = 0; i < D; ++i) {
......
......@@ -27,6 +27,7 @@ template <typename T>
class ScalarType {
public:
typedef T type;
typedef T type2;
};
#if defined(__CUDACC__)
......@@ -34,6 +35,7 @@ template <>
class ScalarType<float16> {
public:
typedef half type;
typedef half2 type2;
};
#endif
......
......@@ -16,9 +16,9 @@
#include "dragon/utils/conversions.h"
#if defined(__CUDACC__)
#define MATH_UTILS_DECL inline __host__ __device__
#define HOSTDEVICE_DECL inline __host__ __device__
#else
#define MATH_UTILS_DECL inline
#define HOSTDEVICE_DECL inline
#endif
#define FIXED_DIVISOR_DIV_MOD(d, n, q, r) \
......@@ -41,28 +41,28 @@ namespace utils {
template <
typename T,
typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
MATH_UTILS_DECL T IsInf(const T x) {
HOSTDEVICE_DECL T IsInf(const T x) {
return false;
}
template <
typename T,
typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
MATH_UTILS_DECL T IsNaN(const T x) {
HOSTDEVICE_DECL T IsNaN(const T x) {
return false;
}
template <
typename T,
typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
MATH_UTILS_DECL T IsFinite(const T x) {
HOSTDEVICE_DECL T IsFinite(const T x) {
return true;
}
template <
typename T,
typename std::enable_if<std::is_floating_point<T>::value, int>::type = 0>
MATH_UTILS_DECL bool IsInf(T x) {
HOSTDEVICE_DECL bool IsInf(T x) {
#if defined(__CUDACC__)
return isinf(x);
#else
......@@ -73,7 +73,7 @@ MATH_UTILS_DECL bool IsInf(T x) {
template <
typename T,
typename std::enable_if<std::is_floating_point<T>::value, int>::type = 0>
MATH_UTILS_DECL bool IsNaN(T x) {
HOSTDEVICE_DECL bool IsNaN(T x) {
#if defined(__CUDACC__)
return isnan(x);
#else
......@@ -84,7 +84,7 @@ MATH_UTILS_DECL bool IsNaN(T x) {
template <
typename T,
typename std::enable_if<std::is_floating_point<T>::value, int>::type = 0>
MATH_UTILS_DECL bool IsFinite(T x) {
HOSTDEVICE_DECL bool IsFinite(T x) {
#if defined(__CUDACC__)
return isfinite(x);
#else
......@@ -106,27 +106,27 @@ inline bool IsFinite(float16 x) {
}
template <typename T>
MATH_UTILS_DECL bool IsAGeZeroAndALtB(const T a, const T b) {
HOSTDEVICE_DECL bool IsAGeZeroAndALtB(const T a, const T b) {
return static_cast<unsigned int>(a) < static_cast<unsigned int>(b);
}
template <typename T>
MATH_UTILS_DECL T Sign(const T x) {
HOSTDEVICE_DECL T Sign(const T x) {
return x > T(0) ? T(1) : (x < T(0) ? T(-1) : T(0));
}
template <typename T>
MATH_UTILS_DECL T Identity(const T x) {
HOSTDEVICE_DECL T Identity(const T x) {
return x;
}
template <typename T>
MATH_UTILS_DECL T Square(const T x) {
HOSTDEVICE_DECL T Square(const T x) {
return x * x;
}
template <typename T>
MATH_UTILS_DECL T Cube(const T x) {
HOSTDEVICE_DECL T Cube(const T x) {
return x * x * x;
}
......@@ -247,4 +247,6 @@ void IncreaseIndexInDims(const int num_dims, const DimT* dims, IndexT* index) {
} // namespace dragon
#undef HOSTDEVICE_DECL
#endif // DRAGON_UTILS_MATH_UTILS_H_
......@@ -21,6 +21,7 @@
#include "dragon/utils/math/random.h"
#include "dragon/utils/math/reduce.h"
#include "dragon/utils/math/sort.h"
#include "dragon/utils/math/transform.h"
#include "dragon/utils/math/transpose.h"
#include "dragon/utils/math/types.h"
#include "dragon/utils/math/utils.h"
......
......@@ -284,39 +284,6 @@ void BooleanMaskGrad(
Context* ctx);
template <typename T, class Context>
void ChannelAffine(
const int N,
const int S,
const int C,
const T* x,
const T* scale,
const T* bias,
T* y,
Context* ctx);
template <typename InputT, typename OutputT, class Context>
void ChannelNormalize(
const int axis,
const int num_dims,
const int64_t* x_strides,
const int64_t* y_dims,
const InputT* x,
const float* mean,
const float* std,
OutputT* y,
Context* ctx);
template <typename T, class Context>
void ChannelShuffle(
const int N,
const int S,
const int C,
const int G,
const T* x,
T* y,
Context* ctx);
template <typename T, class Context>
void ConstPad(
const int num_dims,
const int64_t* x_dims,
......@@ -813,6 +780,18 @@ void TopK(
* NormalizationOp Kernels
*/
template <typename InputT, typename OutputT, class Context>
void ChannelNorm(
const int axis,
const int num_dims,
const int64_t* x_strides,
const int64_t* y_dims,
const InputT* x,
const float* mean,
const float* std,
OutputT* y,
Context* ctx);
template <typename T, typename AccT, class Context>
void BatchNormExpectation(
const int N,
......@@ -923,7 +902,7 @@ void GroupNormGrad(
Context* ctx);
template <typename T, class Context>
void L1Normalize(
void L1Norm(
const int N,
const int S,
const int C,
......@@ -934,7 +913,7 @@ void L1Normalize(
Context* ctx);
template <typename T, class Context>
void L1NormalizeGrad(
void L1NormGrad(
const int N,
const int S,
const int C,
......@@ -946,7 +925,7 @@ void L1NormalizeGrad(
Context* ctx);
template <typename T, class Context>
void L2Normalize(
void L2Norm(
const int N,
const int S,
const int C,
......@@ -957,7 +936,7 @@ void L2Normalize(
Context* ctx);
template <typename T, class Context>
void L2NormalizeGrad(
void L2NormGrad(
const int N,
const int S,
const int C,
......@@ -1012,19 +991,23 @@ void LSTMCellGrad(
* TrainingOp Kernels
*/
template <typename T, class Context>
template <typename T, typename CopyT, class Context>
void Adam(
const int N,
const float lr,
const float beta1,
const float beta2,
const float eps,
T* g,
const float wd,
const T* x,
const T* g,
T* m,
T* v,
T* y,
CopyT* y_copy,
Context* ctx);
template <typename T, class Context>
template <typename T, typename CopyT, class Context>
void AdamW(
const int N,
const float lr,
......@@ -1033,39 +1016,53 @@ void AdamW(
const float eps,
const float wd,
const T* x,
T* g,
const T* g,
T* m,
T* v,
T* y,
CopyT* y_copy,
Context* ctx);
template <typename T, class Context>
template <typename T, typename CopyT, class Context>
void MomentumSGD(
const int N,
const float lr,
const float momentum,
T* g,
const float wd,
const T* x,
const T* g,
T* m,
T* y,
CopyT* y_copy,
Context* ctx);
template <typename T, class Context>
template <typename T, typename CopyT, class Context>
void NesterovSGD(
const int N,
const float lr,
const float momentum,
T* g,
const float wd,
const T* x,
const T* g,
T* m,
T* y,
CopyT* y_copy,
Context* ctx);
template <typename T, class Context>
template <typename T, typename CopyT, class Context>
void RMSprop(
const int N,
const float lr,
const float momentum,
const float decay,
const float alpha,
const float eps,
T* g,
const float wd,
const T* x,
const T* g,
T* m,
T* v,
T* y,
CopyT* y_copy,
Context* ctx);
/*
......
......@@ -18,6 +18,7 @@ from dragon.vm.tensorflow.core.ops.math_ops import add
from dragon.vm.tensorflow.core.ops.math_ops import add_n
from dragon.vm.tensorflow.core.ops.math_ops import argmax
from dragon.vm.tensorflow.core.ops.math_ops import argmin
from dragon.vm.tensorflow.core.ops.math_ops import atan2
from dragon.vm.tensorflow.core.ops.math_ops import cast
from dragon.vm.tensorflow.core.ops.math_ops import ceil
from dragon.vm.tensorflow.core.ops.math_ops import cos
......
......@@ -27,9 +27,11 @@ class Adam(optimizer.Optimizer):
The **Adam** update is defined as:
.. math::
\text{Adam}(g) = \text{lr} * \frac{m_{t}}{\sqrt{v_{t}} + \epsilon} \\
\text{Adam}(g) = \text{lr} * (\frac{\text{correction}* m_{t}}
{\sqrt{v_{t}} + \epsilon}) \\
\quad \\ \text{where}\quad
\begin{cases}
\text{correction} = \sqrt{1 - \beta_{2}^{t}} / (1 - \beta_{1}^{t}) \\
m_{t} = \beta_{1} * m_{t-1} + (1 - \beta_{1}) * g \\
v_{t} = \beta_{2} * v_{t-1} + (1 - \beta_{2}) * g^{2}
\end{cases}
......
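For reference, a minimal NumPy sketch of the bias-corrected update written out by the formulas above (the placement of the fused `wd` term is an assumption drawn from the commit summary, not from the kernel source; the function and argument names are illustrative only):
```python
import numpy as np

def adam_step(x, g, m, v, t, lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-8, wd=0.0):
    # Illustrative sketch only; the L2-style weight decay term is an assumption.
    g = g + wd * x                           # fused weight decay (assumed L2 form)
    m = beta1 * m + (1.0 - beta1) * g        # first moment m_t
    v = beta2 * v + (1.0 - beta2) * g ** 2   # second moment v_t
    correction = np.sqrt(1.0 - beta2 ** t) / (1.0 - beta1 ** t)
    x = x - lr * correction * m / (np.sqrt(v) + eps)
    return x, m, v
```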
......@@ -61,8 +61,8 @@ class RMSprop(optimizer.Optimizer):
super(RMSprop, self).__init__(name, **kwargs)
self._set_hyper('lr', learning_rate)
self._set_hyper('momentum', momentum)
self._set_hyper('decay', rho)
self._set_hyper('alpha', rho)
self._set_hyper('eps', epsilon)
self._hyper_aliases['learning_rate'] = 'lr'
self._hyper_aliases['rho'] = 'decay'
self._hyper_aliases['rho'] = 'alpha'
self._hyper_aliases['eps'] = 'epsilon'
......@@ -184,6 +184,35 @@ def argmin(input, axis=None, name=None):
return math_ops.argmin(input, axis=axis, name=name)
def atan2(y, x, name=None):
r"""Compute the element-wise arc-tangent of two arguments.
.. math:: \text{out} = \text{arctan}(\frac{\text{input1}}{\text{input2}})
```python
y = tf.constant(1.)
x = tf.constant(2.)
print(tf.math.atan2(y, x)) # 0.46364761
```
Parameters
----------
y : dragon.Tensor
The input1 tensor.
x : dragon.Tensor
The input2 tensor.
name : str, optional
The operation name.
Returns
-------
dragon.Tensor
The output tensor.
"""
return math_ops.atan2([y, x], name=name)
def cast(x, dtype, name=None):
"""Cast the data type of input.
......
......@@ -129,13 +129,8 @@ def l2_normalize(x, axis=None, epsilon=1e-12, name=None):
The output tensor.
"""
return normalization_ops.lp_normalize(
x,
p=2,
axis=axis,
epsilon=epsilon,
name=name,
)
return normalization_ops.lp_norm(
x, p=2, axis=axis, epsilon=epsilon, name=name)
def moments(x, axes=None, keepdims=False, name=None):
......
......@@ -501,8 +501,8 @@ class TestOpSpecWithTensorDesc(unittest.TestCase):
self.assertEqual(dragon.broadcast_to(
self.sym2, shape=self.shape1).shape, (None,) * len(self.sym2.shape))
def test_channel_normalize(self):
func = functools.partial(dragon.channel_normalize,
def test_channel_norm(self):
func = functools.partial(dragon.nn.channel_norm,
mean=(1., 1., 1.), std=(1., 1., 1.))
with dragon.graph_mode():
self.assertEqual(func(self.sym1).shape, None)
......
......@@ -31,6 +31,9 @@ class TestCUDA(unittest.TestCase):
stream.synchronize()
dragon.cuda.synchronize()
def test_cublas(self):
dragon.cuda.set_cublas_flags()
def test_cudnn(self):
dragon.cuda.set_cudnn_flags()
......
......@@ -572,51 +572,6 @@ class TestArrayOps(OpTestCase):
with dragon.device('cuda'):
self.test_cast()
def test_channel_affine(self):
for execution in ('EAGER_MODE', 'GRAPH_MODE'):
with execution_context().mode(execution):
data1 = arange((2, 3, 4, 5))
data2, data3 = arange((3, 4)), arange((3, 4))
data4 = arange(data1.shape)
grad1 = data4 * np.expand_dims(data2, -1)
grad2 = np.sum(data4 * data1, (0, 3))
grad3 = np.sum(data4, (0, 3))
x, w, b = new_tensor(data1), new_tensor(data2), new_tensor(data3)
with dragon.GradientTape() as tape:
tape.watch([x, w, b])
y = dragon.channel_affine([x, w, b], axis=1, end_axis=2)
dy = new_tensor(data4)
dx, dw, db = tape.gradient(y, [x, w, b], output_gradients=[dy])
self.assertEqual(
[y, dx, dw, db],
[data1 * np.expand_dims(data2, -1) +
np.expand_dims(data3, -1),
grad1, grad2, grad3])
@unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
def test_channel_affine_cuda(self):
with dragon.device('cuda'):
self.test_channel_affine()
def test_channel_normalize(self):
entries = [((2, 3, 4), [(1., 2., 3.), (3., 2., 1.), 1], {'perm': (0, 1, 2)}),
((2, 3, 4), [(1., 2., 3.), (3., 2., 1.), 2], {'perm': (0, 2, 1)})]
for execution in ('EAGER_MODE', 'GRAPH_MODE'):
with execution_context().mode(execution):
for shape, args, kwargs in entries:
perm = kwargs['perm']
data = np.ones(shape, dtype='uint8').transpose(perm)
mean = np.array(args[0]).reshape((1, 3, 1)).transpose(perm)
std = np.array(args[1]).reshape((1, 3, 1)).transpose(perm)
x = dragon.ones(shape, dtype='uint8')
y = dragon.channel_normalize(x, *args, **kwargs)
self.assertEqual(y, (data - mean) / std)
@unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
def test_channel_normalize_cuda(self):
with dragon.device('cuda'):
self.test_channel_normalize()
def test_channel_shuffle(self):
entries = [(0, 2), (1, 4)]
for execution in ('EAGER_MODE', 'GRAPH_MODE'):
......@@ -630,7 +585,7 @@ class TestArrayOps(OpTestCase):
x, dy = new_tensor(data), new_tensor(data)
with dragon.GradientTape() as tape:
tape.watch(x)
y = dragon.channel_shuffle(x, axis, group)
y = dragon.nn.channel_shuffle(x, axis, group)
dx = tape.gradient(y, [x], output_gradients=[dy])[0]
self.assertEqual(
[y, dx], [data.reshape(shape1).transpose(perm).reshape(data.shape),
......@@ -1676,6 +1631,32 @@ class TestMathOps(OpTestCase):
with dragon.device('cuda'):
self.test_add()
def test_affine(self):
for execution in ('EAGER_MODE', 'GRAPH_MODE'):
with execution_context().mode(execution):
data1 = arange((2, 3, 4, 5))
data2, data3 = arange((3, 4)), arange((3, 4))
data4 = arange(data1.shape)
grad1 = data4 * np.expand_dims(data2, -1)
grad2 = np.sum(data4 * data1, (0, 3))
grad3 = np.sum(data4, (0, 3))
x, w, b = new_tensor(data1), new_tensor(data2), new_tensor(data3)
with dragon.GradientTape() as tape:
tape.watch([x, w, b])
y = dragon.math.affine([x, w, b], axis=(1, 2))
dy = new_tensor(data4)
dx, dw, db = tape.gradient(y, [x, w, b], output_gradients=[dy])
self.assertEqual(
[y, dx, dw, db],
[data1 * np.expand_dims(data2, -1) +
np.expand_dims(data3, -1),
grad1, grad2, grad3])
@unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
def test_affine_cuda(self):
with dragon.device('cuda'):
self.test_affine()
def test_argmax(self):
entries = [(0, True), (0, False), (1, True), (1, False)]
for execution in ('EAGER_MODE', 'GRAPH_MODE'):
......@@ -1712,6 +1693,20 @@ class TestMathOps(OpTestCase):
with dragon.device('cuda'):
self.test_argmin()
def test_atan2(self):
for execution in ('EAGER_MODE', 'GRAPH_MODE'):
with execution_context().mode(execution):
for a_shape, b_shape in self.binary_test_shapes:
data1, data2 = arange(a_shape), arange(b_shape, 1)
a, b = new_tensor(data1), new_tensor(data2)
y = dragon.math.atan2([a, b])
self.assertEqual(y, np.arctan2(data1, data2))
@unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
def test_atan2_cuda(self):
with dragon.device('cuda'):
self.test_atan2()
def test_bitwise_and(self):
for execution in ('EAGER_MODE', 'GRAPH_MODE'):
with execution_context().mode(execution):
......@@ -2738,6 +2733,25 @@ class TestNormalizationOps(OpTestCase):
with dragon.device('cuda'), self.cudnn_ws.as_default():
self.test_batch_norm()
def test_channel_norm(self):
entries = [((2, 3, 4), [(1., 2., 3.), (3., 2., 1.), 1], {'perm': (0, 1, 2)}),
((2, 3, 4), [(1., 2., 3.), (3., 2., 1.), 2], {'perm': (0, 2, 1)})]
for execution in ('EAGER_MODE', 'GRAPH_MODE'):
with execution_context().mode(execution):
for shape, args, kwargs in entries:
perm = kwargs['perm']
data = np.ones(shape, dtype='uint8').transpose(perm)
mean = np.array(args[0]).reshape((1, 3, 1)).transpose(perm)
std = np.array(args[1]).reshape((1, 3, 1)).transpose(perm)
x = dragon.ones(shape, dtype='uint8')
y = dragon.nn.channel_norm(x, *args, **kwargs)
self.assertEqual(y, (data - mean) / std)
@unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
def test_channel_norm_cuda(self):
with dragon.device('cuda'):
self.test_channel_norm()
def test_group_norm(self):
eps = 1e-5
entries = [((1, 4), (4,), -1, 2, (2,)),
......@@ -2904,7 +2918,7 @@ class TestNormalizationOps(OpTestCase):
with dragon.device('cuda'), self.cudnn_ws.as_default():
self.test_local_response_norm(test_cudnn=True, prec=1e-2)
def test_lp_normalize(self):
def test_lp_norm(self):
entries = [(0, 1, 1e-12, 'sum'),
(0, 1, 1e-12, 'mean'),
(0, 2, 1e-12, 'sum'),
......@@ -2921,7 +2935,7 @@ class TestNormalizationOps(OpTestCase):
x, dy = new_tensor(data1), new_tensor(data2)
with dragon.GradientTape() as tape:
tape.watch(x)
y = dragon.math.lp_normalize(
y = dragon.nn.lp_norm(
x, axis, p=p, epsilon=eps, reduction=reduction)
dx = tape.gradient(y, [x], output_gradients=[dy])[0]
norm = np.abs(data1) if p == 1 else np.square(data1)
......@@ -2930,9 +2944,9 @@ class TestNormalizationOps(OpTestCase):
self.assertEqual([y, dx], [data1 / max(norm, eps), grad])
@unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
def test_lp_normalize_cuda(self):
def test_lp_norm_cuda(self):
with dragon.device('cuda'):
self.test_lp_normalize()
self.test_lp_norm()
class TestRNNOps(OpTestCase):
......@@ -3028,7 +3042,7 @@ class TestTrainingOps(OpTestCase):
def test_rmsprop_update(self):
with execution_context().mode('EAGER_MODE'):
momentum, lr = self.rmsprop.momentum, self.rmsprop.lr
decay, eps = self.rmsprop.decay, self.rmsprop.eps
alpha, eps = self.rmsprop.alpha, self.rmsprop.eps
data1 = uniform((2, 3))
data2, data3 = np.zeros((2, 3), 'float32'), np.zeros((2, 3), 'float32')
param = new_tensor(data1)
......@@ -3036,7 +3050,7 @@ class TestTrainingOps(OpTestCase):
data4 = uniform((2, 3))
grad = new_tensor(data4)
self.rmsprop.apply_gradients([[grad, param]])
data3 = decay * data3 + (1 - decay) * np.square(data4)
data3 = alpha * data3 + (1 - alpha) * np.square(data4)
data2 = momentum * data2 + (data4 / (np.sqrt(data3) + eps))
data1 -= lr * data2
self.assertEqual(param, data1)
......
......@@ -20,6 +20,17 @@ from dragon.core.testing.unittest.common_utils import run_tests
from dragon.vm import torch
class TestCUDA(unittest.TestCase):
"""Test the CUDA backend."""
def test_library(self):
_ = torch.backends.cuda.is_built()
def test_set_flags(self):
torch.backends.cuda.matmul.allow_tf32 = False
self.assertEqual(torch.backends.cuda.matmul.allow_tf32, False)
class TestCuDNN(unittest.TestCase):
"""Test the CuDNN backend."""
......
......@@ -169,7 +169,7 @@ class TestModule(unittest.TestCase):
class TestModules(OpTestCase):
"""Test the nn module class."""
def test_affine_channel(self):
def test_affine(self):
data1 = arange((2, 3, 4, 5))
data2, data3 = arange((1, 3, 1, 1)), arange((1, 3, 1, 1))
w, b = new_tensor(data2.flatten()), new_tensor(data3.flatten())
......@@ -181,21 +181,19 @@ class TestModules(OpTestCase):
for bias, fix_weight, fix_bias in entries:
x = new_tensor(data1)
try:
m = torch.nn.AffineChannel(
m = torch.nn.Affine(
num_features=3,
bias=bias,
fix_weight=fix_weight,
fix_bias=fix_bias,
inplace=True,
)
inplace=True)
except ValueError:
m = torch.nn.AffineChannel(
m = torch.nn.Affine(
num_features=3,
bias=bias,
fix_weight=fix_weight,
fix_bias=fix_bias,
inplace=False,
)
inplace=False)
m.weight.copy_(w)
result = data1 * data2
if bias:
......@@ -262,6 +260,18 @@ class TestModules(OpTestCase):
y, _ = m(x), repr(m)
self.assertEqual(y, result)
def test_channel_norm(self):
entries = [((2, 3, 4), [(1., 2., 3.), (3., 2., 1.), 1], {'dims': (0, 1, 2)}),
((2, 3, 4), [(1., 2., 3.), (3., 2., 1.), 2], {'dims': (0, 2, 1)})]
for shape, args, kwargs in entries:
perm = kwargs['dims']
data = np.ones(shape, dtype='uint8').transpose(perm)
mean = np.array(args[0]).reshape((1, 3, 1)).transpose(perm)
std = np.array(args[1]).reshape((1, 3, 1)).transpose(perm)
x = torch.ones(shape, dtype='uint8')
y = torch.nn.functional.channel_norm(x, *args, **kwargs)
self.assertEqual(y, (data - mean) / std)
def test_channel_shuffle(self):
entries = [(1, 4)]
for axis, group in entries:
......
......@@ -127,6 +127,12 @@ class TestTensorOps(OpTestCase):
result = np.expand_dims(result, axis)
self.assertEqual(x.argmin(axis, keepdims), result)
def test_atan2(self):
for a_shape, b_shape in self.binary_test_shapes:
data1, data2 = arange(a_shape), arange(b_shape, 1)
a, b = new_tensor(data1, False), new_tensor(data2, False)
self.assertEqual(a.atan2(b), np.arctan2(data1, data2))
def test_baddbmm(self):
entries = [((2, 2, 3), (2, 3, 4), (2, 2, 4))]
for a_shape, b_shape, c_shape in entries:
......@@ -944,18 +950,6 @@ class TestTorchOps(OpTestCase):
y = torch.cat([x, x], dim=axis)
self.assertEqual(y, np.concatenate([data, data], axis=axis))
def test_channel_normalize(self):
entries = [((2, 3, 4), [(1., 2., 3.), (3., 2., 1.), 1], {'dims': (0, 1, 2)}),
((2, 3, 4), [(1., 2., 3.), (3., 2., 1.), 2], {'dims': (0, 2, 1)})]
for shape, args, kwargs in entries:
perm = kwargs['dims']
data = np.ones(shape, dtype='uint8').transpose(perm)
mean = np.array(args[0]).reshape((1, 3, 1)).transpose(perm)
std = np.array(args[1]).reshape((1, 3, 1)).transpose(perm)
x = torch.ones(shape, dtype='uint8')
y = torch.channel_normalize(x, *args, **kwargs)
self.assertEqual(y, (data - mean) / std)
def test_linspace(self):
entries = [([[0., 5.], [10., 40.], 5], {'dim': 0, 'dtype': 'float32'}),
([[0., 5.], [10., 40.], 5], {'dim': 1, 'dtype': 'float32'}),
......
......@@ -49,8 +49,6 @@ from dragon.vm.torch.core.tensor import Tensor
from dragon.vm.torch.core.ops import tensor_ops as _
from dragon.vm.torch.core.ops.array_ops import broadcast_to
from dragon.vm.torch.core.ops.array_ops import cat
from dragon.vm.torch.core.ops.array_ops import channel_affine
from dragon.vm.torch.core.ops.array_ops import channel_normalize
from dragon.vm.torch.core.ops.array_ops import chunk
from dragon.vm.torch.core.ops.array_ops import flatten
from dragon.vm.torch.core.ops.array_ops import flip
......@@ -71,7 +69,6 @@ from dragon.vm.torch.core.ops.array_ops import scatter_add
from dragon.vm.torch.core.ops.array_ops import split
from dragon.vm.torch.core.ops.array_ops import squeeze
from dragon.vm.torch.core.ops.array_ops import stack
from dragon.vm.torch.core.ops.math_ops import sum
from dragon.vm.torch.core.ops.array_ops import tile
from dragon.vm.torch.core.ops.array_ops import transpose
from dragon.vm.torch.core.ops.array_ops import tril
......@@ -97,6 +94,7 @@ from dragon.vm.torch.core.ops.math_ops import add
from dragon.vm.torch.core.ops.math_ops import addmm
from dragon.vm.torch.core.ops.math_ops import argmax
from dragon.vm.torch.core.ops.math_ops import argmin
from dragon.vm.torch.core.ops.math_ops import atan2
from dragon.vm.torch.core.ops.math_ops import baddbmm
from dragon.vm.torch.core.ops.math_ops import bitwise_and
from dragon.vm.torch.core.ops.math_ops import bitwise_not
......@@ -144,6 +142,7 @@ from dragon.vm.torch.core.ops.math_ops import sin
from dragon.vm.torch.core.ops.math_ops import sqrt
from dragon.vm.torch.core.ops.math_ops import square
from dragon.vm.torch.core.ops.math_ops import sub
from dragon.vm.torch.core.ops.math_ops import sum
from dragon.vm.torch.core.ops.random_ops import normal
from dragon.vm.torch.core.ops.random_ops import rand
from dragon.vm.torch.core.ops.random_ops import randn
......
......@@ -15,6 +15,7 @@ from __future__ import division as _division
from __future__ import print_function as _print_function
# Modules
from dragon.vm.torch.core.backends import cuda
from dragon.vm.torch.core.backends import cudnn
__all__ = [_s for _s in dir() if not _s.startswith('_')]
......@@ -56,6 +56,7 @@ from dragon.vm.torch.core.nn.modules.dropout import Dropout
from dragon.vm.torch.core.nn.modules.dropout import DropPath
from dragon.vm.torch.core.nn.modules.flatten import Flatten
from dragon.vm.torch.core.nn.modules.fold import Unfold
from dragon.vm.torch.core.nn.modules.linear import Affine
from dragon.vm.torch.core.nn.modules.linear import Identity
from dragon.vm.torch.core.nn.modules.linear import Linear
from dragon.vm.torch.core.nn.modules.loss import CTCLoss
......@@ -68,7 +69,6 @@ from dragon.vm.torch.core.nn.modules.loss import NLLLoss
from dragon.vm.torch.core.nn.modules.loss import SigmoidFocalLoss
from dragon.vm.torch.core.nn.modules.loss import SmoothL1Loss
from dragon.vm.torch.core.nn.modules.module import Module
from dragon.vm.torch.core.nn.modules.normalization import AffineChannel
from dragon.vm.torch.core.nn.modules.normalization import GroupNorm
from dragon.vm.torch.core.nn.modules.normalization import LayerNorm
from dragon.vm.torch.core.nn.modules.normalization import LocalResponseNorm
......
......@@ -20,11 +20,13 @@ from dragon.vm.torch.core.nn.functional import adaptive_avg_pool3d
from dragon.vm.torch.core.nn.functional import adaptive_max_pool1d
from dragon.vm.torch.core.nn.functional import adaptive_max_pool2d
from dragon.vm.torch.core.nn.functional import adaptive_max_pool3d
from dragon.vm.torch.core.nn.functional import affine
from dragon.vm.torch.core.nn.functional import avg_pool1d
from dragon.vm.torch.core.nn.functional import avg_pool2d
from dragon.vm.torch.core.nn.functional import avg_pool3d
from dragon.vm.torch.core.nn.functional import batch_norm
from dragon.vm.torch.core.nn.functional import binary_cross_entropy_with_logits
from dragon.vm.torch.core.nn.functional import channel_norm
from dragon.vm.torch.core.nn.functional import channel_shuffle
from dragon.vm.torch.core.nn.functional import conv1d
from dragon.vm.torch.core.nn.functional import conv2d
......
......@@ -173,6 +173,7 @@ class Function(object):
outputs_id.append(outputs[i].id)
else:
if isinstance(spec, Tensor):
spec._device = device.copy()
outputs.append(spec)
outputs_id.append(spec.id)
else:
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""CUDA backend."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.core.device import cuda
from dragon.core.framework import sysconfig
class CuBLASModule(object):
"""CuBLAS module class."""
def __init__(self):
self._allow_tf32 = False
@property
def allow_tf32(self):
"""The flag that allows cuBLAS TF32 math type or not."""
return self._allow_tf32
@allow_tf32.setter
def allow_tf32(self, value):
self._allow_tf32 = value
cuda.set_cublas_flags(allow_tf32=value)
def is_built():
"""Return a bool reporting if built with CUDA support.
Returns
-------
bool
``True`` if built, otherwise ``False``.
"""
version = sysconfig.get_build_info().get('cuda_version', None)
return version is not None
# Module instances.
matmul = CuBLASModule()
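A minimal usage sketch of the new flag, assuming the public `torch.backends.cuda` shim re-exports this module as the backends `__init__` above suggests:

```python
from dragon.vm.torch.backends import cuda  # path assumed from the __init__ above

print(cuda.is_built())         # True on a CUDA build
cuda.matmul.allow_tf32 = True  # forwards to set_cublas_flags(allow_tf32=True)
print(cuda.matmul.allow_tf32)  # True
```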
......@@ -37,7 +37,7 @@ class CuDNNModule(object):
@allow_tf32.setter
def allow_tf32(self, value):
self._allow_tf32 = value
self._set_flags()
cuda.set_cudnn_flags(allow_tf32=value)
@property
def benchmark(self):
......@@ -47,7 +47,7 @@ class CuDNNModule(object):
@benchmark.setter
def benchmark(self, value):
self._benchmark = value
self._set_flags()
cuda.set_cudnn_flags(benchmark=value)
@property
def deterministic(self):
......@@ -57,7 +57,7 @@ class CuDNNModule(object):
@deterministic.setter
def deterministic(self, value):
self._deterministic = value
self._set_flags()
cuda.set_cudnn_flags(deterministic=value)
@property
def enabled(self):
......@@ -67,7 +67,7 @@ class CuDNNModule(object):
@enabled.setter
def enabled(self, value):
self._enabled = value
self._set_flags()
cuda.set_cudnn_flags(enabled=value)
@staticmethod
def is_available():
......@@ -97,15 +97,6 @@ class CuDNNModule(object):
version = major * 1000 + minor * 100 + patch
return version
def _set_flags(self):
"""Set all flags with current value."""
cuda.set_cudnn_flags(
enabled=self._enabled,
benchmark=self._benchmark,
deterministic=self._deterministic,
allow_tf32=self._allow_tf32,
)
# Module instances.
sys.modules[__name__] = CuDNNModule()
......
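A small sketch of the behavior above: each setter now forwards only its own value to `cuda.set_cudnn_flags` instead of re-sending all four flags. The `torch.backends.cudnn` import path is assumed from the backends `__init__` shown earlier:

```python
from dragon.vm.torch.backends import cudnn  # path assumed from the __init__ above

cudnn.benchmark = True        # -> set_cudnn_flags(benchmark=True)
cudnn.deterministic = False   # -> set_cudnn_flags(deterministic=False)
cudnn.allow_tf32 = True       # -> set_cudnn_flags(allow_tf32=True)
```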
......@@ -170,8 +170,42 @@ def adaptive_max_pool3d(input, output_size):
return _pool('MAX', utils._triple, input, **args)
def affine(input, weight, bias=None, dim=-1, out=None):
r"""Apply affine transformation to input.
.. math:: \text{out} = \text{input} \times \text{weight} + \text{bias}
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
weight : dragon.vm.torch.Tensor
The weight tensor.
bias : dragon.vm.torch.Tensor, optional
The bias tensor.
dim : Union[int, Sequence[int]], optional
The dimension(s) to transform.
out : dragon.vm.torch.Tensor, optional
The output tensor.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.nn.Affine(...)`_
"""
return Function.apply(
'Affine', input.device,
[input, weight] + ([bias] if bias else []), outputs=[out],
axes=nest.flatten(dim))
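A hypothetical usage sketch of the functional `affine` above, scaling and shifting the channel dimension of an NCHW tensor; the `dragon.vm.torch.nn.functional` import path and the tensor helpers are assumed to be available as in the rest of this package:

```python
from dragon.vm import torch
from dragon.vm.torch.nn import functional as F

x = torch.ones(2, 5, 4, 4)    # (N, C, H, W)
w = torch.ones(5) * 2.0       # per-channel scale
b = torch.zeros(5)            # per-channel shift
y = F.affine(x, w, b, dim=1)  # every element becomes 1 * 2 + 0 = 2
```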
def avg_pool1d(input, kernel_size, stride=1, padding=0, ceil_mode=False):
r"""Apply the 1d average pooling to input.
"""Apply the 1d average pooling to input.
Parameters
----------
......@@ -200,7 +234,7 @@ def avg_pool1d(input, kernel_size, stride=1, padding=0, ceil_mode=False):
def avg_pool2d(input, kernel_size, stride=1, padding=0, ceil_mode=False):
r"""Apply the 2d average pooling to input.
"""Apply the 2d average pooling to input.
Parameters
----------
......@@ -229,7 +263,7 @@ def avg_pool2d(input, kernel_size, stride=1, padding=0, ceil_mode=False):
def avg_pool3d(input, kernel_size, stride=1, padding=0, ceil_mode=False):
r"""Apply the 3d average pooling to input.
"""Apply the 3d average pooling to input.
Parameters
----------
......@@ -267,7 +301,7 @@ def batch_norm(
momentum=0.1,
eps=1e-5,
):
r"""Apply the batch normalization to input.
"""Apply the batch normalization to input.
`[Ioffe & Szegedy, 2015] <https://arxiv.org/abs/1502.03167>`_.
Parameters
......@@ -315,7 +349,7 @@ def binary_cross_entropy_with_logits(
reduction='mean',
pos_weight=None,
):
r"""Compute the sigmoid cross entropy with contiguous target.
"""Compute the sigmoid cross entropy with contiguous target.
Parameters
----------
......@@ -353,6 +387,55 @@ def binary_cross_entropy_with_logits(
[input, target], reduction=reduction.upper())
def channel_norm(input, mean, std, dim=-1, dtype='float32', dims=None):
"""Apply the normalization to each channel of input.
:attr:`dim` can be negative:
```python
m = s = (1., 1., 1.)
x = torch.tensor([1, 2, 3])
print(nn.functional.channel_norm(x, m, s, dim=0)) # [0., 1., 2.]
print(nn.functional.channel_norm(x, m, s, dim=-1)) # Equivalent
```
If :attr:`dims` is provided, :attr:`dim` is selected from the output layout:
```python
m, s = (1., 2., 3.), (1., 1., 1.)
x = torch.tensor([[1, 2, 3]])
# 3 values are provided, but the normalized dimension of the
# output layout has length 1, so only the first value is taken
print(nn.functional.channel_norm(x, m, s, dims=(1, 0))) # [[0.], [1.], [2.]]
```
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
mean : Sequence[float], required
The mean to subtract.
std : Sequence[float], required
The standard deviation to divide.
dim : int, optional, default=-1
The channel dimension.
dtype : str, optional, default='float32'
The output data type.
dims : Sequence[int], optional
The order of output dimensions.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
return Function.apply(
'ChannelNorm', input.device, [input],
axis=dim, mean=mean, std=std, dtype=dtype,
ndim=len(dims) if dims is not None else 0, perm=dims)
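An illustrative preprocessing call for the `channel_norm` above: subtract a per-channel mean, divide by a per-channel std, and permute HWC to CHW in one op. The mean/std numbers are placeholders, not values taken from this repository:

```python
from dragon.vm import torch
from dragon.vm.torch.nn import functional as F

img = torch.ones(224, 224, 3)  # HWC input
out = F.channel_norm(
    img,
    mean=(103.53, 116.28, 123.675),
    std=(57.375, 57.12, 58.395),
    dim=0,                     # channel dimension in the *output* layout
    dims=(2, 0, 1),            # output layout: CHW -> shape (3, 224, 224)
)
```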
def channel_shuffle(input, groups):
"""Apply group shuffle to each channel of input.
`[Zhang et.al, 2017] <https://arxiv.org/abs/1707.01083>`_.
......@@ -387,7 +470,7 @@ def conv1d(
dilation=1,
groups=1,
):
r"""Apply the 1d convolution to input.
"""Apply the 1d convolution to input.
Parameters
----------
......@@ -428,7 +511,7 @@ def conv2d(
dilation=1,
groups=1,
):
r"""Apply the 2d convolution to input.
"""Apply the 2d convolution to input.
Parameters
----------
......@@ -469,7 +552,7 @@ def conv3d(
dilation=1,
groups=1,
):
r"""Apply the 3d convolution to input.
"""Apply the 3d convolution to input.
Parameters
----------
......@@ -511,7 +594,7 @@ def conv_transpose1d(
groups=1,
dilation=1,
):
r"""Apply the 1d deconvolution to input.
"""Apply the 1d deconvolution to input.
Parameters
----------
......@@ -555,7 +638,7 @@ def conv_transpose2d(
groups=1,
dilation=1,
):
r"""Apply the 2d deconvolution to input.
"""Apply the 2d deconvolution to input.
Parameters
----------
......@@ -599,7 +682,7 @@ def conv_transpose3d(
groups=1,
dilation=1,
):
r"""Apply the 3d deconvolution to input.
"""Apply the 3d deconvolution to input.
Parameters
----------
......@@ -747,7 +830,7 @@ def depthwise_conv2d(
padding=0,
dilation=1,
):
r"""Apply the 2d depthwise convolution to input.
"""Apply the 2d depthwise convolution to input.
Parameters
----------
......@@ -778,7 +861,7 @@ def depthwise_conv2d(
def dropout(input, p=0.5, training=True, inplace=False):
r"""Set the elements of the input to zero randomly.
"""Set the elements of the input to zero randomly.
`[Srivastava et.al, 2014] <http://jmlr.org/papers/v15/srivastava14a.html>`_.
Parameters
......@@ -810,7 +893,7 @@ def dropout(input, p=0.5, training=True, inplace=False):
def drop_block2d(input, p=0.5, block_size=1, training=True, inplace=False):
r"""Set the blocks over input to zero randomly.
"""Set the blocks over input to zero randomly.
Parameters
----------
......@@ -994,6 +1077,15 @@ def group_norm(input, num_groups, weight, bias, eps=1e-5):
def hardsigmoid(input, inplace=False):
r"""Apply the hard sigmoid function to input.
The **HardSigmoid** function is defined as:
.. math::
\text{Hardsigmoid}(x) = \begin{cases}
0 & \text{if~} x \le -3, \\
1 & \text{if~} x \ge +3, \\
x / 6 + 1 / 2 & \text{otherwise}
\end{cases}
Parameters
----------
input : dragon.vm.torch.Tensor
......@@ -1020,6 +1112,15 @@ def hardswish(input):
r"""Apply the hard swish function to input.
`[Howard et.al, 2019] <https://arxiv.org/abs/1905.02244>`_.
The **HardSwish** function is defined as:
.. math::
\text{HardSwish}(x) = \begin{cases}
0 & \text{if~} x \le -3, \\
x & \text{if~} x \ge +3, \\
x \cdot (x + 3) / 6 & \text{otherwise}
\end{cases}
Parameters
----------
input : dragon.vm.torch.Tensor
......@@ -1161,7 +1262,7 @@ def kl_div(
def l1_loss(input, target, size_average=None, reduce=None, reduction='mean'):
r"""Compute the element-wise absolute value difference.
"""Compute the element-wise absolute value difference.
Parameters
----------
......@@ -1196,7 +1297,7 @@ def l1_loss(input, target, size_average=None, reduce=None, reduction='mean'):
def layer_norm(input, normalized_shape, weight, bias, eps=1e-5):
r"""Apply the layer normalization to input.
"""Apply the layer normalization to input.
`[Ba et.al, 2016] <https://arxiv.org/abs/1607.06450>`_
Parameters
......@@ -1387,7 +1488,7 @@ def lstm_cell(input, cx):
def max_pool1d(input, kernel_size, stride=1, padding=0, ceil_mode=False):
r"""Apply the 1d max pooling to input.
"""Apply the 1d max pooling to input.
Parameters
----------
......@@ -1416,7 +1517,7 @@ def max_pool1d(input, kernel_size, stride=1, padding=0, ceil_mode=False):
def max_pool2d(input, kernel_size, stride=1, padding=0, ceil_mode=False):
r"""Apply the 2d max pooling to input.
"""Apply the 2d max pooling to input.
Parameters
----------
......@@ -1445,7 +1546,7 @@ def max_pool2d(input, kernel_size, stride=1, padding=0, ceil_mode=False):
def max_pool3d(input, kernel_size, stride=1, padding=0, ceil_mode=False):
r"""Apply the 3d max pooling to input.
"""Apply the 3d max pooling to input.
Parameters
----------
......@@ -1474,11 +1575,7 @@ def max_pool3d(input, kernel_size, stride=1, padding=0, ceil_mode=False):
def mse_loss(input, target, size_average=None, reduce=None, reduction='mean'):
r"""Compute the element-wise squared error.
The ``MSELoss`` function is defined as:
.. math:: \text{MSELoss}(x, y) = (x - y)^{2}
"""Compute the element-wise squared error.
Parameters
----------
......@@ -1726,7 +1823,7 @@ def normalize(input, p=2, dim=1, end_dim=None, eps=1e-12, out=None):
"""
return Function.apply(
'LpNormalize', input.device, [input], outputs=[out],
'LpNorm', input.device, [input], outputs=[out],
p=p, axis=dim, end_axis=end_dim, epsilon=eps, reduction='SUM')
......
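Only the dispatched operator name changes here (`LpNormalize` becomes `LpNorm`); the Python-level behavior of `normalize` is unchanged. A small sanity-check sketch, assuming the public `torch.nn.functional.normalize` shim:

```python
from dragon.vm import torch
from dragon.vm.torch.nn import functional as F

x = torch.tensor([[3., 4.]])
y = F.normalize(x, p=2, dim=1)  # L2-normalize each row: [[0.6, 0.8]]
```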
......@@ -19,9 +19,97 @@ import math
from dragon.vm.torch.core.nn import functional as F
from dragon.vm.torch.core.nn.modules.module import Module
from dragon.vm.torch.core.nn.parameter import Parameter
from dragon.vm.torch.core.ops import constant_ops
from dragon.vm.torch.core.tensor import Tensor
class Affine(Module):
"""Apply affine transformation.
Affine is often used as a post-processing step for normalization.
Examples:
```python
m = torch.nn.Affine(5)
# Apply a 2d transformation
x2d = torch.ones(3, 5)
y2d = m(x2d)
# Apply a 3d transformation
x3d = torch.ones(3, 5, 4)
y3d = m(x3d)
# Apply a 4d transformation
x4d = torch.ones(3, 5, 2, 2)
y4d = m(x4d)
```
See Also
--------
`torch.nn.functional.affine(...)`_
"""
def __init__(
self,
num_features,
bias=True,
fix_weight=False,
fix_bias=False,
inplace=False,
):
"""Create an ``AffineChannel`` module.
Parameters
----------
num_features : int
The number of channels.
bias : bool, optional, default=True
``True`` to attach a bias.
fix_weight : bool, optional, default=False
``True`` to freeze the ``weight``.
fix_bias : bool, optional, default=False
``True`` to freeze the ``bias``.
inplace : bool, optional, default=False
Whether to do the operation in-place.
"""
super(Affine, self).__init__()
self.num_features = num_features
self.inplace = inplace
if not fix_weight:
self.weight = Parameter(constant_ops.ones(num_features))
if inplace:
raise ValueError('In-place operation requires fixed weight.')
else:
self.register_buffer('weight', constant_ops.ones(num_features))
if bias:
if not fix_bias:
self.bias = Parameter(constant_ops.zeros(num_features))
else:
self.register_buffer('bias', constant_ops.zeros(num_features))
else:
self.bias = None
def extra_repr(self):
s = '{num_features}, ' \
'inplace={inplace}'.format(**self.__dict__)
if self.bias is None:
s += ', bias=False'
return s
def forward(self, input):
return F.affine(
input,
self.weight,
self.bias,
dim=1,
out=input if self.inplace else None,
)
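A hypothetical sketch of the constraint encoded in the constructor above: the in-place path is only permitted when the weight is registered as a fixed buffer:

```python
from dragon.vm import torch
from dragon.vm.torch import nn

m = nn.Affine(5, fix_weight=True, fix_bias=True, inplace=True)
x = torch.ones(3, 5)
y = m(x)  # intended to write into x (out=input)

# nn.Affine(5, inplace=True)  # would raise: in-place requires a fixed weight
```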
class Identity(Module):
r"""Apply the identity transformation.
......
......@@ -20,98 +20,10 @@ from dragon.core.util import nest
from dragon.vm.torch.core.nn import functional as F
from dragon.vm.torch.core.nn.modules.module import Module
from dragon.vm.torch.core.nn.parameter import Parameter
from dragon.vm.torch.core.ops import array_ops
from dragon.vm.torch.core.ops import constant_ops
from dragon.vm.torch.core.tensor import Tensor
class AffineChannel(Module):
"""Apply affine transformation to channels.
Affine is often taken as a post-processing of normalization.
Examples:
```python
m = torch.nn.AffineChannel(5)
# Apply a 2d transformation
x2d = torch.ones(3, 5)
y2d = m(x2d)
# Apply a 3d transformation
x3d = torch.ones(3, 5, 4)
y3d = m(x3d)
# Apply a 4d transformation
x4d = torch.ones(3, 5, 2, 2)
y4d = m(x4d)
```
See Also
--------
`torch.channel_affine(...)`_
"""
def __init__(
self,
num_features,
bias=True,
fix_weight=False,
fix_bias=False,
inplace=False,
):
"""Create an ``AffineChannel`` module.
Parameters
----------
num_features : int
The number of channels.
bias : bool, optional, default=True
``True`` to attach a bias.
fix_weight : bool, optional, default=False
``True`` to frozen the ``weight``.
fix_bias : bool, optional, default=False
``True`` to frozen the ``bias``.
inplace : bool, optional, default=False
Whether to do the operation in-place.
"""
super(AffineChannel, self).__init__()
self.num_features = num_features
self.inplace = inplace
if not fix_weight:
self.weight = Parameter(constant_ops.ones(num_features))
if inplace:
raise ValueError('In-place operation requires fixed weight.')
else:
self.register_buffer('weight', constant_ops.ones(num_features))
if bias:
if not fix_bias:
self.bias = Parameter(constant_ops.zeros(num_features))
else:
self.register_buffer('bias', constant_ops.zeros(num_features))
else:
self.bias = None
def extra_repr(self):
s = '{num_features}, ' \
'inplace={inplace}'.format(**self.__dict__)
if self.bias is None:
s += ', bias=False'
return s
def forward(self, input):
return array_ops.channel_affine(
input,
self.weight,
self.bias,
dim=1,
out=input if self.inplace else None,
)
class GroupNorm(Module):
r"""Apply the group normalization.
`[Wu & He, 2018] <https://arxiv.org/abs/1803.08494>`_.
......
......@@ -42,85 +42,6 @@ def cat(tensors, dim=0, out=None):
'Concat', tensors[0].device, tensors, outputs=[out], axis=dim)
def channel_affine(input, weight, bias=None, dim=-1, end_dim=None, out=None):
"""Apply affine transformation to each channel of input.
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
weight : dragon.vm.torch.Tensor
The weight tensor.
bias : dragon.vm.torch.Tensor, optional
The bias tensor.
dim : int, optional, default=-1
The first channel dimension.
end_dim : int, optional
The last channel dimension.
out : dragon.vm.torch.Tensor, optional
The output tensor.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
return Function.apply(
'ChannelAffine', input.device,
[input, weight] + ([bias] if bias else []), outputs=[out],
axis=dim, end_axis=end_dim)
def channel_normalize(input, mean, std, dim=-1, dtype='float32', dims=None):
"""Apply normalization to each channel of input.
:attr:`dim` can be negative:
```python
m = s = (1., 1., 1.)
x = torch.tensor([1, 2, 3])
print(torch.channel_normalize(x, m, s, dim=0)) # [0., 1., 2.]
print(torch.channel_normalize(x, m, s, dim=-1)) # Equivalent
```
If :attr:`dims` provided, :attr:`dim` is selected from the output layout:
```python
m, s = (1., 2., 3.), (1., 1., 1.)
x = torch.tensor([[1, 2, 3]])
# Provided 3 values to normalize the last dimension
# with length 1, only the first value will be taken
print(torch.channel_normalize(x, m, s, dims=(1, 0))) # [[0.], [1.], [2.]]
```
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
mean : Sequence[float], required
The mean to subtract.
std : Sequence[float], required
The standard deviation to divide.
dim : int, optional, default=-1
The channel dimension.
dtype : str, optional, default='float32'
The output data type.
dims : Sequence[int], optional
The order of output dimensions.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
return Function.apply(
'ChannelNormalize', input.device, [input],
axis=dim, mean=mean, std=std, dtype=dtype,
ndim=len(dims) if dims is not None else 0, perm=dims)
def chunk(tensor, chunks, dim=0, copy=True):
"""Split input into a specific number of chunks.
......
......@@ -168,6 +168,37 @@ def argmin(input, dim, keepdim=False, out=None):
axis=dim, keepdims=keepdim)
def atan2(input, other, out=None):
r"""Compute the element-wise arc-tangent of two arguments.
.. math:: \text{out} = \text{arctan}(\frac{\text{input}}{\text{other}})
Examples:
```python
y = torch.tensor(1.)
x = torch.tensor(2.)
print(torch.atan2(y, x)) # 0.46364761
```
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
other : Union[dragon.vm.torch.Tensor, number]
The value to divide ``input`` by.
out : dragon.vm.torch.Tensor, optional
The output tensor.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
return _binary_func(input, other, 'Atan2', out)
def baddbmm(input, batch1, batch2, beta=1, alpha=1, out=None):
r"""Add input to the result of batched matrix-matrix multiplication.
......
......@@ -186,6 +186,29 @@ def argsort(self, dim=-1, descending=False):
return sort_ops.argsort(self, dim, descending)
def atan2(self, other):
r"""Compute the element-wise arc-tangent of two arguments.
.. math:: \text{out} = \text{arctan}(\frac{\text{self}}{\text{other}})
Parameters
----------
other : Union[dragon.vm.torch.Tensor, number]
The value to divide ``self`` by.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.atan2(...)`_
"""
return math_ops.atan2(self, other)
def baddbmm(self, batch1, batch2, beta=1, alpha=1):
r"""Add the result of batched matrix-matrix multiplication.
......@@ -3051,6 +3074,7 @@ Tensor.addmm = addmm
Tensor.argmax = argmax
Tensor.argmin = argmin
Tensor.argsort = argsort
Tensor.atan2 = atan2
Tensor.backward = backward
Tensor.baddbmm = baddbmm
Tensor.baddbmm_ = baddbmm_
......
......@@ -28,9 +28,11 @@ class Adam(Optimizer):
The **Adam** update is defined as:
.. math::
\text{Adam}(g) = \text{lr} * \frac{m_{t}}{\sqrt{v_{t}} + \epsilon} \\
\text{Adam}(g) = \text{lr} * (\frac{\text{correction} * m_{t}}
{\sqrt{v_{t}} + \epsilon}) \\
\quad \\ \text{where}\quad
\begin{cases}
\text{correction} = \sqrt{1 - \beta_{2}^{t}} / (1 - \beta_{1}^{t}) \\
m_{t} = \beta_{1} * m_{t-1} + (1 - \beta_{1}) * g \\
v_{t} = \beta_{2} * v_{t-1} + (1 - \beta_{2}) * g^{2}
\end{cases}
......@@ -88,12 +90,13 @@ class AdamW(Adam):
The **AdamW** update is defined as:
.. math::
\text{AdamW}(g, p) = \text{lr} * (\frac{m_{t}}{\sqrt{v_{t}} + \epsilon}
+ \lambda p) \\
\text{AdamW}(g, p) = \text{lr} * (\frac{\text{correction} * m_{t}}
{\sqrt{v_{t}} + \epsilon} + \lambda p) \\
\quad \\ \text{where}\quad
\begin{cases}
\text{correction} = \sqrt{1 - \beta_{2}^{t}} / (1 - \beta_{1}^{t}) \\
m_{t} = \beta_{1} * m_{t-1} + (1 - \beta_{1}) * g \\
v_{t} = \beta_{2} * v_{t-1} + (1 - \beta_{2}) * g^{2}
v_{t} = \beta_{2} * v_{t-1} + (1 - \beta_{2}) * g^{2} \\
\end{cases}
"""
......
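A pure-Python sketch of the bias-corrected update written above; it mirrors the math, not the fused kernel introduced by this commit. `weight_decay` is the decoupled `lambda * p` term used by AdamW and is zero for plain Adam:

```python
import math

def adam_step(p, g, m, v, t, lr=1e-3, beta1=0.9, beta2=0.999,
              eps=1e-8, weight_decay=0.0):
    """One scalar Adam/AdamW step following the formulas above."""
    m = beta1 * m + (1.0 - beta1) * g
    v = beta2 * v + (1.0 - beta2) * g * g
    correction = math.sqrt(1.0 - beta2 ** t) / (1.0 - beta1 ** t)
    p -= lr * (correction * m / (math.sqrt(v) + eps) + weight_decay * p)
    return p, m, v
```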
......@@ -78,5 +78,4 @@ class RMSprop(Optimizer):
defaults = dict(lr=lr, momentum=momentum, alpha=alpha, eps=eps,
centered=centered, weight_decay=weight_decay)
super(RMSprop, self).__init__(params, defaults, **kwargs)
self._hyper['alpha'][0] = 'decay'
self._hyper.pop('centered') # Unsupported.
......@@ -372,6 +372,27 @@ class Tensor(object):
"""
def atan2(self, other):
r"""Compute the element-wise arc-tangent of two arguments.
.. math:: \text{out} = \text{arctan}(\frac{\text{self}}{\text{other}})
Parameters
----------
other : Union[dragon.vm.torch.Tensor, number]
The value to divide ``self`` by.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.atan2(...)`_
"""
def backward(self, gradient=None, retain_graph=False):
"""Compute the derivatives of this tensor w.r.t. graph leaves.
......