Commit 746f2cbb by Ting PAN

Add FP16 support for DepthwiseConv2d && SyncBN Operator

Summary:
This commit adds pseudo FP16 kernels with FP32 conversions
for the DepthwiseConv2d and SyncBN operators.
1 parent d56e67d1
Showing with 521 additions and 581 deletions
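The "pseudo FP16" approach keeps tensors in half precision but widens each element to FP32 before computing and narrows the result afterwards, the same pattern as the Clip<float16> and convert::To<T>() changes in the diff below. A minimal standalone sketch of that pattern, assuming a compiler with the _Float16 extension (ClipHalf and half_t are illustrative names, not dragon's API):

#include <algorithm>
#include <cstdio>

using half_t = _Float16;  // stand-in for dragon's float16 storage type

// FP16 in/out, FP32 arithmetic in the middle.
void ClipHalf(int count, float low, float high, const half_t* x, half_t* y) {
  for (int i = 0; i < count; ++i) {
    float val = static_cast<float>(x[i]);      // widen to FP32
    val = std::max(low, std::min(val, high));  // compute in FP32
    y[i] = static_cast<half_t>(val);           // narrow back to FP16
  }
}

int main() {
  half_t x[4] = {half_t(-2.5f), half_t(0.25f), half_t(1.5f), half_t(9.0f)};
  half_t y[4];
  ClipHalf(4, -1.f, 2.f, x, y);
  for (float v : y) std::printf("%g ", v);  // prints: -1 0.25 1.5 2
  std::printf("\n");
  return 0;
}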
@@ -81,6 +81,9 @@ dragon
 `function(...) <dragon/function.html>`_
 : Compile a function and return an executable.
+`get_num_threads(...) <dragon/get_num_threads.html>`_
+: Return the number of threads for cpu parallelism.
 `get_workspace(...) <dragon/get_workspace.html>`_
 : Return the current default workspace.
@@ -138,6 +141,9 @@ dragon
 `reshape(...) <dragon/reshape.html>`_
 : Change the dimensions of input.
+`set_num_threads(...) <dragon/set_num_threads.html>`_
+: Set the number of threads for cpu parallelism.
 `shape(...) <dragon/shape.html>`_
 : Return the shape of input.
@@ -204,6 +210,7 @@ dragon
 dragon/fill
 dragon/flatten
 dragon/function
+dragon/get_num_threads
 dragon/get_workspace
 dragon/gradients
 dragon/graph_mode
@@ -223,6 +230,7 @@ dragon
 dragon/repeat
 dragon/reset_workspace
 dragon/reshape
+dragon/set_num_threads
 dragon/shape
 dragon/slice
 dragon/sort
......
get_num_threads
===============

.. autofunction:: dragon.get_num_threads

.. raw:: html

  <style>
  h1:before {
    content: "dragon.";
    color: #103d3e;
  }
  </style>
set_num_threads
===============

.. autofunction:: dragon.set_num_threads

.. raw:: html

  <style>
  h1:before {
    content: "dragon.";
    color: #103d3e;
  }
  </style>
...@@ -18,7 +18,7 @@ ...@@ -18,7 +18,7 @@
#include "dragon/core/operator_schema.h" #include "dragon/core/operator_schema.h"
#include "dragon/core/registry.h" #include "dragon/core/registry.h"
#include "dragon/core/tensor.h" #include "dragon/core/tensor.h"
#include "dragon/utils/cast.h" #include "dragon/utils/conversions.h"
namespace dragon { namespace dragon {
......
...@@ -19,6 +19,11 @@ ...@@ -19,6 +19,11 @@
#include "dragon/core/typeid.h" #include "dragon/core/typeid.h"
#ifndef HFLT_MAX
#define HFLT_MAX 65504.F
#define HFLT_MIN 6.10e-5F
#endif
namespace dragon { namespace dragon {
typedef std::vector<int> vec32_t; typedef std::vector<int> vec32_t;
......
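The HFLT_MAX / HFLT_MIN fallbacks added above are the IEEE-754 binary16 limits: the largest finite half is (2 - 2^-10) * 2^15 = 65504 and the smallest normal half is 2^-14, roughly 6.10e-5. A quick check of that arithmetic:

#include <cmath>
#include <cstdio>

int main() {
  // Largest finite binary16 value: mantissa (2 - 2^-10) at exponent 2^15.
  double hflt_max = (2.0 - std::ldexp(1.0, -10)) * std::ldexp(1.0, 15);
  // Smallest normal binary16 value: 2^-14.
  double hflt_min = std::ldexp(1.0, -14);
  std::printf("HFLT_MAX = %.1f\n", hflt_max);  // 65504.0
  std::printf("HFLT_MIN = %.3e\n", hflt_min);  // 6.104e-05
  return 0;
}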
...@@ -34,7 +34,7 @@ void _DropBlock2dNCHW( ...@@ -34,7 +34,7 @@ void _DropBlock2dNCHW(
} }
} // Share the mask between channels } // Share the mask between channels
} }
utils::math::IncreaseIndexInDims(3, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(3, dims.data(), idx.data());
} }
} }
...@@ -65,7 +65,7 @@ void _DropBlock2dNHWC( ...@@ -65,7 +65,7 @@ void _DropBlock2dNHWC(
} }
} // Share the mask between channels } // Share the mask between channels
} }
utils::math::IncreaseIndexInDims(3, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(3, dims.data(), idx.data());
} }
} }
......
#include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/omp_utils.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
#ifdef USE_CUDA #ifdef USE_CUDA
#include "dragon/core/context_cuda.h" #include "dragon/core/context_cuda.h"
#include "dragon/utils/cast.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
...@@ -86,7 +85,7 @@ void DropPath<float16, CUDAContext>( ...@@ -86,7 +85,7 @@ void DropPath<float16, CUDAContext>(
const auto nthreads = rows * cols; \ const auto nthreads = rows * cols; \
const auto thresh = 1.f - (1.f / scale); \ const auto thresh = 1.f - (1.f / scale); \
_DropPath<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ _DropPath<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
nthreads, cols, thresh, cast::to<T>(scale), x, mask, y); \ nthreads, cols, thresh, convert::To<T>(scale), x, mask, y); \
} }
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float);
......
#include "dragon/utils/cast.h" #include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/omp_utils.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -72,7 +71,7 @@ void _Dropout<float16>( ...@@ -72,7 +71,7 @@ void _Dropout<float16>(
const uint8_t* mask, \ const uint8_t* mask, \
T* y, \ T* y, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_ApplyMask(count, cast::to<T>(scale), x, mask, y); \ _ApplyMask(count, convert::To<T>(scale), x, mask, y); \
} \ } \
template <> \ template <> \
void Dropout<T, CPUContext>( \ void Dropout<T, CPUContext>( \
...@@ -84,7 +83,8 @@ void _Dropout<float16>( ...@@ -84,7 +83,8 @@ void _Dropout<float16>(
T* y, \ T* y, \
uint32_t* r, \ uint32_t* r, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_Dropout(count, cast::to<T>(ratio), cast::to<T>(scale), x, mask, y, ctx); \ _Dropout( \
count, convert::To<T>(ratio), convert::To<T>(scale), x, mask, y, ctx); \
} }
DEFINE_KERNEL_LAUNCHER(float16); DEFINE_KERNEL_LAUNCHER(float16);
......
#ifdef USE_CUDA #ifdef USE_CUDA
#include "dragon/core/context_cuda.h" #include "dragon/core/context_cuda.h"
#include "dragon/utils/cast.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
...@@ -113,7 +112,7 @@ void Dropout<float16, CUDAContext>( ...@@ -113,7 +112,7 @@ void Dropout<float16, CUDAContext>(
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_ApplyMask<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ _ApplyMask<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, cast::to<T>(scale), x, mask, y); \ count, convert::To<T>(scale), x, mask, y); \
} \ } \
template <> \ template <> \
void Dropout<T, CUDAContext>( \ void Dropout<T, CUDAContext>( \
...@@ -128,7 +127,7 @@ void Dropout<float16, CUDAContext>( ...@@ -128,7 +127,7 @@ void Dropout<float16, CUDAContext>(
math::Random(count, r, ctx); \ math::Random(count, r, ctx); \
auto threshold = static_cast<uint32_t>(UINT_MAX * ratio); \ auto threshold = static_cast<uint32_t>(UINT_MAX * ratio); \
_Dropout<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ _Dropout<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, threshold, cast::to<T>(scale), x, r, mask, y); \ count, threshold, convert::To<T>(scale), x, r, mask, y); \
} }
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float);
......
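The dropout launchers above draw one uint32 per element, compare it against threshold = UINT_MAX * ratio, and rescale the survivors; in the usual inverted-dropout setup the scale is 1 / (1 - ratio) so the expected output matches the input. A host-side sketch of that masking (illustrative only, not the CUDA kernel):

#include <cstdint>
#include <cstdio>
#include <random>
#include <vector>

// Inverted dropout: survivors are rescaled by 1 / (1 - ratio) so the
// expected value of the output matches the input. The CUDA kernels above
// do the same with uint32 draws and an integer threshold.
void Dropout(float ratio, const std::vector<float>& x,
             std::vector<uint8_t>* mask, std::vector<float>* y,
             uint64_t seed = 0) {
  std::mt19937_64 rng(seed);
  std::uniform_real_distribution<float> uniform(0.f, 1.f);
  const float scale = 1.f / (1.f - ratio);
  mask->resize(x.size());
  y->resize(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    (*mask)[i] = uniform(rng) >= ratio ? 1 : 0;  // keep with prob. 1 - ratio
    (*y)[i] = x[i] * (*mask)[i] * scale;
  }
}

int main() {
  std::vector<float> x(8, 1.f), y;
  std::vector<uint8_t> mask;
  Dropout(0.5f, x, &mask, &y);
  for (float v : y) std::printf("%g ", v);  // each entry is 0 or 2
  std::printf("\n");
  return 0;
}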
#include "dragon/utils/cast.h" #include "dragon/utils/conversions.h"
#include "dragon/utils/eigen_utils.h" #include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -50,7 +50,7 @@ void _EluGrad<float16>( ...@@ -50,7 +50,7 @@ void _EluGrad<float16>(
template <> \ template <> \
void Elu<T, CPUContext>( \ void Elu<T, CPUContext>( \
const int count, const float alpha, const T* x, T* y, CPUContext* ctx) { \ const int count, const float alpha, const T* x, T* y, CPUContext* ctx) { \
_Elu(count, cast::to<T>(alpha), x, y); \ _Elu(count, convert::To<T>(alpha), x, y); \
} }
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
...@@ -62,7 +62,7 @@ void _EluGrad<float16>( ...@@ -62,7 +62,7 @@ void _EluGrad<float16>(
const T* y, \ const T* y, \
T* dx, \ T* dx, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_EluGrad(count, cast::to<T>(alpha), dy, y, dx); \ _EluGrad(count, convert::To<T>(alpha), dy, y, dx); \
} }
DEFINE_KERNEL_LAUNCHER(float16); DEFINE_KERNEL_LAUNCHER(float16);
......
#include "dragon/utils/cast.h" #include "dragon/utils/conversions.h"
#include "dragon/utils/eigen_utils.h" #include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -65,7 +65,7 @@ void _HardSigmoidGrad<float16>( ...@@ -65,7 +65,7 @@ void _HardSigmoidGrad<float16>(
const T* x, \ const T* x, \
T* y, \ T* y, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_HardSigmoid(count, cast::to<T>(alpha), cast::to<T>(beta), x, y); \ _HardSigmoid(count, convert::To<T>(alpha), convert::To<T>(beta), x, y); \
} }
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
...@@ -77,7 +77,7 @@ void _HardSigmoidGrad<float16>( ...@@ -77,7 +77,7 @@ void _HardSigmoidGrad<float16>(
const T* y, \ const T* y, \
T* dx, \ T* dx, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_HardSigmoidGrad(count, cast::to<T>(alpha), dy, y, dx); \ _HardSigmoidGrad(count, convert::To<T>(alpha), dy, y, dx); \
} }
DEFINE_KERNEL_LAUNCHER(float16); DEFINE_KERNEL_LAUNCHER(float16);
......
#include "dragon/utils/cast.h" #include "dragon/utils/conversions.h"
#include "dragon/utils/eigen_utils.h" #include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -68,7 +68,7 @@ void _HardSwishGrad<float16>( ...@@ -68,7 +68,7 @@ void _HardSwishGrad<float16>(
const T* x, \ const T* x, \
T* y, \ T* y, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_HardSwish(count, cast::to<T>(alpha), cast::to<T>(beta), x, y); \ _HardSwish(count, convert::To<T>(alpha), convert::To<T>(beta), x, y); \
} }
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
...@@ -81,7 +81,8 @@ void _HardSwishGrad<float16>( ...@@ -81,7 +81,8 @@ void _HardSwishGrad<float16>(
const T* x, \ const T* x, \
T* dx, \ T* dx, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_HardSwishGrad(count, cast::to<T>(alpha), cast::to<T>(beta), dy, x, dx); \ _HardSwishGrad( \
count, convert::To<T>(alpha), convert::To<T>(beta), dy, x, dx); \
} }
DEFINE_KERNEL_LAUNCHER(float16); DEFINE_KERNEL_LAUNCHER(float16);
......
#include "dragon/utils/eigen_utils.h" #include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
......
#include "dragon/utils/cast.h" #include "dragon/utils/conversions.h"
#include "dragon/utils/eigen_utils.h" #include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -87,7 +87,7 @@ void _ReluNGrad<float16>( ...@@ -87,7 +87,7 @@ void _ReluNGrad<float16>(
template <> \ template <> \
void Relu<T, CPUContext>( \ void Relu<T, CPUContext>( \
const int count, const float alpha, const T* x, T* y, CPUContext* ctx) { \ const int count, const float alpha, const T* x, T* y, CPUContext* ctx) { \
_Relu(count, cast::to<T>(alpha), x, y); \ _Relu(count, convert::To<T>(alpha), x, y); \
} \ } \
template <> \ template <> \
void ReluN<T, CPUContext>( \ void ReluN<T, CPUContext>( \
...@@ -96,7 +96,7 @@ void _ReluNGrad<float16>( ...@@ -96,7 +96,7 @@ void _ReluNGrad<float16>(
const T* x, \ const T* x, \
T* y, \ T* y, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_ReluN(count, cast::to<T>(max_value), x, y); \ _ReluN(count, convert::To<T>(max_value), x, y); \
} }
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
...@@ -108,7 +108,7 @@ void _ReluNGrad<float16>( ...@@ -108,7 +108,7 @@ void _ReluNGrad<float16>(
const T* y, \ const T* y, \
T* dx, \ T* dx, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_ReluGrad(count, cast::to<T>(alpha), dy, y, dx); \ _ReluGrad(count, convert::To<T>(alpha), dy, y, dx); \
} \ } \
template <> \ template <> \
void ReluNGrad<T, CPUContext>( \ void ReluNGrad<T, CPUContext>( \
...@@ -118,7 +118,7 @@ void _ReluNGrad<float16>( ...@@ -118,7 +118,7 @@ void _ReluNGrad<float16>(
const T* y, \ const T* y, \
T* dx, \ T* dx, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_ReluNGrad(count, cast::to<T>(max_value), dy, y, dx); \ _ReluNGrad(count, convert::To<T>(max_value), dy, y, dx); \
} }
DEFINE_KERNEL_LAUNCHER(float16); DEFINE_KERNEL_LAUNCHER(float16);
......
#ifdef USE_CUDA #ifdef USE_CUDA
#include "dragon/core/context_cuda.h" #include "dragon/core/context_cuda.h"
#include "dragon/utils/cast.h" #include "dragon/utils/conversions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -287,13 +287,13 @@ void ReluN<float16, CUDAContext>( ...@@ -287,13 +287,13 @@ void ReluN<float16, CUDAContext>(
0, 0,
ctx->cuda_stream()>>>( ctx->cuda_stream()>>>(
count >> 1, count >> 1,
cast::to<half>(max_value), convert::To<half>(max_value),
reinterpret_cast<const half2*>(x), reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y)); reinterpret_cast<half2*>(y));
} else { } else {
_ReluN<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( _ReluN<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
count, count,
cast::to<half>(max_value), convert::To<half>(max_value),
reinterpret_cast<const half*>(x), reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y)); reinterpret_cast<half*>(y));
} }
...@@ -339,14 +339,14 @@ void ReluNGrad<float16, CUDAContext>( ...@@ -339,14 +339,14 @@ void ReluNGrad<float16, CUDAContext>(
0, 0,
ctx->cuda_stream()>>>( ctx->cuda_stream()>>>(
count >> 1, count >> 1,
cast::to<half2>(max_value), convert::To<half2>(max_value),
reinterpret_cast<const half2*>(dy), reinterpret_cast<const half2*>(dy),
reinterpret_cast<const half2*>(y), reinterpret_cast<const half2*>(y),
reinterpret_cast<half2*>(dx)); reinterpret_cast<half2*>(dx));
} else { } else {
_ReluNGrad<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( _ReluNGrad<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
count, count,
cast::to<half>(max_value), convert::To<half>(max_value),
reinterpret_cast<const half*>(dy), reinterpret_cast<const half*>(dy),
reinterpret_cast<const half*>(y), reinterpret_cast<const half*>(y),
reinterpret_cast<half*>(dx)); reinterpret_cast<half*>(dx));
...@@ -362,7 +362,7 @@ void ReluNGrad<float16, CUDAContext>( ...@@ -362,7 +362,7 @@ void ReluNGrad<float16, CUDAContext>(
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_Relu<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ _Relu<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, cast::to<T>(alpha), x, y); \ count, convert::To<T>(alpha), x, y); \
} \ } \
template <> \ template <> \
void ReluN<T, CUDAContext>( \ void ReluN<T, CUDAContext>( \
...@@ -372,7 +372,7 @@ void ReluNGrad<float16, CUDAContext>( ...@@ -372,7 +372,7 @@ void ReluNGrad<float16, CUDAContext>(
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_ReluN<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ _ReluN<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, cast::to<T>(max_value), x, y); \ count, convert::To<T>(max_value), x, y); \
} }
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
...@@ -385,7 +385,7 @@ void ReluNGrad<float16, CUDAContext>( ...@@ -385,7 +385,7 @@ void ReluNGrad<float16, CUDAContext>(
T* dx, \ T* dx, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_ReluGrad<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ _ReluGrad<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, cast::to<T>(alpha), dy, y, dx); \ count, convert::To<T>(alpha), dy, y, dx); \
} \ } \
template <> \ template <> \
void ReluNGrad<T, CUDAContext>( \ void ReluNGrad<T, CUDAContext>( \
...@@ -396,7 +396,7 @@ void ReluNGrad<float16, CUDAContext>( ...@@ -396,7 +396,7 @@ void ReluNGrad<float16, CUDAContext>(
T* dx, \ T* dx, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_ReluNGrad<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ _ReluNGrad<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, cast::to<T>(max_value), dy, y, dx); \ count, convert::To<T>(max_value), dy, y, dx); \
} }
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float);
......
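The ReluN<float16> launcher above has a packed path that reinterprets the buffers as half2 and processes count >> 1 pairs with convert::To<half2>(max_value), falling back to the scalar half kernel otherwise (presumably keyed on whether count is even). The same even/odd dispatch, sketched on the host with plain float pairs:

#include <algorithm>
#include <cstdio>

// Clamp to [0, max_value], two elements per iteration when count is even.
void ReluNPacked(int count, float max_value, const float* x, float* y) {
  if ((count & 1) == 0) {
    for (int i = 0; i < count >> 1; ++i) {  // packed path: pairs
      y[2 * i + 0] = std::min(std::max(x[2 * i + 0], 0.f), max_value);
      y[2 * i + 1] = std::min(std::max(x[2 * i + 1], 0.f), max_value);
    }
  } else {
    for (int i = 0; i < count; ++i) {  // scalar fallback
      y[i] = std::min(std::max(x[i], 0.f), max_value);
    }
  }
}

int main() {
  const float x[6] = {-1.f, 0.5f, 2.f, 7.f, 5.9f, 6.1f};
  float y[6];
  ReluNPacked(6, 6.f, x, y);
  for (float v : y) std::printf("%g ", v);  // 0 0.5 2 6 5.9 6
  std::printf("\n");
  return 0;
}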
#include "dragon/utils/cast.h" #include "dragon/utils/conversions.h"
#include "dragon/utils/eigen_utils.h" #include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -66,7 +66,7 @@ void _SeluGrad<float16>( ...@@ -66,7 +66,7 @@ void _SeluGrad<float16>(
const T* x, \ const T* x, \
T* y, \ T* y, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_Selu(count, cast::to<T>(alpha), cast::to<T>(gamma), x, y); \ _Selu(count, convert::To<T>(alpha), convert::To<T>(gamma), x, y); \
} }
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
...@@ -79,7 +79,7 @@ void _SeluGrad<float16>( ...@@ -79,7 +79,7 @@ void _SeluGrad<float16>(
const T* y, \ const T* y, \
T* dx, \ T* dx, \
CPUContext* tx) { \ CPUContext* tx) { \
_SeluGrad(count, cast::to<T>(alpha), cast::to<T>(gamma), dy, y, dx); \ _SeluGrad(count, convert::To<T>(alpha), convert::To<T>(gamma), dy, y, dx); \
} }
DEFINE_KERNEL_LAUNCHER(float16); DEFINE_KERNEL_LAUNCHER(float16);
......
#ifdef USE_CUDA #ifdef USE_CUDA
#include "dragon/core/context_cuda.h" #include "dragon/core/context_cuda.h"
#include "dragon/utils/cast.h" #include "dragon/utils/conversions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
#include "dragon/utils/eigen_utils.h" #include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
#ifdef USE_CUDA #ifdef USE_CUDA
#include "dragon/core/context_cuda.h" #include "dragon/core/context_cuda.h"
#include "dragon/utils/cast.h"
#include "dragon/utils/device/common_cub.h" #include "dragon/utils/device/common_cub.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
...@@ -200,7 +199,7 @@ void Softmax<float16, CUDAContext>( ...@@ -200,7 +199,7 @@ void Softmax<float16, CUDAContext>(
rows, rows,
cols, cols,
inner_dim, inner_dim,
cast::to<half>(std::numeric_limits<float>::lowest()), convert::To<half>(std::numeric_limits<float>::lowest()),
reinterpret_cast<const half*>(x), reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y)); reinterpret_cast<half*>(y));
} }
......
#include "dragon/utils/eigen_utils.h" #include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
#include "dragon/utils/eigen_utils.h" #include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
...@@ -35,7 +35,7 @@ __global__ void _Tanh<half2>(const int nthreads, const half2* x, half2* y) { ...@@ -35,7 +35,7 @@ __global__ void _Tanh<half2>(const int nthreads, const half2* x, half2* y) {
template <typename T> template <typename T>
__global__ void _TanhGrad(const int nthreads, const T* dy, const T* y, T* dx) { __global__ void _TanhGrad(const int nthreads, const T* dy, const T* y, T* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx[i] = dy[i] * (T(1) - utils::math::Square(y[i])); dx[i] = dy[i] * (T(1) - math::utils::Square(y[i]));
} }
} }
...@@ -44,7 +44,7 @@ __global__ void ...@@ -44,7 +44,7 @@ __global__ void
_TanhGrad<half>(const int nthreads, const half* dy, const half* y, half* dx) { _TanhGrad<half>(const int nthreads, const half* dy, const half* y, half* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx[i] = __float2half( dx[i] = __float2half(
__half2float(dy[i]) * (1.f - utils::math::Square(__half2float(y[i])))); __half2float(dy[i]) * (1.f - math::utils::Square(__half2float(y[i]))));
} }
} }
...@@ -58,8 +58,8 @@ __global__ void _TanhGrad<half2>( ...@@ -58,8 +58,8 @@ __global__ void _TanhGrad<half2>(
const float2 val = __half22float2(y[i]); const float2 val = __half22float2(y[i]);
const float2 grad = __half22float2(dy[i]); const float2 grad = __half22float2(dy[i]);
dx[i] = __floats2half2_rn( dx[i] = __floats2half2_rn(
grad.x * (1.f - utils::math::Square(val.x)), grad.x * (1.f - math::utils::Square(val.x)),
grad.y * (1.f - utils::math::Square(val.y))); grad.y * (1.f - math::utils::Square(val.y)));
} }
} }
......
#include "dragon/utils/eigen_utils.h" #include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
...@@ -28,7 +28,7 @@ void _ChannelNormalize( ...@@ -28,7 +28,7 @@ void _ChannelNormalize(
if (d == axis) wi = idx[d]; if (d == axis) wi = idx[d];
} }
y[yi] = ((Ty)x[xi] - (Ty)mean[wi]) / (Ty)std[wi]; y[yi] = ((Ty)x[xi] - (Ty)mean[wi]) / (Ty)std[wi];
utils::math::IncreaseIndexInDims(num_dims, y_dims, idx.data()); math::utils::IncreaseIndexInDims(num_dims, y_dims, idx.data());
} }
} }
......
...@@ -26,7 +26,7 @@ void _CumSum( ...@@ -26,7 +26,7 @@ void _CumSum(
} else { } else {
y[i] = exclusive ? T(0) : x[i]; y[i] = exclusive ? T(0) : x[i];
} }
utils::math::IncreaseIndexInDims(3, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(3, dims.data(), idx.data());
} }
} }
......
#include "dragon/utils/cast.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
...@@ -11,7 +10,7 @@ namespace { ...@@ -11,7 +10,7 @@ namespace {
template <typename T> template <typename T>
void _SetEye(const int n, const int m, const int k, T* y) { void _SetEye(const int n, const int m, const int k, T* y) {
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
y[i * m + k + i] = cast::to<T>(1.f); y[i * m + k + i] = convert::To<T>(1.f);
} }
} }
...@@ -23,7 +22,7 @@ void _SetEye(const int n, const int m, const int k, T* y) { ...@@ -23,7 +22,7 @@ void _SetEye(const int n, const int m, const int k, T* y) {
template <> \ template <> \
void Eye<T, CPUContext>( \ void Eye<T, CPUContext>( \
const int n, const int m, const int k, T* y, CPUContext* ctx) { \ const int n, const int m, const int k, T* y, CPUContext* ctx) { \
math::Set(n* m, cast::to<T>(0.f), y, ctx); \ math::Set(n* m, convert::To<T>(0.f), y, ctx); \
if (k > 0) { \ if (k > 0) { \
if (m - k > 0) _SetEye(m - k, m, k, y); \ if (m - k > 0) _SetEye(m - k, m, k, y); \
} else { \ } else { \
......
#ifdef USE_CUDA #ifdef USE_CUDA
#include "dragon/core/context_cuda.h" #include "dragon/core/context_cuda.h"
#include "dragon/utils/cast.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
...@@ -37,7 +36,7 @@ void Eye<float16, CUDAContext>( ...@@ -37,7 +36,7 @@ void Eye<float16, CUDAContext>(
const int k, const int k,
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
math::Set(n * m, cast::to<float16>(0.f), y, ctx); math::Set(n * m, convert::To<float16>(0.f), y, ctx);
if (k > 0) { if (k > 0) {
if (m - k > 0) { if (m - k > 0) {
_SetEye<<<CUDA_BLOCKS(m - k), CUDA_THREADS, 0, ctx->cuda_stream()>>>( _SetEye<<<CUDA_BLOCKS(m - k), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
......
#include "dragon/utils/cast.h" #include "dragon/utils/conversions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -16,12 +16,12 @@ void _RowwiseLinSpace( ...@@ -16,12 +16,12 @@ void _RowwiseLinSpace(
T* y) { T* y) {
for (int i = 0; i < cols; ++i) { for (int i = 0; i < cols; ++i) {
const auto delta = (stop[i] - start[i]) / double(rows - 1); const auto delta = (stop[i] - start[i]) / double(rows - 1);
y[i] = cast::to<T>(start[i]); y[i] = convert::To<T>(start[i]);
if (rows > 1) { if (rows > 1) {
y[i + (rows - 1) * cols] = cast::to<T>(stop[i]); y[i + (rows - 1) * cols] = convert::To<T>(stop[i]);
} }
for (int j = 1; j < rows - 1; ++j) { for (int j = 1; j < rows - 1; ++j) {
y[i + j * cols] = cast::to<T>(start[i] + double(j) * delta); y[i + j * cols] = convert::To<T>(start[i] + double(j) * delta);
} }
} }
} }
...@@ -36,12 +36,12 @@ void _ColwiseLinSpace( ...@@ -36,12 +36,12 @@ void _ColwiseLinSpace(
for (int i = 0; i < rows; ++i) { for (int i = 0; i < rows; ++i) {
const auto delta = (stop[i] - start[i]) / double(cols - 1); const auto delta = (stop[i] - start[i]) / double(cols - 1);
auto* offset_y = y + i * cols; auto* offset_y = y + i * cols;
offset_y[0] = cast::to<T>(start[i]); offset_y[0] = convert::To<T>(start[i]);
if (cols > 1) { if (cols > 1) {
offset_y[cols - 1] = cast::to<T>(stop[i]); offset_y[cols - 1] = convert::To<T>(stop[i]);
} }
for (int j = 1; j < cols - 1; ++j) { for (int j = 1; j < cols - 1; ++j) {
offset_y[j] = cast::to<T>(start[i] + double(j) * delta); offset_y[j] = convert::To<T>(start[i] + double(j) * delta);
} }
} }
} }
......
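The LinSpace kernels above write start and stop exactly and fill the interior as start + j * delta with delta = (stop - start) / (n - 1), so the endpoints are exact regardless of rounding in the interior points. A host-side sketch of the same scheme:

#include <cstdio>
#include <vector>

std::vector<double> LinSpace(double start, double stop, int n) {
  std::vector<double> y(n);
  y[0] = start;  // endpoints are written exactly
  if (n > 1) y[n - 1] = stop;
  const double delta = (stop - start) / double(n - 1);
  for (int j = 1; j < n - 1; ++j) {
    y[j] = start + double(j) * delta;
  }
  return y;
}

int main() {
  for (double v : LinSpace(0.0, 1.0, 5)) std::printf("%g ", v);  // 0 0.25 0.5 0.75 1
  std::printf("\n");
  return 0;
}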
#include "dragon/utils/cast.h" #include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/omp_utils.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -61,7 +60,7 @@ void _MaskedSelectGrad( ...@@ -61,7 +60,7 @@ void _MaskedSelectGrad(
const ValueType* dy, \ const ValueType* dy, \
ValueType* dx, \ ValueType* dx, \
CPUContext* ctx) { \ CPUContext* ctx) { \
math::Set(count, cast::to<ValueType>(0.f), dx, ctx); \ math::Set(count, convert::To<ValueType>(0.f), dx, ctx); \
_MaskedSelectGrad(num_selected, index, dy, dx); \ _MaskedSelectGrad(num_selected, index, dy, dx); \
} }
......
#ifdef USE_CUDA #ifdef USE_CUDA
#include "dragon/core/context_cuda.h" #include "dragon/core/context_cuda.h"
#include "dragon/utils/cast.h" #include "dragon/utils/conversions.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
...@@ -61,7 +61,7 @@ __global__ void _MaskedSelectGrad( ...@@ -61,7 +61,7 @@ __global__ void _MaskedSelectGrad(
const ValueType* dy, \ const ValueType* dy, \
ValueType* dx, \ ValueType* dx, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
math::Set(count, cast::to<ValueType>(0.f), dx, ctx); \ math::Set(count, convert::To<ValueType>(0.f), dx, ctx); \
_MaskedSelectGrad<<< \ _MaskedSelectGrad<<< \
CUDA_BLOCKS(num_selected), \ CUDA_BLOCKS(num_selected), \
CUDA_THREADS, \ CUDA_THREADS, \
......
#include "dragon/utils/omp_utils.h" #include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
#include "dragon/utils/cast.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
...@@ -30,7 +29,7 @@ void _ConstPad( ...@@ -30,7 +29,7 @@ void _ConstPad(
xi += r * x_strides[d]; xi += r * x_strides[d];
} }
y[yi] = d >= 0 ? value : x[xi]; y[yi] = d >= 0 ? value : x[xi];
utils::math::IncreaseIndexInDims(num_dims, y_dims, index.data()); math::utils::IncreaseIndexInDims(num_dims, y_dims, index.data());
} }
} }
...@@ -56,7 +55,7 @@ void _ReflectPad( ...@@ -56,7 +55,7 @@ void _ReflectPad(
xi += r * x_strides[d]; xi += r * x_strides[d];
} }
y[yi] = x[xi]; y[yi] = x[xi];
utils::math::IncreaseIndexInDims(num_dims, y_dims, index.data()); math::utils::IncreaseIndexInDims(num_dims, y_dims, index.data());
} }
} }
...@@ -80,7 +79,7 @@ void _EdgePad( ...@@ -80,7 +79,7 @@ void _EdgePad(
xi += r * x_strides[d]; xi += r * x_strides[d];
} }
y[yi] = x[xi]; y[yi] = x[xi];
utils::math::IncreaseIndexInDims(num_dims, y_dims, index.data()); math::utils::IncreaseIndexInDims(num_dims, y_dims, index.data());
} }
} }
...@@ -115,7 +114,14 @@ void _EdgePad( ...@@ -115,7 +114,14 @@ void _EdgePad(
T* y, \ T* y, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_ConstPad( \ _ConstPad( \
num_dims, x_dims, x_strides, y_dims, pads, cast::to<T>(value), x, y); \ num_dims, \
x_dims, \
x_strides, \
y_dims, \
pads, \
convert::To<T>(value), \
x, \
y); \
} }
DEFINE_CONST_KERNEL_LAUNCHER(bool); DEFINE_CONST_KERNEL_LAUNCHER(bool);
......
#ifdef USE_CUDA #ifdef USE_CUDA
#include "dragon/core/context_cuda.h" #include "dragon/core/context_cuda.h"
#include "dragon/utils/cast.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
...@@ -114,7 +113,7 @@ __global__ void _EdgePad( ...@@ -114,7 +113,7 @@ __global__ void _EdgePad(
X_strides, \ X_strides, \
Y_dims, \ Y_dims, \
X_pads, \ X_pads, \
cast::to<T>(value), \ convert::To<T>(value), \
x, \ x, \
y); \ y); \
} }
......
#include "dragon/utils/cast.h" #include "dragon/utils/conversions.h"
#include "dragon/utils/omp_utils.h" #include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -14,7 +14,7 @@ void _Range(const int count, const double start, const double delta, T* y) { ...@@ -14,7 +14,7 @@ void _Range(const int count, const double start, const double delta, T* y) {
#pragma omp parallel for num_threads(OMP_THREADS(count)) #pragma omp parallel for num_threads(OMP_THREADS(count))
#endif #endif
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
y[i] = cast::to<T>(start + double(i) * delta); y[i] = convert::To<T>(start + double(i) * delta);
} }
} }
......
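Several CPU launchers in this commit guard their loops with #pragma omp parallel for num_threads(OMP_THREADS(count)), and the new dragon.get_num_threads / dragon.set_num_threads API exposes the thread setting at the Python level. A sketch of one reasonable capping rule, where the thread count is limited by both the configured maximum and the amount of work (the grain size and helper name are assumptions, not dragon's exact formula):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#ifdef _OPENMP
#include <omp.h>
#endif

// Pick a thread count: never more than the configured maximum, and never
// more threads than there are reasonably sized chunks of work.
int NumThreadsFor(int64_t count, int64_t grain = 8192) {
#ifdef _OPENMP
  int max_threads = omp_get_max_threads();
#else
  int max_threads = 1;
#endif
  int64_t chunks = std::max<int64_t>(1, count / grain);
  return static_cast<int>(std::min<int64_t>(max_threads, chunks));
}

int main() {
  std::printf("threads for 1e3 elements: %d\n", NumThreadsFor(1000));
  std::printf("threads for 1e7 elements: %d\n", NumThreadsFor(10000000));
  return 0;
}

A launcher would then parallelize with #pragma omp parallel for num_threads(NumThreadsFor(count)).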
...@@ -26,7 +26,7 @@ void _ReduceSumGrad( ...@@ -26,7 +26,7 @@ void _ReduceSumGrad(
yi += (index[d] % y_dims[d]) * y_strides[d]; yi += (index[d] % y_dims[d]) * y_strides[d];
} }
dx[xi] = dy[yi] * scale; dx[xi] = dy[yi] * scale;
utils::math::IncreaseIndexInDims(num_dims, x_dims, index.data()); math::utils::IncreaseIndexInDims(num_dims, x_dims, index.data());
} }
} }
......
#include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/omp_utils.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
...@@ -25,7 +25,7 @@ void _Slice( ...@@ -25,7 +25,7 @@ void _Slice(
xi += (index[d] + starts[d]) * x_strides[d]; xi += (index[d] + starts[d]) * x_strides[d];
} }
y[yi] = x[xi]; y[yi] = x[xi];
utils::math::IncreaseIndexInDims(num_dims, y_dims, index.data()); math::utils::IncreaseIndexInDims(num_dims, y_dims, index.data());
} }
} }
...@@ -47,7 +47,7 @@ void _SliceGrad( ...@@ -47,7 +47,7 @@ void _SliceGrad(
xi += (index[d] + starts[d]) * x_strides[d]; xi += (index[d] + starts[d]) * x_strides[d];
} }
dx[xi] = dy[yi]; dx[xi] = dy[yi];
utils::math::IncreaseIndexInDims(num_dims, y_dims, index.data()); math::utils::IncreaseIndexInDims(num_dims, y_dims, index.data());
} }
} }
......
...@@ -25,7 +25,7 @@ void _Tile( ...@@ -25,7 +25,7 @@ void _Tile(
xi += (index[d] % x_dims[d]) * x_strides[d]; xi += (index[d] % x_dims[d]) * x_strides[d];
} }
y[i] = x[xi]; y[i] = x[xi];
utils::math::IncreaseIndexInDims(num_dims, y_dims, index.data()); math::utils::IncreaseIndexInDims(num_dims, y_dims, index.data());
} }
} }
......
@@ -162,7 +162,7 @@ __global__ void _SelectViaDeviceSort(
 /* ------------------- Launcher Separator ------------------- */
-#define PLACE_BLOCK_SORT_CASE(T, items_per_thread) \
+#define BLOCKSORT_KERNEL(T, items_per_thread) \
   _SelectViaBlockSort<T, items_per_thread> \
       <<<CUDA_2D_BLOCKS(rows), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
           rows, \
@@ -175,15 +175,15 @@ __global__ void _SelectViaDeviceSort(
           reinterpret_cast<T*>(value), \
           index)
-#define PLACE_BLOCK_SORT_CASES(T) \
+#define DISPATCH_BLOCKSORT_KERNEL(T) \
   if (cols <= CUDA_THREADS) { \
-    PLACE_BLOCK_SORT_CASE(T, 1); \
+    BLOCKSORT_KERNEL(T, 1); \
   } else if (cols <= CUDA_THREADS * 2) { \
-    PLACE_BLOCK_SORT_CASE(T, 2); \
+    BLOCKSORT_KERNEL(T, 2); \
   } else if (cols <= CUDA_THREADS * 4) { \
-    PLACE_BLOCK_SORT_CASE(T, 4); \
+    BLOCKSORT_KERNEL(T, 4); \
   } else if (cols <= CUDA_THREADS * 8) { \
-    PLACE_BLOCK_SORT_CASE(T, 8); \
+    BLOCKSORT_KERNEL(T, 8); \
   } else { \
     LOG(FATAL) << "Too larger dimension (> " << CUDA_THREADS * 8 \
                << ") to launch the cuda kernel"; \
@@ -238,7 +238,7 @@ __global__ void _SelectViaDeviceSort(
     return; \
   } \
   T2 init = largest > 0 ? kLowest : kMax; \
-  PLACE_BLOCK_SORT_CASES(T2); \
+  DISPATCH_BLOCKSORT_KERNEL(T2); \
 }
 DEFINE_KERNEL_LAUNCHER(
@@ -277,8 +277,8 @@ DEFINE_KERNEL_LAUNCHER(
     std::numeric_limits<double>::lowest(),
     std::numeric_limits<double>::max());
-#undef PLACE_BLOCK_SORT_CASE
-#undef PLACE_BLOCK_SORT_CASES
+#undef BLOCKSORT_KERNEL
+#undef DISPATCH_BLOCKSORT_KERNEL
 #undef DEFINE_KERNEL_LAUNCHER
 } // namespace kernel
......
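DISPATCH_BLOCKSORT_KERNEL above picks how many items each thread keeps in registers (1, 2, 4, or 8) so that CUDA_THREADS * items_per_thread covers one row, and aborts beyond 8x. The same selection as a plain function (the 256-thread block size is a stand-in for CUDA_THREADS):

#include <cstdio>

constexpr int kThreadsPerBlock = 256;  // stand-in for CUDA_THREADS

// Return the items-per-thread bucket used to block-sort `cols` elements,
// or -1 if the row is too large for a single block sort.
int BlockSortItemsPerThread(int cols) {
  if (cols <= kThreadsPerBlock) return 1;
  if (cols <= kThreadsPerBlock * 2) return 2;
  if (cols <= kThreadsPerBlock * 4) return 4;
  if (cols <= kThreadsPerBlock * 8) return 8;
  return -1;  // the real launcher LOG(FATAL)s here
}

int main() {
  for (int cols : {100, 300, 900, 2000, 5000}) {
    std::printf("cols=%d -> items_per_thread=%d\n", cols,
                BlockSortItemsPerThread(cols));
  }
  return 0;
}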
...@@ -24,7 +24,7 @@ void _Transpose( ...@@ -24,7 +24,7 @@ void _Transpose(
xi += index[d] * x_strides[d]; xi += index[d] * x_strides[d];
} }
y[yi] = x[xi]; y[yi] = x[xi];
utils::math::IncreaseIndexInDims(num_dims, y_dims, index.data()); math::utils::IncreaseIndexInDims(num_dims, y_dims, index.data());
} }
} }
...@@ -45,7 +45,7 @@ void _TransposeGrad( ...@@ -45,7 +45,7 @@ void _TransposeGrad(
xi += index[d] * x_strides[d]; xi += index[d] * x_strides[d];
} }
dx[xi] = dy[yi]; dx[xi] = dy[yi];
utils::math::IncreaseIndexInDims(num_dims, y_dims, index.data()); math::utils::IncreaseIndexInDims(num_dims, y_dims, index.data());
} }
} }
......
...@@ -25,7 +25,7 @@ void _Assign( ...@@ -25,7 +25,7 @@ void _Assign(
yi += (index[d] + starts[d]) * y_strides[d]; yi += (index[d] + starts[d]) * y_strides[d];
} }
y[yi] = x[i]; y[yi] = x[i];
utils::math::IncreaseIndexInDims(num_dims, x_dims, index.data()); math::utils::IncreaseIndexInDims(num_dims, x_dims, index.data());
} }
} }
......
...@@ -19,7 +19,7 @@ void _BroadcastLossGrad( ...@@ -19,7 +19,7 @@ void _BroadcastLossGrad(
const int count = outer_dim * axis_dim * inner_dim; const int count = outer_dim * axis_dim * inner_dim;
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
dx[i] *= dy[idx[0] * inner_dim + idx[2]]; dx[i] *= dy[idx[0] * inner_dim + idx[2]];
utils::math::IncreaseIndexInDims(3, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(3, dims.data(), idx.data());
} }
} }
...@@ -93,7 +93,7 @@ void BroadcastLossGrad<float16, CPUContext>( ...@@ -93,7 +93,7 @@ void BroadcastLossGrad<float16, CPUContext>(
num_masks > 0 && normalizer < 0.f \ num_masks > 0 && normalizer < 0.f \
? (float)math::Sum(num_masks, 1.f, mask, ctx) \ ? (float)math::Sum(num_masks, 1.f, mask, ctx) \
: normalizer); \ : normalizer); \
math::Scale(count, cast::to<float>(dy[0]) / inv_scale, dx, dx, ctx); \ math::Scale(count, convert::To<float>(dy[0]) / inv_scale, dx, dx, ctx); \
} \ } \
template <> \ template <> \
void BroadcastLossGrad<T, CPUContext>( \ void BroadcastLossGrad<T, CPUContext>( \
......
...@@ -28,7 +28,7 @@ void _NLLLoss( ...@@ -28,7 +28,7 @@ void _NLLLoss(
k = (idx[0] * axis_dim + label) * inner_dim + idx[1]; k = (idx[0] * axis_dim + label) * inner_dim + idx[1];
loss[i] = -logit[k], mask[i] = LogitType(1); loss[i] = -logit[k], mask[i] = LogitType(1);
} }
utils::math::IncreaseIndexInDims(2, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(2, dims.data(), idx.data());
} }
} }
...@@ -53,7 +53,7 @@ void _NLLLossGrad( ...@@ -53,7 +53,7 @@ void _NLLLossGrad(
k = (idx[0] * axis_dim + label) * inner_dim + idx[1]; k = (idx[0] * axis_dim + label) * inner_dim + idx[1];
dlogit[k] = LogitType(-1), mask[i] = LogitType(1); dlogit[k] = LogitType(-1), mask[i] = LogitType(1);
} }
utils::math::IncreaseIndexInDims(2, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(2, dims.data(), idx.data());
} }
} }
......
#include "dragon/utils/omp_utils.h" #include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
...@@ -48,7 +48,7 @@ void _SigmoidFocalLoss( ...@@ -48,7 +48,7 @@ void _SigmoidFocalLoss(
loss[i] += -c2 * neg_term * neg_alpha; loss[i] += -c2 * neg_term * neg_alpha;
mask[i] = c1; mask[i] = c1;
utils::math::IncreaseIndexInDims(3, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(3, dims.data(), idx.data());
} }
} }
...@@ -96,7 +96,7 @@ void _SigmoidFocalLossGrad( ...@@ -96,7 +96,7 @@ void _SigmoidFocalLossGrad(
dx[i] += -c2 * neg_term * neg_alpha; dx[i] += -c2 * neg_term * neg_alpha;
mask[i] = c1; mask[i] = c1;
utils::math::IncreaseIndexInDims(3, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(3, dims.data(), idx.data());
} }
} }
......
#include "dragon/utils/eigen_utils.h" #include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
#include "dragon/utils/eigen_utils.h" #include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
...@@ -29,7 +29,7 @@ void _SparseSoftmaxCrossEntropy( ...@@ -29,7 +29,7 @@ void _SparseSoftmaxCrossEntropy(
loss[i] = -std::log(std::max(prob[k], LogitType(FLT_MIN))); loss[i] = -std::log(std::max(prob[k], LogitType(FLT_MIN)));
mask[i] = LogitType(1); mask[i] = LogitType(1);
} }
utils::math::IncreaseIndexInDims(2, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(2, dims.data(), idx.data());
} }
} }
...@@ -60,7 +60,7 @@ void _SparseSoftmaxCrossEntropyGrad( ...@@ -60,7 +60,7 @@ void _SparseSoftmaxCrossEntropyGrad(
dx[k] -= LogitType(1); dx[k] -= LogitType(1);
mask[i] = LogitType(1); mask[i] = LogitType(1);
} }
utils::math::IncreaseIndexInDims(2, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(2, dims.data(), idx.data());
} }
} }
......
#include "dragon/utils/cast.h" #include "dragon/utils/conversions.h"
#include "dragon/utils/eigen_utils.h" #include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/omp_utils.h" #include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -22,15 +22,15 @@ void _Clip<float16>( ...@@ -22,15 +22,15 @@ void _Clip<float16>(
const float16 high, const float16 high,
const float16* x, const float16* x,
float16* y) { float16* y) {
auto lowf = cast::to<float>(low); auto lowf = convert::To<float>(low);
auto highf = cast::to<float>(high); auto highf = convert::To<float>(high);
#ifdef USE_OPENMP #ifdef USE_OPENMP
#pragma omp parallel for num_threads(OMP_THREADS(count)) #pragma omp parallel for num_threads(OMP_THREADS(count))
#endif #endif
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
auto val = cast::to<float>(x[i]); auto val = convert::To<float>(x[i]);
val = std::max(lowf, std::min(val, highf)); val = std::max(lowf, std::min(val, highf));
y[i] = cast::to<float16>(val); y[i] = convert::To<float16>(val);
} }
} }
...@@ -56,14 +56,14 @@ void _ClipGrad<float16>( ...@@ -56,14 +56,14 @@ void _ClipGrad<float16>(
const float16* dy, const float16* dy,
const float16* x, const float16* x,
float16* dx) { float16* dx) {
auto lowf = cast::to<float>(low); auto lowf = convert::To<float>(low);
auto highf = cast::to<float>(high); auto highf = convert::To<float>(high);
auto kZero = cast::to<float16>(0.f); auto kZero = convert::To<float16>(0.f);
#ifdef USE_OPENMP #ifdef USE_OPENMP
#pragma omp parallel for num_threads(OMP_THREADS(count)) #pragma omp parallel for num_threads(OMP_THREADS(count))
#endif #endif
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
auto val = cast::to<float>(x[i]); auto val = convert::To<float>(x[i]);
dx[i] = (val < lowf || val > highf) ? kZero : dy[i]; dx[i] = (val < lowf || val > highf) ? kZero : dy[i];
} }
} // ClipGrad } // ClipGrad
...@@ -81,7 +81,7 @@ void _ClipGrad<float16>( ...@@ -81,7 +81,7 @@ void _ClipGrad<float16>(
const T* x, \ const T* x, \
T* y, \ T* y, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_Clip(count, cast::to<T>(low), cast::to<T>(high), x, y); \ _Clip(count, convert::To<T>(low), convert::To<T>(high), x, y); \
} }
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
...@@ -94,7 +94,7 @@ void _ClipGrad<float16>( ...@@ -94,7 +94,7 @@ void _ClipGrad<float16>(
const T* x, \ const T* x, \
T* dx, \ T* dx, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_ClipGrad(count, cast::to<T>(low), cast::to<T>(high), dy, x, dx); \ _ClipGrad(count, convert::To<T>(low), convert::To<T>(high), dy, x, dx); \
} }
DEFINE_KERNEL_LAUNCHER(int8_t); DEFINE_KERNEL_LAUNCHER(int8_t);
......
#ifdef USE_CUDA #ifdef USE_CUDA
#include "dragon/core/context_cuda.h" #include "dragon/core/context_cuda.h"
#include "dragon/utils/cast.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -104,8 +103,8 @@ void Clip<float16, CUDAContext>( ...@@ -104,8 +103,8 @@ void Clip<float16, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
_Clip<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( _Clip<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
count, count,
cast::to<half>(low), convert::To<half>(low),
cast::to<half>(high), convert::To<half>(high),
reinterpret_cast<const half*>(x), reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y)); reinterpret_cast<half*>(y));
} }
...@@ -121,8 +120,8 @@ void ClipGrad<float16, CUDAContext>( ...@@ -121,8 +120,8 @@ void ClipGrad<float16, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
_ClipGrad<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( _ClipGrad<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
count, count,
cast::to<half>(low), convert::To<half>(low),
cast::to<half>(high), convert::To<half>(high),
reinterpret_cast<const half*>(dy), reinterpret_cast<const half*>(dy),
reinterpret_cast<const half*>(x), reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(dx)); reinterpret_cast<half*>(dx));
...@@ -138,7 +137,7 @@ void ClipGrad<float16, CUDAContext>( ...@@ -138,7 +137,7 @@ void ClipGrad<float16, CUDAContext>(
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_Clip<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ _Clip<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, cast::to<T>(low), cast::to<T>(high), x, y); \ count, convert::To<T>(low), convert::To<T>(high), x, y); \
} }
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
...@@ -152,7 +151,7 @@ void ClipGrad<float16, CUDAContext>( ...@@ -152,7 +151,7 @@ void ClipGrad<float16, CUDAContext>(
T* dx, \ T* dx, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_ClipGrad<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ _ClipGrad<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, cast::to<T>(low), cast::to<T>(high), dy, x, dx); \ count, convert::To<T>(low), convert::To<T>(high), dy, x, dx); \
} }
DEFINE_KERNEL_LAUNCHER(int8_t); DEFINE_KERNEL_LAUNCHER(int8_t);
......
#include "dragon/utils/eigen_utils.h" #include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
...@@ -70,7 +70,7 @@ template <typename T> ...@@ -70,7 +70,7 @@ template <typename T>
__global__ void __global__ void
_ReciprocalGrad(const int nthreads, const T* dy, const T* y, T* dx) { _ReciprocalGrad(const int nthreads, const T* dy, const T* y, T* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx[i] = -dy[i] * utils::math::Square(y[i]); dx[i] = -dy[i] * math::utils::Square(y[i]);
} }
} }
...@@ -82,7 +82,7 @@ __global__ void _ReciprocalGrad<half>( ...@@ -82,7 +82,7 @@ __global__ void _ReciprocalGrad<half>(
half* dx) { half* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx[i] = __float2half( dx[i] = __float2half(
-__half2float(dy[i]) * utils::math::Square(__half2float(y[i]))); -__half2float(dy[i]) * math::utils::Square(__half2float(y[i])));
} }
} }
...@@ -103,7 +103,7 @@ __global__ void _ReciprocalGrad<half2>( ...@@ -103,7 +103,7 @@ __global__ void _ReciprocalGrad<half2>(
template <typename T> template <typename T>
__global__ void _RsqrtGrad(const int nthreads, const T* dy, const T* y, T* dx) { __global__ void _RsqrtGrad(const int nthreads, const T* dy, const T* y, T* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx[i] = T(-0.5) * dy[i] * utils::math::Cube(y[i]); dx[i] = T(-0.5) * dy[i] * math::utils::Cube(y[i]);
} }
} }
...@@ -112,7 +112,7 @@ __global__ void ...@@ -112,7 +112,7 @@ __global__ void
_RsqrtGrad<half>(const int nthreads, const half* dy, const half* y, half* dx) { _RsqrtGrad<half>(const int nthreads, const half* dy, const half* y, half* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx[i] = __float2half( dx[i] = __float2half(
-0.5f * __half2float(dy[i]) * utils::math::Cube(__half2float(y[i]))); -0.5f * __half2float(dy[i]) * math::utils::Cube(__half2float(y[i])));
} }
} }
......
#include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/omp_utils.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -106,14 +106,14 @@ void _Moments( ...@@ -106,14 +106,14 @@ void _Moments(
y_dims[axes[i]] = 1; y_dims[axes[i]] = 1;
// Case #1: Rowwise Reduce // Case #1: Rowwise Reduce
if (utils::math::IsRowwiseReduce( if (math::utils::IsRowwiseReduce(
num_dims, dims, y_dims.data(), &rows, &cols)) { num_dims, dims, y_dims.data(), &rows, &cols)) {
_RowwiseMoments(rows, cols, x, mean, var); _RowwiseMoments(rows, cols, x, mean, var);
return; return;
} }
// Case #2: Colwise Reduce // Case #2: Colwise Reduce
if (utils::math::IsColwiseReduce( if (math::utils::IsColwiseReduce(
num_dims, dims, y_dims.data(), &rows, &cols)) { num_dims, dims, y_dims.data(), &rows, &cols)) {
_ColwiseMoments(rows, cols, x, mean, var); _ColwiseMoments(rows, cols, x, mean, var);
return; return;
...@@ -121,8 +121,8 @@ void _Moments( ...@@ -121,8 +121,8 @@ void _Moments(
// Case #3: Generic Reduce // Case #3: Generic Reduce
vec32_t axesT(num_dims), stridesT(num_dims), dimsT(num_dims); vec32_t axesT(num_dims), stridesT(num_dims), dimsT(num_dims);
utils::math::TransposeAxesForReduce(num_dims, num_axes, axes, axesT.data()); math::utils::TransposeAxesForReduce(num_dims, num_axes, axes, axesT.data());
utils::math::ComputeTransposeStrides( math::utils::ComputeTransposeStrides(
num_dims, dims, axesT.data(), stridesT.data()); num_dims, dims, axesT.data(), stridesT.data());
rows = cols = 1; rows = cols = 1;
......
#ifdef USE_CUDA #ifdef USE_CUDA
#include "dragon/core/context_cuda.h" #include "dragon/core/context_cuda.h"
#include "dragon/utils/cast.h" #include "dragon/utils/conversions.h"
#include "dragon/utils/device/common_cub.h" #include "dragon/utils/device/common_cub.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
...@@ -28,10 +28,10 @@ __global__ void _RowwiseMoments( ...@@ -28,10 +28,10 @@ __global__ void _RowwiseMoments(
const int xi = j * cols + i; const int xi = j * cols + i;
#if __CUDA_ARCH__ >= 350 #if __CUDA_ARCH__ >= 350
m_val += __ldg(x + xi); m_val += __ldg(x + xi);
v_val += utils::math::Square(__ldg(x + xi)); v_val += math::utils::Square(__ldg(x + xi));
#else #else
m_val += x[xi]; m_val += x[xi];
v_val += utils::math::Square(x[xi]); v_val += math::utils::Square(x[xi]);
#endif #endif
} }
m_val = BlockReduce<Ty>(m_storage).Sum(m_val); m_val = BlockReduce<Ty>(m_storage).Sum(m_val);
...@@ -59,7 +59,7 @@ __global__ void _RowwiseMoments<half, float>( ...@@ -59,7 +59,7 @@ __global__ void _RowwiseMoments<half, float>(
CUDA_2D_KERNEL_LOOP2(j, rows) { CUDA_2D_KERNEL_LOOP2(j, rows) {
const int xi = j * cols + i; const int xi = j * cols + i;
m_val += __half2float(__ldg(x + xi)); m_val += __half2float(__ldg(x + xi));
v_val += utils::math::Square(__half2float(__ldg(x + xi))); v_val += math::utils::Square(__half2float(__ldg(x + xi)));
} }
m_val = BlockReduce<float>(m_storage).Sum(m_val); m_val = BlockReduce<float>(m_storage).Sum(m_val);
v_val = BlockReduce<float>(v_storage).Sum(v_val); v_val = BlockReduce<float>(v_storage).Sum(v_val);
...@@ -87,10 +87,10 @@ __global__ void _ColwiseMoments( ...@@ -87,10 +87,10 @@ __global__ void _ColwiseMoments(
const int xi = i * cols + j; const int xi = i * cols + j;
#if __CUDA_ARCH__ >= 350 #if __CUDA_ARCH__ >= 350
m_val += __ldg(x + xi); m_val += __ldg(x + xi);
v_val += utils::math::Square(__ldg(x + xi)); v_val += math::utils::Square(__ldg(x + xi));
#else #else
m_val += x[xi]; m_val += x[xi];
v_val += utils::math::Square(x[xi]); v_val += math::utils::Square(x[xi]);
#endif #endif
} }
m_val = BlockReduce<Ty>(m_storage).Sum(m_val); m_val = BlockReduce<Ty>(m_storage).Sum(m_val);
...@@ -118,7 +118,7 @@ __global__ void _ColwiseMoments<half, float>( ...@@ -118,7 +118,7 @@ __global__ void _ColwiseMoments<half, float>(
CUDA_2D_KERNEL_LOOP2(j, cols) { CUDA_2D_KERNEL_LOOP2(j, cols) {
const int xi = i * cols + j; const int xi = i * cols + j;
m_val += __half2float(__ldg(x + xi)); m_val += __half2float(__ldg(x + xi));
v_val += utils::math::Square(__half2float(__ldg(x + xi))); v_val += math::utils::Square(__half2float(__ldg(x + xi)));
} }
m_val = BlockReduce<float>(m_storage).Sum(m_val); m_val = BlockReduce<float>(m_storage).Sum(m_val);
v_val = BlockReduce<float>(v_storage).Sum(v_val); v_val = BlockReduce<float>(v_storage).Sum(v_val);
...@@ -154,10 +154,10 @@ __global__ void _GenericMoments( ...@@ -154,10 +154,10 @@ __global__ void _GenericMoments(
} }
#if __CUDA_ARCH__ >= 350 #if __CUDA_ARCH__ >= 350
m_val += __ldg(x + xi); m_val += __ldg(x + xi);
v_val += utils::math::Square(__ldg(x + xi)); v_val += math::utils::Square(__ldg(x + xi));
#else #else
m_val += x[xi]; m_val += x[xi];
v_val += utils::math::Square(x[xi]); v_val += math::utils::Square(x[xi]);
#endif #endif
} }
m_val = BlockReduce<Ty>(m_storage).Sum(m_val); m_val = BlockReduce<Ty>(m_storage).Sum(m_val);
...@@ -194,10 +194,10 @@ __global__ void _GenericMoments( ...@@ -194,10 +194,10 @@ __global__ void _GenericMoments(
} }
#if __CUDA_ARCH__ >= 350 #if __CUDA_ARCH__ >= 350
m_val += __half2float(__ldg(x + xi)); m_val += __half2float(__ldg(x + xi));
v_val += utils::math::Square(__half2float(__ldg(x + xi))); v_val += math::utils::Square(__half2float(__ldg(x + xi)));
#else #else
m_val += __half2float(x[xi]); m_val += __half2float(x[xi]);
v_val += utils::math::Square(__half2float(x[xi])); v_val += math::utils::Square(__half2float(x[xi]));
#endif #endif
} }
m_val = BlockReduce<float>(m_storage).Sum(m_val); m_val = BlockReduce<float>(m_storage).Sum(m_val);
...@@ -226,7 +226,7 @@ void _Moments( ...@@ -226,7 +226,7 @@ void _Moments(
y_dims[axes[i]] = 1; y_dims[axes[i]] = 1;
/*! Case #1: Rowwise Reduce */ /*! Case #1: Rowwise Reduce */
if (utils::math::IsRowwiseReduce( if (math::utils::IsRowwiseReduce(
num_dims, dims, y_dims.data(), &rows, &cols)) { num_dims, dims, y_dims.data(), &rows, &cols)) {
_RowwiseMoments<<< _RowwiseMoments<<<
CUDA_2D_BLOCKS(cols), CUDA_2D_BLOCKS(cols),
...@@ -237,7 +237,7 @@ void _Moments( ...@@ -237,7 +237,7 @@ void _Moments(
} }
/*! Case #2: Colwise Reduce */ /*! Case #2: Colwise Reduce */
if (utils::math::IsColwiseReduce( if (math::utils::IsColwiseReduce(
num_dims, dims, y_dims.data(), &rows, &cols)) { num_dims, dims, y_dims.data(), &rows, &cols)) {
_ColwiseMoments<<< _ColwiseMoments<<<
CUDA_2D_BLOCKS(rows), CUDA_2D_BLOCKS(rows),
...@@ -250,8 +250,8 @@ void _Moments( ...@@ -250,8 +250,8 @@ void _Moments(
/*! Case #3: Generic Reduce */ /*! Case #3: Generic Reduce */
CUDA_TENSOR_DIMS_CHECK(num_dims); CUDA_TENSOR_DIMS_CHECK(num_dims);
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> axesT, stridesT, dimsT; SimpleArray<int, CUDA_TENSOR_MAX_DIMS> axesT, stridesT, dimsT;
utils::math::TransposeAxesForReduce(num_dims, num_axes, axes, axesT.data); math::utils::TransposeAxesForReduce(num_dims, num_axes, axes, axesT.data);
utils::math::ComputeTransposeStrides( math::utils::ComputeTransposeStrides(
num_dims, dims, axesT.data, stridesT.data); num_dims, dims, axesT.data, stridesT.data);
rows = cols = 1; rows = cols = 1;
......
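The moments kernels above accumulate the running sum and sum of squares (math::utils::Square) in FP32 even for half inputs, from which mean and variance follow as E[x] and E[x^2] - E[x]^2. A scalar sketch of that one-pass reduction:

#include <cstdio>
#include <vector>

// One-pass mean/variance via sum and sum of squares, accumulated in a
// wider type than the inputs (here float inputs, double accumulators).
void Moments(const std::vector<float>& x, float* mean, float* var) {
  double m_val = 0.0, v_val = 0.0;
  for (float v : x) {
    m_val += v;
    v_val += double(v) * double(v);
  }
  const double scale = 1.0 / double(x.size());
  const double mu = m_val * scale;
  *mean = float(mu);
  *var = float(v_val * scale - mu * mu);  // E[x^2] - E[x]^2
}

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f, 4.f};
  float mean, var;
  Moments(x, &mean, &var);
  std::printf("mean=%g var=%g\n", mean, var);  // mean=2.5 var=1.25
  return 0;
}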
#include "dragon/utils/eigen_utils.h" #include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
#include "dragon/utils/omp_utils.h" #include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
#include "dragon/utils/omp_utils.h" #include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
#include "dragon/utils/omp_utils.h" #include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
#include "dragon/utils/omp_utils.h" #include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
#include "dragon/utils/omp_utils.h" #include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
#include "dragon/utils/cast.h" #include "dragon/utils/conversions.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
...@@ -47,7 +47,7 @@ void _AvgPool2dNCHW( ...@@ -47,7 +47,7 @@ void _AvgPool2dNCHW(
for (int w = wstart; w < wend; ++w) for (int w = wstart; w < wend; ++w)
val += offset_x[h * W + w]; val += offset_x[h * W + w];
y[i] = val / area; y[i] = val / area;
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
} }
} }
...@@ -89,7 +89,7 @@ void _AvgPool2dNHWC( ...@@ -89,7 +89,7 @@ void _AvgPool2dNHWC(
for (int w = wstart; w < wend; ++w) for (int w = wstart; w < wend; ++w)
val += offset_x[(h * W + w) * C]; val += offset_x[(h * W + w) * C];
y[i] = val / area; y[i] = val / area;
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
} }
} }
...@@ -130,7 +130,7 @@ void _AvgPool2dGradNCHW( ...@@ -130,7 +130,7 @@ void _AvgPool2dGradNCHW(
for (int h = hstart; h < hend; ++h) for (int h = hstart; h < hend; ++h)
for (int w = wstart; w < wend; ++w) for (int w = wstart; w < wend; ++w)
offset_dx[h * W + w] += dy[i] / area; offset_dx[h * W + w] += dy[i] / area;
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
} }
} }
...@@ -170,7 +170,7 @@ void _AvgPool2dGradNHWC( ...@@ -170,7 +170,7 @@ void _AvgPool2dGradNHWC(
for (int h = hstart; h < hend; ++h) for (int h = hstart; h < hend; ++h)
for (int w = wstart; w < wend; ++w) for (int w = wstart; w < wend; ++w)
offset_dx[(h * W + w) * C] += dy[i] / area; offset_dx[(h * W + w) * C] += dy[i] / area;
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
} }
} }
...@@ -253,7 +253,7 @@ void _AvgPool2dGradNHWC( ...@@ -253,7 +253,7 @@ void _AvgPool2dGradNHWC(
const T* dy, \ const T* dy, \
T* dx, \ T* dx, \
CPUContext* ctx) { \ CPUContext* ctx) { \
math::Set(N* C* H* W, cast::to<T>(0.f), dx, ctx); \ math::Set(N* C* H* W, convert::To<T>(0.f), dx, ctx); \
if (data_format == "NCHW") { \ if (data_format == "NCHW") { \
_AvgPool2dGradNCHW( \ _AvgPool2dGradNCHW( \
N, \ N, \
......
#include "dragon/utils/eigen_utils.h" #include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
#include "dragon/utils/cast.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
......
...@@ -117,6 +117,39 @@ void _DepthwiseConv2dNHWC( ...@@ -117,6 +117,39 @@ void _DepthwiseConv2dNHWC(
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
#define DISPATCH_DATA_KERNEL(name, ...) \
if (data_format == "NCHW") { \
name##NCHW(__VA_ARGS__); \
} else if (data_format == "NHWC") { \
name##NHWC(__VA_ARGS__); \
} else { \
LOG(FATAL) << "Unknown DataFormat: " << data_format; \
}
template <>
void DepthwiseConv2d<float16, CPUContext>(
const int N,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const int kernel_h,
const int kernel_w,
const int stride_h,
const int stride_w,
const int pad_h,
const int pad_w,
const int dilation_h,
const int dilation_w,
const string& data_format,
const float16* x,
const float16* w,
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
template <> template <>
void DepthwiseConv2d<float, CPUContext>( void DepthwiseConv2d<float, CPUContext>(
const int N, const int N,
...@@ -138,27 +171,8 @@ void DepthwiseConv2d<float, CPUContext>( ...@@ -138,27 +171,8 @@ void DepthwiseConv2d<float, CPUContext>(
const float* w, const float* w,
float* y, float* y,
CPUContext* ctx) { CPUContext* ctx) {
if (data_format == "NCHW") { DISPATCH_DATA_KERNEL(
_DepthwiseConv2dNCHW( _DepthwiseConv2d,
N,
C,
H,
W,
out_h,
out_w,
kernel_h,
kernel_w,
stride_h,
stride_w,
pad_h,
pad_w,
dilation_h,
dilation_w,
x,
w,
y);
} else {
_DepthwiseConv2dNHWC(
N, N,
C, C,
H, H,
...@@ -176,56 +190,59 @@ void DepthwiseConv2d<float, CPUContext>( ...@@ -176,56 +190,59 @@ void DepthwiseConv2d<float, CPUContext>(
x, x,
w, w,
y); y);
}
} }
template <> #define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
void DepthwiseConv2dGrad<float, CPUContext>( template <> \
const int N, void DepthwiseConv2dGrad<T, CPUContext>( \
const int C, const int N, \
const int H, const int C, \
const int W, const int H, \
const int out_h, const int W, \
const int out_w, const int out_h, \
const int kernel_h, const int out_w, \
const int kernel_w, const int kernel_h, \
const int stride_h, const int kernel_w, \
const int stride_w, const int stride_h, \
const int pad_h, const int stride_w, \
const int pad_w, const int pad_h, \
const int dilation_h, const int pad_w, \
const int dilation_w, const int dilation_h, \
const string& data_format, const int dilation_w, \
const float* dy, const string& data_format, \
const float* w, const T* dy, \
float* dx, const T* w, \
CPUContext* ctx) { T* dx, \
NOT_IMPLEMENTED; CPUContext* ctx) { \
} // DepthwiseConv2dGrad NOT_IMPLEMENTED; \
} \
template <> \
void DepthwiseConv2dWGrad<T, CPUContext>( \
const int N, \
const int C, \
const int H, \
const int W, \
const int out_h, \
const int out_w, \
const int kernel_h, \
const int kernel_w, \
const int stride_h, \
const int stride_w, \
const int pad_h, \
const int pad_w, \
const int dilation_h, \
const int dilation_w, \
const string& data_format, \
const T* dy, \
const T* x, \
T* dw, \
CPUContext* ctx) { \
NOT_IMPLEMENTED; \
}
template <> DEFINE_GRAD_KERNEL_LAUNCHER(float16);
void DepthwiseConv2dWGrad<float, CPUContext>( DEFINE_GRAD_KERNEL_LAUNCHER(float);
const int N, #undef DEFINE_GRAD_KERNEL_LAUNCHER
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const int kernel_h,
const int kernel_w,
const int stride_h,
const int stride_w,
const int pad_h,
const int pad_w,
const int dilation_h,
const int dilation_w,
const string& data_format,
const float* dy,
const float* x,
float* dw,
CPUContext* ctx) {
NOT_IMPLEMENTED;
} // DepthwiseConv2dWGrad
} // namespace kernel } // namespace kernel
......
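The hunk above replaces the hand-written NCHW/NHWC branches with DISPATCH_DATA_KERNEL and stamps out the not-implemented CPU gradient launchers with DEFINE_GRAD_KERNEL_LAUNCHER for both float16 and float. Below is a minimal standalone sketch of how such a dispatch macro expands, using a hypothetical _Scale kernel pair rather than Dragon's kernels; it is only an illustration of the pattern.

// Standalone sketch: hypothetical _ScaleNCHW/_ScaleNHWC kernels plus a
// dispatch macro shaped like DISPATCH_DATA_KERNEL above. Not Dragon code.
#include <iostream>
#include <string>
#include <vector>

void _ScaleNCHW(int n, const float* x, float* y) {
  for (int i = 0; i < n; ++i) y[i] = 2.f * x[i];
}

void _ScaleNHWC(int n, const float* x, float* y) {
  for (int i = 0; i < n; ++i) y[i] = 2.f * x[i];
}

// Pick the kernel by data format; std::cerr stands in for LOG(FATAL).
#define DISPATCH_DATA_KERNEL(name, ...)                           \
  if (data_format == "NCHW") {                                    \
    name##NCHW(__VA_ARGS__);                                      \
  } else if (data_format == "NHWC") {                             \
    name##NHWC(__VA_ARGS__);                                      \
  } else {                                                        \
    std::cerr << "Unknown DataFormat: " << data_format << "\n";   \
  }

int main() {
  const std::string data_format = "NCHW";
  std::vector<float> x = {1.f, 2.f, 3.f}, y(3);
  DISPATCH_DATA_KERNEL(_Scale, 3, x.data(), y.data());
  std::cout << y[0] << " " << y[1] << " " << y[2] << "\n";  // 2 4 6
  return 0;
}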
#include "dragon/utils/cast.h" #include "dragon/utils/conversions.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
...@@ -52,7 +52,7 @@ void _MaxPool2dNCHW( ...@@ -52,7 +52,7 @@ void _MaxPool2dNCHW(
} }
y[i] = val; y[i] = val;
mask[i] = mxi; mask[i] = mxi;
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
} }
} }
...@@ -99,7 +99,7 @@ void _MaxPool2dNHWC( ...@@ -99,7 +99,7 @@ void _MaxPool2dNHWC(
} }
y[i] = val; y[i] = val;
mask[i] = mxi; mask[i] = mxi;
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
} }
} }
...@@ -129,7 +129,7 @@ void _MaxPool2dGradNCHW( ...@@ -129,7 +129,7 @@ void _MaxPool2dGradNCHW(
if (mask[i] != -1) { if (mask[i] != -1) {
dx[idx[0] * CHW + idx[1] * HW + mask[i]] += dy[i]; dx[idx[0] * CHW + idx[1] * HW + mask[i]] += dy[i];
} }
utils::math::IncreaseIndexInDims(3, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(3, dims.data(), idx.data());
} }
} }
...@@ -158,7 +158,7 @@ void _MaxPool2dGradNHWC( ...@@ -158,7 +158,7 @@ void _MaxPool2dGradNHWC(
if (mask[i] != -1) { if (mask[i] != -1) {
dx[idx[0] * HWC + mask[i]] += dy[i]; dx[idx[0] * HWC + mask[i]] += dy[i];
} }
utils::math::IncreaseIndexInDims(2, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(2, dims.data(), idx.data());
} }
} }
...@@ -245,7 +245,7 @@ void _MaxPool2dGradNHWC( ...@@ -245,7 +245,7 @@ void _MaxPool2dGradNHWC(
const int* mask, \ const int* mask, \
T* dx, \ T* dx, \
CPUContext* ctx) { \ CPUContext* ctx) { \
math::Set(N* C* H* W, cast::to<T>(0.f), dx, ctx); \ math::Set(N* C* H* W, convert::To<T>(0.f), dx, ctx); \
if (data_format == "NCHW") { \ if (data_format == "NCHW") { \
_MaxPool2dGradNCHW( \ _MaxPool2dGradNCHW( \
N, \ N, \
......
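The pooling kernels above walk every output element with one flat loop while math::utils::IncreaseIndexInDims advances the matching multi-dimensional index. A self-contained sketch of the odometer-style increment such a helper typically performs follows; this is an illustration under that assumption, not Dragon's implementation.

// Illustrative odometer-style index increment: bump the last dimension
// and carry to the left on overflow, once per output element.
#include <array>
#include <cstdint>
#include <iostream>

template <typename IndexType, size_t N>
void IncreaseIndexInDims(const std::array<IndexType, N>& dims,
                         std::array<IndexType, N>& index) {
  for (int i = static_cast<int>(N) - 1; i >= 0; --i) {
    if (++index[i] < dims[i]) return;
    index[i] = 0;  // overflowed this dimension, carry to the next
  }
}

int main() {
  std::array<int64_t, 3> dims = {2, 2, 3}, idx = {0, 0, 0};
  for (int i = 0; i < 2 * 2 * 3; ++i) {
    std::cout << idx[0] << idx[1] << idx[2] << " ";
    IncreaseIndexInDims(dims, idx);
  }
  std::cout << "\n";  // 000 001 002 010 011 012 100 ...
  return 0;
}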
...@@ -62,7 +62,7 @@ void _ResizeLinearNCHW( ...@@ -62,7 +62,7 @@ void _ResizeLinearNCHW(
t = tl + (tr - tl) * u; t = tl + (tr - tl) * u;
b = bl + (br - bl) * u; b = bl + (br - bl) * u;
y[i] = static_cast<T>(t + (b - t) * v); y[i] = static_cast<T>(t + (b - t) * v);
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
} }
} }
...@@ -99,7 +99,7 @@ void _ResizeLinearNHWC( ...@@ -99,7 +99,7 @@ void _ResizeLinearNHWC(
t = tl + (tr - tl) * u; t = tl + (tr - tl) * u;
b = bl + (br - bl) * u; b = bl + (br - bl) * u;
y[i] = static_cast<T>(t + (b - t) * v); y[i] = static_cast<T>(t + (b - t) * v);
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
} }
} }
...@@ -135,7 +135,7 @@ void _ResizeLinearGradNCHW( ...@@ -135,7 +135,7 @@ void _ResizeLinearGradNCHW(
dx[(offset + ti) * W + ri] += u * dt; // tr dx[(offset + ti) * W + ri] += u * dt; // tr
dx[(offset + bi) * W + li] += (1.f - u) * db; // bl dx[(offset + bi) * W + li] += (1.f - u) * db; // bl
dx[(offset + bi) * W + ri] += u * db; // br dx[(offset + bi) * W + ri] += u * db; // br
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
} }
} }
...@@ -171,7 +171,7 @@ void _ResizeLinearGradNHWC( ...@@ -171,7 +171,7 @@ void _ResizeLinearGradNHWC(
dx[((offset + ti) * W + ri) * C + idx[3]] += u * dt; // tr dx[((offset + ti) * W + ri) * C + idx[3]] += u * dt; // tr
dx[((offset + bi) * W + li) * C + idx[3]] += (1.f - u) * db; // bl dx[((offset + bi) * W + li) * C + idx[3]] += (1.f - u) * db; // bl
dx[((offset + bi) * W + ri) * C + idx[3]] += u * db; // br dx[((offset + bi) * W + ri) * C + idx[3]] += u * db; // br
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
} }
} }
......
...@@ -27,7 +27,7 @@ void _ResizeNearestNCHW( ...@@ -27,7 +27,7 @@ void _ResizeNearestNCHW(
h_in = std::min(int(idx[2] * scale_h), h_max); h_in = std::min(int(idx[2] * scale_h), h_max);
w_in = std::min(int(idx[3] * scale_w), w_max); w_in = std::min(int(idx[3] * scale_w), w_max);
y[i] = x[(((idx[0] * C) + idx[1]) * H + h_in) * W + w_in]; y[i] = x[(((idx[0] * C) + idx[1]) * H + h_in) * W + w_in];
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
} }
} }
...@@ -52,7 +52,7 @@ void _ResizeNearestNHWC( ...@@ -52,7 +52,7 @@ void _ResizeNearestNHWC(
w_in = std::min(int(idx[2] * scale_w), w_max); w_in = std::min(int(idx[2] * scale_w), w_max);
memcpy( memcpy(
y + i * C, x + (((idx[0] * H) + h_in) * W + w_in) * C, C * sizeof(T)); y + i * C, x + (((idx[0] * H) + h_in) * W + w_in) * C, C * sizeof(T));
utils::math::IncreaseIndexInDims(3, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(3, dims.data(), idx.data());
} }
} }
...@@ -76,7 +76,7 @@ void _ResizeNearestGradNCHW( ...@@ -76,7 +76,7 @@ void _ResizeNearestGradNCHW(
h_in = std::min(int(idx[2] * scale_h), h_max); h_in = std::min(int(idx[2] * scale_h), h_max);
w_in = std::min(int(idx[3] * scale_w), w_max); w_in = std::min(int(idx[3] * scale_w), w_max);
dx[(((idx[0] * C) + idx[1]) * H + h_in) * W + w_in] += (float)dy[i]; dx[(((idx[0] * C) + idx[1]) * H + h_in) * W + w_in] += (float)dy[i];
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
} }
} }
...@@ -100,7 +100,7 @@ void _ResizeNearestGradNHWC( ...@@ -100,7 +100,7 @@ void _ResizeNearestGradNHWC(
h_in = std::min(int(idx[1] * scale_h), h_max); h_in = std::min(int(idx[1] * scale_h), h_max);
w_in = std::min(int(idx[2] * scale_w), w_max); w_in = std::min(int(idx[2] * scale_w), w_max);
dx[(((idx[0] * H) + h_in) * W + w_in) * C + idx[3]] += (float)dy[i]; dx[(((idx[0] * H) + h_in) * W + w_in) * C + idx[3]] += (float)dy[i];
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
} }
} }
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#define DRAGON_MODULES_PYTHON_CONFIG_H_ #define DRAGON_MODULES_PYTHON_CONFIG_H_
#include "dragon/modules/python/common.h" #include "dragon/modules/python/common.h"
#include "dragon/utils/device/common_eigen.h"
namespace dragon { namespace dragon {
...@@ -22,9 +23,16 @@ namespace python { ...@@ -22,9 +23,16 @@ namespace python {
namespace config { namespace config {
void RegisterModule(py::module& m) { void RegisterModule(py::module& m) {
/*! \brief Set the logging severity */
m.def("SetLoggingLevel", [](const string& severity) { m.def("SetLoggingLevel", [](const string& severity) {
SetLogDestination(severity); SetLogDestination(severity);
}); });
/*! \brief Set the number of threads for cpu parallelism */
m.def("SetNumThreads", [](int num) { Eigen::setNbThreads(num); });
/*! \brief Return the number of threads for cpu parallelism */
m.def("GetNumThreads", []() { return Eigen::nbThreads(); });
} }
} // namespace config } // namespace config
......
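For reference, a minimal standalone sketch of the Eigen calls that the new SetNumThreads/GetNumThreads bindings wrap, assuming Eigen 3 is on the include path; note the setting only takes effect when Eigen is built with multi-threading support (e.g. OpenMP), otherwise nbThreads() reports 1.

// Standalone sketch of the Eigen thread controls used by the bindings above.
#include <Eigen/Core>
#include <iostream>

int main() {
  Eigen::setNbThreads(4);                        // cap Eigen's cpu parallelism
  std::cout << Eigen::nbThreads() << std::endl;  // query the current setting
  return 0;
}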
...@@ -14,7 +14,6 @@ ...@@ -14,7 +14,6 @@
#define DRAGON_MODULES_PYTHON_OPERATOR_H_ #define DRAGON_MODULES_PYTHON_OPERATOR_H_
#include "dragon/modules/python/common.h" #include "dragon/modules/python/common.h"
#include "dragon/utils/eigen_utils.h"
namespace dragon { namespace dragon {
......
...@@ -19,7 +19,7 @@ void ExpandOp<Context>::DoRunWithType() { ...@@ -19,7 +19,7 @@ void ExpandOp<Context>::DoRunWithType() {
// Store for the gradient calculation // Store for the gradient calculation
STORE_INPUT_SPEC(0); STORE_INPUT_SPEC(0);
if (utils::math::IsBinaryBroadcast(X.dims(), X_dims, Y_dims)) { if (math::utils::IsBinaryBroadcast(X.dims(), X_dims, Y_dims)) {
math::Set( math::Set(
X.ndim(), X.ndim(),
X.dims().data(), X.dims().data(),
...@@ -47,7 +47,7 @@ void ExpandGradientOp<Context>::DoRunWithType() { ...@@ -47,7 +47,7 @@ void ExpandGradientOp<Context>::DoRunWithType() {
vec32_t X_broadcast_axes, _; vec32_t X_broadcast_axes, _;
vec32_t Y_dims(dY.dims().begin(), dY.dims().end()); vec32_t Y_dims(dY.dims().begin(), dY.dims().end());
utils::math::ComputeBinaryBroadcastAxes( math::utils::ComputeBinaryBroadcastAxes(
dX->dims(), dY.dims(), dY.dims(), X_broadcast_axes, _); dX->dims(), dY.dims(), dY.dims(), X_broadcast_axes, _);
if (X_broadcast_axes.empty()) { if (X_broadcast_axes.empty()) {
......
...@@ -62,7 +62,7 @@ void IndexSelectGradientOp<Context>::DoRunWithType() { ...@@ -62,7 +62,7 @@ void IndexSelectGradientOp<Context>::DoRunWithType() {
// Reset the accumulating gradient // Reset the accumulating gradient
math::Set( math::Set(
dX->count(), dX->count(),
cast::to<T>(0.f), convert::To<T>(0.f),
dX->template mutable_data<T, Context>(), dX->template mutable_data<T, Context>(),
ctx()); ctx());
......
...@@ -46,7 +46,7 @@ template <class Context> ...@@ -46,7 +46,7 @@ template <class Context>
template <typename T> template <typename T>
void FillOp<Context>::DoRunWithType() { void FillOp<Context>::DoRunWithType() {
auto* y = Output(0)->template mutable_data<T, Context>(); auto* y = Output(0)->template mutable_data<T, Context>();
math::Set(Output(0)->count(), cast::to<T>(value_), y, ctx()); math::Set(Output(0)->count(), convert::To<T>(value_), y, ctx());
} }
template <class Context> template <class Context>
......
...@@ -15,7 +15,7 @@ void OneHotOp<Context>::DoRunWithType() { ...@@ -15,7 +15,7 @@ void OneHotOp<Context>::DoRunWithType() {
// Brush the off-value over all // Brush the off-value over all
math::Set( math::Set(
X.count() * depth_, X.count() * depth_,
cast::to<T>((float)off_value_), convert::To<T>((float)off_value_),
Y->Reshape(Y_dims)->template mutable_data<T, Context>(), Y->Reshape(Y_dims)->template mutable_data<T, Context>(),
ctx()); ctx());
......
...@@ -87,7 +87,7 @@ void SliceGradientOp<Context>::DoRunWithType() { ...@@ -87,7 +87,7 @@ void SliceGradientOp<Context>::DoRunWithType() {
// Zero the redundant gradients // Zero the redundant gradients
auto* dx = dX->template mutable_data<T, Context>(); auto* dx = dX->template mutable_data<T, Context>();
math::Set(dX->count(), cast::to<T>(0.f), dx, ctx()); math::Set(dX->count(), convert::To<T>(0.f), dx, ctx());
// Copy the dY to the right positions // Copy the dY to the right positions
kernel::SliceGrad( kernel::SliceGrad(
......
...@@ -75,7 +75,7 @@ void SplitGradientOp<Context>::DoRunWithType() { ...@@ -75,7 +75,7 @@ void SplitGradientOp<Context>::DoRunWithType() {
if (!Input(i).has_name()) { if (!Input(i).has_name()) {
math::Set( math::Set(
dX->count(), dX->count(),
cast::to<T>(0.f), convert::To<T>(0.f),
dX->template mutable_data<T, Context>(), dX->template mutable_data<T, Context>(),
ctx()); ctx());
break; break;
......
...@@ -14,8 +14,8 @@ void WhereOp<Context>::DoRunWithType() { ...@@ -14,8 +14,8 @@ void WhereOp<Context>::DoRunWithType() {
<< "\nExcepted bool or uint8 condition tensor."; << "\nExcepted bool or uint8 condition tensor.";
vec64_t AB_dims, Y_dims; vec64_t AB_dims, Y_dims;
if (utils::math::IsBinaryBroadcast(A.dims(), B.dims(), AB_dims) && if (math::utils::IsBinaryBroadcast(A.dims(), B.dims(), AB_dims) &&
utils::math::IsBinaryBroadcast(AB_dims, C.dims(), Y_dims)) { math::utils::IsBinaryBroadcast(AB_dims, C.dims(), Y_dims)) {
math::Where( math::Where(
A.ndim(), A.ndim(),
A.dims().data(), A.dims().data(),
...@@ -50,7 +50,7 @@ void WhereGradientOp<Context>::DoRunWithType() { ...@@ -50,7 +50,7 @@ void WhereGradientOp<Context>::DoRunWithType() {
vec32_t A_broadcast_axes, B_broadcast_axes; vec32_t A_broadcast_axes, B_broadcast_axes;
vec32_t Y_dims(dY.dims().begin(), dY.dims().end()); vec32_t Y_dims(dY.dims().begin(), dY.dims().end());
utils::math::ComputeBinaryBroadcastAxes( math::utils::ComputeBinaryBroadcastAxes(
A.dims(), B.dims(), dY.dims(), A_broadcast_axes, B_broadcast_axes); A.dims(), B.dims(), dY.dims(), A_broadcast_axes, B_broadcast_axes);
// Temporary space to store the intermediate gradient and zeros // Temporary space to store the intermediate gradient and zeros
...@@ -68,7 +68,7 @@ void WhereGradientOp<Context>::DoRunWithType() { ...@@ -68,7 +68,7 @@ void WhereGradientOp<Context>::DoRunWithType() {
if (scratch_size > 0) { if (scratch_size > 0) {
scratch = ctx()->workspace()->template data<T, Context>({scratch_size})[0]; scratch = ctx()->workspace()->template data<T, Context>({scratch_size})[0];
zeros = scratch + (scratch_size - 1); zeros = scratch + (scratch_size - 1);
math::Set(1, cast::to<T>(0.f), zeros, ctx()); math::Set(1, convert::To<T>(0.f), zeros, ctx());
} }
if (dA->has_name()) { if (dA->has_name()) {
......
...@@ -43,11 +43,11 @@ void AssignOp<Context>::DoRunWithType() { ...@@ -43,11 +43,11 @@ void AssignOp<Context>::DoRunWithType() {
if (X.dims() != X_dims) { if (X.dims() != X_dims) {
vec64_t dims1, dims2; vec64_t dims1, dims2;
if (utils::math::IsBinaryBroadcast(X.dims(), X_dims, dims1)) { if (math::utils::IsBinaryBroadcast(X.dims(), X_dims, dims1)) {
CHECK(X_dims == dims1) CHECK(X_dims == dims1)
<< "\nCould not assign with shapes " << X.DimString() << " " << "\nCould not assign with shapes " << X.DimString() << " "
<< Tensor::DimString(X_dims); << Tensor::DimString(X_dims);
utils::math::ComputeBinaryBroadcastDims(X.dims(), X_dims, dims1, dims2); math::utils::ComputeBinaryBroadcastDims(X.dims(), X_dims, dims1, dims2);
if (dims1 != dims2) { if (dims1 != dims2) {
auto* scratch = ctx()->workspace()->template data<T, Context>( auto* scratch = ctx()->workspace()->template data<T, Context>(
{X_broadcast.count()})[0]; {X_broadcast.count()})[0];
......
...@@ -14,8 +14,8 @@ void MaskedAssignOp<Context>::DoRunWithType() { ...@@ -14,8 +14,8 @@ void MaskedAssignOp<Context>::DoRunWithType() {
<< "\nExcepted bool or uint8 mask."; << "\nExcepted bool or uint8 mask.";
vec64_t X_dims, Y_dims; vec64_t X_dims, Y_dims;
if (utils::math::IsBinaryBroadcast(X.dims(), X_mask.dims(), X_dims) && if (math::utils::IsBinaryBroadcast(X.dims(), X_mask.dims(), X_dims) &&
utils::math::IsBinaryBroadcast(X_dims, Y->dims(), Y_dims) && math::utils::IsBinaryBroadcast(X_dims, Y->dims(), Y_dims) &&
Y_dims == Y->dims()) { Y_dims == Y->dims()) {
math::Where( math::Where(
X.ndim(), X.ndim(),
......
...@@ -13,7 +13,7 @@ void GradientGenerateOp<Context>::DoRunWithType() { ...@@ -13,7 +13,7 @@ void GradientGenerateOp<Context>::DoRunWithType() {
Y->ReshapeLike(Input(i)); Y->ReshapeLike(Input(i));
math::Set( math::Set(
Y->count(), Y->count(),
cast::to<T>(defaults_[i]), convert::To<T>(defaults_[i]),
Y->template mutable_data<T, Context>(), Y->template mutable_data<T, Context>(),
ctx()); ctx());
} }
......
...@@ -105,7 +105,7 @@ void NLLLossGradientOp<Context>::DoRunWithType() { ...@@ -105,7 +105,7 @@ void NLLLossGradientOp<Context>::DoRunWithType() {
auto* dx = dX->template mutable_data<LogitType, Context>(); auto* dx = dX->template mutable_data<LogitType, Context>();
auto* mask = auto* mask =
ctx()->workspace()->template data<LogitType, Context>({num_preds + 1})[0]; ctx()->workspace()->template data<LogitType, Context>({num_preds + 1})[0];
math::Set(dX->count(), cast::to<LogitType>(0.f), dx, ctx()); math::Set(dX->count(), convert::To<LogitType>(0.f), dx, ctx());
kernel::NLLLossGrad( kernel::NLLLossGrad(
outer_dim, outer_dim,
......
...@@ -21,7 +21,7 @@ void AddOp<Context>::DoRunWithType() { ...@@ -21,7 +21,7 @@ void AddOp<Context>::DoRunWithType() {
B.template data<T, Context>(), B.template data<T, Context>(),
Output(0, {0, 1})->Reshape(Y_dims)->template mutable_data<T, Context>(), Output(0, {0, 1})->Reshape(Y_dims)->template mutable_data<T, Context>(),
ctx()); ctx());
} else if (utils::math::IsBinaryBroadcast(A.dims(), B.dims(), Y_dims)) { } else if (math::utils::IsBinaryBroadcast(A.dims(), B.dims(), Y_dims)) {
auto* Y = Output(0, CheckOutputAliases(A, B, Output(0), Y_dims)); auto* Y = Output(0, CheckOutputAliases(A, B, Output(0), Y_dims));
math::Add( math::Add(
A.ndim(), A.ndim(),
...@@ -51,7 +51,7 @@ void AddGradientOp<Context>::DoRunWithType() { ...@@ -51,7 +51,7 @@ void AddGradientOp<Context>::DoRunWithType() {
vec32_t A_broadcast_axes, B_broadcast_axes; vec32_t A_broadcast_axes, B_broadcast_axes;
vec32_t Y_dims(dY.dims().begin(), dY.dims().end()); vec32_t Y_dims(dY.dims().begin(), dY.dims().end());
utils::math::ComputeBinaryBroadcastAxes( math::utils::ComputeBinaryBroadcastAxes(
A.dims(), B.dims(), dY.dims(), A_broadcast_axes, B_broadcast_axes); A.dims(), B.dims(), dY.dims(), A_broadcast_axes, B_broadcast_axes);
if (dA->has_name()) { if (dA->has_name()) {
......
...@@ -21,7 +21,7 @@ void DivOp<Context>::DoRunWithType() { ...@@ -21,7 +21,7 @@ void DivOp<Context>::DoRunWithType() {
B.template data<T, Context>(), B.template data<T, Context>(),
Output(0, {0, 1})->Reshape(Y_dims)->template mutable_data<T, Context>(), Output(0, {0, 1})->Reshape(Y_dims)->template mutable_data<T, Context>(),
ctx()); ctx());
} else if (utils::math::IsBinaryBroadcast(A.dims(), B.dims(), Y_dims)) { } else if (math::utils::IsBinaryBroadcast(A.dims(), B.dims(), Y_dims)) {
auto* Y = Output(0, CheckOutputAliases(A, B, Output(0), Y_dims)); auto* Y = Output(0, CheckOutputAliases(A, B, Output(0), Y_dims));
math::Div( math::Div(
A.ndim(), A.ndim(),
...@@ -52,7 +52,7 @@ void DivGradientOp<Context>::DoRunWithType() { ...@@ -52,7 +52,7 @@ void DivGradientOp<Context>::DoRunWithType() {
vec32_t A_broadcast_axes, B_broadcast_axes; vec32_t A_broadcast_axes, B_broadcast_axes;
vec32_t Y_dims(dY.dims().begin(), dY.dims().end()); vec32_t Y_dims(dY.dims().begin(), dY.dims().end());
utils::math::ComputeBinaryBroadcastAxes( math::utils::ComputeBinaryBroadcastAxes(
A_ref.dims(), A_ref.dims(),
B_ref.dims(), B_ref.dims(),
dY.dims(), dY.dims(),
......
...@@ -93,7 +93,7 @@ DEFINE_INPLACE_UNARY_OP_IMPL(Invert, T); ...@@ -93,7 +93,7 @@ DEFINE_INPLACE_UNARY_OP_IMPL(Invert, T);
B.template data<T, Context>(), \ B.template data<T, Context>(), \
Y->Reshape(Y_dims)->template mutable_data<TOut, Context>(), \ Y->Reshape(Y_dims)->template mutable_data<TOut, Context>(), \
ctx()); \ ctx()); \
} else if (utils::math::IsBinaryBroadcast(A.dims(), B.dims(), Y_dims)) { \ } else if (math::utils::IsBinaryBroadcast(A.dims(), B.dims(), Y_dims)) { \
math::name( \ math::name( \
A.ndim(), \ A.ndim(), \
A.dims().data(), \ A.dims().data(), \
......
...@@ -13,7 +13,7 @@ void MaximumGradientOp<Context>::DoRunWithType() { ...@@ -13,7 +13,7 @@ void MaximumGradientOp<Context>::DoRunWithType() {
vec32_t A_broadcast_axes, B_broadcast_axes; vec32_t A_broadcast_axes, B_broadcast_axes;
vec32_t Y_dims(dY.dims().begin(), dY.dims().end()); vec32_t Y_dims(dY.dims().begin(), dY.dims().end());
utils::math::ComputeBinaryBroadcastAxes( math::utils::ComputeBinaryBroadcastAxes(
A.dims(), B.dims(), dY.dims(), A_broadcast_axes, B_broadcast_axes); A.dims(), B.dims(), dY.dims(), A_broadcast_axes, B_broadcast_axes);
// Temporary space to store the intermediate gradient // Temporary space to store the intermediate gradient
......
...@@ -13,7 +13,7 @@ void MinimumGradientOp<Context>::DoRunWithType() { ...@@ -13,7 +13,7 @@ void MinimumGradientOp<Context>::DoRunWithType() {
vec32_t A_broadcast_axes, B_broadcast_axes; vec32_t A_broadcast_axes, B_broadcast_axes;
vec32_t Y_dims(dY.dims().begin(), dY.dims().end()); vec32_t Y_dims(dY.dims().begin(), dY.dims().end());
utils::math::ComputeBinaryBroadcastAxes( math::utils::ComputeBinaryBroadcastAxes(
A.dims(), B.dims(), dY.dims(), A_broadcast_axes, B_broadcast_axes); A.dims(), B.dims(), dY.dims(), A_broadcast_axes, B_broadcast_axes);
// Temporary space to store the intermediate gradient // Temporary space to store the intermediate gradient
......
...@@ -40,7 +40,7 @@ void MomentsOp<Context>::DoRunWithType() { ...@@ -40,7 +40,7 @@ void MomentsOp<Context>::DoRunWithType() {
ctx()); ctx());
math::Set( math::Set(
1, 1,
cast::to<Ty>(0.f), convert::To<Ty>(0.f),
Y2->Reshape(Y_shape)->template mutable_data<Ty, Context>(), Y2->Reshape(Y_shape)->template mutable_data<Ty, Context>(),
ctx()); ctx());
} else { } else {
......
...@@ -21,7 +21,7 @@ void MulOp<Context>::DoRunWithType() { ...@@ -21,7 +21,7 @@ void MulOp<Context>::DoRunWithType() {
B.template data<T, Context>(), B.template data<T, Context>(),
Output(0, {0, 1})->Reshape(Y_dims)->template mutable_data<T, Context>(), Output(0, {0, 1})->Reshape(Y_dims)->template mutable_data<T, Context>(),
ctx()); ctx());
} else if (utils::math::IsBinaryBroadcast(A.dims(), B.dims(), Y_dims)) { } else if (math::utils::IsBinaryBroadcast(A.dims(), B.dims(), Y_dims)) {
auto* Y = Output(0, CheckOutputAliases(A, B, Output(0), Y_dims)); auto* Y = Output(0, CheckOutputAliases(A, B, Output(0), Y_dims));
math::Mul( math::Mul(
A.ndim(), A.ndim(),
...@@ -52,7 +52,7 @@ void MulGradientOp<Context>::DoRunWithType() { ...@@ -52,7 +52,7 @@ void MulGradientOp<Context>::DoRunWithType() {
vec32_t A_broadcast_axes, B_broadcast_axes; vec32_t A_broadcast_axes, B_broadcast_axes;
vec32_t Y_dims(dY.dims().begin(), dY.dims().end()); vec32_t Y_dims(dY.dims().begin(), dY.dims().end());
utils::math::ComputeBinaryBroadcastAxes( math::utils::ComputeBinaryBroadcastAxes(
A_ref.dims(), A_ref.dims(),
B_ref.dims(), B_ref.dims(),
dY.dims(), dY.dims(),
......
...@@ -12,7 +12,7 @@ void PowGradientOp<Context>::DoRunWithType() { ...@@ -12,7 +12,7 @@ void PowGradientOp<Context>::DoRunWithType() {
vec32_t A_broadcast_axes, B_broadcast_axes; vec32_t A_broadcast_axes, B_broadcast_axes;
vec32_t Y_dims(dY.dims().begin(), dY.dims().end()); vec32_t Y_dims(dY.dims().begin(), dY.dims().end());
utils::math::ComputeBinaryBroadcastAxes( math::utils::ComputeBinaryBroadcastAxes(
A.dims(), B.dims(), dY.dims(), A_broadcast_axes, B_broadcast_axes); A.dims(), B.dims(), dY.dims(), A_broadcast_axes, B_broadcast_axes);
// Temporary space to store the intermediate gradient // Temporary space to store the intermediate gradient
...@@ -99,7 +99,7 @@ void PowGradientOp<Context>::DoRunWithType() { ...@@ -99,7 +99,7 @@ void PowGradientOp<Context>::DoRunWithType() {
ctx()); ctx());
math::ReplaceNaN( math::ReplaceNaN(
A.count(), A.count(),
cast::to<T>(0.f), convert::To<T>(0.f),
dA->template data<T, Context>(), dA->template data<T, Context>(),
dA->template mutable_data<T, Context>(), dA->template mutable_data<T, Context>(),
ctx()); ctx());
...@@ -141,7 +141,7 @@ void PowGradientOp<Context>::DoRunWithType() { ...@@ -141,7 +141,7 @@ void PowGradientOp<Context>::DoRunWithType() {
A.template data<T, Context>(), A.template data<T, Context>(),
scratch, scratch,
ctx()); ctx());
math::ReplaceNaN(Y.count(), cast::to<T>(0.f), scratch, scratch, ctx()); math::ReplaceNaN(Y.count(), convert::To<T>(0.f), scratch, scratch, ctx());
if (B_broadcast_axes.empty()) { if (B_broadcast_axes.empty()) {
math::Mul( math::Mul(
Y.count(), scratch, B.template data<T, Context>(), scratch, ctx()); Y.count(), scratch, B.template data<T, Context>(), scratch, ctx());
......
...@@ -9,7 +9,7 @@ void SignGradientOp<Context>::DoRunWithType() { ...@@ -9,7 +9,7 @@ void SignGradientOp<Context>::DoRunWithType() {
auto &dY = Input(0), *dX = Output(0); auto &dY = Input(0), *dX = Output(0);
math::Set( math::Set(
dY.count(), dY.count(),
cast::to<T>(0.f), convert::To<T>(0.f),
dX->ReshapeLike(dY)->template mutable_data<T, Context>(), dX->ReshapeLike(dY)->template mutable_data<T, Context>(),
ctx()); ctx());
} }
......
...@@ -21,7 +21,7 @@ void SubOp<Context>::DoRunWithType() { ...@@ -21,7 +21,7 @@ void SubOp<Context>::DoRunWithType() {
B.template data<T, Context>(), B.template data<T, Context>(),
Output(0, {0, 1})->Reshape(Y_dims)->template mutable_data<T, Context>(), Output(0, {0, 1})->Reshape(Y_dims)->template mutable_data<T, Context>(),
ctx()); ctx());
} else if (utils::math::IsBinaryBroadcast(A.dims(), B.dims(), Y_dims)) { } else if (math::utils::IsBinaryBroadcast(A.dims(), B.dims(), Y_dims)) {
auto* Y = Output(0, CheckOutputAliases(A, B, Output(0), Y_dims)); auto* Y = Output(0, CheckOutputAliases(A, B, Output(0), Y_dims));
math::Sub( math::Sub(
A.ndim(), A.ndim(),
...@@ -51,7 +51,7 @@ void SubGradientOp<Context>::DoRunWithType() { ...@@ -51,7 +51,7 @@ void SubGradientOp<Context>::DoRunWithType() {
vec32_t A_broadcast_axes, B_broadcast_axes; vec32_t A_broadcast_axes, B_broadcast_axes;
vec32_t Y_dims(dY.dims().begin(), dY.dims().end()); vec32_t Y_dims(dY.dims().begin(), dY.dims().end());
utils::math::ComputeBinaryBroadcastAxes( math::utils::ComputeBinaryBroadcastAxes(
A.dims(), B.dims(), dY.dims(), A_broadcast_axes, B_broadcast_axes); A.dims(), B.dims(), dY.dims(), A_broadcast_axes, B_broadcast_axes);
if (dA->has_name()) { if (dA->has_name()) {
......
...@@ -19,17 +19,57 @@ void BatchNormOp<Context>::TrainingImpl() { ...@@ -19,17 +19,57 @@ void BatchNormOp<Context>::TrainingImpl() {
auto* X_bias = Buffer("X_bias")->Reshape({C_}); auto* X_bias = Buffer("X_bias")->Reshape({C_});
auto* x = Input(0).template data<InputType, Context>(); auto* x = Input(0).template data<InputType, Context>();
auto* gamma = Input(1).template data<ParamType, Context>();
auto* beta = Input(2).template data<ParamType, Context>();
auto* rm = Input(3).template mutable_data<ParamType, Context>(); auto* rm = Input(3).template mutable_data<ParamType, Context>();
auto* rv = Input(4).template mutable_data<ParamType, Context>(); auto* rv = Input(4).template mutable_data<ParamType, Context>();
auto* mu = X_mu->template mutable_data<ParamType, Context>(); auto* mu = X_mu->template mutable_data<ParamType, Context>();
auto* rsig = X_rsig->template mutable_data<ParamType, Context>(); auto* rsig = X_rsig->template mutable_data<ParamType, Context>();
auto* scale = X_scale->template mutable_data<ParamType, Context>(); auto* scale = X_scale->template mutable_data<ParamType, Context>();
auto* bias = X_bias->template mutable_data<ParamType, Context>();
auto* y = Output(0)->template mutable_data<InputType, Context>();
// Compute moments // Compute moments
if (sync_stats_ > 0) {
#ifdef USE_MPI
// Compute E(X) and E(X^2)
kernel::BatchNormExpectation(
N_,
C_,
S_,
ParamType(1) / (N_ * comm_size_ * S_),
data_format(),
x,
mu,
rsig,
ctx());
// Compute D(X) = E(X^2) - E(X)^2
ctx()->FinishDeviceComputation();
if (enable_nccl_) {
#ifdef USE_NCCL
auto nccl_comm_ = this->nccl_comm();
auto nccl_dtype_ = this->template nccl_dtype<ParamType>();
NCCL_CHECK(ncclAllReduce(
(void*)mu,
(void*)mu,
C_,
nccl_dtype_,
ncclSum,
nccl_comm_,
((CUDAContext*)ctx())->cuda_stream()));
NCCL_CHECK(ncclAllReduce(
(void*)rsig,
(void*)rsig,
C_,
nccl_dtype_,
ncclSum,
nccl_comm_,
((CUDAContext*)ctx())->cuda_stream()));
#endif // USE_NCCL
} else {
AllReduce(mu, mu, C_);
AllReduce(rsig, rsig, C_);
}
math::Square(C_, mu, scale, ctx());
math::Sub(C_, rsig, scale, rsig, ctx());
#endif // USE_MPI
} else {
if (data_format() == "NCHW") { if (data_format() == "NCHW") {
vec32_t dims = {(int)N_, (int)C_, (int)S_}; vec32_t dims = {(int)N_, (int)C_, (int)S_};
vec32_t axes = {0, 2}; vec32_t axes = {0, 2};
...@@ -39,27 +79,32 @@ void BatchNormOp<Context>::TrainingImpl() { ...@@ -39,27 +79,32 @@ void BatchNormOp<Context>::TrainingImpl() {
vec32_t axes = {0}; vec32_t axes = {0};
kernel::Moments(2, dims.data(), 1, axes.data(), x, mu, rsig, ctx()); kernel::Moments(2, dims.data(), 1, axes.data(), x, mu, rsig, ctx());
} }
}
// Compute running statistics // Compute running statistics
if (is_recomputing_ == 0) { if (is_recomputing_ == 0) {
// Running(X) = (1 - momentum) * Cur(X) + momentum * Running(X)
math::Axpby(C_, 1.f - momentum_, mu, momentum_, rm, ctx()); math::Axpby(C_, 1.f - momentum_, mu, momentum_, rm, ctx());
math::Axpby(C_, 1.f - momentum_, rsig, momentum_, rv, ctx()); math::Axpby(C_, 1.f - momentum_, rsig, momentum_, rv, ctx());
} }
// Fuse parameters along channel axis // Inverse stddev from variance
// [mu, rsig, alpha, beta] => [scale, bias]
math::InvStd(C_, epsilon_, rsig, rsig, ctx()); math::InvStd(C_, epsilon_, rsig, rsig, ctx());
math::Mul(C_, gamma, rsig, scale, ctx());
math::Mul(C_, scale, mu, bias, ctx());
math::Sub(C_, beta, bias, bias, ctx());
// Compute affine transformation // Fuse parameters to compute affine transformation
if (data_format() == "NCHW") { kernel::BatchNorm(
kernel::ChannelAffine(N_, S_, C_, x, scale, bias, y, ctx()); N_,
} else if (data_format() == "NHWC") { C_,
kernel::ChannelAffine(N_ * S_, 1, C_, x, scale, bias, y, ctx()); S_,
} data_format(),
x,
mu,
rsig,
Input(1).template data<ParamType, Context>(), // gamma
Input(2).template data<ParamType, Context>(), // beta
scale,
X_bias->template mutable_data<ParamType, Context>(),
Output(0)->template mutable_data<InputType, Context>(),
ctx());
} }
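Written out, the training path above (following the explicit fusion spelled out by the removed lines, with K = comm_size_ replicas when sync_stats_ is enabled) computes per channel c, roughly:

\mu_c = \frac{1}{NKS}\sum x, \qquad \sigma_c^2 = \frac{1}{NKS}\sum x^2 - \mu_c^2

\text{running}_c \leftarrow (1-\text{momentum})\cdot\text{batch}_c + \text{momentum}\cdot\text{running}_c

r\sigma_c = \frac{1}{\sqrt{\sigma_c^2 + \epsilon}}, \qquad \text{scale}_c = \gamma_c\, r\sigma_c, \qquad \text{bias}_c = \beta_c - \text{scale}_c\,\mu_c, \qquad y = \text{scale}_c \cdot x + \text{bias}_c

The per-replica expectations are pre-scaled by 1/(N*K*S), so the all-reduced sums already equal the global moments before the E(X^2) - E(X)^2 step.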
template <class Context> template <class Context>
...@@ -70,31 +115,30 @@ void BatchNormOp<Context>::InferenceImpl() { ...@@ -70,31 +115,30 @@ void BatchNormOp<Context>::InferenceImpl() {
TENSOR_FILL_WITH_TYPE(Input(3), vec64_t({C_}), ParamType); TENSOR_FILL_WITH_TYPE(Input(3), vec64_t({C_}), ParamType);
TENSOR_FILL_WITH_TYPE(Input(4), vec64_t({C_}), ParamType); TENSOR_FILL_WITH_TYPE(Input(4), vec64_t({C_}), ParamType);
auto* X_rsig = Buffer("X_rsig")->Reshape({C_});
auto* X_scale = Buffer("X_scale")->Reshape({C_}); auto* X_scale = Buffer("X_scale")->Reshape({C_});
auto* X_bias = Buffer("X_bias")->Reshape({C_}); auto* X_bias = Buffer("X_bias")->Reshape({C_});
auto* x = Input(0).template data<InputType, Context>();
auto* gamma = Input(1).template data<ParamType, Context>();
auto* beta = Input(2).template data<ParamType, Context>();
auto* rm = Input(3).template data<ParamType, Context>();
auto* rv = Input(4).template data<ParamType, Context>(); auto* rv = Input(4).template data<ParamType, Context>();
auto* scale = X_scale->template mutable_data<ParamType, Context>(); auto* rsig = X_rsig->template mutable_data<ParamType, Context>();
auto* bias = X_bias->template mutable_data<ParamType, Context>();
auto* y = Output(0)->template mutable_data<InputType, Context>();
// Fuse parameters along channel axis // Inverse stddev from variance
// [mu, rsig, alpha, beta] => [scale, bias] math::InvStd(C_, epsilon_, rv, rsig, ctx());
math::InvStd(C_, epsilon_, rv, bias, ctx());
math::Mul(C_, gamma, bias, scale, ctx());
math::Mul(C_, scale, rm, bias, ctx());
math::Sub(C_, beta, bias, bias, ctx());
// Compute affine transformation // Fuse parameters to compute affine transformation
if (data_format() == "NCHW") { kernel::BatchNorm(
kernel::ChannelAffine(N_, S_, C_, x, scale, bias, y, ctx()); N_,
} else if (data_format() == "NHWC") { C_,
kernel::ChannelAffine(N_ * S_, 1, C_, x, scale, bias, y, ctx()); S_,
} data_format(),
Input(0).template data<InputType, Context>(),
Input(3).template data<ParamType, Context>(),
rsig,
Input(1).template data<ParamType, Context>(), // gamma
Input(2).template data<ParamType, Context>(), // beta
X_scale->template mutable_data<ParamType, Context>(),
X_bias->template mutable_data<ParamType, Context>(),
Output(0)->template mutable_data<InputType, Context>(),
ctx());
} }
template <class Context> template <class Context>
...@@ -113,9 +157,15 @@ void BatchNormOp<Context>::RunOnDevice() { ...@@ -113,9 +157,15 @@ void BatchNormOp<Context>::RunOnDevice() {
} else { } else {
InferenceImpl<float, float>(); InferenceImpl<float, float>();
} }
} else if (Input(0).template IsType<float16>()) {
if (is_training_) {
TrainingImpl<float16, float>();
} else {
InferenceImpl<float16, float>();
}
} else { } else {
LOG(FATAL) << MessageForUnsupported( LOG(FATAL) << MessageForUnsupported(
types::to_string(Input(0).meta()), {"float32"}); types::to_string(Input(0).meta()), {"float16", "float32"});
} }
} }
...@@ -124,21 +174,71 @@ template <typename InputType, typename ParamType> ...@@ -124,21 +174,71 @@ template <typename InputType, typename ParamType>
void BatchNormGradientOp<Context>::TrainingImpl() { void BatchNormGradientOp<Context>::TrainingImpl() {
auto *dX = Output(0), *dW = Output(1), *dB = Output(2); auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
auto *X_mu = Buffer("X_mu"), *X_rsig = Buffer("X_rsig"); auto *X_mu = Buffer("X_mu"), *X_rsig = Buffer("X_rsig");
auto *X_scale = Buffer("X_scale"), *X_bias = Buffer("X_bias");
// Gradient w.r.t. gamma, beta and input auto* x = Input(0).template data<InputType, Context>();
kernel::BatchNormBackwardTraining( auto* gamma = Input(1).template data<ParamType, Context>();
auto* dy = Input(4).template data<InputType, Context>();
auto* mu = X_mu->template data<ParamType, Context>();
auto* rsig = X_rsig->template data<ParamType, Context>();
auto* scale = X_scale->template mutable_data<ParamType, Context>();
auto* bias = X_bias->template mutable_data<ParamType, Context>();
auto* dgamma = dW->Reshape({C_})->template mutable_data<ParamType, Context>();
auto* dbeta = dB->Reshape({C_})->template mutable_data<ParamType, Context>();
// Gradient w.r.t. gamma and beta
kernel::BatchNormInternalGrad(
N_, C_, S_, data_format(), x, mu, rsig, gamma, dy, dgamma, dbeta, ctx());
if (sync_stats_ > 0) {
#ifdef USE_MPI
ctx()->FinishDeviceComputation();
if (enable_nccl_) {
#ifdef USE_NCCL
auto nccl_comm_ = this->nccl_comm();
auto nccl_dtype_ = this->template nccl_dtype<ParamType>();
NCCL_CHECK(ncclAllReduce(
(void*)dgamma,
(void*)scale,
C_,
nccl_dtype_,
ncclSum,
nccl_comm_,
((CUDAContext*)ctx())->cuda_stream()));
NCCL_CHECK(ncclAllReduce(
(void*)dbeta,
(void*)bias,
C_,
nccl_dtype_,
ncclSum,
nccl_comm_,
((CUDAContext*)ctx())->cuda_stream()));
#endif // USE_NCCL
} else {
AllReduce(dgamma, scale, C_);
AllReduce(dbeta, bias, C_);
}
math::Scale(C_, ParamType(1) / comm_size_, scale, scale, ctx());
math::Scale(C_, ParamType(1) / comm_size_, bias, bias, ctx());
#endif // USE_MPI
} else {
scale = dgamma, bias = dbeta;
}
// Gradient w.r.t. input
kernel::BatchNormTrainingGrad(
N_, N_,
C_, C_,
S_, S_,
data_format(), data_format(),
Input(0).template data<InputType, Context>(), // x x,
X_mu->template data<ParamType, Context>(), // mu mu,
X_rsig->template data<ParamType, Context>(), // rsig rsig,
Input(1).template data<ParamType, Context>(), // gamma gamma,
Input(4).template data<InputType, Context>(), // dy scale,
Output(0)->template mutable_data<InputType, Context>(), // dx bias,
dW->Reshape({C_})->template mutable_data<ParamType, Context>(), // dgamma dy,
dB->Reshape({C_})->template mutable_data<ParamType, Context>(), // dbeta Output(0)->template mutable_data<InputType, Context>(),
ctx()); ctx());
} }
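When sync_stats_ is enabled, the locally reduced dgamma/dbeta are summed across the K = comm_size_ replicas and rescaled before BatchNormTrainingGrad consumes them as scale/bias; reading the code above, per channel c:

\text{scale}_c = \frac{1}{K}\sum_{k=1}^{K} d\gamma_c^{(k)}, \qquad \text{bias}_c = \frac{1}{K}\sum_{k=1}^{K} d\beta_c^{(k)}

Without synchronization, scale and bias simply alias the local dgamma and dbeta.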
...@@ -158,11 +258,11 @@ void BatchNormGradientOp<Context>::InferenceImpl() { ...@@ -158,11 +258,11 @@ void BatchNormGradientOp<Context>::InferenceImpl() {
dbeta = dB->Reshape({C_})->template mutable_data<ParamType, Context>(); dbeta = dB->Reshape({C_})->template mutable_data<ParamType, Context>();
} }
// Restore inverse stddev from variance // Inverse stddev from variance
math::InvStd(C_, epsilon_, rv, rsig, ctx()); math::InvStd(C_, epsilon_, rv, rsig, ctx());
// Gradient w.r.t. gamma, beta and input // Gradient w.r.t. gamma, beta and input
kernel::BatchNormBackwardInference( kernel::BatchNormInferenceGrad(
N_, N_,
C_, C_,
S_, S_,
...@@ -172,9 +272,9 @@ void BatchNormGradientOp<Context>::InferenceImpl() { ...@@ -172,9 +272,9 @@ void BatchNormGradientOp<Context>::InferenceImpl() {
rsig, rsig,
Input(1).template data<ParamType, Context>(), // gamma Input(1).template data<ParamType, Context>(), // gamma
Input(4).template data<InputType, Context>(), // dy Input(4).template data<InputType, Context>(), // dy
dX->template mutable_data<InputType, Context>(),
dgamma, dgamma,
dbeta, dbeta,
dX->template mutable_data<InputType, Context>(),
ctx()); ctx());
} }
...@@ -190,9 +290,15 @@ void BatchNormGradientOp<Context>::RunOnDevice() { ...@@ -190,9 +290,15 @@ void BatchNormGradientOp<Context>::RunOnDevice() {
} else { } else {
InferenceImpl<float, float>(); InferenceImpl<float, float>();
} }
} else if (Input(0).template IsType<float16>()) {
if (is_training_ > 0) {
TrainingImpl<float16, float>();
} else {
InferenceImpl<float16, float>();
}
} else { } else {
LOG(FATAL) << MessageForUnsupported( LOG(FATAL) << MessageForUnsupported(
types::to_string(Input(0).meta()), {"float32"}); types::to_string(Input(0).meta()), {"float16", "float32"});
} }
} }
......
...@@ -35,7 +35,8 @@ class BatchNormOpBase : public GenericOpBase<Context> { ...@@ -35,7 +35,8 @@ class BatchNormOpBase : public GenericOpBase<Context> {
: GenericOpBase<Context>(def, ws), : GenericOpBase<Context>(def, ws),
momentum_(OP_SINGLE_ARG(float, "momentum", 0.9f)), momentum_(OP_SINGLE_ARG(float, "momentum", 0.9f)),
epsilon_(OP_SINGLE_ARG(double, "epsilon", 1e-5)), epsilon_(OP_SINGLE_ARG(double, "epsilon", 1e-5)),
use_stats_(OP_SINGLE_ARG(int64_t, "use_stats", -1)) {} use_stats_(OP_SINGLE_ARG(int64_t, "use_stats", -1)),
sync_stats_(OP_SINGLE_ARG(int64_t, "comm", 0) > 0 ? 1 : 0) {}
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
void DetermineBaseArguments() { void DetermineBaseArguments() {
...@@ -58,7 +59,8 @@ class BatchNormOpBase : public GenericOpBase<Context> { ...@@ -58,7 +59,8 @@ class BatchNormOpBase : public GenericOpBase<Context> {
protected: protected:
float momentum_; float momentum_;
double epsilon_; double epsilon_;
int64_t use_stats_, N_, C_, S_; int64_t N_, C_, S_;
int64_t use_stats_, sync_stats_;
int64_t is_training_, is_recomputing_; int64_t is_training_, is_recomputing_;
}; };
...@@ -69,6 +71,7 @@ class BatchNormOpBase : public GenericOpBase<Context> { ...@@ -69,6 +71,7 @@ class BatchNormOpBase : public GenericOpBase<Context> {
using BatchNormOpBase<Context>::momentum_; \ using BatchNormOpBase<Context>::momentum_; \
using BatchNormOpBase<Context>::epsilon_; \ using BatchNormOpBase<Context>::epsilon_; \
using BatchNormOpBase<Context>::use_stats_; \ using BatchNormOpBase<Context>::use_stats_; \
using BatchNormOpBase<Context>::sync_stats_; \
using BatchNormOpBase<Context>::N_; \ using BatchNormOpBase<Context>::N_; \
using BatchNormOpBase<Context>::C_; \ using BatchNormOpBase<Context>::C_; \
using BatchNormOpBase<Context>::S_; \ using BatchNormOpBase<Context>::S_; \
...@@ -82,6 +85,9 @@ class BatchNormOp : public BatchNormOpBase<Context> { ...@@ -82,6 +85,9 @@ class BatchNormOp : public BatchNormOpBase<Context> {
: BatchNormOpBase<Context>(def, ws) {} : BatchNormOpBase<Context>(def, ws) {}
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
USE_BATCHNORM_FUNCTIONS; USE_BATCHNORM_FUNCTIONS;
#ifdef USE_MPI
USE_COLLECTIVE_FUNCTIONS;
#endif
void RunOnDevice() override; void RunOnDevice() override;
...@@ -99,50 +105,19 @@ class BatchNormGradientOp : public BatchNormOpBase<Context> { ...@@ -99,50 +105,19 @@ class BatchNormGradientOp : public BatchNormOpBase<Context> {
: BatchNormOpBase<Context>(def, ws) {} : BatchNormOpBase<Context>(def, ws) {}
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
USE_BATCHNORM_FUNCTIONS; USE_BATCHNORM_FUNCTIONS;
void RunOnDevice() override;
template <typename InputType, typename ParamType>
void TrainingImpl();
template <typename InputType, typename ParamType>
void InferenceImpl();
};
#ifdef USE_MPI #ifdef USE_MPI
template <class Context>
class SyncBatchNormOp : public BatchNormOp<Context> {
public:
SyncBatchNormOp(const OperatorDef& def, Workspace* ws)
: BatchNormOp<Context>(def, ws) {}
USE_OPERATOR_FUNCTIONS;
USE_BATCHNORM_FUNCTIONS;
USE_COLLECTIVE_FUNCTIONS; USE_COLLECTIVE_FUNCTIONS;
#endif
void RunOnDevice() override; void RunOnDevice() override;
template <typename InputType, typename ParamType> template <typename InputType, typename ParamType>
void TrainingImpl(); void TrainingImpl();
};
template <class Context>
class SyncBatchNormGradientOp : public BatchNormGradientOp<Context> {
public:
SyncBatchNormGradientOp(const OperatorDef& def, Workspace* ws)
: BatchNormGradientOp<Context>(def, ws) {}
USE_OPERATOR_FUNCTIONS;
USE_BATCHNORM_FUNCTIONS;
USE_COLLECTIVE_FUNCTIONS;
void RunOnDevice() override;
template <typename InputType, typename ParamType> template <typename InputType, typename ParamType>
void TrainingImpl(); void InferenceImpl();
}; };
#endif // USE_MPI
#ifdef USE_CUDNN #ifdef USE_CUDNN
template <class Context> template <class Context>
......
#ifdef USE_MPI #ifdef USE_MPI
#include "dragon/core/workspace.h"
#include "dragon/operators/normalization/batch_norm_op.h" #include "dragon/operators/normalization/batch_norm_op.h"
#include "dragon/utils/filler.h"
#include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
template <class Context> REGISTER_CPU_OPERATOR(SyncBatchNorm, BatchNormOp<CPUContext>);
template <typename InputType, typename ParamType> REGISTER_CPU_OPERATOR(SyncBatchNormGradient, BatchNormGradientOp<CPUContext>);
void SyncBatchNormOp<Context>::TrainingImpl() {
TENSOR_FILL_WITH_TYPE(Input(1), vec64_t({C_}), ParamType);
TENSOR_FILL_WITH_TYPE(Input(2), vec64_t({C_}), ParamType);
TENSOR_FILL_WITH_TYPE(Input(3), vec64_t({C_}), ParamType);
TENSOR_FILL_WITH_TYPE(Input(4), vec64_t({C_}), ParamType);
auto* X_mu = Buffer("X_mu")->Reshape({C_});
auto* X_rsig = Buffer("X_rsig")->Reshape({C_});
auto* X_scale = Buffer("X_scale")->Reshape({C_});
auto* X_bias = Buffer("X_bias")->Reshape({C_});
auto* x = Input(0).template data<InputType, Context>();
auto* gamma = Input(1).template data<ParamType, Context>();
auto* beta = Input(2).template data<ParamType, Context>();
auto* rm = Input(3).template mutable_data<ParamType, Context>();
auto* rv = Input(4).template mutable_data<ParamType, Context>();
auto* mu = X_mu->template mutable_data<ParamType, Context>();
auto* rsig = X_rsig->template mutable_data<ParamType, Context>();
auto* scale = X_scale->template mutable_data<ParamType, Context>();
auto* bias = X_bias->template mutable_data<ParamType, Context>();
auto* y = Output(0)->template mutable_data<InputType, Context>();
// Compute E(X) and E(X^2)
kernel::BatchNormExpectation(
N_,
C_,
S_,
ParamType(1) / (N_ * comm_size_ * S_),
data_format(),
x,
mu,
rsig,
ctx());
// Compute D(X) = E(X^2) - E(X)^2
ctx()->FinishDeviceComputation();
if (enable_nccl_) {
#ifdef USE_NCCL
auto nccl_comm_ = this->nccl_comm();
auto nccl_dtype_ = this->template nccl_dtype<ParamType>();
NCCL_CHECK(ncclAllReduce(
(void*)mu,
(void*)mu,
C_,
nccl_dtype_,
ncclSum,
nccl_comm_,
((CUDAContext*)ctx())->cuda_stream()));
NCCL_CHECK(ncclAllReduce(
(void*)rsig,
(void*)rsig,
C_,
nccl_dtype_,
ncclSum,
nccl_comm_,
((CUDAContext*)ctx())->cuda_stream()));
#endif
} else {
AllReduce(mu, mu, C_);
AllReduce(rsig, rsig, C_);
}
math::Square(C_, mu, y, ctx());
math::Sub(C_, rsig, y, rsig, ctx());
// Compute running statistics
if (is_recomputing_ == 0) {
// Running(X) = (1 - momentum) * Cur(X) + momentum * Running(X)
math::Axpby(C_, 1.f - momentum_, mu, momentum_, rm, ctx());
math::Axpby(C_, 1.f - momentum_, rsig, momentum_, rv, ctx());
}
// Fuse parameters along channel axis
// [mu, rsig, alpha, beta] => [scale, bias]
math::InvStd(C_, epsilon_, rsig, rsig, ctx());
math::Mul(C_, gamma, rsig, scale, ctx());
math::Mul(C_, scale, mu, bias, ctx());
math::Sub(C_, beta, bias, bias, ctx());
// Compute affine transformation
if (data_format() == "NCHW") {
kernel::ChannelAffine(N_, S_, C_, x, scale, bias, y, ctx());
} else if (data_format() == "NHWC") {
kernel::ChannelAffine(N_ * S_, 1, C_, x, scale, bias, y, ctx());
}
}
template <class Context>
void SyncBatchNormOp<Context>::RunOnDevice() {
DetermineBaseArguments();
// Get the recomputing flag
auto* flag = workspace()->GetTensor("/share/flag/recomputing");
is_recomputing_ = flag->template data<bool, CPUContext>()[0] ? 1 : 0;
// Dispatch the training or inference impl
Output(0)->ReshapeLike(Input(0));
if (Input(0).template IsType<float>()) {
if (is_training_ > 0) {
TrainingImpl<float, float>();
} else {
this->template InferenceImpl<float, float>();
}
} else {
LOG(FATAL) << MessageForUnsupported(
types::to_string(Input(0).meta()), {"float32"});
}
}
template <class Context>
template <typename InputType, typename ParamType>
void SyncBatchNormGradientOp<Context>::TrainingImpl() {
auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
auto *X_mu = Buffer("X_mu"), *X_rsig = Buffer("X_rsig");
auto *X_scale = Buffer("X_scale"), *X_bias = Buffer("X_bias");
auto* x = Input(0).template data<InputType, Context>();
auto* gamma = Input(1).template data<ParamType, Context>();
auto* dy = Input(4).template data<InputType, Context>();
auto* mu = X_mu->template data<ParamType, Context>();
auto* rsig = X_rsig->template data<ParamType, Context>();
auto* scale = X_scale->template mutable_data<ParamType, Context>();
auto* bias = X_bias->template mutable_data<ParamType, Context>();
auto* dgamma = dW->Reshape({C_})->template mutable_data<ParamType, Context>();
auto* dbeta = dB->Reshape({C_})->template mutable_data<ParamType, Context>();
// Gradient w.r.t. gamma and beta of local batch
kernel::BatchNormInternalGrad(
N_, C_, S_, data_format(), x, mu, rsig, gamma, dy, dgamma, dbeta, ctx());
// Gradient w.r.t. gamma and beta of global batch
ctx()->FinishDeviceComputation();
if (enable_nccl_) {
#ifdef USE_NCCL
auto nccl_comm_ = this->nccl_comm();
auto nccl_dtype_ = this->template nccl_dtype<ParamType>();
NCCL_CHECK(ncclAllReduce(
(void*)dgamma,
(void*)scale,
C_,
nccl_dtype_,
ncclSum,
nccl_comm_,
((CUDAContext*)ctx())->cuda_stream()));
NCCL_CHECK(ncclAllReduce(
(void*)dbeta,
(void*)bias,
C_,
nccl_dtype_,
ncclSum,
nccl_comm_,
((CUDAContext*)ctx())->cuda_stream()));
#endif
} else {
AllReduce(dgamma, scale, C_);
AllReduce(dbeta, bias, C_);
}
math::Scale(C_, ParamType(1) / comm_size_, scale, scale, ctx());
math::Scale(C_, ParamType(1) / comm_size_, bias, bias, ctx());
// Gradient w.r.t. input
kernel::BatchNormTrainingGrad(
N_,
C_,
S_,
data_format(),
x,
mu,
rsig,
gamma,
scale,
bias,
dy,
Output(0)->template mutable_data<InputType, Context>(),
ctx());
}
template <class Context>
void SyncBatchNormGradientOp<Context>::RunOnDevice() {
DetermineBaseArguments();
// Dispatch the training or inference impl
Output(0)->ReshapeLike(Input(0));
if (Input(0).template IsType<float>()) {
if (is_training_ > 0) {
TrainingImpl<float, float>();
} else {
this->template InferenceImpl<float, float>();
}
} else {
LOG(FATAL) << MessageForUnsupported(
types::to_string(Input(0).meta()), {"float32"});
}
}
DEPLOY_CPU_OPERATOR(SyncBatchNorm);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(SyncBatchNorm);
#endif
DEPLOY_CPU_OPERATOR(SyncBatchNormGradient);
#ifdef USE_CUDA #ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(SyncBatchNormGradient); REGISTER_CUDA_OPERATOR(SyncBatchNorm, BatchNormOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(SyncBatchNormGradient, BatchNormGradientOp<CUDAContext>);
#endif #endif
OPERATOR_SCHEMA(SyncBatchNorm) OPERATOR_SCHEMA(SyncBatchNorm)
......
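SyncBatchNorm and its gradient are now plain registry aliases of BatchNormOp/BatchNormGradientOp rather than separate operator classes. A toy, self-contained sketch of the aliasing idea using a hypothetical factory map (not Dragon's registry macros):

// Two operator names resolve to the same implementation; hypothetical registry.
#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <string>

struct Op { virtual ~Op() = default; virtual void Run() = 0; };
struct BatchNormOp : Op { void Run() override { std::cout << "BatchNormOp::Run\n"; } };

using Factory = std::function<std::unique_ptr<Op>()>;
std::map<std::string, Factory>& Registry() {
  static std::map<std::string, Factory> registry;
  return registry;
}

int main() {
  // Both names construct the same operator type.
  Registry()["BatchNorm"] = [] { return std::make_unique<BatchNormOp>(); };
  Registry()["SyncBatchNorm"] = [] { return std::make_unique<BatchNormOp>(); };
  Registry()["SyncBatchNorm"]()->Run();  // prints BatchNormOp::Run
  return 0;
}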