Commit 1ad360e9 by Ting PAN

Add tests of operator spec for AutoGraph

Summary:
This commit tests the correctness of the shape and data type inference
performed by the autograph module.
1 parent 1bd78a3c
Showing with 655 additions and 752 deletions
@@ -7,16 +7,16 @@ namespace kernel {
 namespace {
-template <typename Tx, typename Ty>
+template <typename InputT, typename OutputT>
 void _ChannelNormalize(
     const int axis,
     const int num_dims,
     const int64_t* x_strides,
     const int64_t* y_dims,
-    const Tx* x,
+    const InputT* x,
     const float* mean,
     const float* std,
-    Ty* y) {
+    OutputT* y) {
   const auto count =
       std::accumulate(y_dims, y_dims + num_dims, 1, std::multiplies<int64_t>());
   vec64_t idx(num_dims, 0);
@@ -27,7 +27,8 @@ void _ChannelNormalize(
       xi += idx[d] * x_strides[d];
       if (d == axis) wi = idx[d];
     }
-    y[yi] = ((Ty)x[xi] - (Ty)mean[wi]) / (Ty)std[wi];
+    y[yi] =
+        convert::To<OutputT>((convert::To<float>(x[xi]) - mean[wi]) / std[wi]);
     math::utils::IncreaseIndexInDims(num_dims, y_dims, idx.data());
   }
 }
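The rewritten kernel normalizes in float and converts once at the end, instead of casting every operand through the output type. A minimal standalone sketch of that pattern, with static_cast standing in for the library's convert::To helper (which additionally handles float16):

#include <cstdint>

// Accumulate in float, then cast a single time to the (possibly
// lower-precision) output type.
template <typename InputT, typename OutputT>
OutputT NormalizeValue(InputT x, float mean, float std) {
  return static_cast<OutputT>((static_cast<float>(x) - mean) / std);
}

// e.g. NormalizeValue<uint8_t, float>(200, 127.5f, 127.5f) == 0.5686...f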
@@ -36,83 +37,43 @@ void _ChannelNormalize(
 /* ------------------- Launcher Separator ------------------- */
-template <>
-void ChannelNormalize<float16, float16, CPUContext>(
-    const int axis,
-    const int num_dims,
-    const int64_t* x_strides,
-    const int64_t* y_dims,
-    const float16* x,
-    const float* mean,
-    const float* std,
-    float16* y,
-    CPUContext* ctx) {
-  CPU_FP16_NOT_SUPPORTED;
-}
-#define DEFINE_KERNEL_LAUNCHER(Tx, Ty) \
+#define DEFINE_KERNEL_LAUNCHER(InputT, OutputT) \
   template <> \
-  void ChannelNormalize<Tx, Ty, CPUContext>( \
+  void ChannelNormalize<InputT, OutputT, CPUContext>( \
       const int axis, \
      const int num_dims, \
      const int64_t* x_strides, \
      const int64_t* y_dims, \
-      const Tx* x, \
+      const InputT* x, \
      const float* mean, \
      const float* std, \
-      Ty* y, \
+      OutputT* y, \
      CPUContext* ctx) { \
    _ChannelNormalize(axis, num_dims, x_strides, y_dims, x, mean, std, y); \
  }
-#define DEFINE_FP16_KERNEL_LAUNCHER(T) \
-  template <> \
-  void ChannelNormalize<float16, T, CPUContext>( \
-      const int axis, \
-      const int num_dims, \
-      const int64_t* x_strides, \
-      const int64_t* y_dims, \
-      const float16* x, \
-      const float* mean, \
-      const float* std, \
-      T* y, \
-      CPUContext* ctx) { \
-    CPU_FP16_NOT_SUPPORTED; \
-  } \
-  template <> \
-  void ChannelNormalize<T, float16, CPUContext>( \
-      const int axis, \
-      const int num_dims, \
-      const int64_t* x_strides, \
-      const int64_t* y_dims, \
-      const T* x, \
-      const float* mean, \
-      const float* std, \
-      float16* y, \
-      CPUContext* ctx) { \
-    CPU_FP16_NOT_SUPPORTED; \
-  }
+DEFINE_KERNEL_LAUNCHER(int8_t, float16);
 DEFINE_KERNEL_LAUNCHER(int8_t, float);
 DEFINE_KERNEL_LAUNCHER(int8_t, double);
+DEFINE_KERNEL_LAUNCHER(uint8_t, float16);
 DEFINE_KERNEL_LAUNCHER(uint8_t, float);
 DEFINE_KERNEL_LAUNCHER(uint8_t, double);
+DEFINE_KERNEL_LAUNCHER(int, float16);
 DEFINE_KERNEL_LAUNCHER(int, float);
 DEFINE_KERNEL_LAUNCHER(int, double);
+DEFINE_KERNEL_LAUNCHER(int64_t, float16);
 DEFINE_KERNEL_LAUNCHER(int64_t, float);
 DEFINE_KERNEL_LAUNCHER(int64_t, double);
+DEFINE_KERNEL_LAUNCHER(float16, float16);
+DEFINE_KERNEL_LAUNCHER(float16, float);
+DEFINE_KERNEL_LAUNCHER(float16, double);
+DEFINE_KERNEL_LAUNCHER(float, float16);
 DEFINE_KERNEL_LAUNCHER(float, float);
 DEFINE_KERNEL_LAUNCHER(float, double);
+DEFINE_KERNEL_LAUNCHER(double, float16);
 DEFINE_KERNEL_LAUNCHER(double, float);
 DEFINE_KERNEL_LAUNCHER(double, double);
-DEFINE_FP16_KERNEL_LAUNCHER(int8_t);
-DEFINE_FP16_KERNEL_LAUNCHER(uint8_t);
-DEFINE_FP16_KERNEL_LAUNCHER(int);
-DEFINE_FP16_KERNEL_LAUNCHER(int64_t);
-DEFINE_FP16_KERNEL_LAUNCHER(float);
-DEFINE_FP16_KERNEL_LAUNCHER(double);
 #undef DEFINE_KERNEL_LAUNCHER
-#undef DEFINE_FP16_KERNEL_LAUNCHER
 } // namespace kernel
...
@@ -7,51 +7,51 @@ namespace kernel {
 namespace {
-template <typename LogitType, typename TargetType>
+template <typename LogitT, typename TargetT>
 void _NLLLoss(
     const int outer_dim,
     const int inner_dim,
     const int axis_dim,
     const int ignore_index,
-    const LogitType* logit,
-    const TargetType* target,
-    LogitType* loss,
-    LogitType* mask) {
+    const LogitT* logit,
+    const TargetT* target,
+    LogitT* loss,
+    LogitT* mask) {
   std::array<int, 2> idx = {0, 0};
   std::array<int, 2> dims = {outer_dim, inner_dim};
   int count = dims[0] * dims[1], k;
   for (int i = 0; i < count; ++i) {
     const int label = (int)target[i];
     if (label == ignore_index) {
-      loss[i] = mask[i] = LogitType(0);
+      loss[i] = mask[i] = LogitT(0);
     } else {
       k = (idx[0] * axis_dim + label) * inner_dim + idx[1];
-      loss[i] = -logit[k], mask[i] = LogitType(1);
+      loss[i] = -logit[k], mask[i] = LogitT(1);
     }
     math::utils::IncreaseIndexInDims(2, dims.data(), idx.data());
   }
 }
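For reference, _NLLLoss flattens an (outer, channel, inner) layout with C = axis_dim classes; for each position (o, i) it reads the logit of the target class and negates it:

$$\mathrm{loss}_{o,i} = -\,x_{o,\,t_{o,i},\,i}, \qquad k = (o \cdot C + t_{o,i}) \cdot \mathrm{inner\_dim} + i.$$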
-template <typename LogitType, typename TargetType>
+template <typename LogitT, typename TargetT>
 void _NLLLossGrad(
     const int outer_dim,
     const int inner_dim,
     const int axis_dim,
     const int ignore_index,
-    const LogitType* logit,
-    const TargetType* target,
-    LogitType* dlogit,
-    LogitType* mask) {
+    const LogitT* logit,
+    const TargetT* target,
+    LogitT* dlogit,
+    LogitT* mask) {
   std::array<int, 2> idx = {0, 0};
   std::array<int, 2> dims = {outer_dim, inner_dim};
   int count = dims[0] * dims[1], k;
   for (int i = 0; i < count; ++i) {
     const int label = (int)target[i];
     if (label == ignore_index) {
-      mask[i] = LogitType(0);
+      mask[i] = LogitT(0);
     } else {
       k = (idx[0] * axis_dim + label) * inner_dim + idx[1];
-      dlogit[k] = LogitType(-1), mask[i] = LogitType(1);
+      dlogit[k] = LogitT(-1), mask[i] = LogitT(1);
     }
     math::utils::IncreaseIndexInDims(2, dims.data(), idx.data());
   }
@@ -61,17 +61,17 @@ void _NLLLossGrad(
 /* ------------------- Launcher Separator ------------------- */
-#define DEFINE_KERNEL_LAUNCHER(name, LogitType, TargetType) \
+#define DEFINE_KERNEL_LAUNCHER(name, LogitT, TargetT) \
   template <> \
-  void name<LogitType, TargetType, CPUContext>( \
+  void name<LogitT, TargetT, CPUContext>( \
      const int outer_dim, \
      const int inner_dim, \
      const int axis_dim, \
      const int ignore_index, \
-      const LogitType* logit, \
-      const TargetType* target, \
-      LogitType* loss, \
-      LogitType* mask, \
+      const LogitT* logit, \
+      const TargetT* target, \
+      LogitT* loss, \
+      LogitT* mask, \
      CPUContext* ctx) { \
    _##name( \
        outer_dim, \
...
@@ -9,48 +9,48 @@ namespace kernel {
 namespace {
-template <typename LogitType, typename TargetType>
+template <typename LogitT, typename TargetT>
 __global__ void _NLLLoss(
     const int nthreads,
     const int inner_dim,
     const int axis_dim,
     const int ignore_index,
-    const LogitType* logit,
-    const TargetType* target,
-    LogitType* loss,
-    LogitType* mask) {
+    const LogitT* logit,
+    const TargetT* target,
+    LogitT* loss,
+    LogitT* mask) {
   CUDA_1D_KERNEL_LOOP(yi, nthreads) {
     const int i = yi / inner_dim;
     const int j = yi % inner_dim;
     const int label = target[i * inner_dim + j];
     if (label == ignore_index) {
-      loss[yi] = mask[yi] = LogitType(0);
+      loss[yi] = mask[yi] = LogitT(0);
     } else {
       loss[yi] = -logit[(i * axis_dim + label) * inner_dim + j];
-      mask[yi] = LogitType(1);
+      mask[yi] = LogitT(1);
     }
   }
 }
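Both kernels iterate with CUDA_1D_KERNEL_LOOP, the usual grid-stride idiom; a sketch assuming the conventional definition of the macro:

// Grid-stride loop: each thread starts at its global index and hops by
// the total number of launched threads until it passes n.
#define CUDA_1D_KERNEL_LOOP(i, n)                                \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);   \
       i += blockDim.x * gridDim.x)

This lets CUDA_BLOCKS(nthreads) cap the grid size while still covering every element.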
-template <typename LogitType, typename TargetType>
+template <typename LogitT, typename TargetT>
 __global__ void _NLLLossGrad(
     const int nthreads,
     const int inner_dim,
     const int axis_dim,
     const int ignore_index,
-    const LogitType* logit,
-    const TargetType* target,
-    LogitType* dlogit,
-    LogitType* mask) {
+    const LogitT* logit,
+    const TargetT* target,
+    LogitT* dlogit,
+    LogitT* mask) {
   CUDA_1D_KERNEL_LOOP(yi, nthreads) {
     const int i = yi / inner_dim;
     const int j = yi % inner_dim;
     const int label = target[i * inner_dim + j];
     if (label == ignore_index) {
-      mask[yi] = LogitType(0);
+      mask[yi] = LogitT(0);
     } else {
-      dlogit[(i * axis_dim + label) * inner_dim + j] = LogitType(-1);
-      mask[yi] = LogitType(1);
+      dlogit[(i * axis_dim + label) * inner_dim + j] = LogitT(-1);
+      mask[yi] = LogitT(1);
     }
   }
 }
@@ -59,17 +59,17 @@ __global__ void _NLLLossGrad(
 /* ------------------- Launcher Separator ------------------- */
-#define DEFINE_KERNEL_LAUNCHER(name, LogitType, TargetType) \
+#define DEFINE_KERNEL_LAUNCHER(name, LogitT, TargetT) \
   template <> \
-  void name<LogitType, TargetType, CUDAContext>( \
+  void name<LogitT, TargetT, CUDAContext>( \
      const int outer_dim, \
      const int inner_dim, \
      const int axis_dim, \
      const int ignore_index, \
-      const LogitType* logit, \
-      const TargetType* target, \
-      LogitType* loss, \
-      LogitType* mask, \
+      const LogitT* logit, \
+      const TargetT* target, \
+      LogitT* loss, \
+      LogitT* mask, \
      CUDAContext* ctx) { \
    const auto nthreads = outer_dim * inner_dim; \
    _##name<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
...
@@ -7,19 +7,19 @@ namespace kernel {
 namespace {
-template <typename LogitType, typename TargetType>
+template <typename LogitT, typename TargetT>
 void _SigmoidFocalLoss(
     const int outer_dim,
     const int inner_dim,
     const int axis_dim,
-    const LogitType pos_alpha,
-    const LogitType neg_alpha,
-    const LogitType gamma,
+    const LogitT pos_alpha,
+    const LogitT neg_alpha,
+    const LogitT gamma,
     const int negative_index,
-    const LogitType* logit,
-    const TargetType* target,
-    LogitType* loss,
-    LogitType* mask) {
+    const LogitT* logit,
+    const TargetT* target,
+    LogitT* loss,
+    LogitT* mask) {
   std::array<int, 3> idx = {0, 0, 0};
   std::array<int, 3> dims = {outer_dim, axis_dim, inner_dim};
   const int count = dims[0] * dims[1] * dims[2];
@@ -27,23 +27,21 @@ void _SigmoidFocalLoss(
   for (int i = 0; i < count; ++i) {
     const int t = (int)target[idx[0] * inner_dim + idx[2]];
     // "0" is reserved for target if negative index is zero
-    LogitType c1 = (LogitType)(t == (idx[1] + (negative_index ? 0 : 1)));
-    LogitType c2 =
-        (LogitType)((t >= 0) & (t != (idx[1] + (negative_index ? 0 : 1))));
-    LogitType p = LogitType(1) / (LogitType(1) + std::exp(-logit[i]));
+    LogitT c1 = (LogitT)(t == (idx[1] + (negative_index ? 0 : 1)));
+    LogitT c2 = (LogitT)((t >= 0) & (t != (idx[1] + (negative_index ? 0 : 1))));
+    LogitT p = LogitT(1) / (LogitT(1) + std::exp(-logit[i]));
     // (1 - p)^{gamma} * log(p)
-    LogitType pos_term = std::pow(LogitType(1) - p, gamma) *
-        std::log(std::max(p, (LogitType)FLT_MIN));
+    LogitT pos_term =
+        std::pow(LogitT(1) - p, gamma) * std::log(std::max(p, (LogitT)FLT_MIN));
     // p^{gamma} * log(1 - p)
-    LogitType neg_term = std::pow(p, gamma) *
+    LogitT neg_term = std::pow(p, gamma) *
         (-logit[i] * (logit[i] >= 0) -
         std::log(
-             LogitType(1) +
-             std::exp(logit[i] - 2 * logit[i] * (logit[i] >= 0))));
+             LogitT(1) + std::exp(logit[i] - 2 * logit[i] * (logit[i] >= 0))));
-    loss[i] = LogitType(0);
+    loss[i] = LogitT(0);
     loss[i] += -c1 * pos_term * pos_alpha;
     loss[i] += -c2 * neg_term * neg_alpha;
     mask[i] = c1;
@@ -52,19 +50,19 @@ void _SigmoidFocalLoss(
   }
 }
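The neg_term expression is the standard overflow-safe form of log(1 - p) for p = sigmoid(z); with m = [z >= 0], it keeps the argument of exp non-positive:

$$\log(1-\sigma(z)) = -\log\left(1+e^{z}\right) = -z\,m - \log\!\left(1 + e^{\,z-2zm}\right), \qquad m = [z \ge 0].$$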
-template <typename LogitType, typename TargetType>
+template <typename LogitT, typename TargetT>
 void _SigmoidFocalLossGrad(
     const int outer_dim,
     const int inner_dim,
     const int axis_dim,
-    const LogitType pos_alpha,
-    const LogitType neg_alpha,
-    const LogitType gamma,
+    const LogitT pos_alpha,
+    const LogitT neg_alpha,
+    const LogitT gamma,
     const int negative_index,
-    const LogitType* logit,
-    const TargetType* target,
-    LogitType* dx,
-    LogitType* mask) {
+    const LogitT* logit,
+    const TargetT* target,
+    LogitT* dx,
+    LogitT* mask) {
   std::array<int, 3> idx = {0, 0, 0};
   std::array<int, 3> dims = {outer_dim, axis_dim, inner_dim};
   const int count = dims[0] * dims[1] * dims[2];
@@ -72,26 +70,24 @@ void _SigmoidFocalLossGrad(
   for (int i = 0; i < count; ++i) {
     const int t = (int)target[idx[0] * inner_dim + idx[2]];
     // "0" is reserved for target if negative index is zero
-    LogitType c1 = (LogitType)(t == (idx[1] + (negative_index ? 0 : 1)));
-    LogitType c2 =
-        (LogitType)((t >= 0) & (t != (idx[1] + (negative_index ? 0 : 1))));
-    LogitType p = LogitType(1) / (LogitType(1) + std::exp(-logit[i]));
+    LogitT c1 = (LogitT)(t == (idx[1] + (negative_index ? 0 : 1)));
+    LogitT c2 = (LogitT)((t >= 0) & (t != (idx[1] + (negative_index ? 0 : 1))));
+    LogitT p = LogitT(1) / (LogitT(1) + std::exp(-logit[i]));
     // (1 - p)^{gamma} * (1 - p - gamma * p * log(p))
-    LogitType pos_term = std::pow(LogitType(1) - p, gamma) *
-        (LogitType(1) - p -
-         p * gamma * std::log(std::max(p, (LogitType)FLT_MIN)));
+    LogitT pos_term = std::pow(LogitT(1) - p, gamma) *
+        (LogitT(1) - p - p * gamma * std::log(std::max(p, (LogitT)FLT_MIN)));
     // p^{gamma} * (gamma * (1 - p) * log(1-p) - p)
-    LogitType neg_term = std::pow(p, gamma) *
+    LogitT neg_term = std::pow(p, gamma) *
         ((-logit[i] * (logit[i] >= 0) -
         std::log(
-             LogitType(1) +
-             std::exp(logit[i] - LogitType(2) * logit[i] * (logit[i] >= 0)))) *
+             LogitT(1) +
+             std::exp(logit[i] - LogitT(2) * logit[i] * (logit[i] >= 0)))) *
            (1 - p) * gamma -
        p);
-    dx[i] = LogitType(0);
+    dx[i] = LogitT(0);
     dx[i] += -c1 * pos_term * pos_alpha;
     dx[i] += -c2 * neg_term * neg_alpha;
     mask[i] = c1;
@@ -104,9 +100,9 @@ void _SigmoidFocalLossGrad(
 /* ------------------- Launcher Separator ------------------- */
-#define DEFINE_KERNEL_LAUNCHER(name, LogitType, TargetType) \
+#define DEFINE_KERNEL_LAUNCHER(name, LogitT, TargetT) \
   template <> \
-  void name<LogitType, TargetType, CPUContext>( \
+  void name<LogitT, TargetT, CPUContext>( \
      const int outer_dim, \
      const int inner_dim, \
      const int axis_dim, \
@@ -114,18 +110,18 @@ void _SigmoidFocalLossGrad(
      const float neg_alpha, \
      const float gamma, \
      const int negative_index, \
-      const LogitType* logit, \
-      const TargetType* target, \
-      LogitType* loss, \
-      LogitType* mask, \
+      const LogitT* logit, \
+      const TargetT* target, \
+      LogitT* loss, \
+      LogitT* mask, \
      CPUContext* ctx) { \
    _##name( \
        outer_dim, \
        inner_dim, \
        axis_dim, \
-        (LogitType)pos_alpha, \
-        (LogitType)neg_alpha, \
-        (LogitType)gamma, \
+        (LogitT)pos_alpha, \
+        (LogitT)neg_alpha, \
+        (LogitT)gamma, \
        negative_index, \
        logit, \
        target, \
...
@@ -9,19 +9,19 @@ namespace kernel {
 namespace {
-template <typename LogitType, typename TargetType>
+template <typename LogitT, typename TargetT>
 __global__ void _SigmoidFocalLoss(
     const int nthreads,
     const int inner_dim,
     const int axis_dim,
-    const LogitType pos_alpha,
-    const LogitType neg_alpha,
-    const LogitType gamma,
+    const LogitT pos_alpha,
+    const LogitT neg_alpha,
+    const LogitT gamma,
     const int negative_index,
-    const LogitType* logit,
-    const TargetType* target,
-    LogitType* loss,
-    LogitType* mask) {
+    const LogitT* logit,
+    const TargetT* target,
+    LogitT* loss,
+    LogitT* mask) {
   CUDA_1D_KERNEL_LOOP(yi, nthreads) {
     const int j = yi % inner_dim;
     const int k = (yi / inner_dim) % axis_dim;
@@ -29,40 +29,39 @@ __global__ void _SigmoidFocalLoss(
     const int t = target[i * inner_dim + j];
     // "0" is reserved for target if negative index is zero
-    LogitType c1 = (LogitType)(t == (k + (negative_index ? 0 : 1)));
-    LogitType c2 =
-        (LogitType)((t >= 0) & (t != (k + (negative_index ? 0 : 1))));
-    LogitType p = LogitType(1) / (LogitType(1) + exp(-logit[yi]));
+    LogitT c1 = (LogitT)(t == (k + (negative_index ? 0 : 1)));
+    LogitT c2 = (LogitT)((t >= 0) & (t != (k + (negative_index ? 0 : 1))));
+    LogitT p = LogitT(1) / (LogitT(1) + exp(-logit[yi]));
     // (1 - p)^{gamma} * log(p)
-    LogitType pos_term = pow(LogitType(1) - p, gamma) * log(max(p, FLT_MIN));
+    LogitT pos_term = pow(LogitT(1) - p, gamma) * log(max(p, FLT_MIN));
     // p^{gamma} * log(1 - p)
-    LogitType neg_term = pow(p, gamma) *
+    LogitT neg_term = pow(p, gamma) *
         (-logit[yi] * (logit[yi] >= 0) -
-         log(LogitType(1) +
-             exp(logit[yi] - LogitType(2) * logit[yi] * (logit[yi] >= 0))));
-    loss[yi] = LogitType(0);
+         log(LogitT(1) +
+             exp(logit[yi] - LogitT(2) * logit[yi] * (logit[yi] >= 0))));
+    loss[yi] = LogitT(0);
     loss[yi] += -c1 * pos_term * pos_alpha;
     loss[yi] += -c2 * neg_term * neg_alpha;
     mask[yi] = c1;
   }
 }
-template <typename LogitType, typename TargetType>
+template <typename LogitT, typename TargetT>
 __global__ void _SigmoidFocalLossGrad(
     const int nthreads,
     const int inner_dim,
     const int axis_dim,
-    const LogitType pos_alpha,
-    const LogitType neg_alpha,
-    const LogitType gamma,
+    const LogitT pos_alpha,
+    const LogitT neg_alpha,
+    const LogitT gamma,
     const int negative_index,
-    const LogitType* logit,
-    const TargetType* target,
-    LogitType* dx,
-    LogitType* mask) {
+    const LogitT* logit,
+    const TargetT* target,
+    LogitT* dx,
+    LogitT* mask) {
   CUDA_1D_KERNEL_LOOP(xi, nthreads) {
     const int j = xi % inner_dim;
     const int k = (xi / inner_dim) % axis_dim;
@@ -70,24 +69,23 @@ __global__ void _SigmoidFocalLossGrad(
     const int t = target[i * inner_dim + j];
     // "0" is reserved for target if neg index is zero
-    LogitType c1 = (LogitType)(t == (k + (negative_index ? 0 : 1)));
-    LogitType c2 =
-        (LogitType)((t >= 0) & (t != (k + (negative_index ? 0 : 1))));
-    LogitType p = LogitType(1) / (LogitType(1) + exp(-logit[xi]));
+    LogitT c1 = (LogitT)(t == (k + (negative_index ? 0 : 1)));
+    LogitT c2 = (LogitT)((t >= 0) & (t != (k + (negative_index ? 0 : 1))));
+    LogitT p = LogitT(1) / (LogitT(1) + exp(-logit[xi]));
     // (1 - p)^{gamma} * (1 - p - gamma * p * log(p))
-    LogitType pos_term = pow(LogitType(1) - p, gamma) *
-        (LogitType(1) - p - p * gamma * log(max(p, FLT_MIN)));
+    LogitT pos_term = pow(LogitT(1) - p, gamma) *
+        (LogitT(1) - p - p * gamma * log(max(p, FLT_MIN)));
     // p^{gamma} * (gamma * (1 - p) * log(1-p) - p)
-    LogitType neg_term = pow(p, gamma) *
+    LogitT neg_term = pow(p, gamma) *
         ((-logit[xi] * (logit[xi] >= 0) -
-         log(LogitType(1) +
-             exp(logit[xi] - LogitType(2) * logit[xi] * (logit[xi] >= 0)))) *
-            (LogitType(1) - p) * gamma -
+         log(LogitT(1) +
+             exp(logit[xi] - LogitT(2) * logit[xi] * (logit[xi] >= 0)))) *
+            (LogitT(1) - p) * gamma -
            p);
-    dx[xi] = LogitType(0);
+    dx[xi] = LogitT(0);
     dx[xi] += -c1 * pos_term * pos_alpha;
     dx[xi] += -c2 * neg_term * neg_alpha;
     mask[xi] = c1;
@@ -98,9 +96,9 @@ __global__ void _SigmoidFocalLossGrad(
 /* ------------------- Launcher Separator ------------------- */
-#define DEFINE_KERNEL_LAUNCHER(name, LogitType, TargetType) \
+#define DEFINE_KERNEL_LAUNCHER(name, LogitT, TargetT) \
   template <> \
-  void name<LogitType, TargetType, CUDAContext>( \
+  void name<LogitT, TargetT, CUDAContext>( \
      const int outer_dim, \
      const int inner_dim, \
      const int axis_dim, \
@@ -108,19 +106,19 @@ __global__ void _SigmoidFocalLossGrad(
      const float neg_alpha, \
      const float gamma, \
      const int negative_index, \
-      const LogitType* logit, \
-      const TargetType* target, \
-      LogitType* loss, \
-      LogitType* mask, \
+      const LogitT* logit, \
+      const TargetT* target, \
+      LogitT* loss, \
+      LogitT* mask, \
      CUDAContext* ctx) { \
    const auto nthreads = outer_dim * axis_dim * inner_dim; \
    _##name<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
        nthreads, \
        inner_dim, \
        axis_dim, \
-        (LogitType)pos_alpha, \
-        (LogitType)neg_alpha, \
-        (LogitType)gamma, \
+        (LogitT)pos_alpha, \
+        (LogitT)neg_alpha, \
+        (LogitT)gamma, \
        negative_index, \
        logit, \
        target, \
...
@@ -7,58 +7,58 @@ namespace kernel {
 namespace {
-template <typename LogitType, typename TargetType>
+template <typename LogitT, typename TargetT>
 void _SparseSoftmaxCrossEntropy(
     const int outer_dim,
     const int inner_dim,
     const int axis_dim,
     const int ignore_index,
-    const LogitType* prob,
-    const TargetType* target,
-    LogitType* loss,
-    LogitType* mask) {
+    const LogitT* prob,
+    const TargetT* target,
+    LogitT* loss,
+    LogitT* mask) {
   std::array<int, 2> idx = {0, 0};
   std::array<int, 2> dims = {outer_dim, inner_dim};
   int count = dims[0] * dims[1], k;
   for (int i = 0; i < count; ++i) {
     const int label = (int)target[i];
     if (label == ignore_index) {
-      loss[i] = mask[i] = LogitType(0);
+      loss[i] = mask[i] = LogitT(0);
     } else {
       k = (idx[0] * axis_dim + label) * inner_dim + idx[1];
-      loss[i] = -std::log(std::max(prob[k], LogitType(FLT_MIN)));
-      mask[i] = LogitType(1);
+      loss[i] = -std::log(std::max(prob[k], LogitT(FLT_MIN)));
+      mask[i] = LogitT(1);
     }
     math::utils::IncreaseIndexInDims(2, dims.data(), idx.data());
   }
 }
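The max(prob[k], FLT_MIN) clamp bounds the loss when the softmax output underflows to zero for the target class:

$$\mathrm{loss}_i = -\log\max\!\left(p_k,\ \mathrm{FLT\_MIN}\right), \qquad -\log(\mathrm{FLT\_MIN}) \approx -\log\!\left(1.18 \times 10^{-38}\right) \approx 87.3,$$

so an impossible target yields a large finite loss rather than inf.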
-template <typename LogitType, typename TargetType>
+template <typename LogitT, typename TargetT>
 void _SparseSoftmaxCrossEntropyGrad(
     const int outer_dim,
     const int inner_dim,
     const int axis_dim,
     const int ignore_index,
-    const LogitType* prob,
-    const TargetType* target,
-    LogitType* dx,
-    LogitType* mask) {
+    const LogitT* prob,
+    const TargetT* target,
+    LogitT* dx,
+    LogitT* mask) {
   std::array<int, 2> idx = {0, 0};
   std::array<int, 2> dims = {outer_dim, inner_dim};
   int count = dims[0] * dims[1], k;
   for (int i = 0; i < count; ++i) {
     const int label = (int)target[i];
     if (label == ignore_index) {
-      LogitType* offset_dx = dx + idx[0] * axis_dim * inner_dim + idx[1];
+      LogitT* offset_dx = dx + idx[0] * axis_dim * inner_dim + idx[1];
       for (int j = 0; j < axis_dim; ++j) {
-        (*offset_dx) = LogitType(0);
+        (*offset_dx) = LogitT(0);
         offset_dx += inner_dim;
       }
-      mask[i] = LogitType(0);
+      mask[i] = LogitT(0);
     } else {
       k = (idx[0] * axis_dim + label) * inner_dim + idx[1];
-      dx[k] -= LogitType(1);
-      mask[i] = LogitType(1);
+      dx[k] -= LogitT(1);
+      mask[i] = LogitT(1);
     }
     math::utils::IncreaseIndexInDims(2, dims.data(), idx.data());
   }
@@ -68,17 +68,17 @@ void _SparseSoftmaxCrossEntropyGrad(
 /* ------------------- Launcher Separator ------------------- */
-#define DEFINE_KERNEL_LAUNCHER(name, LogitType, TargetType) \
+#define DEFINE_KERNEL_LAUNCHER(name, LogitT, TargetT) \
   template <> \
-  void name<LogitType, TargetType, CPUContext>( \
+  void name<LogitT, TargetT, CPUContext>( \
      const int outer_dim, \
      const int inner_dim, \
      const int axis_dim, \
      const int ignore_index, \
-      const LogitType* prob, \
-      const TargetType* target, \
-      LogitType* loss, \
-      LogitType* mask, \
+      const LogitT* prob, \
+      const TargetT* target, \
+      LogitT* loss, \
+      LogitT* mask, \
      CPUContext* ctx) { \
    _##name( \
        outer_dim, \
...
@@ -9,54 +9,54 @@ namespace kernel {
 namespace {
-template <typename LogitType, typename TargetType>
+template <typename LogitT, typename TargetT>
 __global__ void _SparseSoftmaxCrossEntropy(
     const int nthreads,
     const int inner_dim,
     const int axis_dim,
     const int ignore_index,
-    const LogitType* prob,
-    const TargetType* target,
-    LogitType* loss,
-    LogitType* mask) {
+    const LogitT* prob,
+    const TargetT* target,
+    LogitT* loss,
+    LogitT* mask) {
   CUDA_1D_KERNEL_LOOP(yi, nthreads) {
     const int i = yi / inner_dim;
     const int j = yi % inner_dim;
     const int label = target[i * inner_dim + j];
     if (label == ignore_index) {
-      loss[yi] = mask[yi] = LogitType(0);
+      loss[yi] = mask[yi] = LogitT(0);
     } else {
-      loss[yi] = -log(max(
-          prob[(i * axis_dim + label) * inner_dim + j], LogitType(FLT_MIN)));
-      mask[yi] = LogitType(1);
+      loss[yi] = -log(
+          max(prob[(i * axis_dim + label) * inner_dim + j], LogitT(FLT_MIN)));
+      mask[yi] = LogitT(1);
     }
   }
 }
-template <typename LogitType, typename TargetType>
+template <typename LogitT, typename TargetT>
 __global__ void _SparseSoftmaxCrossEntropyGrad(
     const int nthreads,
     const int inner_dim,
     const int axis_dim,
     const int ignore_index,
-    const LogitType* prob,
-    const TargetType* target,
-    LogitType* dx,
-    LogitType* mask) {
+    const LogitT* prob,
+    const TargetT* target,
+    LogitT* dx,
+    LogitT* mask) {
   CUDA_1D_KERNEL_LOOP(yi, nthreads) {
     const int i = yi / inner_dim;
     const int j = yi % inner_dim;
     const int label = target[i * inner_dim + j];
     if (label == ignore_index) {
-      LogitType* offset_dx = dx + i * axis_dim * inner_dim + j;
+      LogitT* offset_dx = dx + i * axis_dim * inner_dim + j;
       for (int k = 0; k < axis_dim; ++k) {
-        (*offset_dx) = LogitType(0);
+        (*offset_dx) = LogitT(0);
         offset_dx += inner_dim;
       }
-      mask[yi] = LogitType(0);
+      mask[yi] = LogitT(0);
     } else {
-      dx[(i * axis_dim + label) * inner_dim + j] -= LogitType(1);
-      mask[yi] = LogitType(1);
+      dx[(i * axis_dim + label) * inner_dim + j] -= LogitT(1);
+      mask[yi] = LogitT(1);
     }
   }
 }
@@ -65,17 +65,17 @@ __global__ void _SparseSoftmaxCrossEntropyGrad(
 /* ------------------- Launcher Separator ------------------- */
-#define DEFINE_KERNEL_LAUNCHER(name, LogitType, TargetType) \
+#define DEFINE_KERNEL_LAUNCHER(name, LogitT, TargetT) \
   template <> \
-  void name<LogitType, TargetType, CUDAContext>( \
+  void name<LogitT, TargetT, CUDAContext>( \
      const int outer_dim, \
      const int inner_dim, \
      const int axis_dim, \
      const int ignore_index, \
-      const LogitType* prob, \
-      const TargetType* target, \
-      LogitType* loss, \
-      LogitType* mask, \
+      const LogitT* prob, \
+      const TargetT* target, \
+      LogitT* loss, \
+      LogitT* mask, \
      CUDAContext* ctx) { \
    const auto nthreads = outer_dim * inner_dim; \
    _##name<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
...
 #ifdef USE_CUDA
 #include "dragon/core/context_cuda.h"
+#include "dragon/utils/conversions.h"
 #include "dragon/utils/op_kernels.h"
 namespace dragon {
@@ -9,125 +10,34 @@ namespace kernel {
 namespace {
-template <typename T>
+template <typename T, typename AccT>
 __global__ void
-_Clip(const int nthreads, const T low, const T high, const T* x, T* y) {
+_Clip(const int nthreads, const AccT low, const AccT high, const T* x, T* y) {
   CUDA_1D_KERNEL_LOOP(i, nthreads) {
-    y[i] = max(low, min(x[i], high));
+    y[i] = convert::To<T>(max(low, min(convert::To<AccT>(x[i]), high)));
   }
 }
-template <>
-__global__ void _Clip<half>(
-    const int nthreads,
-    const half low,
-    const half high,
-    const half* x,
-    half* y) {
-#if __CUDA_ARCH__ >= 530
-  CUDA_1D_KERNEL_LOOP(i, nthreads) {
-    y[i] = __hlt(__ldg(x + i), high)
-        ? (__hgt(__ldg(x + i), low) ? __ldg(x + i) : low)
-        : high;
-  }
-#else
-  const float kLow = __half2float(low);
-  const float kHigh = __half2float(high);
-  CUDA_1D_KERNEL_LOOP(i, nthreads) {
-    y[i] = __float2half(max(kLow, min(__half2float(x[i]), kHigh)));
-  }
-#endif
-}
-template <typename T>
+template <typename T, typename AccT>
 __global__ void _ClipGrad(
     const int nthreads,
-    const T low,
-    const T high,
+    const AccT low,
+    const AccT high,
     const T* dy,
     const T* x,
     T* dx) {
+  const T kZero = convert::To<T>(0.f);
   CUDA_1D_KERNEL_LOOP(i, nthreads) {
-#if __CUDA_ARCH__ >= 350
-    dx[i] = __ldg(x + i) < low || __ldg(x + i) > high ? T(0) : dy[i];
-#else
-    dx[i] = x[i] < low || x[i] > high ? T(0) : dy[i];
-#endif
+    const AccT val = convert::To<AccT>(x[i]);
+    dx[i] = val < low || val > high ? kZero : dy[i];
   }
 }
-template <>
-__global__ void _ClipGrad<half>(
-    const int nthreads,
-    const half low,
-    const half high,
-    const half* dy,
-    const half* x,
-    half* dx) {
-  const half kZero = __float2half(0.f);
-#if __CUDA_ARCH__ >= 530
-  CUDA_1D_KERNEL_LOOP(i, nthreads) {
-    dx[i] =
-        (__hlt(__ldg(x + i), low) || __hgt(__ldg(x + i), high)) ? kZero : dy[i];
-  }
-#elif __CUDA_ARCH__ >= 350
-  const float kLow = __half2float(low);
-  const float kHigh = __half2float(high);
-  CUDA_1D_KERNEL_LOOP(i, nthreads) {
-    dx[i] = (__half2float(__ldg(x + i)) < kLow ||
-             __half2float(__ldg(x + i)) > kHigh)
-        ? kZero
-        : dy[i];
-  }
-#else
-  const float kLow = __half2float(low);
-  const float kHigh = __half2float(high);
-  CUDA_1D_KERNEL_LOOP(i, nthreads) {
-    dx[i] = (__half2float(x[i]) < kLow || __half2float(x[i]) > kHigh) ? kZero
-                                                                      : dy[i];
-  }
-#endif
-}
 } // namespace
 /* ------------------- Launcher Separator ------------------- */
-template <>
-void Clip<float16, CUDAContext>(
-    const int count,
-    const float low,
-    const float high,
-    const float16* x,
-    float16* y,
-    CUDAContext* ctx) {
-  _Clip<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
-      count,
-      convert::To<half>(low),
-      convert::To<half>(high),
-      reinterpret_cast<const half*>(x),
-      reinterpret_cast<half*>(y));
-}
-template <>
-void ClipGrad<float16, CUDAContext>(
-    const int count,
-    const float low,
-    const float high,
-    const float16* dy,
-    const float16* x,
-    float16* dx,
-    CUDAContext* ctx) {
-  _ClipGrad<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
-      count,
-      convert::To<half>(low),
-      convert::To<half>(high),
-      reinterpret_cast<const half*>(dy),
-      reinterpret_cast<const half*>(x),
-      reinterpret_cast<half*>(dx));
-} // ClipGrad
-#define DEFINE_KERNEL_LAUNCHER(T) \
+#define DEFINE_KERNEL_LAUNCHER(T, AccT) \
   template <> \
   void Clip<T, CUDAContext>( \
      const int count, \
@@ -136,11 +46,12 @@ void ClipGrad<float16, CUDAContext>(
      const T* x, \
      T* y, \
      CUDAContext* ctx) { \
-    _Clip<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
-        count, convert::To<T>(low), convert::To<T>(high), x, y); \
+    _Clip<T, AccT> \
+        <<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
+            count, low, high, x, y); \
  }
-#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
+#define DEFINE_GRAD_KERNEL_LAUNCHER(T, AccT) \
   template <> \
   void ClipGrad<T, CUDAContext>( \
      const int count, \
@@ -150,18 +61,21 @@ void ClipGrad<float16, CUDAContext>(
      const T* x, \
      T* dx, \
      CUDAContext* ctx) { \
-    _ClipGrad<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
-        count, convert::To<T>(low), convert::To<T>(high), dy, x, dx); \
+    _ClipGrad<T, AccT> \
+        <<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
+            count, low, high, dy, x, dx); \
  }
-DEFINE_KERNEL_LAUNCHER(int8_t);
-DEFINE_KERNEL_LAUNCHER(uint8_t);
-DEFINE_KERNEL_LAUNCHER(int);
-DEFINE_KERNEL_LAUNCHER(int64_t);
-DEFINE_KERNEL_LAUNCHER(float);
-DEFINE_KERNEL_LAUNCHER(double);
-DEFINE_GRAD_KERNEL_LAUNCHER(float);
-DEFINE_GRAD_KERNEL_LAUNCHER(double);
+DEFINE_KERNEL_LAUNCHER(int8_t, int8_t);
+DEFINE_KERNEL_LAUNCHER(uint8_t, uint8_t);
+DEFINE_KERNEL_LAUNCHER(int, int);
+DEFINE_KERNEL_LAUNCHER(int64_t, int64_t);
+DEFINE_KERNEL_LAUNCHER(float16, float);
+DEFINE_KERNEL_LAUNCHER(float, float);
+DEFINE_KERNEL_LAUNCHER(double, double);
+DEFINE_GRAD_KERNEL_LAUNCHER(float16, float);
+DEFINE_GRAD_KERNEL_LAUNCHER(float, float);
+DEFINE_GRAD_KERNEL_LAUNCHER(double, double);
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
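With the AccT parameter, the generic kernels now subsume the hand-written half specializations: values are compared in the accumulator type and converted back to storage. A host-side sketch of the per-element logic, with static_cast standing in for convert::To:

#include <algorithm>

// Clip x in the accumulator type AccT, then convert back to the storage
// type T. With T = half and AccT = float this matches what the old
// __CUDA_ARCH__ < 530 fallback did by hand with __half2float.
template <typename T, typename AccT>
T ClipValue(T x, AccT low, AccT high) {
  const AccT v = static_cast<AccT>(x);
  return static_cast<T>(std::max(low, std::min(v, high)));
}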
...
@@ -20,15 +20,15 @@ void _RowwiseMoments(
 #pragma omp parallel for num_threads(OMP_THREADS(cols))
 #endif
   for (int i = 0; i < cols; ++i) {
-    T x_val;
-    AccT m_val = AccT(0), v_val = AccT(0), mu;
+    AccT x_val, m_val = AccT(0), v_val = AccT(0);
     for (int j = 0; j < rows; ++j) {
-      x_val = x[j * cols + i];
+      x_val = convert::To<AccT>(x[j * cols + i]);
       m_val += x_val;
       v_val += x_val * x_val;
     }
-    mean[i] = mu = m_val * scale;
-    var[i] = v_val * scale - mu * mu;
+    m_val *= scale;
+    mean[i] = m_val;
+    var[i] = v_val * scale - m_val * m_val;
   }
 }
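All three reductions use the one-pass moment identity with scale = 1/n, so only the running sums of x and x squared are kept:

$$\mu = \frac{1}{n}\sum_i x_i, \qquad \sigma^2 = \frac{1}{n}\sum_i x_i^2 - \mu^2.$$

Accumulating in AccT (float for the low-precision inputs) limits the cancellation error this form is prone to.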
@@ -44,15 +44,15 @@ void _ColwiseMoments(
 #pragma omp parallel for num_threads(OMP_THREADS(rows))
 #endif
   for (int i = 0; i < rows; ++i) {
-    T x_val;
-    AccT m_val = AccT(0), v_val = AccT(0), mu;
+    AccT x_val, m_val = AccT(0), v_val = AccT(0);
     for (int j = 0; j < cols; ++j) {
-      x_val = x[i * cols + j];
+      x_val = convert::To<AccT>(x[i * cols + j]);
       m_val += x_val;
       v_val += x_val * x_val;
     }
-    mean[i] = mu = m_val * scale;
-    var[i] = v_val * scale - mu * mu;
+    m_val *= scale;
+    mean[i] = m_val;
+    var[i] = v_val * scale - m_val * m_val;
   }
 }
@@ -71,8 +71,7 @@ void _GenericMoments(
 #pragma omp parallel for num_threads(OMP_THREADS(rows))
 #endif
   for (int i = 0; i < rows; ++i) {
-    T x_val;
-    AccT m_val = AccT(0), v_val = AccT(0), mu;
+    AccT x_val, m_val = AccT(0), v_val = AccT(0);
     int xi, c, r;
     for (int j = 0; j < cols; ++j) {
       xi = 0;
@@ -81,12 +80,13 @@ void _GenericMoments(
        FIXED_DIVISOR_DIV_MOD(x_dims[d], c, &c, &r);
        xi += r * x_strides[d];
      }
-      x_val = x[xi];
+      x_val = convert::To<AccT>(x[xi]);
       m_val += x_val;
       v_val += x_val * x_val;
     }
-    mean[i] = mu = m_val * scale;
-    var[i] = v_val * scale - mu * mu;
+    m_val *= scale;
+    mean[i] = m_val;
+    var[i] = v_val * scale - m_val * m_val;
   }
 }
@@ -148,19 +148,6 @@ void _Moments(
 /* ------------------- Launcher Separator ------------------- */
-template <>
-void Moments<float16, float, CPUContext>(
-    const int num_dims,
-    const int* dims,
-    const int num_axes,
-    const int* axes,
-    const float16* x,
-    float* mean,
-    float* var,
-    CPUContext* ctx) {
-  CPU_FP16_NOT_SUPPORTED;
-}
 #define DEFINE_KERNEL_LAUNCHER(T, AccT) \
   template <> \
   void Moments<T, AccT, CPUContext>( \
@@ -178,7 +165,8 @@ void Moments<float16, float, CPUContext>(
 DEFINE_KERNEL_LAUNCHER(int8_t, float);
 DEFINE_KERNEL_LAUNCHER(uint8_t, float);
 DEFINE_KERNEL_LAUNCHER(int, float);
-DEFINE_KERNEL_LAUNCHER(int64_t, float);
+DEFINE_KERNEL_LAUNCHER(int64_t, double);
+DEFINE_KERNEL_LAUNCHER(float16, float);
 DEFINE_KERNEL_LAUNCHER(float, float);
 DEFINE_KERNEL_LAUNCHER(double, double);
 #undef DEFINE_KERNEL_LAUNCHER
...
@@ -201,7 +201,7 @@ void _Moments(
 DEFINE_KERNEL_LAUNCHER(int8_t, int8_t, float);
 DEFINE_KERNEL_LAUNCHER(uint8_t, uint8_t, float);
 DEFINE_KERNEL_LAUNCHER(int, int, float);
-DEFINE_KERNEL_LAUNCHER(int64_t, int64_t, float);
+DEFINE_KERNEL_LAUNCHER(int64_t, int64_t, double);
 DEFINE_KERNEL_LAUNCHER(float16, half, float);
 DEFINE_KERNEL_LAUNCHER(float, float, float);
 DEFINE_KERNEL_LAUNCHER(double, double, double);
...
@@ -70,7 +70,7 @@ void _L1NormalizeGrad(
     auto X = ConstEigenStridedVectorMap<T>(
         x + offset, 1, reduce_dim, EigenInnerStride(inner_dim));
     auto norm = std::max(X.template lpNorm<1>() / normalizer, epsilon);
-    auto norm2 = std::pow(norm, 2);
+    auto norm2 = std::pow(norm, T(2));
     EigenStridedVectorMap<T>(
         dx + offset, 1, reduce_dim, EigenInnerStride(inner_dim)) =
         (dY / norm) -
@@ -98,7 +98,7 @@ void _L2NormalizeGrad(
     auto X = ConstEigenStridedVectorMap<T>(
         x + offset, 1, reduce_dim, EigenInnerStride(inner_dim));
     auto norm = std::max(std::sqrt(X.squaredNorm() / normalizer), epsilon);
-    auto norm3 = std::pow(norm, 3);
+    auto norm3 = std::pow(norm, T(3));
     EigenStridedVectorMap<T>(
         dx + offset, 1, reduce_dim, EigenInnerStride(inner_dim)) =
         (dY / norm) - ((X / norm3) * dY.dot(X) / normalizer);
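For context, the backward expression follows from differentiating y = x / nu with nu = max(sqrt(x.dot(x) / n), epsilon) and n = normalizer:

$$\frac{\partial \nu}{\partial x_j} = \frac{x_j}{n\,\nu}, \qquad dx = \frac{dy}{\nu} - \frac{x\,\left(dy \cdot x\right)}{n\,\nu^{3}},$$

which is exactly the (dY / norm) - ((X / norm3) * dY.dot(X) / normalizer) line above.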
...
@@ -93,7 +93,7 @@ __global__ void _L1NormalizeGrad(
     val2 = BlockReduce<AccT>(storage).Sum(val2);
     if (threadIdx.x == 0) {
       norm = max(val1 / normalizer, epsilon);
-      norm2 = pow(norm, 2);
+      norm2 = pow(norm, AccT(2));
       sum = val2 / normalizer;
     }
     __syncthreads();
@@ -130,7 +130,7 @@ __global__ void _L2NormalizeGrad(
     val2 = BlockReduce<AccT>(storage).Sum(val2);
     if (threadIdx.x == 0) {
       norm = max(sqrt(val1 / normalizer), epsilon);
-      norm3 = pow(norm, 3);
+      norm3 = pow(norm, AccT(3));
       sum = val2 / normalizer;
     }
     __syncthreads();
...
 #ifdef USE_CUDA
 #include "dragon/core/context_cuda.h"
+#include "dragon/utils/conversions.h"
 #include "dragon/utils/op_kernels.h"
 namespace dragon {
...
 #ifdef USE_CUDA
 #include "dragon/core/context_cuda.h"
+#include "dragon/utils/math_functions.h"
 #include "dragon/utils/op_kernels.h"
 namespace dragon {
...
@@ -5,7 +5,7 @@
 namespace dragon {
 template <class Context>
-template <typename Tx, typename Ty>
+template <typename InputT, typename OutputT>
 void ChannelNormalizeOp<Context>::DoRunWithTypeAndCast() {
   auto &X = Input(0), *Y = Output(0);
   CANONICALIZE_AXIS_WITH_TENSOR(X);
@@ -35,10 +35,10 @@ void ChannelNormalizeOp<Context>::DoRunWithTypeAndCast() {
       num_dims,
       X_strides.data(),
       Y_dims.data(),
-      X.template data<Tx, Context>(),
+      X.template data<InputT, Context>(),
       X_mean_.template data<float, Context>(),
       X_std_.template data<float, Context>(),
-      Y->Reshape(Y_dims)->template mutable_data<Ty, Context>(),
+      Y->Reshape(Y_dims)->template mutable_data<OutputT, Context>(),
       ctx());
 }
...
@@ -6,7 +6,7 @@
 namespace dragon {
 template <class Context>
-template <typename LogitType, typename TargetType>
+template <typename LogitT, typename TargetT>
 void NLLLossOp<Context>::DoRunWithType() {
   auto &X = Input(0), *Y = Output(0);
   CANONICALIZE_AXIS_WITH_TENSOR(X);
@@ -19,19 +19,19 @@ void NLLLossOp<Context>::DoRunWithType() {
       << "\nNumber of preds must match the number of targets.";
   auto scratches = ctx()->workspace()->template data<Context>({
-      (size_t)num_preds * sizeof(LogitType), // loss
-      (size_t)num_preds * sizeof(LogitType) + sizeof(LogitType), // mask
+      (size_t)num_preds * sizeof(LogitT), // loss
+      (size_t)num_preds * sizeof(LogitT) + sizeof(LogitT), // mask
   });
-  auto* loss = static_cast<LogitType*>(scratches[0]);
-  auto* mask = static_cast<LogitType*>(scratches[1]);
+  auto* loss = static_cast<LogitT*>(scratches[0]);
+  auto* mask = static_cast<LogitT*>(scratches[1]);
   kernel::NLLLoss(
       outer_dim,
       inner_dim,
       X.dim(axis),
       ignore_index_,
-      X.template data<LogitType, Context>(),
-      Input(1).template data<TargetType, Context>(),
+      X.template data<LogitT, Context>(),
+      Input(1).template data<TargetT, Context>(),
       loss,
       mask,
       ctx());
@@ -42,7 +42,7 @@ void NLLLossOp<Context>::DoRunWithType() {
     math::Copy(
         num_preds,
         loss,
-        Y->Reshape(out_shape)->template mutable_data<LogitType, Context>(),
+        Y->Reshape(out_shape)->template mutable_data<LogitT, Context>(),
         ctx());
   } else {
     int64_t normalizer = 1;
@@ -59,7 +59,7 @@ void NLLLossOp<Context>::DoRunWithType() {
         normalizer,
         loss,
         mask,
-        Y->Reshape({})->template mutable_data<LogitType, Context>(),
+        Y->Reshape({})->template mutable_data<LogitT, Context>(),
         ctx());
   }
 }
@@ -91,7 +91,7 @@ void NLLLossOp<Context>::RunOnDevice() {
 }
 template <class Context>
-template <typename LogitType, typename TargetType>
+template <typename LogitT, typename TargetT>
 void NLLLossGradientOp<Context>::DoRunWithType() {
   auto &X = Input(0), &dY = Input(-1), *dX = Output(0);
   CANONICALIZE_AXIS_WITH_TENSOR(X);
@@ -101,19 +101,19 @@ void NLLLossGradientOp<Context>::DoRunWithType() {
   auto inner_dim = dX->count(axis + 1);
   auto num_preds = outer_dim * inner_dim;
-  auto* dy = dY.template data<LogitType, Context>();
-  auto* dx = dX->template mutable_data<LogitType, Context>();
+  auto* dy = dY.template data<LogitT, Context>();
+  auto* dx = dX->template mutable_data<LogitT, Context>();
   auto* mask =
-      ctx()->workspace()->template data<LogitType, Context>({num_preds + 1})[0];
-  math::Set(dX->count(), convert::To<LogitType>(0.f), dx, ctx());
+      ctx()->workspace()->template data<LogitT, Context>({num_preds + 1})[0];
+  math::Set(dX->count(), convert::To<LogitT>(0.f), dx, ctx());
   kernel::NLLLossGrad(
       outer_dim,
       inner_dim,
       dX->dim(axis),
       ignore_index_,
-      X.template data<LogitType, Context>(),
-      Input(1).template data<TargetType, Context>(),
+      X.template data<LogitT, Context>(),
+      Input(1).template data<TargetT, Context>(),
       dx,
       mask,
       ctx());
...
@@ -28,7 +28,7 @@ class NLLLossOp final : public Operator<Context> {
   void RunOnDevice() override;
-  template <typename LogitType, typename TargetType>
+  template <typename LogitT, typename TargetT>
   void DoRunWithType();
  protected:
@@ -47,7 +47,7 @@ class NLLLossGradientOp final : public Operator<Context> {
   void RunOnDevice() override;
-  template <typename LogitType, typename TargetType>
+  template <typename LogitT, typename TargetT>
   void DoRunWithType();
  protected:
...
@@ -6,7 +6,7 @@
 namespace dragon {
 template <class Context>
-template <typename LogitType, typename TargetType>
+template <typename LogitT, typename TargetT>
 void SigmoidFocalLossOp<Context>::DoRunWithType() {
   auto &X = Input(0), *Y = Output(0);
   CANONICALIZE_AXIS_WITH_TENSOR(X);
@@ -18,11 +18,11 @@ void SigmoidFocalLossOp<Context>::DoRunWithType() {
       << "\nNumber of preds must match the number of targets.";
   auto scratches = ctx()->workspace()->template data<Context>({
-      X.size() * sizeof(LogitType), // loss
-      X.size() * sizeof(LogitType) + sizeof(LogitType), // mask
+      X.size() * sizeof(LogitT), // loss
+      X.size() * sizeof(LogitT) + sizeof(LogitT), // mask
   });
-  auto* loss = static_cast<LogitType*>(scratches[0]);
-  auto* mask = static_cast<LogitType*>(scratches[1]);
+  auto* loss = static_cast<LogitT*>(scratches[0]);
+  auto* mask = static_cast<LogitT*>(scratches[1]);
   kernel::SigmoidFocalLoss(
       outer_dim,
@@ -32,8 +32,8 @@ void SigmoidFocalLossOp<Context>::DoRunWithType() {
       neg_alpha_,
       gamma_,
       negative_index_,
-      X.template data<LogitType, Context>(),
-      Input(1).template data<TargetType, Context>(),
+      X.template data<LogitT, Context>(),
+      Input(1).template data<TargetT, Context>(),
       loss,
       mask,
       ctx());
@@ -42,7 +42,7 @@ void SigmoidFocalLossOp<Context>::DoRunWithType() {
     math::Copy(
         X.count(),
         loss,
-        Y->ReshapeLike(X)->template mutable_data<LogitType, Context>(),
+        Y->ReshapeLike(X)->template mutable_data<LogitT, Context>(),
         ctx());
   } else {
     int64_t normalizer = 1;
@@ -59,7 +59,7 @@ void SigmoidFocalLossOp<Context>::DoRunWithType() {
         normalizer,
         loss,
         mask,
-        Y->Reshape({})->template mutable_data<LogitType, Context>(),
+        Y->Reshape({})->template mutable_data<LogitT, Context>(),
         ctx());
   }
 }
@@ -91,7 +91,7 @@ void SigmoidFocalLossOp<Context>::RunOnDevice() {
 }
 template <class Context>
-template <typename LogitType, typename TargetType>
+template <typename LogitT, typename TargetT>
 void SigmoidFocalLossGradientOp<Context>::DoRunWithType() {
   auto &X = Input(0), &dY = Input(-1), *dX = Output(0);
   CANONICALIZE_AXIS_WITH_TENSOR(X);
@@ -100,10 +100,10 @@ void SigmoidFocalLossGradientOp<Context>::DoRunWithType() {
   auto outer_dim = dX->count(0, axis);
   auto inner_dim = dX->count(axis + 1);
-  auto* dy = dY.template data<LogitType, Context>();
-  auto* dx = dX->template mutable_data<LogitType, Context>();
-  auto* mask = ctx()->workspace()->template data<LogitType, Context>(
-      {dX->count() + 1})[0];
+  auto* dy = dY.template data<LogitT, Context>();
+  auto* dx = dX->template mutable_data<LogitT, Context>();
+  auto* mask =
+      ctx()->workspace()->template data<LogitT, Context>({dX->count() + 1})[0];
   kernel::SigmoidFocalLossGrad(
       outer_dim,
@@ -113,8 +113,8 @@ void SigmoidFocalLossGradientOp<Context>::DoRunWithType() {
       neg_alpha_,
      gamma_,
      negative_index_,
-      X.template data<LogitType, Context>(),
-      Input(1).template data<TargetType, Context>(),
+      X.template data<LogitT, Context>(),
+      Input(1).template data<TargetT, Context>(),
       dx,
       mask,
       ctx());
...
@@ -48,7 +48,7 @@ class SigmoidFocalLossOp final : public Operator<Context> {
   void RunOnDevice() override;
-  template <typename LogitType, typename TargetType>
+  template <typename LogitT, typename TargetT>
   void DoRunWithType();
  protected:
@@ -88,7 +88,7 @@ class SigmoidFocalLossGradientOp final : public Operator<Context> {
   void RunOnDevice() override;
-  template <typename LogitType, typename TargetType>
+  template <typename LogitT, typename TargetT>
   void DoRunWithType();
  protected:
...
...@@ -45,7 +45,7 @@ class SparseSoftmaxCrossEntropyOp : public Operator<Context> { ...@@ -45,7 +45,7 @@ class SparseSoftmaxCrossEntropyOp : public Operator<Context> {
void RunOnDevice() override; void RunOnDevice() override;
template <typename LogitType, typename TargetType> template <typename LogitT, typename TargetT>
void DoRunWithType(); void DoRunWithType();
protected: protected:
...@@ -81,7 +81,7 @@ class SparseSoftmaxCrossEntropyGradientOp : public Operator<Context> { ...@@ -81,7 +81,7 @@ class SparseSoftmaxCrossEntropyGradientOp : public Operator<Context> {
void RunOnDevice() override; void RunOnDevice() override;
template <typename LogitType, typename TargetType> template <typename LogitT, typename TargetT>
void DoRunWithType(); void DoRunWithType();
protected: protected:
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
namespace dragon { namespace dragon {
template <class Context> template <class Context>
template <typename LogitType, typename TargetType> template <typename LogitT, typename TargetT>
void SparseSoftmaxCrossEntropyOp<Context>::DoRunWithType() { void SparseSoftmaxCrossEntropyOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0); auto &X = Input(0), *Y = Output(0);
CANONICALIZE_AXIS_WITH_TENSOR(X); CANONICALIZE_AXIS_WITH_TENSOR(X);
...@@ -18,20 +18,20 @@ void SparseSoftmaxCrossEntropyOp<Context>::DoRunWithType() { ...@@ -18,20 +18,20 @@ void SparseSoftmaxCrossEntropyOp<Context>::DoRunWithType() {
CHECK_EQ(num_preds, Input(1).count()) CHECK_EQ(num_preds, Input(1).count())
<< "\nNumber of preds must match the number of targets."; << "\nNumber of preds must match the number of targets.";
auto* X_prob = Buffer("prob")->ReshapeLike(X); auto* X_prob = Buffer("prob")->ReshapeLike(X);
auto* prob = X_prob->template mutable_data<LogitType, Context>(); auto* prob = X_prob->template mutable_data<LogitT, Context>();
auto scratches = ctx()->workspace()->template data<Context>({ auto scratches = ctx()->workspace()->template data<Context>({
(size_t)num_preds * sizeof(LogitType), // loss (size_t)num_preds * sizeof(LogitT), // loss
(size_t)num_preds * sizeof(LogitType) + sizeof(LogitType), // mask (size_t)num_preds * sizeof(LogitT) + sizeof(LogitT), // mask
}); });
auto* loss = static_cast<LogitType*>(scratches[0]); auto* loss = static_cast<LogitT*>(scratches[0]);
auto* mask = static_cast<LogitType*>(scratches[1]); auto* mask = static_cast<LogitT*>(scratches[1]);
kernel::Softmax( kernel::Softmax(
outer_dim, outer_dim,
inner_dim, inner_dim,
X.dim(axis), X.dim(axis),
X.template data<LogitType, Context>(), X.template data<LogitT, Context>(),
prob, prob,
ctx()); ctx());
...@@ -41,7 +41,7 @@ void SparseSoftmaxCrossEntropyOp<Context>::DoRunWithType() { ...@@ -41,7 +41,7 @@ void SparseSoftmaxCrossEntropyOp<Context>::DoRunWithType() {
X.dim(axis), X.dim(axis),
ignore_index_, ignore_index_,
prob, prob,
Input(1).template data<TargetType, Context>(), Input(1).template data<TargetT, Context>(),
loss, loss,
mask, mask,
ctx()); ctx());
...@@ -52,7 +52,7 @@ void SparseSoftmaxCrossEntropyOp<Context>::DoRunWithType() { ...@@ -52,7 +52,7 @@ void SparseSoftmaxCrossEntropyOp<Context>::DoRunWithType() {
math::Copy( math::Copy(
num_preds, num_preds,
loss, loss,
Y->Reshape(out_shape)->template mutable_data<LogitType, Context>(), Y->Reshape(out_shape)->template mutable_data<LogitT, Context>(),
ctx()); ctx());
} else { } else {
int64_t normalizer = 1; int64_t normalizer = 1;
...@@ -69,7 +69,7 @@ void SparseSoftmaxCrossEntropyOp<Context>::DoRunWithType() { ...@@ -69,7 +69,7 @@ void SparseSoftmaxCrossEntropyOp<Context>::DoRunWithType() {
normalizer, normalizer,
loss, loss,
mask, mask,
Y->Reshape({})->template mutable_data<LogitType, Context>(), Y->Reshape({})->template mutable_data<LogitT, Context>(),
ctx()); ctx());
} }
} }
...@@ -101,7 +101,7 @@ void SparseSoftmaxCrossEntropyOp<Context>::RunOnDevice() { ...@@ -101,7 +101,7 @@ void SparseSoftmaxCrossEntropyOp<Context>::RunOnDevice() {
} }
template <class Context> template <class Context>
template <typename LogitType, typename TargetType> template <typename LogitT, typename TargetT>
void SparseSoftmaxCrossEntropyGradientOp<Context>::DoRunWithType() { void SparseSoftmaxCrossEntropyGradientOp<Context>::DoRunWithType() {
auto &dY = Input(-1), *dX = Output(0); auto &dY = Input(-1), *dX = Output(0);
CANONICALIZE_AXIS_WITH_TENSOR(Input(0)); CANONICALIZE_AXIS_WITH_TENSOR(Input(0));
...@@ -110,11 +110,11 @@ void SparseSoftmaxCrossEntropyGradientOp<Context>::DoRunWithType() { ...@@ -110,11 +110,11 @@ void SparseSoftmaxCrossEntropyGradientOp<Context>::DoRunWithType() {
auto inner_dim = dX->count(axis + 1); auto inner_dim = dX->count(axis + 1);
auto num_preds = outer_dim * inner_dim; auto num_preds = outer_dim * inner_dim;
auto* prob = Buffer("prob")->template data<LogitType, Context>(); auto* prob = Buffer("prob")->template data<LogitT, Context>();
auto* dy = Input(-1).template data<LogitType, Context>(); auto* dy = Input(-1).template data<LogitT, Context>();
auto* dx = Output(0)->template mutable_data<LogitType, Context>(); auto* dx = Output(0)->template mutable_data<LogitT, Context>();
auto* mask = auto* mask =
ctx()->workspace()->template data<LogitType, Context>({num_preds + 1})[0]; ctx()->workspace()->template data<LogitT, Context>({num_preds + 1})[0];
math::Copy(dX->count(), prob, dx, ctx()); math::Copy(dX->count(), prob, dx, ctx());
...@@ -124,7 +124,7 @@ void SparseSoftmaxCrossEntropyGradientOp<Context>::DoRunWithType() { ...@@ -124,7 +124,7 @@ void SparseSoftmaxCrossEntropyGradientOp<Context>::DoRunWithType() {
dX->dim(axis), dX->dim(axis),
ignore_index_, ignore_index_,
prob, prob,
Input(1).template data<TargetType, Context>(), Input(1).template data<TargetT, Context>(),
dx, dx,
mask, mask,
ctx()); ctx());
......
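A note on the scratch sizing above: loss gets num_preds elements while mask gets num_preds plus one trailing element, plausibly so the kernels can stage the count of non-ignored targets for the "valid" normalizer. A minimal sketch of that masked-mean reduction, under that assumption (names are illustrative, not Dragon's API):

#include <cstddef>

template <typename LogitT>
LogitT ReduceValidLoss(const LogitT* loss, const LogitT* mask, size_t n) {
  LogitT total = LogitT(0), valid = LogitT(0);
  for (size_t i = 0; i < n; ++i) {
    total += loss[i] * mask[i];  // ignored positions carry a zero mask
    valid += mask[i];            // number of contributing positions
  }
  return valid > LogitT(0) ? total / valid : LogitT(0);
}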
...@@ -5,8 +5,9 @@ ...@@ -5,8 +5,9 @@
namespace dragon { namespace dragon {
template <class Context> template <class Context>
template <typename Tx, typename Ty> template <typename T>
void MomentsOp<Context>::DoRunWithType() { void MomentsOp<Context>::DoRunWithType() {
using OutputT = typename math::utils::AccmulatorType<T>::type;
auto &X = Input(0), *Y1 = Output(0), *Y2 = Output(1); auto &X = Input(0), *Y1 = Output(0), *Y2 = Output(1);
// Determine the reduce axes // Determine the reduce axes
...@@ -35,13 +36,13 @@ void MomentsOp<Context>::DoRunWithType() { ...@@ -35,13 +36,13 @@ void MomentsOp<Context>::DoRunWithType() {
if (X.count() == 1) { if (X.count() == 1) {
math::Cast( math::Cast(
1, 1,
X.template data<Tx, Context>(), X.template data<T, Context>(),
Y1->Reshape(Y_shape)->template mutable_data<Ty, Context>(), Y1->Reshape(Y_shape)->template mutable_data<OutputT, Context>(),
ctx()); ctx());
math::Set( math::Set(
1, 1,
convert::To<Ty>(0.f), convert::To<OutputT>(0.f),
Y2->Reshape(Y_shape)->template mutable_data<Ty, Context>(), Y2->Reshape(Y_shape)->template mutable_data<OutputT, Context>(),
ctx()); ctx());
} else { } else {
kernel::Moments( kernel::Moments(
...@@ -49,35 +50,16 @@ void MomentsOp<Context>::DoRunWithType() { ...@@ -49,35 +50,16 @@ void MomentsOp<Context>::DoRunWithType() {
X_dims.data(), X_dims.data(),
reduce_axes.size(), reduce_axes.size(),
reduce_axes.data(), reduce_axes.data(),
X.template data<Tx, Context>(), X.template data<T, Context>(),
Y1->Reshape(Y_shape)->template mutable_data<Ty, Context>(), Y1->Reshape(Y_shape)->template mutable_data<OutputT, Context>(),
Y2->Reshape(Y_shape)->template mutable_data<Ty, Context>(), Y2->Reshape(Y_shape)->template mutable_data<OutputT, Context>(),
ctx()); ctx());
} }
} }
template <class Context> template <class Context>
void MomentsOp<Context>::RunOnDevice() { void MomentsOp<Context>::RunOnDevice() {
auto& X = Input(0); DispatchHelper<NumericalTensorTypes>::Call(this, Input(0));
if (X.template IsType<int8_t>()) {
DoRunWithType<int8_t, float>();
} else if (X.template IsType<uint8_t>()) {
DoRunWithType<uint8_t, float>();
} else if (X.template IsType<int>()) {
DoRunWithType<int, float>();
} else if (X.template IsType<int64_t>()) {
DoRunWithType<int64_t, float>();
} else if (X.template IsType<float16>()) {
DoRunWithType<float16, float>();
} else if (X.template IsType<float>()) {
DoRunWithType<float, float>();
} else if (X.template IsType<double>()) {
DoRunWithType<double, double>();
} else {
LOG(FATAL) << MessageForUnsupported(
types::to_string(X.meta()),
{"int8", "uint8", "int32", "int64", "float16", "float32", "float64"});
}
} }
DEPLOY_CPU_OPERATOR(Moments); DEPLOY_CPU_OPERATOR(Moments);
......
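The refactor above replaces the hand-written type switch with a DispatchHelper over NumericalTensorTypes plus an accumulator-type trait. A compile-time sketch of the mapping the trait appears to encode: narrow integer and half inputs accumulate in float, while double stays double, matching the old <double, double> branch (the trait name here is illustrative):

#include <cstdint>
#include <type_traits>

template <typename T>
struct AccumulatorTypeSketch { using type = float; };  // default: widen to float
template <>
struct AccumulatorTypeSketch<double> { using type = double; };

static_assert(std::is_same<AccumulatorTypeSketch<int8_t>::type, float>::value,
              "int8 moments accumulate in float");
static_assert(std::is_same<AccumulatorTypeSketch<double>::type, double>::value,
              "double moments stay in double");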
...@@ -28,7 +28,7 @@ class MomentsOp final : public Operator<Context> { ...@@ -28,7 +28,7 @@ class MomentsOp final : public Operator<Context> {
void RunOnDevice() override; void RunOnDevice() override;
template <typename Tx, typename Ty> template <typename T>
void DoRunWithType(); void DoRunWithType();
protected: protected:
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
namespace dragon { namespace dragon {
template <class Context> template <class Context>
template <typename LogitType, typename TargetType> template <typename LogitT, typename TargetT>
void AccuracyOp<Context>::DoRunWithType() { void AccuracyOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0); auto &X = Input(0), *Y = Output(0);
CANONICALIZE_AXIS_WITH_TENSOR(X); CANONICALIZE_AXIS_WITH_TENSOR(X);
...@@ -18,21 +18,21 @@ void AccuracyOp<Context>::DoRunWithType() { ...@@ -18,21 +18,21 @@ void AccuracyOp<Context>::DoRunWithType() {
int64_t acc = 0, count = 0; int64_t acc = 0, count = 0;
int64_t cols = X.count() / outer_dim; int64_t cols = X.count() / outer_dim;
auto* logit = X.template data<LogitType, CPUContext>(); auto* logit = X.template data<LogitT, CPUContext>();
auto* target = Input(1).template data<TargetType, CPUContext>(); auto* target = Input(1).template data<TargetT, CPUContext>();
for (int i = 0; i < outer_dim; ++i) { for (int i = 0; i < outer_dim; ++i) {
for (int j = 0; j < inner_dim; ++j) { for (int j = 0; j < inner_dim; ++j) {
const int label = target[i * inner_dim + j]; const int label = target[i * inner_dim + j];
if (label == ignore_index_) continue; if (label == ignore_index_) continue;
vector<pair<LogitType, int>> vec; vector<pair<LogitT, int>> vec;
for (int k = 0; k < axis_dim; k++) for (int k = 0; k < axis_dim; k++)
vec.push_back(std::make_pair(logit[i * cols + k * inner_dim + j], k)); vec.push_back(std::make_pair(logit[i * cols + k * inner_dim + j], k));
std::partial_sort( std::partial_sort(
vec.begin(), vec.begin(),
vec.begin() + top_k_, vec.begin() + top_k_,
vec.end(), vec.end(),
std::greater<pair<LogitType, int>>()); std::greater<pair<LogitT, int>>());
for (int k = 0; k < top_k_; k++) { for (int k = 0; k < top_k_; k++) {
if (vec[k].second == label) { if (vec[k].second == label) {
acc++; acc++;
......
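Isolated from the loop above, the top-k membership test is a partial sort over (logit, class) pairs. A standalone sketch over a contiguous row (the real loop additionally strides by inner_dim):

#include <algorithm>
#include <functional>
#include <utility>
#include <vector>

bool TopKHit(const std::vector<float>& logits, int label, int top_k) {
  std::vector<std::pair<float, int>> vec;
  vec.reserve(logits.size());
  for (int k = 0; k < (int)logits.size(); ++k) vec.emplace_back(logits[k], k);
  std::partial_sort(  // only the first top_k entries need full ordering
      vec.begin(),
      vec.begin() + top_k,
      vec.end(),
      std::greater<std::pair<float, int>>());
  for (int k = 0; k < top_k; ++k)
    if (vec[k].second == label) return true;
  return false;
}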
...@@ -28,7 +28,7 @@ class AccuracyOp final : public Operator<Context> { ...@@ -28,7 +28,7 @@ class AccuracyOp final : public Operator<Context> {
void RunOnDevice() override; void RunOnDevice() override;
template <typename LogitType, typename TargetType> template <typename LogitT, typename TargetT>
void DoRunWithType(); void DoRunWithType();
protected: protected:
......
...@@ -8,11 +8,11 @@ namespace dragon { ...@@ -8,11 +8,11 @@ namespace dragon {
template <class Context> template <class Context>
template <typename T> template <typename T>
void BatchNormOp<Context>::TrainingImpl() { void BatchNormOp<Context>::TrainingImpl() {
using ParamType = typename math::utils::AccmulatorType<T>::type; using ParamT = typename math::utils::AccmulatorType<T>::type;
TENSOR_FILL_WITH_TYPE(Input(1), vec64_t({C_}), ParamType); TENSOR_FILL_WITH_TYPE(Input(1), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(2), vec64_t({C_}), ParamType); TENSOR_FILL_WITH_TYPE(Input(2), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(3), vec64_t({C_}), ParamType); TENSOR_FILL_WITH_TYPE(Input(3), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(4), vec64_t({C_}), ParamType); TENSOR_FILL_WITH_TYPE(Input(4), vec64_t({C_}), ParamT);
auto* X_mu = Buffer("X_mu")->Reshape({C_}); auto* X_mu = Buffer("X_mu")->Reshape({C_});
auto* X_rsig = Buffer("X_rsig")->Reshape({C_}); auto* X_rsig = Buffer("X_rsig")->Reshape({C_});
...@@ -20,11 +20,11 @@ void BatchNormOp<Context>::TrainingImpl() { ...@@ -20,11 +20,11 @@ void BatchNormOp<Context>::TrainingImpl() {
auto* X_bias = Buffer("X_bias")->Reshape({C_}); auto* X_bias = Buffer("X_bias")->Reshape({C_});
auto* x = Input(0).template data<T, Context>(); auto* x = Input(0).template data<T, Context>();
auto* rm = Input(3).template mutable_data<ParamType, Context>(); auto* rm = Input(3).template mutable_data<ParamT, Context>();
auto* rv = Input(4).template mutable_data<ParamType, Context>(); auto* rv = Input(4).template mutable_data<ParamT, Context>();
auto* mu = X_mu->template mutable_data<ParamType, Context>(); auto* mu = X_mu->template mutable_data<ParamT, Context>();
auto* rsig = X_rsig->template mutable_data<ParamType, Context>(); auto* rsig = X_rsig->template mutable_data<ParamT, Context>();
auto* scale = X_scale->template mutable_data<ParamType, Context>(); auto* scale = X_scale->template mutable_data<ParamT, Context>();
// Compute moments // Compute moments
if (sync_stats_ > 0) { if (sync_stats_ > 0) {
...@@ -45,7 +45,7 @@ void BatchNormOp<Context>::TrainingImpl() { ...@@ -45,7 +45,7 @@ void BatchNormOp<Context>::TrainingImpl() {
if (enable_nccl_) { if (enable_nccl_) {
#ifdef USE_NCCL #ifdef USE_NCCL
auto coll_comm = this->nccl_comm(); auto coll_comm = this->nccl_comm();
auto coll_dtype = this->template nccl_dtype<ParamType>(); auto coll_dtype = this->template nccl_dtype<ParamT>();
NCCL_CHECK(ncclAllReduce( NCCL_CHECK(ncclAllReduce(
(void*)mu, (void*)mu,
(void*)mu, (void*)mu,
...@@ -84,8 +84,9 @@ void BatchNormOp<Context>::TrainingImpl() { ...@@ -84,8 +84,9 @@ void BatchNormOp<Context>::TrainingImpl() {
// Compute running statistics // Compute running statistics
if (is_recomputing_ == 0) { if (is_recomputing_ == 0) {
math::Axpby(C_, 1.f - momentum_, mu, momentum_, rm, ctx()); auto decay_factor = momentum();
math::Axpby(C_, 1.f - momentum_, rsig, momentum_, rv, ctx()); math::Axpby(C_, 1.f - decay_factor, mu, decay_factor, rm, ctx());
math::Axpby(C_, 1.f - decay_factor, rsig, decay_factor, rv, ctx());
} }
// Inverse stddev from variance // Inverse stddev from variance
...@@ -100,10 +101,10 @@ void BatchNormOp<Context>::TrainingImpl() { ...@@ -100,10 +101,10 @@ void BatchNormOp<Context>::TrainingImpl() {
x, x,
mu, mu,
rsig, rsig,
Input(1).template data<ParamType, Context>(), // gamma Input(1).template data<ParamT, Context>(), // gamma
Input(2).template data<ParamType, Context>(), // beta Input(2).template data<ParamT, Context>(), // beta
scale, scale,
X_bias->template mutable_data<ParamType, Context>(), X_bias->template mutable_data<ParamT, Context>(),
Output(0)->template mutable_data<T, Context>(), Output(0)->template mutable_data<T, Context>(),
ctx()); ctx());
} }
...@@ -111,17 +112,17 @@ void BatchNormOp<Context>::TrainingImpl() { ...@@ -111,17 +112,17 @@ void BatchNormOp<Context>::TrainingImpl() {
template <class Context> template <class Context>
template <typename T> template <typename T>
void BatchNormOp<Context>::InferenceImpl() { void BatchNormOp<Context>::InferenceImpl() {
using ParamType = typename math::utils::AccmulatorType<T>::type; using ParamT = typename math::utils::AccmulatorType<T>::type;
TENSOR_FILL_WITH_TYPE(Input(1), vec64_t({C_}), ParamType); TENSOR_FILL_WITH_TYPE(Input(1), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(2), vec64_t({C_}), ParamType); TENSOR_FILL_WITH_TYPE(Input(2), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(3), vec64_t({C_}), ParamType); TENSOR_FILL_WITH_TYPE(Input(3), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(4), vec64_t({C_}), ParamType); TENSOR_FILL_WITH_TYPE(Input(4), vec64_t({C_}), ParamT);
auto* X_rsig = Buffer("X_rsig")->Reshape({C_}); auto* X_rsig = Buffer("X_rsig")->Reshape({C_});
auto* X_scale = Buffer("X_scale")->Reshape({C_}); auto* X_scale = Buffer("X_scale")->Reshape({C_});
auto* X_bias = Buffer("X_bias")->Reshape({C_}); auto* X_bias = Buffer("X_bias")->Reshape({C_});
auto* rv = Input(4).template data<ParamType, Context>(); auto* rv = Input(4).template data<ParamT, Context>();
auto* rsig = X_rsig->template mutable_data<ParamType, Context>(); auto* rsig = X_rsig->template mutable_data<ParamT, Context>();
// Inverse stddev from variance // Inverse stddev from variance
math::InvStd(C_, epsilon_, rv, rsig, ctx()); math::InvStd(C_, epsilon_, rv, rsig, ctx());
...@@ -133,12 +134,12 @@ void BatchNormOp<Context>::InferenceImpl() { ...@@ -133,12 +134,12 @@ void BatchNormOp<Context>::InferenceImpl() {
S_, S_,
data_format(), data_format(),
Input(0).template data<T, Context>(), Input(0).template data<T, Context>(),
Input(3).template data<ParamType, Context>(), Input(3).template data<ParamT, Context>(),
rsig, rsig,
Input(1).template data<ParamType, Context>(), // gamma Input(1).template data<ParamT, Context>(), // gamma
Input(2).template data<ParamType, Context>(), // beta Input(2).template data<ParamT, Context>(), // beta
X_scale->template mutable_data<ParamType, Context>(), X_scale->template mutable_data<ParamT, Context>(),
X_bias->template mutable_data<ParamType, Context>(), X_bias->template mutable_data<ParamT, Context>(),
Output(0)->template mutable_data<T, Context>(), Output(0)->template mutable_data<T, Context>(),
ctx()); ctx());
} }
...@@ -159,17 +160,17 @@ void BatchNormOp<Context>::RunOnDevice() { ...@@ -159,17 +160,17 @@ void BatchNormOp<Context>::RunOnDevice() {
template <class Context> template <class Context>
template <typename T> template <typename T>
void BatchNormGradientOp<Context>::TrainingImpl() { void BatchNormGradientOp<Context>::TrainingImpl() {
using ParamType = typename math::utils::AccmulatorType<T>::type; using ParamT = typename math::utils::AccmulatorType<T>::type;
auto *dX = Output(0), *dW = Output(1), *dB = Output(2); auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
auto *X_mu = Buffer("X_mu"), *X_rsig = Buffer("X_rsig"); auto *X_mu = Buffer("X_mu"), *X_rsig = Buffer("X_rsig");
auto* x = Input(0).template data<T, Context>(); auto* x = Input(0).template data<T, Context>();
auto* gamma = Input(1).template data<ParamType, Context>(); auto* gamma = Input(1).template data<ParamT, Context>();
auto* dy = Input(4).template data<T, Context>(); auto* dy = Input(4).template data<T, Context>();
auto* mu = X_mu->template data<ParamType, Context>(); auto* mu = X_mu->template data<ParamT, Context>();
auto* rsig = X_rsig->template data<ParamType, Context>(); auto* rsig = X_rsig->template data<ParamT, Context>();
auto* dgamma = dW->Reshape({C_})->template mutable_data<ParamType, Context>(); auto* dgamma = dW->Reshape({C_})->template mutable_data<ParamT, Context>();
auto* dbeta = dB->Reshape({C_})->template mutable_data<ParamType, Context>(); auto* dbeta = dB->Reshape({C_})->template mutable_data<ParamT, Context>();
// Gradient w.r.t. gamma and beta // Gradient w.r.t. gamma and beta
kernel::BatchNormWGrad( kernel::BatchNormWGrad(
...@@ -181,7 +182,7 @@ void BatchNormGradientOp<Context>::TrainingImpl() { ...@@ -181,7 +182,7 @@ void BatchNormGradientOp<Context>::TrainingImpl() {
if (enable_nccl_) { if (enable_nccl_) {
#ifdef USE_NCCL #ifdef USE_NCCL
auto coll_comm = this->nccl_comm(); auto coll_comm = this->nccl_comm();
auto coll_dtype = this->template nccl_dtype<ParamType>(); auto coll_dtype = this->template nccl_dtype<ParamT>();
NCCL_CHECK(ncclAllReduce( NCCL_CHECK(ncclAllReduce(
(void*)dgamma, (void*)dgamma,
(void*)dgamma, (void*)dgamma,
...@@ -231,18 +232,18 @@ void BatchNormGradientOp<Context>::TrainingImpl() { ...@@ -231,18 +232,18 @@ void BatchNormGradientOp<Context>::TrainingImpl() {
template <class Context> template <class Context>
template <typename T> template <typename T>
void BatchNormGradientOp<Context>::InferenceImpl() { void BatchNormGradientOp<Context>::InferenceImpl() {
using ParamType = typename math::utils::AccmulatorType<T>::type; using ParamT = typename math::utils::AccmulatorType<T>::type;
auto *dX = Output(0), *dW = Output(1), *dB = Output(2); auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
auto* X_scale = Buffer("X_scale")->Reshape({C_}); auto* X_scale = Buffer("X_scale")->Reshape({C_});
auto* rv = Input(3).template data<ParamType, Context>(); auto* rv = Input(3).template data<ParamT, Context>();
auto* rsig = X_scale->template mutable_data<ParamType, Context>(); auto* rsig = X_scale->template mutable_data<ParamT, Context>();
// Gradient w.r.t. gamma or beta if necessary // Gradient w.r.t. gamma or beta if necessary
ParamType *dgamma = nullptr, *dbeta = nullptr; ParamT *dgamma = nullptr, *dbeta = nullptr;
if (dW->has_name() || dB->has_name()) { if (dW->has_name() || dB->has_name()) {
dgamma = dW->Reshape({C_})->template mutable_data<ParamType, Context>(); dgamma = dW->Reshape({C_})->template mutable_data<ParamT, Context>();
dbeta = dB->Reshape({C_})->template mutable_data<ParamType, Context>(); dbeta = dB->Reshape({C_})->template mutable_data<ParamT, Context>();
} }
// Inverse stddev from variance // Inverse stddev from variance
...@@ -255,9 +256,9 @@ void BatchNormGradientOp<Context>::InferenceImpl() { ...@@ -255,9 +256,9 @@ void BatchNormGradientOp<Context>::InferenceImpl() {
S_, S_,
data_format(), data_format(),
Input(0).template data<T, Context>(), // x Input(0).template data<T, Context>(), // x
Input(2).template data<ParamType, Context>(), // rm Input(2).template data<ParamT, Context>(), // rm
rsig, rsig,
Input(1).template data<ParamType, Context>(), // gamma Input(1).template data<ParamT, Context>(), // gamma
Input(4).template data<T, Context>(), // dy Input(4).template data<T, Context>(), // dy
dgamma, dgamma,
dbeta, dbeta,
......
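The running-statistics update above is the docstring formula x_running = momentum * x_running + (1 - momentum) * x_batch, expressed through an axpby primitive. A scalar sketch of that primitive and how the call maps onto it (the contract of math::Axpby is assumed here):

// y = alpha * x + beta * y
void AxpbySketch(int n, float alpha, const float* x, float beta, float* y) {
  for (int i = 0; i < n; ++i) y[i] = alpha * x[i] + beta * y[i];
}

// Usage: with decay_factor = 0.9, the running mean keeps 90% of its history:
// AxpbySketch(C, 1.f - 0.9f, batch_mu, 0.9f, running_mean);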
...@@ -33,7 +33,6 @@ class BatchNormOpBase : public GenericOpBase<Context> { ...@@ -33,7 +33,6 @@ class BatchNormOpBase : public GenericOpBase<Context> {
public: public:
BatchNormOpBase(const OperatorDef& def, Workspace* ws) BatchNormOpBase(const OperatorDef& def, Workspace* ws)
: GenericOpBase<Context>(def, ws), : GenericOpBase<Context>(def, ws),
momentum_(OP_SINGLE_ARG(float, "momentum", 0.9f)),
epsilon_(OP_SINGLE_ARG(double, "epsilon", 1e-5)), epsilon_(OP_SINGLE_ARG(double, "epsilon", 1e-5)),
use_stats_(OP_SINGLE_ARG(int64_t, "use_stats", -1)), use_stats_(OP_SINGLE_ARG(int64_t, "use_stats", -1)),
sync_stats_(OP_SINGLE_ARG(int64_t, "comm", 0) > 0 ? 1 : 0) {} sync_stats_(OP_SINGLE_ARG(int64_t, "comm", 0) > 0 ? 1 : 0) {}
...@@ -57,7 +56,6 @@ class BatchNormOpBase : public GenericOpBase<Context> { ...@@ -57,7 +56,6 @@ class BatchNormOpBase : public GenericOpBase<Context> {
} }
protected: protected:
float momentum_;
double epsilon_; double epsilon_;
int64_t N_, C_, S_; int64_t N_, C_, S_;
int64_t use_stats_, sync_stats_; int64_t use_stats_, sync_stats_;
...@@ -68,7 +66,6 @@ class BatchNormOpBase : public GenericOpBase<Context> { ...@@ -68,7 +66,6 @@ class BatchNormOpBase : public GenericOpBase<Context> {
#define USE_BATCHNORM_FUNCTIONS \ #define USE_BATCHNORM_FUNCTIONS \
using BatchNormOpBase<Context>::DetermineBaseArguments; \ using BatchNormOpBase<Context>::DetermineBaseArguments; \
using BatchNormOpBase<Context>::momentum_; \
using BatchNormOpBase<Context>::epsilon_; \ using BatchNormOpBase<Context>::epsilon_; \
using BatchNormOpBase<Context>::use_stats_; \ using BatchNormOpBase<Context>::use_stats_; \
using BatchNormOpBase<Context>::sync_stats_; \ using BatchNormOpBase<Context>::sync_stats_; \
...@@ -82,7 +79,9 @@ template <class Context> ...@@ -82,7 +79,9 @@ template <class Context>
class BatchNormOp : public BatchNormOpBase<Context> { class BatchNormOp : public BatchNormOpBase<Context> {
public: public:
BatchNormOp(const OperatorDef& def, Workspace* ws) BatchNormOp(const OperatorDef& def, Workspace* ws)
: BatchNormOpBase<Context>(def, ws) {} : BatchNormOpBase<Context>(def, ws) {
INIT_OP_SINGLE_ARG_WITH_DESC(float, momentum, 0.9f);
}
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
USE_BATCHNORM_FUNCTIONS; USE_BATCHNORM_FUNCTIONS;
#ifdef USE_MPI #ifdef USE_MPI
...@@ -105,6 +104,8 @@ class BatchNormOp : public BatchNormOpBase<Context> { ...@@ -105,6 +104,8 @@ class BatchNormOp : public BatchNormOpBase<Context> {
InferenceImpl<T>(); InferenceImpl<T>();
} }
}; };
DECLARE_OP_SINGLE_ARG_WITH_DESC(float, momentum);
}; };
template <class Context> template <class Context>
...@@ -146,11 +147,9 @@ class CuDNNBatchNormOp final : public BatchNormOpBase<Context> { ...@@ -146,11 +147,9 @@ class CuDNNBatchNormOp final : public BatchNormOpBase<Context> {
CuDNNCreateTensorDesc(&bn_desc_); CuDNNCreateTensorDesc(&bn_desc_);
CuDNNCreateTensorDesc(&input_desc_); CuDNNCreateTensorDesc(&input_desc_);
if (epsilon_ <= CUDNN_BN_MIN_EPSILON) { if (epsilon_ <= CUDNN_BN_MIN_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. \nSet it to "
<< "CUDNN_BN_MIN_EPSILON instead.";
epsilon_ = CUDNN_BN_MIN_EPSILON; epsilon_ = CUDNN_BN_MIN_EPSILON;
} }
INIT_OP_SINGLE_ARG_WITH_DESC(float, momentum, 0.9f);
} }
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
USE_BATCHNORM_FUNCTIONS; USE_BATCHNORM_FUNCTIONS;
...@@ -168,6 +167,7 @@ class CuDNNBatchNormOp final : public BatchNormOpBase<Context> { ...@@ -168,6 +167,7 @@ class CuDNNBatchNormOp final : public BatchNormOpBase<Context> {
protected: protected:
cudnnTensorDescriptor_t input_desc_, bn_desc_; cudnnTensorDescriptor_t input_desc_, bn_desc_;
cudnnBatchNormMode_t bn_mode_; cudnnBatchNormMode_t bn_mode_;
DECLARE_OP_SINGLE_ARG_WITH_DESC(float, momentum);
}; };
template <class Context> template <class Context>
...@@ -178,9 +178,6 @@ class CuDNNBatchNormGradientOp final : public BatchNormGradientOp<Context> { ...@@ -178,9 +178,6 @@ class CuDNNBatchNormGradientOp final : public BatchNormGradientOp<Context> {
CuDNNCreateTensorDesc(&bn_desc_); CuDNNCreateTensorDesc(&bn_desc_);
CuDNNCreateTensorDesc(&input_desc_); CuDNNCreateTensorDesc(&input_desc_);
if (epsilon_ <= CUDNN_BN_MIN_EPSILON) { if (epsilon_ <= CUDNN_BN_MIN_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. \nSet it to "
<< "CUDNN_BN_MIN_EPSILON instead.";
epsilon_ = CUDNN_BN_MIN_EPSILON; epsilon_ = CUDNN_BN_MIN_EPSILON;
} }
} }
...@@ -211,8 +208,12 @@ class CuDNNBatchNormGradientOp final : public BatchNormGradientOp<Context> { ...@@ -211,8 +208,12 @@ class CuDNNBatchNormGradientOp final : public BatchNormGradientOp<Context> {
cudnnBatchNormMode_t bn_mode_; cudnnBatchNormMode_t bn_mode_;
}; };
DEFINE_OP_SINGLE_ARG_WITH_DESC(float, CuDNNBatchNormOp, momentum);
#endif // USE_CUDNN #endif // USE_CUDNN
DEFINE_OP_SINGLE_ARG_WITH_DESC(float, BatchNormOp, momentum);
} // namespace dragon } // namespace dragon
#endif // DRAGON_OPERATORS_NORMALIZATION_BATCH_NORM_OP_H_ #endif // DRAGON_OPERATORS_NORMALIZATION_BATCH_NORM_OP_H_
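With this header change, momentum moves from a plain float member to a descriptor-backed accessor. An illustrative-only sketch of what the DECLARE/DEFINE/INIT_OP_SINGLE_ARG_WITH_DESC trio plausibly expands to (member names and the workspace helper are hypothetical):

float momentum_;             // static fallback, default 0.9f
std::string momentum_desc_;  // e.g. "${HANDLE}/momentum"; empty if unused

float momentum() {
  if (momentum_desc_.empty()) return momentum_;  // plain-argument path
  // descriptor path: read the scalar that the Python-side feed_arg() wrote
  // into the workspace right before dispatch
  return FetchScalarFromWorkspace(momentum_desc_);  // hypothetical helper
}

This is what lets eager callers pass a float or a Tensor per call without rebuilding the operator.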
...@@ -9,11 +9,11 @@ namespace dragon { ...@@ -9,11 +9,11 @@ namespace dragon {
template <class Context> template <class Context>
template <typename T> template <typename T>
void CuDNNBatchNormOp<Context>::DoRunWithType() { void CuDNNBatchNormOp<Context>::DoRunWithType() {
using ParamType = typename CuDNNType<T>::BNParamType; using ParamT = typename CuDNNType<T>::BNParamType;
TENSOR_FILL_WITH_TYPE(Input(1), vec64_t({C_}), ParamType); TENSOR_FILL_WITH_TYPE(Input(1), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(2), vec64_t({C_}), ParamType); TENSOR_FILL_WITH_TYPE(Input(2), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(3), vec64_t({C_}), ParamType); TENSOR_FILL_WITH_TYPE(Input(3), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(4), vec64_t({C_}), ParamType); TENSOR_FILL_WITH_TYPE(Input(4), vec64_t({C_}), ParamT);
// Determine the descriptors // Determine the descriptors
if (Input(0).ndim() == 2) { if (Input(0).ndim() == 2) {
...@@ -39,14 +39,14 @@ void CuDNNBatchNormOp<Context>::DoRunWithType() { ...@@ -39,14 +39,14 @@ void CuDNNBatchNormOp<Context>::DoRunWithType() {
input_desc_, input_desc_,
Output(0)->template mutable_data<T, Context>(), // y Output(0)->template mutable_data<T, Context>(), // y
bn_desc_, bn_desc_,
Input(1).template data<ParamType, Context>(), // gamma Input(1).template data<ParamT, Context>(), // gamma
Input(2).template data<ParamType, Context>(), // beta Input(2).template data<ParamT, Context>(), // beta
is_recomputing_ > 0 ? 0.f : 1.f - this->momentum_, is_recomputing_ == 0 ? 1.f - momentum() : 0.f,
Input(3).template mutable_data<ParamType, Context>(), // rm Input(3).template mutable_data<ParamT, Context>(), // rm
Input(4).template mutable_data<ParamType, Context>(), // rv Input(4).template mutable_data<ParamT, Context>(), // rv
epsilon_, epsilon_,
X_mu->template mutable_data<ParamType, Context>(), // sm X_mu->template mutable_data<ParamT, Context>(), // sm
X_rsig->template mutable_data<ParamType, Context>())); // sv X_rsig->template mutable_data<ParamT, Context>())); // sv
} else { } else {
CUDNN_CHECK(cudnnBatchNormalizationForwardInference( CUDNN_CHECK(cudnnBatchNormalizationForwardInference(
ctx()->cudnn_handle(), ctx()->cudnn_handle(),
...@@ -58,10 +58,10 @@ void CuDNNBatchNormOp<Context>::DoRunWithType() { ...@@ -58,10 +58,10 @@ void CuDNNBatchNormOp<Context>::DoRunWithType() {
input_desc_, input_desc_,
Output(0)->template mutable_data<T, Context>(), // y Output(0)->template mutable_data<T, Context>(), // y
bn_desc_, bn_desc_,
Input(1).template data<ParamType, Context>(), // gamma Input(1).template data<ParamT, Context>(), // gamma
Input(2).template data<ParamType, Context>(), // beta Input(2).template data<ParamT, Context>(), // beta
Input(3).template data<ParamType, Context>(), // rm Input(3).template data<ParamT, Context>(), // rm
Input(4).template data<ParamType, Context>(), // rv Input(4).template data<ParamT, Context>(), // rv
epsilon_)); epsilon_));
} }
} }
...@@ -82,7 +82,7 @@ void CuDNNBatchNormOp<Context>::RunOnDevice() { ...@@ -82,7 +82,7 @@ void CuDNNBatchNormOp<Context>::RunOnDevice() {
template <class Context> template <class Context>
template <typename T> template <typename T>
void CuDNNBatchNormGradientOp<Context>::TrainingImpl() { void CuDNNBatchNormGradientOp<Context>::TrainingImpl() {
using ParamType = typename CuDNNType<T>::BNParamType; using ParamT = typename CuDNNType<T>::BNParamType;
auto *dX = Output(0), *dW = Output(1), *dB = Output(2); auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
auto *X_mu = Buffer("X_mu"), *X_rsig = Buffer("X_rsig"); auto *X_mu = Buffer("X_mu"), *X_rsig = Buffer("X_rsig");
...@@ -111,12 +111,12 @@ void CuDNNBatchNormGradientOp<Context>::TrainingImpl() { ...@@ -111,12 +111,12 @@ void CuDNNBatchNormGradientOp<Context>::TrainingImpl() {
input_desc_, input_desc_,
Output(0)->template mutable_data<T, Context>(), // dx Output(0)->template mutable_data<T, Context>(), // dx
bn_desc_, bn_desc_,
Input(1).template data<ParamType, Context>(), // gamma Input(1).template data<ParamT, Context>(), // gamma
dW->Reshape({C_})->template mutable_data<ParamType, Context>(), // dw dW->Reshape({C_})->template mutable_data<ParamT, Context>(), // dw
dB->Reshape({C_})->template mutable_data<ParamType, Context>(), // db dB->Reshape({C_})->template mutable_data<ParamT, Context>(), // db
epsilon_, epsilon_,
X_mu->template data<ParamType, Context>(), // mu X_mu->template data<ParamT, Context>(), // mu
X_rsig->template data<ParamType, Context>())); // rsig X_rsig->template data<ParamT, Context>())); // rsig
} }
template <class Context> template <class Context>
......
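A hedged reading of the factor passed above: cuDNN's exponentialAverageFactor f updates running = (1 - f) * running + f * batch, so f = 1 - momentum matches the framework convention, and f = 0 freezes the running statistics while activations are being recomputed:

inline float ExponentialAverageFactor(bool is_recomputing, float momentum) {
  return is_recomputing ? 0.f : 1.f - momentum;
}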
...@@ -8,9 +8,9 @@ namespace dragon { ...@@ -8,9 +8,9 @@ namespace dragon {
template <class Context> template <class Context>
template <typename T> template <typename T>
void GroupNormOp<Context>::DoRunWithType() { void GroupNormOp<Context>::DoRunWithType() {
using ParamType = typename math::utils::AccmulatorType<T>::type; using ParamT = typename math::utils::AccmulatorType<T>::type;
TENSOR_FILL_WITH_TYPE(Input(1), vec64_t({C_}), ParamType); TENSOR_FILL_WITH_TYPE(Input(1), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(2), vec64_t({C_}), ParamType); TENSOR_FILL_WITH_TYPE(Input(2), vec64_t({C_}), ParamT);
auto* X_mu = Buffer("X_mu")->Reshape({N_, G_}); auto* X_mu = Buffer("X_mu")->Reshape({N_, G_});
auto* X_rsig = Buffer("X_rsig")->Reshape({N_, G_}); auto* X_rsig = Buffer("X_rsig")->Reshape({N_, G_});
...@@ -18,8 +18,8 @@ void GroupNormOp<Context>::DoRunWithType() { ...@@ -18,8 +18,8 @@ void GroupNormOp<Context>::DoRunWithType() {
auto* X_bias = Buffer("X_bias")->Reshape({N_, C_}); auto* X_bias = Buffer("X_bias")->Reshape({N_, C_});
auto* x = Input(0).template data<T, Context>(); auto* x = Input(0).template data<T, Context>();
auto* mu = X_mu->template mutable_data<ParamType, Context>(); auto* mu = X_mu->template mutable_data<ParamT, Context>();
auto* rsig = X_rsig->template mutable_data<ParamType, Context>(); auto* rsig = X_rsig->template mutable_data<ParamT, Context>();
// Compute the moments // Compute the moments
if (data_format() == "NCHW") { if (data_format() == "NCHW") {
...@@ -45,10 +45,10 @@ void GroupNormOp<Context>::DoRunWithType() { ...@@ -45,10 +45,10 @@ void GroupNormOp<Context>::DoRunWithType() {
x, x,
mu, mu,
rsig, rsig,
Input(1).template data<ParamType, Context>(), // gamma Input(1).template data<ParamT, Context>(), // gamma
Input(2).template data<ParamType, Context>(), // beta Input(2).template data<ParamT, Context>(), // beta
X_scale->template mutable_data<ParamType, Context>(), X_scale->template mutable_data<ParamT, Context>(),
X_bias->template mutable_data<ParamType, Context>(), X_bias->template mutable_data<ParamT, Context>(),
Output(0)->template mutable_data<T, Context>(), Output(0)->template mutable_data<T, Context>(),
ctx()); ctx());
} }
...@@ -63,7 +63,7 @@ void GroupNormOp<Context>::RunOnDevice() { ...@@ -63,7 +63,7 @@ void GroupNormOp<Context>::RunOnDevice() {
template <class Context> template <class Context>
template <typename T> template <typename T>
void GroupNormGradientOp<Context>::DoRunWithType() { void GroupNormGradientOp<Context>::DoRunWithType() {
using ParamType = typename math::utils::AccmulatorType<T>::type; using ParamT = typename math::utils::AccmulatorType<T>::type;
auto *dX = Output(0), *dW = Output(1), *dB = Output(2); auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
auto *X_mu = Buffer("X_mu"), *X_rsig = Buffer("X_rsig"); auto *X_mu = Buffer("X_mu"), *X_rsig = Buffer("X_rsig");
...@@ -78,14 +78,14 @@ void GroupNormGradientOp<Context>::DoRunWithType() { ...@@ -78,14 +78,14 @@ void GroupNormGradientOp<Context>::DoRunWithType() {
S_, S_,
data_format(), data_format(),
Input(0).template data<T, Context>(), // x Input(0).template data<T, Context>(), // x
X_mu->template data<ParamType, Context>(), X_mu->template data<ParamT, Context>(),
X_rsig->template data<ParamType, Context>(), X_rsig->template data<ParamT, Context>(),
Input(1).template data<ParamType, Context>(), // gamma Input(1).template data<ParamT, Context>(), // gamma
Input(2).template data<T, Context>(), // dy Input(2).template data<T, Context>(), // dy
X_scale->template mutable_data<ParamType, Context>(), X_scale->template mutable_data<ParamT, Context>(),
X_bias->template mutable_data<ParamType, Context>(), X_bias->template mutable_data<ParamT, Context>(),
dW->Reshape({C_})->template mutable_data<ParamType, Context>(), dW->Reshape({C_})->template mutable_data<ParamT, Context>(),
dB->Reshape({C_})->template mutable_data<ParamType, Context>(), dB->Reshape({C_})->template mutable_data<ParamT, Context>(),
dX->template mutable_data<T, Context>(), dX->template mutable_data<T, Context>(),
ctx()); ctx());
} }
......
...@@ -58,7 +58,7 @@ def dropout(inputs, ratio=0.5, **kwargs): ...@@ -58,7 +58,7 @@ def dropout(inputs, ratio=0.5, **kwargs):
if context.executing_eagerly(): if context.executing_eagerly():
return op_lib \ return op_lib \
.instantiate() \ .instantiate() \
.apply([inputs], ratio, inplace=inplace) .apply([inputs], args['ratio'], inplace=inplace)
else: else:
return op_lib.blend(**args) return op_lib.blend(**args)
...@@ -103,7 +103,7 @@ def drop_block2d(inputs, ratio=0.1, block_size=7, data_format='NCHW', **kwargs): ...@@ -103,7 +103,7 @@ def drop_block2d(inputs, ratio=0.1, block_size=7, data_format='NCHW', **kwargs):
.instantiate( .instantiate(
block_size=block_size, block_size=block_size,
data_format=data_format, data_format=data_format,
).apply([inputs], ratio, inplace=inplace) ).apply([inputs], args['ratio'], inplace=inplace)
else: else:
return op_lib.blend(**args) return op_lib.blend(**args)
...@@ -137,7 +137,7 @@ def drop_path(inputs, ratio=0.2, **kwargs): ...@@ -137,7 +137,7 @@ def drop_path(inputs, ratio=0.2, **kwargs):
if context.executing_eagerly(): if context.executing_eagerly():
return op_lib \ return op_lib \
.instantiate() \ .instantiate() \
.apply([inputs], ratio, inplace=inplace) .apply([inputs], args['ratio'], inplace=inplace)
else: else:
return op_lib.blend(**args) return op_lib.blend(**args)
......
...@@ -205,9 +205,8 @@ def broadcast_to(inputs, shape, **kwargs): ...@@ -205,9 +205,8 @@ def broadcast_to(inputs, shape, **kwargs):
op_lib = array_ops_lib.Expand op_lib = array_ops_lib.Expand
if context.executing_eagerly(): if context.executing_eagerly():
return op_lib \ return op_lib \
.instantiate( .instantiate(ndim=len(args['dims'])) \
ndim=len(args['dims']), .apply([inputs], args['dims'])
).apply([inputs], args['dims'])
else: else:
return op_lib.blend(**args) return op_lib.blend(**args)
...@@ -1163,6 +1162,7 @@ def pad(inputs, pads, mode='constant', value=0, **kwargs): ...@@ -1163,6 +1162,7 @@ def pad(inputs, pads, mode='constant', value=0, **kwargs):
return op_lib.blend(**args) return op_lib.blend(**args)
@ArgHelper.desc('limit', as_target=True)
def permutation(limit, dtype='int64', **kwargs): def permutation(limit, dtype='int64', **kwargs):
r"""Return a tensor with value in the permuted range. r"""Return a tensor with value in the permuted range.
...@@ -1174,7 +1174,7 @@ def permutation(limit, dtype='int64', **kwargs): ...@@ -1174,7 +1174,7 @@ def permutation(limit, dtype='int64', **kwargs):
Parameters Parameters
---------- ----------
limit: number limit: Union[number, dragon.Tensor]
The end of interval. The end of interval.
dtype : str, optional, default='int64' dtype : str, optional, default='int64'
The optional data type. The optional data type.
...@@ -1192,7 +1192,7 @@ def permutation(limit, dtype='int64', **kwargs): ...@@ -1192,7 +1192,7 @@ def permutation(limit, dtype='int64', **kwargs):
if context.executing_eagerly(): if context.executing_eagerly():
return op_lib \ return op_lib \
.instantiate(dtype=dtype) \ .instantiate(dtype=dtype) \
.apply(limit, trainable=trainable) .apply(args['limit'], trainable=trainable)
else: else:
return op_lib.blend(**args) return op_lib.blend(**args)
......
...@@ -49,10 +49,11 @@ def assign(inputs, starts=None, sizes=None, **kwargs): ...@@ -49,10 +49,11 @@ def assign(inputs, starts=None, sizes=None, **kwargs):
inputs[1] = ops.scalar_to_tensor(inputs[1], inputs[0].dtype) inputs[1] = ops.scalar_to_tensor(inputs[1], inputs[0].dtype)
op_lib = control_flow_ops_lib.Assign op_lib = control_flow_ops_lib.Assign
if context.executing_eagerly(): if context.executing_eagerly():
starts = args['starts'] if starts is not None else [0]
sizes = args['sizes'] if sizes is not None else [-1]
return op_lib \ return op_lib \
.instantiate( .instantiate(ndim=len(starts)) \
ndim=len(starts) if starts is not None else 0, .apply(inputs, starts, sizes, inplace=inplace)
).apply(inputs, starts, sizes, inplace=inplace)
else: else:
return op_lib.blend(**args) return op_lib.blend(**args)
......
...@@ -23,6 +23,7 @@ from dragon.core.util import nest ...@@ -23,6 +23,7 @@ from dragon.core.util import nest
@OpSchema.num_inputs(5) @OpSchema.num_inputs(5)
@ArgHelper.desc('momentum', as_target=False)
def batch_norm( def batch_norm(
inputs, inputs,
axis=-1, axis=-1,
...@@ -40,7 +41,8 @@ def batch_norm( ...@@ -40,7 +41,8 @@ def batch_norm(
The running average of statistics is calculated as: The running average of statistics is calculated as:
.. math:: x_{\text{running}} = \text{momentum} * x_{\text{running}} + (1 - \text{momentum}) * x_{\text{stat}} .. math:: x_{\text{running}} = \text{momentum} * x_{\text{running}} +
(1 - \text{momentum}) * x_{\text{batch}}
Parameters Parameters
---------- ----------
...@@ -48,8 +50,8 @@ def batch_norm( ...@@ -48,8 +50,8 @@ def batch_norm(
The tensor ``x``, ``gamma``, ``beta``, ``mean`` and ``var``. The tensor ``x``, ``gamma``, ``beta``, ``mean`` and ``var``.
axis : int, optional, default=-1 axis : int, optional, default=-1
The channel axis. The channel axis.
momentum : float, optional, default=0.9 momentum : Union[float, dragon.Tensor], optional
The momentum for running average. The value to :math:`\text{momentum}`.
epsilon : float, optional, default=1e-5 epsilon : float, optional, default=1e-5
The value to :math:`\epsilon`. The value to :math:`\epsilon`.
use_stats : int, optional, default=-1 use_stats : int, optional, default=-1
...@@ -62,16 +64,15 @@ def batch_norm( ...@@ -62,16 +64,15 @@ def batch_norm(
""" """
args = ArgHelper.parse(locals()) args = ArgHelper.parse(locals())
args['momentum'], args['epsilon'] = float(momentum), float(epsilon) args['epsilon'] = float(epsilon)
op_lib = normalization_ops_lib.BatchNorm op_lib = normalization_ops_lib.BatchNorm
if context.executing_eagerly(): if context.executing_eagerly():
return op_lib \ return op_lib \
.instantiate( .instantiate(
axis=axis, axis=axis,
momentum=args['momentum'],
epsilon=args['epsilon'], epsilon=args['epsilon'],
use_stats=use_stats, use_stats=use_stats,
).apply(inputs) ).apply(inputs, args['momentum'])
else: else:
return op_lib.blend(**args) return op_lib.blend(**args)
...@@ -304,6 +305,7 @@ def local_response_norm( ...@@ -304,6 +305,7 @@ def local_response_norm(
@OpSchema.num_inputs(5) @OpSchema.num_inputs(5)
@ArgHelper.desc('momentum', as_target=False)
def sync_batch_norm( def sync_batch_norm(
inputs, inputs,
axis=-1, axis=-1,
...@@ -322,7 +324,8 @@ def sync_batch_norm( ...@@ -322,7 +324,8 @@ def sync_batch_norm(
The running average of statistics is calculated as: The running average of statistics is calculated as:
.. math:: x_{\text{running}} = \text{momentum} * x_{\text{running}} + (1 - \text{momentum}) * x_{\text{stat}} .. math:: x_{\text{running}} = \text{momentum} * x_{\text{running}} +
(1 - \text{momentum}) * x_{\text{batch}}
Parameters Parameters
---------- ----------
...@@ -330,8 +333,8 @@ def sync_batch_norm( ...@@ -330,8 +333,8 @@ def sync_batch_norm(
The tensor ``x``, ``gamma``, ``beta``, ``mean`` and ``var``. The tensor ``x``, ``gamma``, ``beta``, ``mean`` and ``var``.
axis : int, optional, default=-1 axis : int, optional, default=-1
The channel axis. The channel axis.
momentum : float, optional, default=0.9 momentum : Union[float, dragon.Tensor], optional
The momentum for average. The value to :math:`\text{momentum}`.
epsilon : float, optional, default=1e-5 epsilon : float, optional, default=1e-5
The value to :math:`\epsilon`. The value to :math:`\epsilon`.
use_stats : int, optional, default=-1 use_stats : int, optional, default=-1
...@@ -346,7 +349,7 @@ def sync_batch_norm( ...@@ -346,7 +349,7 @@ def sync_batch_norm(
""" """
args = ArgHelper.parse(locals()) args = ArgHelper.parse(locals())
args['momentum'], args['epsilon'] = float(momentum), float(epsilon) args['epsilon'] = float(epsilon)
if process_group is None: if process_group is None:
process_group = distributed.get_group() process_group = distributed.get_group()
if process_group is None: if process_group is None:
...@@ -356,11 +359,10 @@ def sync_batch_norm( ...@@ -356,11 +359,10 @@ def sync_batch_norm(
return op_lib \ return op_lib \
.instantiate( .instantiate(
axis=axis, axis=axis,
momentum=args['momentum'],
epsilon=args['epsilon'], epsilon=args['epsilon'],
use_stats=use_stats, use_stats=use_stats,
process_group=process_group, process_group=process_group,
).apply(inputs) ).apply(inputs, args['momentum'])
else: else:
args.update(process_group.arguments) args.update(process_group.arguments)
return op_lib.blend(**args) return op_lib.blend(**args)
...@@ -23,7 +23,6 @@ class BatchNorm(Operator): ...@@ -23,7 +23,6 @@ class BatchNorm(Operator):
def __init__(self, key, dev, **kwargs): def __init__(self, key, dev, **kwargs):
super(BatchNorm, self).__init__(key, dev, **kwargs) super(BatchNorm, self).__init__(key, dev, **kwargs)
self.axis = kwargs.get('axis', -1) self.axis = kwargs.get('axis', -1)
self.momentum = kwargs.get('momentum', 0.9)
self.epsilon = kwargs.get('epsilon', 1e-5) self.epsilon = kwargs.get('epsilon', 1e-5)
self.use_stats = kwargs.get('use_stats', 0) self.use_stats = kwargs.get('use_stats', 0)
if self.use_stats not in (0, 1): if self.use_stats not in (0, 1):
...@@ -34,14 +33,21 @@ class BatchNorm(Operator): ...@@ -34,14 +33,21 @@ class BatchNorm(Operator):
'op_type': 'BatchNorm', 'op_type': 'BatchNorm',
'arguments': { 'arguments': {
'axis': self.axis, 'axis': self.axis,
'momentum': self.momentum,
'epsilon': self.epsilon, 'epsilon': self.epsilon,
'use_stats': self.use_stats, 'use_stats': self.use_stats,
'momentum_desc': '${HANDLE}/momentum',
}, },
} }
def forward(self, inputs): def setup(self, ws, handle, momentum):
return self.dispatch(inputs, [self.alloc()]) self.feed_arg(ws, '%s/momentum' % handle, momentum, 'float32')
def forward(self, inputs, momentum):
return self.dispatch(
inputs, [self.alloc()],
callback=lambda ws, handle:
self.setup(ws, handle, momentum),
)
class GroupNorm(Operator): class GroupNorm(Operator):
......
...@@ -118,6 +118,7 @@ class ArgHelper(object): ...@@ -118,6 +118,7 @@ class ArgHelper(object):
if 'extra_inputs' not in arguments: if 'extra_inputs' not in arguments:
arguments['extra_inputs'] = [] arguments['extra_inputs'] = []
arguments['extra_inputs'] += [arg] arguments['extra_inputs'] += [arg]
if name in arguments:
arguments.pop(name) arguments.pop(name)
arguments[name + '_desc'] = arg.id arguments[name + '_desc'] = arg.id
return arguments return arguments
...@@ -141,5 +142,6 @@ class ArgHelper(object): ...@@ -141,5 +142,6 @@ class ArgHelper(object):
descs.append(ele.id) descs.append(ele.id)
else: else:
descs.append(Tensor.from_value(ele, dtype, 'DescConst').id) descs.append(Tensor.from_value(ele, dtype, 'DescConst').id)
if name in arguments:
arguments.pop(name) arguments.pop(name)
arguments[name + '_descs'] = descs arguments[name + '_descs'] = descs
...@@ -176,9 +176,12 @@ def conv2d_transpose( ...@@ -176,9 +176,12 @@ def conv2d_transpose(
raise ValueError('Unsupported padding algorithm: %s' % padding) raise ValueError('Unsupported padding algorithm: %s' % padding)
if data_format not in ('NCHW', 'NHWC'): if data_format not in ('NCHW', 'NHWC'):
raise ValueError('Unsupported data format: %s' % data_format) raise ValueError('Unsupported data format: %s' % data_format)
if 'SAME' in padding and output_shape is None:
raise ValueError('Expected <output_shape> for same padding.')
if output_shape is not None and 'SAME' not in padding: if output_shape is not None and 'SAME' not in padding:
args['padding'] = 'SAME' args['padding'] = 'SAME'
for key in ('kernel_shape', 'strides', 'pads', 'dilations'): for key in ('kernel_shape', 'strides', 'pads', 'dilations'):
if key in args and args[key] is not None:
if key == 'pads': if key == 'pads':
args[key] = _normalize_pads(args[key], 2) args[key] = _normalize_pads(args[key], 2)
else: else:
......
...@@ -26,7 +26,7 @@ def dropout_exporter(op_def, context): ...@@ -26,7 +26,7 @@ def dropout_exporter(op_def, context):
drop_ratio = arg.f drop_ratio = arg.f
elif arg.name == 'prob_desc': elif arg.name == 'prob_desc':
drop_ratio = helper.fetch_argument(op_def, arg, context.ws) drop_ratio = helper.fetch_argument(op_def, arg, context.ws)
helper.add_attribute(node, 'ratio', drop_ratio) helper.add_attribute(node, 'ratio', float(drop_ratio))
return node, const_tensors return node, const_tensors
......
...@@ -26,6 +26,9 @@ def batch_norm_exporter(op_def, context): ...@@ -26,6 +26,9 @@ def batch_norm_exporter(op_def, context):
helper.add_attribute(node, 'epsilon', arg.f) helper.add_attribute(node, 'epsilon', arg.f)
elif arg.name == 'momentum': elif arg.name == 'momentum':
helper.add_attribute(node, 'momentum', arg.f) helper.add_attribute(node, 'momentum', arg.f)
elif arg.name == 'momentum_desc':
momentum = helper.fetch_argument(op_def, arg, context.ws)
helper.add_attribute(node, 'momentum', float(momentum))
# Weight, bias, running mean and running variance # Weight, bias, running mean and running variance
const_tensors = [helper.from_tensor(e, context.ws) for e in op_def.input[1:]] const_tensors = [helper.from_tensor(e, context.ws) for e in op_def.input[1:]]
return node, const_tensors return node, const_tensors
......
...@@ -123,23 +123,51 @@ CONVERSIONS_DECL float16 To<float16, half>(half val) { ...@@ -123,23 +123,51 @@ CONVERSIONS_DECL float16 To<float16, half>(half val) {
} }
template <> template <>
CONVERSIONS_DECL half To<half, float>(float val) { CONVERSIONS_DECL half To<half, float16>(float16 val) {
return __float2half(val); return __half_raw{val.x};
} }
template <> template <>
CONVERSIONS_DECL half To<half, float16>(float16 val) { CONVERSIONS_DECL half2 To<half2, float16>(float16 val) {
return __half_raw{val.x}; return half2(__half2_raw{val.x, val.x});
} }
template <> template <>
CONVERSIONS_DECL half2 To<half2, float>(float val) { CONVERSIONS_DECL half To<half, float>(float val) {
return __float2half2_rn(val); #if CUDA_VERSION_MIN(9, 2, 0)
return __float2half(val);
#else
#if defined(__CUDA_ARCH__)
#define __HALF_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
__half ret;
asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(ret)) : "f"(val));
return ret;
#undef __HALF_TO_US
#else
return To<half>(To<float16>(val));
#endif
#endif
} }
template <> template <>
CONVERSIONS_DECL half2 To<half2, float16>(float16 val) { CONVERSIONS_DECL half2 To<half2, float>(float val) {
return half2(__half2_raw{val.x, val.x}); #if CUDA_VERSION_MIN(9, 2, 0)
return __float2half2_rn(val);
#else
#if defined(__CUDA_ARCH__)
#define __HALF2_TO_UI(var) *(reinterpret_cast<unsigned int*>(&(var)))
__half2 ret;
asm("{.reg .f16 low;\n"
" cvt.rn.f16.f32 low, %1;\n"
" mov.b32 %0, {low,low};}\n"
: "=r"(__HALF2_TO_UI(ret))
: "f"(val));
return ret;
#undef __HALF2_TO_UI
#else
return To<half2>(To<float16>(val));
#endif
#endif
} }
#endif // USE_CUDA #endif // USE_CUDA
......
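The inline PTX above (cvt.rn.f16.f32) is a single-instruction round-to-nearest-even narrowing. For readers without the ISA reference, a host-side sketch of the same conversion for normal numbers (NaN/inf folded together and subnormals flushed for brevity; illustrative, not Dragon code):

#include <cstdint>
#include <cstring>

uint16_t FloatToHalfBitsRNE(float f) {
  uint32_t u;
  std::memcpy(&u, &f, sizeof(u));
  const uint16_t sign = (u >> 16) & 0x8000u;
  const int32_t exp = (int32_t)((u >> 23) & 0xFFu) - 127 + 15;  // rebias
  const uint32_t mant = u & 0x7FFFFFu;
  if (exp >= 31) return sign | 0x7C00u;  // overflow -> inf (NaN folded in)
  if (exp <= 0) return sign;             // subnormals flushed in this sketch
  uint16_t h = sign | (uint16_t)(exp << 10) | (uint16_t)(mant >> 13);
  const uint32_t rem = mant & 0x1FFFu;   // the 13 dropped bits
  if (rem > 0x1000u || (rem == 0x1000u && (h & 1u))) h++;  // ties to even
  return h;
}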
...@@ -162,23 +162,17 @@ __global__ void _InvStd(const int n, const T eps, const T* x, T* y) { ...@@ -162,23 +162,17 @@ __global__ void _InvStd(const int n, const T eps, const T* x, T* y) {
} }
} }
template <> __global__ void _InvStd(const int n, const float eps, const half* x, half* y) {
__global__ void
_InvStd<half>(const int n, const half eps, const half* x, half* y) {
CUDA_1D_KERNEL_LOOP(i, n) { CUDA_1D_KERNEL_LOOP(i, n) {
#if __CUDA_ARCH__ >= 530 y[i] = __float2half(rsqrt(__half2float(x[i]) + eps));
y[i] = hrsqrt(__hadd(x[i], eps));
#endif
} }
} }
template <>
__global__ void __global__ void
_InvStd<half2>(const int n, const half2 eps, const half2* x, half2* y) { _InvStd(const int n, const float eps, const half2* x, half2* y) {
CUDA_1D_KERNEL_LOOP(i, n) { CUDA_1D_KERNEL_LOOP(i, n) {
#if __CUDA_ARCH__ >= 530 const float2 val = __half22float2(x[i]);
y[i] = h2rsqrt(__hadd2(x[i], eps)); y[i] = __floats2half2_rn(rsqrt(val.x + eps), rsqrt(val.y + eps));
#endif
} }
} }
...@@ -206,19 +200,15 @@ __global__ void _Powx(const int n, const T exponent, const T* x, T* y) { ...@@ -206,19 +200,15 @@ __global__ void _Powx(const int n, const T exponent, const T* x, T* y) {
__global__ void __global__ void
_Powx(const int n, const float exponent, const half* x, half* y) { _Powx(const int n, const float exponent, const half* x, half* y) {
CUDA_1D_KERNEL_LOOP(i, n) { CUDA_1D_KERNEL_LOOP(i, n) {
#if __CUDA_ARCH__ >= 530
y[i] = __float2half(pow(__half2float(x[i]), exponent)); y[i] = __float2half(pow(__half2float(x[i]), exponent));
#endif
} }
} }
__global__ void __global__ void
_Powx(const int n, const float exponent, const half2* x, half2* y) { _Powx(const int n, const float exponent, const half2* x, half2* y) {
CUDA_1D_KERNEL_LOOP(i, n) { CUDA_1D_KERNEL_LOOP(i, n) {
#if __CUDA_ARCH__ >= 530
const float2 val = __half22float2(x[i]); const float2 val = __half22float2(x[i]);
y[i] = __floats2half2_rn(pow(val.x, exponent), pow(val.y, exponent)); y[i] = __floats2half2_rn(pow(val.x, exponent), pow(val.y, exponent));
#endif
} }
} }
...@@ -269,20 +259,16 @@ __global__ void _Square(const int n, const T* x, T* y) { ...@@ -269,20 +259,16 @@ __global__ void _Square(const int n, const T* x, T* y) {
template <typename T> template <typename T>
__global__ void _NotZero(const int nthreads, const T* x, bool* y) { __global__ void _NotZero(const int nthreads, const T* x, bool* y) {
const T kZero = T(0);
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = x[i] != kZero ? true : false; y[i] = x[i] != T(0) ? true : false;
} }
} }
template <> template <>
__global__ void _NotZero<half>(const int nthreads, const half* x, bool* y) { __global__ void _NotZero<half>(const int nthreads, const half* x, bool* y) {
#if __CUDA_ARCH__ >= 530
const half kZero = __float2half(0.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = __hne(x[i], kZero) ? true : false; y[i] = __half2float(x[i]) != 0.f ? true : false;
} }
#endif
} }
template <typename T> template <typename T>
...@@ -560,15 +546,12 @@ DRAGON_API void InvStd<float16, CUDAContext>( ...@@ -560,15 +546,12 @@ DRAGON_API void InvStd<float16, CUDAContext>(
if ((n & 1) == 0) { if ((n & 1) == 0) {
_InvStd<<<CUDA_BLOCKS(n >> 1), CUDA_THREADS, 0, ctx->cuda_stream()>>>( _InvStd<<<CUDA_BLOCKS(n >> 1), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
n >> 1, n >> 1,
convert::To<half2>(eps), eps,
reinterpret_cast<const half2*>(x), reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y)); reinterpret_cast<half2*>(y));
} else { } else {
_InvStd<<<CUDA_BLOCKS(n), CUDA_THREADS, 0, ctx->cuda_stream()>>>( _InvStd<<<CUDA_BLOCKS(n), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
n, n, eps, reinterpret_cast<const half*>(x), reinterpret_cast<half*>(y));
convert::To<half>(eps),
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
} }
} }
......
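The rewritten kernels drop the __CUDA_ARCH__ >= 530 guards by widening half to float, computing in float, and narrowing back, which is valid on every architecture. A minimal sketch of the pattern, with a grid-stride loop standing in for CUDA_1D_KERNEL_LOOP:

#include <cuda_fp16.h>

__global__ void InvStdHalfSketch(int n, float eps, const half* x, half* y) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    y[i] = __float2half(rsqrtf(__half2float(x[i]) + eps));
  }
}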
...@@ -26,7 +26,7 @@ namespace math { ...@@ -26,7 +26,7 @@ namespace math {
template <typename T> template <typename T>
struct MaxFunctor { struct MaxFunctor {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const { inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs < rhs ? rhs : lhs; return lhs < rhs ? rhs : lhs;
} }
...@@ -39,7 +39,7 @@ struct MaxFunctor { ...@@ -39,7 +39,7 @@ struct MaxFunctor {
template <> template <>
struct MaxFunctor<float16> { struct MaxFunctor<float16> {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ float16 inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const { operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
...@@ -62,7 +62,7 @@ struct MaxFunctor<float16> { ...@@ -62,7 +62,7 @@ struct MaxFunctor<float16> {
#endif #endif
}; };
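// Note on the guard change running through this file: __CUDACC__ is defined
// for both the host and device passes whenever nvcc compiles a translation
// unit, while __CUDA_ARCH__ is defined only during device-code passes.
// Guarding the __device__ operator() with __CUDA_ARCH__ lets nvcc's host
// pass fall through to the plain host overload instead.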
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
template <> template <>
struct MaxFunctor<half> { struct MaxFunctor<half> {
inline __device__ half operator()(const half& lhs, const half& rhs) const { inline __device__ half operator()(const half& lhs, const half& rhs) const {
...@@ -87,7 +87,7 @@ struct MaxFunctor<half2> { ...@@ -87,7 +87,7 @@ struct MaxFunctor<half2> {
template <typename T> template <typename T>
struct MinFunctor { struct MinFunctor {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const { inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs < rhs ? lhs : rhs; return lhs < rhs ? lhs : rhs;
} }
...@@ -100,7 +100,7 @@ struct MinFunctor { ...@@ -100,7 +100,7 @@ struct MinFunctor {
template <> template <>
struct MinFunctor<float16> { struct MinFunctor<float16> {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ float16 inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const { operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
...@@ -123,7 +123,7 @@ struct MinFunctor<float16> { ...@@ -123,7 +123,7 @@ struct MinFunctor<float16> {
#endif #endif
}; };
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
template <> template <>
struct MinFunctor<half> { struct MinFunctor<half> {
inline __device__ half operator()(const half& lhs, const half& rhs) const { inline __device__ half operator()(const half& lhs, const half& rhs) const {
...@@ -148,7 +148,7 @@ struct MinFunctor<half2> { ...@@ -148,7 +148,7 @@ struct MinFunctor<half2> {
template <typename T> template <typename T>
struct PlusFunctor { struct PlusFunctor {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const { inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs + rhs; return lhs + rhs;
} }
...@@ -161,7 +161,7 @@ struct PlusFunctor { ...@@ -161,7 +161,7 @@ struct PlusFunctor {
template <> template <>
struct PlusFunctor<float16> { struct PlusFunctor<float16> {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ float16 inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const { operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
...@@ -183,7 +183,7 @@ struct PlusFunctor<float16> { ...@@ -183,7 +183,7 @@ struct PlusFunctor<float16> {
#endif #endif
}; };
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
template <> template <>
struct PlusFunctor<half> { struct PlusFunctor<half> {
inline __device__ half operator()(const half& lhs, const half& rhs) const { inline __device__ half operator()(const half& lhs, const half& rhs) const {
...@@ -211,7 +211,7 @@ struct PlusFunctor<half2> { ...@@ -211,7 +211,7 @@ struct PlusFunctor<half2> {
template <typename T> template <typename T>
struct MinusFunctor { struct MinusFunctor {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const { inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs - rhs; return lhs - rhs;
} }
...@@ -224,7 +224,7 @@ struct MinusFunctor { ...@@ -224,7 +224,7 @@ struct MinusFunctor {
template <> template <>
struct MinusFunctor<float16> { struct MinusFunctor<float16> {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ float16 inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const { operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
...@@ -246,7 +246,7 @@ struct MinusFunctor<float16> { ...@@ -246,7 +246,7 @@ struct MinusFunctor<float16> {
#endif #endif
}; };
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
template <> template <>
struct MinusFunctor<half> { struct MinusFunctor<half> {
inline __device__ half operator()(const half& lhs, const half& rhs) const { inline __device__ half operator()(const half& lhs, const half& rhs) const {
...@@ -274,7 +274,7 @@ struct MinusFunctor<half2> { ...@@ -274,7 +274,7 @@ struct MinusFunctor<half2> {
template <typename T> template <typename T>
struct MultipliesFunctor { struct MultipliesFunctor {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const { inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs * rhs; return lhs * rhs;
} }
...@@ -287,7 +287,7 @@ struct MultipliesFunctor { ...@@ -287,7 +287,7 @@ struct MultipliesFunctor {
template <> template <>
struct MultipliesFunctor<float16> { struct MultipliesFunctor<float16> {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ float16 inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const { operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
...@@ -309,7 +309,7 @@ struct MultipliesFunctor<float16> { ...@@ -309,7 +309,7 @@ struct MultipliesFunctor<float16> {
#endif #endif
}; };
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
template <> template <>
struct MultipliesFunctor<half> { struct MultipliesFunctor<half> {
inline __device__ half operator()(const half& lhs, const half& rhs) const { inline __device__ half operator()(const half& lhs, const half& rhs) const {
...@@ -337,7 +337,7 @@ struct MultipliesFunctor<half2> { ...@@ -337,7 +337,7 @@ struct MultipliesFunctor<half2> {
template <typename T> template <typename T>
struct DividesFunctor { struct DividesFunctor {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const { inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs / rhs; return lhs / rhs;
} }
...@@ -350,7 +350,7 @@ struct DividesFunctor { ...@@ -350,7 +350,7 @@ struct DividesFunctor {
template <> template <>
struct DividesFunctor<float16> { struct DividesFunctor<float16> {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ float16 inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const { operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
...@@ -372,7 +372,7 @@ struct DividesFunctor<float16> { ...@@ -372,7 +372,7 @@ struct DividesFunctor<float16> {
#endif #endif
}; };
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
template <> template <>
struct DividesFunctor<half> { struct DividesFunctor<half> {
inline __device__ half operator()(const half& lhs, const half& rhs) const { inline __device__ half operator()(const half& lhs, const half& rhs) const {
...@@ -396,7 +396,7 @@ struct DividesFunctor<half2> { ...@@ -396,7 +396,7 @@ struct DividesFunctor<half2> {
template <typename T> template <typename T>
struct PowFunctor { struct PowFunctor {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const { inline __device__ T operator()(const T& lhs, const T& rhs) const {
return pow(lhs, rhs); return pow(lhs, rhs);
} }
...@@ -409,7 +409,7 @@ struct PowFunctor { ...@@ -409,7 +409,7 @@ struct PowFunctor {
template <> template <>
struct PowFunctor<float16> { struct PowFunctor<float16> {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ float16 inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const { operator()(const float16& lhs, const float16& rhs) const {
half ret = __float2half( half ret = __float2half(
...@@ -425,7 +425,7 @@ struct PowFunctor<float16> { ...@@ -425,7 +425,7 @@ struct PowFunctor<float16> {
#endif #endif
}; };
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
template <> template <>
struct PowFunctor<half> { struct PowFunctor<half> {
inline __device__ half operator()(const half& lhs, const half& rhs) const { inline __device__ half operator()(const half& lhs, const half& rhs) const {
...@@ -449,7 +449,7 @@ struct PowFunctor<half2> { ...@@ -449,7 +449,7 @@ struct PowFunctor<half2> {
template <typename T> template <typename T>
struct EqualFunctor { struct EqualFunctor {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const { inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs == rhs; return lhs == rhs;
} }
...@@ -462,7 +462,7 @@ struct EqualFunctor { ...@@ -462,7 +462,7 @@ struct EqualFunctor {
template <> template <>
struct EqualFunctor<float16> { struct EqualFunctor<float16> {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs) inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const { const {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
...@@ -481,7 +481,7 @@ struct EqualFunctor<float16> { ...@@ -481,7 +481,7 @@ struct EqualFunctor<float16> {
#endif #endif
}; };
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
template <> template <>
struct EqualFunctor<half> { struct EqualFunctor<half> {
inline __device__ bool operator()(const half& lhs, const half& rhs) const { inline __device__ bool operator()(const half& lhs, const half& rhs) const {
...@@ -496,7 +496,7 @@ struct EqualFunctor<half> { ...@@ -496,7 +496,7 @@ struct EqualFunctor<half> {
template <typename T> template <typename T>
struct NotEqualFunctor { struct NotEqualFunctor {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const { inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs != rhs; return lhs != rhs;
} }
...@@ -509,7 +509,7 @@ struct NotEqualFunctor { ...@@ -509,7 +509,7 @@ struct NotEqualFunctor {
template <> template <>
struct NotEqualFunctor<float16> { struct NotEqualFunctor<float16> {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs) inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const { const {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
...@@ -528,7 +528,7 @@ struct NotEqualFunctor<float16> { ...@@ -528,7 +528,7 @@ struct NotEqualFunctor<float16> {
#endif #endif
}; };
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
template <> template <>
struct NotEqualFunctor<half> { struct NotEqualFunctor<half> {
inline __device__ bool operator()(const half& lhs, const half& rhs) const { inline __device__ bool operator()(const half& lhs, const half& rhs) const {
...@@ -543,7 +543,7 @@ struct NotEqualFunctor<half> { ...@@ -543,7 +543,7 @@ struct NotEqualFunctor<half> {
template <typename T> template <typename T>
struct GreaterFunctor { struct GreaterFunctor {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const { inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs > rhs; return lhs > rhs;
} }
...@@ -556,7 +556,7 @@ struct GreaterFunctor { ...@@ -556,7 +556,7 @@ struct GreaterFunctor {
template <> template <>
struct GreaterFunctor<float16> { struct GreaterFunctor<float16> {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs) inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const { const {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
...@@ -575,7 +575,7 @@ struct GreaterFunctor<float16> { ...@@ -575,7 +575,7 @@ struct GreaterFunctor<float16> {
#endif #endif
}; };
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
template <> template <>
struct GreaterFunctor<half> { struct GreaterFunctor<half> {
inline __device__ bool operator()(const half& lhs, const half& rhs) const { inline __device__ bool operator()(const half& lhs, const half& rhs) const {
...@@ -590,7 +590,7 @@ struct GreaterFunctor<half> { ...@@ -590,7 +590,7 @@ struct GreaterFunctor<half> {
template <typename T> template <typename T>
struct LessFunctor { struct LessFunctor {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const { inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs < rhs; return lhs < rhs;
} }
...@@ -603,7 +603,7 @@ struct LessFunctor { ...@@ -603,7 +603,7 @@ struct LessFunctor {
template <> template <>
struct LessFunctor<float16> { struct LessFunctor<float16> {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs) inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const { const {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
...@@ -622,7 +622,7 @@ struct LessFunctor<float16> { ...@@ -622,7 +622,7 @@ struct LessFunctor<float16> {
#endif #endif
}; };
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
template <> template <>
struct LessFunctor<half> { struct LessFunctor<half> {
inline __device__ bool operator()(const half& lhs, const half& rhs) const { inline __device__ bool operator()(const half& lhs, const half& rhs) const {
...@@ -637,7 +637,7 @@ struct LessFunctor<half> { ...@@ -637,7 +637,7 @@ struct LessFunctor<half> {
template <typename T> template <typename T>
struct GreaterEqualFunctor { struct GreaterEqualFunctor {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const { inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs >= rhs; return lhs >= rhs;
} }
...@@ -650,7 +650,7 @@ struct GreaterEqualFunctor { ...@@ -650,7 +650,7 @@ struct GreaterEqualFunctor {
template <> template <>
struct GreaterEqualFunctor<float16> { struct GreaterEqualFunctor<float16> {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs) inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const { const {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
...@@ -669,7 +669,7 @@ struct GreaterEqualFunctor<float16> { ...@@ -669,7 +669,7 @@ struct GreaterEqualFunctor<float16> {
#endif #endif
}; };
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
template <> template <>
struct GreaterEqualFunctor<half> { struct GreaterEqualFunctor<half> {
inline __device__ bool operator()(const half& lhs, const half& rhs) const { inline __device__ bool operator()(const half& lhs, const half& rhs) const {
...@@ -684,7 +684,7 @@ struct GreaterEqualFunctor<half> { ...@@ -684,7 +684,7 @@ struct GreaterEqualFunctor<half> {
template <typename T> template <typename T>
struct LessEqualFunctor { struct LessEqualFunctor {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const { inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs <= rhs; return lhs <= rhs;
} }
...@@ -697,7 +697,7 @@ struct LessEqualFunctor { ...@@ -697,7 +697,7 @@ struct LessEqualFunctor {
template <> template <>
struct LessEqualFunctor<float16> { struct LessEqualFunctor<float16> {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs) inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const { const {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
...@@ -716,7 +716,7 @@ struct LessEqualFunctor<float16> { ...@@ -716,7 +716,7 @@ struct LessEqualFunctor<float16> {
#endif #endif
}; };
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
template <> template <>
struct LessEqualFunctor<half> { struct LessEqualFunctor<half> {
inline __device__ bool operator()(const half& lhs, const half& rhs) const { inline __device__ bool operator()(const half& lhs, const half& rhs) const {
......
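The guard change from `__CUDACC__` to `__CUDA_ARCH__` that repeats through these functors is subtle but load-bearing: `__CUDACC__` is defined whenever nvcc processes a file, including its host-side pass, so the host pass previously saw only a `__device__`-qualified `operator()` it could never call. `__CUDA_ARCH__` is defined only while device code is being compiled, so each pass now sees exactly one usable overload. A sketch of the pattern (the `#else` branch mirrors what the elided lines of each functor presumably contain):

// Hypothetical functor illustrating the guard choice.
template <typename T>
struct PlusFunctorSketch {
#if defined(__CUDA_ARCH__)
  // Device pass only: device-qualified operator.
  inline __device__ T operator()(const T& lhs, const T& rhs) const {
    return lhs + rhs;
  }
#else
  // Host pass of nvcc, and plain C++ translation units.
  inline T operator()(const T& lhs, const T& rhs) const {
    return lhs + rhs;
  }
#endif
};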
...@@ -239,8 +239,8 @@ void ReduceSum<float16, CUDAContext>( ...@@ -239,8 +239,8 @@ void ReduceSum<float16, CUDAContext>(
num_axes, \ num_axes, \
axes, \ axes, \
Reducer<AccT>(), \ Reducer<AccT>(), \
AccT(kInit), \ convert::To<AccT>(kInit), \
AccT(scale), \ convert::To<AccT>(scale), \
x, \ x, \
y, \ y, \
ctx); \ ctx); \
......
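In the reduce launcher above, the reduction identity `kInit` and the output `scale` now go through `convert::To<AccT>` instead of a constructor-style cast. One plausible reason, sketched with hypothetical names: if the fp16 accumulator type is a storage-only struct, `AccT(kInit)` cannot perform the numeric float-to-half conversion, whereas a specialized converter can:

#include <cuda_fp16.h>

// Hypothetical converter; the generic case is a plain cast, and the fp16
// case converts explicitly (assumes CUDA 9+ host-callable __float2half).
template <typename DstT>
DstT ConvertScalarSketch(float val) {
  return static_cast<DstT>(val);
}

template <>
half ConvertScalarSketch<half>(float val) {
  return __float2half(val);  // round-to-nearest fp32 -> fp16
}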
...@@ -301,16 +301,16 @@ void ChannelAffine( ...@@ -301,16 +301,16 @@ void ChannelAffine(
/* array.channel_normalize */ /* array.channel_normalize */
template <typename Tx, typename Ty, class Context> template <typename InputT, typename OutputT, class Context>
void ChannelNormalize( void ChannelNormalize(
const int axis, const int axis,
const int num_dims, const int num_dims,
const int64_t* x_strides, const int64_t* x_strides,
const int64_t* y_dims, const int64_t* y_dims,
const Tx* x, const InputT* x,
const float* mean, const float* mean,
const float* std, const float* std,
Ty* y, OutputT* y,
Context* ctx); Context* ctx);
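The renamed `InputT`/`OutputT` parameters state the contract more plainly: the kernel reads `InputT`, normalizes in float with the per-channel `mean`/`std` selected by the index along `axis`, and writes `OutputT`, walking `y_dims` linearly while `x_strides` maps each output index back into the (possibly permuted) input. A hypothetical invocation for a contiguous NCHW float tensor normalized into fp16 (shapes and the ctx object are assumptions):

const int axis = 1, num_dims = 4;                            // channels at dim 1
const int64_t y_dims[4] = {8, 3, 32, 32};                    // N, C, H, W
const int64_t x_strides[4] = {3 * 32 * 32, 32 * 32, 32, 1};  // contiguous input
// float mean[3], std[3]: one entry per channel.
// kernel::ChannelNormalize<float, float16, CPUContext>(
//     axis, num_dims, x_strides, y_dims, x, mean, std, y, &ctx);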
/* array.channel_shuffle */ /* array.channel_shuffle */
...@@ -648,28 +648,28 @@ void BroadcastLossGrad( ...@@ -648,28 +648,28 @@ void BroadcastLossGrad(
/* loss.nll_loss */ /* loss.nll_loss */
template <typename LogitType, typename TargetType, class Context> template <typename LogitT, typename TargetT, class Context>
void NLLLoss( void NLLLoss(
const int outer_dim, const int outer_dim,
const int inner_dim, const int inner_dim,
const int axis_dim, const int axis_dim,
const int ignore_index, const int ignore_index,
const LogitType* logit, const LogitT* logit,
const TargetType* target, const TargetT* target,
LogitType* loss, LogitT* loss,
LogitType* mask, LogitT* mask,
Context* ctx); Context* ctx);
template <typename LogitType, typename TargetType, class Context> template <typename LogitT, typename TargetT, class Context>
void NLLLossGrad( void NLLLossGrad(
const int outer_dim, const int outer_dim,
const int inner_dim, const int inner_dim,
const int axis_dim, const int axis_dim,
const int ignore_index, const int ignore_index,
const LogitType* logit, const LogitT* logit,
const TargetType* target, const TargetT* target,
LogitType* dlogit, LogitT* dlogit,
LogitType* mask, LogitT* mask,
Context* ctx); Context* ctx);
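These NLL declarations index logits of shape (outer_dim, axis_dim, inner_dim) with integer targets of shape (outer_dim, inner_dim). A CPU reference sketch of the forward semantics, under the assumption that `logit` already holds log-probabilities and `mask` marks the positions that escaped `ignore_index`:

// Hypothetical reference implementation, not the library's kernel.
template <typename LogitT, typename TargetT>
void NLLLossSketch(int outer_dim, int inner_dim, int axis_dim,
                   int ignore_index, const LogitT* logit,
                   const TargetT* target, LogitT* loss, LogitT* mask) {
  for (int i = 0; i < outer_dim; ++i) {
    for (int j = 0; j < inner_dim; ++j) {
      const int k = i * inner_dim + j;
      const int t = static_cast<int>(target[k]);
      if (t == ignore_index) {
        loss[k] = LogitT(0), mask[k] = LogitT(0);  // ignored position
      } else {
        // Pick the log-probability of the target class along axis_dim.
        loss[k] = -logit[(i * axis_dim + t) * inner_dim + j];
        mask[k] = LogitT(1);
      }
    }
  }
}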
/* loss.sigmoid_ce_loss */ /* loss.sigmoid_ce_loss */
...@@ -694,7 +694,7 @@ void SigmoidCrossEntropyGrad( ...@@ -694,7 +694,7 @@ void SigmoidCrossEntropyGrad(
/* loss.sigmoid_focal_loss */ /* loss.sigmoid_focal_loss */
template <typename LogitType, typename TargetType, class Context> template <typename LogitT, typename TargetT, class Context>
void SigmoidFocalLoss( void SigmoidFocalLoss(
const int outer_dim, const int outer_dim,
const int inner_dim, const int inner_dim,
...@@ -703,13 +703,13 @@ void SigmoidFocalLoss( ...@@ -703,13 +703,13 @@ void SigmoidFocalLoss(
const float neg_alpha, const float neg_alpha,
const float gamma, const float gamma,
const int negative_index, const int negative_index,
const LogitType* logit, const LogitT* logit,
const TargetType* target, const TargetT* target,
LogitType* loss, LogitT* loss,
LogitType* mask, LogitT* mask,
Context* ctx); Context* ctx);
template <typename LogitType, typename TargetType, class Context> template <typename LogitT, typename TargetT, class Context>
void SigmoidFocalLossGrad( void SigmoidFocalLossGrad(
const int outer_dim, const int outer_dim,
const int inner_dim, const int inner_dim,
...@@ -718,10 +718,10 @@ void SigmoidFocalLossGrad( ...@@ -718,10 +718,10 @@ void SigmoidFocalLossGrad(
const float neg_alpha, const float neg_alpha,
const float gamma, const float gamma,
const int negative_index, const int negative_index,
const LogitType* logit, const LogitT* logit,
const TargetType* target, const TargetT* target,
LogitType* dlogit, LogitT* dlogit,
LogitType* mask, LogitT* mask,
Context* ctx); Context* ctx);
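The split `pos_alpha`/`neg_alpha` weights and the `gamma` focusing exponent in these declarations read as the standard sigmoid focal loss of Lin et al.; the following is one interpretation of the parameters, not the library's exact code, with :math:`p = \text{sigmoid}(logit)`:

\mathrm{FL}(p) =
  \begin{cases}
    -\,pos\_alpha \cdot (1 - p)^{\gamma} \log(p), & \text{positive targets} \\
    -\,neg\_alpha \cdot p^{\gamma} \log(1 - p), & \text{negative targets}
  \end{cases}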
/* loss.smooth_l1_loss */ /* loss.smooth_l1_loss */
...@@ -754,28 +754,28 @@ void SoftmaxCrossEntropy( ...@@ -754,28 +754,28 @@ void SoftmaxCrossEntropy(
/* loss.sparse_softmax_cross_entropy */ /* loss.sparse_softmax_cross_entropy */
template <typename LogitType, typename TargetType, class Context> template <typename LogitT, typename TargetT, class Context>
void SparseSoftmaxCrossEntropy( void SparseSoftmaxCrossEntropy(
const int outer_dim, const int outer_dim,
const int inner_dim, const int inner_dim,
const int axis_dim, const int axis_dim,
const int ignore_index, const int ignore_index,
const LogitType* prob, const LogitT* prob,
const TargetType* target, const TargetT* target,
LogitType* loss, LogitT* loss,
LogitType* mask, LogitT* mask,
Context* ctx); Context* ctx);
template <typename LogitType, typename TargetType, class Context> template <typename LogitT, typename TargetT, class Context>
void SparseSoftmaxCrossEntropyGrad( void SparseSoftmaxCrossEntropyGrad(
const int outer_dim, const int outer_dim,
const int inner_dim, const int inner_dim,
const int axis_dim, const int axis_dim,
const int ignore_index, const int ignore_index,
const LogitType* prob, const LogitT* prob,
const TargetType* target, const TargetT* target,
LogitType* dx, LogitT* dx,
LogitType* mask, LogitT* mask,
Context* ctx); Context* ctx);
/* math.abs */ /* math.abs */
......
...@@ -55,7 +55,7 @@ class BatchNormalization(Layer): ...@@ -55,7 +55,7 @@ class BatchNormalization(Layer):
axis : int, optional, default=-1 axis : int, optional, default=-1
The channel axis. The channel axis.
momentum : float, optional, default=0.99 momentum : float, optional, default=0.99
The momentum of moving average. The decay factor of running average.
epsilon : float, optional, default=1e-3 epsilon : float, optional, default=1e-3
The epsilon value. The epsilon value.
center : bool, optional, default=True center : bool, optional, default=True
......
...@@ -41,8 +41,8 @@ def batch_normalization( ...@@ -41,8 +41,8 @@ def batch_normalization(
The moving average of stats is calculated as: The moving average of stats is calculated as:
.. math:: x_{moving} \leftarrow momentum * x_{moving} + (1 - momentum) * x_{stat} .. math:: x_{\text{running}} = \text{momentum} * x_{\text{running}} + (1 - \text{momentum}) * x_{\text{batch}}
Parameters Parameters
---------- ----------
...@@ -58,10 +58,10 @@ def batch_normalization( ...@@ -58,10 +58,10 @@ def batch_normalization(
The :math:`\gamma` tensor. The :math:`\gamma` tensor.
axis : int, optional, default=-1 axis : int, optional, default=-1
The channel axis. The channel axis.
momentum : float, optional, default=0.9 momentum : Union[float, dragon.Tensor], optional
The momentum of moving average. The value of :math:`\text{momentum}`.
variance_epsilon : float, optional, default=1e-5 variance_epsilon : float, optional, default=1e-5
The value of epsilon. The value of :math:`\epsilon`.
trainable : bool, optional, default=False trainable : bool, optional, default=False
The optional training flag. The optional training flag.
name : str, optional name : str, optional
......
...@@ -50,7 +50,7 @@ class BatchNorm(layer.Layer): ...@@ -50,7 +50,7 @@ class BatchNorm(layer.Layer):
Parameters Parameters
---------- ----------
decay : float, optional, default=0.9 decay : float, optional, default=0.9
The decay factor for moving average. The decay factor of running average.
epsilon : float, optional, default=1e-5 epsilon : float, optional, default=1e-5
The epsilon. The epsilon.
act : callable, optional act : callable, optional
......
...@@ -8,7 +8,6 @@ ...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause> # <https://opensource.org/licenses/BSD-2-Clause>
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
"""Code generator for Runtime API.""" """Code generator for Runtime API."""
from __future__ import absolute_import from __future__ import absolute_import
......
...@@ -89,7 +89,8 @@ def batch_norm( ...@@ -89,7 +89,8 @@ def batch_norm(
The moving average of stats is calculated as: The moving average of stats is calculated as:
.. math:: x_{moving} \leftarrow (1 - momentum) * x_{moving} + momentum * x_{stat} .. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} + \text{momentum} * x_{\text{batch}}
Parameters Parameters
---------- ----------
...@@ -124,9 +125,9 @@ def batch_norm( ...@@ -124,9 +125,9 @@ def batch_norm(
.instantiate( .instantiate(
input.device, input.device,
training=training, training=training,
momentum=momentum,
epsilon=eps, epsilon=eps,
).apply(input, running_mean, running_var, weight, bias) ).apply(input, running_mean, running_var, weight, bias, momentum)
def binary_cross_entropy_with_logits( def binary_cross_entropy_with_logits(
...@@ -1598,7 +1599,7 @@ def sync_batch_norm( ...@@ -1598,7 +1599,7 @@ def sync_batch_norm(
The moving average of stats is calculated as: The moving average of stats is calculated as:
.. math:: x_{moving} \leftarrow (1 - momentum) * x_{moving} + momentum * x_{stat} .. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} + \text{momentum} * x_{\text{batch}}
Additionally, you can specify ``process_group`` to perform synchronization. Additionally, you can specify ``process_group`` to perform synchronization.
......
...@@ -111,24 +111,31 @@ class BatchNorm(function.Function): ...@@ -111,24 +111,31 @@ class BatchNorm(function.Function):
def __init__(self, key, dev, **kwargs): def __init__(self, key, dev, **kwargs):
super(BatchNorm, self).__init__(key, dev, **kwargs) super(BatchNorm, self).__init__(key, dev, **kwargs)
self.momentum = kwargs.get('momentum', 0.1)
self.epsilon = kwargs.get('epsilon', 1e-5) self.epsilon = kwargs.get('epsilon', 1e-5)
self.training = kwargs.get('training', False) self.training = kwargs.get('training', False)
self.track_stats = kwargs.get('track_stats', True)
def setup(self, ws, handle, momentum):
self.feed_arg(ws, '{}/momentum'.format(handle), 1.0 - momentum, 'float32')
def attributes(self): def attributes(self):
return { return {
'op_type': 'BatchNorm', 'op_type': 'BatchNorm',
'arguments': { 'arguments': {
'axis': 1, 'axis': 1,
'momentum': 1. - self.momentum,
'epsilon': self.epsilon, 'epsilon': self.epsilon,
'use_stats': int(not self.training), 'use_stats': int(not self.training),
'momentum_desc': '${HANDLE}/momentum',
} }
} }
def forward(self, input, running_mean, running_var, weight, bias): def forward(self, input, running_mean, running_var, weight, bias, momentum):
inputs = [input, weight, bias, running_mean, running_var] inputs = [input, weight, bias, running_mean, running_var]
return self.dispatch(inputs, [self.alloc()]) return self.dispatch(
inputs, [self.alloc()],
callback=lambda ws, handle:
self.setup(ws, handle, momentum),
)
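The reworked `BatchNorm` function turns momentum into a run-time input: `attributes()` now advertises `momentum_desc` pointing at a handle-scoped tensor, and `setup()` feeds `1.0 - momentum` into it on every dispatch, so a new momentum value (or a tensor-valued momentum) no longer changes the cached op. The `1.0 -` flip reconciles the PyTorch convention with the backend's blend:

% Both conventions give the same update when m_backend = 1 - m_torch:
x_{\text{running}} = (1 - m_{\text{torch}})\,x_{\text{running}} + m_{\text{torch}}\,x_{\text{batch}}
                   = m_{\text{backend}}\,x_{\text{running}} + (1 - m_{\text{backend}})\,x_{\text{batch}}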
class Conv2d(_ConvNd): class Conv2d(_ConvNd):
......
...@@ -25,6 +25,8 @@ from dragon.vm.torch.core.tensor import Tensor ...@@ -25,6 +25,8 @@ from dragon.vm.torch.core.tensor import Tensor
class _BatchNorm(Module): class _BatchNorm(Module):
"""BatchNorm base module."""
def __init__( def __init__(
self, self,
num_features, num_features,
...@@ -45,20 +47,26 @@ class _BatchNorm(Module): ...@@ -45,20 +47,26 @@ class _BatchNorm(Module):
else: else:
self.register_buffer('weight', init_funcs.ones(num_features)) self.register_buffer('weight', init_funcs.ones(num_features))
self.register_buffer('bias', init_funcs.zeros(num_features)) self.register_buffer('bias', init_funcs.zeros(num_features))
if self.track_running_stats:
self.num_batches_tracked = 0
else:
self.num_batches_tracked = None
self.register_buffer('running_mean', init_funcs.zeros(num_features)) self.register_buffer('running_mean', init_funcs.zeros(num_features))
self.register_buffer('running_var', init_funcs.ones(num_features)) self.register_buffer('running_var', init_funcs.ones(num_features))
self.inputs = [self.running_mean, self.running_var, self.weight, self.bias] self.inputs = [self.running_mean, self.running_var, self.weight, self.bias]
self.reset_parameters() self.reset_parameters()
def reset_parameters(self):
if self.affine:
self.weight.data.one_()
self.bias.data.zero_()
def reset_running_stats(self): def reset_running_stats(self):
if self.track_running_stats: if self.track_running_stats:
self.running_mean.zero_() self.running_mean.zero_()
self.running_var.fill_(1) self.running_var.fill_(1)
self.num_batches_tracked = 0
def reset_parameters(self):
self.reset_running_stats()
if self.affine:
self.weight.data.one_()
self.bias.data.zero_()
def extra_repr(self): def extra_repr(self):
return '{num_features}, ' \ return '{num_features}, ' \
...@@ -72,7 +80,7 @@ class _BatchNorm(Module): ...@@ -72,7 +80,7 @@ class _BatchNorm(Module):
return F.batch_norm( return F.batch_norm(
input, *self.inputs, input, *self.inputs,
training=self.training, training=self.training,
momentum=self.momentum, momentum=self._get_momentum(),
eps=self.eps eps=self.eps
) )
...@@ -82,6 +90,19 @@ class _BatchNorm(Module): ...@@ -82,6 +90,19 @@ class _BatchNorm(Module):
return self # Float32 parameters are required. return self # Float32 parameters are required.
return super(_BatchNorm, self)._apply(fn) return super(_BatchNorm, self)._apply(fn)
def _get_momentum(self):
"""Return the current momentum value."""
momentum = 0.0 if self.momentum is None else self.momentum
if self.track_running_stats:
if self.training:
if self.num_batches_tracked is not None:
self.num_batches_tracked += 1
if self.momentum is None:
momentum = 1.0 / float(self.num_batches_tracked)
else:
momentum = 0.0
return momentum
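`_get_momentum` mirrors PyTorch's semantics: momentum is forced to zero when running stats are not tracked, and `momentum=None` with tracking enabled switches to a cumulative moving average over the batches seen so far:

% With momentum = None, the n-th tracked batch uses momentum = 1/n, i.e.
x_{\text{running}}^{(n)} = \tfrac{n-1}{n}\,x_{\text{running}}^{(n-1)} + \tfrac{1}{n}\,x_{\text{batch}}^{(n)}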
class BatchNorm1d(_BatchNorm): class BatchNorm1d(_BatchNorm):
r"""Apply the batch normalization over 2d input. r"""Apply the batch normalization over 2d input.
...@@ -93,7 +114,8 @@ class BatchNorm1d(_BatchNorm): ...@@ -93,7 +114,8 @@ class BatchNorm1d(_BatchNorm):
The running average of statistics is calculated as: The running average of statistics is calculated as:
.. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} + \text{momentum} * x_{\text{stat}} .. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} + \text{momentum} * x_{\text{batch}}
See Also See Also
-------- --------
...@@ -109,16 +131,16 @@ class BatchNorm1d(_BatchNorm): ...@@ -109,16 +131,16 @@ class BatchNorm1d(_BatchNorm):
affine=True, affine=True,
track_running_stats=True, track_running_stats=True,
): ):
"""Create a ``BatchNorm1d`` module. r"""Create a ``BatchNorm1d`` module.
Parameters Parameters
---------- ----------
num_features : int num_features : int
The number of channels. The number of channels.
eps : float, optional, default=1e-5 eps : float, optional, default=1e-5
The epsilon value. The value of :math:`\epsilon`.
momentum : float, optional, default=0.1 momentum : float, optional, default=0.1
The momentum of moving average. The value of :math:`\text{momentum}`.
affine : bool, optional, default=True affine : bool, optional, default=True
**True** to apply an affine transformation. **True** to apply an affine transformation.
track_running_stats : bool, optional, default=True track_running_stats : bool, optional, default=True
...@@ -142,7 +164,8 @@ class BatchNorm2d(_BatchNorm): ...@@ -142,7 +164,8 @@ class BatchNorm2d(_BatchNorm):
The running average of statistics is calculated as: The running average of statistics is calculated as:
.. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} + \text{momentum} * x_{\text{stat}} .. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} + \text{momentum} * x_{\text{batch}}
See Also See Also
-------- --------
...@@ -158,16 +181,16 @@ class BatchNorm2d(_BatchNorm): ...@@ -158,16 +181,16 @@ class BatchNorm2d(_BatchNorm):
affine=True, affine=True,
track_running_stats=True, track_running_stats=True,
): ):
"""Create a ``BatchNorm2d`` module. r"""Create a ``BatchNorm2d`` module.
Parameters Parameters
---------- ----------
num_features : int num_features : int
The number of channels. The number of channels.
eps : float, optional, default=1e-5 eps : float, optional, default=1e-5
The epsilon value. The value of :math:`\epsilon`.
momentum : float, optional, default=0.1 momentum : float, optional, default=0.1
The momentum of moving average. The value of :math:`\text{momentum}`.
affine : bool, optional, default=True affine : bool, optional, default=True
**True** to apply an affine transformation. **True** to apply an affine transformation.
track_running_stats : bool, optional, default=True track_running_stats : bool, optional, default=True
...@@ -191,7 +214,8 @@ class BatchNorm3d(_BatchNorm): ...@@ -191,7 +214,8 @@ class BatchNorm3d(_BatchNorm):
The running average of statistics is calculated as: The running average of statistics is calculated as:
.. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} + \text{momentum} * x_{\text{stat}} .. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} + \text{momentum} * x_{\text{batch}}
See Also See Also
-------- --------
...@@ -207,16 +231,16 @@ class BatchNorm3d(_BatchNorm): ...@@ -207,16 +231,16 @@ class BatchNorm3d(_BatchNorm):
affine=True, affine=True,
track_running_stats=True, track_running_stats=True,
): ):
"""Create a ``BatchNorm3d`` module. r"""Create a ``BatchNorm3d`` module.
Parameters Parameters
---------- ----------
num_features : int num_features : int
The number of channels. The number of channels.
eps : float, optional, default=1e-5 eps : float, optional, default=1e-5
The epsilon value. The value of :math:`\epsilon`.
momentum : float, optional, default=0.1 momentum : float, optional, default=0.1
The momentum of moving average. The value of :math:`\text{momentum}`.
affine : bool, optional, default=True affine : bool, optional, default=True
**True** to apply an affine transformation. **True** to apply an affine transformation.
track_running_stats : bool, optional, default=True track_running_stats : bool, optional, default=True
...@@ -240,7 +264,8 @@ class SyncBatchNorm(_BatchNorm): ...@@ -240,7 +264,8 @@ class SyncBatchNorm(_BatchNorm):
The running average of statistics is calculated as: The running average of statistics is calculated as:
.. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} + \text{momentum} * x_{\text{stat}} .. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} + \text{momentum} * x_{\text{batch}}
Additionally, specify ``process_group`` to perform synchronization. Additionally, specify ``process_group`` to perform synchronization.
...@@ -261,16 +286,16 @@ class SyncBatchNorm(_BatchNorm): ...@@ -261,16 +286,16 @@ class SyncBatchNorm(_BatchNorm):
track_running_stats=True, track_running_stats=True,
process_group=None, process_group=None,
): ):
"""Create a ``SyncBatchNorm`` module. r"""Create a ``SyncBatchNorm`` module.
Parameters Parameters
---------- ----------
num_features : int num_features : int
The number of channels. The number of channels.
eps : float, optional, default=1e-5 eps : float, optional, default=1e-5
The epsilon value. The value of :math:`\epsilon`.
momentum : float, optional, default=0.1 momentum : float, optional, default=0.1
The momentum of moving average. The value of :math:`\text{momentum}`.
affine : bool, optional, default=True affine : bool, optional, default=True
**True** to apply an affine transformation. **True** to apply an affine transformation.
track_running_stats : bool, optional, default=True track_running_stats : bool, optional, default=True
...@@ -292,7 +317,7 @@ class SyncBatchNorm(_BatchNorm): ...@@ -292,7 +317,7 @@ class SyncBatchNorm(_BatchNorm):
return F.sync_batch_norm( return F.sync_batch_norm(
input, *self.inputs, input, *self.inputs,
training=self.training, training=self.training,
momentum=self.momentum, momentum=self._get_momentum(),
eps=self.eps, eps=self.eps,
process_group=self.process_group process_group=self.process_group
) )
...@@ -300,6 +325,6 @@ class SyncBatchNorm(_BatchNorm): ...@@ -300,6 +325,6 @@ class SyncBatchNorm(_BatchNorm):
return F.batch_norm( return F.batch_norm(
input, *self.inputs, input, *self.inputs,
training=self.training, training=self.training,
momentum=self.momentum, momentum=self._get_momentum(),
eps=self.eps eps=self.eps
) )
...@@ -61,7 +61,7 @@ class AffineChannel(Module): ...@@ -61,7 +61,7 @@ class AffineChannel(Module):
fix_bias=False, fix_bias=False,
inplace=False, inplace=False,
): ):
"""Create an ``Affine`` module. """Create an ``AffineChannel`` module.
Parameters Parameters
---------- ----------
...@@ -141,7 +141,7 @@ class GroupNorm(Module): ...@@ -141,7 +141,7 @@ class GroupNorm(Module):
eps=1e-5, eps=1e-5,
affine=True, affine=True,
): ):
"""Create a ``GroupNorm`` module. r"""Create a ``GroupNorm`` module.
Parameters Parameters
---------- ----------
...@@ -150,7 +150,7 @@ class GroupNorm(Module): ...@@ -150,7 +150,7 @@ class GroupNorm(Module):
num_channels : int num_channels : int
The number of channels. The number of channels.
eps : float, optional, default=1e-5 eps : float, optional, default=1e-5
The epsilon value. The value of :math:`\epsilon`.
affine : bool, optional, default=True affine : bool, optional, default=True
**True** to apply an affine transformation. **True** to apply an affine transformation.
...@@ -228,11 +228,11 @@ class LocalResponseNorm(Module): ...@@ -228,11 +228,11 @@ class LocalResponseNorm(Module):
size : int, required size : int, required
The number of neighbouring channels to sum over. The number of neighbouring channels to sum over.
alpha : float, optional, default=0.0001 alpha : float, optional, default=0.0001
The scale value :math:`\alpha`. The value of :math:`\alpha`.
beta : float, optional, default=0.75 beta : float, optional, default=0.75
The exponent value :math:`\beta`. The value of :math:`\beta`.
k : float, optional, default=1. k : float, optional, default=1.
The bias constant :math:`k`. The value of :math:`k`.
""" """
super(LocalResponseNorm, self).__init__() super(LocalResponseNorm, self).__init__()
......