Commit 1ad360e9 by Ting PAN

Add tests of operator spec for AutoGraph

Summary:
This commit tests the correctness of the shape and data type
inference performed by the autograph module.
1 parent 1bd78a3c
Showing with 1313 additions and 1031 deletions
@@ -7,16 +7,16 @@ namespace kernel {
 namespace {
-template <typename Tx, typename Ty>
+template <typename InputT, typename OutputT>
 void _ChannelNormalize(
     const int axis,
     const int num_dims,
     const int64_t* x_strides,
     const int64_t* y_dims,
-    const Tx* x,
+    const InputT* x,
     const float* mean,
     const float* std,
-    Ty* y) {
+    OutputT* y) {
   const auto count =
       std::accumulate(y_dims, y_dims + num_dims, 1, std::multiplies<int64_t>());
   vec64_t idx(num_dims, 0);
@@ -27,7 +27,8 @@ void _ChannelNormalize(
       xi += idx[d] * x_strides[d];
       if (d == axis) wi = idx[d];
     }
-    y[yi] = ((Ty)x[xi] - (Ty)mean[wi]) / (Ty)std[wi];
+    y[yi] =
+        convert::To<OutputT>((convert::To<float>(x[xi]) - mean[wi]) / std[wi]);
     math::utils::IncreaseIndexInDims(num_dims, y_dims, idx.data());
   }
 }
@@ -36,83 +37,43 @@ void _ChannelNormalize(
 /* ------------------- Launcher Separator ------------------- */
-template <>
-void ChannelNormalize<float16, float16, CPUContext>(
-    const int axis,
-    const int num_dims,
-    const int64_t* x_strides,
-    const int64_t* y_dims,
-    const float16* x,
-    const float* mean,
-    const float* std,
-    float16* y,
-    CPUContext* ctx) {
-  CPU_FP16_NOT_SUPPORTED;
-}
-#define DEFINE_KERNEL_LAUNCHER(Tx, Ty) \
+#define DEFINE_KERNEL_LAUNCHER(InputT, OutputT) \
   template <> \
-  void ChannelNormalize<Tx, Ty, CPUContext>( \
+  void ChannelNormalize<InputT, OutputT, CPUContext>( \
       const int axis, \
      const int num_dims, \
      const int64_t* x_strides, \
      const int64_t* y_dims, \
-      const Tx* x, \
+      const InputT* x, \
      const float* mean, \
      const float* std, \
-      Ty* y, \
+      OutputT* y, \
      CPUContext* ctx) { \
    _ChannelNormalize(axis, num_dims, x_strides, y_dims, x, mean, std, y); \
  }
-#define DEFINE_FP16_KERNEL_LAUNCHER(T) \
-  template <> \
-  void ChannelNormalize<float16, T, CPUContext>( \
-      const int axis, \
-      const int num_dims, \
-      const int64_t* x_strides, \
-      const int64_t* y_dims, \
-      const float16* x, \
-      const float* mean, \
-      const float* std, \
-      T* y, \
-      CPUContext* ctx) { \
-    CPU_FP16_NOT_SUPPORTED; \
-  } \
-  template <> \
-  void ChannelNormalize<T, float16, CPUContext>( \
-      const int axis, \
-      const int num_dims, \
-      const int64_t* x_strides, \
-      const int64_t* y_dims, \
-      const T* x, \
-      const float* mean, \
-      const float* std, \
-      float16* y, \
-      CPUContext* ctx) { \
-    CPU_FP16_NOT_SUPPORTED; \
-  }
+DEFINE_KERNEL_LAUNCHER(int8_t, float16);
 DEFINE_KERNEL_LAUNCHER(int8_t, float);
 DEFINE_KERNEL_LAUNCHER(int8_t, double);
+DEFINE_KERNEL_LAUNCHER(uint8_t, float16);
 DEFINE_KERNEL_LAUNCHER(uint8_t, float);
 DEFINE_KERNEL_LAUNCHER(uint8_t, double);
+DEFINE_KERNEL_LAUNCHER(int, float16);
 DEFINE_KERNEL_LAUNCHER(int, float);
 DEFINE_KERNEL_LAUNCHER(int, double);
+DEFINE_KERNEL_LAUNCHER(int64_t, float16);
 DEFINE_KERNEL_LAUNCHER(int64_t, float);
 DEFINE_KERNEL_LAUNCHER(int64_t, double);
+DEFINE_KERNEL_LAUNCHER(float16, float16);
+DEFINE_KERNEL_LAUNCHER(float16, float);
+DEFINE_KERNEL_LAUNCHER(float16, double);
+DEFINE_KERNEL_LAUNCHER(float, float16);
 DEFINE_KERNEL_LAUNCHER(float, float);
 DEFINE_KERNEL_LAUNCHER(float, double);
+DEFINE_KERNEL_LAUNCHER(double, float16);
 DEFINE_KERNEL_LAUNCHER(double, float);
 DEFINE_KERNEL_LAUNCHER(double, double);
-DEFINE_FP16_KERNEL_LAUNCHER(int8_t);
-DEFINE_FP16_KERNEL_LAUNCHER(uint8_t);
-DEFINE_FP16_KERNEL_LAUNCHER(int);
-DEFINE_FP16_KERNEL_LAUNCHER(int64_t);
-DEFINE_FP16_KERNEL_LAUNCHER(float);
-DEFINE_FP16_KERNEL_LAUNCHER(double);
 #undef DEFINE_KERNEL_LAUNCHER
-#undef DEFINE_FP16_KERNEL_LAUNCHER
 } // namespace kernel
......
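Note on the refactor above: the fp16-specific launchers disappear because the kernel now widens every input to float, applies the float mean/std, and narrows to the output type through convert::To. A minimal standalone sketch of the same idea (the To helper below is a hypothetical stand-in for dragon's convert::To, and the loop is a plain 2-D layout rather than the strided N-D one):

// Hypothetical stand-in for dragon's convert::To<DstT>(src).
template <typename DstT, typename SrcT>
DstT To(SrcT val) {
  return static_cast<DstT>(val);
}

// Normalize y = (x - mean[c]) / std[c] over a (channels, inner_dim) buffer.
// All arithmetic happens in float regardless of InputT/OutputT, which is
// what lets one kernel body serve every (InputT, OutputT) pair.
template <typename InputT, typename OutputT>
void ChannelNormalize2D(
    int channels,
    int inner_dim,
    const InputT* x,
    const float* mean,
    const float* std,
    OutputT* y) {
  for (int c = 0; c < channels; ++c) {
    for (int i = 0; i < inner_dim; ++i) {
      const int k = c * inner_dim + i;
      y[k] = To<OutputT>((To<float>(x[k]) - mean[c]) / std[c]);
    }
  }
}
// Usage: ChannelNormalize2D<unsigned char, float>(3, 224 * 224, x, mean, std, y);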
@@ -10,44 +10,23 @@ namespace kernel {
 namespace {
-template <typename Tx, typename Ty, int D>
-__global__ void _ChannelNormalize(
-    const int nthreads,
-    const int axis,
-    const int num_dims,
-    const SimpleArray<int, D> x_strides,
-    const SimpleArray<int, D> y_dims,
-    const Tx* x,
-    const float* mean,
-    const float* std,
-    Ty* y) {
-  CUDA_1D_KERNEL_LOOP(yi, nthreads) {
-    int xi = 0, wi, tmp = yi;
-    for (int d = num_dims - 1; d >= 0; --d) {
-      int r;
-      FIXED_DIVISOR_DIV_MOD(y_dims.data[d], tmp, &tmp, &r);
-      xi += r * x_strides.data[d];
-      if (d == axis) wi = r;
-    }
-#if __CUDA_ARCH__ >= 350
-    y[yi] = ((Ty)x[xi] - (Ty)__ldg(mean + wi)) / (Ty)__ldg(std + wi);
-#else
-    y[yi] = ((Ty)x[xi] - (Ty)mean[wi]) / (Ty)std[wi];
-#endif
-  }
-}
-template <typename T, int D>
-__global__ void _ChannelNormalizeHalf(
+#if __CUDA_ARCH__ >= 350
+#define LDG(x, i) __ldg(x + i)
+#else
+#define LDG(x, i) x[i]
+#endif
+template <typename InputT, typename OutputT, int D>
+__global__ void _ChannelNormalize(
     const int nthreads,
     const int axis,
     const int num_dims,
     const SimpleArray<int, D> x_strides,
     const SimpleArray<int, D> y_dims,
-    const T* x,
+    const InputT* x,
     const float* mean,
     const float* std,
-    half* y) {
+    OutputT* y) {
   CUDA_1D_KERNEL_LOOP(yi, nthreads) {
     int xi = 0, wi, tmp = yi;
     for (int d = num_dims - 1; d >= 0; --d) {
@@ -56,119 +35,28 @@ __global__ void _ChannelNormalizeHalf(
       xi += r * x_strides.data[d];
       if (d == axis) wi = r;
     }
-#if __CUDA_ARCH__ >= 350
-    y[yi] = __float2half(((float)x[xi] - __ldg(mean + wi)) / __ldg(std + wi));
-#else
-    y[yi] = __float2half(((float)x[xi] - mean[wi]) / std[wi]);
-#endif
+    y[yi] = convert::To<OutputT>(
+        (convert::To<float>(x[xi]) - LDG(mean, wi)) / LDG(std, wi));
   }
 }
-template <typename T, int D>
-__global__ void _ChannelNormalizeHalf(
-    const int nthreads,
-    const int axis,
-    const int num_dims,
-    const SimpleArray<int, D> x_strides,
-    const SimpleArray<int, D> y_dims,
-    const half* x,
-    const float* mean,
-    const float* std,
-    T* y) {
-  CUDA_1D_KERNEL_LOOP(yi, nthreads) {
-    int xi = 0, wi, tmp = yi;
-    for (int d = num_dims - 1; d >= 0; --d) {
-      int r;
-      FIXED_DIVISOR_DIV_MOD(y_dims.data[d], tmp, &tmp, &r);
-      xi += r * x_strides.data[d];
-      if (d == axis) wi = r;
-    }
-#if __CUDA_ARCH__ >= 350
-    y[yi] = (T)((__half2float(x[xi]) - __ldg(mean + wi)) / __ldg(std + wi));
-#else
-    y[yi] = (T)((__half2float(x[xi]) - mean[wi]) / std[wi]);
-#endif
-  }
-}
-template <int D>
-__global__ void _ChannelNormalizeHalfAndHalf(
-    const int nthreads,
-    const int axis,
-    const int num_dims,
-    const SimpleArray<int, D> x_strides,
-    const SimpleArray<int, D> y_dims,
-    const half* x,
-    const float* mean,
-    const float* std,
-    half* y) {
-  CUDA_1D_KERNEL_LOOP(yi, nthreads) {
-    int xi = 0, wi, tmp = yi;
-    for (int d = num_dims - 1; d >= 0; --d) {
-      int r;
-      FIXED_DIVISOR_DIV_MOD(y_dims.data[d], tmp, &tmp, &r);
-      xi += r * x_strides.data[d];
-      if (d == axis) wi = r;
-    }
-#if __CUDA_ARCH__ >= 350
-    y[yi] = __float2half(
-        ((__half2float(x[xi]) - __ldg(mean + wi)) / __ldg(std + wi)));
-#else
-    y[yi] = __float2half(((__half2float(x[xi]) - mean[wi]) / std[wi]));
-#endif
-  }
-}
+#undef LDG
 } // namespace
 /* ------------------- Launcher Separator ------------------- */
-template <>
-void ChannelNormalize<float16, float16, CUDAContext>(
-    const int axis,
-    const int num_dims,
-    const int64_t* x_strides,
-    const int64_t* y_dims,
-    const float16* x,
-    const float* mean,
-    const float* std,
-    float16* y,
-    CUDAContext* ctx) {
-  CUDA_TENSOR_DIMS_CHECK(num_dims);
-  SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_strides, Y_dims;
-  const auto nthreads =
-      std::accumulate(y_dims, y_dims + num_dims, 1, std::multiplies<int64_t>());
-  for (int i = 0; i < num_dims; ++i) {
-    X_strides.data[i] = x_strides[i];
-    Y_dims.data[i] = y_dims[i];
-  }
-  _ChannelNormalizeHalfAndHalf<<<
-      CUDA_BLOCKS(nthreads),
-      CUDA_THREADS,
-      0,
-      ctx->cuda_stream()>>>(
-      nthreads,
-      axis,
-      num_dims,
-      X_strides,
-      Y_dims,
-      reinterpret_cast<const half*>(x),
-      mean,
-      std,
-      reinterpret_cast<half*>(y));
-}
-#define DEFINE_KERNEL_LAUNCHER(Tx, Ty) \
+#define DEFINE_KERNEL_LAUNCHER(InputT, OutputT) \
   template <> \
-  void ChannelNormalize<Tx, Ty, CUDAContext>( \
+  void ChannelNormalize<InputT, OutputT, CUDAContext>( \
      const int axis, \
      const int num_dims, \
      const int64_t* x_strides, \
      const int64_t* y_dims, \
-      const Tx* x, \
+      const InputT* x, \
      const float* mean, \
      const float* std, \
-      Ty* y, \
+      OutputT* y, \
      CUDAContext* ctx) { \
    CUDA_TENSOR_DIMS_CHECK(num_dims); \
    SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_strides, Y_dims; \
@@ -186,96 +74,28 @@ void ChannelNormalize<float16, float16, CUDAContext>(
        nthreads, axis, num_dims, X_strides, Y_dims, x, mean, std, y); \
  }
-#define DEFINE_FP16_KERNEL_LAUNCHER(T) \
-  template <> \
-  void ChannelNormalize<float16, T, CUDAContext>( \
-      const int axis, \
-      const int num_dims, \
-      const int64_t* x_strides, \
-      const int64_t* y_dims, \
-      const float16* x, \
-      const float* mean, \
-      const float* std, \
-      T* y, \
-      CUDAContext* ctx) { \
-    CUDA_TENSOR_DIMS_CHECK(num_dims); \
-    SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_strides, Y_dims; \
-    const auto nthreads = std::accumulate( \
-        y_dims, y_dims + num_dims, 1, std::multiplies<int64_t>()); \
-    for (int i = 0; i < num_dims; ++i) { \
-      X_strides.data[i] = x_strides[i]; \
-      Y_dims.data[i] = y_dims[i]; \
-    } \
-    _ChannelNormalizeHalf<<< \
-        CUDA_BLOCKS(nthreads), \
-        CUDA_THREADS, \
-        0, \
-        ctx->cuda_stream()>>>( \
-        nthreads, \
-        axis, \
-        num_dims, \
-        X_strides, \
-        Y_dims, \
-        reinterpret_cast<const half*>(x), \
-        mean, \
-        std, \
-        y); \
-  } \
-  template <> \
-  void ChannelNormalize<T, float16, CUDAContext>( \
-      const int axis, \
-      const int num_dims, \
-      const int64_t* x_strides, \
-      const int64_t* y_dims, \
-      const T* x, \
-      const float* mean, \
-      const float* std, \
-      float16* y, \
-      CUDAContext* ctx) { \
-    CUDA_TENSOR_DIMS_CHECK(num_dims); \
-    SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_strides, Y_dims; \
-    const auto nthreads = std::accumulate( \
-        y_dims, y_dims + num_dims, 1, std::multiplies<int64_t>()); \
-    for (int i = 0; i < num_dims; ++i) { \
-      X_strides.data[i] = x_strides[i]; \
-      Y_dims.data[i] = y_dims[i]; \
-    } \
-    _ChannelNormalizeHalf<<< \
-        CUDA_BLOCKS(nthreads), \
-        CUDA_THREADS, \
-        0, \
-        ctx->cuda_stream()>>>( \
-        nthreads, \
-        axis, \
-        num_dims, \
-        X_strides, \
-        Y_dims, \
-        x, \
-        mean, \
-        std, \
-        reinterpret_cast<half*>(y)); \
-  }
+DEFINE_KERNEL_LAUNCHER(int8_t, float16);
 DEFINE_KERNEL_LAUNCHER(int8_t, float);
 DEFINE_KERNEL_LAUNCHER(int8_t, double);
+DEFINE_KERNEL_LAUNCHER(uint8_t, float16);
 DEFINE_KERNEL_LAUNCHER(uint8_t, float);
 DEFINE_KERNEL_LAUNCHER(uint8_t, double);
+DEFINE_KERNEL_LAUNCHER(int, float16);
 DEFINE_KERNEL_LAUNCHER(int, float);
 DEFINE_KERNEL_LAUNCHER(int, double);
+DEFINE_KERNEL_LAUNCHER(int64_t, float16);
 DEFINE_KERNEL_LAUNCHER(int64_t, float);
 DEFINE_KERNEL_LAUNCHER(int64_t, double);
+DEFINE_KERNEL_LAUNCHER(float16, float16);
+DEFINE_KERNEL_LAUNCHER(float16, float);
+DEFINE_KERNEL_LAUNCHER(float16, double);
+DEFINE_KERNEL_LAUNCHER(float, float16);
 DEFINE_KERNEL_LAUNCHER(float, float);
 DEFINE_KERNEL_LAUNCHER(float, double);
+DEFINE_KERNEL_LAUNCHER(double, float16);
 DEFINE_KERNEL_LAUNCHER(double, float);
 DEFINE_KERNEL_LAUNCHER(double, double);
-DEFINE_FP16_KERNEL_LAUNCHER(int8_t);
-DEFINE_FP16_KERNEL_LAUNCHER(uint8_t);
-DEFINE_FP16_KERNEL_LAUNCHER(int);
-DEFINE_FP16_KERNEL_LAUNCHER(int64_t);
-DEFINE_FP16_KERNEL_LAUNCHER(float);
-DEFINE_FP16_KERNEL_LAUNCHER(double);
 #undef DEFINE_KERNEL_LAUNCHER
-#undef DEFINE_FP16_KERNEL_LAUNCHER
 } // namespace kernel
......
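Note: __ldg loads through the read-only data cache and only exists on compute capability 3.5 and newer, which is why the old code repeated the same #if around every read of mean/std. The LDG macro folds that check into one place. The same pattern in a self-contained CUDA kernel:

#if __CUDA_ARCH__ >= 350
#define LDG(ptr, i) __ldg(ptr + i) // read-only cache load on sm_35+
#else
#define LDG(ptr, i) ptr[i] // plain load on older architectures
#endif

// Scale each element by a per-row factor; the factor array is never
// written by this kernel, so it is safe to route through LDG.
__global__ void ScaleRows(int n, int cols, const float* scale, float* data) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    data[i] *= LDG(scale, i / cols);
  }
}
#undef LDG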
@@ -7,51 +7,51 @@ namespace kernel {
 namespace {
-template <typename LogitType, typename TargetType>
+template <typename LogitT, typename TargetT>
 void _NLLLoss(
     const int outer_dim,
     const int inner_dim,
     const int axis_dim,
     const int ignore_index,
-    const LogitType* logit,
-    const TargetType* target,
-    LogitType* loss,
-    LogitType* mask) {
+    const LogitT* logit,
+    const TargetT* target,
+    LogitT* loss,
+    LogitT* mask) {
   std::array<int, 2> idx = {0, 0};
   std::array<int, 2> dims = {outer_dim, inner_dim};
   int count = dims[0] * dims[1], k;
   for (int i = 0; i < count; ++i) {
     const int label = (int)target[i];
     if (label == ignore_index) {
-      loss[i] = mask[i] = LogitType(0);
+      loss[i] = mask[i] = LogitT(0);
     } else {
       k = (idx[0] * axis_dim + label) * inner_dim + idx[1];
-      loss[i] = -logit[k], mask[i] = LogitType(1);
+      loss[i] = -logit[k], mask[i] = LogitT(1);
     }
     math::utils::IncreaseIndexInDims(2, dims.data(), idx.data());
   }
 }
-template <typename LogitType, typename TargetType>
+template <typename LogitT, typename TargetT>
 void _NLLLossGrad(
     const int outer_dim,
     const int inner_dim,
     const int axis_dim,
     const int ignore_index,
-    const LogitType* logit,
-    const TargetType* target,
-    LogitType* dlogit,
-    LogitType* mask) {
+    const LogitT* logit,
+    const TargetT* target,
+    LogitT* dlogit,
+    LogitT* mask) {
   std::array<int, 2> idx = {0, 0};
   std::array<int, 2> dims = {outer_dim, inner_dim};
   int count = dims[0] * dims[1], k;
   for (int i = 0; i < count; ++i) {
     const int label = (int)target[i];
     if (label == ignore_index) {
-      mask[i] = LogitType(0);
+      mask[i] = LogitT(0);
     } else {
       k = (idx[0] * axis_dim + label) * inner_dim + idx[1];
-      dlogit[k] = LogitType(-1), mask[i] = LogitType(1);
+      dlogit[k] = LogitT(-1), mask[i] = LogitT(1);
     }
     math::utils::IncreaseIndexInDims(2, dims.data(), idx.data());
   }
@@ -61,17 +61,17 @@ void _NLLLossGrad(
 /* ------------------- Launcher Separator ------------------- */
-#define DEFINE_KERNEL_LAUNCHER(name, LogitType, TargetType) \
+#define DEFINE_KERNEL_LAUNCHER(name, LogitT, TargetT) \
   template <> \
-  void name<LogitType, TargetType, CPUContext>( \
+  void name<LogitT, TargetT, CPUContext>( \
      const int outer_dim, \
      const int inner_dim, \
      const int axis_dim, \
      const int ignore_index, \
-      const LogitType* logit, \
-      const TargetType* target, \
-      LogitType* loss, \
-      LogitType* mask, \
+      const LogitT* logit, \
+      const TargetT* target, \
+      LogitT* loss, \
+      LogitT* mask, \
      CPUContext* ctx) { \
    _##name( \
        outer_dim, \
......
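Note: both kernels view the logits as a row-major (outer_dim, axis_dim, inner_dim) tensor, so the logit of the target class sits at k = (i * axis_dim + label) * inner_dim + j. A small worked check of that indexing, with shapes chosen arbitrarily:

#include <cassert>

int main() {
  const int axis_dim = 5; // number of classes
  const int inner_dim = 3; // e.g. spatial positions after the class axis
  const int i = 2, j = 1; // outer and inner indices
  const int label = 4; // target class
  // Element (i, label, j) of a row-major 3-D array:
  const int k = (i * axis_dim + label) * inner_dim + j;
  assert(k == 43); // (2 * 5 + 4) * 3 + 1
  return 0;
}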
@@ -9,48 +9,48 @@ namespace kernel {
 namespace {
-template <typename LogitType, typename TargetType>
+template <typename LogitT, typename TargetT>
 __global__ void _NLLLoss(
     const int nthreads,
     const int inner_dim,
     const int axis_dim,
     const int ignore_index,
-    const LogitType* logit,
-    const TargetType* target,
-    LogitType* loss,
-    LogitType* mask) {
+    const LogitT* logit,
+    const TargetT* target,
+    LogitT* loss,
+    LogitT* mask) {
   CUDA_1D_KERNEL_LOOP(yi, nthreads) {
     const int i = yi / inner_dim;
     const int j = yi % inner_dim;
     const int label = target[i * inner_dim + j];
     if (label == ignore_index) {
-      loss[yi] = mask[yi] = LogitType(0);
+      loss[yi] = mask[yi] = LogitT(0);
     } else {
       loss[yi] = -logit[(i * axis_dim + label) * inner_dim + j];
-      mask[yi] = LogitType(1);
+      mask[yi] = LogitT(1);
     }
   }
 }
-template <typename LogitType, typename TargetType>
+template <typename LogitT, typename TargetT>
 __global__ void _NLLLossGrad(
     const int nthreads,
     const int inner_dim,
     const int axis_dim,
     const int ignore_index,
-    const LogitType* logit,
-    const TargetType* target,
-    LogitType* dlogit,
-    LogitType* mask) {
+    const LogitT* logit,
+    const TargetT* target,
+    LogitT* dlogit,
+    LogitT* mask) {
   CUDA_1D_KERNEL_LOOP(yi, nthreads) {
     const int i = yi / inner_dim;
     const int j = yi % inner_dim;
     const int label = target[i * inner_dim + j];
     if (label == ignore_index) {
-      mask[yi] = LogitType(0);
+      mask[yi] = LogitT(0);
     } else {
-      dlogit[(i * axis_dim + label) * inner_dim + j] = LogitType(-1);
-      mask[yi] = LogitType(1);
+      dlogit[(i * axis_dim + label) * inner_dim + j] = LogitT(-1);
+      mask[yi] = LogitT(1);
     }
   }
 }
@@ -59,17 +59,17 @@ __global__ void _NLLLossGrad(
 /* ------------------- Launcher Separator ------------------- */
-#define DEFINE_KERNEL_LAUNCHER(name, LogitType, TargetType) \
+#define DEFINE_KERNEL_LAUNCHER(name, LogitT, TargetT) \
   template <> \
-  void name<LogitType, TargetType, CUDAContext>( \
+  void name<LogitT, TargetT, CUDAContext>( \
      const int outer_dim, \
      const int inner_dim, \
      const int axis_dim, \
      const int ignore_index, \
-      const LogitType* logit, \
-      const TargetType* target, \
-      LogitType* loss, \
-      LogitType* mask, \
+      const LogitT* logit, \
+      const TargetT* target, \
+      LogitT* loss, \
+      LogitT* mask, \
      CUDAContext* ctx) { \
    const auto nthreads = outer_dim * inner_dim; \
    _##name<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
......
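Note: CUDA_1D_KERNEL_LOOP is a grid-stride loop, so a fixed CUDA_BLOCKS/CUDA_THREADS launch covers any nthreads. Dragon's exact macro definition is outside this diff; the common form of such a macro is:

#define CUDA_1D_KERNEL_LOOP(i, n) \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
       i += blockDim.x * gridDim.x)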
@@ -7,19 +7,19 @@ namespace kernel {
 namespace {
-template <typename LogitType, typename TargetType>
+template <typename LogitT, typename TargetT>
 void _SigmoidFocalLoss(
     const int outer_dim,
     const int inner_dim,
     const int axis_dim,
-    const LogitType pos_alpha,
-    const LogitType neg_alpha,
-    const LogitType gamma,
+    const LogitT pos_alpha,
+    const LogitT neg_alpha,
+    const LogitT gamma,
     const int negative_index,
-    const LogitType* logit,
-    const TargetType* target,
-    LogitType* loss,
-    LogitType* mask) {
+    const LogitT* logit,
+    const TargetT* target,
+    LogitT* loss,
+    LogitT* mask) {
   std::array<int, 3> idx = {0, 0, 0};
   std::array<int, 3> dims = {outer_dim, axis_dim, inner_dim};
   const int count = dims[0] * dims[1] * dims[2];
@@ -27,23 +27,21 @@ void _SigmoidFocalLoss(
   for (int i = 0; i < count; ++i) {
     const int t = (int)target[idx[0] * inner_dim + idx[2]];
     // "0" is reserved for target if negative index is zero
-    LogitType c1 = (LogitType)(t == (idx[1] + (negative_index ? 0 : 1)));
-    LogitType c2 =
-        (LogitType)((t >= 0) & (t != (idx[1] + (negative_index ? 0 : 1))));
-    LogitType p = LogitType(1) / (LogitType(1) + std::exp(-logit[i]));
+    LogitT c1 = (LogitT)(t == (idx[1] + (negative_index ? 0 : 1)));
+    LogitT c2 = (LogitT)((t >= 0) & (t != (idx[1] + (negative_index ? 0 : 1))));
+    LogitT p = LogitT(1) / (LogitT(1) + std::exp(-logit[i]));
     // (1 - p)^{gamma} * log(p)
-    LogitType pos_term = std::pow(LogitType(1) - p, gamma) *
-        std::log(std::max(p, (LogitType)FLT_MIN));
+    LogitT pos_term =
+        std::pow(LogitT(1) - p, gamma) * std::log(std::max(p, (LogitT)FLT_MIN));
     // p^{gamma} * log(1 - p)
-    LogitType neg_term = std::pow(p, gamma) *
-        (-logit[i] * (logit[i] >= 0) -
-         std::log(
-             LogitType(1) +
-             std::exp(logit[i] - 2 * logit[i] * (logit[i] >= 0))));
-    loss[i] = LogitType(0);
+    LogitT neg_term = std::pow(p, gamma) *
+        (-logit[i] * (logit[i] >= 0) -
+         std::log(
+             LogitT(1) + std::exp(logit[i] - 2 * logit[i] * (logit[i] >= 0))));
+    loss[i] = LogitT(0);
     loss[i] += -c1 * pos_term * pos_alpha;
     loss[i] += -c2 * neg_term * neg_alpha;
     mask[i] = c1;
@@ -52,19 +50,19 @@ void _SigmoidFocalLoss(
   }
 }
-template <typename LogitType, typename TargetType>
+template <typename LogitT, typename TargetT>
 void _SigmoidFocalLossGrad(
     const int outer_dim,
     const int inner_dim,
     const int axis_dim,
-    const LogitType pos_alpha,
-    const LogitType neg_alpha,
-    const LogitType gamma,
+    const LogitT pos_alpha,
+    const LogitT neg_alpha,
+    const LogitT gamma,
     const int negative_index,
-    const LogitType* logit,
-    const TargetType* target,
-    LogitType* dx,
-    LogitType* mask) {
+    const LogitT* logit,
+    const TargetT* target,
+    LogitT* dx,
+    LogitT* mask) {
   std::array<int, 3> idx = {0, 0, 0};
   std::array<int, 3> dims = {outer_dim, axis_dim, inner_dim};
   const int count = dims[0] * dims[1] * dims[2];
@@ -72,26 +70,24 @@ void _SigmoidFocalLossGrad(
   for (int i = 0; i < count; ++i) {
     const int t = (int)target[idx[0] * inner_dim + idx[2]];
     // "0" is reserved for target if negative index is zero
-    LogitType c1 = (LogitType)(t == (idx[1] + (negative_index ? 0 : 1)));
-    LogitType c2 =
-        (LogitType)((t >= 0) & (t != (idx[1] + (negative_index ? 0 : 1))));
-    LogitType p = LogitType(1) / (LogitType(1) + std::exp(-logit[i]));
+    LogitT c1 = (LogitT)(t == (idx[1] + (negative_index ? 0 : 1)));
+    LogitT c2 = (LogitT)((t >= 0) & (t != (idx[1] + (negative_index ? 0 : 1))));
+    LogitT p = LogitT(1) / (LogitT(1) + std::exp(-logit[i]));
     // (1 - p)^{gamma} * (1 - p - gamma * p * log(p))
-    LogitType pos_term = std::pow(LogitType(1) - p, gamma) *
-        (LogitType(1) - p -
-         p * gamma * std::log(std::max(p, (LogitType)FLT_MIN)));
+    LogitT pos_term = std::pow(LogitT(1) - p, gamma) *
+        (LogitT(1) - p - p * gamma * std::log(std::max(p, (LogitT)FLT_MIN)));
     // p^{gamma} * (gamma * (1 - p) * log(1-p) - p)
-    LogitType neg_term = std::pow(p, gamma) *
-        ((-logit[i] * (logit[i] >= 0) -
-          std::log(
-              LogitType(1) +
-              std::exp(logit[i] - LogitType(2) * logit[i] * (logit[i] >= 0)))) *
-             (1 - p) * gamma -
-         p);
-    dx[i] = LogitType(0);
+    LogitT neg_term = std::pow(p, gamma) *
+        ((-logit[i] * (logit[i] >= 0) -
+          std::log(
+              LogitT(1) +
+              std::exp(logit[i] - LogitT(2) * logit[i] * (logit[i] >= 0)))) *
+             (1 - p) * gamma -
+         p);
+    dx[i] = LogitT(0);
     dx[i] += -c1 * pos_term * pos_alpha;
     dx[i] += -c2 * neg_term * neg_alpha;
     mask[i] = c1;
@@ -104,9 +100,9 @@ void _SigmoidFocalLossGrad(
 /* ------------------- Launcher Separator ------------------- */
-#define DEFINE_KERNEL_LAUNCHER(name, LogitType, TargetType) \
+#define DEFINE_KERNEL_LAUNCHER(name, LogitT, TargetT) \
   template <> \
-  void name<LogitType, TargetType, CPUContext>( \
+  void name<LogitT, TargetT, CPUContext>( \
      const int outer_dim, \
      const int inner_dim, \
      const int axis_dim, \
@@ -114,18 +110,18 @@ void _SigmoidFocalLossGrad(
      const float neg_alpha, \
      const float gamma, \
      const int negative_index, \
-      const LogitType* logit, \
-      const TargetType* target, \
-      LogitType* loss, \
-      LogitType* mask, \
+      const LogitT* logit, \
+      const TargetT* target, \
+      LogitT* loss, \
+      LogitT* mask, \
      CPUContext* ctx) { \
    _##name( \
        outer_dim, \
        inner_dim, \
        axis_dim, \
-        (LogitType)pos_alpha, \
-        (LogitType)neg_alpha, \
-        (LogitType)gamma, \
+        (LogitT)pos_alpha, \
+        (LogitT)neg_alpha, \
+        (LogitT)gamma, \
        negative_index, \
        logit, \
        target, \
......
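Note: the -logit * (logit >= 0) - log(1 + exp(logit - 2 * logit * (logit >= 0))) expression used for the negative term is the numerically stable evaluation of log(1 - p) for p = sigmoid(x). Since x [x >= 0] = max(x, 0) and x - 2x [x >= 0] = -|x|, it is equivalent to

\log(1 - p) = -\log\bigl(1 + e^{x}\bigr) = -\max(x, 0) - \log\bigl(1 + e^{-\lvert x \rvert}\bigr),

so the exponent is never positive and the exp cannot overflow for large logits.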
@@ -9,19 +9,19 @@ namespace kernel {
 namespace {
-template <typename LogitType, typename TargetType>
+template <typename LogitT, typename TargetT>
 __global__ void _SigmoidFocalLoss(
     const int nthreads,
     const int inner_dim,
     const int axis_dim,
-    const LogitType pos_alpha,
-    const LogitType neg_alpha,
-    const LogitType gamma,
+    const LogitT pos_alpha,
+    const LogitT neg_alpha,
+    const LogitT gamma,
     const int negative_index,
-    const LogitType* logit,
-    const TargetType* target,
-    LogitType* loss,
-    LogitType* mask) {
+    const LogitT* logit,
+    const TargetT* target,
+    LogitT* loss,
+    LogitT* mask) {
   CUDA_1D_KERNEL_LOOP(yi, nthreads) {
     const int j = yi % inner_dim;
     const int k = (yi / inner_dim) % axis_dim;
@@ -29,40 +29,39 @@ __global__ void _SigmoidFocalLoss(
     const int t = target[i * inner_dim + j];
     // "0" is reserved for target if negative index is zero
-    LogitType c1 = (LogitType)(t == (k + (negative_index ? 0 : 1)));
-    LogitType c2 =
-        (LogitType)((t >= 0) & (t != (k + (negative_index ? 0 : 1))));
-    LogitType p = LogitType(1) / (LogitType(1) + exp(-logit[yi]));
+    LogitT c1 = (LogitT)(t == (k + (negative_index ? 0 : 1)));
+    LogitT c2 = (LogitT)((t >= 0) & (t != (k + (negative_index ? 0 : 1))));
+    LogitT p = LogitT(1) / (LogitT(1) + exp(-logit[yi]));
     // (1 - p)^{gamma} * log(p)
-    LogitType pos_term = pow(LogitType(1) - p, gamma) * log(max(p, FLT_MIN));
+    LogitT pos_term = pow(LogitT(1) - p, gamma) * log(max(p, FLT_MIN));
     // p^{gamma} * log(1 - p)
-    LogitType neg_term = pow(p, gamma) *
-        (-logit[yi] * (logit[yi] >= 0) -
-         log(LogitType(1) +
-             exp(logit[yi] - LogitType(2) * logit[yi] * (logit[yi] >= 0))));
-    loss[yi] = LogitType(0);
+    LogitT neg_term = pow(p, gamma) *
+        (-logit[yi] * (logit[yi] >= 0) -
+         log(LogitT(1) +
+             exp(logit[yi] - LogitT(2) * logit[yi] * (logit[yi] >= 0))));
+    loss[yi] = LogitT(0);
     loss[yi] += -c1 * pos_term * pos_alpha;
     loss[yi] += -c2 * neg_term * neg_alpha;
     mask[yi] = c1;
   }
 }
-template <typename LogitType, typename TargetType>
+template <typename LogitT, typename TargetT>
 __global__ void _SigmoidFocalLossGrad(
     const int nthreads,
     const int inner_dim,
     const int axis_dim,
-    const LogitType pos_alpha,
-    const LogitType neg_alpha,
-    const LogitType gamma,
+    const LogitT pos_alpha,
+    const LogitT neg_alpha,
+    const LogitT gamma,
     const int negative_index,
-    const LogitType* logit,
-    const TargetType* target,
-    LogitType* dx,
-    LogitType* mask) {
+    const LogitT* logit,
+    const TargetT* target,
+    LogitT* dx,
+    LogitT* mask) {
   CUDA_1D_KERNEL_LOOP(xi, nthreads) {
     const int j = xi % inner_dim;
     const int k = (xi / inner_dim) % axis_dim;
@@ -70,24 +69,23 @@ __global__ void _SigmoidFocalLossGrad(
     const int t = target[i * inner_dim + j];
     // "0" is reserved for target if neg index is zero
-    LogitType c1 = (LogitType)(t == (k + (negative_index ? 0 : 1)));
-    LogitType c2 =
-        (LogitType)((t >= 0) & (t != (k + (negative_index ? 0 : 1))));
-    LogitType p = LogitType(1) / (LogitType(1) + exp(-logit[xi]));
+    LogitT c1 = (LogitT)(t == (k + (negative_index ? 0 : 1)));
+    LogitT c2 = (LogitT)((t >= 0) & (t != (k + (negative_index ? 0 : 1))));
+    LogitT p = LogitT(1) / (LogitT(1) + exp(-logit[xi]));
     // (1 - p)^{gamma} * (1 - p - gamma * p * log(p))
-    LogitType pos_term = pow(LogitType(1) - p, gamma) *
-        (LogitType(1) - p - p * gamma * log(max(p, FLT_MIN)));
+    LogitT pos_term = pow(LogitT(1) - p, gamma) *
+        (LogitT(1) - p - p * gamma * log(max(p, FLT_MIN)));
     // p^{gamma} * (gamma * (1 - p) * log(1-p) - p)
-    LogitType neg_term = pow(p, gamma) *
-        ((-logit[xi] * (logit[xi] >= 0) -
-          log(LogitType(1) +
-              exp(logit[xi] - LogitType(2) * logit[xi] * (logit[xi] >= 0)))) *
-             (LogitType(1) - p) * gamma -
-         p);
-    dx[xi] = LogitType(0);
+    LogitT neg_term = pow(p, gamma) *
+        ((-logit[xi] * (logit[xi] >= 0) -
+          log(LogitT(1) +
+              exp(logit[xi] - LogitT(2) * logit[xi] * (logit[xi] >= 0)))) *
+             (LogitT(1) - p) * gamma -
+         p);
+    dx[xi] = LogitT(0);
     dx[xi] += -c1 * pos_term * pos_alpha;
     dx[xi] += -c2 * neg_term * neg_alpha;
     mask[xi] = c1;
@@ -98,9 +96,9 @@ __global__ void _SigmoidFocalLossGrad(
 /* ------------------- Launcher Separator ------------------- */
-#define DEFINE_KERNEL_LAUNCHER(name, LogitType, TargetType) \
+#define DEFINE_KERNEL_LAUNCHER(name, LogitT, TargetT) \
   template <> \
-  void name<LogitType, TargetType, CUDAContext>( \
+  void name<LogitT, TargetT, CUDAContext>( \
      const int outer_dim, \
      const int inner_dim, \
      const int axis_dim, \
@@ -108,19 +106,19 @@ __global__ void _SigmoidFocalLossGrad(
      const float neg_alpha, \
      const float gamma, \
      const int negative_index, \
-      const LogitType* logit, \
-      const TargetType* target, \
-      LogitType* loss, \
-      LogitType* mask, \
+      const LogitT* logit, \
+      const TargetT* target, \
+      LogitT* loss, \
+      LogitT* mask, \
      CUDAContext* ctx) { \
    const auto nthreads = outer_dim * axis_dim * inner_dim; \
    _##name<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
        nthreads, \
        inner_dim, \
        axis_dim, \
-        (LogitType)pos_alpha, \
-        (LogitType)neg_alpha, \
-        (LogitType)gamma, \
+        (LogitT)pos_alpha, \
+        (LogitT)neg_alpha, \
+        (LogitT)gamma, \
        negative_index, \
        logit, \
        target, \
......
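For reference, both the CPU and CUDA kernels evaluate the sigmoid focal loss (Lin et al., 2017), here with separate positive and negative weights taken from the code:

\mathrm{FL}(x, t) = -\,c_1\,\alpha_{+}\,(1 - p)^{\gamma}\log(p) \;-\; c_2\,\alpha_{-}\,p^{\gamma}\log(1 - p), \qquad p = \sigma(x),

where c_1 indicates that the current class matches the target and c_2 any other valid (non-negative) target.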
@@ -7,58 +7,58 @@ namespace kernel {
 namespace {
-template <typename LogitType, typename TargetType>
+template <typename LogitT, typename TargetT>
 void _SparseSoftmaxCrossEntropy(
     const int outer_dim,
     const int inner_dim,
     const int axis_dim,
     const int ignore_index,
-    const LogitType* prob,
-    const TargetType* target,
-    LogitType* loss,
-    LogitType* mask) {
+    const LogitT* prob,
+    const TargetT* target,
+    LogitT* loss,
+    LogitT* mask) {
   std::array<int, 2> idx = {0, 0};
   std::array<int, 2> dims = {outer_dim, inner_dim};
   int count = dims[0] * dims[1], k;
   for (int i = 0; i < count; ++i) {
     const int label = (int)target[i];
     if (label == ignore_index) {
-      loss[i] = mask[i] = LogitType(0);
+      loss[i] = mask[i] = LogitT(0);
     } else {
       k = (idx[0] * axis_dim + label) * inner_dim + idx[1];
-      loss[i] = -std::log(std::max(prob[k], LogitType(FLT_MIN)));
-      mask[i] = LogitType(1);
+      loss[i] = -std::log(std::max(prob[k], LogitT(FLT_MIN)));
+      mask[i] = LogitT(1);
     }
     math::utils::IncreaseIndexInDims(2, dims.data(), idx.data());
   }
 }
-template <typename LogitType, typename TargetType>
+template <typename LogitT, typename TargetT>
 void _SparseSoftmaxCrossEntropyGrad(
     const int outer_dim,
     const int inner_dim,
     const int axis_dim,
     const int ignore_index,
-    const LogitType* prob,
-    const TargetType* target,
-    LogitType* dx,
-    LogitType* mask) {
+    const LogitT* prob,
+    const TargetT* target,
+    LogitT* dx,
+    LogitT* mask) {
   std::array<int, 2> idx = {0, 0};
   std::array<int, 2> dims = {outer_dim, inner_dim};
   int count = dims[0] * dims[1], k;
   for (int i = 0; i < count; ++i) {
     const int label = (int)target[i];
     if (label == ignore_index) {
-      LogitType* offset_dx = dx + idx[0] * axis_dim * inner_dim + idx[1];
+      LogitT* offset_dx = dx + idx[0] * axis_dim * inner_dim + idx[1];
       for (int j = 0; j < axis_dim; ++j) {
-        (*offset_dx) = LogitType(0);
+        (*offset_dx) = LogitT(0);
         offset_dx += inner_dim;
       }
-      mask[i] = LogitType(0);
+      mask[i] = LogitT(0);
     } else {
       k = (idx[0] * axis_dim + label) * inner_dim + idx[1];
-      dx[k] -= LogitType(1);
-      mask[i] = LogitType(1);
+      dx[k] -= LogitT(1);
+      mask[i] = LogitT(1);
     }
     math::utils::IncreaseIndexInDims(2, dims.data(), idx.data());
   }
@@ -68,17 +68,17 @@ void _SparseSoftmaxCrossEntropyGrad(
 /* ------------------- Launcher Separator ------------------- */
-#define DEFINE_KERNEL_LAUNCHER(name, LogitType, TargetType) \
+#define DEFINE_KERNEL_LAUNCHER(name, LogitT, TargetT) \
   template <> \
-  void name<LogitType, TargetType, CPUContext>( \
+  void name<LogitT, TargetT, CPUContext>( \
      const int outer_dim, \
      const int inner_dim, \
      const int axis_dim, \
      const int ignore_index, \
-      const LogitType* prob, \
-      const TargetType* target, \
-      LogitType* loss, \
-      LogitType* mask, \
+      const LogitT* prob, \
+      const TargetT* target, \
+      LogitT* loss, \
+      LogitT* mask, \
      CPUContext* ctx) { \
    _##name( \
        outer_dim, \
......
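Note: the grad kernel only makes sense if dx has been pre-filled with the softmax probabilities (done by the caller, outside this hunk), because it relies on the standard softmax cross-entropy identity

\frac{\partial}{\partial x_c}\bigl(-\log p_{t}\bigr) = p_c - [c = t], \qquad p = \mathrm{softmax}(x),

so subtracting 1 at the target class, and zeroing the class column for ignored labels, is the entire gradient computation.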
@@ -9,54 +9,54 @@ namespace kernel {
 namespace {
-template <typename LogitType, typename TargetType>
+template <typename LogitT, typename TargetT>
 __global__ void _SparseSoftmaxCrossEntropy(
     const int nthreads,
     const int inner_dim,
     const int axis_dim,
     const int ignore_index,
-    const LogitType* prob,
-    const TargetType* target,
-    LogitType* loss,
-    LogitType* mask) {
+    const LogitT* prob,
+    const TargetT* target,
+    LogitT* loss,
+    LogitT* mask) {
   CUDA_1D_KERNEL_LOOP(yi, nthreads) {
     const int i = yi / inner_dim;
     const int j = yi % inner_dim;
     const int label = target[i * inner_dim + j];
     if (label == ignore_index) {
-      loss[yi] = mask[yi] = LogitType(0);
+      loss[yi] = mask[yi] = LogitT(0);
     } else {
-      loss[yi] = -log(max(
-          prob[(i * axis_dim + label) * inner_dim + j], LogitType(FLT_MIN)));
-      mask[yi] = LogitType(1);
+      loss[yi] = -log(
+          max(prob[(i * axis_dim + label) * inner_dim + j], LogitT(FLT_MIN)));
+      mask[yi] = LogitT(1);
     }
   }
 }
-template <typename LogitType, typename TargetType>
+template <typename LogitT, typename TargetT>
 __global__ void _SparseSoftmaxCrossEntropyGrad(
     const int nthreads,
     const int inner_dim,
     const int axis_dim,
     const int ignore_index,
-    const LogitType* prob,
-    const TargetType* target,
-    LogitType* dx,
-    LogitType* mask) {
+    const LogitT* prob,
+    const TargetT* target,
+    LogitT* dx,
+    LogitT* mask) {
   CUDA_1D_KERNEL_LOOP(yi, nthreads) {
     const int i = yi / inner_dim;
     const int j = yi % inner_dim;
     const int label = target[i * inner_dim + j];
     if (label == ignore_index) {
-      LogitType* offset_dx = dx + i * axis_dim * inner_dim + j;
+      LogitT* offset_dx = dx + i * axis_dim * inner_dim + j;
       for (int k = 0; k < axis_dim; ++k) {
-        (*offset_dx) = LogitType(0);
+        (*offset_dx) = LogitT(0);
         offset_dx += inner_dim;
       }
-      mask[yi] = LogitType(0);
+      mask[yi] = LogitT(0);
     } else {
-      dx[(i * axis_dim + label) * inner_dim + j] -= LogitType(1);
-      mask[yi] = LogitType(1);
+      dx[(i * axis_dim + label) * inner_dim + j] -= LogitT(1);
+      mask[yi] = LogitT(1);
     }
   }
 }
@@ -65,17 +65,17 @@ __global__ void _SparseSoftmaxCrossEntropyGrad(
 /* ------------------- Launcher Separator ------------------- */
-#define DEFINE_KERNEL_LAUNCHER(name, LogitType, TargetType) \
+#define DEFINE_KERNEL_LAUNCHER(name, LogitT, TargetT) \
   template <> \
-  void name<LogitType, TargetType, CUDAContext>( \
+  void name<LogitT, TargetT, CUDAContext>( \
      const int outer_dim, \
      const int inner_dim, \
      const int axis_dim, \
      const int ignore_index, \
-      const LogitType* prob, \
-      const TargetType* target, \
-      LogitType* loss, \
-      LogitType* mask, \
+      const LogitT* prob, \
+      const TargetT* target, \
+      LogitT* loss, \
+      LogitT* mask, \
      CUDAContext* ctx) { \
    const auto nthreads = outer_dim * inner_dim; \
    _##name<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
......
 #ifdef USE_CUDA
 #include "dragon/core/context_cuda.h"
+#include "dragon/utils/conversions.h"
 #include "dragon/utils/op_kernels.h"
 namespace dragon {
@@ -9,125 +10,34 @@ namespace kernel {
 namespace {
-template <typename T>
+template <typename T, typename AccT>
 __global__ void
-_Clip(const int nthreads, const T low, const T high, const T* x, T* y) {
+_Clip(const int nthreads, const AccT low, const AccT high, const T* x, T* y) {
   CUDA_1D_KERNEL_LOOP(i, nthreads) {
-    y[i] = max(low, min(x[i], high));
+    y[i] = convert::To<T>(max(low, min(convert::To<AccT>(x[i]), high)));
   }
 }
-template <>
-__global__ void _Clip<half>(
-    const int nthreads,
-    const half low,
-    const half high,
-    const half* x,
-    half* y) {
-#if __CUDA_ARCH__ >= 530
-  CUDA_1D_KERNEL_LOOP(i, nthreads) {
-    y[i] = __hlt(__ldg(x + i), high)
-        ? (__hgt(__ldg(x + i), low) ? __ldg(x + i) : low)
-        : high;
-  }
-#else
-  const float kLow = __half2float(low);
-  const float kHigh = __half2float(high);
-  CUDA_1D_KERNEL_LOOP(i, nthreads) {
-    y[i] = __float2half(max(kLow, min(__half2float(x[i]), kHigh)));
-  }
-#endif
-}
-template <typename T>
+template <typename T, typename AccT>
 __global__ void _ClipGrad(
     const int nthreads,
-    const T low,
-    const T high,
+    const AccT low,
+    const AccT high,
     const T* dy,
     const T* x,
     T* dx) {
+  const T kZero = convert::To<T>(0.f);
   CUDA_1D_KERNEL_LOOP(i, nthreads) {
-#if __CUDA_ARCH__ >= 350
-    dx[i] = __ldg(x + i) < low || __ldg(x + i) > high ? T(0) : dy[i];
-#else
-    dx[i] = x[i] < low || x[i] > high ? T(0) : dy[i];
-#endif
+    const AccT val = convert::To<AccT>(x[i]);
+    dx[i] = val < low || val > high ? kZero : dy[i];
   }
 }
-template <>
-__global__ void _ClipGrad<half>(
-    const int nthreads,
-    const half low,
-    const half high,
-    const half* dy,
-    const half* x,
-    half* dx) {
-  const half kZero = __float2half(0.f);
-#if __CUDA_ARCH__ >= 530
-  CUDA_1D_KERNEL_LOOP(i, nthreads) {
-    dx[i] =
-        (__hlt(__ldg(x + i), low) || __hgt(__ldg(x + i), high)) ? kZero : dy[i];
-  }
-#elif __CUDA_ARCH__ >= 350
-  const float kLow = __half2float(low);
-  const float kHigh = __half2float(high);
-  CUDA_1D_KERNEL_LOOP(i, nthreads) {
-    dx[i] = (__half2float(__ldg(x + i)) < kLow ||
-             __half2float(__ldg(x + i)) > kHigh)
-        ? kZero
-        : dy[i];
-  }
-#else
-  const float kLow = __half2float(low);
-  const float kHigh = __half2float(high);
-  CUDA_1D_KERNEL_LOOP(i, nthreads) {
-    dx[i] = (__half2float(x[i]) < kLow || __half2float(x[i]) > kHigh) ? kZero
-                                                                      : dy[i];
-  }
-#endif
-}
 } // namespace
 /* ------------------- Launcher Separator ------------------- */
-template <>
-void Clip<float16, CUDAContext>(
-    const int count,
-    const float low,
-    const float high,
-    const float16* x,
-    float16* y,
-    CUDAContext* ctx) {
-  _Clip<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
-      count,
-      convert::To<half>(low),
-      convert::To<half>(high),
-      reinterpret_cast<const half*>(x),
-      reinterpret_cast<half*>(y));
-}
-template <>
-void ClipGrad<float16, CUDAContext>(
-    const int count,
-    const float low,
-    const float high,
-    const float16* dy,
-    const float16* x,
-    float16* dx,
-    CUDAContext* ctx) {
-  _ClipGrad<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
-      count,
-      convert::To<half>(low),
-      convert::To<half>(high),
-      reinterpret_cast<const half*>(dy),
-      reinterpret_cast<const half*>(x),
-      reinterpret_cast<half*>(dx));
-} // ClipGrad
-#define DEFINE_KERNEL_LAUNCHER(T) \
+#define DEFINE_KERNEL_LAUNCHER(T, AccT) \
   template <> \
   void Clip<T, CUDAContext>( \
      const int count, \
@@ -136,11 +46,12 @@ void ClipGrad<float16, CUDAContext>(
      const T* x, \
      T* y, \
      CUDAContext* ctx) { \
-    _Clip<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
-        count, convert::To<T>(low), convert::To<T>(high), x, y); \
+    _Clip<T, AccT> \
+        <<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
+            count, low, high, x, y); \
  }
-#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
+#define DEFINE_GRAD_KERNEL_LAUNCHER(T, AccT) \
   template <> \
   void ClipGrad<T, CUDAContext>( \
      const int count, \
@@ -150,18 +61,21 @@ void ClipGrad<float16, CUDAContext>(
      const T* x, \
      T* dx, \
      CUDAContext* ctx) { \
-    _ClipGrad<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
-        count, convert::To<T>(low), convert::To<T>(high), dy, x, dx); \
+    _ClipGrad<T, AccT> \
+        <<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
+            count, low, high, dy, x, dx); \
  }
-DEFINE_KERNEL_LAUNCHER(int8_t);
-DEFINE_KERNEL_LAUNCHER(uint8_t);
-DEFINE_KERNEL_LAUNCHER(int);
-DEFINE_KERNEL_LAUNCHER(int64_t);
-DEFINE_KERNEL_LAUNCHER(float);
-DEFINE_KERNEL_LAUNCHER(double);
-DEFINE_GRAD_KERNEL_LAUNCHER(float);
-DEFINE_GRAD_KERNEL_LAUNCHER(double);
+DEFINE_KERNEL_LAUNCHER(int8_t, int8_t);
+DEFINE_KERNEL_LAUNCHER(uint8_t, uint8_t);
+DEFINE_KERNEL_LAUNCHER(int, int);
+DEFINE_KERNEL_LAUNCHER(int64_t, int64_t);
+DEFINE_KERNEL_LAUNCHER(float16, float);
+DEFINE_KERNEL_LAUNCHER(float, float);
+DEFINE_KERNEL_LAUNCHER(double, double);
+DEFINE_GRAD_KERNEL_LAUNCHER(float16, float);
+DEFINE_GRAD_KERNEL_LAUNCHER(float, float);
+DEFINE_GRAD_KERNEL_LAUNCHER(double, double);
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
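Note: the launchers now pass low/high in an accumulator type AccT, so one kernel covers float16 (compared in float) as well as the native types (where AccT == T and the conversions compile away). A host-side sketch of the same dispatch idea, with To again standing in for convert::To:

#include <algorithm>

// Hypothetical stand-in for dragon's convert::To<DstT>(src).
template <typename DstT, typename SrcT>
DstT To(SrcT v) {
  return static_cast<DstT>(v);
}

// Clip in AccT: for narrow types AccT can be float so the comparison
// happens at full precision; for float/double, AccT == T.
template <typename T, typename AccT>
void Clip(int n, AccT low, AccT high, const T* x, T* y) {
  for (int i = 0; i < n; ++i) {
    y[i] = To<T>(std::max(low, std::min(To<AccT>(x[i]), high)));
  }
}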
@@ -20,15 +20,15 @@ void _RowwiseMoments(
 #pragma omp parallel for num_threads(OMP_THREADS(cols))
 #endif
   for (int i = 0; i < cols; ++i) {
-    T x_val;
-    AccT m_val = AccT(0), v_val = AccT(0), mu;
+    AccT x_val, m_val = AccT(0), v_val = AccT(0);
     for (int j = 0; j < rows; ++j) {
-      x_val = x[j * cols + i];
+      x_val = convert::To<AccT>(x[j * cols + i]);
       m_val += x_val;
       v_val += x_val * x_val;
     }
-    mean[i] = mu = m_val * scale;
-    var[i] = v_val * scale - mu * mu;
+    m_val *= scale;
+    mean[i] = m_val;
+    var[i] = v_val * scale - m_val * m_val;
   }
 }
@@ -44,15 +44,15 @@ void _ColwiseMoments(
 #pragma omp parallel for num_threads(OMP_THREADS(rows))
 #endif
   for (int i = 0; i < rows; ++i) {
-    T x_val;
-    AccT m_val = AccT(0), v_val = AccT(0), mu;
+    AccT x_val, m_val = AccT(0), v_val = AccT(0);
     for (int j = 0; j < cols; ++j) {
-      x_val = x[i * cols + j];
+      x_val = convert::To<AccT>(x[i * cols + j]);
       m_val += x_val;
       v_val += x_val * x_val;
     }
-    mean[i] = mu = m_val * scale;
-    var[i] = v_val * scale - mu * mu;
+    m_val *= scale;
+    mean[i] = m_val;
+    var[i] = v_val * scale - m_val * m_val;
   }
 }
@@ -71,8 +71,7 @@ void _GenericMoments(
 #pragma omp parallel for num_threads(OMP_THREADS(rows))
 #endif
   for (int i = 0; i < rows; ++i) {
-    T x_val;
-    AccT m_val = AccT(0), v_val = AccT(0), mu;
+    AccT x_val, m_val = AccT(0), v_val = AccT(0);
     int xi, c, r;
     for (int j = 0; j < cols; ++j) {
       xi = 0;
@@ -81,12 +80,13 @@ void _GenericMoments(
        FIXED_DIVISOR_DIV_MOD(x_dims[d], c, &c, &r);
        xi += r * x_strides[d];
      }
-      x_val = x[xi];
+      x_val = convert::To<AccT>(x[xi]);
       m_val += x_val;
       v_val += x_val * x_val;
     }
-    mean[i] = mu = m_val * scale;
-    var[i] = v_val * scale - mu * mu;
+    m_val *= scale;
+    mean[i] = m_val;
+    var[i] = v_val * scale - m_val * m_val;
   }
 }
@@ -148,19 +148,6 @@ void _Moments(
 /* ------------------- Launcher Separator ------------------- */
-template <>
-void Moments<float16, float, CPUContext>(
-    const int num_dims,
-    const int* dims,
-    const int num_axes,
-    const int* axes,
-    const float16* x,
-    float* mean,
-    float* var,
-    CPUContext* ctx) {
-  CPU_FP16_NOT_SUPPORTED;
-}
 #define DEFINE_KERNEL_LAUNCHER(T, AccT) \
   template <> \
   void Moments<T, AccT, CPUContext>( \
@@ -178,7 +165,8 @@ void Moments<float16, float, CPUContext>(
 DEFINE_KERNEL_LAUNCHER(int8_t, float);
 DEFINE_KERNEL_LAUNCHER(uint8_t, float);
 DEFINE_KERNEL_LAUNCHER(int, float);
-DEFINE_KERNEL_LAUNCHER(int64_t, float);
+DEFINE_KERNEL_LAUNCHER(int64_t, double);
+DEFINE_KERNEL_LAUNCHER(float16, float);
 DEFINE_KERNEL_LAUNCHER(float, float);
 DEFINE_KERNEL_LAUNCHER(double, double);
 #undef DEFINE__KERNEL_LAUNCHER
......
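Note: all three moment kernels use the one-pass identity

\mu = s \sum_i x_i, \qquad \mathrm{var} = s \sum_i x_i^{2} - \mu^{2}, \qquad s = \texttt{scale} = 1/n,

and accumulating in AccT rather than T (float for the narrow inputs, and now double for int64_t) is what keeps the sum of squares from overflowing or losing precision.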
@@ -201,7 +201,7 @@ void _Moments(
 DEFINE_KERNEL_LAUNCHER(int8_t, int8_t, float);
 DEFINE_KERNEL_LAUNCHER(uint8_t, uint8_t, float);
 DEFINE_KERNEL_LAUNCHER(int, int, float);
-DEFINE_KERNEL_LAUNCHER(int64_t, int64_t, float);
+DEFINE_KERNEL_LAUNCHER(int64_t, int64_t, double);
 DEFINE_KERNEL_LAUNCHER(float16, half, float);
 DEFINE_KERNEL_LAUNCHER(float, float, float);
 DEFINE_KERNEL_LAUNCHER(double, double, double);
......
@@ -70,7 +70,7 @@ void _L1NormalizeGrad(
     auto X = ConstEigenStridedVectorMap<T>(
         x + offset, 1, reduce_dim, EigenInnerStride(inner_dim));
     auto norm = std::max(X.template lpNorm<1>() / normalizer, epsilon);
-    auto norm2 = std::pow(norm, 2);
+    auto norm2 = std::pow(norm, T(2));
     EigenStridedVectorMap<T>(
         dx + offset, 1, reduce_dim, EigenInnerStride(inner_dim)) =
         (dY / norm) -
@@ -98,7 +98,7 @@ void _L2NormalizeGrad(
     auto X = ConstEigenStridedVectorMap<T>(
         x + offset, 1, reduce_dim, EigenInnerStride(inner_dim));
     auto norm = std::max(std::sqrt(X.squaredNorm() / normalizer), epsilon);
-    auto norm3 = std::pow(norm, 3);
+    auto norm3 = std::pow(norm, T(3));
     EigenStridedVectorMap<T>(
         dx + offset, 1, reduce_dim, EigenInnerStride(inner_dim)) =
         (dY / norm) - ((X / norm3) * dY.dot(X) / normalizer);
......
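Note: writing std::pow(norm, T(2)) rather than std::pow(norm, 2) only pins the exponent's type, so pow resolves in T instead of promoting through double; the formula is unchanged. For the L2 case the Eigen expression computes the gradient of normalization by \nu:

\frac{\partial L}{\partial x} = \frac{1}{\nu}\,\frac{\partial L}{\partial y} - \frac{x}{\nu^{3}} \cdot \frac{1}{n} \Bigl\langle \frac{\partial L}{\partial y},\, x \Bigr\rangle, \qquad \nu = \max\Bigl(\sqrt{\lVert x \rVert_2^{2} / n},\; \epsilon\Bigr),

with n the normalizer in the code.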
@@ -93,7 +93,7 @@ __global__ void _L1NormalizeGrad(
   val2 = BlockReduce<AccT>(storage).Sum(val2);
   if (threadIdx.x == 0) {
     norm = max(val1 / normalizer, epsilon);
-    norm2 = pow(norm, 2);
+    norm2 = pow(norm, AccT(2));
     sum = val2 / normalizer;
   }
   __syncthreads();
@@ -130,7 +130,7 @@ __global__ void _L2NormalizeGrad(
   val2 = BlockReduce<AccT>(storage).Sum(val2);
   if (threadIdx.x == 0) {
     norm = max(sqrt(val1 / normalizer), epsilon);
-    norm3 = pow(norm, 3);
+    norm3 = pow(norm, AccT(3));
     sum = val2 / normalizer;
   }
   __syncthreads();
......
 #ifdef USE_CUDA
 #include "dragon/core/context_cuda.h"
+#include "dragon/utils/conversions.h"
 #include "dragon/utils/op_kernels.h"
 namespace dragon {
......
 #ifdef USE_CUDA
 #include "dragon/core/context_cuda.h"
+#include "dragon/utils/math_functions.h"
 #include "dragon/utils/op_kernels.h"
 namespace dragon {
......
@@ -5,7 +5,7 @@
 namespace dragon {
 template <class Context>
-template <typename Tx, typename Ty>
+template <typename InputT, typename OutputT>
 void ChannelNormalizeOp<Context>::DoRunWithTypeAndCast() {
   auto &X = Input(0), *Y = Output(0);
   CANONICALIZE_AXIS_WITH_TENSOR(X);
@@ -35,10 +35,10 @@ void ChannelNormalizeOp<Context>::DoRunWithTypeAndCast() {
       num_dims,
       X_strides.data(),
       Y_dims.data(),
-      X.template data<Tx, Context>(),
+      X.template data<InputT, Context>(),
       X_mean_.template data<float, Context>(),
       X_std_.template data<float, Context>(),
-      Y->Reshape(Y_dims)->template mutable_data<Ty, Context>(),
+      Y->Reshape(Y_dims)->template mutable_data<OutputT, Context>(),
       ctx());
 }
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
namespace dragon { namespace dragon {
template <class Context> template <class Context>
template <typename LogitType, typename TargetType> template <typename LogitT, typename TargetT>
void NLLLossOp<Context>::DoRunWithType() { void NLLLossOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0); auto &X = Input(0), *Y = Output(0);
CANONICALIZE_AXIS_WITH_TENSOR(X); CANONICALIZE_AXIS_WITH_TENSOR(X);
...@@ -19,19 +19,19 @@ void NLLLossOp<Context>::DoRunWithType() { ...@@ -19,19 +19,19 @@ void NLLLossOp<Context>::DoRunWithType() {
<< "\nNumber of preds must match the number of targets."; << "\nNumber of preds must match the number of targets.";
auto scratches = ctx()->workspace()->template data<Context>({ auto scratches = ctx()->workspace()->template data<Context>({
(size_t)num_preds * sizeof(LogitType), // loss (size_t)num_preds * sizeof(LogitT), // loss
(size_t)num_preds * sizeof(LogitType) + sizeof(LogitType), // mask (size_t)num_preds * sizeof(LogitT) + sizeof(LogitT), // mask
}); });
auto* loss = static_cast<LogitType*>(scratches[0]); auto* loss = static_cast<LogitT*>(scratches[0]);
auto* mask = static_cast<LogitType*>(scratches[1]); auto* mask = static_cast<LogitT*>(scratches[1]);
kernel::NLLLoss( kernel::NLLLoss(
outer_dim, outer_dim,
inner_dim, inner_dim,
X.dim(axis), X.dim(axis),
ignore_index_, ignore_index_,
X.template data<LogitType, Context>(), X.template data<LogitT, Context>(),
Input(1).template data<TargetType, Context>(), Input(1).template data<TargetT, Context>(),
loss, loss,
mask, mask,
ctx()); ctx());
...@@ -42,7 +42,7 @@ void NLLLossOp<Context>::DoRunWithType() { ...@@ -42,7 +42,7 @@ void NLLLossOp<Context>::DoRunWithType() {
math::Copy( math::Copy(
num_preds, num_preds,
loss, loss,
Y->Reshape(out_shape)->template mutable_data<LogitType, Context>(), Y->Reshape(out_shape)->template mutable_data<LogitT, Context>(),
ctx()); ctx());
} else { } else {
int64_t normalizer = 1; int64_t normalizer = 1;
...@@ -59,7 +59,7 @@ void NLLLossOp<Context>::DoRunWithType() { ...@@ -59,7 +59,7 @@ void NLLLossOp<Context>::DoRunWithType() {
normalizer, normalizer,
loss, loss,
mask, mask,
Y->Reshape({})->template mutable_data<LogitType, Context>(), Y->Reshape({})->template mutable_data<LogitT, Context>(),
ctx()); ctx());
} }
} }
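A minimal NumPy reference for the forward pass above, assuming the input already holds log-probabilities and flattening the (outer, inner) layout to one row per prediction for brevity:

import numpy as np

def nll_loss_ref(log_prob, target, ignore_index=None):
    """Per-prediction loss and validity mask (a sketch)."""
    loss = np.zeros(len(target), 'float32')
    mask = np.ones(len(target), 'float32')
    for i, t in enumerate(target):
        if ignore_index is not None and t == ignore_index:
            mask[i] = 0.0  # ignored targets contribute nothing
        else:
            loss[i] = -log_prob[i, t]
    return loss, mask  # reduced by sum / normalizer unless reduction is 'none'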
...@@ -91,7 +91,7 @@ void NLLLossOp<Context>::RunOnDevice() { ...@@ -91,7 +91,7 @@ void NLLLossOp<Context>::RunOnDevice() {
} }
template <class Context> template <class Context>
template <typename LogitType, typename TargetType> template <typename LogitT, typename TargetT>
void NLLLossGradientOp<Context>::DoRunWithType() { void NLLLossGradientOp<Context>::DoRunWithType() {
auto &X = Input(0), &dY = Input(-1), *dX = Output(0); auto &X = Input(0), &dY = Input(-1), *dX = Output(0);
CANONICALIZE_AXIS_WITH_TENSOR(X); CANONICALIZE_AXIS_WITH_TENSOR(X);
...@@ -101,19 +101,19 @@ void NLLLossGradientOp<Context>::DoRunWithType() { ...@@ -101,19 +101,19 @@ void NLLLossGradientOp<Context>::DoRunWithType() {
auto inner_dim = dX->count(axis + 1); auto inner_dim = dX->count(axis + 1);
auto num_preds = outer_dim * inner_dim; auto num_preds = outer_dim * inner_dim;
auto* dy = dY.template data<LogitType, Context>(); auto* dy = dY.template data<LogitT, Context>();
auto* dx = dX->template mutable_data<LogitType, Context>(); auto* dx = dX->template mutable_data<LogitT, Context>();
auto* mask = auto* mask =
ctx()->workspace()->template data<LogitType, Context>({num_preds + 1})[0]; ctx()->workspace()->template data<LogitT, Context>({num_preds + 1})[0];
math::Set(dX->count(), convert::To<LogitType>(0.f), dx, ctx()); math::Set(dX->count(), convert::To<LogitT>(0.f), dx, ctx());
kernel::NLLLossGrad( kernel::NLLLossGrad(
outer_dim, outer_dim,
inner_dim, inner_dim,
dX->dim(axis), dX->dim(axis),
ignore_index_, ignore_index_,
X.template data<LogitType, Context>(), X.template data<LogitT, Context>(),
Input(1).template data<TargetType, Context>(), Input(1).template data<TargetT, Context>(),
dx, dx,
mask, mask,
ctx()); ctx());
......
...@@ -28,7 +28,7 @@ class NLLLossOp final : public Operator<Context> { ...@@ -28,7 +28,7 @@ class NLLLossOp final : public Operator<Context> {
void RunOnDevice() override; void RunOnDevice() override;
template <typename LogitType, typename TargetType> template <typename LogitT, typename TargetT>
void DoRunWithType(); void DoRunWithType();
protected: protected:
...@@ -47,7 +47,7 @@ class NLLLossGradientOp final : public Operator<Context> { ...@@ -47,7 +47,7 @@ class NLLLossGradientOp final : public Operator<Context> {
void RunOnDevice() override; void RunOnDevice() override;
template <typename LogitType, typename TargetType> template <typename LogitT, typename TargetT>
void DoRunWithType(); void DoRunWithType();
protected: protected:
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
namespace dragon { namespace dragon {
template <class Context> template <class Context>
template <typename LogitType, typename TargetType> template <typename LogitT, typename TargetT>
void SigmoidFocalLossOp<Context>::DoRunWithType() { void SigmoidFocalLossOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0); auto &X = Input(0), *Y = Output(0);
CANONICALIZE_AXIS_WITH_TENSOR(X); CANONICALIZE_AXIS_WITH_TENSOR(X);
...@@ -18,11 +18,11 @@ void SigmoidFocalLossOp<Context>::DoRunWithType() { ...@@ -18,11 +18,11 @@ void SigmoidFocalLossOp<Context>::DoRunWithType() {
<< "\nNumber of preds must match the number of targets."; << "\nNumber of preds must match the number of targets.";
auto scratches = ctx()->workspace()->template data<Context>({ auto scratches = ctx()->workspace()->template data<Context>({
X.size() * sizeof(LogitType), // loss X.size() * sizeof(LogitT), // loss
X.size() * sizeof(LogitType) + sizeof(LogitType), // mask X.size() * sizeof(LogitT) + sizeof(LogitT), // mask
}); });
auto* loss = static_cast<LogitType*>(scratches[0]); auto* loss = static_cast<LogitT*>(scratches[0]);
auto* mask = static_cast<LogitType*>(scratches[1]); auto* mask = static_cast<LogitT*>(scratches[1]);
kernel::SigmoidFocalLoss( kernel::SigmoidFocalLoss(
outer_dim, outer_dim,
...@@ -32,8 +32,8 @@ void SigmoidFocalLossOp<Context>::DoRunWithType() { ...@@ -32,8 +32,8 @@ void SigmoidFocalLossOp<Context>::DoRunWithType() {
neg_alpha_, neg_alpha_,
gamma_, gamma_,
negative_index_, negative_index_,
X.template data<LogitType, Context>(), X.template data<LogitT, Context>(),
Input(1).template data<TargetType, Context>(), Input(1).template data<TargetT, Context>(),
loss, loss,
mask, mask,
ctx()); ctx());
...@@ -42,7 +42,7 @@ void SigmoidFocalLossOp<Context>::DoRunWithType() { ...@@ -42,7 +42,7 @@ void SigmoidFocalLossOp<Context>::DoRunWithType() {
math::Copy( math::Copy(
X.count(), X.count(),
loss, loss,
Y->ReshapeLike(X)->template mutable_data<LogitType, Context>(), Y->ReshapeLike(X)->template mutable_data<LogitT, Context>(),
ctx()); ctx());
} else { } else {
int64_t normalizer = 1; int64_t normalizer = 1;
...@@ -59,7 +59,7 @@ void SigmoidFocalLossOp<Context>::DoRunWithType() { ...@@ -59,7 +59,7 @@ void SigmoidFocalLossOp<Context>::DoRunWithType() {
normalizer, normalizer,
loss, loss,
mask, mask,
Y->Reshape({})->template mutable_data<LogitType, Context>(), Y->Reshape({})->template mutable_data<LogitT, Context>(),
ctx()); ctx());
} }
} }
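For orientation, a sketch of the focal-loss form (Lin et al., 2017) this kernel evaluates elementwise. It assumes binary targets and plain positive/negative alpha weighting; the defaults and the exact pos_alpha_/neg_alpha_/negative_index_ handling in the kernel may differ in detail:

import numpy as np

def sigmoid_focal_loss_ref(logit, target, pos_alpha=0.25, neg_alpha=0.75, gamma=2.0):
    """Elementwise focal loss on sigmoid probabilities (a sketch)."""
    p = 1.0 / (1.0 + np.exp(-logit))
    p = np.clip(p, 1e-12, 1.0 - 1e-12)  # guard the logs
    pos_term = target * pos_alpha * (1.0 - p) ** gamma * -np.log(p)
    neg_term = (1.0 - target) * neg_alpha * p ** gamma * -np.log(1.0 - p)
    return pos_term + neg_term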
...@@ -91,7 +91,7 @@ void SigmoidFocalLossOp<Context>::RunOnDevice() { ...@@ -91,7 +91,7 @@ void SigmoidFocalLossOp<Context>::RunOnDevice() {
} }
template <class Context> template <class Context>
template <typename LogitType, typename TargetType> template <typename LogitT, typename TargetT>
void SigmoidFocalLossGradientOp<Context>::DoRunWithType() { void SigmoidFocalLossGradientOp<Context>::DoRunWithType() {
auto &X = Input(0), &dY = Input(-1), *dX = Output(0); auto &X = Input(0), &dY = Input(-1), *dX = Output(0);
CANONICALIZE_AXIS_WITH_TENSOR(X); CANONICALIZE_AXIS_WITH_TENSOR(X);
...@@ -100,10 +100,10 @@ void SigmoidFocalLossGradientOp<Context>::DoRunWithType() { ...@@ -100,10 +100,10 @@ void SigmoidFocalLossGradientOp<Context>::DoRunWithType() {
auto outer_dim = dX->count(0, axis); auto outer_dim = dX->count(0, axis);
auto inner_dim = dX->count(axis + 1); auto inner_dim = dX->count(axis + 1);
auto* dy = dY.template data<LogitType, Context>(); auto* dy = dY.template data<LogitT, Context>();
auto* dx = dX->template mutable_data<LogitType, Context>(); auto* dx = dX->template mutable_data<LogitT, Context>();
auto* mask = ctx()->workspace()->template data<LogitType, Context>( auto* mask =
{dX->count() + 1})[0]; ctx()->workspace()->template data<LogitT, Context>({dX->count() + 1})[0];
kernel::SigmoidFocalLossGrad( kernel::SigmoidFocalLossGrad(
outer_dim, outer_dim,
...@@ -113,8 +113,8 @@ void SigmoidFocalLossGradientOp<Context>::DoRunWithType() { ...@@ -113,8 +113,8 @@ void SigmoidFocalLossGradientOp<Context>::DoRunWithType() {
neg_alpha_, neg_alpha_,
gamma_, gamma_,
negative_index_, negative_index_,
X.template data<LogitType, Context>(), X.template data<LogitT, Context>(),
Input(1).template data<TargetType, Context>(), Input(1).template data<TargetT, Context>(),
dx, dx,
mask, mask,
ctx()); ctx());
......
...@@ -48,7 +48,7 @@ class SigmoidFocalLossOp final : public Operator<Context> { ...@@ -48,7 +48,7 @@ class SigmoidFocalLossOp final : public Operator<Context> {
void RunOnDevice() override; void RunOnDevice() override;
template <typename LogitType, typename TargetType> template <typename LogitT, typename TargetT>
void DoRunWithType(); void DoRunWithType();
protected: protected:
...@@ -88,7 +88,7 @@ class SigmoidFocalLossGradientOp final : public Operator<Context> { ...@@ -88,7 +88,7 @@ class SigmoidFocalLossGradientOp final : public Operator<Context> {
void RunOnDevice() override; void RunOnDevice() override;
template <typename LogitType, typename TargetType> template <typename LogitT, typename TargetT>
void DoRunWithType(); void DoRunWithType();
protected: protected:
......
...@@ -45,7 +45,7 @@ class SparseSoftmaxCrossEntropyOp : public Operator<Context> { ...@@ -45,7 +45,7 @@ class SparseSoftmaxCrossEntropyOp : public Operator<Context> {
void RunOnDevice() override; void RunOnDevice() override;
template <typename LogitType, typename TargetType> template <typename LogitT, typename TargetT>
void DoRunWithType(); void DoRunWithType();
protected: protected:
...@@ -81,7 +81,7 @@ class SparseSoftmaxCrossEntropyGradientOp : public Operator<Context> { ...@@ -81,7 +81,7 @@ class SparseSoftmaxCrossEntropyGradientOp : public Operator<Context> {
void RunOnDevice() override; void RunOnDevice() override;
template <typename LogitType, typename TargetType> template <typename LogitT, typename TargetT>
void DoRunWithType(); void DoRunWithType();
protected: protected:
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
namespace dragon { namespace dragon {
template <class Context> template <class Context>
template <typename LogitType, typename TargetType> template <typename LogitT, typename TargetT>
void SparseSoftmaxCrossEntropyOp<Context>::DoRunWithType() { void SparseSoftmaxCrossEntropyOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0); auto &X = Input(0), *Y = Output(0);
CANONICALIZE_AXIS_WITH_TENSOR(X); CANONICALIZE_AXIS_WITH_TENSOR(X);
...@@ -18,20 +18,20 @@ void SparseSoftmaxCrossEntropyOp<Context>::DoRunWithType() { ...@@ -18,20 +18,20 @@ void SparseSoftmaxCrossEntropyOp<Context>::DoRunWithType() {
CHECK_EQ(num_preds, Input(1).count()) CHECK_EQ(num_preds, Input(1).count())
<< "\nNumber of preds must match the number of targets."; << "\nNumber of preds must match the number of targets.";
auto* X_prob = Buffer("prob")->ReshapeLike(X); auto* X_prob = Buffer("prob")->ReshapeLike(X);
auto* prob = X_prob->template mutable_data<LogitType, Context>(); auto* prob = X_prob->template mutable_data<LogitT, Context>();
auto scratches = ctx()->workspace()->template data<Context>({ auto scratches = ctx()->workspace()->template data<Context>({
(size_t)num_preds * sizeof(LogitType), // loss (size_t)num_preds * sizeof(LogitT), // loss
(size_t)num_preds * sizeof(LogitType) + sizeof(LogitType), // mask (size_t)num_preds * sizeof(LogitT) + sizeof(LogitT), // mask
}); });
auto* loss = static_cast<LogitType*>(scratches[0]); auto* loss = static_cast<LogitT*>(scratches[0]);
auto* mask = static_cast<LogitType*>(scratches[1]); auto* mask = static_cast<LogitT*>(scratches[1]);
kernel::Softmax( kernel::Softmax(
outer_dim, outer_dim,
inner_dim, inner_dim,
X.dim(axis), X.dim(axis),
X.template data<LogitType, Context>(), X.template data<LogitT, Context>(),
prob, prob,
ctx()); ctx());
...@@ -41,7 +41,7 @@ void SparseSoftmaxCrossEntropyOp<Context>::DoRunWithType() { ...@@ -41,7 +41,7 @@ void SparseSoftmaxCrossEntropyOp<Context>::DoRunWithType() {
X.dim(axis), X.dim(axis),
ignore_index_, ignore_index_,
prob, prob,
Input(1).template data<TargetType, Context>(), Input(1).template data<TargetT, Context>(),
loss, loss,
mask, mask,
ctx()); ctx());
...@@ -52,7 +52,7 @@ void SparseSoftmaxCrossEntropyOp<Context>::DoRunWithType() { ...@@ -52,7 +52,7 @@ void SparseSoftmaxCrossEntropyOp<Context>::DoRunWithType() {
math::Copy( math::Copy(
num_preds, num_preds,
loss, loss,
Y->Reshape(out_shape)->template mutable_data<LogitType, Context>(), Y->Reshape(out_shape)->template mutable_data<LogitT, Context>(),
ctx()); ctx());
} else { } else {
int64_t normalizer = 1; int64_t normalizer = 1;
...@@ -69,7 +69,7 @@ void SparseSoftmaxCrossEntropyOp<Context>::DoRunWithType() { ...@@ -69,7 +69,7 @@ void SparseSoftmaxCrossEntropyOp<Context>::DoRunWithType() {
normalizer, normalizer,
loss, loss,
mask, mask,
Y->Reshape({})->template mutable_data<LogitType, Context>(), Y->Reshape({})->template mutable_data<LogitT, Context>(),
ctx()); ctx());
} }
} }
...@@ -101,7 +101,7 @@ void SparseSoftmaxCrossEntropyOp<Context>::RunOnDevice() { ...@@ -101,7 +101,7 @@ void SparseSoftmaxCrossEntropyOp<Context>::RunOnDevice() {
} }
template <class Context> template <class Context>
template <typename LogitType, typename TargetType> template <typename LogitT, typename TargetT>
void SparseSoftmaxCrossEntropyGradientOp<Context>::DoRunWithType() { void SparseSoftmaxCrossEntropyGradientOp<Context>::DoRunWithType() {
auto &dY = Input(-1), *dX = Output(0); auto &dY = Input(-1), *dX = Output(0);
CANONICALIZE_AXIS_WITH_TENSOR(Input(0)); CANONICALIZE_AXIS_WITH_TENSOR(Input(0));
...@@ -110,11 +110,11 @@ void SparseSoftmaxCrossEntropyGradientOp<Context>::DoRunWithType() { ...@@ -110,11 +110,11 @@ void SparseSoftmaxCrossEntropyGradientOp<Context>::DoRunWithType() {
auto inner_dim = dX->count(axis + 1); auto inner_dim = dX->count(axis + 1);
auto num_preds = outer_dim * inner_dim; auto num_preds = outer_dim * inner_dim;
auto* prob = Buffer("prob")->template data<LogitType, Context>(); auto* prob = Buffer("prob")->template data<LogitT, Context>();
auto* dy = Input(-1).template data<LogitType, Context>(); auto* dy = Input(-1).template data<LogitT, Context>();
auto* dx = Output(0)->template mutable_data<LogitType, Context>(); auto* dx = Output(0)->template mutable_data<LogitT, Context>();
auto* mask = auto* mask =
ctx()->workspace()->template data<LogitType, Context>({num_preds + 1})[0]; ctx()->workspace()->template data<LogitT, Context>({num_preds + 1})[0];
math::Copy(dX->count(), prob, dx, ctx()); math::Copy(dX->count(), prob, dx, ctx());
...@@ -124,7 +124,7 @@ void SparseSoftmaxCrossEntropyGradientOp<Context>::DoRunWithType() { ...@@ -124,7 +124,7 @@ void SparseSoftmaxCrossEntropyGradientOp<Context>::DoRunWithType() {
dX->dim(axis), dX->dim(axis),
ignore_index_, ignore_index_,
prob, prob,
Input(1).template data<TargetType, Context>(), Input(1).template data<TargetT, Context>(),
dx, dx,
mask, mask,
ctx()); ctx());
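The copy-then-adjust pattern above (math::Copy of prob into dx, then the kernel) relies on the standard softmax cross-entropy identity. A sketch, assuming a row-major (prediction, class) layout:

import numpy as np

def softmax_ce_grad_ref(prob, target, ignore_index=None):
    """d(loss)/d(logit) = prob - onehot(target), zeroed where ignored."""
    dx = prob.copy()
    for i, t in enumerate(target):
        if ignore_index is not None and t == ignore_index:
            dx[i] = 0.0
        else:
            dx[i, t] -= 1.0
    return dx  # scaled by dy / normalizer afterwards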
......
...@@ -5,8 +5,9 @@ ...@@ -5,8 +5,9 @@
namespace dragon { namespace dragon {
template <class Context> template <class Context>
template <typename Tx, typename Ty> template <typename T>
void MomentsOp<Context>::DoRunWithType() { void MomentsOp<Context>::DoRunWithType() {
using OutputT = typename math::utils::AccmulatorType<T>::type;
auto &X = Input(0), *Y1 = Output(0), *Y2 = Output(1); auto &X = Input(0), *Y1 = Output(0), *Y2 = Output(1);
// Determine the reduce axes // Determine the reduce axes
...@@ -35,13 +36,13 @@ void MomentsOp<Context>::DoRunWithType() { ...@@ -35,13 +36,13 @@ void MomentsOp<Context>::DoRunWithType() {
if (X.count() == 1) { if (X.count() == 1) {
math::Cast( math::Cast(
1, 1,
X.template data<Tx, Context>(), X.template data<T, Context>(),
Y1->Reshape(Y_shape)->template mutable_data<Ty, Context>(), Y1->Reshape(Y_shape)->template mutable_data<OutputT, Context>(),
ctx()); ctx());
math::Set( math::Set(
1, 1,
convert::To<Ty>(0.f), convert::To<OutputT>(0.f),
Y2->Reshape(Y_shape)->template mutable_data<Ty, Context>(), Y2->Reshape(Y_shape)->template mutable_data<OutputT, Context>(),
ctx()); ctx());
} else { } else {
kernel::Moments( kernel::Moments(
...@@ -49,35 +50,16 @@ void MomentsOp<Context>::DoRunWithType() { ...@@ -49,35 +50,16 @@ void MomentsOp<Context>::DoRunWithType() {
X_dims.data(), X_dims.data(),
reduce_axes.size(), reduce_axes.size(),
reduce_axes.data(), reduce_axes.data(),
X.template data<Tx, Context>(), X.template data<T, Context>(),
Y1->Reshape(Y_shape)->template mutable_data<Ty, Context>(), Y1->Reshape(Y_shape)->template mutable_data<OutputT, Context>(),
Y2->Reshape(Y_shape)->template mutable_data<Ty, Context>(), Y2->Reshape(Y_shape)->template mutable_data<OutputT, Context>(),
ctx()); ctx());
} }
} }
template <class Context> template <class Context>
void MomentsOp<Context>::RunOnDevice() { void MomentsOp<Context>::RunOnDevice() {
auto& X = Input(0); DispatchHelper<NumericalTensorTypes>::Call(this, Input(0));
if (X.template IsType<int8_t>()) {
DoRunWithType<int8_t, float>();
} else if (X.template IsType<uint8_t>()) {
DoRunWithType<uint8_t, float>();
} else if (X.template IsType<int>()) {
DoRunWithType<int, float>();
} else if (X.template IsType<int64_t>()) {
DoRunWithType<int64_t, float>();
} else if (X.template IsType<float16>()) {
DoRunWithType<float16, float>();
} else if (X.template IsType<float>()) {
DoRunWithType<float, float>();
} else if (X.template IsType<double>()) {
DoRunWithType<double, double>();
} else {
LOG(FATAL) << MessageForUnsupported(
types::to_string(X.meta()),
{"int8", "uint8", "int32", "int64", "float16", "float32", "float64"});
}
} }
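The DispatchHelper now routes every numerical input type through the single template, with the output type picked by AccmulatorType instead of the hand-written ladder. A NumPy sketch of the statistics themselves, assuming the biased (population) variance:

import numpy as np

def moments_ref(x, axes=None, keep_dims=False):
    """Mean and biased variance over `axes` (a sketch)."""
    axes = tuple(axes) if axes is not None else None
    mean = x.mean(axis=axes, keepdims=keep_dims)
    var = x.var(axis=axes, keepdims=keep_dims)  # E[x^2] - E[x]^2
    return mean, var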
DEPLOY_CPU_OPERATOR(Moments); DEPLOY_CPU_OPERATOR(Moments);
......
...@@ -28,7 +28,7 @@ class MomentsOp final : public Operator<Context> { ...@@ -28,7 +28,7 @@ class MomentsOp final : public Operator<Context> {
void RunOnDevice() override; void RunOnDevice() override;
template <typename Tx, typename Ty> template <typename T>
void DoRunWithType(); void DoRunWithType();
protected: protected:
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
namespace dragon { namespace dragon {
template <class Context> template <class Context>
template <typename LogitType, typename TargetType> template <typename LogitT, typename TargetT>
void AccuracyOp<Context>::DoRunWithType() { void AccuracyOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0); auto &X = Input(0), *Y = Output(0);
CANONICALIZE_AXIS_WITH_TENSOR(X); CANONICALIZE_AXIS_WITH_TENSOR(X);
...@@ -18,21 +18,21 @@ void AccuracyOp<Context>::DoRunWithType() { ...@@ -18,21 +18,21 @@ void AccuracyOp<Context>::DoRunWithType() {
int64_t acc = 0, count = 0; int64_t acc = 0, count = 0;
int64_t cols = X.count() / outer_dim; int64_t cols = X.count() / outer_dim;
auto* logit = X.template data<LogitType, CPUContext>(); auto* logit = X.template data<LogitT, CPUContext>();
auto* target = Input(1).template data<TargetType, CPUContext>(); auto* target = Input(1).template data<TargetT, CPUContext>();
for (int i = 0; i < outer_dim; ++i) { for (int i = 0; i < outer_dim; ++i) {
for (int j = 0; j < inner_dim; ++j) { for (int j = 0; j < inner_dim; ++j) {
const int label = target[i * inner_dim + j]; const int label = target[i * inner_dim + j];
if (label == ignore_index_) continue; if (label == ignore_index_) continue;
vector<pair<LogitType, int>> vec; vector<pair<LogitT, int>> vec;
for (int k = 0; k < axis_dim; k++) for (int k = 0; k < axis_dim; k++)
vec.push_back(std::make_pair(logit[i * cols + k * inner_dim + j], k)); vec.push_back(std::make_pair(logit[i * cols + k * inner_dim + j], k));
std::partial_sort( std::partial_sort(
vec.begin(), vec.begin(),
vec.begin() + top_k_, vec.begin() + top_k_,
vec.end(), vec.end(),
std::greater<pair<LogitType, int>>()); std::greater<pair<LogitT, int>>());
for (int k = 0; k < top_k_; k++) { for (int k = 0; k < top_k_; k++) {
if (vec[k].second == label) { if (vec[k].second == label) {
acc++; acc++;
......
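A compact reference for the top-k loop above (the excerpt is truncated before the final counting); it assumes one row of class scores per prediction:

import numpy as np

def topk_accuracy_ref(logit, target, top_k=1, ignore_index=None):
    """Fraction of valid targets found among the top-k scores (a sketch)."""
    acc = count = 0
    for scores, label in zip(logit, target):
        if ignore_index is not None and label == ignore_index:
            continue  # skipped entries do not enter the denominator
        count += 1
        if label in np.argsort(-scores)[:top_k]:
            acc += 1
    return acc / max(count, 1)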
...@@ -28,7 +28,7 @@ class AccuracyOp final : public Operator<Context> { ...@@ -28,7 +28,7 @@ class AccuracyOp final : public Operator<Context> {
void RunOnDevice() override; void RunOnDevice() override;
template <typename LogitType, typename TargetType> template <typename LogitT, typename TargetT>
void DoRunWithType(); void DoRunWithType();
protected: protected:
......
...@@ -8,11 +8,11 @@ namespace dragon { ...@@ -8,11 +8,11 @@ namespace dragon {
template <class Context> template <class Context>
template <typename T> template <typename T>
void BatchNormOp<Context>::TrainingImpl() { void BatchNormOp<Context>::TrainingImpl() {
using ParamType = typename math::utils::AccmulatorType<T>::type; using ParamT = typename math::utils::AccmulatorType<T>::type;
TENSOR_FILL_WITH_TYPE(Input(1), vec64_t({C_}), ParamType); TENSOR_FILL_WITH_TYPE(Input(1), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(2), vec64_t({C_}), ParamType); TENSOR_FILL_WITH_TYPE(Input(2), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(3), vec64_t({C_}), ParamType); TENSOR_FILL_WITH_TYPE(Input(3), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(4), vec64_t({C_}), ParamType); TENSOR_FILL_WITH_TYPE(Input(4), vec64_t({C_}), ParamT);
auto* X_mu = Buffer("X_mu")->Reshape({C_}); auto* X_mu = Buffer("X_mu")->Reshape({C_});
auto* X_rsig = Buffer("X_rsig")->Reshape({C_}); auto* X_rsig = Buffer("X_rsig")->Reshape({C_});
...@@ -20,11 +20,11 @@ void BatchNormOp<Context>::TrainingImpl() { ...@@ -20,11 +20,11 @@ void BatchNormOp<Context>::TrainingImpl() {
auto* X_bias = Buffer("X_bias")->Reshape({C_}); auto* X_bias = Buffer("X_bias")->Reshape({C_});
auto* x = Input(0).template data<T, Context>(); auto* x = Input(0).template data<T, Context>();
auto* rm = Input(3).template mutable_data<ParamType, Context>(); auto* rm = Input(3).template mutable_data<ParamT, Context>();
auto* rv = Input(4).template mutable_data<ParamType, Context>(); auto* rv = Input(4).template mutable_data<ParamT, Context>();
auto* mu = X_mu->template mutable_data<ParamType, Context>(); auto* mu = X_mu->template mutable_data<ParamT, Context>();
auto* rsig = X_rsig->template mutable_data<ParamType, Context>(); auto* rsig = X_rsig->template mutable_data<ParamT, Context>();
auto* scale = X_scale->template mutable_data<ParamType, Context>(); auto* scale = X_scale->template mutable_data<ParamT, Context>();
// Compute moments // Compute moments
if (sync_stats_ > 0) { if (sync_stats_ > 0) {
...@@ -45,7 +45,7 @@ void BatchNormOp<Context>::TrainingImpl() { ...@@ -45,7 +45,7 @@ void BatchNormOp<Context>::TrainingImpl() {
if (enable_nccl_) { if (enable_nccl_) {
#ifdef USE_NCCL #ifdef USE_NCCL
auto coll_comm = this->nccl_comm(); auto coll_comm = this->nccl_comm();
auto coll_dtype = this->template nccl_dtype<ParamType>(); auto coll_dtype = this->template nccl_dtype<ParamT>();
NCCL_CHECK(ncclAllReduce( NCCL_CHECK(ncclAllReduce(
(void*)mu, (void*)mu,
(void*)mu, (void*)mu,
...@@ -84,8 +84,9 @@ void BatchNormOp<Context>::TrainingImpl() { ...@@ -84,8 +84,9 @@ void BatchNormOp<Context>::TrainingImpl() {
// Compute running statistics // Compute running statistics
if (is_recomputing_ == 0) { if (is_recomputing_ == 0) {
math::Axpby(C_, 1.f - momentum_, mu, momentum_, rm, ctx()); auto decay_factor = momentum();
math::Axpby(C_, 1.f - momentum_, rsig, momentum_, rv, ctx()); math::Axpby(C_, 1.f - decay_factor, mu, decay_factor, rm, ctx());
math::Axpby(C_, 1.f - decay_factor, rsig, decay_factor, rv, ctx());
} }
// Inverse stddev from variance // Inverse stddev from variance
...@@ -100,10 +101,10 @@ void BatchNormOp<Context>::TrainingImpl() { ...@@ -100,10 +101,10 @@ void BatchNormOp<Context>::TrainingImpl() {
x, x,
mu, mu,
rsig, rsig,
Input(1).template data<ParamType, Context>(), // gamma Input(1).template data<ParamT, Context>(), // gamma
Input(2).template data<ParamType, Context>(), // beta Input(2).template data<ParamT, Context>(), // beta
scale, scale,
X_bias->template mutable_data<ParamType, Context>(), X_bias->template mutable_data<ParamT, Context>(),
Output(0)->template mutable_data<T, Context>(), Output(0)->template mutable_data<T, Context>(),
ctx()); ctx());
} }
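With math::Axpby(n, alpha, x, beta, y) computing y = alpha * x + beta * y, the running-statistics step above is the usual exponential moving average; note that at this point the rsig buffer still holds the batch variance (InvStd runs afterwards). A sketch, with `momentum` now resolvable from a descriptor at run time:

def update_running_stats(rm, rv, batch_mean, batch_var, momentum=0.9):
    """EMA update performed when not recomputing (a sketch)."""
    rm = (1.0 - momentum) * batch_mean + momentum * rm
    rv = (1.0 - momentum) * batch_var + momentum * rv
    return rm, rv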
...@@ -111,17 +112,17 @@ void BatchNormOp<Context>::TrainingImpl() { ...@@ -111,17 +112,17 @@ void BatchNormOp<Context>::TrainingImpl() {
template <class Context> template <class Context>
template <typename T> template <typename T>
void BatchNormOp<Context>::InferenceImpl() { void BatchNormOp<Context>::InferenceImpl() {
using ParamType = typename math::utils::AccmulatorType<T>::type; using ParamT = typename math::utils::AccmulatorType<T>::type;
TENSOR_FILL_WITH_TYPE(Input(1), vec64_t({C_}), ParamType); TENSOR_FILL_WITH_TYPE(Input(1), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(2), vec64_t({C_}), ParamType); TENSOR_FILL_WITH_TYPE(Input(2), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(3), vec64_t({C_}), ParamType); TENSOR_FILL_WITH_TYPE(Input(3), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(4), vec64_t({C_}), ParamType); TENSOR_FILL_WITH_TYPE(Input(4), vec64_t({C_}), ParamT);
auto* X_rsig = Buffer("X_rsig")->Reshape({C_}); auto* X_rsig = Buffer("X_rsig")->Reshape({C_});
auto* X_scale = Buffer("X_scale")->Reshape({C_}); auto* X_scale = Buffer("X_scale")->Reshape({C_});
auto* X_bias = Buffer("X_bias")->Reshape({C_}); auto* X_bias = Buffer("X_bias")->Reshape({C_});
auto* rv = Input(4).template data<ParamType, Context>(); auto* rv = Input(4).template data<ParamT, Context>();
auto* rsig = X_rsig->template mutable_data<ParamType, Context>(); auto* rsig = X_rsig->template mutable_data<ParamT, Context>();
// Inverse stddev from variance // Inverse stddev from variance
math::InvStd(C_, epsilon_, rv, rsig, ctx()); math::InvStd(C_, epsilon_, rv, rsig, ctx());
...@@ -133,12 +134,12 @@ void BatchNormOp<Context>::InferenceImpl() { ...@@ -133,12 +134,12 @@ void BatchNormOp<Context>::InferenceImpl() {
S_, S_,
data_format(), data_format(),
Input(0).template data<T, Context>(), Input(0).template data<T, Context>(),
Input(3).template data<ParamType, Context>(), Input(3).template data<ParamT, Context>(),
rsig, rsig,
Input(1).template data<ParamType, Context>(), // gamma Input(1).template data<ParamT, Context>(), // gamma
Input(2).template data<ParamType, Context>(), // beta Input(2).template data<ParamT, Context>(), // beta
X_scale->template mutable_data<ParamType, Context>(), X_scale->template mutable_data<ParamT, Context>(),
X_bias->template mutable_data<ParamType, Context>(), X_bias->template mutable_data<ParamT, Context>(),
Output(0)->template mutable_data<T, Context>(), Output(0)->template mutable_data<T, Context>(),
ctx()); ctx());
} }
...@@ -159,17 +160,17 @@ void BatchNormOp<Context>::RunOnDevice() { ...@@ -159,17 +160,17 @@ void BatchNormOp<Context>::RunOnDevice() {
template <class Context> template <class Context>
template <typename T> template <typename T>
void BatchNormGradientOp<Context>::TrainingImpl() { void BatchNormGradientOp<Context>::TrainingImpl() {
using ParamType = typename math::utils::AccmulatorType<T>::type; using ParamT = typename math::utils::AccmulatorType<T>::type;
auto *dX = Output(0), *dW = Output(1), *dB = Output(2); auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
auto *X_mu = Buffer("X_mu"), *X_rsig = Buffer("X_rsig"); auto *X_mu = Buffer("X_mu"), *X_rsig = Buffer("X_rsig");
auto* x = Input(0).template data<T, Context>(); auto* x = Input(0).template data<T, Context>();
auto* gamma = Input(1).template data<ParamType, Context>(); auto* gamma = Input(1).template data<ParamT, Context>();
auto* dy = Input(4).template data<T, Context>(); auto* dy = Input(4).template data<T, Context>();
auto* mu = X_mu->template data<ParamType, Context>(); auto* mu = X_mu->template data<ParamT, Context>();
auto* rsig = X_rsig->template data<ParamType, Context>(); auto* rsig = X_rsig->template data<ParamT, Context>();
auto* dgamma = dW->Reshape({C_})->template mutable_data<ParamType, Context>(); auto* dgamma = dW->Reshape({C_})->template mutable_data<ParamT, Context>();
auto* dbeta = dB->Reshape({C_})->template mutable_data<ParamType, Context>(); auto* dbeta = dB->Reshape({C_})->template mutable_data<ParamT, Context>();
// Gradient w.r.t. gamma and beta // Gradient w.r.t. gamma and beta
kernel::BatchNormWGrad( kernel::BatchNormWGrad(
...@@ -181,7 +182,7 @@ void BatchNormGradientOp<Context>::TrainingImpl() { ...@@ -181,7 +182,7 @@ void BatchNormGradientOp<Context>::TrainingImpl() {
if (enable_nccl_) { if (enable_nccl_) {
#ifdef USE_NCCL #ifdef USE_NCCL
auto coll_comm = this->nccl_comm(); auto coll_comm = this->nccl_comm();
auto coll_dtype = this->template nccl_dtype<ParamType>(); auto coll_dtype = this->template nccl_dtype<ParamT>();
NCCL_CHECK(ncclAllReduce( NCCL_CHECK(ncclAllReduce(
(void*)dgamma, (void*)dgamma,
(void*)dgamma, (void*)dgamma,
...@@ -231,18 +232,18 @@ void BatchNormGradientOp<Context>::TrainingImpl() { ...@@ -231,18 +232,18 @@ void BatchNormGradientOp<Context>::TrainingImpl() {
template <class Context> template <class Context>
template <typename T> template <typename T>
void BatchNormGradientOp<Context>::InferenceImpl() { void BatchNormGradientOp<Context>::InferenceImpl() {
using ParamType = typename math::utils::AccmulatorType<T>::type; using ParamT = typename math::utils::AccmulatorType<T>::type;
auto *dX = Output(0), *dW = Output(1), *dB = Output(2); auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
auto* X_scale = Buffer("X_scale")->Reshape({C_}); auto* X_scale = Buffer("X_scale")->Reshape({C_});
auto* rv = Input(3).template data<ParamType, Context>(); auto* rv = Input(3).template data<ParamT, Context>();
auto* rsig = X_scale->template mutable_data<ParamType, Context>(); auto* rsig = X_scale->template mutable_data<ParamT, Context>();
// Gradient w.r.t. gamma or beta if necessary // Gradient w.r.t. gamma or beta if necessary
ParamType *dgamma = nullptr, *dbeta = nullptr; ParamT *dgamma = nullptr, *dbeta = nullptr;
if (dW->has_name() || dB->has_name()) { if (dW->has_name() || dB->has_name()) {
dgamma = dW->Reshape({C_})->template mutable_data<ParamType, Context>(); dgamma = dW->Reshape({C_})->template mutable_data<ParamT, Context>();
dbeta = dB->Reshape({C_})->template mutable_data<ParamType, Context>(); dbeta = dB->Reshape({C_})->template mutable_data<ParamT, Context>();
} }
// Inverse stddev from variance // Inverse stddev from variance
...@@ -255,9 +256,9 @@ void BatchNormGradientOp<Context>::InferenceImpl() { ...@@ -255,9 +256,9 @@ void BatchNormGradientOp<Context>::InferenceImpl() {
S_, S_,
data_format(), data_format(),
Input(0).template data<T, Context>(), // x Input(0).template data<T, Context>(), // x
Input(2).template data<ParamType, Context>(), // rm Input(2).template data<ParamT, Context>(), // rm
rsig, rsig,
Input(1).template data<ParamType, Context>(), // gamma Input(1).template data<ParamT, Context>(), // gamma
Input(4).template data<T, Context>(), // dy Input(4).template data<T, Context>(), // dy
dgamma, dgamma,
dbeta, dbeta,
......
...@@ -33,7 +33,6 @@ class BatchNormOpBase : public GenericOpBase<Context> { ...@@ -33,7 +33,6 @@ class BatchNormOpBase : public GenericOpBase<Context> {
public: public:
BatchNormOpBase(const OperatorDef& def, Workspace* ws) BatchNormOpBase(const OperatorDef& def, Workspace* ws)
: GenericOpBase<Context>(def, ws), : GenericOpBase<Context>(def, ws),
momentum_(OP_SINGLE_ARG(float, "momentum", 0.9f)),
epsilon_(OP_SINGLE_ARG(double, "epsilon", 1e-5)), epsilon_(OP_SINGLE_ARG(double, "epsilon", 1e-5)),
use_stats_(OP_SINGLE_ARG(int64_t, "use_stats", -1)), use_stats_(OP_SINGLE_ARG(int64_t, "use_stats", -1)),
sync_stats_(OP_SINGLE_ARG(int64_t, "comm", 0) > 0 ? 1 : 0) {} sync_stats_(OP_SINGLE_ARG(int64_t, "comm", 0) > 0 ? 1 : 0) {}
...@@ -57,7 +56,6 @@ class BatchNormOpBase : public GenericOpBase<Context> { ...@@ -57,7 +56,6 @@ class BatchNormOpBase : public GenericOpBase<Context> {
} }
protected: protected:
float momentum_;
double epsilon_; double epsilon_;
int64_t N_, C_, S_; int64_t N_, C_, S_;
int64_t use_stats_, sync_stats_; int64_t use_stats_, sync_stats_;
...@@ -68,7 +66,6 @@ class BatchNormOpBase : public GenericOpBase<Context> { ...@@ -68,7 +66,6 @@ class BatchNormOpBase : public GenericOpBase<Context> {
#define USE_BATCHNORM_FUNCTIONS \ #define USE_BATCHNORM_FUNCTIONS \
using BatchNormOpBase<Context>::DetermineBaseArguments; \ using BatchNormOpBase<Context>::DetermineBaseArguments; \
using BatchNormOpBase<Context>::momentum_; \
using BatchNormOpBase<Context>::epsilon_; \ using BatchNormOpBase<Context>::epsilon_; \
using BatchNormOpBase<Context>::use_stats_; \ using BatchNormOpBase<Context>::use_stats_; \
using BatchNormOpBase<Context>::sync_stats_; \ using BatchNormOpBase<Context>::sync_stats_; \
...@@ -82,7 +79,9 @@ template <class Context> ...@@ -82,7 +79,9 @@ template <class Context>
class BatchNormOp : public BatchNormOpBase<Context> { class BatchNormOp : public BatchNormOpBase<Context> {
public: public:
BatchNormOp(const OperatorDef& def, Workspace* ws) BatchNormOp(const OperatorDef& def, Workspace* ws)
: BatchNormOpBase<Context>(def, ws) {} : BatchNormOpBase<Context>(def, ws) {
INIT_OP_SINGLE_ARG_WITH_DESC(float, momentum, 0.9f);
}
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
USE_BATCHNORM_FUNCTIONS; USE_BATCHNORM_FUNCTIONS;
#ifdef USE_MPI #ifdef USE_MPI
...@@ -105,6 +104,8 @@ class BatchNormOp : public BatchNormOpBase<Context> { ...@@ -105,6 +104,8 @@ class BatchNormOp : public BatchNormOpBase<Context> {
InferenceImpl<T>(); InferenceImpl<T>();
} }
}; };
DECLARE_OP_SINGLE_ARG_WITH_DESC(float, momentum);
}; };
template <class Context> template <class Context>
...@@ -146,11 +147,9 @@ class CuDNNBatchNormOp final : public BatchNormOpBase<Context> { ...@@ -146,11 +147,9 @@ class CuDNNBatchNormOp final : public BatchNormOpBase<Context> {
CuDNNCreateTensorDesc(&bn_desc_); CuDNNCreateTensorDesc(&bn_desc_);
CuDNNCreateTensorDesc(&input_desc_); CuDNNCreateTensorDesc(&input_desc_);
if (epsilon_ <= CUDNN_BN_MIN_EPSILON) { if (epsilon_ <= CUDNN_BN_MIN_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. \nSet it to "
<< "CUDNN_BN_MIN_EPSILON instead.";
epsilon_ = CUDNN_BN_MIN_EPSILON; epsilon_ = CUDNN_BN_MIN_EPSILON;
} }
INIT_OP_SINGLE_ARG_WITH_DESC(float, momentum, 0.9f);
} }
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
USE_BATCHNORM_FUNCTIONS; USE_BATCHNORM_FUNCTIONS;
...@@ -168,6 +167,7 @@ class CuDNNBatchNormOp final : public BatchNormOpBase<Context> { ...@@ -168,6 +167,7 @@ class CuDNNBatchNormOp final : public BatchNormOpBase<Context> {
protected: protected:
cudnnTensorDescriptor_t input_desc_, bn_desc_; cudnnTensorDescriptor_t input_desc_, bn_desc_;
cudnnBatchNormMode_t bn_mode_; cudnnBatchNormMode_t bn_mode_;
DECLARE_OP_SINGLE_ARG_WITH_DESC(float, momentum);
}; };
template <class Context> template <class Context>
...@@ -178,9 +178,6 @@ class CuDNNBatchNormGradientOp final : public BatchNormGradientOp<Context> { ...@@ -178,9 +178,6 @@ class CuDNNBatchNormGradientOp final : public BatchNormGradientOp<Context> {
CuDNNCreateTensorDesc(&bn_desc_); CuDNNCreateTensorDesc(&bn_desc_);
CuDNNCreateTensorDesc(&input_desc_); CuDNNCreateTensorDesc(&input_desc_);
if (epsilon_ <= CUDNN_BN_MIN_EPSILON) { if (epsilon_ <= CUDNN_BN_MIN_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. \nSet it to "
<< "CUDNN_BN_MIN_EPSILON instead.";
epsilon_ = CUDNN_BN_MIN_EPSILON; epsilon_ = CUDNN_BN_MIN_EPSILON;
} }
} }
...@@ -211,8 +208,12 @@ class CuDNNBatchNormGradientOp final : public BatchNormGradientOp<Context> { ...@@ -211,8 +208,12 @@ class CuDNNBatchNormGradientOp final : public BatchNormGradientOp<Context> {
cudnnBatchNormMode_t bn_mode_; cudnnBatchNormMode_t bn_mode_;
}; };
DEFINE_OP_SINGLE_ARG_WITH_DESC(float, CuDNNBatchNormOp, momentum);
#endif // USE_CUDNN #endif // USE_CUDNN
DEFINE_OP_SINGLE_ARG_WITH_DESC(float, BatchNormOp, momentum);
} // namespace dragon } // namespace dragon
#endif // DRAGON_OPERATORS_NORMALIZATION_BATCH_NORM_OP_H_ #endif // DRAGON_OPERATORS_NORMALIZATION_BATCH_NORM_OP_H_
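The DECLARE/DEFINE/INIT_OP_SINGLE_ARG_WITH_DESC trio replaces the fixed momentum_ float so the momentum can also be supplied as a run-time descriptor (e.g. a scheduled value). A hedged sketch of the lookup such an argument implies; the `workspace` mapping is a hypothetical stand-in for the framework's tensor lookup, not its API:

def resolve_arg(args, name, default, workspace=None):
    """Prefer an immediate value, else read the named descriptor (a sketch)."""
    if args.get(name) is not None:
        return args[name]
    desc = args.get(name + '_desc')
    if desc is not None and workspace is not None:
        return workspace[desc]  # hypothetical run-time lookup by tensor name
    return default

# e.g. a momentum driven by a schedule:
# resolve_arg({'momentum_desc': 'bn/momentum'}, 'momentum', 0.9,
#             workspace={'bn/momentum': 0.99})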
...@@ -9,11 +9,11 @@ namespace dragon { ...@@ -9,11 +9,11 @@ namespace dragon {
template <class Context> template <class Context>
template <typename T> template <typename T>
void CuDNNBatchNormOp<Context>::DoRunWithType() { void CuDNNBatchNormOp<Context>::DoRunWithType() {
using ParamType = typename CuDNNType<T>::BNParamType; using ParamT = typename CuDNNType<T>::BNParamType;
TENSOR_FILL_WITH_TYPE(Input(1), vec64_t({C_}), ParamType); TENSOR_FILL_WITH_TYPE(Input(1), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(2), vec64_t({C_}), ParamType); TENSOR_FILL_WITH_TYPE(Input(2), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(3), vec64_t({C_}), ParamType); TENSOR_FILL_WITH_TYPE(Input(3), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(4), vec64_t({C_}), ParamType); TENSOR_FILL_WITH_TYPE(Input(4), vec64_t({C_}), ParamT);
// Determine the descriptors // Determine the descriptors
if (Input(0).ndim() == 2) { if (Input(0).ndim() == 2) {
...@@ -39,14 +39,14 @@ void CuDNNBatchNormOp<Context>::DoRunWithType() { ...@@ -39,14 +39,14 @@ void CuDNNBatchNormOp<Context>::DoRunWithType() {
input_desc_, input_desc_,
Output(0)->template mutable_data<T, Context>(), // y Output(0)->template mutable_data<T, Context>(), // y
bn_desc_, bn_desc_,
Input(1).template data<ParamType, Context>(), // gamma Input(1).template data<ParamT, Context>(), // gamma
Input(2).template data<ParamType, Context>(), // beta Input(2).template data<ParamT, Context>(), // beta
is_recomputing_ > 0 ? 0.f : 1.f - this->momentum_, is_recomputing_ == 0 ? 1.f - momentum() : 0.f,
Input(3).template mutable_data<ParamType, Context>(), // rm Input(3).template mutable_data<ParamT, Context>(), // rm
Input(4).template mutable_data<ParamType, Context>(), // rv Input(4).template mutable_data<ParamT, Context>(), // rv
epsilon_, epsilon_,
X_mu->template mutable_data<ParamType, Context>(), // sm X_mu->template mutable_data<ParamT, Context>(), // sm
X_rsig->template mutable_data<ParamType, Context>())); // sv X_rsig->template mutable_data<ParamT, Context>())); // sv
} else { } else {
CUDNN_CHECK(cudnnBatchNormalizationForwardInference( CUDNN_CHECK(cudnnBatchNormalizationForwardInference(
ctx()->cudnn_handle(), ctx()->cudnn_handle(),
...@@ -58,10 +58,10 @@ void CuDNNBatchNormOp<Context>::DoRunWithType() { ...@@ -58,10 +58,10 @@ void CuDNNBatchNormOp<Context>::DoRunWithType() {
input_desc_, input_desc_,
Output(0)->template mutable_data<T, Context>(), // y Output(0)->template mutable_data<T, Context>(), // y
bn_desc_, bn_desc_,
Input(1).template data<ParamType, Context>(), // gamma Input(1).template data<ParamT, Context>(), // gamma
Input(2).template data<ParamType, Context>(), // beta Input(2).template data<ParamT, Context>(), // beta
Input(3).template data<ParamType, Context>(), // rm Input(3).template data<ParamT, Context>(), // rm
Input(4).template data<ParamType, Context>(), // rv Input(4).template data<ParamT, Context>(), // rv
epsilon_)); epsilon_));
} }
} }
...@@ -82,7 +82,7 @@ void CuDNNBatchNormOp<Context>::RunOnDevice() { ...@@ -82,7 +82,7 @@ void CuDNNBatchNormOp<Context>::RunOnDevice() {
template <class Context> template <class Context>
template <typename T> template <typename T>
void CuDNNBatchNormGradientOp<Context>::TrainingImpl() { void CuDNNBatchNormGradientOp<Context>::TrainingImpl() {
using ParamType = typename CuDNNType<T>::BNParamType; using ParamT = typename CuDNNType<T>::BNParamType;
auto *dX = Output(0), *dW = Output(1), *dB = Output(2); auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
auto *X_mu = Buffer("X_mu"), *X_rsig = Buffer("X_rsig"); auto *X_mu = Buffer("X_mu"), *X_rsig = Buffer("X_rsig");
...@@ -111,12 +111,12 @@ void CuDNNBatchNormGradientOp<Context>::TrainingImpl() { ...@@ -111,12 +111,12 @@ void CuDNNBatchNormGradientOp<Context>::TrainingImpl() {
input_desc_, input_desc_,
Output(0)->template mutable_data<T, Context>(), // dx Output(0)->template mutable_data<T, Context>(), // dx
bn_desc_, bn_desc_,
Input(1).template data<ParamType, Context>(), // gamma Input(1).template data<ParamT, Context>(), // gamma
dW->Reshape({C_})->template mutable_data<ParamType, Context>(), // dw dW->Reshape({C_})->template mutable_data<ParamT, Context>(), // dw
dB->Reshape({C_})->template mutable_data<ParamType, Context>(), // db dB->Reshape({C_})->template mutable_data<ParamT, Context>(), // db
epsilon_, epsilon_,
X_mu->template data<ParamType, Context>(), // mu X_mu->template data<ParamT, Context>(), // mu
X_rsig->template data<ParamType, Context>())); // rsig X_rsig->template data<ParamT, Context>())); // rsig
} }
template <class Context> template <class Context>
......
...@@ -8,9 +8,9 @@ namespace dragon { ...@@ -8,9 +8,9 @@ namespace dragon {
template <class Context> template <class Context>
template <typename T> template <typename T>
void GroupNormOp<Context>::DoRunWithType() { void GroupNormOp<Context>::DoRunWithType() {
using ParamType = typename math::utils::AccmulatorType<T>::type; using ParamT = typename math::utils::AccmulatorType<T>::type;
TENSOR_FILL_WITH_TYPE(Input(1), vec64_t({C_}), ParamType); TENSOR_FILL_WITH_TYPE(Input(1), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(2), vec64_t({C_}), ParamType); TENSOR_FILL_WITH_TYPE(Input(2), vec64_t({C_}), ParamT);
auto* X_mu = Buffer("X_mu")->Reshape({N_, G_}); auto* X_mu = Buffer("X_mu")->Reshape({N_, G_});
auto* X_rsig = Buffer("X_rsig")->Reshape({N_, G_}); auto* X_rsig = Buffer("X_rsig")->Reshape({N_, G_});
...@@ -18,8 +18,8 @@ void GroupNormOp<Context>::DoRunWithType() { ...@@ -18,8 +18,8 @@ void GroupNormOp<Context>::DoRunWithType() {
auto* X_bias = Buffer("X_bias")->Reshape({N_, C_}); auto* X_bias = Buffer("X_bias")->Reshape({N_, C_});
auto* x = Input(0).template data<T, Context>(); auto* x = Input(0).template data<T, Context>();
auto* mu = X_mu->template mutable_data<ParamType, Context>(); auto* mu = X_mu->template mutable_data<ParamT, Context>();
auto* rsig = X_rsig->template mutable_data<ParamType, Context>(); auto* rsig = X_rsig->template mutable_data<ParamT, Context>();
// Compute the moments // Compute the moments
if (data_format() == "NCHW") { if (data_format() == "NCHW") {
...@@ -45,10 +45,10 @@ void GroupNormOp<Context>::DoRunWithType() { ...@@ -45,10 +45,10 @@ void GroupNormOp<Context>::DoRunWithType() {
x, x,
mu, mu,
rsig, rsig,
Input(1).template data<ParamType, Context>(), // gamma Input(1).template data<ParamT, Context>(), // gamma
Input(2).template data<ParamType, Context>(), // beta Input(2).template data<ParamT, Context>(), // beta
X_scale->template mutable_data<ParamType, Context>(), X_scale->template mutable_data<ParamT, Context>(),
X_bias->template mutable_data<ParamType, Context>(), X_bias->template mutable_data<ParamT, Context>(),
Output(0)->template mutable_data<T, Context>(), Output(0)->template mutable_data<T, Context>(),
ctx()); ctx());
} }
...@@ -63,7 +63,7 @@ void GroupNormOp<Context>::RunOnDevice() { ...@@ -63,7 +63,7 @@ void GroupNormOp<Context>::RunOnDevice() {
template <class Context> template <class Context>
template <typename T> template <typename T>
void GroupNormGradientOp<Context>::DoRunWithType() { void GroupNormGradientOp<Context>::DoRunWithType() {
using ParamType = typename math::utils::AccmulatorType<T>::type; using ParamT = typename math::utils::AccmulatorType<T>::type;
auto *dX = Output(0), *dW = Output(1), *dB = Output(2); auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
auto *X_mu = Buffer("X_mu"), *X_rsig = Buffer("X_rsig"); auto *X_mu = Buffer("X_mu"), *X_rsig = Buffer("X_rsig");
...@@ -78,14 +78,14 @@ void GroupNormGradientOp<Context>::DoRunWithType() { ...@@ -78,14 +78,14 @@ void GroupNormGradientOp<Context>::DoRunWithType() {
S_, S_,
data_format(), data_format(),
Input(0).template data<T, Context>(), // x Input(0).template data<T, Context>(), // x
X_mu->template data<ParamType, Context>(), X_mu->template data<ParamT, Context>(),
X_rsig->template data<ParamType, Context>(), X_rsig->template data<ParamT, Context>(),
Input(1).template data<ParamType, Context>(), // gamma Input(1).template data<ParamT, Context>(), // gamma
Input(2).template data<T, Context>(), // dy Input(2).template data<T, Context>(), // dy
X_scale->template mutable_data<ParamType, Context>(), X_scale->template mutable_data<ParamT, Context>(),
X_bias->template mutable_data<ParamType, Context>(), X_bias->template mutable_data<ParamT, Context>(),
dW->Reshape({C_})->template mutable_data<ParamType, Context>(), dW->Reshape({C_})->template mutable_data<ParamT, Context>(),
dB->Reshape({C_})->template mutable_data<ParamType, Context>(), dB->Reshape({C_})->template mutable_data<ParamT, Context>(),
dX->template mutable_data<T, Context>(), dX->template mutable_data<T, Context>(),
ctx()); ctx());
} }
......
...@@ -183,7 +183,7 @@ def conv_spec(args, inputs, outputs): ...@@ -183,7 +183,7 @@ def conv_spec(args, inputs, outputs):
out_size = (in_size + pad_size - k_size) // s + 1 out_size = (in_size + pad_size - k_size) // s + 1
else: else:
out_size = (in_size + s - 1) // s out_size = (in_size + s - 1) // s
except IndexError: except (IndexError, TypeError):
out_size = None out_size = None
out_shape[i + spatial_axis] = out_size out_shape[i + spatial_axis] = out_size
except (TypeError, IndexError): except (TypeError, IndexError):
...@@ -205,6 +205,12 @@ def conv_transpose_spec(args, inputs, outputs): ...@@ -205,6 +205,12 @@ def conv_transpose_spec(args, inputs, outputs):
else: else:
out_shape[channel_axis] = inputs[1].shape[1] out_shape[channel_axis] = inputs[1].shape[1]
for i in range(num_axes): for i in range(num_axes):
if 'output_padding_desc' in args or \
'output_padding_descs' in args or \
'output_shape_desc' in args or \
'output_shape_descs' in args:
out_shape[i + spatial_axis] = None
continue
try: try:
k = args['kernel_shape'][i] k = args['kernel_shape'][i]
s = args['strides'][i] s = args['strides'][i]
...@@ -219,9 +225,9 @@ def conv_transpose_spec(args, inputs, outputs): ...@@ -219,9 +225,9 @@ def conv_transpose_spec(args, inputs, outputs):
else: else:
if 'output_shape' in args and args['output_shape']: if 'output_shape' in args and args['output_shape']:
out_size = args['output_shape'][i] out_size = args['output_shape'][i]
else: if 'output_padding' in args and args['output_padding']:
out_size = None out_size += args['output_padding'][i]
except IndexError: except (IndexError, TypeError):
out_size = None out_size = None
out_shape[i + spatial_axis] = out_size out_shape[i + spatial_axis] = out_size
except (TypeError, IndexError): except (TypeError, IndexError):
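When every term is concrete, the size computed above follows the standard transposed-convolution arithmetic (dilation omitted); a sketch consistent with the visible output_shape/output_padding branches:

def conv_transpose_out_size(in_size, k, s, pad_total, output_padding=0):
    """(in - 1) * stride + kernel - total_padding + output_padding (a sketch)."""
    return (in_size - 1) * s + k - pad_total + output_padding

assert conv_transpose_out_size(4, 3, 2, 0, output_padding=1) == 10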
...@@ -296,23 +302,28 @@ def eltwise_loss_spec(args, inputs, outputs): ...@@ -296,23 +302,28 @@ def eltwise_loss_spec(args, inputs, outputs):
@register('Expand') @register('Expand')
def expand_spec(args, inputs, outputs): def expand_spec(args, inputs, outputs):
outputs[0].dtype = inputs[0].dtype outputs[0].dtype = inputs[0].dtype
shape, out_shape = args['dims'], None
if shape is None:
return outputs
try: try:
in_shape, out_shape = list(inputs[0].shape[:]), list(shape[:]) out_shape = None
if len(shape) < len(in_shape): if 'dims_descs' in args:
num_keep = len(in_shape) - len(shape) out_shape = [None] * len(args['dims_descs'])
elif 'dims_desc' in args:
out_shape = [None] * len(inputs[0].shape)
elif 'dims' in args:
in_shape = list(inputs[0].shape[:])
dims = args['dims']
out_shape = list(dims[:])
if len(dims) < len(in_shape):
num_keep = len(in_shape) - len(dims)
out_shape = in_shape[:num_keep] + out_shape out_shape = in_shape[:num_keep] + out_shape
elif len(shape) > len(in_shape): elif len(dims) > len(in_shape):
num_expand = len(shape) - len(in_shape) num_expand = len(dims) - len(in_shape)
in_shape = [1] * num_expand + in_shape in_shape = [1] * num_expand + in_shape
for i, dim in enumerate(out_shape): for i, dim in enumerate(out_shape):
if dim is not None and dim < 0: if dim is not None and dim < 0:
out_shape[i] = in_shape[i] out_shape[i] = in_shape[i]
outputs[0].shape = out_shape outputs[0].shape = out_shape
except TypeError: except (KeyError, TypeError):
pass outputs[0].shape = None
return outputs return outputs
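A standalone restatement of the broadcasting rule in the 'dims' branch above, for a quick sanity check (negative entries keep the input dimension):

def expand_out_shape(in_shape, dims):
    """Broadcast `in_shape` against the requested `dims` (a sketch)."""
    in_shape, out_shape = list(in_shape), list(dims)
    if len(dims) < len(in_shape):
        out_shape = in_shape[:len(in_shape) - len(dims)] + out_shape
    elif len(dims) > len(in_shape):
        in_shape = [1] * (len(dims) - len(in_shape)) + in_shape
    return [in_shape[i] if d is not None and d < 0 else d
            for i, d in enumerate(out_shape)]

assert expand_out_shape([3, 1], [2, -1, 4]) == [2, 3, 4]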
...@@ -330,8 +341,7 @@ def expand_dims_spec(args, inputs, outputs): ...@@ -330,8 +341,7 @@ def expand_dims_spec(args, inputs, outputs):
out_shape[axis] = -1 out_shape[axis] = -1
j = 0 j = 0
for i in range(out_rank): for i in range(out_rank):
if out_shape[i] is not None and \ if out_shape[i] is not None and out_shape[i] < 0:
out_shape[i] < 0:
out_shape[i] = 1 out_shape[i] = 1
else: else:
if j >= len(inputs[0].shape): if j >= len(inputs[0].shape):
...@@ -358,6 +368,8 @@ def fill_spec(args, inputs, outputs): ...@@ -358,6 +368,8 @@ def fill_spec(args, inputs, outputs):
try: try:
if 'dims' in args: if 'dims' in args:
outputs[0].shape = args['dims'][:] outputs[0].shape = args['dims'][:]
elif 'dims_descs' in args:
outputs[0].shape = [None] * len(args['dims_descs'])
else: else:
outputs[0].shape = inputs[0].shape[:] outputs[0].shape = inputs[0].shape[:]
except (TypeError, KeyError, IndexError): except (TypeError, KeyError, IndexError):
...@@ -432,18 +444,20 @@ def fully_connected_spec(args, inputs, outputs): ...@@ -432,18 +444,20 @@ def fully_connected_spec(args, inputs, outputs):
@register('ChannelNormalize') @register('ChannelNormalize')
def channel_normalize_spec(args, inputs, outputs): def channel_normalize_spec(args, inputs, outputs):
outputs[0].dtype = args['dtype'] outputs[0].dtype = args['dtype']
perm = args['perm']
if 'perm_desc' in args or 'perm_descs' in args:
return outputs
try: try:
out_shape = list(inputs[0].shape[:])
if 'perm' in args:
perm = args['perm']
if perm is None: if perm is None:
perm = list(range((len(inputs[0].shape)))) perm = list(range(len(inputs[0].shape)))
out_shape = list(inputs[0].shape[:]) out_shape = list(inputs[0].shape[:])
for i, axis in enumerate(perm): for i, axis in enumerate(perm):
out_shape[i] = inputs[0].shape[axis] out_shape[i] = inputs[0].shape[axis]
except (TypeError, IndexError): else:
out_shape = None out_shape = [None] * len(out_shape)
outputs[0].shape = out_shape outputs[0].shape = out_shape
except (TypeError, IndexError):
outputs[0].shape = None
return outputs return outputs
...@@ -497,37 +511,45 @@ def masked_select_spec(args, inputs, outputs): ...@@ -497,37 +511,45 @@ def masked_select_spec(args, inputs, outputs):
def matmul_spec(args, inputs, outputs): def matmul_spec(args, inputs, outputs):
outputs[0].dtype = inputs[0].dtype outputs[0].dtype = inputs[0].dtype
ta, tb = args['transA'], args['transB'] ta, tb = args['transA'], args['transB']
out_shape = None
try: try:
b_shape = list(inputs[1].shape[:]) b_shape = list(inputs[1].shape[:])
a_shape = out_shape = list(inputs[0].shape[:]) a_shape = out_shape = list(inputs[0].shape[:])
out_shape[-2] = a_shape[-1] if ta else a_shape[-2] out_shape[-2] = a_shape[-1] if ta else a_shape[-2]
out_shape[-1] = b_shape[-2] if tb else b_shape[-1] out_shape[-1] = b_shape[-2] if tb else b_shape[-1]
except TypeError: except (TypeError, IndexError):
pass out_shape = None
outputs[0].shape = out_shape outputs[0].shape = out_shape
return outputs return outputs
@register('Moments') @register('Moments')
def moments_spec(args, inputs, outputs): def moments_spec(args, inputs, outputs):
outputs[0].dtype = outputs[1].dtype = \ out_dtype = 'float32'
inputs[0].dtype if inputs[0].dtype == 'float64' else 'float32' if inputs[0].dtype == 'float64':
out_dtype = 'float64'
elif inputs[0].dtype == 'int64':
out_dtype = 'float64'
outputs[0].dtype = outputs[1].dtype = out_dtype
axes, keep_dims = args['axes'], args['keep_dims'] axes, keep_dims = args['axes'], args['keep_dims']
try: try:
out_shape = list(inputs[0].shape[:]) out_shape = list(inputs[0].shape[:])
for axis in axes: for axis in axes:
if axis < len(out_shape): if axis < len(out_shape):
out_shape[axis] = 1 out_shape[axis] = -1
if not keep_dims: if not keep_dims:
squeezed_shape = [] squeezed_shape = []
for d in out_shape: for d in out_shape:
if d != 1: if d >= 0:
squeezed_shape.append(d) squeezed_shape.append(d)
out_shape = squeezed_shape out_shape = squeezed_shape
else:
out_shape = [1 if d < 0 else d for d in out_shape]
except TypeError: except TypeError:
if axes is None:
out_shape = (1,) if keep_dims else ()
else:
out_shape = None out_shape = None
outputs[0].shape = outputs[1].shape = out_shape if axes else () outputs[0].shape = outputs[1].shape = out_shape
return outputs return outputs
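Worked examples of the shape rule above: reduced axes become 1 under keep_dims, are squeezed out otherwise, and axes=None collapses everything:

def reduced_shape(in_shape, axes, keep_dims):
    """Mirror of the spec's shape inference (a sketch)."""
    if axes is None:
        return (1,) if keep_dims else ()
    if keep_dims:
        return tuple(1 if i in axes else d for i, d in enumerate(in_shape))
    return tuple(d for i, d in enumerate(in_shape) if i not in axes)

assert reduced_shape((2, 3, 4), [1], keep_dims=True) == (2, 1, 4)
assert reduced_shape((2, 3, 4), [1], keep_dims=False) == (2, 4)
assert reduced_shape((2, 3, 4), None, keep_dims=False) == ()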
...@@ -535,10 +557,11 @@ def moments_spec(args, inputs, outputs): ...@@ -535,10 +557,11 @@ def moments_spec(args, inputs, outputs):
def multinomial_spec(args, inputs, outputs): def multinomial_spec(args, inputs, outputs):
outputs[0].dtype = 'int64' outputs[0].dtype = 'int64'
try: try:
outputs[0].shape = inputs[0].shape[:] out_shape = list(inputs[0].shape[:])
outputs[0].shape[-1] = args['num_samples'] out_shape[-1] = args['num_samples']
except TypeError: except TypeError:
pass out_shape = None
outputs[0].shape = out_shape
return outputs return outputs
...@@ -584,11 +607,8 @@ def pad_spec(args, inputs, outputs): ...@@ -584,11 +607,8 @@ def pad_spec(args, inputs, outputs):
@register('Permutation') @register('Permutation')
def permutation_spec(args, inputs, outputs): def permutation_spec(args, inputs, outputs):
outputs[0].dtype = args['dtype'] outputs[0].dtype = args['dtype']
if len(inputs) == 1: if 'limit_desc' in args:
try: outputs[0].shape = (None,)
outputs[0].shape = inputs[0].shape[:]
except TypeError:
pass
else: else:
outputs[0].shape = (args['limit'],) outputs[0].shape = (args['limit'],)
return outputs return outputs
...@@ -599,7 +619,7 @@ def pool_spec(args, inputs, outputs): ...@@ -599,7 +619,7 @@ def pool_spec(args, inputs, outputs):
outputs[0].dtype = inputs[0].dtype outputs[0].dtype = inputs[0].dtype
out_shape = None out_shape = None
try: try:
out_shape = inputs[0].shape[:] out_shape = list(inputs[0].shape[:])
num_axes = len(out_shape) - 2 num_axes = len(out_shape) - 2
spatial_axis = 2 if args['data_format'] == 'NCHW' else 1 spatial_axis = 2 if args['data_format'] == 'NCHW' else 1
for i in range(num_axes): for i in range(num_axes):
...@@ -615,13 +635,13 @@ def pool_spec(args, inputs, outputs): ...@@ -615,13 +635,13 @@ def pool_spec(args, inputs, outputs):
out_size = floor_or_ceil(out_size) out_size = floor_or_ceil(out_size)
else: else:
out_size = math.ceil(float(in_size) / float(s)) out_size = math.ceil(float(in_size) / float(s))
except IndexError: except TypeError:
out_size = None out_size = None
out_shape[i + spatial_axis] = out_size out_shape[i + spatial_axis] = out_size
else: else:
out_shape[i + spatial_axis] = 1 out_shape[i + spatial_axis] = 1
except (TypeError, IndexError): except (TypeError, IndexError):
pass out_shape = None
outputs[0].shape = out_shape outputs[0].shape = out_shape
return outputs return outputs
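The per-axis size above follows the standard pooling arithmetic (the excerpt elides the exact expression, but the floor_or_ceil switch is visible); a hedged sketch:

import math

def pool_out_size(in_size, k, s, pad_total, ceil_mode=False):
    """floor_or_ceil((in + pads - kernel) / stride) + 1 (a sketch)."""
    val = (in_size + pad_total - k) / float(s) + 1.0
    return int(math.ceil(val) if ceil_mode else math.floor(val))

assert pool_out_size(8, 3, 2, 0) == 3
assert pool_out_size(8, 3, 2, 0, ceil_mode=True) == 4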
...@@ -641,7 +661,7 @@ def range_spec(args, inputs, outputs): ...@@ -641,7 +661,7 @@ def range_spec(args, inputs, outputs):
start, limit, delta = slice_args start, limit, delta = slice_args
try: try:
outputs[0].shape = (int(math.ceil((limit - start) / delta)),) outputs[0].shape = (int(math.ceil((limit - start) / delta)),)
except TypeError: except (TypeError, ZeroDivisionError):
pass pass
return outputs return outputs
...@@ -662,22 +682,26 @@ def reduce_spec(args, inputs, outputs): ...@@ -662,22 +682,26 @@ def reduce_spec(args, inputs, outputs):
out_shape = list(inputs[0].shape[:]) out_shape = list(inputs[0].shape[:])
for axis in axes: for axis in axes:
if axis < len(out_shape): if axis < len(out_shape):
out_shape[axis] = 1 out_shape[axis] = -1
if not keep_dims: if not keep_dims:
squeezed_shape = [] squeezed_shape = []
for d in out_shape: for d in out_shape:
if d != 1: if d >= 0:
squeezed_shape.append(d) squeezed_shape.append(d)
out_shape = squeezed_shape out_shape = squeezed_shape
else:
out_shape = [1 if d < 0 else d for d in out_shape]
except TypeError:
out_shape = None
outputs[0].shape = out_shape outputs[0].shape = out_shape
except (TypeError, IndexError):
pass
return outputs return outputs
@register('Repeat') @register('Repeat')
def repeat_spec(args, inputs, outputs): def repeat_spec(args, inputs, outputs):
outputs[0].dtype = inputs[0].dtype outputs[0].dtype = inputs[0].dtype
if 'repeats_desc' in args:
return outputs
axis, repeats = args['axis'], args['repeats'] axis, repeats = args['axis'], args['repeats']
if axis is None: if axis is None:
try: try:
...@@ -702,8 +726,8 @@ def repeat_spec(args, inputs, outputs): ...@@ -702,8 +726,8 @@ def repeat_spec(args, inputs, outputs):
@register('Reshape') @register('Reshape')
def reshape_spec(args, inputs, outputs): def reshape_spec(args, inputs, outputs):
outputs[0].dtype = inputs[0].dtype outputs[0].dtype = inputs[0].dtype
shape, out_shape = args['dims'], None
try: try:
shape = args['dims']
out_shape = [] out_shape = []
n_elements, n_elements_known = None, None n_elements, n_elements_known = None, None
try: try:
...@@ -714,7 +738,7 @@ def reshape_spec(args, inputs, outputs): ...@@ -714,7 +738,7 @@ def reshape_spec(args, inputs, outputs):
out_shape.append(inputs[0].shape[i]) out_shape.append(inputs[0].shape[i])
else: else:
out_shape.append(s) out_shape.append(s)
except TypeError: except IndexError:
out_shape = None out_shape = None
try: try:
n_elements = math_util.prod(inputs[0].shape) n_elements = math_util.prod(inputs[0].shape)
...@@ -727,8 +751,11 @@ def reshape_spec(args, inputs, outputs): ...@@ -727,8 +751,11 @@ def reshape_spec(args, inputs, outputs):
out_shape[i] = n_elements // n_elements_known out_shape[i] = n_elements // n_elements_known
except TypeError: except TypeError:
out_shape[i] = None out_shape[i] = None
except TypeError: except (KeyError, TypeError):
pass if 'dims_descs' in args:
out_shape = [None] * len(args['dims_descs'])
else:
out_shape = None
outputs[0].shape = out_shape outputs[0].shape = out_shape
return outputs return outputs
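The -1 placeholder is resolved by dividing the total element count by the product of the dimensions already known, and (as in the spec above) a 0 keeps the corresponding input dimension. A compact standalone version (plain Python; `math.prod` stands in for `math_util.prod`):

import math

def resolve_reshape(in_shape, dims):
    out = [in_shape[i] if d == 0 else d for i, d in enumerate(dims)]
    if -1 in out:
        n_known = -math.prod(out)  # the single -1 flips the sign of the product
        out[out.index(-1)] = math.prod(in_shape) // n_known
    return out

assert resolve_reshape((2, 3, 4), (0, -1)) == [2, 12]
assert resolve_reshape((2, 3, 4), (-1, 6)) == [4, 6]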
...@@ -736,13 +763,12 @@ def reshape_spec(args, inputs, outputs): ...@@ -736,13 +763,12 @@ def reshape_spec(args, inputs, outputs):
@register('Resize') @register('Resize')
def resize_spec(args, inputs, outputs): def resize_spec(args, inputs, outputs):
outputs[0].dtype = inputs[0].dtype outputs[0].dtype = inputs[0].dtype
if 'sizes_desc' in args or \
'sizes_descs' in args or \
'scales_desc' in args or \
'scales_descs' in args:
return outputs
try: try:
out_shape = list(inputs[0].shape[:]) out_shape = list(inputs[0].shape[:])
if 'sizes_desc' in args or 'sizes_descs' in args or \
'scales_desc' in args or 'scales_descs' in args:
outputs[0].shape = [None] * len(out_shape)
return outputs
num_axes = len(out_shape) - 2 num_axes = len(out_shape) - 2
axis = len(out_shape) - 2 if args['data_format'] == 'NCHW' else 1 axis = len(out_shape) - 2 if args['data_format'] == 'NCHW' else 1
try: try:
...@@ -756,12 +782,15 @@ def resize_spec(args, inputs, outputs): ...@@ -756,12 +782,15 @@ def resize_spec(args, inputs, outputs):
else: else:
out_shape[j] = args['sizes'][j] out_shape[j] = args['sizes'][j]
elif args['scales'] is not None: elif args['scales'] is not None:
try:
if len(args['scales']) == 1: if len(args['scales']) == 1:
out_shape[j] = int(out_shape[j] * args['scales'][0]) out_shape[j] = int(out_shape[j] * args['scales'][0])
elif len(args['scales']) == num_axes: elif len(args['scales']) == num_axes:
out_shape[j] = int(out_shape[j] * args['scales'][i]) out_shape[j] = int(out_shape[j] * args['scales'][i])
else: else:
out_shape[j] = int(out_shape[j] * args['sizes'][j]) out_shape[j] = int(out_shape[j] * args['scales'][j])
except TypeError:
out_shape[j] = None
except IndexError: except IndexError:
return outputs return outputs
outputs[0].shape = out_shape outputs[0].shape = out_shape
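With explicit `scales`, each spatial axis is resized independently and a single scale broadcasts to all of them. A standalone sketch for the NCHW case (plain Python; spatial axes assumed to start at index 2):

def resize_shape_nchw(in_shape, scales):
    out = list(in_shape)
    num_axes = len(out) - 2
    for i in range(num_axes):
        j = i + 2  # skip N and C
        s = scales[0] if len(scales) == 1 else scales[i]
        out[j] = int(out[j] * s)
    return out

assert resize_shape_nchw((1, 3, 8, 8), [2.0]) == [1, 3, 16, 16]
assert resize_shape_nchw((1, 3, 8, 8), [2.0, 0.5]) == [1, 3, 16, 4]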
...@@ -801,12 +830,10 @@ def shape_spec(args, inputs, outputs): ...@@ -801,12 +830,10 @@ def shape_spec(args, inputs, outputs):
@register('Slice') @register('Slice')
def slice_spec(args, inputs, outputs): def slice_spec(args, inputs, outputs):
outputs[0].dtype = inputs[0].dtype outputs[0].dtype = inputs[0].dtype
if 'starts_desc' in args or \ if 'starts_desc' in args or 'starts_descs' in args or \
'starts_descs' in args or \ 'sizes_desc' in args or 'sizes_descs' in args:
'sizes_desc' in args or \
'sizes_descs' in args:
return outputs return outputs
starts, sizes = args['starts'], args['sizes'] starts, sizes = list(args['starts']), list(args['sizes'])
try: try:
in_shape = inputs[0].shape[:] in_shape = inputs[0].shape[:]
ndim = len(in_shape) ndim = len(in_shape)
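For shape purposes each sliced dimension is just the requested size, with -1 taken here to mean "from `start` to the end of the axis"; the spec's exact sentinel handling is elided above, so this is a sketch under that assumption (plain Python):

def slice_out_dim(in_dim, start, size):
    # size == -1: take everything from `start` to the end of the axis.
    return in_dim - start if size == -1 else size

assert slice_out_dim(10, 2, -1) == 8
assert slice_out_dim(10, 2, 5) == 5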
...@@ -834,7 +861,7 @@ def slice_spec(args, inputs, outputs): ...@@ -834,7 +861,7 @@ def slice_spec(args, inputs, outputs):
def softmax_loss_spec(args, inputs, outputs): def softmax_loss_spec(args, inputs, outputs):
outputs[0].dtype = 'float32' outputs[0].dtype = 'float32'
axis, reduction = args['axis'], args['reduction'] axis, reduction = args['axis'], args['reduction']
if reduction != 'NONE': if reduction.upper() != 'NONE':
outputs[0].shape = () outputs[0].shape = ()
else: else:
try: try:
...@@ -894,8 +921,6 @@ def split_spec(args, inputs, outputs): ...@@ -894,8 +921,6 @@ def split_spec(args, inputs, outputs):
axis = args['axis'] axis = args['axis']
size_splits = args['size_splits'] size_splits = args['size_splits']
slice_points = args['slice_points'] slice_points = args['slice_points']
if slice_points is not None and len(slice_points) == 0:
slice_points = None
slice_offset = 0 slice_offset = 0
for i in range(len(outputs)): for i in range(len(outputs)):
try: try:
...@@ -905,10 +930,7 @@ def split_spec(args, inputs, outputs): ...@@ -905,10 +930,7 @@ def split_spec(args, inputs, outputs):
except TypeError: except TypeError:
return outputs return outputs
if size_splits is not None: if size_splits is not None:
try:
out_shape[axis] = size_splits[i] out_shape[axis] = size_splits[i]
except IndexError:
return outputs
elif slice_points is not None: elif slice_points is not None:
try: try:
if i < len(outputs) - 1: if i < len(outputs) - 1:
...@@ -917,16 +939,16 @@ def split_spec(args, inputs, outputs): ...@@ -917,16 +939,16 @@ def split_spec(args, inputs, outputs):
else: else:
slice_dim = inputs[0].shape[axis] - slice_offset slice_dim = inputs[0].shape[axis] - slice_offset
out_shape[axis] = slice_dim out_shape[axis] = slice_dim
except (TypeError, IndexError): except TypeError:
return outputs out_shape[axis] = None
else: else:
try: try:
slice_dim = (out_shape[axis] + num_outputs - 1) // num_outputs slice_dim = (out_shape[axis] + num_outputs - 1) // num_outputs
if i == num_outputs - 1: if i == num_outputs - 1:
slice_dim = out_shape[axis] - slice_dim * (num_outputs - 1) slice_dim = out_shape[axis] - slice_dim * (num_outputs - 1)
out_shape[axis] = slice_dim out_shape[axis] = slice_dim
except (TypeError, IndexError): except TypeError:
return outputs out_shape[axis] = None
outputs[i].shape = out_shape outputs[i].shape = out_shape
return outputs return outputs
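The even-split branch above gives every output the ceiling chunk and leaves the remainder to the last one; `size_splits` and `slice_points` override it. A standalone check of the even case (plain Python):

def even_split(dim, num_outputs):
    chunk = (dim + num_outputs - 1) // num_outputs  # ceiling division
    sizes = [chunk] * (num_outputs - 1)
    sizes.append(dim - chunk * (num_outputs - 1))   # remainder for the last output
    return sizes

assert even_split(10, 3) == [4, 4, 2]
assert sum(even_split(10, 3)) == 10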
...@@ -988,34 +1010,38 @@ def stack_spec(args, inputs, outputs): ...@@ -988,34 +1010,38 @@ def stack_spec(args, inputs, outputs):
@register('Tile') @register('Tile')
def tile_spec(args, inputs, outputs): def tile_spec(args, inputs, outputs):
outputs[0].dtype = inputs[0].dtype outputs[0].dtype = inputs[0].dtype
repeats = args['repeats']
if repeats is not None:
try: try:
out_shape = list(inputs[0].shape[:]) out_shape = list(inputs[0].shape[:])
if 'repeats' in args:
repeats = args['repeats']
for i, size in enumerate(repeats): for i, size in enumerate(repeats):
if i < len(out_shape): if i < len(out_shape):
try: try:
out_shape[i] *= size out_shape[i] *= size
except TypeError: except TypeError:
out_shape[i] = None out_shape[i] = None
else:
out_shape = [None] * len(out_shape)
outputs[0].shape = out_shape outputs[0].shape = out_shape
except TypeError: except (KeyError, TypeError):
pass outputs[0].shape = None
return outputs return outputs
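Tiling multiplies each dimension by its repeat count, and an unknown dimension stays unknown. A standalone sketch (plain Python, using None for unknown sizes as the specs do):

def tile_shape(in_shape, repeats):
    out = list(in_shape)
    for i, r in enumerate(repeats):
        if i < len(out):
            out[i] = None if out[i] is None else out[i] * r
    return out

assert tile_shape((2, 3), (2, 2)) == [4, 6]
assert tile_shape((2, None), (2, 2)) == [4, None]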
@register('Transpose') @register('Transpose')
def transpose_spec(args, inputs, outputs): def transpose_spec(args, inputs, outputs):
outputs[0].dtype = inputs[0].dtype outputs[0].dtype = inputs[0].dtype
if 'perm_desc' in args or 'perm_descs' in args:
return outputs
try: try:
out_shape = list(inputs[0].shape[:])
if 'perm' in args:
perm = args['perm'] perm = args['perm']
if perm is None: if perm is None:
perm = list(range(len(inputs[0].shape) - 1, -1, -1)) perm = list(range(len(inputs[0].shape) - 1, -1, -1))
out_shape = list(inputs[0].shape[:]) out_shape = list(inputs[0].shape[:])
for i, axis in enumerate(perm): for i, axis in enumerate(perm):
out_shape[i] = inputs[0].shape[axis] out_shape[i] = inputs[0].shape[axis]
else:
out_shape = [None] * len(out_shape)
outputs[0].shape = out_shape outputs[0].shape = out_shape
except (TypeError, IndexError): except (TypeError, IndexError):
outputs[0].shape = None outputs[0].shape = None
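As in NumPy, a missing `perm` defaults to reversing the axes, and otherwise output dimension i comes from input dimension perm[i]. A standalone sketch (plain Python):

def transpose_shape(in_shape, perm=None):
    if perm is None:
        perm = range(len(in_shape) - 1, -1, -1)  # reverse by default
    return [in_shape[axis] for axis in perm]

assert transpose_shape((2, 3, 4)) == [4, 3, 2]
assert transpose_shape((2, 3, 4), perm=(0, 2, 1)) == [2, 4, 3]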
......
...@@ -58,7 +58,7 @@ def dropout(inputs, ratio=0.5, **kwargs): ...@@ -58,7 +58,7 @@ def dropout(inputs, ratio=0.5, **kwargs):
if context.executing_eagerly(): if context.executing_eagerly():
return op_lib \ return op_lib \
.instantiate() \ .instantiate() \
.apply([inputs], ratio, inplace=inplace) .apply([inputs], args['ratio'], inplace=inplace)
else: else:
return op_lib.blend(**args) return op_lib.blend(**args)
...@@ -103,7 +103,7 @@ def drop_block2d(inputs, ratio=0.1, block_size=7, data_format='NCHW', **kwargs): ...@@ -103,7 +103,7 @@ def drop_block2d(inputs, ratio=0.1, block_size=7, data_format='NCHW', **kwargs):
.instantiate( .instantiate(
block_size=block_size, block_size=block_size,
data_format=data_format, data_format=data_format,
).apply([inputs], ratio, inplace=inplace) ).apply([inputs], args['ratio'], inplace=inplace)
else: else:
return op_lib.blend(**args) return op_lib.blend(**args)
...@@ -137,7 +137,7 @@ def drop_path(inputs, ratio=0.2, **kwargs): ...@@ -137,7 +137,7 @@ def drop_path(inputs, ratio=0.2, **kwargs):
if context.executing_eagerly(): if context.executing_eagerly():
return op_lib \ return op_lib \
.instantiate() \ .instantiate() \
.apply([inputs], ratio, inplace=inplace) .apply([inputs], args['ratio'], inplace=inplace)
else: else:
return op_lib.blend(**args) return op_lib.blend(**args)
......
...@@ -205,9 +205,8 @@ def broadcast_to(inputs, shape, **kwargs): ...@@ -205,9 +205,8 @@ def broadcast_to(inputs, shape, **kwargs):
op_lib = array_ops_lib.Expand op_lib = array_ops_lib.Expand
if context.executing_eagerly(): if context.executing_eagerly():
return op_lib \ return op_lib \
.instantiate( .instantiate(ndim=len(args['dims'])) \
ndim=len(args['dims']), .apply([inputs], args['dims'])
).apply([inputs], args['dims'])
else: else:
return op_lib.blend(**args) return op_lib.blend(**args)
...@@ -1163,6 +1162,7 @@ def pad(inputs, pads, mode='constant', value=0, **kwargs): ...@@ -1163,6 +1162,7 @@ def pad(inputs, pads, mode='constant', value=0, **kwargs):
return op_lib.blend(**args) return op_lib.blend(**args)
@ArgHelper.desc('limit', as_target=True)
def permutation(limit, dtype='int64', **kwargs): def permutation(limit, dtype='int64', **kwargs):
r"""Return a tensor with value in the permuted range. r"""Return a tensor with value in the permuted range.
...@@ -1174,7 +1174,7 @@ def permutation(limit, dtype='int64', **kwargs): ...@@ -1174,7 +1174,7 @@ def permutation(limit, dtype='int64', **kwargs):
Parameters Parameters
---------- ----------
limit: number limit: Union[number, dragon.Tensor]
The end of interval. The end of interval.
dtype : str, optional, default='int64' dtype : str, optional, default='int64'
The optional data type. The optional data type.
...@@ -1192,7 +1192,7 @@ def permutation(limit, dtype='int64', **kwargs): ...@@ -1192,7 +1192,7 @@ def permutation(limit, dtype='int64', **kwargs):
if context.executing_eagerly(): if context.executing_eagerly():
return op_lib \ return op_lib \
.instantiate(dtype=dtype) \ .instantiate(dtype=dtype) \
.apply(limit, trainable=trainable) .apply(args['limit'], trainable=trainable)
else: else:
return op_lib.blend(**args) return op_lib.blend(**args)
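With `limit` registered via `ArgHelper.desc`, the eager path fetches its value at run time, so a scalar tensor works as well as a Python number. A hypothetical usage sketch (`dragon.constant` is assumed to exist here; only `dragon.random.permutation(5)` is exercised by the tests below):

import dragon

y1 = dragon.random.permutation(5)          # plain number, as before
limit = dragon.constant(5, dtype='int64')  # assumed API for a scalar tensor
y2 = dragon.random.permutation(limit)      # tensor limit, enabled by the decorator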
......
...@@ -49,10 +49,11 @@ def assign(inputs, starts=None, sizes=None, **kwargs): ...@@ -49,10 +49,11 @@ def assign(inputs, starts=None, sizes=None, **kwargs):
inputs[1] = ops.scalar_to_tensor(inputs[1], inputs[0].dtype) inputs[1] = ops.scalar_to_tensor(inputs[1], inputs[0].dtype)
op_lib = control_flow_ops_lib.Assign op_lib = control_flow_ops_lib.Assign
if context.executing_eagerly(): if context.executing_eagerly():
starts = args['starts'] if starts is not None else [0]
sizes = args['sizes'] if sizes is not None else [-1]
return op_lib \ return op_lib \
.instantiate( .instantiate(ndim=len(starts)) \
ndim=len(starts) if starts is not None else 0, .apply(inputs, starts, sizes, inplace=inplace)
).apply(inputs, starts, sizes, inplace=inplace)
else: else:
return op_lib.blend(**args) return op_lib.blend(**args)
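The eager branch now normalizes the missing coordinates itself: no `starts` means "from 0" and no `sizes` means "the whole extent", so `instantiate` always sees a consistent `ndim`. A standalone sketch of that defaulting (plain Python):

def normalize_assign_args(starts, sizes):
    # Missing coordinates: start at 0 and span every axis.
    starts = list(starts) if starts is not None else [0]
    sizes = list(sizes) if sizes is not None else [-1]
    return starts, sizes

assert normalize_assign_args(None, None) == ([0], [-1])
assert normalize_assign_args([1, 2], [3, 4]) == ([1, 2], [3, 4])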
......
...@@ -23,6 +23,7 @@ from dragon.core.util import nest ...@@ -23,6 +23,7 @@ from dragon.core.util import nest
@OpSchema.num_inputs(5) @OpSchema.num_inputs(5)
@ArgHelper.desc('momentum', as_target=False)
def batch_norm( def batch_norm(
inputs, inputs,
axis=-1, axis=-1,
...@@ -40,7 +41,8 @@ def batch_norm( ...@@ -40,7 +41,8 @@ def batch_norm(
The running average of statistics is calculated as: The running average of statistics is calculated as:
.. math:: x_{\text{running}} = \text{momentum} * x_{\text{running}} + (1 - \text{momentum}) * x_{\text{stat}} .. math:: x_{\text{running}} = \text{momentum} * x_{\text{running}} +
(1 - \text{momentum}) * x_{\text{batch}}
Parameters Parameters
---------- ----------
...@@ -48,8 +50,8 @@ def batch_norm( ...@@ -48,8 +50,8 @@ def batch_norm(
The tensor ``x``, ``gamma``, ``beta``, ``mean`` and ``var``. The tensor ``x``, ``gamma``, ``beta``, ``mean`` and ``var``.
axis : int, optional, default=-1 axis : int, optional, default=-1
The channel axis. The channel axis.
momentum : float, optional, default=0.9 momentum : Union[float, dragon.Tensor], optional
The momentum for running average. The value to :math:`\text{momentum}`.
epsilon : float, optional, default=1e-5 epsilon : float, optional, default=1e-5
The value to :math:`\epsilon`. The value to :math:`\epsilon`.
use_stats : int, optional, default=-1 use_stats : int, optional, default=-1
...@@ -62,16 +64,15 @@ def batch_norm( ...@@ -62,16 +64,15 @@ def batch_norm(
""" """
args = ArgHelper.parse(locals()) args = ArgHelper.parse(locals())
args['momentum'], args['epsilon'] = float(momentum), float(epsilon) args['epsilon'] = float(epsilon)
op_lib = normalization_ops_lib.BatchNorm op_lib = normalization_ops_lib.BatchNorm
if context.executing_eagerly(): if context.executing_eagerly():
return op_lib \ return op_lib \
.instantiate( .instantiate(
axis=axis, axis=axis,
momentum=args['momentum'],
epsilon=args['epsilon'], epsilon=args['epsilon'],
use_stats=use_stats, use_stats=use_stats,
).apply(inputs) ).apply(inputs, args['momentum'])
else: else:
return op_lib.blend(**args) return op_lib.blend(**args)
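Since `momentum` is now a described argument, it may be a float or a scalar tensor that is re-fed between steps, and the eager operator receives it per call rather than at instantiation. A hypothetical graph-mode sketch (tensor shapes and `dragon.constant` are assumptions for illustration):

import dragon

with dragon.graph_mode():
    x = dragon.Tensor((2, 3, 4, 4), 'float32')
    gamma, beta = dragon.Tensor((3,), 'float32'), dragon.Tensor((3,), 'float32')
    mean, var = dragon.Tensor((3,), 'float32'), dragon.Tensor((3,), 'float32')
    # Momentum as a plain float, as before ...
    y1 = dragon.nn.batch_norm([x, gamma, beta, mean, var], axis=1, momentum=0.9)
    # ... or as a scalar tensor a schedule can update in place (assumed API).
    m = dragon.constant(0.9, dtype='float32')
    y2 = dragon.nn.batch_norm([x, gamma, beta, mean, var], axis=1, momentum=m)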
...@@ -304,6 +305,7 @@ def local_response_norm( ...@@ -304,6 +305,7 @@ def local_response_norm(
@OpSchema.num_inputs(5) @OpSchema.num_inputs(5)
@ArgHelper.desc('momentum', as_target=False)
def sync_batch_norm( def sync_batch_norm(
inputs, inputs,
axis=-1, axis=-1,
...@@ -322,7 +324,8 @@ def sync_batch_norm( ...@@ -322,7 +324,8 @@ def sync_batch_norm(
The running average of statistics is calculated as: The running average of statistics is calculated as:
.. math:: x_{\text{running}} = \text{momentum} * x_{\text{running}} + (1 - \text{momentum}) * x_{\text{stat}} .. math:: x_{\text{running}} = \text{momentum} * x_{\text{running}} +
(1 - \text{momentum}) * x_{\text{batch}}
Parameters Parameters
---------- ----------
...@@ -330,8 +333,8 @@ def sync_batch_norm( ...@@ -330,8 +333,8 @@ def sync_batch_norm(
The tensor ``x``, ``gamma``, ``beta``, ``mean`` and ``var``. The tensor ``x``, ``gamma``, ``beta``, ``mean`` and ``var``.
axis : int, optional, default=-1 axis : int, optional, default=-1
The channel axis. The channel axis.
momentum : float, optional, default=0.9 momentum : Union[float, dragon.Tensor], optional
The momentum for average. The value to :math:`\text{momentum}`.
epsilon : float, optional, default=1e-5 epsilon : float, optional, default=1e-5
The value to :math:`\epsilon`. The value to :math:`\epsilon`.
use_stats : int, optional, default=-1 use_stats : int, optional, default=-1
...@@ -346,7 +349,7 @@ def sync_batch_norm( ...@@ -346,7 +349,7 @@ def sync_batch_norm(
""" """
args = ArgHelper.parse(locals()) args = ArgHelper.parse(locals())
args['momentum'], args['epsilon'] = float(momentum), float(epsilon) args['epsilon'] = float(epsilon)
if process_group is None: if process_group is None:
process_group = distributed.get_group() process_group = distributed.get_group()
if process_group is None: if process_group is None:
...@@ -356,11 +359,10 @@ def sync_batch_norm( ...@@ -356,11 +359,10 @@ def sync_batch_norm(
return op_lib \ return op_lib \
.instantiate( .instantiate(
axis=axis, axis=axis,
momentum=args['momentum'],
epsilon=args['epsilon'], epsilon=args['epsilon'],
use_stats=use_stats, use_stats=use_stats,
process_group=process_group, process_group=process_group,
).apply(inputs) ).apply(inputs, args['momentum'])
else: else:
args.update(process_group.arguments) args.update(process_group.arguments)
return op_lib.blend(**args) return op_lib.blend(**args)
...@@ -23,7 +23,6 @@ class BatchNorm(Operator): ...@@ -23,7 +23,6 @@ class BatchNorm(Operator):
def __init__(self, key, dev, **kwargs): def __init__(self, key, dev, **kwargs):
super(BatchNorm, self).__init__(key, dev, **kwargs) super(BatchNorm, self).__init__(key, dev, **kwargs)
self.axis = kwargs.get('axis', -1) self.axis = kwargs.get('axis', -1)
self.momentum = kwargs.get('momentum', 0.9)
self.epsilon = kwargs.get('epsilon', 1e-5) self.epsilon = kwargs.get('epsilon', 1e-5)
self.use_stats = kwargs.get('use_stats', 0) self.use_stats = kwargs.get('use_stats', 0)
if self.use_stats not in (0, 1): if self.use_stats not in (0, 1):
...@@ -34,14 +33,21 @@ class BatchNorm(Operator): ...@@ -34,14 +33,21 @@ class BatchNorm(Operator):
'op_type': 'BatchNorm', 'op_type': 'BatchNorm',
'arguments': { 'arguments': {
'axis': self.axis, 'axis': self.axis,
'momentum': self.momentum,
'epsilon': self.epsilon, 'epsilon': self.epsilon,
'use_stats': self.use_stats, 'use_stats': self.use_stats,
'momentum_desc': '${HANDLE}/momentum',
}, },
} }
def forward(self, inputs): def setup(self, ws, handle, momentum):
return self.dispatch(inputs, [self.alloc()]) self.feed_arg(ws, '%s/momentum' % handle, momentum, 'float32')
def forward(self, inputs, momentum):
return self.dispatch(
inputs, [self.alloc()],
callback=lambda ws, handle:
self.setup(ws, handle, momentum),
)
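The dispatch callback is what ties the `momentum_desc` placeholder to a concrete value: once the op is instantiated with a handle, `${HANDLE}/momentum` resolves to a workspace tensor that `setup` refreshes on every forward. A simplified standalone sketch of that flow (plain Python; `Workspace` and `dispatch` here are stand-ins, not the real machinery):

class Workspace(object):
    def __init__(self):
        self.tensors = {}

    def feed(self, name, value):
        self.tensors[name] = value

def dispatch(ws, handle, callback=None):
    # The real dispatcher resolves '${HANDLE}/momentum' against `handle`
    # and runs the callback just before launching the kernel.
    if callback is not None:
        callback(ws, handle)

ws = Workspace()
dispatch(ws, 'BatchNorm_1',
         callback=lambda ws, handle: ws.feed(handle + '/momentum', 0.9))
assert ws.tensors['BatchNorm_1/momentum'] == 0.9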
class GroupNorm(Operator): class GroupNorm(Operator):
......
...@@ -118,6 +118,7 @@ class ArgHelper(object): ...@@ -118,6 +118,7 @@ class ArgHelper(object):
if 'extra_inputs' not in arguments: if 'extra_inputs' not in arguments:
arguments['extra_inputs'] = [] arguments['extra_inputs'] = []
arguments['extra_inputs'] += [arg] arguments['extra_inputs'] += [arg]
if name in arguments:
arguments.pop(name) arguments.pop(name)
arguments[name + '_desc'] = arg.id arguments[name + '_desc'] = arg.id
return arguments return arguments
...@@ -141,5 +142,6 @@ class ArgHelper(object): ...@@ -141,5 +142,6 @@ class ArgHelper(object):
descs.append(ele.id) descs.append(ele.id)
else: else:
descs.append(Tensor.from_value(ele, dtype, 'DescConst').id) descs.append(Tensor.from_value(ele, dtype, 'DescConst').id)
if name in arguments:
arguments.pop(name) arguments.pop(name)
arguments[name + '_descs'] = descs arguments[name + '_descs'] = descs
...@@ -176,9 +176,12 @@ def conv2d_transpose( ...@@ -176,9 +176,12 @@ def conv2d_transpose(
raise ValueError('Unsupported padding algorithm: %s' % padding) raise ValueError('Unsupported padding algorithm: %s' % padding)
if data_format not in ('NCHW', 'NHWC'): if data_format not in ('NCHW', 'NHWC'):
raise ValueError('Unsupported data format: %s' % data_format) raise ValueError('Unsupported data format: %s' % data_format)
if 'SAME' in padding and output_shape is None:
raise ValueError('Expected <output_shape> for same padding.')
if output_shape is not None and 'SAME' not in padding: if output_shape is not None and 'SAME' not in padding:
args['padding'] = 'SAME' args['padding'] = 'SAME'
for key in ('kernel_shape', 'strides', 'pads', 'dilations'): for key in ('kernel_shape', 'strides', 'pads', 'dilations'):
if key in args and args[key] is not None:
if key == 'pads': if key == 'pads':
args[key] = _normalize_pads(args[key], 2) args[key] = _normalize_pads(args[key], 2)
else: else:
......
...@@ -26,7 +26,7 @@ def dropout_exporter(op_def, context): ...@@ -26,7 +26,7 @@ def dropout_exporter(op_def, context):
drop_ratio = arg.f drop_ratio = arg.f
elif arg.name == 'prob_desc': elif arg.name == 'prob_desc':
drop_ratio = helper.fetch_argument(op_def, arg, context.ws) drop_ratio = helper.fetch_argument(op_def, arg, context.ws)
helper.add_attribute(node, 'ratio', drop_ratio) helper.add_attribute(node, 'ratio', float(drop_ratio))
return node, const_tensors return node, const_tensors
......
...@@ -26,6 +26,9 @@ def batch_norm_exporter(op_def, context): ...@@ -26,6 +26,9 @@ def batch_norm_exporter(op_def, context):
helper.add_attribute(node, 'epsilon', arg.f) helper.add_attribute(node, 'epsilon', arg.f)
elif arg.name == 'momentum': elif arg.name == 'momentum':
helper.add_attribute(node, 'momentum', arg.f) helper.add_attribute(node, 'momentum', arg.f)
elif arg.name == 'momentum_desc':
momentum = helper.fetch_argument(op_def, arg, context.ws)
helper.add_attribute(node, 'momentum', float(momentum))
# Weight, bias, running mean and running variance # Weight, bias, running mean and running variance
const_tensors = [helper.from_tensor(e, context.ws) for e in op_def.input[1:]] const_tensors = [helper.from_tensor(e, context.ws) for e in op_def.input[1:]]
return node, const_tensors return node, const_tensors
......
...@@ -123,23 +123,51 @@ CONVERSIONS_DECL float16 To<float16, half>(half val) { ...@@ -123,23 +123,51 @@ CONVERSIONS_DECL float16 To<float16, half>(half val) {
} }
template <> template <>
CONVERSIONS_DECL half To<half, float>(float val) { CONVERSIONS_DECL half To<half, float16>(float16 val) {
return __float2half(val); return __half_raw{val.x};
} }
template <> template <>
CONVERSIONS_DECL half To<half, float16>(float16 val) { CONVERSIONS_DECL half2 To<half2, float16>(float16 val) {
return __half_raw{val.x}; return half2(__half2_raw{val.x, val.x});
} }
template <> template <>
CONVERSIONS_DECL half2 To<half2, float>(float val) { CONVERSIONS_DECL half To<half, float>(float val) {
return __float2half2_rn(val); #if CUDA_VERSION_MIN(9, 2, 0)
return __float2half(val);
#else
#if defined(__CUDA_ARCH__)
#define __HALF_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
__half ret;
asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(ret)) : "f"(val));
return ret;
#undef __HALF_TO_US
#else
return To<half>(To<float16>(val));
#endif
#endif
} }
template <> template <>
CONVERSIONS_DECL half2 To<half2, float16>(float16 val) { CONVERSIONS_DECL half2 To<half2, float>(float val) {
return half2(__half2_raw{val.x, val.x}); #if CUDA_VERSION_MIN(9, 2, 0)
return __float2half2_rn(val);
#else
#if defined(__CUDA_ARCH__)
#define __HALF2_TO_UI(var) *(reinterpret_cast<unsigned int*>(&(var)))
__half2 ret;
asm("{.reg .f16 low;\n"
" cvt.rn.f16.f32 low, %1;\n"
" mov.b32 %0, {low,low};}\n"
: "=r"(__HALF2_TO_UI(ret))
: "f"(val));
return ret;
#undef __HALF2_TO_UI
#else
return To<half2>(To<float16>(val));
#endif
#endif
} }
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -162,23 +162,17 @@ __global__ void _InvStd(const int n, const T eps, const T* x, T* y) { ...@@ -162,23 +162,17 @@ __global__ void _InvStd(const int n, const T eps, const T* x, T* y) {
} }
} }
template <> __global__ void _InvStd(const int n, const float eps, const half* x, half* y) {
__global__ void
_InvStd<half>(const int n, const half eps, const half* x, half* y) {
CUDA_1D_KERNEL_LOOP(i, n) { CUDA_1D_KERNEL_LOOP(i, n) {
#if __CUDA_ARCH__ >= 530 y[i] = __float2half(rsqrt(__half2float(x[i]) + eps));
y[i] = hrsqrt(__hadd(x[i], eps));
#endif
} }
} }
template <>
__global__ void __global__ void
_InvStd<half2>(const int n, const half2 eps, const half2* x, half2* y) { _InvStd(const int n, const float eps, const half2* x, half2* y) {
CUDA_1D_KERNEL_LOOP(i, n) { CUDA_1D_KERNEL_LOOP(i, n) {
#if __CUDA_ARCH__ >= 530 const float2 val = __half22float2(x[i]);
y[i] = h2rsqrt(__hadd2(x[i], eps)); y[i] = __floats2half2_rn(rsqrt(val.x + eps), rsqrt(val.y + eps));
#endif
} }
} }
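The rewritten half kernels drop the `__CUDA_ARCH__ >= 530` guard by converting to float, computing, and rounding once back to half, which also keeps the intermediate in higher precision. A NumPy sketch of the same compute-in-float32, store-in-float16 pattern (illustrative only, not the CUDA code):

import numpy as np

x = np.asarray([0.5, 1.0, 2.0], dtype=np.float16)
eps = 1e-5
# Promote to float32 for the rsqrt, then round once back to half.
y = (1.0 / np.sqrt(x.astype(np.float32) + eps)).astype(np.float16)
assert y.dtype == np.float16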
...@@ -206,19 +200,15 @@ __global__ void _Powx(const int n, const T exponent, const T* x, T* y) { ...@@ -206,19 +200,15 @@ __global__ void _Powx(const int n, const T exponent, const T* x, T* y) {
__global__ void __global__ void
_Powx(const int n, const float exponent, const half* x, half* y) { _Powx(const int n, const float exponent, const half* x, half* y) {
CUDA_1D_KERNEL_LOOP(i, n) { CUDA_1D_KERNEL_LOOP(i, n) {
#if __CUDA_ARCH__ >= 530
y[i] = __float2half(pow(__half2float(x[i]), exponent)); y[i] = __float2half(pow(__half2float(x[i]), exponent));
#endif
} }
} }
__global__ void __global__ void
_Powx(const int n, const float exponent, const half2* x, half2* y) { _Powx(const int n, const float exponent, const half2* x, half2* y) {
CUDA_1D_KERNEL_LOOP(i, n) { CUDA_1D_KERNEL_LOOP(i, n) {
#if __CUDA_ARCH__ >= 530
const float2 val = __half22float2(x[i]); const float2 val = __half22float2(x[i]);
y[i] = __floats2half2_rn(pow(val.x, exponent), pow(val.y, exponent)); y[i] = __floats2half2_rn(pow(val.x, exponent), pow(val.y, exponent));
#endif
} }
} }
...@@ -269,20 +259,16 @@ __global__ void _Square(const int n, const T* x, T* y) { ...@@ -269,20 +259,16 @@ __global__ void _Square(const int n, const T* x, T* y) {
template <typename T> template <typename T>
__global__ void _NotZero(const int nthreads, const T* x, bool* y) { __global__ void _NotZero(const int nthreads, const T* x, bool* y) {
const T kZero = T(0);
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = x[i] != kZero ? true : false; y[i] = x[i] != T(0) ? true : false;
} }
} }
template <> template <>
__global__ void _NotZero<half>(const int nthreads, const half* x, bool* y) { __global__ void _NotZero<half>(const int nthreads, const half* x, bool* y) {
#if __CUDA_ARCH__ >= 530
const half kZero = __float2half(0.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = __hne(x[i], kZero) ? true : false; y[i] = __half2float(x[i]) != 0.f ? true : false;
} }
#endif
} }
template <typename T> template <typename T>
...@@ -560,15 +546,12 @@ DRAGON_API void InvStd<float16, CUDAContext>( ...@@ -560,15 +546,12 @@ DRAGON_API void InvStd<float16, CUDAContext>(
if ((n & 1) == 0) { if ((n & 1) == 0) {
_InvStd<<<CUDA_BLOCKS(n >> 1), CUDA_THREADS, 0, ctx->cuda_stream()>>>( _InvStd<<<CUDA_BLOCKS(n >> 1), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
n >> 1, n >> 1,
convert::To<half2>(eps), eps,
reinterpret_cast<const half2*>(x), reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y)); reinterpret_cast<half2*>(y));
} else { } else {
_InvStd<<<CUDA_BLOCKS(n), CUDA_THREADS, 0, ctx->cuda_stream()>>>( _InvStd<<<CUDA_BLOCKS(n), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
n, n, eps, reinterpret_cast<const half*>(x), reinterpret_cast<half*>(y));
convert::To<half>(eps),
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
} }
} }
......
...@@ -26,7 +26,7 @@ namespace math { ...@@ -26,7 +26,7 @@ namespace math {
template <typename T> template <typename T>
struct MaxFunctor { struct MaxFunctor {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const { inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs < rhs ? rhs : lhs; return lhs < rhs ? rhs : lhs;
} }
...@@ -39,7 +39,7 @@ struct MaxFunctor { ...@@ -39,7 +39,7 @@ struct MaxFunctor {
template <> template <>
struct MaxFunctor<float16> { struct MaxFunctor<float16> {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ float16 inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const { operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
...@@ -62,7 +62,7 @@ struct MaxFunctor<float16> { ...@@ -62,7 +62,7 @@ struct MaxFunctor<float16> {
#endif #endif
}; };
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
template <> template <>
struct MaxFunctor<half> { struct MaxFunctor<half> {
inline __device__ half operator()(const half& lhs, const half& rhs) const { inline __device__ half operator()(const half& lhs, const half& rhs) const {
...@@ -87,7 +87,7 @@ struct MaxFunctor<half2> { ...@@ -87,7 +87,7 @@ struct MaxFunctor<half2> {
template <typename T> template <typename T>
struct MinFunctor { struct MinFunctor {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const { inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs < rhs ? lhs : rhs; return lhs < rhs ? lhs : rhs;
} }
...@@ -100,7 +100,7 @@ struct MinFunctor { ...@@ -100,7 +100,7 @@ struct MinFunctor {
template <> template <>
struct MinFunctor<float16> { struct MinFunctor<float16> {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ float16 inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const { operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
...@@ -123,7 +123,7 @@ struct MinFunctor<float16> { ...@@ -123,7 +123,7 @@ struct MinFunctor<float16> {
#endif #endif
}; };
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
template <> template <>
struct MinFunctor<half> { struct MinFunctor<half> {
inline __device__ half operator()(const half& lhs, const half& rhs) const { inline __device__ half operator()(const half& lhs, const half& rhs) const {
...@@ -148,7 +148,7 @@ struct MinFunctor<half2> { ...@@ -148,7 +148,7 @@ struct MinFunctor<half2> {
template <typename T> template <typename T>
struct PlusFunctor { struct PlusFunctor {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const { inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs + rhs; return lhs + rhs;
} }
...@@ -161,7 +161,7 @@ struct PlusFunctor { ...@@ -161,7 +161,7 @@ struct PlusFunctor {
template <> template <>
struct PlusFunctor<float16> { struct PlusFunctor<float16> {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ float16 inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const { operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
...@@ -183,7 +183,7 @@ struct PlusFunctor<float16> { ...@@ -183,7 +183,7 @@ struct PlusFunctor<float16> {
#endif #endif
}; };
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
template <> template <>
struct PlusFunctor<half> { struct PlusFunctor<half> {
inline __device__ half operator()(const half& lhs, const half& rhs) const { inline __device__ half operator()(const half& lhs, const half& rhs) const {
...@@ -211,7 +211,7 @@ struct PlusFunctor<half2> { ...@@ -211,7 +211,7 @@ struct PlusFunctor<half2> {
template <typename T> template <typename T>
struct MinusFunctor { struct MinusFunctor {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const { inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs - rhs; return lhs - rhs;
} }
...@@ -224,7 +224,7 @@ struct MinusFunctor { ...@@ -224,7 +224,7 @@ struct MinusFunctor {
template <> template <>
struct MinusFunctor<float16> { struct MinusFunctor<float16> {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ float16 inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const { operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
...@@ -246,7 +246,7 @@ struct MinusFunctor<float16> { ...@@ -246,7 +246,7 @@ struct MinusFunctor<float16> {
#endif #endif
}; };
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
template <> template <>
struct MinusFunctor<half> { struct MinusFunctor<half> {
inline __device__ half operator()(const half& lhs, const half& rhs) const { inline __device__ half operator()(const half& lhs, const half& rhs) const {
...@@ -274,7 +274,7 @@ struct MinusFunctor<half2> { ...@@ -274,7 +274,7 @@ struct MinusFunctor<half2> {
template <typename T> template <typename T>
struct MultipliesFunctor { struct MultipliesFunctor {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const { inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs * rhs; return lhs * rhs;
} }
...@@ -287,7 +287,7 @@ struct MultipliesFunctor { ...@@ -287,7 +287,7 @@ struct MultipliesFunctor {
template <> template <>
struct MultipliesFunctor<float16> { struct MultipliesFunctor<float16> {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ float16 inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const { operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
...@@ -309,7 +309,7 @@ struct MultipliesFunctor<float16> { ...@@ -309,7 +309,7 @@ struct MultipliesFunctor<float16> {
#endif #endif
}; };
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
template <> template <>
struct MultipliesFunctor<half> { struct MultipliesFunctor<half> {
inline __device__ half operator()(const half& lhs, const half& rhs) const { inline __device__ half operator()(const half& lhs, const half& rhs) const {
...@@ -337,7 +337,7 @@ struct MultipliesFunctor<half2> { ...@@ -337,7 +337,7 @@ struct MultipliesFunctor<half2> {
template <typename T> template <typename T>
struct DividesFunctor { struct DividesFunctor {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const { inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs / rhs; return lhs / rhs;
} }
...@@ -350,7 +350,7 @@ struct DividesFunctor { ...@@ -350,7 +350,7 @@ struct DividesFunctor {
template <> template <>
struct DividesFunctor<float16> { struct DividesFunctor<float16> {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ float16 inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const { operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
...@@ -372,7 +372,7 @@ struct DividesFunctor<float16> { ...@@ -372,7 +372,7 @@ struct DividesFunctor<float16> {
#endif #endif
}; };
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
template <> template <>
struct DividesFunctor<half> { struct DividesFunctor<half> {
inline __device__ half operator()(const half& lhs, const half& rhs) const { inline __device__ half operator()(const half& lhs, const half& rhs) const {
...@@ -396,7 +396,7 @@ struct DividesFunctor<half2> { ...@@ -396,7 +396,7 @@ struct DividesFunctor<half2> {
template <typename T> template <typename T>
struct PowFunctor { struct PowFunctor {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const { inline __device__ T operator()(const T& lhs, const T& rhs) const {
return pow(lhs, rhs); return pow(lhs, rhs);
} }
...@@ -409,7 +409,7 @@ struct PowFunctor { ...@@ -409,7 +409,7 @@ struct PowFunctor {
template <> template <>
struct PowFunctor<float16> { struct PowFunctor<float16> {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ float16 inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const { operator()(const float16& lhs, const float16& rhs) const {
half ret = __float2half( half ret = __float2half(
...@@ -425,7 +425,7 @@ struct PowFunctor<float16> { ...@@ -425,7 +425,7 @@ struct PowFunctor<float16> {
#endif #endif
}; };
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
template <> template <>
struct PowFunctor<half> { struct PowFunctor<half> {
inline __device__ half operator()(const half& lhs, const half& rhs) const { inline __device__ half operator()(const half& lhs, const half& rhs) const {
...@@ -449,7 +449,7 @@ struct PowFunctor<half2> { ...@@ -449,7 +449,7 @@ struct PowFunctor<half2> {
template <typename T> template <typename T>
struct EqualFunctor { struct EqualFunctor {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const { inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs == rhs; return lhs == rhs;
} }
...@@ -462,7 +462,7 @@ struct EqualFunctor { ...@@ -462,7 +462,7 @@ struct EqualFunctor {
template <> template <>
struct EqualFunctor<float16> { struct EqualFunctor<float16> {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs) inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const { const {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
...@@ -481,7 +481,7 @@ struct EqualFunctor<float16> { ...@@ -481,7 +481,7 @@ struct EqualFunctor<float16> {
#endif #endif
}; };
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
template <> template <>
struct EqualFunctor<half> { struct EqualFunctor<half> {
inline __device__ bool operator()(const half& lhs, const half& rhs) const { inline __device__ bool operator()(const half& lhs, const half& rhs) const {
...@@ -496,7 +496,7 @@ struct EqualFunctor<half> { ...@@ -496,7 +496,7 @@ struct EqualFunctor<half> {
template <typename T> template <typename T>
struct NotEqualFunctor { struct NotEqualFunctor {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const { inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs != rhs; return lhs != rhs;
} }
...@@ -509,7 +509,7 @@ struct NotEqualFunctor { ...@@ -509,7 +509,7 @@ struct NotEqualFunctor {
template <> template <>
struct NotEqualFunctor<float16> { struct NotEqualFunctor<float16> {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs) inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const { const {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
...@@ -528,7 +528,7 @@ struct NotEqualFunctor<float16> { ...@@ -528,7 +528,7 @@ struct NotEqualFunctor<float16> {
#endif #endif
}; };
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
template <> template <>
struct NotEqualFunctor<half> { struct NotEqualFunctor<half> {
inline __device__ bool operator()(const half& lhs, const half& rhs) const { inline __device__ bool operator()(const half& lhs, const half& rhs) const {
...@@ -543,7 +543,7 @@ struct NotEqualFunctor<half> { ...@@ -543,7 +543,7 @@ struct NotEqualFunctor<half> {
template <typename T> template <typename T>
struct GreaterFunctor { struct GreaterFunctor {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const { inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs > rhs; return lhs > rhs;
} }
...@@ -556,7 +556,7 @@ struct GreaterFunctor { ...@@ -556,7 +556,7 @@ struct GreaterFunctor {
template <> template <>
struct GreaterFunctor<float16> { struct GreaterFunctor<float16> {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs) inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const { const {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
...@@ -575,7 +575,7 @@ struct GreaterFunctor<float16> { ...@@ -575,7 +575,7 @@ struct GreaterFunctor<float16> {
#endif #endif
}; };
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
template <> template <>
struct GreaterFunctor<half> { struct GreaterFunctor<half> {
inline __device__ bool operator()(const half& lhs, const half& rhs) const { inline __device__ bool operator()(const half& lhs, const half& rhs) const {
...@@ -590,7 +590,7 @@ struct GreaterFunctor<half> { ...@@ -590,7 +590,7 @@ struct GreaterFunctor<half> {
template <typename T> template <typename T>
struct LessFunctor { struct LessFunctor {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const { inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs < rhs; return lhs < rhs;
} }
...@@ -603,7 +603,7 @@ struct LessFunctor { ...@@ -603,7 +603,7 @@ struct LessFunctor {
template <> template <>
struct LessFunctor<float16> { struct LessFunctor<float16> {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs) inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const { const {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
...@@ -622,7 +622,7 @@ struct LessFunctor<float16> { ...@@ -622,7 +622,7 @@ struct LessFunctor<float16> {
#endif #endif
}; };
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
template <> template <>
struct LessFunctor<half> { struct LessFunctor<half> {
inline __device__ bool operator()(const half& lhs, const half& rhs) const { inline __device__ bool operator()(const half& lhs, const half& rhs) const {
...@@ -637,7 +637,7 @@ struct LessFunctor<half> { ...@@ -637,7 +637,7 @@ struct LessFunctor<half> {
template <typename T> template <typename T>
struct GreaterEqualFunctor { struct GreaterEqualFunctor {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const { inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs >= rhs; return lhs >= rhs;
} }
...@@ -650,7 +650,7 @@ struct GreaterEqualFunctor { ...@@ -650,7 +650,7 @@ struct GreaterEqualFunctor {
template <> template <>
struct GreaterEqualFunctor<float16> { struct GreaterEqualFunctor<float16> {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs) inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const { const {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
...@@ -669,7 +669,7 @@ struct GreaterEqualFunctor<float16> { ...@@ -669,7 +669,7 @@ struct GreaterEqualFunctor<float16> {
#endif #endif
}; };
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
template <> template <>
struct GreaterEqualFunctor<half> { struct GreaterEqualFunctor<half> {
inline __device__ bool operator()(const half& lhs, const half& rhs) const { inline __device__ bool operator()(const half& lhs, const half& rhs) const {
...@@ -684,7 +684,7 @@ struct GreaterEqualFunctor<half> { ...@@ -684,7 +684,7 @@ struct GreaterEqualFunctor<half> {
template <typename T> template <typename T>
struct LessEqualFunctor { struct LessEqualFunctor {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const { inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs <= rhs; return lhs <= rhs;
} }
...@@ -697,7 +697,7 @@ struct LessEqualFunctor { ...@@ -697,7 +697,7 @@ struct LessEqualFunctor {
template <> template <>
struct LessEqualFunctor<float16> { struct LessEqualFunctor<float16> {
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs) inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const { const {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
...@@ -716,7 +716,7 @@ struct LessEqualFunctor<float16> { ...@@ -716,7 +716,7 @@ struct LessEqualFunctor<float16> {
#endif #endif
}; };
#if defined(__CUDACC__) #if defined(__CUDA_ARCH__)
template <> template <>
struct LessEqualFunctor<half> { struct LessEqualFunctor<half> {
inline __device__ bool operator()(const half& lhs, const half& rhs) const { inline __device__ bool operator()(const half& lhs, const half& rhs) const {
......
...@@ -239,8 +239,8 @@ void ReduceSum<float16, CUDAContext>( ...@@ -239,8 +239,8 @@ void ReduceSum<float16, CUDAContext>(
num_axes, \ num_axes, \
axes, \ axes, \
Reducer<AccT>(), \ Reducer<AccT>(), \
AccT(kInit), \ convert::To<AccT>(kInit), \
AccT(scale), \ convert::To<AccT>(scale), \
x, \ x, \
y, \ y, \
ctx); \ ctx); \
......
...@@ -301,16 +301,16 @@ void ChannelAffine( ...@@ -301,16 +301,16 @@ void ChannelAffine(
/* array.channel_normalize */ /* array.channel_normalize */
template <typename Tx, typename Ty, class Context> template <typename InputT, typename OutputT, class Context>
void ChannelNormalize( void ChannelNormalize(
const int axis, const int axis,
const int num_dims, const int num_dims,
const int64_t* x_strides, const int64_t* x_strides,
const int64_t* y_dims, const int64_t* y_dims,
const Tx* x, const InputT* x,
const float* mean, const float* mean,
const float* std, const float* std,
Ty* y, OutputT* y,
Context* ctx); Context* ctx);
/* array.channel_shuffle */ /* array.channel_shuffle */
...@@ -648,28 +648,28 @@ void BroadcastLossGrad( ...@@ -648,28 +648,28 @@ void BroadcastLossGrad(
/* loss.nll_loss */ /* loss.nll_loss */
template <typename LogitType, typename TargetType, class Context> template <typename LogitT, typename TargetT, class Context>
void NLLLoss( void NLLLoss(
const int outer_dim, const int outer_dim,
const int inner_dim, const int inner_dim,
const int axis_dim, const int axis_dim,
const int ignore_index, const int ignore_index,
const LogitType* logit, const LogitT* logit,
const TargetType* target, const TargetT* target,
LogitType* loss, LogitT* loss,
LogitType* mask, LogitT* mask,
Context* ctx); Context* ctx);
template <typename LogitType, typename TargetType, class Context> template <typename LogitT, typename TargetT, class Context>
void NLLLossGrad( void NLLLossGrad(
const int outer_dim, const int outer_dim,
const int inner_dim, const int inner_dim,
const int axis_dim, const int axis_dim,
const int ignore_index, const int ignore_index,
const LogitType* logit, const LogitT* logit,
const TargetType* target, const TargetT* target,
LogitType* dlogit, LogitT* dlogit,
LogitType* mask, LogitT* mask,
Context* ctx); Context* ctx);
/* loss.sigmoid_ce_loss */ /* loss.sigmoid_ce_loss */
...@@ -694,7 +694,7 @@ void SigmoidCrossEntropyGrad( ...@@ -694,7 +694,7 @@ void SigmoidCrossEntropyGrad(
/* loss.sigmoid_focal_loss */ /* loss.sigmoid_focal_loss */
template <typename LogitType, typename TargetType, class Context> template <typename LogitT, typename TargetT, class Context>
void SigmoidFocalLoss( void SigmoidFocalLoss(
const int outer_dim, const int outer_dim,
const int inner_dim, const int inner_dim,
...@@ -703,13 +703,13 @@ void SigmoidFocalLoss( ...@@ -703,13 +703,13 @@ void SigmoidFocalLoss(
const float neg_alpha, const float neg_alpha,
const float gamma, const float gamma,
const int negative_index, const int negative_index,
const LogitType* logit, const LogitT* logit,
const TargetType* target, const TargetT* target,
LogitType* loss, LogitT* loss,
LogitType* mask, LogitT* mask,
Context* ctx); Context* ctx);
template <typename LogitType, typename TargetType, class Context> template <typename LogitT, typename TargetT, class Context>
void SigmoidFocalLossGrad( void SigmoidFocalLossGrad(
const int outer_dim, const int outer_dim,
const int inner_dim, const int inner_dim,
...@@ -718,10 +718,10 @@ void SigmoidFocalLossGrad( ...@@ -718,10 +718,10 @@ void SigmoidFocalLossGrad(
const float neg_alpha, const float neg_alpha,
const float gamma, const float gamma,
const int negative_index, const int negative_index,
const LogitType* logit, const LogitT* logit,
const TargetType* target, const TargetT* target,
LogitType* dlogit, LogitT* dlogit,
LogitType* mask, LogitT* mask,
Context* ctx); Context* ctx);
/* loss.smooth_l1_loss */ /* loss.smooth_l1_loss */
...@@ -754,28 +754,28 @@ void SoftmaxCrossEntropy( ...@@ -754,28 +754,28 @@ void SoftmaxCrossEntropy(
/* loss.sparse_softmax_cross_entropy */ /* loss.sparse_softmax_cross_entropy */
template <typename LogitType, typename TargetType, class Context> template <typename LogitT, typename TargetT, class Context>
void SparseSoftmaxCrossEntropy( void SparseSoftmaxCrossEntropy(
const int outer_dim, const int outer_dim,
const int inner_dim, const int inner_dim,
const int axis_dim, const int axis_dim,
const int ignore_index, const int ignore_index,
const LogitType* prob, const LogitT* prob,
const TargetType* target, const TargetT* target,
LogitType* loss, LogitT* loss,
LogitType* mask, LogitT* mask,
Context* ctx); Context* ctx);
template <typename LogitType, typename TargetType, class Context> template <typename LogitT, typename TargetT, class Context>
void SparseSoftmaxCrossEntropyGrad( void SparseSoftmaxCrossEntropyGrad(
const int outer_dim, const int outer_dim,
const int inner_dim, const int inner_dim,
const int axis_dim, const int axis_dim,
const int ignore_index, const int ignore_index,
const LogitType* prob, const LogitT* prob,
const TargetType* target, const TargetT* target,
LogitType* dx, LogitT* dx,
LogitType* mask, LogitT* mask,
Context* ctx); Context* ctx);
/* math.abs */ /* math.abs */
......
...@@ -55,7 +55,7 @@ class BatchNormalization(Layer): ...@@ -55,7 +55,7 @@ class BatchNormalization(Layer):
axis : int, optional, default=-1 axis : int, optional, default=-1
The channel axis. The channel axis.
momentum : float, optional, default=0.99 momentum : float, optional, default=0.99
The momentum of moving average. The decay factor of running average.
epsilon : float, optional, default=1e-3 epsilon : float, optional, default=1e-3
The epsilon value. The epsilon value.
center : bool, optional, default=True center : bool, optional, default=True
......
...@@ -41,8 +41,8 @@ def batch_normalization( ...@@ -41,8 +41,8 @@ def batch_normalization(
The moving average of stats is calculated as: The moving average of stats is calculated as:
.. math:: .. math:: x_{\text{running}} = \text{momentum} * x_{\text{running}} +
x_{moving} \leftarrow momentum * x_{moving} + (1 - momentum) * x_{stat} (1 - \text{momentum}) * x_{\text{batch}}
Parameters Parameters
---------- ----------
...@@ -58,10 +58,10 @@ def batch_normalization( ...@@ -58,10 +58,10 @@ def batch_normalization(
The :math:`\gamma` tensor. The :math:`\gamma` tensor.
axis : int, optional, default=-1 axis : int, optional, default=-1
The channel axis. The channel axis.
momentum : float, optional, default=0.9 momentum : Union[float, dragon.Tensor], optional
The momentum of moving average. The value to :math:`\text{momentum}`.
variance_epsilon : float, optional, default=1e-5 variance_epsilon : float, optional, default=1e-5
The value of epsilon. The value to :math:`\epsilon`.
trainable : bool, optional, default=False trainable : bool, optional, default=False
The optional training flag. The optional training flag.
name : str, optional name : str, optional
......
...@@ -50,7 +50,7 @@ class BatchNorm(layer.Layer): ...@@ -50,7 +50,7 @@ class BatchNorm(layer.Layer):
Parameters Parameters
---------- ----------
decay : float, optional, default=0.9 decay : float, optional, default=0.9
The decay factor for moving average. The decay factor of running average.
epsilon : float, optional, default=1e-5 epsilon : float, optional, default=1e-5
The epsilon. The epsilon.
act : callable, optional act : callable, optional
......
...@@ -14,6 +14,7 @@ from __future__ import absolute_import ...@@ -14,6 +14,7 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
import functools
import unittest import unittest
import dragon import dragon
...@@ -115,5 +116,537 @@ class TestFunction(unittest.TestCase): ...@@ -115,5 +116,537 @@ class TestFunction(unittest.TestCase):
dragon.create_function(optimizer=optimizer)() dragon.create_function(optimizer=optimizer)()
class TestOpSpec(unittest.TestCase):
"""Test the op spec."""
sym1 = dragon.Tensor(None, None)
sym2 = dragon.Tensor((1,))
sym3 = dragon.Tensor((1, None))
sym4 = dragon.Tensor((1, None, None, None))
sym5 = dragon.Tensor((1, None, None, None, None))
def test_accuracy(self):
with dragon.graph_mode():
self.assertEqual(dragon.metrics.accuracy(
[self.sym1, self.sym1]).shape, ())
def test_arg_reduce(self):
with dragon.graph_mode():
self.assertEqual(dragon.math.argmax(
self.sym1, axis=0, keep_dims=True).shape, None)
self.assertEqual(dragon.math.argmax(
self.sym1, axis=0, keep_dims=False).shape, None)
self.assertEqual(dragon.math.argmax(
self.sym1, axis=None, keep_dims=True).shape, (1,))
self.assertEqual(dragon.math.argmax(
self.sym1, axis=None, keep_dims=False).shape, ())
self.assertEqual(dragon.math.argmax(
self.sym2, axis=0, keep_dims=True).shape, (1,))
self.assertEqual(dragon.math.argmax(
self.sym2, axis=0, keep_dims=False).shape, ())
def test_binary_ops(self):
with dragon.graph_mode():
self.assertEqual(dragon.math.add(
[self.sym1, self.sym1]).shape, None)
self.assertEqual(dragon.math.add(
[self.sym2, self.sym2]).shape, (1,))
self.assertEqual(dragon.math.add(
[self.sym2, self.sym3]).shape, (1, None))
self.assertEqual(dragon.math.add(
[self.sym3, self.sym2]).shape, (1, None))
self.assertEqual(dragon.math.equal(
[self.sym1, self.sym1]).shape, None)
def test_broadcast(self):
self.assertEqual(dragon.broadcast_to(
self.sym1, shape=(1,)).shape, None)
self.assertEqual(dragon.broadcast_to(
self.sym2, shape=(1, 2)).shape, (1, 2))
self.assertEqual(dragon.broadcast_to(
self.sym3, shape=(2,)).shape, self.sym3.shape[:-1] + (2,))
self.assertEqual(dragon.broadcast_to(
self.sym3, shape=(-1, 2, 2)).shape, (1, 2, 2))
def test_cast(self):
with dragon.graph_mode():
self.assertEqual(dragon.cast(self.sym1, 'float32').shape, None)
def test_concat(self):
with dragon.graph_mode():
self.assertEqual(dragon.concat([self.sym1, self.sym1]).shape, None)
self.assertEqual(dragon.concat([self.sym1, self.sym2]).shape, (None,))
self.assertEqual(dragon.concat([self.sym2, self.sym3], axis=0).shape, (2,))
self.assertEqual(dragon.concat([self.sym2, self.sym3], axis=1).shape, None)
def test_conv(self):
w = dragon.Tensor((3, 3, 3, 3))
with dragon.graph_mode():
self.assertEqual(dragon.nn.conv2d(
[self.sym1, self.sym1]).shape, None)
self.assertEqual(dragon.nn.conv2d(
[self.sym4, w]).shape, (self.sym4.shape[0], w.shape[0], None, None))
self.assertEqual(dragon.nn.conv2d(
[w, w], kernel_shape=1, out_channels=w.shape[0]).shape, w.shape)
self.assertEqual(dragon.nn.conv2d(
[w, w], kernel_shape=1, padding='SAME').shape, w.shape)
self.assertEqual(dragon.nn.conv2d_transpose(
[self.sym4, w], out_channels=w.shape[1]).shape,
(self.sym4.shape[0], w.shape[1], None, None))
self.assertEqual(dragon.nn.conv2d_transpose(
[w, w], output_padding=(2, 2), kernel_shape=1).shape,
(w.shape[0], w.shape[1], w.shape[2] + 2, w.shape[3] + 2))
self.assertEqual(dragon.nn.conv2d_transpose(
[w, w], output_shape=(4, 4), output_padding=(2, 2), kernel_shape=1).shape,
(w.shape[0], w.shape[1], 6, 6))
def test_depth_to_space(self):
func1 = functools.partial(dragon.nn.depth_to_space, block_size=1)
func2 = functools.partial(dragon.nn.space_to_depth, block_size=1)
with dragon.graph_mode():
for func in (func1, func2):
self.assertEqual(func(self.sym1).shape, None)
self.assertEqual(func(self.sym2).shape, None)
self.assertEqual(func(self.sym4, data_format='NCHW').shape,
(self.sym4.shape[0],) + (None,) * (len(self.sym4.shape) - 1))
self.assertEqual(func(self.sym4, data_format='NHWC').shape,
(self.sym4.shape[0],) + (None,) * (len(self.sym4.shape) - 1))
self.assertEqual(func(dragon.Tensor((1, 2, 3)), data_format='NCHW').shape,
dragon.Tensor((1, 2, 3)).shape)
self.assertEqual(func(dragon.Tensor((1, 2, 3)), data_format='NHWC').shape,
dragon.Tensor((1, 2, 3)).shape)
def test_dot(self):
with dragon.graph_mode():
self.assertEqual(dragon.math.dot(
[self.sym1, self.sym1]).shape, None)
self.assertEqual(dragon.math.dot(
[self.sym2, self.sym2]).shape, ())
self.assertEqual(dragon.math.dot(
[dragon.Tensor(()), dragon.Tensor(())]).shape, ())
self.assertEqual(dragon.math.dot(
[self.sym3, self.sym3]).shape, (self.sym3.shape[0], self.sym3.shape[1]))
self.assertEqual(dragon.math.dot(
[self.sym3, self.sym2]).shape, self.sym3.shape[:-1])
def test_eltwise_loss(self):
with dragon.graph_mode():
self.assertEqual(dragon.losses.l2_loss(
[self.sym1, self.sym1]).shape, ())
self.assertEqual(dragon.losses.l2_loss(
[self.sym1, self.sym1], reduction='none').shape, None)
def test_expand_dims(self):
with dragon.graph_mode():
self.assertEqual(dragon.expand_dims(
self.sym1, axis=1).shape, None)
self.assertEqual(dragon.expand_dims(
self.sym2, axis=1).shape, (1, 1))
self.assertEqual(dragon.expand_dims(
self.sym2, axis=-1).shape, (1, 1))
self.assertEqual(dragon.expand_dims(
self.sym3, axis=0).shape, (1, 1, None))
self.assertEqual(dragon.expand_dims(
self.sym3, axis=(0, 3)).shape, (1, 1, None, 1))
self.assertEqual(dragon.expand_dims(
self.sym3, axis=(0, 3, 5)).shape, (1, 1, None, 1))
def test_init_ops(self):
init_funcs_v1 = [dragon.fill,
dragon.ones,
dragon.random.glorot_normal,
dragon.random.glorot_uniform,
dragon.random.normal,
dragon.random.uniform,
dragon.random.truncated_normal,
dragon.zeros]
for func in init_funcs_v1:
with dragon.graph_mode():
self.assertEqual(func(shape=self.sym1.shape).shape, None)
self.assertEqual(func(shape=self.sym2.shape).shape, self.sym2.shape)
def test_flatten(self):
with dragon.graph_mode():
self.assertEqual(dragon.flatten(
self.sym1, axis=1).shape, None)
self.assertEqual(dragon.flatten(
self.sym1, keep_axes=2).shape, (None, None))
self.assertEqual(dragon.flatten(
self.sym2, keep_axes=2).shape, (1, None))
self.assertEqual(dragon.flatten(
self.sym4, keep_axes=2).shape, (1, None))
self.assertEqual(dragon.flatten(
self.sym4, axis=1, num_axes=3).shape, (1, None))
self.assertEqual(dragon.flatten(
self.sym4, axis=1, num_axes=-1).shape, (1, None))
def test_fully_connected(self):
w = dragon.Tensor((3, 2))
with dragon.graph_mode():
self.assertEqual(dragon.nn.fully_connected(
[self.sym1, w]).shape, (None, 3))
self.assertEqual(dragon.nn.fully_connected(
[self.sym1, w], transpose_w=False).shape, (None, 2))
self.assertEqual(dragon.nn.fully_connected(
[self.sym1, w], axis=-1).shape, None)
self.assertEqual(dragon.nn.fully_connected(
[self.sym1, self.sym1]).shape, (None, None))
def test_index_select(self):
with dragon.graph_mode():
self.assertEqual(dragon.index_select(
self.sym1, self.sym1).shape, None)
self.assertEqual(dragon.index_select(
self.sym1, self.sym2, axis=-1).shape, None)
self.assertEqual(dragon.index_select(
self.sym3, self.sym2, axis=1).shape, (1, 1))
def test_linspace(self):
with dragon.graph_mode():
self.assertEqual(dragon.linspace(
start=1, stop=5, num=3).shape, (3,))
self.assertEqual(dragon.linspace(
start=(1, 2), stop=(3, 4), num=3, axis=1).shape, (2, 3))
self.assertEqual(dragon.linspace(
start=(1, 2), stop=(3, 4), num=3, axis=0).shape, (3, 2))
def test_mask_select(self):
with dragon.graph_mode():
self.assertEqual(dragon.masked_select(
[self.sym1, self.sym1]).shape, (None,))
def test_matmul(self):
with dragon.graph_mode():
self.assertEqual(dragon.math.matmul(
[self.sym1, self.sym1]).shape, None)
self.assertEqual(dragon.math.matmul(
[self.sym1, self.sym2]).shape, None)
self.assertEqual(dragon.math.matmul(
[self.sym1, self.sym3]).shape, None)
self.assertEqual(dragon.math.matmul(
[self.sym2, self.sym3]).shape, None)
self.assertEqual(dragon.math.matmul(
[self.sym3, self.sym3]).shape, (1, None))
self.assertEqual(dragon.math.matmul(
[self.sym4, self.sym3]).shape, (1, None, None, None))
self.assertEqual(dragon.math.matmul(
[self.sym4, self.sym4]).shape, (1, None, None, None))
def test_moments(self):
with dragon.graph_mode():
self.assertEqual(dragon.math.moments(self.sym1)[0].shape, ())
self.assertEqual(dragon.math.moments(self.sym1, axis=0)[0].shape, None)
self.assertEqual(dragon.math.moments(self.sym1, keep_dims=True)[0].shape, (1,))
self.assertEqual(dragon.math.moments(self.sym2)[0].shape, ())
self.assertEqual(dragon.math.moments(self.sym2, axis=0)[0].shape, ())
self.assertEqual(dragon.math.moments(self.sym2, axis=1)[0].shape, (1,))
self.assertEqual(dragon.math.moments(self.sym2, axis=0, keep_dims=True)[0].shape, (1,))
self.assertEqual(dragon.math.moments(dragon.Tensor(None, 'float64'))[0].dtype, 'float64')
self.assertEqual(dragon.math.moments(dragon.Tensor(None, 'int64'))[0].dtype, 'float64')
def test_multinomial(self):
with dragon.graph_mode():
self.assertEqual(dragon.random.multinomial(self.sym1).shape, None)
self.assertEqual(dragon.random.multinomial(self.sym2, num_samples=2).shape, (2,))
def test_non_zero(self):
with dragon.graph_mode():
self.assertEqual(dragon.nonzero(self.sym1).shape, None)
self.assertEqual(dragon.nonzero(self.sym2).shape, (None, 1))
def test_one_hot(self):
with dragon.graph_mode():
self.assertEqual(dragon.one_hot(self.sym1, depth=2).shape, None)
self.assertEqual(dragon.one_hot(self.sym2, depth=2).shape, (1, 2))
def test_pad(self):
with dragon.graph_mode():
self.assertEqual(dragon.pad(self.sym1, pads=[(1, 1)]).shape, None)
self.assertEqual(dragon.pad(self.sym3, pads=[(1, 1)]).shape, (3, None))
self.assertEqual(dragon.pad(self.sym3, pads=[(1, 1), (1, 1)]).shape, (3, None))
def test_permutation(self):
with dragon.graph_mode():
self.assertEqual(dragon.random.permutation(5).shape, (5,))
def test_pool(self):
func = functools.partial(dragon.nn.pool2d, kernel_shape=3, strides=1, pads=1)
with dragon.graph_mode():
self.assertEqual(func(self.sym1).shape, None)
self.assertEqual(func(self.sym3).shape, (1, None))
self.assertEqual(func(self.sym4).shape, (1, None, None, None))
self.assertEqual(func(self.sym4, global_pooling=True).shape, (1, None, 1, 1))
self.assertEqual(func(dragon.Tensor((1, 3, 4, 4))).shape, (1, 3, 4, 4))
self.assertEqual(func(dragon.Tensor((1, 3, 4, 4)), padding='SAME').shape, (1, 3, 4, 4))
def test_predicative(self):
with dragon.graph_mode():
self.assertEqual(dragon.math.is_inf(self.sym1).shape, self.sym1.shape)
self.assertEqual(dragon.math.is_inf(self.sym3).shape, self.sym3.shape)
self.assertEqual(dragon.math.is_nan(self.sym1).shape, self.sym1.shape)
self.assertEqual(dragon.math.is_nan(self.sym3).shape, self.sym3.shape)
def test_range(self):
with dragon.graph_mode():
self.assertEqual(dragon.range(3).shape, (3,))
self.assertEqual(dragon.range(3, 4).shape, (1,))
self.assertEqual(dragon.range(3, delta=0).shape, None)
def test_reduce(self):
with dragon.graph_mode():
self.assertEqual(dragon.math.sum(self.sym1).shape, ())
self.assertEqual(dragon.math.sum(self.sym1, axis=0).shape, None)
self.assertEqual(dragon.math.sum(self.sym1, keep_dims=True).shape, ())
self.assertEqual(dragon.math.sum(self.sym2, axis=0).shape, ())
self.assertEqual(dragon.math.sum(self.sym2, axis=1).shape, (1,))
self.assertEqual(dragon.math.sum(self.sym2, axis=0, keep_dims=True).shape, (1,))
def test_repeat(self):
with dragon.graph_mode():
self.assertEqual(dragon.repeat(self.sym1, axis=None, repeats=2).shape, (None,))
self.assertEqual(dragon.repeat(self.sym1, axis=0, repeats=2).shape, None)
self.assertEqual(dragon.repeat(self.sym2, axis=None, repeats=2).shape, (2,))
self.assertEqual(dragon.repeat(self.sym3, axis=0, repeats=2).shape, (2, None))
self.assertEqual(dragon.repeat(self.sym3, axis=1, repeats=2).shape, (1, None))
def test_reshape(self):
with dragon.graph_mode():
self.assertEqual(dragon.reshape(self.sym2, shape=(0, 1)).shape, (1, 1))
self.assertEqual(dragon.reshape(self.sym3, shape=(0, -1)).shape, (1, None))
self.assertEqual(dragon.reshape(self.sym3, shape=(0, 1, 0)).shape, None)
def test_resize(self):
with dragon.graph_mode():
self.assertEqual(dragon.vision.resize(
self.sym4, sizes=(1,)).shape, (1, None, 1, 1))
self.assertEqual(dragon.vision.resize(
self.sym4, sizes=(1, 1)).shape, (1, None, 1, 1))
self.assertEqual(dragon.vision.resize(
self.sym4, sizes=(1, 1, 1, 1)).shape, (1, None, 1, 1))
self.assertEqual(dragon.vision.resize(
self.sym4, scales=(1,)).shape, (1, None, None, None))
self.assertEqual(dragon.vision.resize(
self.sym4, scales=(1, 1)).shape, (1, None, None, None))
self.assertEqual(dragon.vision.resize(
self.sym4, scales=(1, 1, 1, 1)).shape, (1, None, None, None))
self.assertEqual(dragon.vision.resize(
self.sym5, sizes=(1, 1, 1, 1)).shape, None)
def test_roi_pool(self):
rois = dragon.Tensor((2, 5))
func = functools.partial(dragon.vision.roi_pool, pooled_h=7, pooled_w=7)
with dragon.graph_mode():
self.assertEqual(func([self.sym1, rois]).shape, None)
self.assertEqual(func([self.sym4, rois]).shape, (2, None, 7, 7))
self.assertEqual(func([self.sym4, self.sym1]).shape, (None, None, 7, 7))
def test_slice(self):
with dragon.graph_mode():
self.assertEqual(dragon.slice(self.sym1, (1,), (1,)).shape, None)
self.assertEqual(dragon.slice(self.sym3, (1,), (1,)).shape, (1, None))
def test_softmax_loss(self):
with dragon.graph_mode():
self.assertEqual(dragon.losses.sparse_softmax_cross_entropy(
[self.sym1, self.sym1]).shape, ())
self.assertEqual(dragon.losses.sparse_softmax_cross_entropy(
[self.sym1, self.sym1], reduction='none').shape, None)
self.assertEqual(dragon.losses.sparse_softmax_cross_entropy(
[self.sym3, self.sym1], reduction='none').shape, (self.sym3.shape[0],))
def test_sort(self):
with dragon.graph_mode():
self.assertEqual(dragon.sort(self.sym1)[0].shape, None)
self.assertEqual(dragon.sort(self.sym2)[0].shape, self.sym2.shape)
def test_split(self):
with dragon.graph_mode():
self.assertEqual(dragon.split(self.sym1, 2)[0].shape, None)
self.assertEqual(dragon.split(self.sym2, 2)[0].shape, (1,))
self.assertEqual(dragon.split(self.sym2, 2, axis=1)[0].shape, None)
self.assertEqual(dragon.split(self.sym2, (1, 1))[0].shape, (1,))
self.assertEqual(dragon.split(self.sym2, 2, slice_points=(1,))[0].shape, (1,))
self.assertEqual(dragon.split(self.sym3, 2, axis=1)[0].shape, (1, None))
self.assertEqual(dragon.split(self.sym3, 2, axis=1, slice_points=(1,))[1].shape, (1, None))
def test_squeeze(self):
with dragon.graph_mode():
self.assertEqual(dragon.squeeze(self.sym1).shape, None)
self.assertEqual(dragon.squeeze(self.sym2).shape, ())
self.assertEqual(dragon.squeeze(self.sym2, axis=-1).shape, ())
self.assertEqual(dragon.squeeze(self.sym3).shape, (None,))
def test_stack(self):
with dragon.graph_mode():
self.assertEqual(dragon.stack([self.sym1, self.sym1]).shape, None)
self.assertEqual(dragon.stack([self.sym3, self.sym2]).shape, (2, 1, None))
self.assertEqual(dragon.stack([self.sym3, self.sym3]).shape, (2, 1, None))
self.assertEqual(dragon.stack([self.sym3, self.sym3], axis=-1).shape, (1, None, 2))
def test_tile(self):
with dragon.graph_mode():
self.assertEqual(dragon.tile(
self.sym1, repeats=(1, 2)).shape, None)
self.assertEqual(dragon.tile(
self.sym3, repeats=(1, 2)).shape, (1, None))
def test_topk(self):
with dragon.graph_mode():
self.assertEqual(dragon.math.top_k(self.sym1)[0].shape, None)
self.assertEqual(dragon.math.top_k(self.sym2, k=2)[0].shape, (2,))
self.assertEqual(dragon.math.top_k(self.sym2, axis=1)[0].shape, None)
def test_unchanged(self):
with dragon.graph_mode():
self.assertEqual(dragon.math.negative(self.sym1).shape, None)
def test_unique(self):
with dragon.graph_mode():
self.assertEqual(dragon.unique(self.sym1).shape, (None,))
self.assertEqual(dragon.unique(self.sym1, return_counts=True)[1].shape, (None,))
self.assertEqual(dragon.unique(self.sym1, return_inverse=True)[1].shape, None)
self.assertEqual(dragon.unique(self.sym1,
return_inverse=True,
return_counts=True)[1].shape, None)
class TestOpSpecWithTensorDesc(unittest.TestCase):
"""Test the op spec with tensor descriptors."""
sym1 = dragon.Tensor(None)
sym2 = dragon.Tensor((1, None))
sym3 = dragon.Tensor((1, None, None, None))
shape1 = dragon.shape(sym1)
shape2 = [1, shape1, 1]
def test_broadcast_to(self):
with dragon.graph_mode():
self.assertEqual(dragon.broadcast_to(
self.sym1, shape=self.shape1).shape, None)
self.assertEqual(dragon.broadcast_to(
self.sym2, shape=self.shape1).shape, (None,) * len(self.sym2.shape))
self.assertEqual(dragon.broadcast_to(
self.sym2, shape=self.shape2).shape, (None,) * len(self.shape2))
def test_channel_normalize(self):
func = functools.partial(dragon.channel_normalize,
mean=(1., 1., 1.), std=(1., 1., 1.))
with dragon.graph_mode():
self.assertEqual(func(self.sym1).shape, None)
self.assertEqual(func(self.sym1, perm=self.shape1).shape, None)
self.assertEqual(func(self.sym2).shape, self.sym2.shape)
self.assertEqual(func(self.sym2, perm=self.shape1).shape,
(None,) * len(self.sym2.shape))
self.assertEqual(func(self.sym2, perm=self.shape2).shape,
(None,) * len(self.sym2.shape))
def test_conv_transpose(self):
w = dragon.Tensor((3, 3, 3, 3))
with dragon.graph_mode():
self.assertEqual(dragon.nn.conv2d_transpose(
[self.sym1, self.sym1]).shape, None)
self.assertEqual(dragon.nn.conv2d_transpose(
[self.sym3, self.sym1]).shape, None)
self.assertEqual(dragon.nn.conv2d_transpose(
[self.sym3, w]).shape, (self.sym3.shape[0], w.shape[0], None, None))
self.assertEqual(dragon.nn.conv2d_transpose(
[w, w], output_padding=self.shape1).shape,
(w.shape[0], w.shape[0], None, None))
self.assertEqual(dragon.nn.conv2d_transpose(
[w, w], output_padding=self.shape2).shape,
(w.shape[0], w.shape[0], None, None))
self.assertEqual(dragon.nn.conv2d_transpose(
[w, w], output_shape=self.shape1).shape,
(w.shape[0], w.shape[0], None, None))
self.assertEqual(dragon.nn.conv2d_transpose(
[w, w], output_shape=self.shape2).shape,
(w.shape[0], w.shape[0], None, None))
def test_init_ops(self):
init_funcs_v1 = [dragon.fill,
dragon.ones,
dragon.random.glorot_normal,
dragon.random.glorot_uniform,
dragon.random.normal,
dragon.random.uniform,
dragon.random.truncated_normal,
dragon.zeros]
init_funcs_v2 = [dragon.ones_like,
dragon.random.normal_like,
dragon.random.uniform_like,
dragon.zeros_like]
for func in init_funcs_v1:
with dragon.graph_mode():
self.assertEqual(func(shape=self.shape1).shape, None)
self.assertEqual(func(shape=self.shape2).shape, (None,) * len(self.shape2))
for func in init_funcs_v2:
with dragon.graph_mode():
self.assertEqual(func(self.sym1).shape, None)
self.assertEqual(func(self.sym2).shape, self.sym2.shape)
def test_permutation(self):
with dragon.graph_mode():
self.assertEqual(dragon.random.permutation(self.sym1).shape, (None,))
def test_repeat(self):
with dragon.graph_mode():
self.assertEqual(dragon.repeat(
self.sym1, repeats=self.shape1).shape, None)
self.assertEqual(dragon.repeat(
self.sym2, repeats=self.shape1).shape, None)
def test_reshape(self):
with dragon.graph_mode():
self.assertEqual(dragon.reshape(
self.sym1, shape=self.shape1).shape, None)
self.assertEqual(dragon.reshape(
self.sym2, shape=self.shape1).shape, None)
self.assertEqual(dragon.reshape(
self.sym2, shape=self.shape2).shape, (None,) * len(self.shape2))
def test_resize(self):
with dragon.graph_mode():
self.assertEqual(dragon.vision.resize(
self.sym1, sizes=self.shape1).shape, None)
self.assertEqual(dragon.vision.resize(
self.sym1, scales=self.shape1).shape, None)
self.assertEqual(dragon.vision.resize(
self.sym2, sizes=self.shape1).shape, (None,) * len(self.sym2.shape))
self.assertEqual(dragon.vision.resize(
self.sym2, scales=self.shape1).shape, (None,) * len(self.sym2.shape))
self.assertEqual(dragon.vision.resize(
self.sym2, sizes=self.shape2).shape, (None,) * len(self.sym2.shape))
self.assertEqual(dragon.vision.resize(
self.sym2, scales=self.shape2).shape, (None,) * len(self.sym2.shape))
def test_slice(self):
with dragon.graph_mode():
self.assertEqual(dragon.slice(
self.sym1, starts=self.shape1, sizes=self.shape1).shape, None)
self.assertEqual(dragon.slice(
self.sym2, starts=self.shape1, sizes=self.shape1).shape, None)
self.assertEqual(dragon.slice(
self.sym2, starts=self.shape2, sizes=self.shape2).shape, None)
def test_tile(self):
with dragon.graph_mode():
self.assertEqual(dragon.tile(
self.sym1, repeats=self.shape1).shape, None)
self.assertEqual(dragon.tile(
self.sym2, repeats=self.shape1).shape, (None,) * len(self.sym2.shape))
self.assertEqual(dragon.tile(
self.sym2, repeats=self.shape2).shape, (None,) * len(self.sym2.shape))
def test_transpose(self):
with dragon.graph_mode():
self.assertEqual(dragon.transpose(self.sym1).shape, None)
self.assertEqual(dragon.transpose(self.sym1, perm=self.shape1).shape, None)
self.assertEqual(dragon.transpose(self.sym2).shape, self.sym2.shape[::-1])
self.assertEqual(dragon.transpose(
self.sym2, perm=self.shape1).shape, (None,) * len(self.sym2.shape))
self.assertEqual(dragon.transpose(
self.sym2, perm=self.shape2).shape, (None,) * len(self.sym2.shape))
if __name__ == '__main__':
    run_tests()
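The specs above pin down how unknown dimensions propagate through graph-mode ops. A minimal sketch of that behavior, assuming only the public dragon API already used in the tests (the tensor below is illustrative; the expected shapes mirror assertions from the test file):

import dragon

sym = dragon.Tensor((1, None))  # first dim known, second unknown
with dragon.graph_mode():
    # Inserting a leading axis keeps known and unknown dims in place.
    print(dragon.expand_dims(sym, axis=0).shape)  # (1, 1, None)
    # Transposing without a perm reverses the partially-known shape.
    print(dragon.transpose(sym).shape)  # (None, 1)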
...@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Code generator for Runtime API."""
from __future__ import absolute_import
......
...@@ -89,7 +89,8 @@ def batch_norm(
    The moving average of stats are calculated as:
-    .. math:: x_{moving} \leftarrow (1 - momentum) * x_{moving} + momentum * x_{stat}
+    .. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} +
+                                   \text{momentum} * x_{\text{batch}}
    Parameters
    ----------
...@@ -124,9 +125,9 @@ def batch_norm(
        .instantiate(
            input.device,
            training=training,
-            momentum=momentum,
            epsilon=eps,
-        ).apply(input, running_mean, running_var, weight, bias)
+        ).apply(input, running_mean, running_var,
+                weight, bias, momentum)
def binary_cross_entropy_with_logits(
...@@ -1598,7 +1599,7 @@ def sync_batch_norm(
    The moving average of stats are calculated as:
    .. math::
-        x_{moving} \leftarrow (1 - momentum) * x_{moving} + momentum * x_{stat}
+        x_{moving} \leftarrow (1 - momentum) * x_{moving} + momentum * x_{\text{batch}}
    Additionally, you can specify ``process_group`` to perform synchronization.
......
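To make the blending above concrete, a small plain-Python check of the documented update rule (no framework calls; the numbers are illustrative):

# x_running = (1 - momentum) * x_running + momentum * x_batch
momentum, x_running = 0.1, 0.0
for x_batch in (4.0, 8.0):
    x_running = (1.0 - momentum) * x_running + momentum * x_batch
print(x_running)  # ~1.16, i.e. 0.9 * 0.4 + 0.1 * 8.0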
...@@ -111,24 +111,31 @@ class BatchNorm(function.Function):
    def __init__(self, key, dev, **kwargs):
        super(BatchNorm, self).__init__(key, dev, **kwargs)
-        self.momentum = kwargs.get('momentum', 0.1)
        self.epsilon = kwargs.get('epsilon', 1e-5)
        self.training = kwargs.get('training', False)
+        self.track_stats = kwargs.get('track_stats', True)
+    def setup(self, ws, handle, momentum):
+        self.feed_arg(ws, '{}/momentum'.format(handle), 1.0 - momentum, 'float32')
    def attributes(self):
        return {
            'op_type': 'BatchNorm',
            'arguments': {
                'axis': 1,
-                'momentum': 1. - self.momentum,
                'epsilon': self.epsilon,
                'use_stats': int(not self.training),
+                'momentum_desc': '${HANDLE}/momentum',
            }
        }
-    def forward(self, input, running_mean, running_var, weight, bias):
+    def forward(self, input, running_mean, running_var, weight, bias, momentum):
        inputs = [input, weight, bias, running_mean, running_var]
-        return self.dispatch(inputs, [self.alloc()])
+        return self.dispatch(
+            inputs, [self.alloc()],
+            callback=lambda ws, handle:
+                self.setup(ws, handle, momentum),
+        )
class Conv2d(_ConvNd):
......
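Note the `1.0 - momentum` fed in `setup` above: the run-time argument keeps the complement convention that the old static `'momentum': 1. - self.momentum` attribute used, so the backend can weight the running value directly. A hedged sketch of that equivalence (the backend blending form is inferred from this inversion, not quoted from the kernels):

def torch_style(x_running, x_batch, momentum):
    # Convention from the functional docstring above.
    return (1.0 - momentum) * x_running + momentum * x_batch

def backend_style(x_running, x_batch, decay):
    # Assumed backend convention: 'decay' weights the running value.
    return decay * x_running + (1.0 - decay) * x_batch

# 0.25 and 0.75 are exact in binary floating point, so this holds exactly.
assert torch_style(0.5, 8.0, 0.25) == backend_style(0.5, 8.0, 1.0 - 0.25)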
...@@ -25,6 +25,8 @@ from dragon.vm.torch.core.tensor import Tensor
class _BatchNorm(Module):
+    """BatchNorm base module."""
    def __init__(
        self,
        num_features,
...@@ -45,20 +47,26 @@ class _BatchNorm(Module):
        else:
            self.register_buffer('weight', init_funcs.ones(num_features))
            self.register_buffer('bias', init_funcs.zeros(num_features))
+        if self.track_running_stats:
+            self.num_batches_tracked = 0
+        else:
+            self.num_batches_tracked = None
        self.register_buffer('running_mean', init_funcs.zeros(num_features))
        self.register_buffer('running_var', init_funcs.ones(num_features))
        self.inputs = [self.running_mean, self.running_var, self.weight, self.bias]
        self.reset_parameters()
-    def reset_parameters(self):
-        if self.affine:
-            self.weight.data.one_()
-            self.bias.data.zero_()
    def reset_running_stats(self):
        if self.track_running_stats:
            self.running_mean.zero_()
            self.running_var.fill_(1)
+            self.num_batches_tracked = 0
+    def reset_parameters(self):
+        self.reset_running_stats()
+        if self.affine:
+            self.weight.data.one_()
+            self.bias.data.zero_()
    def extra_repr(self):
        return '{num_features}, ' \
...@@ -72,7 +80,7 @@ class _BatchNorm(Module):
        return F.batch_norm(
            input, *self.inputs,
            training=self.training,
-            momentum=self.momentum,
+            momentum=self._get_momentum(),
            eps=self.eps
        )
...@@ -82,6 +90,19 @@ class _BatchNorm(Module):
            return self  # Float32 parameters are required.
        return super(_BatchNorm, self)._apply(fn)
+    def _get_momentum(self):
+        """Return the current momentum value."""
+        momentum = 0.0 if self.momentum is None else self.momentum
+        if self.track_running_stats:
+            if self.training:
+                if self.num_batches_tracked is not None:
+                    self.num_batches_tracked += 1
+                    if self.momentum is None:
+                        momentum = 1.0 / float(self.num_batches_tracked)
+        else:
+            momentum = 0.0
+        return momentum
class BatchNorm1d(_BatchNorm):
    r"""Apply the batch normalization over 2d input.
...@@ -93,7 +114,8 @@ class BatchNorm1d(_BatchNorm):
    The running average of statistics are calculated as:
-    .. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} + \text{momentum} * x_{\text{stat}}
+    .. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} +
+                                   \text{momentum} * x_{\text{batch}}
    See Also
    --------
...@@ -109,16 +131,16 @@ class BatchNorm1d(_BatchNorm):
        affine=True,
        track_running_stats=True,
    ):
-        """Create a ``BatchNorm1d`` module.
+        r"""Create a ``BatchNorm1d`` module.
        Parameters
        ----------
        num_features : int
            The number of channels.
        eps : float, optional, default=1e-5
-            The epsilon value.
+            The value to :math:`\epsilon`.
        momentum : float, optional, default=0.1
-            The momentum of moving average.
+            The value to :math:`\text{momentum}`.
        affine : bool, optional, default=True
            **True** to apply a affine transformation.
        track_running_stats : bool, optional, default=True
...@@ -142,7 +164,8 @@ class BatchNorm2d(_BatchNorm):
    The running average of statistics are calculated as:
-    .. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} + \text{momentum} * x_{\text{stat}}
+    .. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} +
+                                   \text{momentum} * x_{\text{batch}}
    See Also
    --------
...@@ -158,16 +181,16 @@ class BatchNorm2d(_BatchNorm):
        affine=True,
        track_running_stats=True,
    ):
-        """Create a ``BatchNorm2d`` module.
+        r"""Create a ``BatchNorm2d`` module.
        Parameters
        ----------
        num_features : int
            The number of channels.
        eps : float, optional, default=1e-5
-            The epsilon value.
+            The value to :math:`\epsilon`.
        momentum : float, optional, default=0.1
-            The momentum of moving average.
+            The value to :math:`\text{momentum}`.
        affine : bool, optional, default=True
            **True** to apply a affine transformation.
        track_running_stats : bool, optional, default=True
...@@ -191,7 +214,8 @@ class BatchNorm3d(_BatchNorm):
    The running average of statistics are calculated as:
-    .. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} + \text{momentum} * x_{\text{stat}}
+    .. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} +
+                                   \text{momentum} * x_{\text{batch}}
    See Also
    --------
...@@ -207,16 +231,16 @@ class BatchNorm3d(_BatchNorm):
        affine=True,
        track_running_stats=True,
    ):
-        """Create a ``BatchNorm3d`` module.
+        r"""Create a ``BatchNorm3d`` module.
        Parameters
        ----------
        num_features : int
            The number of channels.
        eps : float, optional, default=1e-5
-            The epsilon value.
+            The value to :math:`\epsilon`.
        momentum : float, optional, default=0.1
-            The momentum of moving average.
+            The value to :math:`\text{momentum}`.
        affine : bool, optional, default=True
            **True** to apply a affine transformation.
        track_running_stats : bool, optional, default=True
...@@ -240,7 +264,8 @@ class SyncBatchNorm(_BatchNorm):
    The running average of statistics are calculated as:
-    .. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} + \text{momentum} * x_{\text{stat}}
+    .. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} +
+                                   \text{momentum} * x_{\text{batch}}
    Additionally, specify ``process_group`` to perform synchronization.
...@@ -261,16 +286,16 @@ class SyncBatchNorm(_BatchNorm):
        track_running_stats=True,
        process_group=None,
    ):
-        """Create a ``SyncBatchNorm`` module.
+        r"""Create a ``SyncBatchNorm`` module.
        Parameters
        ----------
        num_features : int
            The number of channels.
        eps : float, optional, default=1e-5
-            The epsilon value.
+            The value to :math:`\epsilon`.
        momentum : float, optional, default=0.1
-            The momentum of moving average.
+            The value to :math:`\text{momentum}`.
        affine : bool, optional, default=True
            **True** to apply a affine transformation.
        track_running_stats : bool, optional, default=True
...@@ -292,7 +317,7 @@ class SyncBatchNorm(_BatchNorm):
        return F.sync_batch_norm(
            input, *self.inputs,
            training=self.training,
-            momentum=self.momentum,
+            momentum=self._get_momentum(),
            eps=self.eps,
            process_group=self.process_group
        )
...@@ -300,6 +325,6 @@ class SyncBatchNorm(_BatchNorm):
        return F.batch_norm(
            input, *self.inputs,
            training=self.training,
-            momentum=self.momentum,
+            momentum=self._get_momentum(),
            eps=self.eps
        )
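One consequence of `_get_momentum` worth spelling out: with `momentum=None` and `track_running_stats=True`, the effective momentum on step ``t`` is ``1/t``, which turns the update into a cumulative average of the batch statistics. A plain-Python sketch of that behavior (illustrative values, no module state):

num_batches_tracked, x_running = 0, 0.0
for x_batch in (3.0, 5.0, 10.0):
    num_batches_tracked += 1
    momentum = 1.0 / float(num_batches_tracked)  # 1, 1/2, 1/3
    x_running = (1.0 - momentum) * x_running + momentum * x_batch
print(x_running)  # 6.0, the mean of the three batch values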
...@@ -61,7 +61,7 @@ class AffineChannel(Module):
        fix_bias=False,
        inplace=False,
    ):
-        """Create an ``Affine`` module.
+        """Create an ``AffineChannel`` module.
        Parameters
        ----------
...@@ -141,7 +141,7 @@ class GroupNorm(Module):
        eps=1e-5,
        affine=True,
    ):
-        """Create a ``GroupNorm`` module.
+        r"""Create a ``GroupNorm`` module.
        Parameters
        ----------
...@@ -150,7 +150,7 @@ class GroupNorm(Module):
        num_channels : int
            The number of channels.
        eps : float, optional, default=1e-5
-            The epsilon value.
+            The value to :math:`\epsilon`.
        affine : bool, optional, default=True
            **True** to apply a affine transformation.
...@@ -228,11 +228,11 @@ class LocalResponseNorm(Module):
        size : int, required
            The number of neighbouring channels to sum over.
        alpha : float, optional, default=0.0001
-            The scale value :math:`\alpha`.
+            The value to :math:`\alpha`.
        beta : float, optional, default=0.75
-            The exponent value :math:`\beta`.
+            The value to :math:`\beta`.
        k : float, optional, default=1.
-            The bias constant :math:`k`.
+            The value to :math:`k`.
        """
        super(LocalResponseNorm, self).__init__()
......
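For context on the ``LocalResponseNorm`` parameters touched above, the response normalization they configure is conventionally the AlexNet-style formula below (standard form, hedged here since the repository's kernel is not shown in this diff):

.. math:: y_{c} = x_{c} \Big/ \Big(k + \frac{\alpha}{size}
          \sum_{c'=\max(0,\,c - size/2)}^{\min(C-1,\,c + size/2)} x_{c'}^{2}\Big)^{\beta}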