Commit f76c693e by Ting PAN

Optimize Concat && Split Operator

Summary:
This commit implements the Concat and Split operators generically via
CopyMatrix instead of specialized kernels (the Stack operators reuse the same path).
1 parent 77dcd71d
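For intuition, the change treats concatenation along an axis as a set of strided 2-D block copies: each input contributes an (outer_dim x row_length) block that is written into the output's wider rows at a growing column offset, which is exactly the (m, n, ldx, ldy) contract of CopyMatrix in the diff below. Here is a minimal NumPy sketch of that idea; the function name and shapes are illustrative assumptions, not the framework's API:

```python
import numpy as np

def concat_via_matrix_copy(inputs, axis):
    """Concatenate along `axis` using per-row block copies, mirroring CopyMatrix."""
    outer = int(np.prod(inputs[0].shape[:axis]))               # m
    rows = [int(np.prod(x.shape[axis:])) for x in inputs]      # n == ldx per input
    y_cols = sum(rows)                                         # ldy of the output
    y = np.empty((outer, y_cols), dtype=inputs[0].dtype)
    offset = 0
    for x, n in zip(inputs, rows):
        # Equivalent of CopyMatrix(m=outer, n=n, ldx=n, ldy=y_cols, x, y + offset)
        y[:, offset:offset + n] = x.reshape(outer, n)
        offset += n
    out_shape = list(inputs[0].shape)
    out_shape[axis] = sum(x.shape[axis] for x in inputs)
    return y.reshape(out_shape)

a = np.arange(6, dtype=np.float32).reshape(2, 3)
b = np.arange(4, dtype=np.float32).reshape(2, 2)
assert np.array_equal(concat_via_matrix_copy([a, b], axis=1),
                      np.concatenate([a, b], axis=1))
```

Split is the mirror image: each output reads an (outer_dim x n) block out of the wider source rows at the corresponding column offset. On GPU this strided copy maps directly onto cudaMemcpy2DAsync, as the new CUDA CopyMatrix in this commit shows.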
......@@ -44,8 +44,8 @@ class EuclideanLoss(Layer):
def __init__(self, layer_param):
super(EuclideanLoss, self).__init__(layer_param)
param = layer_param.loss_param
norm_dict = {0: 'mean', 1: 'mean', 2: 'batch_size', 3: 'sum'}
reduction = 'batch_size'
norm_dict = {0: 'mean', 1: 'mean', 2: 'batch_mean', 3: 'sum'}
reduction = 'batch_mean'
if param.HasField('normalize'):
if param.normalize:
reduction = 'mean'
......@@ -81,11 +81,11 @@ class SigmoidCrossEntropyLoss(Layer):
def __init__(self, layer_param):
super(SigmoidCrossEntropyLoss, self).__init__(layer_param)
param = layer_param.loss_param
norm_dict = {0: 'mean', 1: 'valid', 2: 'batch_size', 3: 'sum'}
norm_dict = {0: 'mean', 1: 'valid', 2: 'batch_mean', 3: 'sum'}
reduction = 'valid'
if param.HasField('normalize'):
if not param.normalize:
reduction = 'batch_size'
reduction = 'batch_mean'
else:
reduction = norm_dict[param.normalization]
self.arguments = {'reduction': reduction}
......@@ -123,8 +123,8 @@ class SmoothL1Loss(Layer):
super(SmoothL1Loss, self).__init__(layer_param)
param = layer_param.loss_param
smooth_l1_param = layer_param.smooth_l1_loss_param
norm_dict = {0: 'mean', 1: 'mean', 2: 'batch_size', 3: 'sum'}
reduction = 'batch_size'
norm_dict = {0: 'mean', 1: 'mean', 2: 'batch_mean', 3: 'sum'}
reduction = 'batch_mean'
if param.HasField('normalize'):
if param.normalize:
reduction = 'mean'
......@@ -174,11 +174,11 @@ class SoftmaxWithLoss(Layer):
super(SoftmaxWithLoss, self).__init__(layer_param)
param = layer_param.loss_param
softmax_param = layer_param.softmax_param
norm_dict = {0: 'mean', 1: 'valid', 2: 'batch_size', 3: 'sum'}
norm_dict = {0: 'mean', 1: 'valid', 2: 'batch_mean', 3: 'sum'}
reduction = 'valid'
if param.HasField('normalize'):
if not param.normalize:
reduction = 'batch_size'
reduction = 'batch_mean'
else:
reduction = norm_dict[param.normalization]
self.arguments = {
......
......@@ -84,6 +84,9 @@ vm.torch.nn
: Apply the gumbel softmax with a temperature.
`[Jang et.al, 2016] <https://arxiv.org/abs/1611.01144>`_.
`class KLDivLoss <nn/KLDivLoss.html>`_
: Compute the Kullback-Leibler divergence.
`class L1Loss <nn/L1Loss.html>`_
: Compute the element-wise absolute value difference.
......@@ -219,6 +222,7 @@ vm.torch.nn
nn/GroupNorm
nn/GRU
nn/GumbelSoftmax
nn/KLDivLoss
nn/L1Loss
nn/LeakyReLU
nn/Linear
......
KLDivLoss
=========
.. autoclass:: dragon.vm.torch.nn.KLDivLoss
__init__
--------
.. automethod:: dragon.vm.torch.nn.KLDivLoss.__init__
.. _torch.nn.functional.kl_div(...): functional/kl_div.html
.. raw:: html
<style>
h1:before {
content: "torch.nn.";
color: #103d3e;
}
</style>
......@@ -53,6 +53,9 @@ vm.torch.nn.functional
: Apply the group normalization to input.
`[Wu & He, 2018] <https://arxiv.org/abs/1803.08494>`_.
`kl_div(...) <functional/kl_div.html>`_
: Compute the Kullback-Leibler divergence.
`l1_loss(...) <functional/l1_loss.html>`_
: Compute the element-wise absolute value difference.
......@@ -142,6 +145,7 @@ vm.torch.nn.functional
functional/dropout
functional/elu
functional/group_norm
functional/kl_div
functional/l1_loss
functional/leaky_relu
functional/linear
......
kl_div
======
.. autofunction:: dragon.vm.torch.nn.functional.kl_div
.. _torch.nn.KLDivLoss(...): ../KLDivLoss.html
.. raw:: html
<style>
h1:before {
content: "torch.nn.functional.";
color: #103d3e;
}
</style>
......@@ -148,13 +148,13 @@ class CUDAObjects {
Map<string, ncclComm_t> nccl_comms_[CUDA_MAX_DEVICES];
#endif
/*! \brief The flag that alllows cuDNN or not */
/*! \brief The flag that allows cuDNN or not */
bool cudnn_enabled_ = true;
/*! \brief The flag that allows cuDNN benchmark or not */
bool cudnn_benchmark_ = false;
/*! \brief The flag thats allow cuDNN TF32 math type or not */
/*! \brief The flag that allows cuDNN TF32 math type or not */
bool cudnn_allow_tf32_ = false;
private:
......
#include "dragon/utils/op_kernels.h"
namespace dragon {
namespace kernel {
namespace {
template <typename T>
void _Concat(
const int outer_dim,
const int inner_dim,
const int x_axis_dim,
const int y_axis_dim,
const int index,
const T* x,
T* y) {
const int offset = index * inner_dim;
const int x_cols = x_axis_dim * inner_dim;
const int y_cols = y_axis_dim * inner_dim;
for (int i = 0; i < outer_dim; ++i) {
std::memcpy(y + i * y_cols + offset, x + i * x_cols, x_cols * sizeof(T));
}
}
} // namespace
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void Concat<T, CPUContext>( \
const int outer_dim, \
const int inner_dim, \
const int x_axis_dim, \
const int y_axis_dim, \
const int index, \
const T* x, \
T* y, \
CPUContext* ctx) { \
_Concat(outer_dim, inner_dim, x_axis_dim, y_axis_dim, index, x, y); \
}
DEFINE_KERNEL_LAUNCHER(bool);
DEFINE_KERNEL_LAUNCHER(int8_t);
DEFINE_KERNEL_LAUNCHER(uint8_t);
DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
} // namespace dragon
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
namespace kernel {
namespace {
template <typename T>
__global__ void _Concat(
const int nthreads,
const int inner_dim,
const int x_cols,
const int y_cols,
const int offset,
const T* x,
T* y) {
CUDA_1D_KERNEL_LOOP(xi, nthreads) {
const int i = xi / x_cols;
const int j = xi % x_cols;
y[i * y_cols + offset + j] = x[xi];
}
}
} // namespace
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void Concat<T, CUDAContext>( \
const int outer_dim, \
const int inner_dim, \
const int x_axis_dim, \
const int y_axis_dim, \
const int index, \
const T* x, \
T* y, \
CUDAContext* ctx) { \
const int offset = index * inner_dim; \
const int x_cols = x_axis_dim * inner_dim; \
const int y_cols = y_axis_dim * inner_dim; \
const int nthreads = outer_dim * x_cols; \
_Concat<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
nthreads, inner_dim, x_cols, y_cols, offset, x, y); \
}
DEFINE_KERNEL_LAUNCHER(bool);
DEFINE_KERNEL_LAUNCHER(int8_t);
DEFINE_KERNEL_LAUNCHER(uint8_t);
DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
} // namespace dragon
#endif // USE_CUDA
#include "dragon/utils/op_kernels.h"
namespace dragon {
namespace kernel {
namespace {
template <typename T>
void _Split(
const int outer_dim,
const int inner_dim,
const int x_axis_dim,
const int y_axis_dim,
const int index,
const T* x,
T* y) {
const int offset = index * inner_dim;
const int x_cols = x_axis_dim * inner_dim;
const int y_cols = y_axis_dim * inner_dim;
for (int i = 0; i < outer_dim; ++i) {
std::memcpy(y + i * y_cols, x + i * x_cols + offset, y_cols * sizeof(T));
}
}
} // namespace
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void Split<T, CPUContext>( \
const int outer_dim, \
const int inner_dim, \
const int x_axis_dim, \
const int y_axis_dim, \
const int index, \
const T* x, \
T* y, \
CPUContext* ctx) { \
_Split(outer_dim, inner_dim, x_axis_dim, y_axis_dim, index, x, y); \
}
DEFINE_KERNEL_LAUNCHER(bool);
DEFINE_KERNEL_LAUNCHER(int8_t);
DEFINE_KERNEL_LAUNCHER(uint8_t);
DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
} // namespace dragon
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
namespace kernel {
namespace {
template <typename T>
__global__ void _Split(
const int nthreads,
const int inner_dim,
const int x_cols,
const int y_cols,
const int offset,
const T* x,
T* y) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int i = yi / y_cols;
const int j = yi % y_cols;
y[yi] = x[i * x_cols + offset + j];
}
}
} // namespace
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void Split<T, CUDAContext>( \
const int outer_dim, \
const int inner_dim, \
const int x_axis_dim, \
const int y_axis_dim, \
const int index, \
const T* x, \
T* y, \
CUDAContext* ctx) { \
const int offset = index * inner_dim; \
const int x_cols = x_axis_dim * inner_dim; \
const int y_cols = y_axis_dim * inner_dim; \
const int nthreads = outer_dim * y_cols; \
_Split<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
nthreads, inner_dim, x_cols, y_cols, offset, x, y); \
}
DEFINE_KERNEL_LAUNCHER(bool);
DEFINE_KERNEL_LAUNCHER(int8_t);
DEFINE_KERNEL_LAUNCHER(uint8_t);
DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
} // namespace dragon
#endif // USE_CUDA
#include "dragon/operators/array/concat_op.h"
#include "dragon/core/workspace.h"
#include "dragon/utils/op_kernels.h"
#include "dragon/utils/math_functions.h"
namespace dragon {
......@@ -26,20 +26,20 @@ void ConcatOp<Context>::DoRunWithType() {
Y_dims[axis] += Input(i).dim(axis);
}
int64_t index = 0;
auto* y = Y->Reshape(Y_dims)->template mutable_data<T, Context>();
for (int i = 0; i < InputSize(); i++) {
kernel::Concat(
X.count(0, axis),
X.count(axis + 1),
Input(i).dim(axis),
Y_dims[axis],
index,
Input(i).template data<T, Context>(),
y,
Y->Reshape(Y_dims);
int64_t output_offset = 0;
for (int i = 0; i < InputSize(); ++i) {
const auto& Xi = Input(i);
math::CopyMatrix(
Xi.count(0, axis),
Xi.count(axis),
Xi.count(axis),
Y->count(axis),
Xi.template data<T, Context>(),
Y->template mutable_data<T, Context>() + output_offset,
ctx());
index += Input(i).dim(axis);
output_offset += Xi.count(axis);
}
}
......@@ -54,21 +54,21 @@ void ConcatGradientOp<Context>::DoRunWithType() {
auto& dY = Input(0);
CANONICALIZE_AXIS_WITH_TENSOR(dY);
int64_t index = 0;
for (int i = 0; i < OutputSize(); i++) {
int64_t input_offset = 0;
for (int i = 0; i < OutputSize(); ++i) {
auto &X = RESTORE_INPUT_SPEC(i), *dX = Output(i);
if (dX->has_name()) {
kernel::Split(
math::CopyMatrix(
dY.count(0, axis),
dY.count(axis + 1),
dY.dim(axis),
X.dim(axis),
index,
dY.template data<T, Context>(),
X.count(axis),
dY.count(axis),
X.count(axis),
dY.template data<T, Context>() + input_offset,
dX->ReshapeLike(X)->template mutable_data<T, Context>(),
ctx());
}
index += X.dim(axis);
input_offset += X.count(axis);
}
}
......
#include "dragon/operators/array/split_op.h"
#include "dragon/core/workspace.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......@@ -34,27 +33,27 @@ void SplitOp<Context>::DoRunWithType() {
// Store for the gradient calculation
STORE_INPUT_SPEC(0);
int64_t index = 0, next_index;
vec64_t Y_dims(X.dims());
int64_t input_offset = 0, total_size = 0;
for (int i = 0; i < num_splits; ++i) {
next_index = index + size_splits[i];
CHECK(size_splits[i] > 0 && next_index <= X.dim(axis))
total_size += size_splits[i];
CHECK(size_splits[i] > 0 && total_size <= X.dim(axis))
<< "\nIllegal size of splits: " << Tensor::DimString(size_splits)
<< " for dimension: " << X.dim(axis);
auto* Y = Output(i);
if (Y->has_name()) {
Y_dims[axis] = size_splits[i];
kernel::Split(
math::CopyMatrix(
X.count(0, axis),
X.count(axis + 1),
X.dim(axis),
size_splits[i],
index,
X.template data<T, Context>(),
size_splits[i] * X.count(axis + 1),
X.count(axis),
size_splits[i] * X.count(axis + 1),
X.template data<T, Context>() + input_offset,
Y->Reshape(Y_dims)->template mutable_data<T, Context>(),
ctx());
}
index = next_index;
input_offset += size_splits[i] * X.count(axis + 1);
}
}
......@@ -67,7 +66,6 @@ template <class Context>
template <typename T>
void SplitGradientOp<Context>::DoRunWithType() {
auto* dX = Output(0);
int num_splits = InputSize();
CANONICALIZE_AXIS_WITH_TENSOR((*dX));
DETERMINE_RUNTIME_ARGS((*dX));
......@@ -84,21 +82,21 @@ void SplitGradientOp<Context>::DoRunWithType() {
}
}
int64_t index = 0;
int64_t output_offset = 0;
for (int i = 0; i < num_splits; i++) {
auto& dY = Input(i);
if (dY.has_name()) {
kernel::Concat(
dX->count(0, axis),
dX->count(axis + 1),
size_splits[i],
dX->dim(axis),
index,
math::CopyMatrix(
dY.count(0, axis),
dY.count(axis),
dY.count(axis),
dX->count(axis),
dY.template data<T, Context>(),
dX->template mutable_data<T, Context>(),
dX->template mutable_data<T, Context>() + output_offset,
ctx());
}
index += size_splits[i];
output_offset += size_splits[i] * dX->count(axis + 1);
}
}
......
#include "dragon/operators/array/stack_op.h"
#include "dragon/core/workspace.h"
#include "dragon/utils/op_kernels.h"
#include "dragon/utils/math_functions.h"
namespace dragon {
......@@ -26,18 +26,20 @@ void StackOp<Context>::DoRunWithType() {
STORE_INPUT_SPEC(i);
}
auto* y = Y->Reshape(Y_dims)->template mutable_data<T, Context>();
Y->Reshape(Y_dims);
int64_t output_offset = 0;
for (int i = 0; i < num_stacks; i++) {
kernel::Concat(
X.count(0, axis),
X.count(axis),
1,
num_stacks,
i,
Input(i).template data<T, Context>(),
y,
const auto& Xi = Input(i);
math::CopyMatrix(
Xi.count(0, axis),
Xi.count(axis),
Xi.count(axis),
Y->count(axis),
Xi.template data<T, Context>(),
Y->template mutable_data<T, Context>() + output_offset,
ctx());
output_offset += Xi.count(axis);
}
}
......@@ -52,20 +54,21 @@ void StackGradientOp<Context>::DoRunWithType() {
auto &X_ref = RESTORE_INPUT_SPEC(0), &dY = Input(0);
CANONICALIZE_AXIS_WITH_TENSOR_AND_OFFSET(X_ref, 1)
int num_stacks = OutputSize();
for (int i = 0; i < num_stacks; ++i) {
int64_t input_offset = 0;
for (int i = 0; i < OutputSize(); ++i) {
auto &X = RESTORE_INPUT_SPEC(i), *dX = Output(i);
if (dX->has_name()) {
kernel::Split(
X.count(0, axis),
math::CopyMatrix(
dY.count(0, axis),
X.count(axis),
dY.count(axis),
X.count(axis),
num_stacks,
1,
i,
dY.template data<T, Context>(),
dY.template data<T, Context>() + input_offset,
dX->ReshapeLike(X)->template mutable_data<T, Context>(),
ctx());
}
input_offset += X.count(axis);
}
}
......
......@@ -45,7 +45,7 @@ void L1LossOp<Context>::DoRunWithType() {
ctx());
} else {
int64_t normalizer = 1;
if (reduction_ == "BATCH_SIZE") {
if (reduction_ == "BATCH_MEAN") {
normalizer *= X.dim(0);
} else if (reduction_ == "MEAN") {
normalizer *= X.count();
......@@ -94,7 +94,7 @@ void L1LossGradientOp<Context>::DoRunWithType() {
math::Mul(dX->count(), dy, dx, dx, ctx());
} else {
int64_t normalizer = 1;
if (reduction_ == "BATCH_SIZE") {
if (reduction_ == "BATCH_MEAN") {
normalizer *= dX->dim(0);
} else if (reduction_ == "MEAN") {
normalizer *= dX->count();
......
......@@ -45,7 +45,7 @@ void L2LossOp<Context>::DoRunWithType() {
ctx());
} else {
int64_t normalizer = 1;
if (reduction_ == "BATCH_SIZE") {
if (reduction_ == "BATCH_MEAN") {
normalizer *= X.dim(0);
} else if (reduction_ == "MEAN") {
normalizer *= X.count();
......@@ -92,7 +92,7 @@ void L2LossGradientOp<Context>::DoRunWithType() {
math::Scale(dX->count(), 2.f, dx, dx, ctx());
} else {
int64_t normalizer = 1;
if (reduction_ == "BATCH_SIZE") {
if (reduction_ == "BATCH_MEAN") {
normalizer *= dX->dim(0);
} else if (reduction_ == "MEAN") {
normalizer *= dX->count();
......
......@@ -48,7 +48,7 @@ void NLLLossOp<Context>::DoRunWithType() {
int64_t normalizer = 1;
if (reduction_ == "VALID") {
normalizer = -1; // Select from mask
} else if (reduction_ == "BATCH_SIZE") {
} else if (reduction_ == "BATCH_MEAN") {
normalizer = X.dim(0);
} else if (reduction_ == "MEAN") {
normalizer = num_preds;
......@@ -125,7 +125,7 @@ void NLLLossGradientOp<Context>::DoRunWithType() {
int64_t normalizer = 1;
if (reduction_ == "VALID") {
normalizer = -1; // Select from mask
} else if (reduction_ == "BATCH_SIZE") {
} else if (reduction_ == "BATCH_MEAN") {
normalizer = dX->dim(0);
} else if (reduction_ == "MEAN") {
normalizer = num_preds;
......
......@@ -38,7 +38,7 @@ void SigmoidCrossEntropyOp<Context>::DoRunWithType() {
int64_t normalizer = 1;
if (reduction_ == "VALID") {
normalizer = -1; // Select from mask
} else if (reduction_ == "BATCH_SIZE") {
} else if (reduction_ == "BATCH_MEAN") {
normalizer = X.dim(0);
} else if (reduction_ == "MEAN") {
normalizer = X.count();
......@@ -83,7 +83,7 @@ void SigmoidCrossEntropyGradientOp<Context>::DoRunWithType() {
int64_t normalizer = 1;
if (reduction_ == "VALID") {
normalizer = -1; // Select from mask
} else if (reduction_ == "BATCH_SIZE") {
} else if (reduction_ == "BATCH_MEAN") {
normalizer = dX->dim(0);
} else if (reduction_ == "MEAN") {
normalizer = dX->count();
......
......@@ -48,7 +48,7 @@ void SigmoidFocalLossOp<Context>::DoRunWithType() {
int64_t normalizer = 1;
if (reduction_ == "VALID") {
normalizer = -1; // Select from mask
} else if (reduction_ == "BATCH_SIZE") {
} else if (reduction_ == "BATCH_MEAN") {
normalizer = X.dim(0);
} else if (reduction_ == "MEAN") {
normalizer = X.count();
......@@ -125,7 +125,7 @@ void SigmoidFocalLossGradientOp<Context>::DoRunWithType() {
int64_t normalizer = 1;
if (reduction_ == "VALID") {
normalizer = -1; // Select from mask
} else if (reduction_ == "BATCH_SIZE") {
} else if (reduction_ == "BATCH_MEAN") {
normalizer = dX->dim(0);
} else if (reduction_ == "MEAN") {
normalizer = dX->count();
......
......@@ -45,7 +45,7 @@ void SmoothL1LossOp<Context>::DoRunWithType() {
ctx());
} else {
int64_t normalizer = 1;
if (reduction_ == "BATCH_SIZE") {
if (reduction_ == "BATCH_MEAN") {
normalizer *= X.dim(0);
} else if (reduction_ == "MEAN") {
normalizer *= X.count();
......@@ -94,7 +94,7 @@ void SmoothL1LossGradientOp<Context>::DoRunWithType() {
math::Mul(dX->count(), dy, dx, dx, ctx());
} else {
int64_t normalizer = 1;
if (reduction_ == "BATCH_SIZE") {
if (reduction_ == "BATCH_MEAN") {
normalizer *= dX->dim(0);
} else if (reduction_ == "MEAN") {
normalizer *= dX->count();
......
......@@ -49,7 +49,7 @@ void SoftmaxCrossEntropyOp<Context>::DoRunWithType() {
ctx());
} else {
int64_t normalizer = 1;
if (reduction_ == "BATCH_SIZE") {
if (reduction_ == "BATCH_MEAN") {
normalizer = X.dim(0);
} else if (reduction_ == "MEAN") {
normalizer = num_preds;
......@@ -93,7 +93,7 @@ void SoftmaxCrossEntropyGradientOp<Context>::DoRunWithType() {
outer_dim, dX->dim(axis), inner_dim, dy, dx, ctx());
} else {
int64_t normalizer = 1;
if (reduction_ == "BATCH_SIZE") {
if (reduction_ == "BATCH_MEAN") {
normalizer = dX->dim(0);
} else if (reduction_ == "MEAN") {
normalizer = num_preds;
......
......@@ -58,7 +58,7 @@ void SparseSoftmaxCrossEntropyOp<Context>::DoRunWithType() {
int64_t normalizer = 1;
if (reduction_ == "VALID") {
normalizer = -1; // Select from mask
} else if (reduction_ == "BATCH_SIZE") {
} else if (reduction_ == "BATCH_MEAN") {
normalizer = X.dim(0);
} else if (reduction_ == "MEAN") {
normalizer = num_preds;
......@@ -136,7 +136,7 @@ void SparseSoftmaxCrossEntropyGradientOp<Context>::DoRunWithType() {
int64_t normalizer = 1;
if (reduction_ == "VALID") {
normalizer = -1; // Select from mask
} else if (reduction_ == "BATCH_SIZE") {
} else if (reduction_ == "BATCH_MEAN") {
normalizer = dX->dim(0);
} else if (reduction_ == "MEAN") {
normalizer = num_preds;
......
......@@ -88,8 +88,9 @@ def l1_loss(inputs, reduction='mean', **kwargs):
args['reduction'] = reduction.upper()
op_lib = loss_ops_lib.L1Loss
if context.executing_eagerly():
return op_lib \
.instantiate(reduction=args['reduction']).apply(inputs)
return op_lib \
.instantiate(reduction=args['reduction']) \
.apply(inputs)
else:
return op_lib.blend(**args)
......@@ -128,9 +129,8 @@ def l2_loss(inputs, reduction='mean', **kwargs):
op_lib = loss_ops_lib.L2Loss
if context.executing_eagerly():
return op_lib \
.instantiate(
reduction=args['reduction'],
).apply(inputs)
.instantiate(reduction=args['reduction']) \
.apply(inputs)
else:
return op_lib.blend(**args)
......@@ -213,9 +213,9 @@ def sigmoid_cross_entropy(inputs, reduction='valid', **kwargs):
args['reduction'] = reduction.upper()
op_lib = loss_ops_lib.SigmoidCrossEntropy
if context.executing_eagerly():
return op_lib.instantiate(
reduction=args['reduction'],
).apply(inputs)
return op_lib \
.instantiate(reduction=args['reduction']) \
.apply(inputs)
else:
return op_lib.blend(**args)
......
......@@ -36,7 +36,7 @@ DEFINE_SCALE_FUNC(double);
DRAGON_API void Copy<T, CPUContext>( \
const int n, const T* x, T* y, CPUContext* ctx) { \
if (x != y && n > 0) { \
memcpy(y, x, n * sizeof(T)); \
memcpy(y, x, sizeof(T) * n); \
} \
}
......@@ -75,6 +75,36 @@ DEFINE_COPY_FUNC(float);
DEFINE_COPY_FUNC(double);
#undef DEFINE_COPY_FUNC
#define DEFINE_COPY_FUNC(T) \
template <> \
DRAGON_API void CopyMatrix<T, CPUContext>( \
const int m, \
const int n, \
const int ldx, \
const int ldy, \
const T* x, \
T* y, \
CPUContext* ctx) { \
if (m <= 0 || n <= 0) return; \
if (ldx == n && ldy == n) { \
if (x != y) memcpy(y, x, sizeof(T) * m * n); \
return; \
} \
for (int i = 0; i < m; ++i) { \
memcpy(y + ldy * i, x + ldx * i, sizeof(T) * n); \
} \
}
DEFINE_COPY_FUNC(bool);
DEFINE_COPY_FUNC(int8_t);
DEFINE_COPY_FUNC(uint8_t);
DEFINE_COPY_FUNC(int);
DEFINE_COPY_FUNC(int64_t);
DEFINE_COPY_FUNC(float16);
DEFINE_COPY_FUNC(float);
DEFINE_COPY_FUNC(double);
#undef DEFINE_COPY_FUNC
template <>
DRAGON_API void Axpy<float16, CPUContext>(
const int n,
......
......@@ -231,24 +231,56 @@ DEFINE_COPY_FUNC(int64_t);
DEFINE_COPY_FUNC(float16);
#undef DEFINE_COPY_FUNC
#define DEFINE_COPY_FUNC(T, cublas_func) \
template <> \
DRAGON_API void Copy<T, CUDAContext>( \
const int n, \
const int incx, \
const int incy, \
const T* x, \
T* y, \
CUDAContext* ctx) { \
if (x != y && n > 0) { \
cublas_func(ctx->cublas_handle(), n, x, incx, y, incy); \
} \
#define DEFINE_COPY_FUNC(T, cublas_func) \
template <> \
DRAGON_API void Copy<T, CUDAContext>( \
const int n, \
const int incx, \
const int incy, \
const T* x, \
T* y, \
CUDAContext* ctx) { \
if (x != y && n > 0) { \
CUBLAS_CHECK(cublas_func(ctx->cublas_handle(), n, x, incx, y, incy)); \
} \
}
DEFINE_COPY_FUNC(float, cublasScopy);
DEFINE_COPY_FUNC(double, cublasDcopy);
#undef DEFINE_COPY_FUNC
#define DEFINE_COPY_FUNC(T) \
template <> \
DRAGON_API void CopyMatrix<T, CUDAContext>( \
const int m, \
const int n, \
const int ldx, \
const int ldy, \
const T* x, \
T* y, \
CUDAContext* ctx) { \
if (m <= 0 || n <= 0) return; \
CUDA_CHECK(cudaMemcpy2DAsync( \
y, \
sizeof(T) * ldy, \
x, \
sizeof(T) * ldx, \
sizeof(T) * n, \
m, \
cudaMemcpyDeviceToDevice, \
ctx->cuda_stream())); \
}
DEFINE_COPY_FUNC(bool);
DEFINE_COPY_FUNC(int8_t);
DEFINE_COPY_FUNC(uint8_t);
DEFINE_COPY_FUNC(int);
DEFINE_COPY_FUNC(int64_t);
DEFINE_COPY_FUNC(float16);
DEFINE_COPY_FUNC(float);
DEFINE_COPY_FUNC(double);
#undef DEFINE_COPY_FUNC
#define DEFINE_AXPY_FUNC(T) \
template <> \
DRAGON_API void Axpy<T, CUDAContext>( \
......@@ -398,7 +430,7 @@ DEFINE_DOT_FUNC(double, cublasDdot);
const int n, const T* x, T* y, CUDAContext* ctx) { \
CUBLAS_CHECK(cublasSetPointerMode( \
ctx->cublas_handle(), CUBLAS_POINTER_MODE_DEVICE)); \
cublas_func(ctx->cublas_handle(), n, x, 1, y); \
CUBLAS_CHECK(cublas_func(ctx->cublas_handle(), n, x, 1, y)); \
} \
template <> \
DRAGON_API T ASum<T, CUDAContext>( \
......@@ -406,7 +438,7 @@ DEFINE_DOT_FUNC(double, cublasDdot);
T ret; \
CUBLAS_CHECK( \
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \
cublas_func(ctx->cublas_handle(), n, x, 1, &ret); \
CUBLAS_CHECK(cublas_func(ctx->cublas_handle(), n, x, 1, &ret)); \
return ret; \
}
......
......@@ -41,6 +41,16 @@ DRAGON_API void Copy(
Context* ctx);
template <typename T, class Context>
DRAGON_API void CopyMatrix(
const int m,
const int n,
const int ldx,
const int ldy,
const T* x,
T* y,
Context* ctx);
template <typename T, class Context>
DRAGON_API void
Axpy(const int n, const float alpha, const T* x, T* y, Context* ctx);
......
......@@ -270,19 +270,6 @@ void ChannelShuffle(
T* y,
Context* ctx);
/* array.concat */
template <typename T, class Context>
void Concat(
const int outer_dim,
const int inner_dim,
const int x_axis_dim,
const int y_cat_dim,
const int index,
const T* x,
T* y,
Context* ctx);
/* array.cumsum */
template <typename T, class Context>
......@@ -494,19 +481,6 @@ void SliceGrad(
T* dx,
Context* ctx);
/* array.split */
template <typename T, class Context>
void Split(
const int outer_dim,
const int inner_dim,
const int x_axis_dim,
const int y_axis_dim,
const int index,
const T* x,
T* y,
Context* ctx);
/* array.tile */
template <typename T, class Context>
......
......@@ -48,6 +48,7 @@ from dragon.vm.torch.core.nn.modules.linear import Linear
from dragon.vm.torch.core.nn.modules.loss import CTCLoss
from dragon.vm.torch.core.nn.modules.loss import BCEWithLogitsLoss
from dragon.vm.torch.core.nn.modules.loss import CrossEntropyLoss
from dragon.vm.torch.core.nn.modules.loss import KLDivLoss
from dragon.vm.torch.core.nn.modules.loss import L1Loss
from dragon.vm.torch.core.nn.modules.loss import MSELoss
from dragon.vm.torch.core.nn.modules.loss import NLLLoss
......
......@@ -27,6 +27,7 @@ from dragon.vm.torch.core.nn.functional import drop_path
from dragon.vm.torch.core.nn.functional import dropout
from dragon.vm.torch.core.nn.functional import elu
from dragon.vm.torch.core.nn.functional import group_norm
from dragon.vm.torch.core.nn.functional import kl_div
from dragon.vm.torch.core.nn.functional import l1_loss
from dragon.vm.torch.core.nn.functional import leaky_relu
from dragon.vm.torch.core.nn.functional import linear
......
......@@ -18,6 +18,7 @@ from dragon.core.util import nest
from dragon.vm.torch.core.nn.modules import _functions
from dragon.vm.torch.core.nn import _reduction
from dragon.vm.torch.core.nn.modules import utils
from dragon.vm.torch.core.ops.math import functional as math_funcs
def avg_pool2d(
......@@ -715,6 +716,59 @@ def interpolate(
).apply(input, size, scale_factor)
def kl_div(
input,
target,
size_average=None,
reduce=None,
reduction='mean',
log_target=False,
):
"""Compute the Kullback-Leibler divergence.
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
target : dragon.vm.torch.Tensor
The target tensor.
size_average : bool, optional
Whether to average the loss.
reduce : bool, optional
Whether to reduce the loss.
reduction : {'none', 'batchmean', 'mean', 'sum'}, optional
The reduce method.
log_target : bool, optional, default=False
The flag indicating whether ``target`` is passed in log space.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.nn.KLDivLoss(...)`_
"""
if size_average is not None or reduce is not None:
reduction = _reduction.legacy_get_string(size_average, reduce)
else:
reduction = reduction
if not log_target:
out = target * (math_funcs.log(target) - input)
else:
out = math_funcs.exp(target) * (target - input)
if reduction == 'none':
return out
elif reduction == 'batchmean':
return out.sum() / input.size()[0]
elif reduction == 'mean':
return out.mean()
else:
return out.sum()
def l1_loss(
input,
target,
......
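A brief usage sketch for the new functional `kl_div` above; the tensor values are illustrative, and it assumes the `dragon.vm.torch` import path while reusing only calls (`torch.tensor`, `torch.log`) that already appear in this commit's own docstring example:

```python
from dragon.vm import torch
from dragon.vm.torch.nn import functional as F

# With log_target=False, `input` is expected in log space and `target` as
# probabilities, matching the `target * (log(target) - input)` form above.
q = torch.tensor([0.1, 0.2, 0.3, 0.4])      # model distribution Q
p = torch.tensor([0.25, 0.25, 0.25, 0.25])  # target distribution P
loss = F.kl_div(torch.log(q), p, reduction='batchmean')
```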
......@@ -256,6 +256,58 @@ class CrossEntropyLoss(_WeightedLoss):
)
class KLDivLoss(_Loss):
"""Compute the Kullback-Leibler divergence.
Examples:
```python
m = torch.nn.KLDivLoss()
eps = 1e-12 # Epsilon to avoid log(0)
# Compute KL(P || Q)
q = torch.tensor([0.0, 0.1, 0.2, 0.3, 1.0])
p = torch.tensor([0.0, 0.3, 0.2, 0.1, 0.9])
loss = m(torch.log(torch.clamp(q, eps)), torch.clamp(p, eps))
```
See Also
--------
`torch.nn.functional.kl_div(...)`_
"""
def __init__(
self,
size_average=None,
reduce=None,
reduction='mean',
log_target=False,
):
"""Create a ``KDivLoss`` module.
Parameters
----------
size_average : bool, optional
**True** to set the ``reduction`` to *'mean'*.
reduce : bool, optional
**True** to set the ``reduction`` to *'sum'* or *'mean'*.
reduction : {'none', 'batchmean', 'mean', 'sum'}, optional
The reduce method.
log_target : bool, optional, default=False
The flag indicating whether ``target`` is passed in log space.
"""
super(KLDivLoss, self).__init__(size_average, reduce, reduction)
self.log_target = log_target
def forward(self, input, target):
return F.kl_div(
input, target,
reduction=self.reduction,
log_target=self.log_target,
)
class L1Loss(_Loss):
r"""Compute the element-wise absolute value difference.
......