Implement softmax kernels via warp reduce

Summary: This commit adds extra CUDA softmax kernels using warp reduce. Warp reduce leads to better performance when dimension <= 256, which is preferred for the recent vision transformers.

Implement softmax kernels via warp reduce
Summary: This commit adds extra CUDA softmax kernels using warp reduce. Warp reduce leads to better performance when dimension <= 256, which is preferred for the recent vision transformers.
Ting PAN
Commit 654febe3 authored Jun 26, 2021 by Ting PAN
Showing with 535 additions and 231 deletions
dali/_api/ops/__init__.py
dali/core/ops/image_ops.py
dali/core/ops/math_ops.py
docs/api/cc/dragon/core/Tensor.rst
docs/api/python/dali/ops.rst
docs/api/python/dali/ops/Normalize.rst
dragon/core/context_cuda.h
dragon/core/memory.h
dragon/core/operator.cc
dragon/core/tensor.h
dragon/kernels/activation/softmax_op_kernel.cu
dragon/modules/python/cuda.h
dragon/modules/python/tensor.h
dragon/operators/activation/softmax_op.h
dragon/operators/activation/softmax_op_cudnn.cc
dragon/operators/array/concat_op.cc
dragon/operators/array/stack_op.cc
dragon/python/core/device/cuda.py
dragon/utils/conversions.h
dragon/utils/device/common_cub.h
--- a/dali/_api/ops/__init__.py
+++ b/dali/_api/ops/__init__.py
@@ -34,6 +34,7 @@ from dragon.vm.dali.core.ops.image_ops import RandomResizedCrop
 from dragon.vm.dali.core.ops.image_ops import Resize
 from dragon.vm.dali.core.ops.image_ops import Rotate
 from dragon.vm.dali.core.ops.image_ops import WarpAffine
+from dragon.vm.dali.core.ops.math_ops import Normalize
 from dragon.vm.dali.core.ops.random_ops import CoinFlip
 from dragon.vm.dali.core.ops.random_ops import Uniform
 from dragon.vm.dali.core.ops.reader_ops import KPLRecordReader

--- a/dali/core/ops/image_ops.py
+++ b/dali/core/ops/image_ops.py
@@ -118,13 +118,13 @@ class CropMirrorNormalize(object):
        # (H, W) for 2d input
        # (D, H, W) for 3d input
        crop=(224, 224),
-        # Historical values to normalize input
-        mean=(102., 115., 122.),
-        std=(1., 1., 1.),
+        # Historical BGR values to normalize input
+        mean=(103.53, 116.28, 123.675),
+        std=(57.375, 57.12, 58.395),
        # Or ``float16`` for fp16 training
        dtype='float32',
-        # Or ``NHWC``
-        output_layout='NCHW'
+        # Or ``HWC``
+        output_layout='CHW',
    )
    y = cmn(inputs['x'], mirror=flip_rng())
    ```
@@ -138,7 +138,7 @@ class CropMirrorNormalize(object):
        mean=0.,
        std=1.,
        dtype='float32',
-        output_layout='NCHW',
+        output_layout='CHW',
        **kwargs
    ):
        """Create a ``CropMirrorNormalize`` operator.
@@ -153,10 +153,10 @@ class CropMirrorNormalize(object):
            The values to subtract.
        std : Union[float, Sequence[float]], optional
            The values to divide after subtraction.
-        dtype : {'float16', 'float32'}, optional
-            The data type of output.
-        output_layout : {'NCHW', 'NHWC'}, optional
-            The data format of output.
+        dtype : str, optional, default='float32'
+            The output data type.
+        output_layout : str, optional
+            The output data layout.

        Returns
        -------
@@ -167,7 +167,7 @@ class CropMirrorNormalize(object):
        if isinstance(dtype, six.string_types):
            dtype = getattr(types, dtype.upper())
        if isinstance(output_layout, six.string_types):
-            output_layout = getattr(types, output_layout.upper())
+            output_layout = output_layout.upper()
        return ops.CropMirrorNormalize(
            crop=crop,
            mirror=mirror,

--- a/dali/core/ops/math_ops.py
+++ b/dali/core/ops/math_ops.py
+# ------------------------------------------------------------
+# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
+#
+# Licensed under the BSD 2-Clause License.
+# You should have received a copy of the BSD 2-Clause License
+# along with the software. If not, See,
+#
+#     <https://opensource.org/licenses/BSD-2-Clause>
+#
+# ------------------------------------------------------------
+"""Math ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+try:
+    from nvidia.dali import ops
+except ImportError:
+    from dragon.core.util import deprecation
+    ops = deprecation.not_installed('nvidia.dali')
+
+from dragon.core.util import six
+from dragon.vm.dali.core.framework import context
+from dragon.vm.dali.core.framework import types
+
+
+class Normalize(object):
+    """Normalize input.
+
+    Examples:
+
+    ```python
+    norm = dali.ops.Normalize(
+        # Batch normalization case of HWC layout
+        axes=(0, 1), batch=True, epsilon=1e-5,
+    )
+    y = norm(inputs['x'])
+    ```
+
+    """
+
+    def __new__(
+        cls,
+        axes=(0, 1),
+        mean=None,
+        stddev=None,
+        scale=1.0,
+        shift=0.0,
+        batch=False,
+        epsilon=0,
+        dtype='float32',
+        **kwargs
+    ):
+        """Create a ``Normalize`` operator.
+
+        Parameters
+        ----------
+        axes : Sequence[int], optional
+            The axes to normalize.
+        mean : float, optional
+            The value to subtract.
+        stddev : float, optional
+            The value to divide after subtraction.
+        scale : float, optional, default=1.0
+            The scale factor after normalization.
+        shift : float, optional, default=0.0
+            The shift factor after normalization.
+        batch : bool, optional, default=False
+            Whether to compute mean and stddev across the batch.
+        epsilon : float, optional, default=0
+            The value added to the computed variance.
+        dtype : str, optional, default='float32'
+            The output data type.
+
+        Returns
+        -------
+        nvidia.dali.ops.Normalize
+            The operator.
+
+        """
+        if isinstance(dtype, six.string_types):
+            dtype = getattr(types, dtype.upper())
+        return ops.Normalize(
+            axes=axes,
+            mean=mean,
+            stddev=stddev,
+            scale=scale,
+            shift=shift,
+            batch=batch,
+            epsilon=epsilon,
+            dtype=dtype,
+            device=context.get_device_type(),
+            **kwargs
+        )
--- a/docs/api/cc/dragon/core/Tensor.rst
+++ b/docs/api/cc/dragon/core/Tensor.rst
@@ -17,7 +17,7 @@ Public Functions

 CopyFrom
 ########
-.. doxygenfunction:: dragon::Tensor::CopyFrom(const Tensor &other, Context *ctx)
+.. doxygenfunction:: dragon::Tensor::CopyFrom(Tensor &other, Context *ctx)

 CopyFrom
 ########
@@ -39,6 +39,10 @@ IsType
 ######
 .. doxygenfunction:: dragon::Tensor::IsType

+MapFrom
+#######
+.. doxygenfunction:: dragon::Tensor::MapFrom
+
 Reset
 #####
 .. doxygenfunction:: dragon::Tensor::Reset
@@ -51,10 +55,6 @@ ReshapeLike
 ###########
 .. doxygenfunction:: dragon::Tensor::ReshapeLike

-Share
-#####
-.. doxygenfunction:: dragon::Tensor::Share
-
 axis
 ####
 .. doxygenfunction:: dragon::Tensor::axis
@@ -135,10 +135,6 @@ raw_mutable_data
 ################
 .. doxygenfunction:: dragon::Tensor::raw_mutable_data()

-raw_mutable_data
-################
-.. doxygenfunction:: dragon::Tensor::raw_mutable_data(const TypeMeta &meta)
-
 size
 ####
 .. doxygenfunction:: dragon::Tensor::size

--- a/docs/api/python/dali/ops.rst
+++ b/docs/api/python/dali/ops.rst
@@ -45,6 +45,9 @@ vm.dali.ops
  `class ImageDecoderRandomCrop <ops/ImageDecoderRandomCrop.html>`_
  : Decode image and return a random crop.

+  `class Normalize <ops/Normalize.html>`_
+  : Normalize input.
+
  `class Pad <ops/Pad.html>`_
  : Pad input to have the same dimensions.

@@ -97,6 +100,7 @@ vm.dali.ops
  ops/Hsv
  ops/ImageDecoder
  ops/ImageDecoderRandomCrop
+  ops/Normalize
  ops/Pad
  ops/Paste
  ops/RandomBBoxCrop

--- a/docs/api/python/dali/ops/Normalize.rst
+++ b/docs/api/python/dali/ops/Normalize.rst
+Normalize
+=========
+
+.. autoclass:: dragon.vm.dali.ops.Normalize
+
+__new__
+--------
+.. automethod:: dragon.vm.dali.ops.Normalize.__new__
+
+.. raw:: html
+
+  <style>
+    h1:before {
+      content: "dali.ops.";
+      color: #103d3e;
+    }
+  </style>
--- a/dragon/core/context_cuda.h
+++ b/dragon/core/context_cuda.h
@@ -143,6 +143,9 @@ class CUDAObjects {
 #ifdef USE_CUDNN
  /*! \brief The cached cuDNN handles of each device */
  vector<cudnnHandle_t> cudnn_handles_[CUDA_MAX_DEVICES];
+
+  /*! \brief The disabled cuDNN operators */
+  Set<string> cudnn_disabled_ops_;
 #endif

 #ifdef USE_NCCL

--- a/dragon/core/memory.h
+++ b/dragon/core/memory.h
@@ -75,11 +75,6 @@ class DRAGON_API UnifiedMemory {
    return size_t(0);
  }

-  /*! \brief Return the number of batch chunks */
-  size_t num_chunks() const {
-    return num_chunks_;
-  }
-
  /*! \brief Return the storage order */
  StorageOrder order() const {
    return order_;
@@ -105,11 +100,6 @@ class DRAGON_API UnifiedMemory {
  /*! \brief Return the mutable cuda data */
  void* mutable_cuda_data(size_t size = 0);

-  /*! \brief Set the number of data chunks */
-  void set_num_chunks(size_t num_chunks) {
-    num_chunks_ = num_chunks;
-  }
-
  /*! \brief Set the storage order */
  void set_order(StorageOrder order) {
    order_ = order;
@@ -125,8 +115,8 @@ class DRAGON_API UnifiedMemory {
  /*! \brief The data state */
  State state_ = UNINITIALIZED;

-  /*! \brief The size and number of chunks */
-  size_t size_ = 0, num_chunks_ = 1;
+  /*! \brief The data size */
+  size_t size_ = 0;

  /*! \brief The type meta */
  TypeMeta meta_;
@@ -140,12 +130,12 @@ class DRAGON_API UnifiedMemory {
  /*! \brief The cpu data pointer */
  void* cpu_ptr_ = nullptr;

-  /*! \brief The ownership of cpu data pointer */
-  bool own_cpu_ptr_ = true;
-
  /*! \brief The cuda data pointer */
  void* cuda_ptr_ = nullptr;

+  /*! \brief The ownership of cpu data pointer */
+  bool own_cpu_ptr_ = true;
+
  /*! \brief The ownership of cuda data pointer */
  bool own_cuda_ptr_ = true;


--- a/dragon/core/operator.cc
+++ b/dragon/core/operator.cc
@@ -60,15 +60,13 @@ Tensor* OperatorBase::Output(int i, const vec32_t& inputs) {
  auto* Y = Output(i);
  if (i < output_aliases_.size()) {
    for (auto j : inputs) {
-      const auto& X = Input(j);
+      auto& X = Input(j);
      if (output_aliases_[i].count(X.name())) {
-        Output(i)->ReshapeLike(X)->Share(X.memory());
-        return Y;
+        return Y->ReshapeLike(X)->MapFrom(&X);
      }
    }
  }
-  Y->Share(nullptr);
-  return Y;
+  return Y->MapFrom(nullptr);
 }

 Tensor* OperatorBase::Buffer(const string& name) {
@@ -85,7 +83,8 @@ OperatorBase* OperatorBase::New(const OperatorDef& def, Workspace* ws) {
    case PROTO_CUDA:
 #ifdef USE_CUDNN
      if (CUDNNOperatorRegistry()->Has(op_type) &&
-          CUDAContext::objects().cudnn_enabled_) {
+          CUDAContext::objects().cudnn_enabled_ &&
+          !CUDAContext::objects().cudnn_disabled_ops_.count(op_type)) {
        return CUDNNOperatorRegistry()->Create(op_type, def, ws);
      }
 #endif

--- a/dragon/core/tensor.h
+++ b/dragon/core/tensor.h
@@ -79,7 +79,7 @@ class DRAGON_API Tensor {
    }
  }

-  /*! \brief Change the tensor dimensions */
+  /*! \brief Change the dimensions */
  Tensor* Reshape(const vec64_t& dims) {
    dims_ = dims;
    strides_.resize(dims.size());
@@ -91,31 +91,23 @@ class DRAGON_API Tensor {
      CHECK_GE(d, 0);
      if (d > 0) new_size *= d;
    }
-    if (capacity_ < new_size * meta_.itemsize()) {
-      if (own_memory_ptr_) {
-        memory_.reset();
-      } else {
-        mapped_memory_ = nullptr;
-        own_memory_ptr_ = true;
-      }
-      capacity_ = 0;
-    }
    size_ = new_size;
    return this;
  }

-  /*! \brief Change the tensor dimensions as the other */
+  /*! \brief Change the dimensions as the other */
  Tensor* ReshapeLike(const Tensor& other) {
    return Reshape(other.dims_);
  }

-  /*! \brief Copy memory from a tensor with context */
+  /*! \brief Copy memory from a tensor */
  template <class Context>
-  Tensor* CopyFrom(const Tensor& other, Context* ctx) {
+  Tensor* CopyFrom(Tensor& other, Context* ctx) {
    if ((void*)&other == (void*)this) return this;
    CHECK_EQ(size_, other.size_);
+    meta_ = other.meta_;
    auto* src = other.template raw_data<Context>();
-    auto* dst = raw_mutable_data<Context>(other.meta_);
+    auto* dst = raw_mutable_data<Context>();
    if (dst == src) return this;
    ctx->template MemcpyAsync<Context, Context>(nbytes(), dst, src);
    return this;
@@ -144,20 +136,22 @@ class DRAGON_API Tensor {
    }
  }

-  /*! \brief Share an external memory */
-  void Share(UnifiedMemory* memory) {
-    if (memory != nullptr) {
-      CHECK_LE(size_, memory->size())
-          << "\nShare an external memory with smaller capacity.";
-      memory_.reset();
-      capacity_ = memory->size();
+  /*! \brief Map memory from a tensor */
+  Tensor* MapFrom(Tensor* other) {
+    if (other == nullptr) {
+      mapped_memory_ = nullptr;
+      capacity_ = (memory_ != nullptr ? memory_->size() : 0);
    } else {
-      if (memory_) {
-        capacity_ = memory_->size();
+      auto* new_memory = other->memory();
+      if (new_memory != nullptr) {
+        CHECK_LE(size_, new_memory->size())
+            << "\nMap from a memory with smaller capacity.";
+        memory_.reset();
+        mapped_memory_ = new_memory;
+        capacity_ = new_memory->size();
      }
    }
-    mapped_memory_ = memory;
-    own_memory_ptr_ = (memory == nullptr);
+    return this;
  }

  /*! \brief Reset tensor to release all resources */
@@ -167,7 +161,6 @@ class DRAGON_API Tensor {
    memory_.reset();
    meta_ = TypeMeta();
    size_ = capacity_ = 0;
-    own_memory_ptr_ = true;
    mapped_memory_ = nullptr;
    if (ExternalDeleter != nullptr) {
      ExternalDeleter();
@@ -216,17 +209,17 @@ class DRAGON_API Tensor {
    return version_;
  }

-  /*! \brief Return the total number of elements */
+  /*! \brief Return the number of elements */
  size_t size() const {
    return size_;
  }

-  /*! \brief Return the memory capacity */
+  /*! \brief Return the byte length of memory */
  size_t capacity() const {
    return capacity_;
  }

-  /*! \brief Return the total number of data bytes */
+  /*! \brief Return the byte length of all elements */
  size_t nbytes() const {
    return size_ * meta_.itemsize();
  }
@@ -260,22 +253,22 @@ class DRAGON_API Tensor {
    return strides_[axis(i)];
  }

-  /*! \brief Return the tensor dimensions */
+  /*! \brief Return the dimensions */
  const vec64_t& dims() const {
    return dims_;
  }

-  /*! \brief Return the tensor strides */
+  /*! \brief Return the strides */
  const vec64_t& strides() const {
    return strides_;
  }

-  /*! \brief Return the total number of elements */
+  /*! \brief Return the number of elements counting along all axes */
  int64_t count() const {
    return (int64_t)size_;
  }

-  /*! \brief Return the number of elements counting along the given axes */
+  /*! \brief Return the number of elements counting along given axes */
  int64_t count(int64_t start, int64_t end) const {
    int64_t num = 1;
    for (int64_t i = start; i < end; i++) {
@@ -284,12 +277,12 @@ class DRAGON_API Tensor {
    return num;
  }

-  /*! \brief Return the number of elements counting from the given axis */
+  /*! \brief Return the number of elements counting from given axis */
  int64_t count(int64_t start) const {
    return count(start, ndim());
  }

-  /*! \brief Return whether the total number of elements is zero */
+  /*! \brief Return whether the number of elements is zero */
  bool empty() const {
    return size_ == 0;
  }
@@ -299,22 +292,30 @@ class DRAGON_API Tensor {
    return memory_ != nullptr || mapped_memory_ != nullptr;
  }

-  /*! \brief Return the memory pointer */
-  UnifiedMemory* memory(bool required = false, bool owned = false) const {
-    auto* ptr = own_memory_ptr_ || owned ? memory_.get() : mapped_memory_;
+  /*! \brief Return the memory */
+  UnifiedMemory* memory(bool required = false, bool owned = false) {
+    if (capacity_ < size_ * meta_.itemsize()) {
+      if (mapped_memory_ != nullptr) {
+        mapped_memory_ = nullptr;
+      } else {
+        memory_.reset();
+      }
+      capacity_ = 0;
+    }
+    auto* ptr = (owned || !mapped_memory_ ? memory_.get() : mapped_memory_);
    if (required) CHECK(ptr) << "\nAccess the empty memory.";
    return ptr;
  }

  /*! \brief Return the memory state */
-  UnifiedMemory::State memory_state() const {
+  UnifiedMemory::State memory_state() {
    return memory(true)->state();
  }

-  /*! \brief Try to return the raw const data pointer */
+  /*! \brief Return the raw data pointer */
  template <class Context>
-  const void* const_data_ptr() const {
-    auto context_type = TypeMeta::Id<Context>();
+  const void* raw_data() {
+    const auto context_type = TypeMeta::Id<Context>();
    if (context_type == TypeMeta::Id<CPUContext>()) {
      return memory(true)->cpu_data(nbytes());
    } else if (context_type == TypeMeta::Id<CUDAContext>()) {
@@ -325,14 +326,14 @@ class DRAGON_API Tensor {
    }
  }

-  /*! \brief Try to return the raw mutable data pointer */
+  /*! \brief Get the raw mutable data pointer */
  template <class Context>
-  void mutable_data_ptr(void** data_ptr) {
+  void raw_mutable_data(void** data_ptr) {
    auto* memory_ptr = memory();
    if (!memory_ptr) {
      *data_ptr = nullptr;
    } else {
-      auto context_type = TypeMeta::Id<Context>();
+      const auto context_type = TypeMeta::Id<Context>();
      if (context_type == TypeMeta::Id<CPUContext>()) {
        *data_ptr = memory_ptr->mutable_cpu_data(nbytes());
      } else if (context_type == TypeMeta::Id<CUDAContext>()) {
@@ -343,67 +344,38 @@ class DRAGON_API Tensor {
    }
  }

-  /*!
-   * \brief Return the raw mutable data pointer.
-   *
-   * If memory is not set, create to manage it with the given meta.
-   */
-  template <class Context>
-  void* raw_mutable_data(const TypeMeta& meta) {
-    void* data_ptr;
-    mutable_data_ptr<Context>(&data_ptr);
-    // Return the data pointer directly
-    if (meta_ == meta && data_ptr) return data_ptr;
-    // Create a new memory created with size and meta
-    CHECK_GT(size_, 0) << "\nInvalid tensor size.";
-    meta_ = meta;
-    capacity_ = size_ * meta.itemsize();
-    memory_.reset(new UnifiedMemory(meta_, capacity_));
-    mutable_data_ptr<Context>(&data_ptr);
-    if (meta_.ctor()) meta_.ctor()(data_ptr, size_);
-    return data_ptr;
-  }
-
  /*! \brief Return the raw mutable data pointer */
  template <class Context>
  void* raw_mutable_data() {
    CHECK_NE(meta_.id(), 0) << "\nTensor(" << name_ << "): unknown type, "
                            << "or does not have a type.";
-    return raw_mutable_data<Context>(meta_);
-  }
-
-  /*! \brief Return the raw const data pointer */
-  template <class Context>
-  const void* raw_data() const {
-    return const_data_ptr<Context>();
-  }
-
-  /*! \brief Return the typed mutable data pointer */
-  template <typename T, class Context>
-  T* mutable_data() {
    void* data_ptr;
-    mutable_data_ptr<Context>(&data_ptr);
-    if (data_ptr) {
-      auto meta = TypeMeta::Make<T>();
-      if (meta_ == meta) {
-        return static_cast<T*>(data_ptr);
-      } else if (capacity_ >= size_ * meta.itemsize()) {
-        meta_ = meta;
-        return static_cast<T*>(data_ptr);
-      }
-    }
-    return static_cast<T*>(raw_mutable_data<Context>(TypeMeta::Make<T>()));
+    raw_mutable_data<Context>(&data_ptr);
+    if (data_ptr) return data_ptr;
+    CHECK_GT(size_, 0) << "\nInvalid tensor size.";
+    capacity_ = size_ * meta_.itemsize();
+    memory_.reset(new UnifiedMemory(meta_, capacity_));
+    raw_mutable_data<Context>(&data_ptr);
+    if (meta_.ctor()) meta_.ctor()(data_ptr, size_);
+    return data_ptr;
  }

-  /*! \brief Return the typed const data pointer */
+  /*! \brief Return the typed data pointer */
  template <typename T, class Context>
-  const T* data() const {
+  const T* data() {
    CHECK(meta_.Match<T>()) << "\nThe type of Tensor(" << name() << ") is "
                            << dtypes::to_string(meta_) << ", while requesting "
                            << dtypes::to_string(TypeMeta::Make<T>()) << ".";
    return static_cast<const T*>(raw_data<Context>());
  }

+  /*! \brief Return the typed mutable data pointer */
+  template <typename T, class Context>
+  T* mutable_data() {
+    meta_ = TypeMeta::Make<T>();
+    return static_cast<T*>(raw_mutable_data<Context>());
+  }
+
  /*! \brief Set the tensor version */
  void set_version(int version) {
    version_ = version;
@@ -415,12 +387,14 @@ class DRAGON_API Tensor {
    return this;
  }

-  /*! \brief Set to manage the memory */
-  void set_memory(UnifiedMemory* memory_ptr) {
-    if (memory_ptr != memory_.get()) {
-      memory_.reset(memory_ptr);
+  /*! \brief Set the managed memory */
+  void set_memory(UnifiedMemory* memory) {
+    if (memory != nullptr) {
+      if (memory != memory_.get()) {
+        memory_.reset(memory);
+      }
+      capacity_ = memory->size();
    }
-    capacity_ = memory_ptr->size();
  }

 private:
@@ -430,14 +404,20 @@ class DRAGON_API Tensor {
  /*! \brief The type meta */
  TypeMeta meta_;

-  /*! \brief The size and capacity */
-  size_t size_ = 0, capacity_ = 0;
+  /*! \brief The number of elements */
+  size_t size_ = 0;
+
+  /*! \brief The byte length of memory */
+  size_t capacity_ = 0;

  /*! \brief The tensor version */
  int version_ = -1;

-  /*! \brief The dimensions and strides */
-  vec64_t dims_, strides_;
+  /*! \brief The dimensions */
+  vec64_t dims_;
+
+  /*! \brief The strides */
+  vec64_t strides_;

  /*! \brief The managed memory */
  unique_ptr<UnifiedMemory> memory_;
@@ -445,9 +425,6 @@ class DRAGON_API Tensor {
  /*! \brief The mapped memory */
  UnifiedMemory* mapped_memory_ = nullptr;

-  /*! \brief The ownership of memory pointer */
-  bool own_memory_ptr_ = true;
-
  DISABLE_COPY_AND_ASSIGN(Tensor);
 };


--- a/dragon/kernels/activation/softmax_op_kernel.cu
+++ b/dragon/kernels/activation/softmax_op_kernel.cu
@@ -11,33 +11,101 @@ namespace kernels {

 namespace {

+#define WARP_ITEMS 16
+#define BLOCK_THREADS 1024
 #define LDG(x, i) convert::To<AccT>(__ldg(x + i))

 template <typename T, typename AccT>
-__global__ void
-_Softmax(const int NxS, const int S, const int C, const T* x, T* y) {
-  __shared__ AccT block_max, block_sum;
-  __shared__ typename BlockReduce<AccT>::TempStorage storage;
+__global__ void _SoftmaxViaWarpReduce(
+    const int NxS,
+    const int S,
+    const int C,
+    const T* x,
+    T* y) {
+  const int warp_idx = blockIdx.x * blockDim.y + threadIdx.y;
+  const int num_warps = gridDim.x * blockDim.y;
+  for (int i = warp_idx; i < NxS; i += num_warps) {
+    const int offset = (i / S) * C * S + (i % S);
+    auto* offset_x = x + offset;
+    auto* offset_y = y + offset;
+
+    AccT val = AccT(-FLT_MAX);
+    for (int j = threadIdx.x; j < C; j += CUDA_WARP_SIZE) {
+      val = max(val, LDG(offset_x, j * S));
+    }
+    const AccT warp_max = WarpAllReduce<AccT, cub::Max>(val);
+
+    val = AccT(0);
+    for (int j = threadIdx.x; j < C; j += CUDA_WARP_SIZE) {
+      val += exp(LDG(offset_x, j * S) - warp_max);
+    }
+    const AccT warp_sum = WarpAllReduce<AccT, cub::Sum>(val);
+
+    for (int j = threadIdx.x; j < C; j += CUDA_WARP_SIZE) {
+      const int k = j * S;
+      val = exp(LDG(offset_x, k) - warp_max);
+      offset_y[k] = convert::To<T>(val / warp_sum);
+    }
+  }
+}
+
+template <typename T, typename AccT>
+__global__ void _LogSoftmaxViaWarpReduce(
+    const int NxS,
+    const int S,
+    const int C,
+    const T* x,
+    T* y) {
+  const int warp_idx = blockIdx.x * blockDim.y + threadIdx.y;
+  const int num_warps = gridDim.x * blockDim.y;
+  for (int i = warp_idx; i < NxS; i += num_warps) {
+    const int offset = (i / S) * C * S + (i % S);
+    auto* offset_x = x + offset;
+    auto* offset_y = y + offset;
+
+    AccT val = AccT(-FLT_MAX);
+    for (int j = threadIdx.x; j < C; j += CUDA_WARP_SIZE) {
+      val = max(val, LDG(offset_x, j * S));
+    }
+    const AccT warp_max = WarpAllReduce<AccT, cub::Max>(val);
+
+    val = AccT(0);
+    for (int j = threadIdx.x; j < C; j += CUDA_WARP_SIZE) {
+      val += exp(LDG(offset_x, j * S) - warp_max);
+    }
+    const AccT warp_sum = WarpAllReduce<AccT, cub::Sum>(val);
+
+    for (int j = threadIdx.x; j < C; j += CUDA_WARP_SIZE) {
+      const int k = j * S;
+      val = LDG(offset_x, k) - warp_max;
+      offset_y[k] = convert::To<T>(val - log(warp_sum));
+    }
+  }
+}
+
+template <typename T, typename AccT>
+__global__ void _SoftmaxViaBlockReduce(
+    const int NxS,
+    const int S,
+    const int C,
+    const T* x,
+    T* y) {
  CUDA_2D_KERNEL_LOOP1(i, NxS) {
    const int offset = (i / S) * C * S + (i % S);
    auto* offset_x = x + offset;
    auto* offset_y = y + offset;

-    AccT val = convert::To<AccT>(__ldg(offset_x));
+    AccT val = AccT(-FLT_MAX);
    CUDA_2D_KERNEL_LOOP2(j, C) {
      val = max(val, LDG(offset_x, j * S));
    }
-    val = BlockReduce<AccT>(storage).Reduce(val, cub::Max());
-    if (threadIdx.x == 0) block_max = val;
-    __syncthreads();
+    const AccT block_max = BlockAllReduce<AccT, cub::Max, BLOCK_THREADS>(val);

    val = AccT(0);
    CUDA_2D_KERNEL_LOOP2(j, C) {
      val += exp(LDG(offset_x, j * S) - block_max);
    }
-    val = BlockReduce<AccT>(storage).Sum(val);
-    if (threadIdx.x == 0) block_sum = val;
-    __syncthreads();
+    const AccT block_sum = BlockAllReduce<AccT, cub::Sum, BLOCK_THREADS>(val);

    CUDA_2D_KERNEL_LOOP2(j, C) {
      const int k = j * S;
@@ -48,30 +116,28 @@ _Softmax(const int NxS, const int S, const int C, const T* x, T* y) {
 }

 template <typename T, typename AccT>
-__global__ void
-_LogSoftmax(const int NxS, const int S, const int C, const T* x, T* y) {
-  __shared__ AccT block_max, block_sum;
-  __shared__ typename BlockReduce<AccT>::TempStorage storage;
+__global__ void _LogSoftmaxViaBlockReduce(
+    const int NxS,
+    const int S,
+    const int C,
+    const T* x,
+    T* y) {
  CUDA_2D_KERNEL_LOOP1(i, NxS) {
    const int offset = (i / S) * C * S + (i % S);
    auto* offset_x = x + offset;
    auto* offset_y = y + offset;

-    AccT val = convert::To<AccT>(__ldg(offset_x));
+    AccT val = AccT(-FLT_MAX);
    CUDA_2D_KERNEL_LOOP2(j, C) {
      val = max(val, LDG(offset_x, j * S));
    }
-    val = BlockReduce<AccT>(storage).Reduce(val, cub::Max());
-    if (threadIdx.x == 0) block_max = val;
-    __syncthreads();
+    const AccT block_max = BlockAllReduce<AccT, cub::Max, BLOCK_THREADS>(val);

    val = AccT(0);
    CUDA_2D_KERNEL_LOOP2(j, C) {
      val += exp(LDG(offset_x, j * S) - block_max);
    }
-    val = BlockReduce<AccT>(storage).Sum(val);
-    if (threadIdx.x == 0) block_sum = val;
-    __syncthreads();
+    const AccT block_sum = BlockAllReduce<AccT, cub::Sum, BLOCK_THREADS>(val);

    CUDA_2D_KERNEL_LOOP2(j, C) {
      const int k = j * S;
@@ -82,15 +148,74 @@ _LogSoftmax(const int NxS, const int S, const int C, const T* x, T* y) {
 }

 template <typename T, typename AccT>
-__global__ void _SoftmaxGrad(
+__global__ void _SoftmaxGradViaWarpReduce(
+    const int NxS,
+    const int S,
+    const int C,
+    const T* dy,
+    const T* y,
+    T* dx) {
+  const int warp_idx = blockIdx.x * blockDim.y + threadIdx.y;
+  const int num_warps = gridDim.x * blockDim.y;
+  for (int i = warp_idx; i < NxS; i += num_warps) {
+    const int offset = (i / S) * C * S + (i % S);
+    auto* offset_dy = dy + offset;
+    auto* offset_y = y + offset;
+    auto* offset_dx = dx + offset;
+
+    AccT val = AccT(0);
+    for (int j = threadIdx.x; j < C; j += CUDA_WARP_SIZE) {
+      const int k = j * S;
+      val += LDG(offset_dy, k) * LDG(offset_y, k);
+    }
+    const AccT warp_sum = WarpAllReduce<AccT, cub::Sum>(val);
+
+    for (int j = threadIdx.x; j < C; j += CUDA_WARP_SIZE) {
+      const int k = j * S;
+      val = LDG(offset_dy, k) - warp_sum;
+      offset_dx[k] = convert::To<T>(val * LDG(offset_y, k));
+    }
+  }
+}
+
+template <typename T, typename AccT>
+__global__ void _LogSoftmaxGradViaWarpReduce(
+    const int NxS,
+    const int S,
+    const int C,
+    const T* dy,
+    const T* y,
+    T* dx) {
+  const int warp_idx = blockIdx.x * blockDim.y + threadIdx.y;
+  const int num_warps = gridDim.x * blockDim.y;
+  for (int i = warp_idx; i < NxS; i += num_warps) {
+    const int offset = (i / S) * C * S + (i % S);
+    auto* offset_dy = dy + offset;
+    auto* offset_y = y + offset;
+    auto* offset_dx = dx + offset;
+
+    AccT val = AccT(0);
+    for (int j = threadIdx.x; j < C; j += CUDA_WARP_SIZE) {
+      val += LDG(offset_dy, j * S);
+    }
+    const AccT warp_sum = WarpAllReduce<AccT, cub::Sum>(val);
+
+    for (int j = threadIdx.x; j < C; j += CUDA_WARP_SIZE) {
+      const int k = j * S;
+      val = exp(convert::To<AccT>(offset_y[k])) * warp_sum;
+      offset_dx[k] = convert::To<T>(LDG(offset_dy, k) - val);
+    }
+  }
+}
+
+template <typename T, typename AccT>
+__global__ void _SoftmaxGradViaBlockReduce(
    const int NxS,
    const int S,
    const int C,
    const T* dy,
    const T* y,
    T* dx) {
-  __shared__ AccT block_val;
-  __shared__ typename BlockReduce<AccT>::TempStorage storage;
  CUDA_2D_KERNEL_LOOP1(i, NxS) {
    const int offset = (i / S) * C * S + (i % S);
    auto* offset_dy = dy + offset;
@@ -102,28 +227,24 @@ __global__ void _SoftmaxGrad(
      const int k = j * S;
      val += LDG(offset_dy, k) * LDG(offset_y, k);
    }
-    val = BlockReduce<AccT>(storage).Sum(val);
-    if (threadIdx.x == 0) block_val = val;
-    __syncthreads();
+    const AccT block_sum = BlockAllReduce<AccT, cub::Sum, BLOCK_THREADS>(val);

    CUDA_2D_KERNEL_LOOP2(j, C) {
      const int k = j * S;
-      val = LDG(offset_dy, k) - block_val;
+      val = LDG(offset_dy, k) - block_sum;
      offset_dx[k] = convert::To<T>(val * LDG(offset_y, k));
    }
  }
 }

 template <typename T, typename AccT>
-__global__ void _LogSoftmaxGrad(
+__global__ void _LogSoftmaxGradViaBlockReduce(
    const int NxS,
    const int S,
    const int C,
    const T* dy,
    const T* y,
    T* dx) {
-  __shared__ AccT block_val;
-  __shared__ typename BlockReduce<AccT>::TempStorage storage;
  CUDA_2D_KERNEL_LOOP1(i, NxS) {
    const int offset = (i / S) * C * S + (i % S);
    auto* offset_dy = dy + offset;
@@ -134,13 +255,11 @@ __global__ void _LogSoftmaxGrad(
    CUDA_2D_KERNEL_LOOP2(j, C) {
      val += LDG(offset_dy, j * S);
    }
-    val = BlockReduce<AccT>(storage).Sum(val);
-    if (threadIdx.x == 0) block_val = val;
-    __syncthreads();
+    const AccT block_sum = BlockAllReduce<AccT, cub::Sum, BLOCK_THREADS>(val);

    CUDA_2D_KERNEL_LOOP2(j, C) {
      const int k = j * S;
-      val = exp(convert::To<AccT>(offset_y[k])) * block_val;
+      val = exp(convert::To<AccT>(offset_y[k])) * block_sum;
      offset_dx[k] = convert::To<T>(LDG(offset_dy, k) - val);
    }
  }
@@ -152,23 +271,41 @@ __global__ void _LogSoftmaxGrad(

 /* ------------------- Launcher Separator ------------------- */

-#define DEFINE_KERNEL_LAUNCHER(name, T)                               \
-  template <>                                                         \
-  void name<T, CUDAContext>(                                          \
-      const int N,                                                    \
-      const int S,                                                    \
-      const int C,                                                    \
-      const T* x,                                                     \
-      T* y,                                                           \
-      CUDAContext* ctx) {                                             \
-    const auto NxS = N * S;                                           \
-    _##name<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \
-        <<<NxS, CUDA_THREADS, 0, ctx->cuda_stream()>>>(               \
-            NxS,                                                      \
-            S,                                                        \
-            C,                                                        \
-            reinterpret_cast<const math::ScalarType<T>::type*>(x),    \
-            reinterpret_cast<math::ScalarType<T>::type*>(y));         \
+#define DEFINE_KERNEL_LAUNCHER(name, T)                              \
+  template <>                                                        \
+  void name<T, CUDAContext>(                                         \
+      const int N,                                                   \
+      const int S,                                                   \
+      const int C,                                                   \
+      const T* x,                                                    \
+      T* y,                                                          \
+      CUDAContext* ctx) {                                            \
+    const auto NxS = N * S;                                          \
+    if (C <= 1024) {                                                 \
+      const auto nblocks = math::utils::DivUp<int>(NxS, WARP_ITEMS); \
+      _##name##ViaWarpReduce<                                        \
+          math::ScalarType<T>::type,                                 \
+          math::AccmulatorType<T>::type>                             \
+          <<<nblocks,                                                \
+             dim3(CUDA_WARP_SIZE, WARP_ITEMS),                       \
+             0,                                                      \
+             ctx->cuda_stream()>>>(                                  \
+              NxS,                                                   \
+              S,                                                     \
+              C,                                                     \
+              reinterpret_cast<const math::ScalarType<T>::type*>(x), \
+              reinterpret_cast<math::ScalarType<T>::type*>(y));      \
+      return;                                                        \
+    }                                                                \
+    _##name##ViaBlockReduce<                                         \
+        math::ScalarType<T>::type,                                   \
+        math::AccmulatorType<T>::type>                               \
+        <<<NxS, BLOCK_THREADS, 0, ctx->cuda_stream()>>>(             \
+            NxS,                                                     \
+            S,                                                       \
+            C,                                                       \
+            reinterpret_cast<const math::ScalarType<T>::type*>(x),   \
+            reinterpret_cast<math::ScalarType<T>::type*>(y));        \
  }

 #define DEFINE_GRAD_KERNEL_LAUNCHER(name, T)                          \
@@ -182,8 +319,27 @@ __global__ void _LogSoftmaxGrad(
      T* dx,                                                          \
      CUDAContext* ctx) {                                             \
    const auto NxS = N * S;                                           \
-    _##name<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \
-        <<<NxS, CUDA_THREADS, 0, ctx->cuda_stream()>>>(               \
+    if (C <= 1024) {                                                  \
+      const auto nblocks = math::utils::DivUp<int>(NxS, WARP_ITEMS);  \
+      _##name##ViaWarpReduce<                                         \
+          math::ScalarType<T>::type,                                  \
+          math::AccmulatorType<T>::type>                              \
+          <<<nblocks,                                                 \
+             dim3(CUDA_WARP_SIZE, WARP_ITEMS),                        \
+             0,                                                       \
+             ctx->cuda_stream()>>>(                                   \
+              NxS,                                                    \
+              S,                                                      \
+              C,                                                      \
+              reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
+              reinterpret_cast<const math::ScalarType<T>::type*>(y),  \
+              reinterpret_cast<math::ScalarType<T>::type*>(dx));      \
+      return;                                                         \
+    }                                                                 \
+    _##name##ViaBlockReduce<                                          \
+        math::ScalarType<T>::type,                                    \
+        math::AccmulatorType<T>::type>                                \
+        <<<NxS, BLOCK_THREADS, 0, ctx->cuda_stream()>>>(              \
            NxS,                                                      \
            S,                                                        \
            C,                                                        \
@@ -206,6 +362,8 @@ DEFINE_GRAD_KERNEL_LAUNCHER(LogSoftmaxGrad, float);
 DEFINE_GRAD_KERNEL_LAUNCHER(LogSoftmaxGrad, double);
 #undef DEFINE_KERNEL_LAUNCHER
 #undef DEFINE_GRAD_KERNEL_LAUNCHER
+#undef WARP_ITEMS
+#undef BLOCK_THREADS

 } // namespace kernels


--- a/dragon/modules/python/cuda.h
+++ b/dragon/modules/python/cuda.h
@@ -91,16 +91,24 @@ void RegisterModule_cuda(py::module& m) {
 #endif
  });

-  /*! \brief Activate the CuDNN engine */
+  /*! \brief Enable using the cuDNN library */
  m.def(
      "cudaEnableDNN",
-      [](bool enabled, bool deterministic, bool benchmark, bool allow_tf32) {
+      [](bool enabled,
+         bool deterministic,
+         bool benchmark,
+         bool allow_tf32,
+         const vector<string>& disabled_ops) {
 #ifdef USE_CUDA
        auto& cuda_objects = CUDAContext::objects();
        cuda_objects.cudnn_enabled_ = enabled;
        cuda_objects.cudnn_deterministic_ = deterministic;
        cuda_objects.cudnn_benchmark_ = benchmark;
        cuda_objects.cudnn_allow_tf32_ = allow_tf32;
+        cuda_objects.cudnn_disabled_ops_.clear();
+        for (const auto& op_type : disabled_ops) {
+          cuda_objects.cudnn_disabled_ops_.insert(op_type);
+        }
 #endif
      });


--- a/dragon/modules/python/tensor.h
+++ b/dragon/modules/python/tensor.h
@@ -29,15 +29,22 @@ void RegisterModule_tensor(py::module& m) {
      /*! \brief Return the number of dimensions */
      .def_property_readonly("ndim", &Tensor::ndim)

-      /*! \brief Return all the dimensions */
+      /*! \brief Return the dimensions */
      .def_property_readonly("dims", &Tensor::dims)

-      /*! \brief Return the total number of elements */
+      /*! \brief Return the number of elements */
      .def_property_readonly("size", &Tensor::size)

-      /*! \brief Return the total number of bytes */
+      /*! \brief Return the byte length of one element */
+      .def_property_readonly(
+          "itemsize", [](Tensor* self) { return self->meta().itemsize(); })
+
+      /*! \brief Return the byte length of all elements */
      .def_property_readonly("nbytes", &Tensor::nbytes)

+      /*! \brief Return the byte length of allocated memory */
+      .def_property_readonly("capacity", &Tensor::capacity)
+
      /*! \brief Return the data type */
      .def_property_readonly(
          "dtype",

--- a/dragon/operators/activation/softmax_op.h
+++ b/dragon/operators/activation/softmax_op.h
@@ -60,7 +60,9 @@ class CuDNNSoftmaxOp final : public SoftmaxOp<Context> {
    CuDNNDestroyTensorDesc(&input_desc_);
  }

-  void RunOnDevice() override;
+  void RunOnDevice() override {
+    DispatchHelper<dtypes::Floating>::Call(this, Input(0));
+  }

  template <typename T>
  void DoRunWithType();
@@ -82,7 +84,9 @@ class CuDNNSoftmaxGradientOp final : public SoftmaxGradientOp<Context> {
    CuDNNDestroyTensorDesc(&input_desc_);
  }

-  void RunOnDevice() override;
+  void RunOnDevice() override {
+    DispatchHelper<dtypes::Floating>::Call(this, Input(0));
+  }

  template <typename T>
  void DoRunWithType();

--- a/dragon/operators/activation/softmax_op_cudnn.cc
+++ b/dragon/operators/activation/softmax_op_cudnn.cc
 #ifdef USE_CUDNN

 #include "dragon/operators/activation/softmax_op.h"
+#include "dragon/utils/op_kernels.h"

 namespace dragon {

@@ -9,8 +10,19 @@ template <typename T>
 void CuDNNSoftmaxOp<Context>::DoRunWithType() {
  auto &X = Input(0), *Y = Output(0, {0});
  GET_OP_AXIS_ARG(axis, X.ndim(), -1);
-  auto S = X.count(axis + 1);
-  CuDNNSetTensorDesc<T>(&input_desc_, {X.count(0, axis), X.dim(axis), S, 1});
+  const auto C = X.dim(axis);
+  const auto N = X.count(0, axis), S = X.count(axis + 1);
+  if (C < 384 && S == 1) {
+    kernels::Softmax(
+        N,
+        S,
+        C,
+        X.template data<T, Context>(),
+        Y->ReshapeLike(X)->template mutable_data<T, Context>(),
+        ctx());
+    return;
+  }
+  CuDNNSetTensorDesc<T>(&input_desc_, {N, C, S, 1});
  CUDNN_CHECK(cudnnSoftmaxForward(
      ctx()->cudnn_handle(),
      CUDNN_SOFTMAX_ACCURATE,
@@ -24,17 +36,24 @@ void CuDNNSoftmaxOp<Context>::DoRunWithType() {
 }

 template <class Context>
-void CuDNNSoftmaxOp<Context>::RunOnDevice() {
-  DispatchHelper<dtypes::Floating>::Call(this, Input(0));
-}
-
-template <class Context>
 template <typename T>
 void CuDNNSoftmaxGradientOp<Context>::DoRunWithType() {
  auto &Y = Input(0), &dY = Input(1), *dX = Output(0);
  GET_OP_AXIS_ARG(axis, Y.ndim(), -1);
-  auto S = Y.count(axis + 1);
-  CuDNNSetTensorDesc<T>(&input_desc_, {Y.count(0, axis), Y.dim(axis), S, 1});
+  const auto C = Y.dim(axis);
+  const auto N = Y.count(0, axis), S = Y.count(axis + 1);
+  if (C < 256 && S == 1) {
+    kernels::SoftmaxGrad(
+        Y.count(0, axis),
+        Y.count(axis + 1),
+        Y.dim(axis),
+        dY.template data<T, Context>(),
+        Y.template data<T, Context>(),
+        dX->ReshapeLike(Y)->template mutable_data<T, Context>(),
+        ctx());
+    return;
+  }
+  CuDNNSetTensorDesc<T>(&input_desc_, {N, C, S, 1});
  CUDNN_CHECK(cudnnSoftmaxBackward(
      ctx()->cudnn_handle(),
      CUDNN_SOFTMAX_ACCURATE,
@@ -49,11 +68,6 @@ void CuDNNSoftmaxGradientOp<Context>::DoRunWithType() {
      dX->ReshapeLike(Y)->template mutable_data<T, Context>()));
 }

-template <class Context>
-void CuDNNSoftmaxGradientOp<Context>::RunOnDevice() {
-  DispatchHelper<dtypes::Floating>::Call(this, Input(0));
-}
-
 DEPLOY_CUDNN_OPERATOR(Softmax);
 DEPLOY_CUDNN_OPERATOR(SoftmaxGradient);


--- a/dragon/operators/array/concat_op.cc
+++ b/dragon/operators/array/concat_op.cc
@@ -29,7 +29,7 @@ void ConcatOp<Context>::DoRunWithType() {
  int64_t output_offset = 0;

  for (int i = 0; i < InputSize(); ++i) {
-    const auto& Xi = Input(i);
+    auto& Xi = Input(i);
    math::CopyMatrix(
        Xi.count(0, axis),
        Xi.count(axis),

--- a/dragon/operators/array/stack_op.cc
+++ b/dragon/operators/array/stack_op.cc
@@ -28,7 +28,7 @@ void StackOp<Context>::DoRunWithType() {
  int64_t output_offset = 0;

  for (int i = 0; i < num_stacks; i++) {
-    const auto& Xi = Input(i);
+    auto& Xi = Input(i);
    math::CopyMatrix(
        Xi.count(0, axis),
        Xi.count(axis),

--- a/dragon/python/core/device/cuda.py
+++ b/dragon/python/core/device/cuda.py
@@ -67,6 +67,7 @@ def enable_cudnn(
    deterministic=False,
    benchmark=False,
    allow_tf32=False,
+    disabled_ops=None,
 ):
    """Enable backend to use the cuDNN library.

@@ -80,9 +81,12 @@ def enable_cudnn(
        Select fastest algorithms via benchmark or heuristics.
    allow_tf32 : bool, optional, default=False
        Allow TF32 tensor core operation or not.
+    disabled_ops : Sequence[str], optional
+        The operator types to disable using cuDNN.

    """
-    return backend.cudaEnableDNN(enabled, deterministic, benchmark, allow_tf32)
+    return backend.cudaEnableDNN(enabled, deterministic, benchmark,
+                                 allow_tf32, disabled_ops or [])


 def get_device_capability(device_index=None):

--- a/dragon/utils/conversions.h
+++ b/dragon/utils/conversions.h
@@ -143,6 +143,11 @@ CONVERSIONS_DECL half To<half, half>(half val) {
 }

 template <>
+CONVERSIONS_DECL float To<float, half>(half val) {
+  return __half2float(val);
+}
+
+template <>
 CONVERSIONS_DECL half To<half, float>(float val) {
 #if CUDA_VERSION_MIN(9, 2)
  return __float2half(val);

--- a/dragon/utils/device/common_cub.h
+++ b/dragon/utils/device/common_cub.h
@@ -20,12 +20,35 @@
 #include <cub/device/device_reduce.cuh>
 #include <cub/device/device_select.cuh>
 #include <cub/iterator/counting_input_iterator.cuh>
+#include <cub/warp/warp_reduce.cuh>

 namespace dragon {

 template <typename T>
+using WarpReduce = cub::BlockReduce<T, CUDA_WARP_SIZE>;
+
+template <typename T>
 using BlockReduce = cub::BlockReduce<T, CUDA_THREADS>;

+template <typename T, typename Reducer>
+__inline__ __device__ T WarpAllReduce(T val) {
+  for (int mask = CUDA_WARP_SIZE / 2; mask > 0; mask /= 2) {
+    val = Reducer()(val, __shfl_xor_sync(0xffffffff, val, mask));
+  }
+  return val;
+}
+
+template <typename T, typename Reducer, int kThreadsPerBlock>
+__inline__ __device__ T BlockAllReduce(T val) {
+  typedef cub::BlockReduce<T, kThreadsPerBlock> BlockReduce;
+  __shared__ T block_val;
+  __shared__ typename BlockReduce::TempStorage storage;
+  val = BlockReduce(storage).Reduce(val, Reducer());
+  if (threadIdx.x == 0) block_val = val;
+  __syncthreads();
+  return block_val;
+}
+
 } // namespace dragon

 #endif // USE_CUDA

--- a/dragon/utils/device/common_cuda.h
+++ b/dragon/utils/device/common_cuda.h
@@ -27,10 +27,13 @@ namespace dragon {

 #ifdef USE_CUDA

-/*! \brief The number of cuda threads to use */
+/*! \brief The number of cuda threads in a warp */
+constexpr int CUDA_WARP_SIZE = 32;
+
+/*! \brief The number of cuda threads in a block */
 constexpr int CUDA_THREADS = 256;

-/*! \brief The maximum number of blocks to use in the default kernel call */
+/*! \brief The maximum number of blocks to use in a default kernel call */
 constexpr int CUDA_MAX_BLOCKS = 4096;

 /*! \brief The maximum number of devices in a single machine */

--- a/dragon/utils/math/transpose.cc
+++ b/dragon/utils/math/transpose.cc
@@ -29,11 +29,9 @@ void _Transpose(

 } // namespace

-/* ------------------- Launcher Separator ------------------- */
-
 #define DEFINE_TRANSPOSE_FUNC(T)                                            \
  template <>                                                               \
-  void Transpose<T, CPUContext>(                                            \
+  DRAGON_API void Transpose<T, CPUContext>(                                 \
      const int num_dims,                                                   \
      const int64_t* dims,                                                  \
      const int64_t* axes,                                                  \

--- a/dragon/utils/math/transpose.cu
+++ b/dragon/utils/math/transpose.cu
@@ -82,11 +82,9 @@ void _TransposeImpl(

 } // namespace

-/* ------------------- Launcher Separator ------------------- */
-
 #define DEFINE_TRANSPOSE_FUNC(T)                                            \
  template <>                                                               \
-  void Transpose<T, CUDAContext>(                                           \
+  DRAGON_API void Transpose<T, CUDAContext>(                                \
      const int num_dims,                                                   \
      const int64_t* dims,                                                  \
      const int64_t* axes,                                                  \