Use block reduction for the ArgMax and ArgMin operators

Summary: This commit reimplements the CUDA ArgMax/ArgMin kernels using BlockReduce, replacing the naive sequential reduction inside the kernel loop.

Use block reduction for the ArgMax and ArgMin operators
Summary: This commit reimplements the CUDA ArgMax/ArgMin kernels using BlockReduce, replacing the naive sequential reduction inside the kernel loop.
Ting PAN
Commit 5cbbef4b authored Oct 08, 2020 by Ting PAN
Showing with 141 additions and 100 deletions
dragon/kernels/array/arg_op_kernel.cu
dragon/operators/normalization/batch_norm_op_sync.cc
test/dragon/test_ops.py
--- a/dragon/kernels/array/arg_op_kernel.cu
+++ b/dragon/kernels/array/arg_op_kernel.cu
 #ifdef USE_CUDA

 #include "dragon/core/context_cuda.h"
+#include "dragon/utils/device/common_cub.h"
+#include "dragon/utils/math/functional.h"
 #include "dragon/utils/op_kernels.h"

 namespace dragon {
@@ -10,98 +12,58 @@ namespace kernel {
 namespace {

 template <typename T>
-__global__ void _ArgMax(
-    const int nthreads,
-    const int inner_dim,
-    const int axis_dim,
-    const T* x,
-    int64_t* y) {
-  CUDA_1D_KERNEL_LOOP(yi, nthreads) {
-    const int i = yi / inner_dim;
-    const int j = yi % inner_dim;
-    const T* offset_x = x + (i * axis_dim * inner_dim + j);
-    auto max_val = offset_x[0];
-    auto max_idx = int64_t(0);
-    for (int k = 1; k < axis_dim; ++k) {
-      const T val = offset_x[k * inner_dim];
-      if (val > max_val) {
-        max_val = val;
-        max_idx = k;
-      }
-    }
-    y[yi] = max_idx;
-  }
-}
-
-template <>
-__global__ void _ArgMax<half>(
-    const int nthreads,
-    const int inner_dim,
-    const int axis_dim,
-    const half* x,
-    int64_t* y) {
-  CUDA_1D_KERNEL_LOOP(yi, nthreads) {
-    const int i = yi / inner_dim;
-    const int j = yi % inner_dim;
-    const half* offset_x = x + (i * axis_dim * inner_dim + j);
-    auto max_val = __half2float(offset_x[0]);
-    auto max_idx = int64_t(0);
-    for (int k = 1; k < axis_dim; ++k) {
-      const float val = __half2float(offset_x[k * inner_dim]);
-      if (val > max_val) {
-        max_val = val;
-        max_idx = k;
-      }
+struct ArgMaxFunctor {
+  inline __device__ cub::KeyValuePair<int64_t, T> operator()(
+      const cub::KeyValuePair<int64_t, T>& lhs,
+      const cub::KeyValuePair<int64_t, T>& rhs) const {
+    if ((greater_(rhs.value, lhs.value)) ||
+        (equal_(lhs.value, rhs.value) && (rhs.key < lhs.key))) {
+      return rhs;
    }
-    y[yi] = max_idx;
+    return lhs;
  }
-}
+  math::GreaterFunctor<T> greater_;
+  math::EqualFunctor<T> equal_;
+};

 template <typename T>
-__global__ void _ArgMin(
-    const int nthreads,
-    const int inner_dim,
-    const int axis_dim,
-    const T* x,
-    int64_t* y) {
-  CUDA_1D_KERNEL_LOOP(yi, nthreads) {
-    const int i = yi / inner_dim;
-    const int j = yi % inner_dim;
-    const T* offset_x = x + (i * axis_dim * inner_dim + j);
-    auto min_val = offset_x[0];
-    auto min_idx = int64_t(0);
-    for (int k = 1; k < axis_dim; ++k) {
-      const T val = offset_x[k * inner_dim];
-      if (val < min_val) {
-        min_val = val;
-        min_idx = k;
-      }
+struct ArgMinFunctor {
+  inline __device__ cub::KeyValuePair<int64_t, T> operator()(
+      const cub::KeyValuePair<int64_t, T>& lhs,
+      const cub::KeyValuePair<int64_t, T>& rhs) const {
+    if ((less_(rhs.value, lhs.value)) ||
+        (equal_(lhs.value, rhs.value) && (rhs.key < lhs.key))) {
+      return rhs;
    }
-    y[yi] = min_idx;
+    return lhs;
  }
-}
+  math::LessFunctor<T> less_;
+  math::EqualFunctor<T> equal_;
+};

-template <>
-__global__ void _ArgMin<half>(
-    const int nthreads,
+template <typename T, class Reducer>
+__global__ void _ArgReduce(
+    const int rows,
+    const int cols,
    const int inner_dim,
-    const int axis_dim,
-    const half* x,
+    const Reducer reducer,
+    const T init,
+    const T* x,
    int64_t* y) {
-  CUDA_1D_KERNEL_LOOP(yi, nthreads) {
-    const int i = yi / inner_dim;
-    const int j = yi % inner_dim;
-    const half* offset_x = x + (i * axis_dim * inner_dim + j);
-    auto min_val = __half2float(offset_x[0]);
-    auto min_idx = int64_t(0);
-    for (int k = 1; k < axis_dim; ++k) {
-      const float val = __half2float(offset_x[k * inner_dim]);
-      if (val < min_val) {
-        min_val = val;
-        min_idx = k;
+  typedef cub::KeyValuePair<int64_t, T> KeyValuePair;
+  __shared__ typename BlockReduce<KeyValuePair>::TempStorage storage;
+  CUDA_2D_KERNEL_LOOP1(i, rows) {
+    auto key_val = KeyValuePair(-1, init);
+    CUDA_2D_KERNEL_LOOP2(j, cols) {
+      key_val = reducer(
+          key_val,
+          KeyValuePair(
+              j, x[((i / inner_dim) * cols + j) * inner_dim + i % inner_dim]));
    }
+    key_val = BlockReduce<KeyValuePair>(storage).Reduce(key_val, reducer);
+    if (threadIdx.x == 0) {
+      y[i] = key_val.key;
    }
-    y[yi] = min_idx;
  }
 }

@@ -109,7 +71,7 @@ __global__ void _ArgMin<half>(

 /* ------------------- Launcher Separator ------------------- */

-#define DEFINE_KERNEL_LAUNCHER(name, T1, T2)                                 \
+#define DEFINE_KERNEL_LAUNCHER(name, T1, T2, Reducer, kInit)                   \
  template <>                                                                  \
  void name<T1, CUDAContext>(                                                  \
      const int outer_dim,                                                     \
@@ -118,25 +80,102 @@ __global__ void _ArgMin<half>(
      const T1* x,                                                             \
      int64_t* y,                                                              \
      CUDAContext* ctx) {                                                      \
-    auto nthreads = outer_dim * inner_dim;                                   \
-    _##name<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
-        nthreads, inner_dim, axis_dim, reinterpret_cast<const T2*>(x), y);   \
+    const auto rows = outer_dim * inner_dim;                                   \
+    const auto cols = axis_dim;                                                \
+    _ArgReduce<<<CUDA_2D_BLOCKS(rows), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
+        rows,                                                                  \
+        cols,                                                                  \
+        inner_dim,                                                             \
+        Reducer<T2>(),                                                         \
+        kInit,                                                                 \
+        reinterpret_cast<const T2*>(x),                                        \
+        y);                                                                    \
  }

-DEFINE_KERNEL_LAUNCHER(ArgMax, int8_t, int8_t);
-DEFINE_KERNEL_LAUNCHER(ArgMax, uint8_t, uint8_t);
-DEFINE_KERNEL_LAUNCHER(ArgMax, int, int);
-DEFINE_KERNEL_LAUNCHER(ArgMax, int64_t, int64_t);
-DEFINE_KERNEL_LAUNCHER(ArgMax, float16, half);
-DEFINE_KERNEL_LAUNCHER(ArgMax, float, float);
-DEFINE_KERNEL_LAUNCHER(ArgMax, double, double);
-DEFINE_KERNEL_LAUNCHER(ArgMin, int8_t, int8_t);
-DEFINE_KERNEL_LAUNCHER(ArgMin, uint8_t, uint8_t);
-DEFINE_KERNEL_LAUNCHER(ArgMin, int, int);
-DEFINE_KERNEL_LAUNCHER(ArgMin, int64_t, int64_t);
-DEFINE_KERNEL_LAUNCHER(ArgMin, float16, half);
-DEFINE_KERNEL_LAUNCHER(ArgMin, float, float);
-DEFINE_KERNEL_LAUNCHER(ArgMin, double, double);
+DEFINE_KERNEL_LAUNCHER(
+    ArgMax,
+    int8_t,
+    int8_t,
+    ArgMaxFunctor,
+    std::numeric_limits<int8_t>::lowest());
+DEFINE_KERNEL_LAUNCHER(
+    ArgMax,
+    uint8_t,
+    uint8_t,
+    ArgMaxFunctor,
+    std::numeric_limits<uint8_t>::lowest());
+DEFINE_KERNEL_LAUNCHER(
+    ArgMax,
+    int,
+    int,
+    ArgMaxFunctor,
+    std::numeric_limits<int>::lowest());
+DEFINE_KERNEL_LAUNCHER(
+    ArgMax,
+    int64_t,
+    int64_t,
+    ArgMaxFunctor,
+    std::numeric_limits<int64_t>::lowest());
+DEFINE_KERNEL_LAUNCHER(
+    ArgMax,
+    float16,
+    half,
+    ArgMaxFunctor,
+    cub::Traits<half>::Lowest());
+DEFINE_KERNEL_LAUNCHER(
+    ArgMax,
+    float,
+    float,
+    ArgMaxFunctor,
+    std::numeric_limits<float>::lowest());
+DEFINE_KERNEL_LAUNCHER(
+    ArgMax,
+    double,
+    double,
+    ArgMaxFunctor,
+    std::numeric_limits<double>::lowest());
+DEFINE_KERNEL_LAUNCHER(
+    ArgMin,
+    int8_t,
+    int8_t,
+    ArgMinFunctor,
+    std::numeric_limits<int8_t>::max());
+DEFINE_KERNEL_LAUNCHER(
+    ArgMin,
+    uint8_t,
+    uint8_t,
+    ArgMinFunctor,
+    std::numeric_limits<uint8_t>::max());
+DEFINE_KERNEL_LAUNCHER(
+    ArgMin,
+    int,
+    int,
+    ArgMinFunctor,
+    std::numeric_limits<int>::max());
+DEFINE_KERNEL_LAUNCHER(
+    ArgMin,
+    int64_t,
+    int64_t,
+    ArgMinFunctor,
+    std::numeric_limits<int64_t>::max());
+DEFINE_KERNEL_LAUNCHER(
+    ArgMin,
+    float16,
+    half,
+    ArgMinFunctor,
+    cub::Traits<half>::Max());
+DEFINE_KERNEL_LAUNCHER(
+    ArgMin,
+    float,
+    float,
+    ArgMinFunctor,
+    std::numeric_limits<float>::max());
+DEFINE_KERNEL_LAUNCHER(
+    ArgMin,
+    double,
+    double,
+    ArgMinFunctor,
+    std::numeric_limits<double>::max());
 #undef DEFINE_KERNEL_LAUNCHER

 } // namespace kernel

--- a/dragon/operators/normalization/batch_norm_op_sync.cc
+++ b/dragon/operators/normalization/batch_norm_op_sync.cc
@@ -44,6 +44,7 @@ void SyncBatchNormOp<Context>::TrainingImpl() {
      ctx());

  // Compute D(X) = E(X^2) - E(X)^2
+  ctx()->FinishDeviceComputation();
  if (enable_nccl_) {
 #ifdef USE_NCCL
    auto nccl_comm_ = this->nccl_comm();
@@ -138,6 +139,7 @@ void SyncBatchNormGradientOp<Context>::TrainingImpl() {
      N_, C_, S_, data_format(), x, mu, rsig, gamma, dy, dgamma, dbeta, ctx());

  // Gradient w.r.t. gamma and beta of global batch
+  ctx()->FinishDeviceComputation();
  if (enable_nccl_) {
 #ifdef USE_NCCL
    auto nccl_comm_ = this->nccl_comm();

--- a/test/dragon/test_ops.py
+++ b/test/dragon/test_ops.py
@@ -709,7 +709,7 @@ class TestArrayOps(OpTestCase):
                    self.assertEqual(x.shape, (4,))

    @unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
-    def test_range_cuda(self):
+    def test_permutation_cuda(self):
        with dragon.device('cuda'):
            self.test_permutation()