Commit 46feba80 by Ting PAN

Instantiate dispatch template by value for crucial CUDA kernels

Summary:
This commit instantiates CUDA kernels with constant dimension counts
to enable compile-time optimizations such as loop unrolling.
1 parent 936c351b
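
The core pattern in this commit is dispatching the runtime rank (num_dims) onto a template instantiation whose dimension count D is a compile-time constant, so the index-decomposition loops gain a fixed trip count and can be unrolled (see the added #pragma unroll hints). Below is a minimal, self-contained host-side sketch of what a macro such as DISPATCH_FUNC_BY_VALUE_WITH_TYPE_1 might expand to; the helper names here (_CoordSumImpl, the local SimpleArray) are illustrative assumptions, not the repository's definitions.

// Sketch only: assumes the dispatch macro is a switch over the runtime rank.
#include <cstdint>
#include <cstdio>
#include <functional>
#include <numeric>

template <typename T, int D>
struct SimpleArray {
  T data[D];  // fixed-size carrier, mirroring the diff's SimpleArray<int, D>
};

// D is a template parameter, so the reverse loop below has a constant trip
// count that the compiler can fully unroll.
template <typename T, int D>
void _CoordSumImpl(const int64_t* y_dims, T* y) {
  SimpleArray<int, D> Y_dims;
  const auto N = std::accumulate(
      y_dims, y_dims + D, int64_t(1), std::multiplies<int64_t>());
  for (int i = 0; i < D; ++i) {
    Y_dims.data[i] = static_cast<int>(y_dims[i]);
  }
  for (int64_t yi = 0; yi < N; ++yi) {
    int64_t tmp = yi;
    int sum = 0;
    for (int d = D - 1; d >= 0; --d) {  // trip count D is known at compile time
      sum += static_cast<int>(tmp % Y_dims.data[d]);
      tmp /= Y_dims.data[d];
    }
    y[yi] = static_cast<T>(sum);  // writes the coordinate sum of element yi
  }
}

// Dispatch a runtime rank onto the matching instantiation (assumed expansion).
#define DISPATCH_FUNC_BY_VALUE_WITH_TYPE_1(Func, T, dim, ...) \
  switch (dim) {                                              \
    case 1: Func<T, 1>(__VA_ARGS__); break;                   \
    case 2: Func<T, 2>(__VA_ARGS__); break;                   \
    case 3: Func<T, 3>(__VA_ARGS__); break;                   \
    case 4: Func<T, 4>(__VA_ARGS__); break;                   \
    default: std::fprintf(stderr, "Unsupported rank: %d\n", static_cast<int>(dim)); \
  }

int main() {
  const int64_t dims[3] = {2, 3, 4};
  float y[24];
  DISPATCH_FUNC_BY_VALUE_WITH_TYPE_1(_CoordSumImpl, float, 3, dims, y);
  std::printf("y[23] = %g\n", y[23]);  // coordinates (1, 2, 3) -> 6
  return 0;
}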
FROM ubuntu:16.04 FROM ubuntu:18.04
RUN \ RUN \
apt-get update && apt-get install -y \ apt-get update && apt-get install -y \
...@@ -43,8 +43,8 @@ RUN \ ...@@ -43,8 +43,8 @@ RUN \
-DPYTHON_EXECUTABLE=/usr/bin/python3 \ -DPYTHON_EXECUTABLE=/usr/bin/python3 \
-DUSE_CUDA=OFF \ -DUSE_CUDA=OFF \
-DUSE_CUDNN=OFF \ -DUSE_CUDNN=OFF \
-DUSE_AVX2=OFF \ -DUSE_AVX2=ON \
-DUSE_FMA=OFF && \ -DUSE_FMA=ON && \
make install -j $(nproc) && \ make install -j $(nproc) && \
cd .. && rm -rf build && \ cd .. && rm -rf build && \
python3 setup.py install python3 setup.py install
......
FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04 FROM nvidia/cuda:10.2-cudnn8-devel-ubuntu18.04
RUN \ RUN \
rm /etc/apt/sources.list.d/cuda.list && \ rm /etc/apt/sources.list.d/cuda.list && \
...@@ -48,8 +48,8 @@ RUN \ ...@@ -48,8 +48,8 @@ RUN \
-DPYTHON_EXECUTABLE=/usr/bin/python3 \ -DPYTHON_EXECUTABLE=/usr/bin/python3 \
-DUSE_MPI=ON \ -DUSE_MPI=ON \
-DUSE_NCCL=ON \ -DUSE_NCCL=ON \
-DUSE_AVX2=OFF \ -DUSE_AVX2=ON \
-DUSE_FMA=OFF && \ -DUSE_FMA=ON && \
make install -j $(nproc) && \ make install -j $(nproc) && \
cd .. && rm -rf build && \ cd .. && rm -rf build && \
python3 setup.py install python3 setup.py install
......
...@@ -62,10 +62,6 @@ class CUDAObjects { ...@@ -62,10 +62,6 @@ class CUDAObjects {
} else { } else {
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
} }
#elif CUDA_VERSION >= 9000
if (TENSOR_CORE_AVAILABLE()) {
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
}
#endif #endif
} }
return handles[stream_id]; return handles[stream_id];
...@@ -437,7 +433,8 @@ class DRAGON_API CUDAContext { ...@@ -437,7 +433,8 @@ class DRAGON_API CUDAContext {
CUDA_NOT_COMPILED; CUDA_NOT_COMPILED;
} }
/*! \brief Switch to the device and select given stream in current thread */ /*! \brief Switch to the device and select given stream in current
* thread */
void SwitchToDevice(int stream_id) { void SwitchToDevice(int stream_id) {
CUDA_NOT_COMPILED; CUDA_NOT_COMPILED;
} }
......
...@@ -13,7 +13,6 @@ namespace { ...@@ -13,7 +13,6 @@ namespace {
template <typename T, int D> template <typename T, int D>
__global__ void _ConstPad( __global__ void _ConstPad(
const int N, const int N,
const int num_dims,
const SimpleArray<int, D> X_dims, const SimpleArray<int, D> X_dims,
const SimpleArray<int, D> X_strides, const SimpleArray<int, D> X_strides,
const SimpleArray<int, D> Y_dims, const SimpleArray<int, D> Y_dims,
...@@ -23,7 +22,8 @@ __global__ void _ConstPad( ...@@ -23,7 +22,8 @@ __global__ void _ConstPad(
T* y) { T* y) {
CUDA_1D_KERNEL_LOOP(yi, N) { CUDA_1D_KERNEL_LOOP(yi, N) {
int xi = 0, tmp = yi, d; int xi = 0, tmp = yi, d;
for (d = num_dims - 1; d >= 0; --d) { #pragma unroll
for (d = D - 1; d >= 0; --d) {
int r; int r;
FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r); FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r);
r -= X_pads.data[d]; r -= X_pads.data[d];
...@@ -37,7 +37,6 @@ __global__ void _ConstPad( ...@@ -37,7 +37,6 @@ __global__ void _ConstPad(
template <typename T, int D> template <typename T, int D>
__global__ void _ReflectPad( __global__ void _ReflectPad(
const int N, const int N,
const int num_dims,
const SimpleArray<int, D> X_dims, const SimpleArray<int, D> X_dims,
const SimpleArray<int, D> X_strides, const SimpleArray<int, D> X_strides,
const SimpleArray<int, D> Y_dims, const SimpleArray<int, D> Y_dims,
...@@ -46,7 +45,8 @@ __global__ void _ReflectPad( ...@@ -46,7 +45,8 @@ __global__ void _ReflectPad(
T* y) { T* y) {
CUDA_1D_KERNEL_LOOP(yi, N) { CUDA_1D_KERNEL_LOOP(yi, N) {
int xi = 0, tmp = yi; int xi = 0, tmp = yi;
for (int d = num_dims - 1; d >= 0; --d) { #pragma unroll
for (int d = D - 1; d >= 0; --d) {
int r; int r;
FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r); FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r);
r -= X_pads.data[d]; r -= X_pads.data[d];
...@@ -61,7 +61,6 @@ __global__ void _ReflectPad( ...@@ -61,7 +61,6 @@ __global__ void _ReflectPad(
template <typename T, int D> template <typename T, int D>
__global__ void _EdgePad( __global__ void _EdgePad(
const int N, const int N,
const int num_dims,
const SimpleArray<int, D> X_dims, const SimpleArray<int, D> X_dims,
const SimpleArray<int, D> X_strides, const SimpleArray<int, D> X_strides,
const SimpleArray<int, D> Y_dims, const SimpleArray<int, D> Y_dims,
...@@ -70,7 +69,8 @@ __global__ void _EdgePad( ...@@ -70,7 +69,8 @@ __global__ void _EdgePad(
T* y) { T* y) {
CUDA_1D_KERNEL_LOOP(yi, N) { CUDA_1D_KERNEL_LOOP(yi, N) {
int xi = 0, tmp = yi; int xi = 0, tmp = yi;
for (int d = num_dims - 1; d >= 0; --d) { #pragma unroll
for (int d = D - 1; d >= 0; --d) {
int r; int r;
FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r); FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r);
r = min(X_dims.data[d] - 1, max(r - X_pads.data[d], 0)); r = min(X_dims.data[d] - 1, max(r - X_pads.data[d], 0));
...@@ -80,77 +80,109 @@ __global__ void _EdgePad( ...@@ -80,77 +80,109 @@ __global__ void _EdgePad(
} }
} }
template <typename T, int D>
void _PadImpl(
const int64_t* x_dims,
const int64_t* x_strides,
const int64_t* y_dims,
const int64_t* pads,
const float value,
const string& mode,
const T* x,
T* y,
CUDAContext* ctx) {
SimpleArray<int, D> X_dims, X_strides, Y_dims, X_pads;
const auto N =
std::accumulate(y_dims, y_dims + D, 1, std::multiplies<int64_t>());
for (int i = 0; i < D; ++i) {
X_dims.data[i] = x_dims[i];
X_strides.data[i] = x_strides[i];
Y_dims.data[i] = y_dims[i];
X_pads.data[i] = pads[i];
}
if (mode == "ConstPad") {
_ConstPad<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, X_dims, X_strides, Y_dims, X_pads, convert::To<T>(value), x, y);
} else if (mode == "ReflectPad") {
_ReflectPad<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, X_dims, X_strides, Y_dims, X_pads, x, y);
} else if (mode == "EdgePad") {
_EdgePad<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, X_dims, X_strides, Y_dims, X_pads, x, y);
} else {
LOG(FATAL) << "Unknown Pad: " << mode << ".";
}
}
} // namespace } // namespace
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
#define DEFINE_CONST_KERNEL_LAUNCHER(T) \ #define DEFINE_KERNEL_LAUNCHER(name, T) \
template <> \ template <> \
void ConstPad<T, CUDAContext>( \ void name<T, CUDAContext>( \
const int num_dims, \ const int num_dims, \
const int64_t* x_dims, \ const int64_t* x_dims, \
const int64_t* x_strides, \ const int64_t* x_strides, \
const int64_t* y_dims, \ const int64_t* y_dims, \
const int64_t* pads, \ const int64_t* pads, \
const float value, \ const float value, \
const T* x, \ const T* x, \
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
CUDA_TENSOR_DIMS_CHECK(num_dims); \ CUDA_TENSOR_DIMS_CHECK(num_dims); \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_dims, X_strides, Y_dims, X_pads; \ DISPATCH_FUNC_BY_VALUE_WITH_TYPE_1( \
const auto N = std::accumulate( \ _PadImpl, \
y_dims, y_dims + num_dims, 1, std::multiplies<int64_t>()); \ T, \
for (int i = 0; i < num_dims; ++i) { \ num_dims, \
X_dims.data[i] = x_dims[i]; \ x_dims, \
X_strides.data[i] = x_strides[i]; \ x_strides, \
Y_dims.data[i] = y_dims[i]; \ y_dims, \
X_pads.data[i] = pads[i]; \ pads, \
} \ value, \
_ConstPad<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ #name, \
N, \ x, \
num_dims, \ y, \
X_dims, \ ctx); \
X_strides, \
Y_dims, \
X_pads, \
convert::To<T>(value), \
x, \
y); \
} }
#define DEFINE_KERNEL_LAUNCHER(name, T) \ DEFINE_KERNEL_LAUNCHER(ConstPad, bool);
template <> \ DEFINE_KERNEL_LAUNCHER(ConstPad, uint8_t);
void name<T, CUDAContext>( \ DEFINE_KERNEL_LAUNCHER(ConstPad, int8_t);
const int num_dims, \ DEFINE_KERNEL_LAUNCHER(ConstPad, int);
const int64_t* x_dims, \ DEFINE_KERNEL_LAUNCHER(ConstPad, int64_t);
const int64_t* x_strides, \ DEFINE_KERNEL_LAUNCHER(ConstPad, float16);
const int64_t* y_dims, \ DEFINE_KERNEL_LAUNCHER(ConstPad, float);
const int64_t* pads, \ DEFINE_KERNEL_LAUNCHER(ConstPad, double);
const T* x, \ #undef DEFINE_KERNEL_LAUNCHER
T* y, \
CUDAContext* ctx) { \ #define DEFINE_KERNEL_LAUNCHER(name, T) \
CUDA_TENSOR_DIMS_CHECK(num_dims); \ template <> \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_dims, X_strides, Y_dims, X_pads; \ void name<T, CUDAContext>( \
const auto N = std::accumulate( \ const int num_dims, \
y_dims, y_dims + num_dims, 1, std::multiplies<int64_t>()); \ const int64_t* x_dims, \
for (int i = 0; i < num_dims; ++i) { \ const int64_t* x_strides, \
X_dims.data[i] = x_dims[i]; \ const int64_t* y_dims, \
X_strides.data[i] = x_strides[i]; \ const int64_t* pads, \
Y_dims.data[i] = y_dims[i]; \ const T* x, \
X_pads.data[i] = pads[i]; \ T* y, \
} \ CUDAContext* ctx) { \
_##name<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ CUDA_TENSOR_DIMS_CHECK(num_dims); \
N, num_dims, X_dims, X_strides, Y_dims, X_pads, x, y); \ DISPATCH_FUNC_BY_VALUE_WITH_TYPE_1( \
_PadImpl, \
T, \
num_dims, \
x_dims, \
x_strides, \
y_dims, \
pads, \
0.f, \
#name, \
x, \
y, \
ctx); \
} }
DEFINE_CONST_KERNEL_LAUNCHER(bool);
DEFINE_CONST_KERNEL_LAUNCHER(uint8_t);
DEFINE_CONST_KERNEL_LAUNCHER(int8_t);
DEFINE_CONST_KERNEL_LAUNCHER(int);
DEFINE_CONST_KERNEL_LAUNCHER(int64_t);
DEFINE_CONST_KERNEL_LAUNCHER(float16);
DEFINE_CONST_KERNEL_LAUNCHER(float);
DEFINE_CONST_KERNEL_LAUNCHER(double);
DEFINE_KERNEL_LAUNCHER(ReflectPad, bool); DEFINE_KERNEL_LAUNCHER(ReflectPad, bool);
DEFINE_KERNEL_LAUNCHER(ReflectPad, uint8_t); DEFINE_KERNEL_LAUNCHER(ReflectPad, uint8_t);
DEFINE_KERNEL_LAUNCHER(ReflectPad, int8_t); DEFINE_KERNEL_LAUNCHER(ReflectPad, int8_t);
...@@ -167,7 +199,6 @@ DEFINE_KERNEL_LAUNCHER(EdgePad, int64_t); ...@@ -167,7 +199,6 @@ DEFINE_KERNEL_LAUNCHER(EdgePad, int64_t);
DEFINE_KERNEL_LAUNCHER(EdgePad, float16); DEFINE_KERNEL_LAUNCHER(EdgePad, float16);
DEFINE_KERNEL_LAUNCHER(EdgePad, float); DEFINE_KERNEL_LAUNCHER(EdgePad, float);
DEFINE_KERNEL_LAUNCHER(EdgePad, double); DEFINE_KERNEL_LAUNCHER(EdgePad, double);
#undef DEFINE_CONST_KERNEL_LAUNCHER
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels } // namespace kernels
......
...@@ -10,60 +10,76 @@ namespace kernels { ...@@ -10,60 +10,76 @@ namespace kernels {
namespace { namespace {
template <typename T, int D> template <typename T, typename AccT, int D>
__global__ void _ReduceSumGrad( __global__ void _ReduceSumGrad(
const int N, const int N,
const int num_dims,
const SimpleArray<int, D> X_dims, const SimpleArray<int, D> X_dims,
const SimpleArray<int, D> Y_dims, const SimpleArray<int, D> Y_dims,
const SimpleArray<int, D> Y_strides, const SimpleArray<int, D> Y_strides,
const float scale, const AccT scale,
const T* dy, const T* dy,
T* dx) { T* dx) {
CUDA_1D_KERNEL_LOOP(xi, N) { CUDA_1D_KERNEL_LOOP(xi, N) {
int yi = 0, tmp = xi; int yi = 0, tmp = xi;
for (int d = num_dims - 1; d >= 0; --d) { #pragma unroll
for (int d = D - 1; d >= 0; --d) {
int r; int r;
FIXED_DIVISOR_DIV_MOD(X_dims.data[d], tmp, &tmp, &r); FIXED_DIVISOR_DIV_MOD(X_dims.data[d], tmp, &tmp, &r);
yi += (r % Y_dims.data[d]) * Y_strides.data[d]; yi += (r % Y_dims.data[d]) * Y_strides.data[d];
} }
dx[xi] = convert::To<T>(convert::To<float>(__ldg(dy + yi)) * scale); dx[xi] = convert::To<T>(convert::To<AccT>(__ldg(dy + yi)) * scale);
} }
} }
template <typename T, typename AccT, int D>
void _ReduceSumGradImpl(
const int64_t* x_dims,
const int64_t* y_dims,
const int64_t* y_strides,
const AccT scale,
const T* dy,
T* dx,
CUDAContext* ctx) {
SimpleArray<int, D> X_dims, Y_dims, Y_strides;
const auto N =
std::accumulate(x_dims, x_dims + D, 1, std::multiplies<int64_t>());
for (int i = 0; i < D; ++i) {
X_dims.data[i] = x_dims[i];
Y_dims.data[i] = y_dims[i];
Y_strides.data[i] = y_strides[i];
}
_ReduceSumGrad<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, X_dims, Y_dims, Y_strides, scale, dy, dx);
}
} // namespace } // namespace
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \ template <> \
void ReduceSumGrad<T, CUDAContext>( \ void ReduceSumGrad<T, CUDAContext>( \
const int num_dims, \ const int num_dims, \
const int64_t* x_dims, \ const int64_t* x_dims, \
const int64_t* y_dims, \ const int64_t* y_dims, \
const int64_t* y_strides, \ const int64_t* y_strides, \
const float scale, \ const float scale, \
const T* dy, \ const T* dy, \
T* dx, \ T* dx, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
CUDA_TENSOR_DIMS_CHECK(num_dims); \ CUDA_TENSOR_DIMS_CHECK(num_dims); \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_dims, Y_dims, Y_strides; \ DISPATCH_FUNC_BY_VALUE_WITH_TYPE_2( \
const auto N = std::accumulate( \ _ReduceSumGradImpl, \
x_dims, x_dims + num_dims, 1, std::multiplies<int64_t>()); \ math::ScalarType<T>::type, \
for (int i = 0; i < num_dims; ++i) { \ math::AccmulatorType<T>::type, \
X_dims.data[i] = x_dims[i]; \ num_dims, \
Y_dims.data[i] = y_dims[i]; \ x_dims, \
Y_strides.data[i] = y_strides[i]; \ y_dims, \
} \ y_strides, \
_ReduceSumGrad<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ convert::To<math::AccmulatorType<T>::type>(scale), \
N, \ reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
num_dims, \ reinterpret_cast<math::ScalarType<T>::type*>(dx), \
X_dims, \ ctx); \
Y_dims, \
Y_strides, \
scale, \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<math::ScalarType<T>::type*>(dx)); \
} }
DEFINE_GRAD_KERNEL_LAUNCHER(float16); DEFINE_GRAD_KERNEL_LAUNCHER(float16);
......
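
The ReduceSumGrad launcher now casts scale and the data pointers to math::AccmulatorType<T>::type and math::ScalarType<T>::type before dispatching. A hedged, stand-alone CUDA sketch of the idea for T = float16 (this kernel is illustrative, not the repository's code): loads go through the read-only cache, the multiply is carried out in float, and the half type is used only for storage.

#include <cuda_fp16.h>

__global__ void ScaleHalfByFloat(
    const int n,
    const float scale,  // accumulator-typed scalar, as in the new launcher
    const __half* dy,
    __half* dx) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // Widen to float before the multiply, narrow back to half once at the end.
    const float val = __half2float(__ldg(dy + i)) * scale;
    dx[i] = __float2half(val);
  }
}

// Example launch: ScaleHalfByFloat<<<(n + 255) / 256, 256>>>(n, 0.5f, dy, dx);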
...@@ -12,7 +12,6 @@ namespace { ...@@ -12,7 +12,6 @@ namespace {
template <typename T, int D> template <typename T, int D>
__global__ void _Roll( __global__ void _Roll(
const int N, const int N,
const int num_dims,
const SimpleArray<int, D> X_shifts, const SimpleArray<int, D> X_shifts,
const SimpleArray<int, D> X_strides, const SimpleArray<int, D> X_strides,
const SimpleArray<int, D> Y_dims, const SimpleArray<int, D> Y_dims,
...@@ -20,7 +19,8 @@ __global__ void _Roll( ...@@ -20,7 +19,8 @@ __global__ void _Roll(
T* y) { T* y) {
CUDA_1D_KERNEL_LOOP(yi, N) { CUDA_1D_KERNEL_LOOP(yi, N) {
int xi = 0, tmp = yi; int xi = 0, tmp = yi;
for (int d = num_dims - 1; d >= 0; --d) { #pragma unroll
for (int d = D - 1; d >= 0; --d) {
int r; int r;
FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r); FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r);
r -= X_shifts.data[d]; r -= X_shifts.data[d];
...@@ -31,33 +31,43 @@ __global__ void _Roll( ...@@ -31,33 +31,43 @@ __global__ void _Roll(
} }
} }
template <typename T, int D>
void _RollImpl(
const int64_t* x_shifts,
const int64_t* x_strides,
const int64_t* y_dims,
const T* x,
T* y,
CUDAContext* ctx) {
SimpleArray<int, D> X_shifts, X_strides, Y_dims;
const auto N =
std::accumulate(y_dims, y_dims + D, 1, std::multiplies<int64_t>());
for (int i = 0; i < D; ++i) {
X_shifts.data[i] = x_shifts[i];
X_strides.data[i] = x_strides[i];
Y_dims.data[i] = y_dims[i];
}
_Roll<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, X_shifts, X_strides, Y_dims, x, y);
}
} // namespace } // namespace
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \ #define DEFINE_KERNEL_LAUNCHER(T) \
template <> \ template <> \
void Roll<T, CUDAContext>( \ void Roll<T, CUDAContext>( \
const int num_dims, \ const int num_dims, \
const int64_t* x_shifts, \ const int64_t* x_shifts, \
const int64_t* x_strides, \ const int64_t* x_strides, \
const int64_t* y_dims, \ const int64_t* y_dims, \
const T* x, \ const T* x, \
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
CUDA_TENSOR_DIMS_CHECK(num_dims); \ CUDA_TENSOR_DIMS_CHECK(num_dims); \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_shifts; \ DISPATCH_FUNC_BY_VALUE_WITH_TYPE_1( \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_strides; \ _RollImpl, T, num_dims, x_shifts, x_strides, y_dims, x, y, ctx); \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> Y_dims; \
const auto N = std::accumulate( \
y_dims, y_dims + num_dims, 1, std::multiplies<int64_t>()); \
for (int i = 0; i < num_dims; ++i) { \
X_shifts.data[i] = x_shifts[i]; \
X_strides.data[i] = x_strides[i]; \
Y_dims.data[i] = y_dims[i]; \
} \
_Roll<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, num_dims, X_shifts, X_strides, Y_dims, x, y); \
} }
DEFINE_KERNEL_LAUNCHER(bool); DEFINE_KERNEL_LAUNCHER(bool);
......
...@@ -13,7 +13,6 @@ namespace { ...@@ -13,7 +13,6 @@ namespace {
template <typename T, int D> template <typename T, int D>
__global__ void _Slice( __global__ void _Slice(
const int N, const int N,
const int num_dims,
const SimpleArray<int, D> X_strides, const SimpleArray<int, D> X_strides,
const SimpleArray<int, D> Y_dims, const SimpleArray<int, D> Y_dims,
const SimpleArray<int, D> X_starts, const SimpleArray<int, D> X_starts,
...@@ -21,7 +20,8 @@ __global__ void _Slice( ...@@ -21,7 +20,8 @@ __global__ void _Slice(
T* y) { T* y) {
CUDA_1D_KERNEL_LOOP(yi, N) { CUDA_1D_KERNEL_LOOP(yi, N) {
int xi = 0, tmp = yi; int xi = 0, tmp = yi;
for (int d = num_dims - 1; d >= 0; --d) { #pragma unroll
for (int d = D - 1; d >= 0; --d) {
int r; int r;
FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r); FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r);
xi += (r + X_starts.data[d]) * X_strides.data[d]; xi += (r + X_starts.data[d]) * X_strides.data[d];
...@@ -33,7 +33,6 @@ __global__ void _Slice( ...@@ -33,7 +33,6 @@ __global__ void _Slice(
template <typename T, int D> template <typename T, int D>
__global__ void _SliceGrad( __global__ void _SliceGrad(
const int N, const int N,
const int num_dims,
const SimpleArray<int, D> X_strides, const SimpleArray<int, D> X_strides,
const SimpleArray<int, D> Y_dims, const SimpleArray<int, D> Y_dims,
const SimpleArray<int, D> X_starts, const SimpleArray<int, D> X_starts,
...@@ -41,7 +40,8 @@ __global__ void _SliceGrad( ...@@ -41,7 +40,8 @@ __global__ void _SliceGrad(
T* dx) { T* dx) {
CUDA_1D_KERNEL_LOOP(yi, N) { CUDA_1D_KERNEL_LOOP(yi, N) {
int xi = 0, tmp = yi; int xi = 0, tmp = yi;
for (int d = num_dims - 1; d >= 0; --d) { #pragma unroll
for (int d = D - 1; d >= 0; --d) {
int r; int r;
FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r); FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r);
xi += (r + X_starts.data[d]) * X_strides.data[d]; xi += (r + X_starts.data[d]) * X_strides.data[d];
...@@ -50,31 +50,49 @@ __global__ void _SliceGrad( ...@@ -50,31 +50,49 @@ __global__ void _SliceGrad(
} }
} }
template <typename T, int D>
void _SliceImpl(
const string& routine,
const int64_t* x_strides,
const int64_t* y_dims,
const int64_t* starts,
const T* x,
T* y,
CUDAContext* ctx) {
SimpleArray<int, D> X_strides, Y_dims, X_starts;
const auto N =
std::accumulate(y_dims, y_dims + D, 1, std::multiplies<int64_t>());
for (int i = 0; i < D; ++i) {
X_strides.data[i] = x_strides[i];
Y_dims.data[i] = y_dims[i];
X_starts.data[i] = starts[i];
}
if (routine == "Slice") {
_Slice<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, X_strides, Y_dims, X_starts, x, y);
} else if (routine == "SliceGrad") {
_SliceGrad<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, X_strides, Y_dims, X_starts, x, y);
}
}
} // namespace } // namespace
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(name, T) \ #define DEFINE_KERNEL_LAUNCHER(name, T) \
template <> \ template <> \
void name<T, CUDAContext>( \ void name<T, CUDAContext>( \
const int num_dims, \ const int num_dims, \
const int64_t* x_strides, \ const int64_t* x_strides, \
const int64_t* y_dims, \ const int64_t* y_dims, \
const int64_t* starts, \ const int64_t* starts, \
const T* x, \ const T* x, \
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
CUDA_TENSOR_DIMS_CHECK(num_dims); \ CUDA_TENSOR_DIMS_CHECK(num_dims); \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_strides, Y_dims, X_starts; \ DISPATCH_FUNC_BY_VALUE_WITH_TYPE_1( \
const auto N = std::accumulate( \ _SliceImpl, T, num_dims, #name, x_strides, y_dims, starts, x, y, ctx); \
y_dims, y_dims + num_dims, 1, std::multiplies<int64_t>()); \
for (int i = 0; i < num_dims; ++i) { \
X_strides.data[i] = x_strides[i]; \
Y_dims.data[i] = y_dims[i]; \
X_starts.data[i] = starts[i]; \
} \
_##name<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, num_dims, X_strides, Y_dims, X_starts, x, y); \
} }
DEFINE_KERNEL_LAUNCHER(Slice, bool); DEFINE_KERNEL_LAUNCHER(Slice, bool);
......
...@@ -31,12 +31,13 @@ __global__ void _Transpose( ...@@ -31,12 +31,13 @@ __global__ void _Transpose(
template <typename T, int D> template <typename T, int D>
void _TransposeImpl( void _TransposeImpl(
const int N,
const int64_t* x_strides, const int64_t* x_strides,
const int64_t* y_dims, const int64_t* y_dims,
const T* x, const T* x,
T* y, T* y,
CUDAContext* ctx) { CUDAContext* ctx) {
const auto N =
std::accumulate(y_dims, y_dims + D, 1, std::multiplies<int64_t>());
SimpleArray<int, D> X_strides, Y_dims; SimpleArray<int, D> X_strides, Y_dims;
for (int i = 0; i < D; ++i) { for (int i = 0; i < D; ++i) {
X_strides.data[i] = x_strides[i]; X_strides.data[i] = x_strides[i];
...@@ -50,46 +51,18 @@ void _TransposeImpl( ...@@ -50,46 +51,18 @@ void _TransposeImpl(
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \ #define DEFINE_KERNEL_LAUNCHER(T) \
template <> \ template <> \
void Transpose<T, CUDAContext>( \ void Transpose<T, CUDAContext>( \
const int num_dims, \ const int num_dims, \
const int64_t* x_strides, \ const int64_t* x_strides, \
const int64_t* y_dims, \ const int64_t* y_dims, \
const T* x, \ const T* x, \
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
CUDA_TENSOR_DIMS_CHECK(num_dims); \ CUDA_TENSOR_DIMS_CHECK(num_dims); \
const auto N = std::accumulate( \ DISPATCH_FUNC_BY_VALUE_WITH_TYPE_1( \
y_dims, y_dims + num_dims, 1, std::multiplies<int64_t>()); \ _TransposeImpl, T, num_dims, x_strides, y_dims, x, y, ctx); \
switch (num_dims) { \
case 1: \
_TransposeImpl<T, 1>(N, x_strides, y_dims, x, y, ctx); \
break; \
case 2: \
_TransposeImpl<T, 2>(N, x_strides, y_dims, x, y, ctx); \
break; \
case 3: \
_TransposeImpl<T, 3>(N, x_strides, y_dims, x, y, ctx); \
break; \
case 4: \
_TransposeImpl<T, 4>(N, x_strides, y_dims, x, y, ctx); \
break; \
case 5: \
_TransposeImpl<T, 5>(N, x_strides, y_dims, x, y, ctx); \
break; \
case 6: \
_TransposeImpl<T, 6>(N, x_strides, y_dims, x, y, ctx); \
break; \
case 7: \
_TransposeImpl<T, 7>(N, x_strides, y_dims, x, y, ctx); \
break; \
case 8: \
_TransposeImpl<T, 8>(N, x_strides, y_dims, x, y, ctx); \
break; \
default: \
break; \
} \
} }
DEFINE_KERNEL_LAUNCHER(bool); DEFINE_KERNEL_LAUNCHER(bool);
......
...@@ -82,7 +82,7 @@ __global__ void _SoftmaxCrossEntropyGrad( ...@@ -82,7 +82,7 @@ __global__ void _SoftmaxCrossEntropyGrad(
const int S, const int S,
const int C, const int C,
const int ignore_index, const int ignore_index,
const InputT* input, const InputT* /* input */,
const TargetT* target, const TargetT* target,
InputT* dx, InputT* dx,
InputT* mask) { InputT* mask) {
......
...@@ -38,7 +38,7 @@ __global__ void _NLLLossGrad( ...@@ -38,7 +38,7 @@ __global__ void _NLLLossGrad(
const int S, const int S,
const int C, const int C,
const int ignore_index, const int ignore_index,
const InputT* input, const InputT* /* input */,
const TargetT* target, const TargetT* target,
InputT* dx, InputT* dx,
InputT* mask) { InputT* mask) {
......
...@@ -67,7 +67,6 @@ template <typename T, typename AccT, int D> ...@@ -67,7 +67,6 @@ template <typename T, typename AccT, int D>
__global__ void _GenericMoments( __global__ void _GenericMoments(
const int rows, const int rows,
const int cols, const int cols,
const int num_dims,
const SimpleArray<int, D> X_dims, const SimpleArray<int, D> X_dims,
const SimpleArray<int, D> X_strides, const SimpleArray<int, D> X_strides,
const T* x, const T* x,
...@@ -80,7 +79,8 @@ __global__ void _GenericMoments( ...@@ -80,7 +79,8 @@ __global__ void _GenericMoments(
AccT m_val = AccT(0), v_val = AccT(0); AccT m_val = AccT(0), v_val = AccT(0);
CUDA_2D_KERNEL_LOOP2(j, cols) { CUDA_2D_KERNEL_LOOP2(j, cols) {
int xi = 0, c = i * cols + j; int xi = 0, c = i * cols + j;
for (int d = num_dims - 1; d >= 0; --d) { #pragma unroll
for (int d = D - 1; d >= 0; --d) {
int r; int r;
FIXED_DIVISOR_DIV_MOD(X_dims.data[d], c, &c, &r); FIXED_DIVISOR_DIV_MOD(X_dims.data[d], c, &c, &r);
xi += r * X_strides.data[d]; xi += r * X_strides.data[d];
...@@ -98,9 +98,8 @@ __global__ void _GenericMoments( ...@@ -98,9 +98,8 @@ __global__ void _GenericMoments(
} }
} }
template <typename T, typename AccT> template <typename T, typename AccT, int D>
void _Moments( void _GenericMomentsImpl(
const int num_dims,
const int* dims, const int* dims,
const int num_axes, const int num_axes,
const int* axes, const int* axes,
...@@ -108,70 +107,72 @@ void _Moments( ...@@ -108,70 +107,72 @@ void _Moments(
AccT* mean, AccT* mean,
AccT* var, AccT* var,
CUDAContext* ctx) { CUDAContext* ctx) {
int rows, cols; SimpleArray<int, D> transpose_axes;
vec32_t out_dims(dims, dims + num_dims); SimpleArray<int, D> transpose_strides;
for (int i = 0; i < num_axes; ++i) { SimpleArray<int, D> transpose_dims;
out_dims[axes[i]] = 1; math::utils::TransposeAxesForReduce(D, num_axes, axes, transpose_axes.data);
}
if (math::utils::IsRowwiseReduce(
num_dims, dims, out_dims.data(), &rows, &cols)) {
_RowwiseMoments<<<cols, CUDA_THREADS, 0, ctx->cuda_stream()>>>(
rows, cols, x, mean, var);
return;
}
if (math::utils::IsColwiseReduce(
num_dims, dims, out_dims.data(), &rows, &cols)) {
_ColwiseMoments<<<rows, CUDA_THREADS, 0, ctx->cuda_stream()>>>(
rows, cols, x, mean, var);
return;
}
CUDA_TENSOR_DIMS_CHECK(num_dims);
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> transpose_axes;
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> transpose_strides;
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> transpose_dims;
math::utils::TransposeAxesForReduce(
num_dims, num_axes, axes, transpose_axes.data);
math::utils::ComputeTransposeStrides( math::utils::ComputeTransposeStrides(
num_dims, dims, transpose_axes.data, transpose_strides.data); D, dims, transpose_axes.data, transpose_strides.data);
rows = cols = 1; int rows = 1, cols = 1;
const int pivot = num_dims - num_axes; const int pivot = D - num_axes;
for (int i = 0; i < pivot; ++i) { for (int i = 0; i < pivot; ++i) {
rows *= dims[transpose_axes.data[i]]; rows *= dims[transpose_axes.data[i]];
} }
for (int i = pivot; i < num_dims; ++i) { for (int i = pivot; i < D; ++i) {
cols *= dims[transpose_axes.data[i]]; cols *= dims[transpose_axes.data[i]];
} }
for (int i = 0; i < num_dims; ++i) { for (int i = 0; i < D; ++i) {
transpose_dims.data[i] = dims[transpose_axes.data[i]]; transpose_dims.data[i] = dims[transpose_axes.data[i]];
} }
_GenericMoments<<<rows, CUDA_THREADS, 0, ctx->cuda_stream()>>>( _GenericMoments<<<rows, CUDA_THREADS, 0, ctx->cuda_stream()>>>(
rows, cols, num_dims, transpose_dims, transpose_strides, x, mean, var); rows, cols, transpose_dims, transpose_strides, x, mean, var);
} }
} // namespace } // namespace
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T, AccT) \ #define DEFINE_KERNEL_LAUNCHER(T, AccT) \
template <> \ template <> \
void Moments<T, AccT, CUDAContext>( \ void Moments<T, AccT, CUDAContext>( \
const int num_dims, \ const int num_dims, \
const int* dims, \ const int* dims, \
const int num_axes, \ const int num_axes, \
const int* axes, \ const int* axes, \
const T* x, \ const T* x, \
AccT* mean, \ AccT* mean, \
AccT* var, \ AccT* var, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_Moments( \ int rows, cols; \
num_dims, \ vec32_t out_dims(dims, dims + num_dims); \
dims, \ for (int i = 0; i < num_axes; ++i) { \
num_axes, \ out_dims[axes[i]] = 1; \
axes, \ } \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \ if (math::utils::IsRowwiseReduce( \
mean, \ num_dims, dims, out_dims.data(), &rows, &cols)) { \
var, \ _RowwiseMoments<<<cols, CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
ctx); \ rows, cols, x, mean, var); \
return; \
} \
if (math::utils::IsColwiseReduce( \
num_dims, dims, out_dims.data(), &rows, &cols)) { \
_ColwiseMoments<<<rows, CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
rows, cols, x, mean, var); \
return; \
} \
CUDA_TENSOR_DIMS_CHECK(num_dims); \
DISPATCH_FUNC_BY_VALUE_WITH_TYPE_2( \
_GenericMomentsImpl, \
T, \
AccT, \
num_dims, \
dims, \
num_axes, \
axes, \
x, \
mean, \
var, \
ctx); \
} }
DEFINE_KERNEL_LAUNCHER(uint8_t, float); DEFINE_KERNEL_LAUNCHER(uint8_t, float);
......
...@@ -7,7 +7,7 @@ namespace kernels { ...@@ -7,7 +7,7 @@ namespace kernels {
namespace { namespace {
template <typename T> template <typename T, typename AccT>
void _AvgPool2dNCHW( void _AvgPool2dNCHW(
const int N, const int N,
const int C, const int C,
...@@ -28,31 +28,30 @@ void _AvgPool2dNCHW( ...@@ -28,31 +28,30 @@ void _AvgPool2dNCHW(
const auto NxCxHoxWo = N * C * out_h * out_w; const auto NxCxHoxWo = N * C * out_h * out_w;
std::array<int, 4> index = {0, 0, 0, 0}; std::array<int, 4> index = {0, 0, 0, 0};
std::array<int, 4> dims = {N, C, out_h, out_w}; std::array<int, 4> dims = {N, C, out_h, out_w};
T val, area;
int hstart, hend, wstart, wend; int hstart, hend, wstart, wend;
for (int i = 0; i < NxCxHoxWo; ++i) { for (int i = 0; i < NxCxHoxWo; ++i) {
hstart = index[2] * stride_h - pad_h; hstart = index[2] * stride_h - pad_h;
wstart = index[3] * stride_w - pad_w; wstart = index[3] * stride_w - pad_w;
hend = std::min(hstart + kernel_h, H + pad_h); hend = std::min(hstart + kernel_h, H + pad_h);
wend = std::min(wstart + kernel_w, W + pad_w); wend = std::min(wstart + kernel_w, W + pad_w);
area = (hend - hstart) * (wend - wstart); const AccT area = (hend - hstart) * (wend - wstart);
hend = std::min(hend, H); hend = std::min(hend, H);
wend = std::min(wend, W); wend = std::min(wend, W);
hstart = std::max(hstart, 0); hstart = std::max(hstart, 0);
wstart = std::max(wstart, 0); wstart = std::max(wstart, 0);
val = T(0); AccT val = AccT(0);
const T* offset_x = x + index[0] * CxHxW + index[1] * HxW; const T* offset_x = x + index[0] * CxHxW + index[1] * HxW;
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
val += offset_x[h * W + w]; val += convert::To<AccT>(offset_x[h * W + w]);
} }
} }
y[i] = val / area; y[i] = convert::To<T>(val / area);
math::utils::IncreaseIndexInDims(4, dims.data(), index.data()); math::utils::IncreaseIndexInDims(4, dims.data(), index.data());
} }
} }
template <typename T> template <typename T, typename AccT>
void _AvgPool2dNHWC( void _AvgPool2dNHWC(
const int N, const int N,
const int C, const int C,
...@@ -72,29 +71,30 @@ void _AvgPool2dNHWC( ...@@ -72,29 +71,30 @@ void _AvgPool2dNHWC(
const auto NxHoxWoxC = N * C * out_h * out_w; const auto NxHoxWoxC = N * C * out_h * out_w;
std::array<int, 4> index = {0, 0, 0, 0}; std::array<int, 4> index = {0, 0, 0, 0};
std::array<int, 4> dims = {N, out_h, out_w, C}; std::array<int, 4> dims = {N, out_h, out_w, C};
T val, area;
int hstart, hend, wstart, wend; int hstart, hend, wstart, wend;
for (int i = 0; i < NxHoxWoxC; ++i) { for (int i = 0; i < NxHoxWoxC; ++i) {
hstart = index[1] * stride_h - pad_h; hstart = index[1] * stride_h - pad_h;
wstart = index[2] * stride_w - pad_w; wstart = index[2] * stride_w - pad_w;
hend = std::min(hstart + kernel_h, H + pad_h); hend = std::min(hstart + kernel_h, H + pad_h);
wend = std::min(wstart + kernel_w, W + pad_w); wend = std::min(wstart + kernel_w, W + pad_w);
area = (hend - hstart) * (wend - wstart); const AccT area = (hend - hstart) * (wend - wstart);
hend = std::min(hend, H); hend = std::min(hend, H);
wend = std::min(wend, W); wend = std::min(wend, W);
hstart = std::max(hstart, 0); hstart = std::max(hstart, 0);
wstart = std::max(wstart, 0); wstart = std::max(wstart, 0);
const T* offset_x = x + index[0] * HxWxC + index[3]; const T* offset_x = x + index[0] * HxWxC + index[3];
val = T(0); AccT val = AccT(0);
for (int h = hstart; h < hend; ++h) for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) for (int w = wstart; w < wend; ++w) {
val += offset_x[(h * W + w) * C]; val += convert::To<AccT>(offset_x[(h * W + w) * C]);
y[i] = val / area; }
}
y[i] = convert::To<T>(val / area);
math::utils::IncreaseIndexInDims(4, dims.data(), index.data()); math::utils::IncreaseIndexInDims(4, dims.data(), index.data());
} }
} }
template <typename T> template <typename T, typename AccT>
void _AvgPool2dGradNCHW( void _AvgPool2dGradNCHW(
const int N, const int N,
const int C, const int C,
...@@ -115,7 +115,6 @@ void _AvgPool2dGradNCHW( ...@@ -115,7 +115,6 @@ void _AvgPool2dGradNCHW(
const auto NxCxHoxWo = N * C * out_h * out_w; const auto NxCxHoxWo = N * C * out_h * out_w;
std::array<int, 4> index = {0, 0, 0, 0}; std::array<int, 4> index = {0, 0, 0, 0};
std::array<int, 4> dims = {N, C, out_h, out_w}; std::array<int, 4> dims = {N, C, out_h, out_w};
T area;
int hstart, hend, wstart, wend, xi; int hstart, hend, wstart, wend, xi;
memset(dx, 0, sizeof(T) * N * CxHxW); memset(dx, 0, sizeof(T) * N * CxHxW);
for (int i = 0; i < NxCxHoxWo; ++i) { for (int i = 0; i < NxCxHoxWo; ++i) {
...@@ -123,22 +122,24 @@ void _AvgPool2dGradNCHW( ...@@ -123,22 +122,24 @@ void _AvgPool2dGradNCHW(
wstart = index[3] * stride_w - pad_w; wstart = index[3] * stride_w - pad_w;
hend = std::min(hstart + kernel_h, H + pad_h); hend = std::min(hstart + kernel_h, H + pad_h);
wend = std::min(wstart + kernel_w, W + pad_w); wend = std::min(wstart + kernel_w, W + pad_w);
area = (hend - hstart) * (wend - wstart); const AccT area = (hend - hstart) * (wend - wstart);
hend = std::min(hend, H); hend = std::min(hend, H);
wend = std::min(wend, W); wend = std::min(wend, W);
hstart = std::max(hstart, 0); hstart = std::max(hstart, 0);
wstart = std::max(wstart, 0); wstart = std::max(wstart, 0);
const AccT val = convert::To<AccT>(dy[i]) / area;
T* offset_dx = dx + index[0] * CxHxW + index[1] * HxW; T* offset_dx = dx + index[0] * CxHxW + index[1] * HxW;
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
offset_dx[h * W + w] += dy[i] / area; const auto xi = h * W + w;
offset_dx[xi] = convert::To<T>(val + convert::To<AccT>(offset_dx[xi]));
} }
} }
math::utils::IncreaseIndexInDims(4, dims.data(), index.data()); math::utils::IncreaseIndexInDims(4, dims.data(), index.data());
} }
} }
template <typename T> template <typename T, typename AccT>
void _AvgPool2dGradNHWC( void _AvgPool2dGradNHWC(
const int N, const int N,
const int C, const int C,
...@@ -158,7 +159,6 @@ void _AvgPool2dGradNHWC( ...@@ -158,7 +159,6 @@ void _AvgPool2dGradNHWC(
const auto NxHoxWoxC = N * C * out_h * out_w; const auto NxHoxWoxC = N * C * out_h * out_w;
std::array<int, 4> index = {0, 0, 0, 0}; std::array<int, 4> index = {0, 0, 0, 0};
std::array<int, 4> dims = {N, out_h, out_w, C}; std::array<int, 4> dims = {N, out_h, out_w, C};
T area;
int hstart, hend, wstart, wend, xi; int hstart, hend, wstart, wend, xi;
memset(dx, 0, sizeof(T) * N * HxWxC); memset(dx, 0, sizeof(T) * N * HxWxC);
for (int i = 0; i < NxHoxWoxC; ++i) { for (int i = 0; i < NxHoxWoxC; ++i) {
...@@ -166,20 +166,24 @@ void _AvgPool2dGradNHWC( ...@@ -166,20 +166,24 @@ void _AvgPool2dGradNHWC(
wstart = index[2] * stride_w - pad_w; wstart = index[2] * stride_w - pad_w;
hend = std::min(hstart + kernel_h, H + pad_h); hend = std::min(hstart + kernel_h, H + pad_h);
wend = std::min(wstart + kernel_w, W + pad_w); wend = std::min(wstart + kernel_w, W + pad_w);
area = (hend - hstart) * (wend - wstart); const AccT area = (hend - hstart) * (wend - wstart);
hend = std::min(hend, H); hend = std::min(hend, H);
wend = std::min(wend, W); wend = std::min(wend, W);
hstart = std::max(hstart, 0); hstart = std::max(hstart, 0);
wstart = std::max(wstart, 0); wstart = std::max(wstart, 0);
const AccT val = convert::To<AccT>(dy[i]) / area;
T* offset_dx = dx + index[0] * HxWxC + index[3]; T* offset_dx = dx + index[0] * HxWxC + index[3];
for (int h = hstart; h < hend; ++h) for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) for (int w = wstart; w < wend; ++w) {
offset_dx[(h * W + w) * C] += dy[i] / area; const auto xi = (h * W + w) * C;
offset_dx[xi] = convert::To<T>(val + convert::To<AccT>(offset_dx[xi]));
}
}
math::utils::IncreaseIndexInDims(4, dims.data(), index.data()); math::utils::IncreaseIndexInDims(4, dims.data(), index.data());
} }
} }
template <typename T> template <typename T, typename AccT>
void _AvgPool3dNCHW( void _AvgPool3dNCHW(
const int N, const int N,
const int C, const int C,
...@@ -205,7 +209,6 @@ void _AvgPool3dNCHW( ...@@ -205,7 +209,6 @@ void _AvgPool3dNCHW(
const auto NxCxDoxHoxWo = N * C * out_d * out_h * out_w; const auto NxCxDoxHoxWo = N * C * out_d * out_h * out_w;
std::array<int, 5> index = {0, 0, 0, 0, 0}; std::array<int, 5> index = {0, 0, 0, 0, 0};
std::array<int, 5> dims = {N, C, out_d, out_h, out_w}; std::array<int, 5> dims = {N, C, out_d, out_h, out_w};
T val, area;
int dstart, dend, hstart, hend, wstart, wend; int dstart, dend, hstart, hend, wstart, wend;
for (int i = 0; i < NxCxDoxHoxWo; ++i) { for (int i = 0; i < NxCxDoxHoxWo; ++i) {
dstart = index[2] * stride_d - pad_d; dstart = index[2] * stride_d - pad_d;
...@@ -214,28 +217,28 @@ void _AvgPool3dNCHW( ...@@ -214,28 +217,28 @@ void _AvgPool3dNCHW(
dend = std::min(dstart + kernel_d, D + pad_d); dend = std::min(dstart + kernel_d, D + pad_d);
hend = std::min(hstart + kernel_h, H + pad_h); hend = std::min(hstart + kernel_h, H + pad_h);
wend = std::min(wstart + kernel_w, W + pad_w); wend = std::min(wstart + kernel_w, W + pad_w);
area = (dend - dstart) * (hend - hstart) * (wend - wstart); const AccT area = (dend - dstart) * (hend - hstart) * (wend - wstart);
dend = std::min(dend, D); dend = std::min(dend, D);
hend = std::min(hend, H); hend = std::min(hend, H);
wend = std::min(wend, W); wend = std::min(wend, W);
dstart = std::max(dstart, 0); dstart = std::max(dstart, 0);
hstart = std::max(hstart, 0); hstart = std::max(hstart, 0);
wstart = std::max(wstart, 0); wstart = std::max(wstart, 0);
val = T(0); AccT val = AccT(0);
const T* offset_x = x + index[0] * CxDxHxW + index[1] * DxHxW; const T* offset_x = x + index[0] * CxDxHxW + index[1] * DxHxW;
for (int d = dstart; d < dend; ++d) { for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
val += offset_x[(d * H + h) * W + w]; val += convert::To<AccT>(offset_x[(d * H + h) * W + w]);
} }
} }
} }
y[i] = val / area; y[i] = convert::To<T>(val / area);
math::utils::IncreaseIndexInDims(5, dims.data(), index.data()); math::utils::IncreaseIndexInDims(5, dims.data(), index.data());
} }
} }
template <typename T> template <typename T, typename AccT>
void _AvgPool3dNHWC( void _AvgPool3dNHWC(
const int N, const int N,
const int C, const int C,
...@@ -260,7 +263,6 @@ void _AvgPool3dNHWC( ...@@ -260,7 +263,6 @@ void _AvgPool3dNHWC(
const auto NxDoxHoxWoxC = N * C * out_d * out_h * out_w; const auto NxDoxHoxWoxC = N * C * out_d * out_h * out_w;
std::array<int, 5> index = {0, 0, 0, 0, 0}; std::array<int, 5> index = {0, 0, 0, 0, 0};
std::array<int, 5> dims = {N, out_d, out_h, out_w, C}; std::array<int, 5> dims = {N, out_d, out_h, out_w, C};
T val, area;
int dstart, dend, hstart, hend, wstart, wend; int dstart, dend, hstart, hend, wstart, wend;
for (int i = 0; i < NxDoxHoxWoxC; ++i) { for (int i = 0; i < NxDoxHoxWoxC; ++i) {
dstart = index[1] * stride_d - pad_d; dstart = index[1] * stride_d - pad_d;
...@@ -269,7 +271,7 @@ void _AvgPool3dNHWC( ...@@ -269,7 +271,7 @@ void _AvgPool3dNHWC(
dend = std::min(dstart + kernel_d, D + pad_d); dend = std::min(dstart + kernel_d, D + pad_d);
hend = std::min(hstart + kernel_h, H + pad_h); hend = std::min(hstart + kernel_h, H + pad_h);
wend = std::min(wstart + kernel_w, W + pad_w); wend = std::min(wstart + kernel_w, W + pad_w);
area = (dend - dstart) * (hend - hstart) * (wend - wstart); const AccT area = (dend - dstart) * (hend - hstart) * (wend - wstart);
dend = std::min(dend, D); dend = std::min(dend, D);
hend = std::min(hend, H); hend = std::min(hend, H);
wend = std::min(wend, W); wend = std::min(wend, W);
...@@ -277,20 +279,20 @@ void _AvgPool3dNHWC( ...@@ -277,20 +279,20 @@ void _AvgPool3dNHWC(
hstart = std::max(hstart, 0); hstart = std::max(hstart, 0);
wstart = std::max(wstart, 0); wstart = std::max(wstart, 0);
const T* offset_x = x + index[0] * DxHxWxC + index[4]; const T* offset_x = x + index[0] * DxHxWxC + index[4];
val = T(0); AccT val = AccT(0);
for (int d = dstart; d < dend; ++d) { for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
val += offset_x[((d * H + h) * W + w) * C]; val += convert::To<AccT>(offset_x[((d * H + h) * W + w) * C]);
} }
} }
} }
y[i] = val / area; y[i] = convert::To<T>(val / area);
math::utils::IncreaseIndexInDims(5, dims.data(), index.data()); math::utils::IncreaseIndexInDims(5, dims.data(), index.data());
} }
} }
template <typename T> template <typename T, typename AccT>
void _AvgPool3dGradNCHW( void _AvgPool3dGradNCHW(
const int N, const int N,
const int C, const int C,
...@@ -316,7 +318,6 @@ void _AvgPool3dGradNCHW( ...@@ -316,7 +318,6 @@ void _AvgPool3dGradNCHW(
const auto NxCxDoxHoxWo = N * C * out_d * out_h * out_w; const auto NxCxDoxHoxWo = N * C * out_d * out_h * out_w;
std::array<int, 5> index = {0, 0, 0, 0, 0}; std::array<int, 5> index = {0, 0, 0, 0, 0};
std::array<int, 5> dims = {N, C, out_d, out_h, out_w}; std::array<int, 5> dims = {N, C, out_d, out_h, out_w};
T area;
int dstart, dend, hstart, hend, wstart, wend, xi; int dstart, dend, hstart, hend, wstart, wend, xi;
memset(dx, 0, sizeof(T) * N * CxDxHxW); memset(dx, 0, sizeof(T) * N * CxDxHxW);
for (int i = 0; i < NxCxDoxHoxWo; ++i) { for (int i = 0; i < NxCxDoxHoxWo; ++i) {
...@@ -326,18 +327,21 @@ void _AvgPool3dGradNCHW( ...@@ -326,18 +327,21 @@ void _AvgPool3dGradNCHW(
dend = std::min(dstart + kernel_d, D + pad_d); dend = std::min(dstart + kernel_d, D + pad_d);
hend = std::min(hstart + kernel_h, H + pad_h); hend = std::min(hstart + kernel_h, H + pad_h);
wend = std::min(wstart + kernel_w, W + pad_w); wend = std::min(wstart + kernel_w, W + pad_w);
area = (dend - dstart) * (hend - hstart) * (wend - wstart); const AccT area = (dend - dstart) * (hend - hstart) * (wend - wstart);
dend = std::min(dend, D); dend = std::min(dend, D);
hend = std::min(hend, H); hend = std::min(hend, H);
wend = std::min(wend, W); wend = std::min(wend, W);
dstart = std::max(dstart, 0); dstart = std::max(dstart, 0);
hstart = std::max(hstart, 0); hstart = std::max(hstart, 0);
wstart = std::max(wstart, 0); wstart = std::max(wstart, 0);
const AccT val = convert::To<AccT>(dy[i]) / area;
T* offset_dx = dx + index[0] * CxDxHxW + index[1] * DxHxW; T* offset_dx = dx + index[0] * CxDxHxW + index[1] * DxHxW;
for (int d = dstart; d < dend; ++d) { for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
offset_dx[((d * H) + h) * W + w] += dy[i] / area; const auto xi = ((d * H) + h) * W + w;
offset_dx[xi] =
convert::To<T>(val + convert::To<AccT>(offset_dx[xi]));
} }
} }
} }
...@@ -345,7 +349,7 @@ void _AvgPool3dGradNCHW( ...@@ -345,7 +349,7 @@ void _AvgPool3dGradNCHW(
} }
} }
template <typename T> template <typename T, typename AccT>
void _AvgPool3dGradNHWC( void _AvgPool3dGradNHWC(
const int N, const int N,
const int C, const int C,
...@@ -370,7 +374,6 @@ void _AvgPool3dGradNHWC( ...@@ -370,7 +374,6 @@ void _AvgPool3dGradNHWC(
const auto NxDoxHoxWoxC = N * C * out_d * out_h * out_w; const auto NxDoxHoxWoxC = N * C * out_d * out_h * out_w;
std::array<int, 5> index = {0, 0, 0, 0, 0}; std::array<int, 5> index = {0, 0, 0, 0, 0};
std::array<int, 5> dims = {N, out_d, out_h, out_w, C}; std::array<int, 5> dims = {N, out_d, out_h, out_w, C};
T area;
int dstart, dend, hstart, hend, wstart, wend, xi; int dstart, dend, hstart, hend, wstart, wend, xi;
memset(dx, 0, sizeof(T) * N * DxHxWxC); memset(dx, 0, sizeof(T) * N * DxHxWxC);
for (int i = 0; i < NxDoxHoxWoxC; ++i) { for (int i = 0; i < NxDoxHoxWoxC; ++i) {
...@@ -380,18 +383,21 @@ void _AvgPool3dGradNHWC( ...@@ -380,18 +383,21 @@ void _AvgPool3dGradNHWC(
dend = std::min(dstart + kernel_d, D + pad_d); dend = std::min(dstart + kernel_d, D + pad_d);
hend = std::min(hstart + kernel_h, H + pad_h); hend = std::min(hstart + kernel_h, H + pad_h);
wend = std::min(wstart + kernel_w, W + pad_w); wend = std::min(wstart + kernel_w, W + pad_w);
area = (dend - dstart) * (hend - hstart) * (wend - wstart); const AccT area = (dend - dstart) * (hend - hstart) * (wend - wstart);
dend = std::min(dend, D); dend = std::min(dend, D);
hend = std::min(hend, H); hend = std::min(hend, H);
wend = std::min(wend, W); wend = std::min(wend, W);
dstart = std::max(dstart, 0); dstart = std::max(dstart, 0);
hstart = std::max(hstart, 0); hstart = std::max(hstart, 0);
wstart = std::max(wstart, 0); wstart = std::max(wstart, 0);
const AccT val = convert::To<AccT>(dy[i]) / area;
T* offset_dx = dx + index[0] * DxHxWxC + index[4]; T* offset_dx = dx + index[0] * DxHxWxC + index[4];
for (int d = dstart; d < dend; ++d) { for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
offset_dx[((d * H + h) * W + w) * C] += dy[i] / area; const auto xi = ((d * H + h) * W + w) * C;
offset_dx[xi] =
convert::To<T>(val + convert::To<AccT>(offset_dx[xi]));
} }
} }
} }
...@@ -403,11 +409,11 @@ void _AvgPool3dGradNHWC( ...@@ -403,11 +409,11 @@ void _AvgPool3dGradNHWC(
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
#define DISPATCH_POOL_KERNEL(name, ...) \ #define DISPATCH_POOL_KERNEL(name, T, AccT, ...) \
if (data_format == "NCHW") { \ if (data_format == "NCHW") { \
name##NCHW(__VA_ARGS__); \ name##NCHW<T, AccT>(__VA_ARGS__); \
} else if (data_format == "NHWC") { \ } else if (data_format == "NHWC") { \
name##NHWC(__VA_ARGS__); \ name##NHWC<T, AccT>(__VA_ARGS__); \
} else { \ } else { \
LOG(FATAL) << "Unknown DataFormat: " << data_format; \ LOG(FATAL) << "Unknown DataFormat: " << data_format; \
} }
...@@ -433,6 +439,8 @@ void _AvgPool3dGradNHWC( ...@@ -433,6 +439,8 @@ void _AvgPool3dGradNHWC(
CPUContext* ctx) { \ CPUContext* ctx) { \
DISPATCH_POOL_KERNEL( \ DISPATCH_POOL_KERNEL( \
_##name, \ _##name, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
N, \ N, \
C, \ C, \
H, \ H, \
...@@ -449,8 +457,10 @@ void _AvgPool3dGradNHWC( ...@@ -449,8 +457,10 @@ void _AvgPool3dGradNHWC(
y); \ y); \
} }
DEFINE_KERNEL_LAUNCHER(AvgPool2d, float16);
DEFINE_KERNEL_LAUNCHER(AvgPool2d, float); DEFINE_KERNEL_LAUNCHER(AvgPool2d, float);
DEFINE_KERNEL_LAUNCHER(AvgPool2d, double); DEFINE_KERNEL_LAUNCHER(AvgPool2d, double);
DEFINE_KERNEL_LAUNCHER(AvgPool2dGrad, float16); // AvgPool2dGrad
DEFINE_KERNEL_LAUNCHER(AvgPool2dGrad, float); // AvgPool2dGrad DEFINE_KERNEL_LAUNCHER(AvgPool2dGrad, float); // AvgPool2dGrad
DEFINE_KERNEL_LAUNCHER(AvgPool2dGrad, double); // AvgPool2dGrad DEFINE_KERNEL_LAUNCHER(AvgPool2dGrad, double); // AvgPool2dGrad
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
...@@ -481,6 +491,8 @@ DEFINE_KERNEL_LAUNCHER(AvgPool2dGrad, double); // AvgPool2dGrad ...@@ -481,6 +491,8 @@ DEFINE_KERNEL_LAUNCHER(AvgPool2dGrad, double); // AvgPool2dGrad
CPUContext* ctx) { \ CPUContext* ctx) { \
DISPATCH_POOL_KERNEL( \ DISPATCH_POOL_KERNEL( \
_##name, \ _##name, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
N, \ N, \
C, \ C, \
D, \ D, \
...@@ -502,8 +514,10 @@ DEFINE_KERNEL_LAUNCHER(AvgPool2dGrad, double); // AvgPool2dGrad ...@@ -502,8 +514,10 @@ DEFINE_KERNEL_LAUNCHER(AvgPool2dGrad, double); // AvgPool2dGrad
y); \ y); \
} }
DEFINE_KERNEL_LAUNCHER(AvgPool3d, float16);
DEFINE_KERNEL_LAUNCHER(AvgPool3d, float); DEFINE_KERNEL_LAUNCHER(AvgPool3d, float);
DEFINE_KERNEL_LAUNCHER(AvgPool3d, double); DEFINE_KERNEL_LAUNCHER(AvgPool3d, double);
DEFINE_KERNEL_LAUNCHER(AvgPool3dGrad, float16); // AvgPool3dGrad
DEFINE_KERNEL_LAUNCHER(AvgPool3dGrad, float); // AvgPool3dGrad DEFINE_KERNEL_LAUNCHER(AvgPool3dGrad, float); // AvgPool3dGrad
DEFINE_KERNEL_LAUNCHER(AvgPool3dGrad, double); // AvgPool3dGrad DEFINE_KERNEL_LAUNCHER(AvgPool3dGrad, double); // AvgPool3dGrad
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
......
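
The CPU pooling kernels follow the same accumulator-type pattern: window sums and the padded-area divisor are kept in AccT (float when T is float16), and the result is narrowed back to T once per output element. A small self-contained 1-D sketch of that pattern (the function name and shape are assumptions for illustration, not the repository's API):

#include <algorithm>
#include <cstdio>

// Average pooling over a 1-D row: accumulate in AccT, convert back to T once.
template <typename T, typename AccT>
void AveragePool1d(const int W, const int kernel_w, const int stride_w,
                   const int pad_w, const int out_w, const T* x, T* y) {
  for (int ow = 0; ow < out_w; ++ow) {
    int wstart = ow * stride_w - pad_w;
    int wend = std::min(wstart + kernel_w, W + pad_w);
    const AccT area = static_cast<AccT>(wend - wstart);  // includes padding
    wend = std::min(wend, W);
    wstart = std::max(wstart, 0);
    AccT val = AccT(0);
    for (int w = wstart; w < wend; ++w) {
      val += static_cast<AccT>(x[w]);  // widen each element before summing
    }
    y[ow] = static_cast<T>(val / area);  // single narrowing conversion
  }
}

int main() {
  const float x[5] = {1, 2, 3, 4, 5};
  float y[2];
  AveragePool1d<float, float>(5, /*kernel_w=*/3, /*stride_w=*/2,
                              /*pad_w=*/0, /*out_w=*/2, x, y);
  std::printf("%g %g\n", y[0], y[1]);  // 2 4
  return 0;
}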
#ifdef USE_CUDA #ifdef USE_CUDA
#include "dragon/core/context_cuda.h" #include "dragon/core/context_cuda.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -9,7 +10,9 @@ namespace kernels { ...@@ -9,7 +10,9 @@ namespace kernels {
namespace { namespace {
template <typename T> #define LDG(x, i) convert::To<AccT>(__ldg(x + i))
template <typename T, typename AccT>
__global__ void _AvgPool2dNCHW( __global__ void _AvgPool2dNCHW(
const int nthreads, const int nthreads,
const int C, const int C,
...@@ -35,24 +38,24 @@ __global__ void _AvgPool2dNCHW( ...@@ -35,24 +38,24 @@ __global__ void _AvgPool2dNCHW(
int wstart = w_out * stride_w - pad_w; int wstart = w_out * stride_w - pad_w;
int hend = min(hstart + kernel_h, H + pad_h); int hend = min(hstart + kernel_h, H + pad_h);
int wend = min(wstart + kernel_w, W + pad_w); int wend = min(wstart + kernel_w, W + pad_w);
const T area = (hend - hstart) * (wend - wstart); const AccT area = (hend - hstart) * (wend - wstart);
hstart = max(hstart, 0); hstart = max(hstart, 0);
wstart = max(wstart, 0); wstart = max(wstart, 0);
hend = min(hend, H); hend = min(hend, H);
wend = min(wend, W); wend = min(wend, W);
const T* offset_x = x + (n * C + c) * H * W; const T* offset_x = x + (n * C + c) * H * W;
T val = T(0); AccT val = AccT(0);
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
val += offset_x[h * W + w]; val += LDG(offset_x, h * W + w);
} }
} }
y[yi] = val / area; y[yi] = convert::To<T>(val / area);
} }
} }
template <typename T> template <typename T, typename AccT>
__global__ void _AvgPool2dNHWC( __global__ void _AvgPool2dNHWC(
const int nthreads, const int nthreads,
const int C, const int C,
...@@ -78,24 +81,24 @@ __global__ void _AvgPool2dNHWC( ...@@ -78,24 +81,24 @@ __global__ void _AvgPool2dNHWC(
int wstart = w_out * stride_w - pad_w; int wstart = w_out * stride_w - pad_w;
int hend = min(hstart + kernel_h, H + pad_h); int hend = min(hstart + kernel_h, H + pad_h);
int wend = min(wstart + kernel_w, W + pad_w); int wend = min(wstart + kernel_w, W + pad_w);
const T area = (hend - hstart) * (wend - wstart); const AccT area = (hend - hstart) * (wend - wstart);
hstart = max(hstart, 0); hstart = max(hstart, 0);
wstart = max(wstart, 0); wstart = max(wstart, 0);
hend = min(hend, H); hend = min(hend, H);
wend = min(wend, W); wend = min(wend, W);
const T* offset_x = x + n * H * W * C + c; const T* offset_x = x + n * H * W * C + c;
T val = T(0); AccT val = AccT(0);
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
val += offset_x[(h * W + w) * C]; val += LDG(offset_x, (h * W + w) * C);
} }
} }
y[yi] = val / area; y[yi] = convert::To<T>(val / area);
} }
} }
template <typename T> template <typename T, typename AccT>
__global__ void _AvgPool2dGradNCHW( __global__ void _AvgPool2dGradNCHW(
const int nthreads, const int nthreads,
const int C, const int C,
...@@ -123,22 +126,22 @@ __global__ void _AvgPool2dGradNCHW( ...@@ -123,22 +126,22 @@ __global__ void _AvgPool2dGradNCHW(
const int out_wend = min(w / stride_w + 1, out_w); const int out_wend = min(w / stride_w + 1, out_w);
const T* offset_dy = dy + (n * C + c) * out_h * out_w; const T* offset_dy = dy + (n * C + c) * out_h * out_w;
T val = T(0); AccT val = AccT(0);
for (int h_out = out_hstart; h_out < out_hend; ++h_out) { for (int h_out = out_hstart; h_out < out_hend; ++h_out) {
const int hstart = h_out * stride_h - pad_h; const int hstart = h_out * stride_h - pad_h;
const int hend = min(hstart + kernel_h, H + pad_h); const int hend = min(hstart + kernel_h, H + pad_h);
for (int w_out = out_wstart; w_out < out_wend; ++w_out) { for (int w_out = out_wstart; w_out < out_wend; ++w_out) {
const int wstart = w_out * stride_w - pad_w; const int wstart = w_out * stride_w - pad_w;
const int wend = min(wstart + kernel_w, W + pad_w); const int wend = min(wstart + kernel_w, W + pad_w);
const T area = (hend - hstart) * (wend - wstart); const AccT area = (hend - hstart) * (wend - wstart);
val += offset_dy[h_out * out_w + w_out] / area; val += LDG(offset_dy, h_out * out_w + w_out) / area;
} }
} }
dx[xi] = val; dx[xi] = convert::To<T>(val);
} }
} }
template <typename T> template <typename T, typename AccT>
__global__ void _AvgPool2dGradNHWC( __global__ void _AvgPool2dGradNHWC(
const int nthreads, const int nthreads,
const int C, const int C,
...@@ -166,22 +169,22 @@ __global__ void _AvgPool2dGradNHWC( ...@@ -166,22 +169,22 @@ __global__ void _AvgPool2dGradNHWC(
const int out_wend = min(w / stride_w + 1, out_w); const int out_wend = min(w / stride_w + 1, out_w);
const T* offset_dy = dy + n * out_h * out_w * C + c; const T* offset_dy = dy + n * out_h * out_w * C + c;
T val = T(0); AccT val = AccT(0);
for (int h_out = out_hstart; h_out < out_hend; ++h_out) { for (int h_out = out_hstart; h_out < out_hend; ++h_out) {
const int hstart = h_out * stride_h - pad_h; const int hstart = h_out * stride_h - pad_h;
const int hend = min(hstart + kernel_h, H + pad_h); const int hend = min(hstart + kernel_h, H + pad_h);
for (int w_out = out_wstart; w_out < out_wend; ++w_out) { for (int w_out = out_wstart; w_out < out_wend; ++w_out) {
const int wstart = w_out * stride_w - pad_w; const int wstart = w_out * stride_w - pad_w;
const int wend = min(wstart + kernel_w, W + pad_w); const int wend = min(wstart + kernel_w, W + pad_w);
const T area = (hend - hstart) * (wend - wstart); const AccT area = (hend - hstart) * (wend - wstart);
val += offset_dy[(h_out * out_w + w_out) * C] / area; val += LDG(offset_dy, (h_out * out_w + w_out) * C) / area;
} }
} }
dx[xi] = val; dx[xi] = convert::To<T>(val);
} }
} }
template <typename T> template <typename T, typename AccT>
__global__ void _AvgPool3dNCHW( __global__ void _AvgPool3dNCHW(
const int nthreads, const int nthreads,
const int C, const int C,
...@@ -218,7 +221,7 @@ __global__ void _AvgPool3dNCHW( ...@@ -218,7 +221,7 @@ __global__ void _AvgPool3dNCHW(
int dend = min(dstart + kernel_d, D + pad_d); int dend = min(dstart + kernel_d, D + pad_d);
int hend = min(hstart + kernel_h, H + pad_h); int hend = min(hstart + kernel_h, H + pad_h);
int wend = min(wstart + kernel_w, W + pad_w); int wend = min(wstart + kernel_w, W + pad_w);
const T area = (dend - dstart) * (hend - hstart) * (wend - wstart); const AccT area = (dend - dstart) * (hend - hstart) * (wend - wstart);
dstart = max(dstart, 0); dstart = max(dstart, 0);
hstart = max(hstart, 0); hstart = max(hstart, 0);
wstart = max(wstart, 0); wstart = max(wstart, 0);
...@@ -227,19 +230,19 @@ __global__ void _AvgPool3dNCHW( ...@@ -227,19 +230,19 @@ __global__ void _AvgPool3dNCHW(
wend = min(wend, W); wend = min(wend, W);
const T* offset_x = x + (n * C + c) * D * H * W; const T* offset_x = x + (n * C + c) * D * H * W;
T val = T(0); AccT val = AccT(0);
for (int d = dstart; d < dend; ++d) { for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
val += offset_x[(d * H + h) * W + w]; val += LDG(offset_x, (d * H + h) * W + w);
} }
} }
} }
y[yi] = val / area; y[yi] = convert::To<T>(val / area);
} }
} }
template <typename T> template <typename T, typename AccT>
__global__ void _AvgPool3dNHWC( __global__ void _AvgPool3dNHWC(
const int nthreads, const int nthreads,
const int C, const int C,
...@@ -276,7 +279,7 @@ __global__ void _AvgPool3dNHWC( ...@@ -276,7 +279,7 @@ __global__ void _AvgPool3dNHWC(
int dend = min(dstart + kernel_d, D + pad_d); int dend = min(dstart + kernel_d, D + pad_d);
int hend = min(hstart + kernel_h, H + pad_h); int hend = min(hstart + kernel_h, H + pad_h);
int wend = min(wstart + kernel_w, W + pad_w); int wend = min(wstart + kernel_w, W + pad_w);
const T area = (dend - dstart) * (hend - hstart) * (wend - wstart); const AccT area = (dend - dstart) * (hend - hstart) * (wend - wstart);
dstart = max(dstart, 0); dstart = max(dstart, 0);
hstart = max(hstart, 0); hstart = max(hstart, 0);
wstart = max(wstart, 0); wstart = max(wstart, 0);
...@@ -285,19 +288,19 @@ __global__ void _AvgPool3dNHWC( ...@@ -285,19 +288,19 @@ __global__ void _AvgPool3dNHWC(
wend = min(wend, W); wend = min(wend, W);
const T* offset_x = x + n * D * H * W * C + c; const T* offset_x = x + n * D * H * W * C + c;
T val = T(0); AccT val = AccT(0);
for (int d = dstart; d < dend; ++d) { for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
val += offset_x[((d * H + h) * W + w) * C]; val += LDG(offset_x, ((d * H + h) * W + w) * C);
} }
} }
} }
y[yi] = val / area; y[yi] = convert::To<T>(val / area);
} }
} }
template <typename T> template <typename T, typename AccT>
__global__ void _AvgPool3dGradNCHW( __global__ void _AvgPool3dGradNCHW(
const int nthreads, const int nthreads,
const int C, const int C,
...@@ -336,7 +339,7 @@ __global__ void _AvgPool3dGradNCHW( ...@@ -336,7 +339,7 @@ __global__ void _AvgPool3dGradNCHW(
const int out_wend = min(w / stride_w + 1, out_w); const int out_wend = min(w / stride_w + 1, out_w);
const T* offset_dy = dy + (n * C + c) * out_d * out_h * out_w; const T* offset_dy = dy + (n * C + c) * out_d * out_h * out_w;
T val = T(0); AccT val = AccT(0);
for (int d_out = out_dstart; d_out < out_dend; ++d_out) { for (int d_out = out_dstart; d_out < out_dend; ++d_out) {
const int dstart = d_out * stride_d - pad_d; const int dstart = d_out * stride_d - pad_d;
const int dend = min(dstart + kernel_d, D + pad_d); const int dend = min(dstart + kernel_d, D + pad_d);
...@@ -346,16 +349,16 @@ __global__ void _AvgPool3dGradNCHW( ...@@ -346,16 +349,16 @@ __global__ void _AvgPool3dGradNCHW(
for (int w_out = out_wstart; w_out < out_wend; ++w_out) { for (int w_out = out_wstart; w_out < out_wend; ++w_out) {
const int wstart = w_out * stride_w - pad_w; const int wstart = w_out * stride_w - pad_w;
const int wend = min(wstart + kernel_w, W + pad_w); const int wend = min(wstart + kernel_w, W + pad_w);
const T area = (dend - dstart) * (hend - hstart) * (wend - wstart); const AccT area = (dend - dstart) * (hend - hstart) * (wend - wstart);
val += offset_dy[(d_out * out_h + h_out) * out_w + w_out] / area; val += LDG(offset_dy, (d_out * out_h + h_out) * out_w + w_out) / area;
} }
} }
} }
dx[xi] = val; dx[xi] = convert::To<T>(val);
} }
} }
template <typename T> template <typename T, typename AccT>
__global__ void _AvgPool3dGradNHWC( __global__ void _AvgPool3dGradNHWC(
const int nthreads, const int nthreads,
const int C, const int C,
...@@ -394,7 +397,7 @@ __global__ void _AvgPool3dGradNHWC( ...@@ -394,7 +397,7 @@ __global__ void _AvgPool3dGradNHWC(
const int out_wend = min(w / stride_w + 1, out_w); const int out_wend = min(w / stride_w + 1, out_w);
const T* offset_dy = dy + n * out_d * out_h * out_w * C + c; const T* offset_dy = dy + n * out_d * out_h * out_w * C + c;
T val = T(0); AccT val = AccT(0);
for (int d_out = out_dstart; d_out < out_dend; ++d_out) { for (int d_out = out_dstart; d_out < out_dend; ++d_out) {
const int dstart = d_out * stride_d - pad_d; const int dstart = d_out * stride_d - pad_d;
const int dend = min(dstart + kernel_d, D + pad_d); const int dend = min(dstart + kernel_d, D + pad_d);
...@@ -404,129 +407,144 @@ __global__ void _AvgPool3dGradNHWC( ...@@ -404,129 +407,144 @@ __global__ void _AvgPool3dGradNHWC(
for (int w_out = out_wstart; w_out < out_wend; ++w_out) { for (int w_out = out_wstart; w_out < out_wend; ++w_out) {
const int wstart = w_out * stride_w - pad_w; const int wstart = w_out * stride_w - pad_w;
const int wend = min(wstart + kernel_w, W + pad_w); const int wend = min(wstart + kernel_w, W + pad_w);
const T area = (dend - dstart) * (hend - hstart) * (wend - wstart); const AccT area = (dend - dstart) * (hend - hstart) * (wend - wstart);
val += val += LDG(offset_dy, ((d_out * out_h + h_out) * out_w + w_out) * C) /
offset_dy[((d_out * out_h + h_out) * out_w + w_out) * C] / area; area;
} }
} }
} }
dx[xi] = val; dx[xi] = convert::To<T>(val);
} }
} }
#undef LDG
} // namespace } // namespace
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
#define DISPATCH_POOL_KERNEL(name, kBlocks, kThreads, ...) \ #define DISPATCH_POOL_KERNEL(name, T, AccT, kBlocks, kThreads, ...) \
if (data_format == "NCHW") { \ if (data_format == "NCHW") { \
name##NCHW<<<kBlocks, kThreads, 0, ctx->cuda_stream()>>>(__VA_ARGS__); \ name##NCHW<T, AccT> \
} else if (data_format == "NHWC") { \ <<<kBlocks, kThreads, 0, ctx->cuda_stream()>>>(__VA_ARGS__); \
name##NHWC<<<kBlocks, kThreads, 0, ctx->cuda_stream()>>>(__VA_ARGS__); \ } else if (data_format == "NHWC") { \
} else { \ name##NHWC<T, AccT> \
LOG(FATAL) << "Unknown DataFormat: " << data_format; \ <<<kBlocks, kThreads, 0, ctx->cuda_stream()>>>(__VA_ARGS__); \
} else { \
LOG(FATAL) << "Unknown DataFormat: " << data_format; \
} }
#define DEFINE_KERNEL_LAUNCHER(name, T, out_dim) \ #define DEFINE_KERNEL_LAUNCHER(name, T, out_dim) \
template <> \ template <> \
void name<T, CUDAContext>( \ void name<T, CUDAContext>( \
const int N, \ const int N, \
const int C, \ const int C, \
const int H, \ const int H, \
const int W, \ const int W, \
const int out_h, \ const int out_h, \
const int out_w, \ const int out_w, \
const int kernel_h, \ const int kernel_h, \
const int kernel_w, \ const int kernel_w, \
const int stride_h, \ const int stride_h, \
const int stride_w, \ const int stride_w, \
const int pad_h, \ const int pad_h, \
const int pad_w, \ const int pad_w, \
const string& data_format, \ const string& data_format, \
const T* x, \ const T* x, \
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
const int nthreads = N * C * out_dim; \ const int nthreads = N * C * out_dim; \
DISPATCH_POOL_KERNEL( \ DISPATCH_POOL_KERNEL( \
_##name, \ _##name, \
CUDA_BLOCKS(nthreads), \ math::ScalarType<T>::type, \
CUDA_THREADS, \ math::AccmulatorType<T>::type, \
nthreads, \ CUDA_BLOCKS(nthreads), \
C, \ CUDA_THREADS, \
H, \ nthreads, \
W, \ C, \
out_h, \ H, \
out_w, \ W, \
kernel_h, \ out_h, \
kernel_w, \ out_w, \
stride_h, \ kernel_h, \
stride_w, \ kernel_w, \
pad_h, \ stride_h, \
pad_w, \ stride_w, \
x, \ pad_h, \
y); \ pad_w, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
} }
DEFINE_KERNEL_LAUNCHER(AvgPool2d, float16, (out_h * out_w));
DEFINE_KERNEL_LAUNCHER(AvgPool2d, float, (out_h * out_w)); DEFINE_KERNEL_LAUNCHER(AvgPool2d, float, (out_h * out_w));
DEFINE_KERNEL_LAUNCHER(AvgPool2d, double, (out_h * out_w)); DEFINE_KERNEL_LAUNCHER(AvgPool2d, double, (out_h * out_w));
DEFINE_KERNEL_LAUNCHER(AvgPool2dGrad, float16, (H * W)); // AvgPool2dGrad
DEFINE_KERNEL_LAUNCHER(AvgPool2dGrad, float, (H * W)); // AvgPool2dGrad DEFINE_KERNEL_LAUNCHER(AvgPool2dGrad, float, (H * W)); // AvgPool2dGrad
DEFINE_KERNEL_LAUNCHER(AvgPool2dGrad, double, (H * W)); // AvgPool2dGrad DEFINE_KERNEL_LAUNCHER(AvgPool2dGrad, double, (H * W)); // AvgPool2dGrad
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
#define DEFINE_KERNEL_LAUNCHER(name, T, out_dim) \ #define DEFINE_KERNEL_LAUNCHER(name, T, out_dim) \
template <> \ template <> \
void name<T, CUDAContext>( \ void name<T, CUDAContext>( \
const int N, \ const int N, \
const int C, \ const int C, \
const int D, \ const int D, \
const int H, \ const int H, \
const int W, \ const int W, \
const int out_d, \ const int out_d, \
const int out_h, \ const int out_h, \
const int out_w, \ const int out_w, \
const int kernel_d, \ const int kernel_d, \
const int kernel_h, \ const int kernel_h, \
const int kernel_w, \ const int kernel_w, \
const int stride_d, \ const int stride_d, \
const int stride_h, \ const int stride_h, \
const int stride_w, \ const int stride_w, \
const int pad_d, \ const int pad_d, \
const int pad_h, \ const int pad_h, \
const int pad_w, \ const int pad_w, \
const string& data_format, \ const string& data_format, \
const T* x, \ const T* x, \
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
const int nthreads = N * C * out_dim; \ const int nthreads = N * C * out_dim; \
DISPATCH_POOL_KERNEL( \ DISPATCH_POOL_KERNEL( \
_##name, \ _##name, \
CUDA_BLOCKS(nthreads), \ math::ScalarType<T>::type, \
CUDA_THREADS, \ math::AccmulatorType<T>::type, \
nthreads, \ CUDA_BLOCKS(nthreads), \
C, \ CUDA_THREADS, \
D, \ nthreads, \
H, \ C, \
W, \ D, \
out_d, \ H, \
out_h, \ W, \
out_w, \ out_d, \
kernel_d, \ out_h, \
kernel_h, \ out_w, \
kernel_w, \ kernel_d, \
stride_d, \ kernel_h, \
stride_h, \ kernel_w, \
stride_w, \ stride_d, \
pad_d, \ stride_h, \
pad_h, \ stride_w, \
pad_w, \ pad_d, \
x, \ pad_h, \
y); \ pad_w, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
} }
DEFINE_KERNEL_LAUNCHER(AvgPool3d, float16, (out_d * out_h * out_w));
DEFINE_KERNEL_LAUNCHER(AvgPool3d, float, (out_d * out_h * out_w)); DEFINE_KERNEL_LAUNCHER(AvgPool3d, float, (out_d * out_h * out_w));
DEFINE_KERNEL_LAUNCHER(AvgPool3d, double, (out_d * out_h * out_w)); DEFINE_KERNEL_LAUNCHER(AvgPool3d, double, (out_d * out_h * out_w));
DEFINE_KERNEL_LAUNCHER(AvgPool3dGrad, float, (D * H * W)); // AvgPool3dGrad DEFINE_KERNEL_LAUNCHER(AvgPool3dGrad, float16,
DEFINE_KERNEL_LAUNCHER(AvgPool3dGrad, double, (D * H * W)); // AvgPool3dGrad (D * H * W)); // AvgPool3dGrad
DEFINE_KERNEL_LAUNCHER(AvgPool3dGrad, float,
(D * H * W)); // AvgPool3dGrad
DEFINE_KERNEL_LAUNCHER(AvgPool3dGrad, double,
(D * H * W)); // AvgPool3dGrad
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
#undef DISPATCH_POOL_KERNEL #undef DISPATCH_POOL_KERNEL
......
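The AvgPool kernels above switch from accumulating in T to accumulating in a separate AccT, so float16 inputs are summed in float32 and converted back only once per output. A minimal standalone sketch of that pattern (the kernel name and shape below are illustrative, not the library's kernels):

// Illustrative only: widen each element to AccT before summing,
// divide in AccT, then narrow back to T for the single store.
template <typename T, typename AccT>
__global__ void RowMean(const int N, const int K, const T* x, T* y) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= N) return;
  AccT val = AccT(0);
  for (int k = 0; k < K; ++k) {
    val += static_cast<AccT>(x[i * K + k]);  // accumulate in the wide type
  }
  y[i] = static_cast<T>(val / AccT(K));  // one narrowing conversion per output
}

With T = half and AccT = float this avoids repeated fp16 rounding inside the window sum, which is the point of threading AccT through the real kernels.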
...@@ -13,7 +13,7 @@ namespace { ...@@ -13,7 +13,7 @@ namespace {
template <typename T> template <typename T>
__global__ void _Im2Col2dNCHW( __global__ void _Im2Col2dNCHW(
const int nthreads, const int nthreads,
const int C, const int /* C */,
const int H, const int H,
const int W, const int W,
const int out_h, const int out_h,
...@@ -59,7 +59,7 @@ __global__ void _Im2Col2dNHWC( ...@@ -59,7 +59,7 @@ __global__ void _Im2Col2dNHWC(
const int C, const int C,
const int H, const int H,
const int W, const int W,
const int out_h, const int /* out_h */,
const int out_w, const int out_w,
const int kernel_h, const int kernel_h,
const int kernel_w, const int kernel_w,
...@@ -97,7 +97,7 @@ __global__ void _Im2Col2dNHWC( ...@@ -97,7 +97,7 @@ __global__ void _Im2Col2dNHWC(
template <typename T> template <typename T>
__global__ void _Col2Im2dNCHW( __global__ void _Col2Im2dNCHW(
const int nthreads, const int nthreads,
const int C, const int /* C */,
const int H, const int H,
const int W, const int W,
const int out_h, const int out_h,
...@@ -147,7 +147,7 @@ template <typename T> ...@@ -147,7 +147,7 @@ template <typename T>
__global__ void _Col2Im2dNHWC( __global__ void _Col2Im2dNHWC(
const int nthreads, const int nthreads,
const int C, const int C,
const int H, const int /* H */,
const int W, const int W,
const int out_h, const int out_h,
const int out_w, const int out_w,
......
...@@ -7,7 +7,7 @@ namespace kernels { ...@@ -7,7 +7,7 @@ namespace kernels {
namespace { namespace {
template <typename T> template <typename T, typename AccT>
void _MaxPool2dNCHW( void _MaxPool2dNCHW(
const int N, const int N,
const int C, const int C,
...@@ -29,8 +29,7 @@ void _MaxPool2dNCHW( ...@@ -29,8 +29,7 @@ void _MaxPool2dNCHW(
const auto NxCxHoxWo = N * C * out_h * out_w; const auto NxCxHoxWo = N * C * out_h * out_w;
std::array<int, 4> index = {0, 0, 0, 0}; std::array<int, 4> index = {0, 0, 0, 0};
std::array<int, 4> dims = {N, C, out_h, out_w}; std::array<int, 4> dims = {N, C, out_h, out_w};
T val; int hstart, hend, wstart, wend;
int hstart, hend, wstart, wend, xi, mask_val;
for (int i = 0; i < NxCxHoxWo; ++i) { for (int i = 0; i < NxCxHoxWo; ++i) {
hstart = index[2] * stride_h - pad_h; hstart = index[2] * stride_h - pad_h;
wstart = index[3] * stride_w - pad_w; wstart = index[3] * stride_w - pad_w;
...@@ -39,23 +38,24 @@ void _MaxPool2dNCHW( ...@@ -39,23 +38,24 @@ void _MaxPool2dNCHW(
hstart = std::max(hstart, 0); hstart = std::max(hstart, 0);
wstart = std::max(wstart, 0); wstart = std::max(wstart, 0);
const T* offset_x = x + index[0] * CxHxW + index[1] * HxW; const T* offset_x = x + index[0] * CxHxW + index[1] * HxW;
mask_val = -1; int mask_val = -1;
val = T(-FLT_MAX); AccT val = AccT(-FLT_MAX);
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
xi = h * W + w; const auto xi = h * W + w;
if (offset_x[xi] > val) { if (convert::To<AccT>(offset_x[xi]) > val) {
val = offset_x[mask_val = xi]; mask_val = xi;
val = convert::To<AccT>(offset_x[xi]);
} }
} }
} }
y[i] = val; y[i] = convert::To<T>(val);
mask[i] = mask_val; mask[i] = mask_val;
math::utils::IncreaseIndexInDims(4, dims.data(), index.data()); math::utils::IncreaseIndexInDims(4, dims.data(), index.data());
} }
} }
template <typename T> template <typename T, typename AccT>
void _MaxPool2dNHWC( void _MaxPool2dNHWC(
const int N, const int N,
const int C, const int C,
...@@ -76,8 +76,7 @@ void _MaxPool2dNHWC( ...@@ -76,8 +76,7 @@ void _MaxPool2dNHWC(
const auto NxHoxWoxC = N * C * out_h * out_w; const auto NxHoxWoxC = N * C * out_h * out_w;
std::array<int, 4> index = {0, 0, 0, 0}; std::array<int, 4> index = {0, 0, 0, 0};
std::array<int, 4> dims = {N, out_h, out_w, C}; std::array<int, 4> dims = {N, out_h, out_w, C};
T val; int hstart, hend, wstart, wend;
int hstart, hend, wstart, wend, xi, mask_val;
for (int i = 0; i < NxHoxWoxC; ++i) { for (int i = 0; i < NxHoxWoxC; ++i) {
hstart = index[1] * stride_h - pad_h; hstart = index[1] * stride_h - pad_h;
wstart = index[2] * stride_w - pad_w; wstart = index[2] * stride_w - pad_w;
...@@ -86,23 +85,24 @@ void _MaxPool2dNHWC( ...@@ -86,23 +85,24 @@ void _MaxPool2dNHWC(
hstart = std::max(hstart, 0); hstart = std::max(hstart, 0);
wstart = std::max(wstart, 0); wstart = std::max(wstart, 0);
const T* offset_x = x + index[0] * HxWxC; const T* offset_x = x + index[0] * HxWxC;
mask_val = -1; int mask_val = -1;
val = T(-FLT_MAX); AccT val = AccT(-FLT_MAX);
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
xi = (h * W + w) * C + index[3]; const auto xi = (h * W + w) * C + index[3];
if (offset_x[xi] > val) { if (convert::To<AccT>(offset_x[xi]) > val) {
val = offset_x[mask_val = xi]; mask_val = xi;
val = convert::To<AccT>(offset_x[xi]);
} }
} }
} }
y[i] = val; y[i] = convert::To<T>(val);
mask[i] = mask_val; mask[i] = mask_val;
math::utils::IncreaseIndexInDims(4, dims.data(), index.data()); math::utils::IncreaseIndexInDims(4, dims.data(), index.data());
} }
} }
template <typename T> template <typename T, typename AccT>
void _MaxPool2dGradNCHW( void _MaxPool2dGradNCHW(
const int N, const int N,
const int C, const int C,
...@@ -127,13 +127,15 @@ void _MaxPool2dGradNCHW( ...@@ -127,13 +127,15 @@ void _MaxPool2dGradNCHW(
memset(dx, 0, sizeof(T) * N * CxHxW); memset(dx, 0, sizeof(T) * N * CxHxW);
for (int i = 0; i < NxCxHoxWo; ++i) { for (int i = 0; i < NxCxHoxWo; ++i) {
if (mask[i] != -1) { if (mask[i] != -1) {
dx[index[0] * CxHxW + index[1] * HxW + mask[i]] += dy[i]; const auto xi = index[0] * CxHxW + index[1] * HxW + mask[i];
dx[xi] =
convert::To<T>(convert::To<AccT>(dx[xi]) + convert::To<AccT>(dy[i]));
} }
math::utils::IncreaseIndexInDims(3, dims.data(), index.data()); math::utils::IncreaseIndexInDims(3, dims.data(), index.data());
} }
} }
template <typename T> template <typename T, typename AccT>
void _MaxPool2dGradNHWC( void _MaxPool2dGradNHWC(
const int N, const int N,
const int C, const int C,
...@@ -157,13 +159,15 @@ void _MaxPool2dGradNHWC( ...@@ -157,13 +159,15 @@ void _MaxPool2dGradNHWC(
memset(dx, 0, sizeof(T) * N * HxWxC); memset(dx, 0, sizeof(T) * N * HxWxC);
for (int i = 0; i < NxHoxWoxC; ++i) { for (int i = 0; i < NxHoxWoxC; ++i) {
if (mask[i] != -1) { if (mask[i] != -1) {
dx[index[0] * HxWxC + mask[i]] += dy[i]; const auto xi = index[0] * HxWxC + mask[i];
dx[xi] =
convert::To<T>(convert::To<AccT>(dx[xi]) + convert::To<AccT>(dy[i]));
} }
math::utils::IncreaseIndexInDims(2, dims.data(), index.data()); math::utils::IncreaseIndexInDims(2, dims.data(), index.data());
} }
} }
template <typename T> template <typename T, typename AccT>
void _MaxPool3dNCHW( void _MaxPool3dNCHW(
const int N, const int N,
const int C, const int C,
...@@ -190,8 +194,7 @@ void _MaxPool3dNCHW( ...@@ -190,8 +194,7 @@ void _MaxPool3dNCHW(
const auto NxCxDoxHoxWo = N * C * out_d * out_h * out_w; const auto NxCxDoxHoxWo = N * C * out_d * out_h * out_w;
std::array<int, 5> index = {0, 0, 0, 0, 0}; std::array<int, 5> index = {0, 0, 0, 0, 0};
std::array<int, 5> dims = {N, C, out_d, out_h, out_w}; std::array<int, 5> dims = {N, C, out_d, out_h, out_w};
T val; int dstart, dend, hstart, hend, wstart, wend;
int dstart, dend, hstart, hend, wstart, wend, xi, mask_val;
for (int i = 0; i < NxCxDoxHoxWo; ++i) { for (int i = 0; i < NxCxDoxHoxWo; ++i) {
dstart = index[2] * stride_d - pad_d; dstart = index[2] * stride_d - pad_d;
hstart = index[3] * stride_h - pad_h; hstart = index[3] * stride_h - pad_h;
...@@ -203,25 +206,26 @@ void _MaxPool3dNCHW( ...@@ -203,25 +206,26 @@ void _MaxPool3dNCHW(
hstart = std::max(hstart, 0); hstart = std::max(hstart, 0);
wstart = std::max(wstart, 0); wstart = std::max(wstart, 0);
const T* offset_x = x + index[0] * CxDxHxW + index[1] * DxHxW; const T* offset_x = x + index[0] * CxDxHxW + index[1] * DxHxW;
mask_val = -1; int mask_val = -1;
val = T(-FLT_MAX); AccT val = AccT(-FLT_MAX);
for (int d = dstart; d < dend; ++d) { for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
xi = (d * H + h) * W + w; const auto xi = (d * H + h) * W + w;
if (offset_x[xi] > val) { if (convert::To<AccT>(offset_x[xi]) > val) {
val = offset_x[mask_val = xi]; mask_val = xi;
val = convert::To<AccT>(offset_x[xi]);
} }
} }
} }
} }
y[i] = val; y[i] = convert::To<T>(val);
mask[i] = mask_val; mask[i] = mask_val;
math::utils::IncreaseIndexInDims(5, dims.data(), index.data()); math::utils::IncreaseIndexInDims(5, dims.data(), index.data());
} }
} }
template <typename T> template <typename T, typename AccT>
void _MaxPool3dNHWC( void _MaxPool3dNHWC(
const int N, const int N,
const int C, const int C,
...@@ -247,8 +251,7 @@ void _MaxPool3dNHWC( ...@@ -247,8 +251,7 @@ void _MaxPool3dNHWC(
const auto NxDoxHoxWoxC = N * C * out_d * out_h * out_w; const auto NxDoxHoxWoxC = N * C * out_d * out_h * out_w;
std::array<int, 5> index = {0, 0, 0, 0, 0}; std::array<int, 5> index = {0, 0, 0, 0, 0};
std::array<int, 5> dims = {N, out_d, out_h, out_w, C}; std::array<int, 5> dims = {N, out_d, out_h, out_w, C};
T val; int dstart, dend, hstart, hend, wstart, wend;
int dstart, dend, hstart, hend, wstart, wend, xi, mask_val;
for (int i = 0; i < NxDoxHoxWoxC; ++i) { for (int i = 0; i < NxDoxHoxWoxC; ++i) {
dstart = index[1] * stride_d - pad_d; dstart = index[1] * stride_d - pad_d;
hstart = index[2] * stride_h - pad_h; hstart = index[2] * stride_h - pad_h;
...@@ -260,25 +263,26 @@ void _MaxPool3dNHWC( ...@@ -260,25 +263,26 @@ void _MaxPool3dNHWC(
hstart = std::max(hstart, 0); hstart = std::max(hstart, 0);
wstart = std::max(wstart, 0); wstart = std::max(wstart, 0);
const T* offset_x = x + index[0] * DxHxWxC; const T* offset_x = x + index[0] * DxHxWxC;
mask_val = -1; int mask_val = -1;
val = T(-FLT_MAX); AccT val = AccT(-FLT_MAX);
for (int d = dstart; d < dend; ++d) { for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
xi = ((d * H + h) * W + w) * C + index[4]; const auto xi = ((d * H + h) * W + w) * C + index[4];
if (offset_x[xi] > val) { if (convert::To<AccT>(offset_x[xi]) > val) {
val = offset_x[mask_val = xi]; mask_val = xi;
val = convert::To<AccT>(offset_x[xi]);
} }
} }
} }
} }
y[i] = val; y[i] = convert::To<T>(val);
mask[i] = mask_val; mask[i] = mask_val;
math::utils::IncreaseIndexInDims(5, dims.data(), index.data()); math::utils::IncreaseIndexInDims(5, dims.data(), index.data());
} }
} }
template <typename T> template <typename T, typename AccT>
void _MaxPool3dGradNCHW( void _MaxPool3dGradNCHW(
const int N, const int N,
const int C, const int C,
...@@ -308,13 +312,15 @@ void _MaxPool3dGradNCHW( ...@@ -308,13 +312,15 @@ void _MaxPool3dGradNCHW(
memset(dx, 0, sizeof(T) * N * CxDxHxW); memset(dx, 0, sizeof(T) * N * CxDxHxW);
for (int i = 0; i < NxCxDoxHoxWo; ++i) { for (int i = 0; i < NxCxDoxHoxWo; ++i) {
if (mask[i] != -1) { if (mask[i] != -1) {
dx[index[0] * CxDxHxW + index[1] * DxHxW + mask[i]] += dy[i]; const auto xi = index[0] * CxDxHxW + index[1] * DxHxW + mask[i];
dx[xi] =
convert::To<T>(convert::To<AccT>(dx[xi]) + convert::To<AccT>(dy[i]));
} }
math::utils::IncreaseIndexInDims(3, dims.data(), index.data()); math::utils::IncreaseIndexInDims(3, dims.data(), index.data());
} }
} }
template <typename T> template <typename T, typename AccT>
void _MaxPool3dGradNHWC( void _MaxPool3dGradNHWC(
const int N, const int N,
const int C, const int C,
...@@ -343,7 +349,9 @@ void _MaxPool3dGradNHWC( ...@@ -343,7 +349,9 @@ void _MaxPool3dGradNHWC(
memset(dx, 0, sizeof(T) * N * DxHxWxC); memset(dx, 0, sizeof(T) * N * DxHxWxC);
for (int i = 0; i < NxDoxHoxWoxC; ++i) { for (int i = 0; i < NxDoxHoxWoxC; ++i) {
if (mask[i] != -1) { if (mask[i] != -1) {
dx[index[0] * DxHxWxC + mask[i]] += dy[i]; const auto xi = index[0] * DxHxWxC + mask[i];
dx[xi] =
convert::To<T>(convert::To<AccT>(dx[xi]) + convert::To<AccT>(dy[i]));
} }
math::utils::IncreaseIndexInDims(2, dims.data(), index.data()); math::utils::IncreaseIndexInDims(2, dims.data(), index.data());
} }
...@@ -353,11 +361,11 @@ void _MaxPool3dGradNHWC( ...@@ -353,11 +361,11 @@ void _MaxPool3dGradNHWC(
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
#define DISPATCH_POOL_KERNEL(name, ...) \ #define DISPATCH_POOL_KERNEL(name, T, AccT, ...) \
if (data_format == "NCHW") { \ if (data_format == "NCHW") { \
name##NCHW(__VA_ARGS__); \ name##NCHW<T, AccT>(__VA_ARGS__); \
} else if (data_format == "NHWC") { \ } else if (data_format == "NHWC") { \
name##NHWC(__VA_ARGS__); \ name##NHWC<T, AccT>(__VA_ARGS__); \
} else { \ } else { \
LOG(FATAL) << "Unknown DataFormat: " << data_format; \ LOG(FATAL) << "Unknown DataFormat: " << data_format; \
} }
...@@ -384,6 +392,8 @@ void _MaxPool3dGradNHWC( ...@@ -384,6 +392,8 @@ void _MaxPool3dGradNHWC(
CPUContext* ctx) { \ CPUContext* ctx) { \
DISPATCH_POOL_KERNEL( \ DISPATCH_POOL_KERNEL( \
_##name, \ _##name, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
N, \ N, \
C, \ C, \
H, \ H, \
...@@ -401,8 +411,10 @@ void _MaxPool3dGradNHWC( ...@@ -401,8 +411,10 @@ void _MaxPool3dGradNHWC(
y); \ y); \
} }
DEFINE_KERNEL_LAUNCHER(MaxPool2d, float16);
DEFINE_KERNEL_LAUNCHER(MaxPool2d, float); DEFINE_KERNEL_LAUNCHER(MaxPool2d, float);
DEFINE_KERNEL_LAUNCHER(MaxPool2d, double); DEFINE_KERNEL_LAUNCHER(MaxPool2d, double);
DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, float16); // MaxPool2dGrad
DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, float); // MaxPool2dGrad DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, float); // MaxPool2dGrad
DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, double); // MaxPool2dGrad DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, double); // MaxPool2dGrad
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
...@@ -434,6 +446,8 @@ DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, double); // MaxPool2dGrad ...@@ -434,6 +446,8 @@ DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, double); // MaxPool2dGrad
CPUContext* ctx) { \ CPUContext* ctx) { \
DISPATCH_POOL_KERNEL( \ DISPATCH_POOL_KERNEL( \
_##name, \ _##name, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
N, \ N, \
C, \ C, \
D, \ D, \
...@@ -456,8 +470,10 @@ DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, double); // MaxPool2dGrad ...@@ -456,8 +470,10 @@ DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, double); // MaxPool2dGrad
y); \ y); \
} }
DEFINE_KERNEL_LAUNCHER(MaxPool3d, float16);
DEFINE_KERNEL_LAUNCHER(MaxPool3d, float); DEFINE_KERNEL_LAUNCHER(MaxPool3d, float);
DEFINE_KERNEL_LAUNCHER(MaxPool3d, double); DEFINE_KERNEL_LAUNCHER(MaxPool3d, double);
DEFINE_KERNEL_LAUNCHER(MaxPool3dGrad, float16); // MaxPool3dGrad
DEFINE_KERNEL_LAUNCHER(MaxPool3dGrad, float); // MaxPool3dGrad DEFINE_KERNEL_LAUNCHER(MaxPool3dGrad, float); // MaxPool3dGrad
DEFINE_KERNEL_LAUNCHER(MaxPool3dGrad, double); // MaxPool3dGrad DEFINE_KERNEL_LAUNCHER(MaxPool3dGrad, double); // MaxPool3dGrad
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
......
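The MaxPool gradient paths above scatter each dy value back to the argmax offset recorded in mask during the forward pass. A compact float-only sketch of that scatter (illustrative; the real code adds the per-(n, c) base offset and routes the addition through AccT for float16):

#include <cstring>

// mask[i] holds the flat input offset that produced output i, or -1 when the
// pooling window was empty; gradients simply accumulate at those offsets.
void MaxPoolBackward(
    const int num_outputs,
    const int num_inputs,
    const float* dy,
    const int* mask,
    float* dx) {
  std::memset(dx, 0, sizeof(float) * num_inputs);
  for (int i = 0; i < num_outputs; ++i) {
    if (mask[i] != -1) dx[mask[i]] += dy[i];
  }
}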
#ifdef USE_CUDA #ifdef USE_CUDA
#include "dragon/core/context_cuda.h" #include "dragon/core/context_cuda.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -9,7 +10,9 @@ namespace kernels { ...@@ -9,7 +10,9 @@ namespace kernels {
namespace { namespace {
template <typename T> #define LDG(x, i) convert::To<AccT>(__ldg(x + i))
template <typename T, typename AccT>
__global__ void _MaxPool2dNCHW( __global__ void _MaxPool2dNCHW(
const int nthreads, const int nthreads,
const int C, const int C,
...@@ -41,20 +44,21 @@ __global__ void _MaxPool2dNCHW( ...@@ -41,20 +44,21 @@ __global__ void _MaxPool2dNCHW(
const T* offset_x = x + (n * C + c) * H * W; const T* offset_x = x + (n * C + c) * H * W;
int mask_val = -1; int mask_val = -1;
T val = T(-FLT_MAX); AccT val = AccT(-FLT_MAX);
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
if (offset_x[h * W + w] > val) { if (LDG(offset_x, h * W + w) > val) {
val = offset_x[mask_val = h * W + w]; mask_val = h * W + w;
val = LDG(offset_x, mask_val);
} }
} }
} }
y[yi] = val; y[yi] = convert::To<T>(val);
mask[yi] = mask_val; mask[yi] = mask_val;
} }
} }
template <typename T> template <typename T, typename AccT>
__global__ void _MaxPool2dNHWC( __global__ void _MaxPool2dNHWC(
const int nthreads, const int nthreads,
const int C, const int C,
...@@ -86,21 +90,22 @@ __global__ void _MaxPool2dNHWC( ...@@ -86,21 +90,22 @@ __global__ void _MaxPool2dNHWC(
const int x_offset = n * H * W * C + c; const int x_offset = n * H * W * C + c;
int mask_val = -1; int mask_val = -1;
T val = T(-FLT_MAX); AccT val = AccT(-FLT_MAX);
T val = T(-FLT_MAX); AccT val = AccT(-FLT_MAX);
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
const int xi = x_offset + (h * W + w) * C; const int xi = x_offset + (h * W + w) * C;
if (x[xi] > val) { if (LDG(x, xi) > val) {
val = x[mask_val = xi]; mask_val = xi;
val = LDG(x, xi);
} }
} }
} }
y[yi] = val; y[yi] = convert::To<T>(val);
mask[yi] = mask_val; mask[yi] = mask_val;
} }
} }
template <typename T> template <typename T, typename AccT>
__global__ void _MaxPool2dGradNCHW( __global__ void _MaxPool2dGradNCHW(
const int nthreads, const int nthreads,
const int C, const int C,
...@@ -131,20 +136,20 @@ __global__ void _MaxPool2dGradNCHW( ...@@ -131,20 +136,20 @@ __global__ void _MaxPool2dGradNCHW(
const int out_wend = min((w + pad_w) / stride_w + 1, out_w); const int out_wend = min((w + pad_w) / stride_w + 1, out_w);
const int y_offset = (n * C + c) * out_h * out_w; const int y_offset = (n * C + c) * out_h * out_w;
T val = T(0); AccT val = AccT(0);
for (int h_out = out_hstart; h_out < out_hend; ++h_out) { for (int h_out = out_hstart; h_out < out_hend; ++h_out) {
for (int w_out = out_wstart; w_out < out_wend; ++w_out) { for (int w_out = out_wstart; w_out < out_wend; ++w_out) {
const int yi = y_offset + h_out * out_w + w_out; const int yi = y_offset + h_out * out_w + w_out;
if (mask[yi] == (h * W + w)) { if (mask[yi] == (h * W + w)) {
val += dy[yi]; val += LDG(dy, yi);
} }
} }
} }
dx[xi] = val; dx[xi] = convert::To<T>(val);
} }
} }
template <typename T> template <typename T, typename AccT>
__global__ void _MaxPool2dGradNHWC( __global__ void _MaxPool2dGradNHWC(
const int nthreads, const int nthreads,
const int C, const int C,
...@@ -175,20 +180,20 @@ __global__ void _MaxPool2dGradNHWC( ...@@ -175,20 +180,20 @@ __global__ void _MaxPool2dGradNHWC(
const int out_wend = min((w + pad_w) / stride_w + 1, out_w); const int out_wend = min((w + pad_w) / stride_w + 1, out_w);
const int y_offset = n * out_h * out_w * C + c; const int y_offset = n * out_h * out_w * C + c;
T val = T(0); AccT val = AccT(0);
for (int h_out = out_hstart; h_out < out_hend; ++h_out) { for (int h_out = out_hstart; h_out < out_hend; ++h_out) {
for (int w_out = out_wstart; w_out < out_wend; ++w_out) { for (int w_out = out_wstart; w_out < out_wend; ++w_out) {
const int yi = y_offset + (h_out * out_w + w_out) * C; const int yi = y_offset + (h_out * out_w + w_out) * C;
if (mask[yi] == xi) { if (mask[yi] == xi) {
val += dy[yi]; val += LDG(dy, yi);
} }
} }
} }
dx[xi] = val; dx[xi] = convert::To<T>(val);
} }
} }
template <typename T> template <typename T, typename AccT>
__global__ void _MaxPool3dNCHW( __global__ void _MaxPool3dNCHW(
const int nthreads, const int nthreads,
const int C, const int C,
...@@ -232,23 +237,24 @@ __global__ void _MaxPool3dNCHW( ...@@ -232,23 +237,24 @@ __global__ void _MaxPool3dNCHW(
const T* offset_x = x + (n * C + c) * D * H * W; const T* offset_x = x + (n * C + c) * D * H * W;
int mask_val = -1; int mask_val = -1;
T val = T(-FLT_MAX); AccT val = AccT(-FLT_MAX);
for (int d = dstart; d < dend; ++d) { for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
tmp = (d * H + h) * W + w; tmp = (d * H + h) * W + w;
if (offset_x[tmp] > val) { if (LDG(offset_x, tmp) > val) {
val = offset_x[mask_val = tmp]; mask_val = tmp;
val = LDG(offset_x, mask_val);
} }
} }
} }
} }
y[yi] = val; y[yi] = convert::To<T>(val);
mask[yi] = mask_val; mask[yi] = mask_val;
} }
} }
template <typename T> template <typename T, typename AccT>
__global__ void _MaxPool3dNHWC( __global__ void _MaxPool3dNHWC(
const int nthreads, const int nthreads,
const int C, const int C,
...@@ -292,23 +298,24 @@ __global__ void _MaxPool3dNHWC( ...@@ -292,23 +298,24 @@ __global__ void _MaxPool3dNHWC(
const int x_offset = n * D * H * W * C + c; const int x_offset = n * D * H * W * C + c;
int mask_val = -1; int mask_val = -1;
T val = T(-FLT_MAX); AccT val = AccT(-FLT_MAX);
for (int d = dstart; d < dend; ++d) { for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
tmp = x_offset + ((d * H + h) * W + w) * C; tmp = x_offset + ((d * H + h) * W + w) * C;
if (x[tmp] > val) { if (LDG(x, tmp) > val) {
val = x[mask_val = tmp]; mask_val = tmp;
val = LDG(x, tmp);
} }
} }
} }
} }
y[yi] = val; y[yi] = convert::To<T>(val);
mask[yi] = mask_val; mask[yi] = mask_val;
} }
} }
template <typename T> template <typename T, typename AccT>
__global__ void _MaxPool3dGradNCHW( __global__ void _MaxPool3dGradNCHW(
const int nthreads, const int nthreads,
const int C, const int C,
...@@ -351,22 +358,22 @@ __global__ void _MaxPool3dGradNCHW( ...@@ -351,22 +358,22 @@ __global__ void _MaxPool3dGradNCHW(
const int out_wend = min((w + pad_w) / stride_w + 1, out_w); const int out_wend = min((w + pad_w) / stride_w + 1, out_w);
const int y_offset = (n * C + c) * out_d * out_h * out_w; const int y_offset = (n * C + c) * out_d * out_h * out_w;
T val = T(0); AccT val = AccT(0);
for (int d_out = out_dstart; d_out < out_dend; ++d_out) { for (int d_out = out_dstart; d_out < out_dend; ++d_out) {
for (int h_out = out_hstart; h_out < out_hend; ++h_out) { for (int h_out = out_hstart; h_out < out_hend; ++h_out) {
for (int w_out = out_wstart; w_out < out_wend; ++w_out) { for (int w_out = out_wstart; w_out < out_wend; ++w_out) {
tmp = y_offset + (d_out * out_h + h_out) * out_w + w_out; tmp = y_offset + (d_out * out_h + h_out) * out_w + w_out;
if (mask[tmp] == ((d * H + h) * W + w)) { if (mask[tmp] == ((d * H + h) * W + w)) {
val += dy[tmp]; val += LDG(dy, tmp);
} }
} }
} }
} }
dx[xi] = val; dx[xi] = convert::To<T>(val);
} }
} }
template <typename T> template <typename T, typename AccT>
__global__ void _MaxPool3dGradNHWC( __global__ void _MaxPool3dGradNHWC(
const int nthreads, const int nthreads,
const int C, const int C,
...@@ -409,136 +416,148 @@ __global__ void _MaxPool3dGradNHWC( ...@@ -409,136 +416,148 @@ __global__ void _MaxPool3dGradNHWC(
const int out_wend = min((w + pad_w) / stride_w + 1, out_w); const int out_wend = min((w + pad_w) / stride_w + 1, out_w);
const int y_offset = n * out_d * out_h * out_w * C + c; const int y_offset = n * out_d * out_h * out_w * C + c;
T val = T(0); AccT val = AccT(0);
for (int d_out = out_dstart; d_out < out_dend; ++d_out) { for (int d_out = out_dstart; d_out < out_dend; ++d_out) {
for (int h_out = out_hstart; h_out < out_hend; ++h_out) { for (int h_out = out_hstart; h_out < out_hend; ++h_out) {
for (int w_out = out_wstart; w_out < out_wend; ++w_out) { for (int w_out = out_wstart; w_out < out_wend; ++w_out) {
tmp = y_offset + ((d_out * out_h + h_out) * out_w + w_out) * C; tmp = y_offset + ((d_out * out_h + h_out) * out_w + w_out) * C;
if (mask[tmp] == xi) { if (mask[tmp] == xi) {
val += dy[tmp]; val += LDG(dy, tmp);
} }
} }
} }
} }
dx[xi] = val; dx[xi] = convert::To<T>(val);
} }
} }
#undef LDG
} // namespace } // namespace
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
#define DISPATCH_POOL_KERNEL(name, kBlocks, kThreads, ...) \ #define DISPATCH_POOL_KERNEL(name, T, AccT, kBlocks, kThreads, ...) \
if (data_format == "NCHW") { \ if (data_format == "NCHW") { \
name##NCHW<<<kBlocks, kThreads, 0, ctx->cuda_stream()>>>(__VA_ARGS__); \ name##NCHW<T, AccT> \
} else if (data_format == "NHWC") { \ <<<kBlocks, kThreads, 0, ctx->cuda_stream()>>>(__VA_ARGS__); \
name##NHWC<<<kBlocks, kThreads, 0, ctx->cuda_stream()>>>(__VA_ARGS__); \ } else if (data_format == "NHWC") { \
} else { \ name##NHWC<T, AccT> \
LOG(FATAL) << "Unknown DataFormat: " << data_format; \ <<<kBlocks, kThreads, 0, ctx->cuda_stream()>>>(__VA_ARGS__); \
} else { \
LOG(FATAL) << "Unknown DataFormat: " << data_format; \
} }
#define DEFINE_KERNEL_LAUNCHER(name, T, out_dim) \ #define DEFINE_KERNEL_LAUNCHER(name, T, out_dim) \
template <> \ template <> \
void name<T, CUDAContext>( \ void name<T, CUDAContext>( \
const int N, \ const int N, \
const int C, \ const int C, \
const int H, \ const int H, \
const int W, \ const int W, \
const int out_h, \ const int out_h, \
const int out_w, \ const int out_w, \
const int kernel_h, \ const int kernel_h, \
const int kernel_w, \ const int kernel_w, \
const int stride_h, \ const int stride_h, \
const int stride_w, \ const int stride_w, \
const int pad_h, \ const int pad_h, \
const int pad_w, \ const int pad_w, \
const string& data_format, \ const string& data_format, \
const T* x, \ const T* x, \
int* mask, \ int* mask, \
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
const int nthreads = N * C * out_dim; \ const int nthreads = N * C * out_dim; \
DISPATCH_POOL_KERNEL( \ DISPATCH_POOL_KERNEL( \
_##name, \ _##name, \
CUDA_BLOCKS(nthreads), \ math::ScalarType<T>::type, \
CUDA_THREADS, \ math::AccmulatorType<T>::type, \
nthreads, \ CUDA_BLOCKS(nthreads), \
C, \ CUDA_THREADS, \
H, \ nthreads, \
W, \ C, \
out_h, \ H, \
out_w, \ W, \
kernel_h, \ out_h, \
kernel_w, \ out_w, \
stride_h, \ kernel_h, \
stride_w, \ kernel_w, \
pad_h, \ stride_h, \
pad_w, \ stride_w, \
x, \ pad_h, \
mask, \ pad_w, \
y); \ reinterpret_cast<const math::ScalarType<T>::type*>(x), \
mask, \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
} }
DEFINE_KERNEL_LAUNCHER(MaxPool2d, float16, (out_h * out_w));
DEFINE_KERNEL_LAUNCHER(MaxPool2d, float, (out_h * out_w)); DEFINE_KERNEL_LAUNCHER(MaxPool2d, float, (out_h * out_w));
DEFINE_KERNEL_LAUNCHER(MaxPool2d, double, (out_h * out_w)); DEFINE_KERNEL_LAUNCHER(MaxPool2d, double, (out_h * out_w));
DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, float16, (H * W)); // MaxPool2dGrad
DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, float, (H * W)); // MaxPool2dGrad DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, float, (H * W)); // MaxPool2dGrad
DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, double, (H * W)); // MaxPool2dGrad DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, double, (H * W)); // MaxPool2dGrad
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
#define DEFINE_KERNEL_LAUNCHER(name, T, out_dim) \ #define DEFINE_KERNEL_LAUNCHER(name, T, out_dim) \
template <> \ template <> \
void name<T, CUDAContext>( \ void name<T, CUDAContext>( \
const int N, \ const int N, \
const int C, \ const int C, \
const int D, \ const int D, \
const int H, \ const int H, \
const int W, \ const int W, \
const int out_d, \ const int out_d, \
const int out_h, \ const int out_h, \
const int out_w, \ const int out_w, \
const int kernel_d, \ const int kernel_d, \
const int kernel_h, \ const int kernel_h, \
const int kernel_w, \ const int kernel_w, \
const int stride_d, \ const int stride_d, \
const int stride_h, \ const int stride_h, \
const int stride_w, \ const int stride_w, \
const int pad_d, \ const int pad_d, \
const int pad_h, \ const int pad_h, \
const int pad_w, \ const int pad_w, \
const string& data_format, \ const string& data_format, \
const T* x, \ const T* x, \
int* mask, \ int* mask, \
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
const int nthreads = N * C * out_dim; \ const int nthreads = N * C * out_dim; \
DISPATCH_POOL_KERNEL( \ DISPATCH_POOL_KERNEL( \
_##name, \ _##name, \
CUDA_BLOCKS(nthreads), \ math::ScalarType<T>::type, \
CUDA_THREADS, \ math::AccmulatorType<T>::type, \
nthreads, \ CUDA_BLOCKS(nthreads), \
C, \ CUDA_THREADS, \
D, \ nthreads, \
H, \ C, \
W, \ D, \
out_d, \ H, \
out_h, \ W, \
out_w, \ out_d, \
kernel_d, \ out_h, \
kernel_h, \ out_w, \
kernel_w, \ kernel_d, \
stride_d, \ kernel_h, \
stride_h, \ kernel_w, \
stride_w, \ stride_d, \
pad_d, \ stride_h, \
pad_h, \ stride_w, \
pad_w, \ pad_d, \
x, \ pad_h, \
mask, \ pad_w, \
y); \ reinterpret_cast<const math::ScalarType<T>::type*>(x), \
mask, \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
} }
DEFINE_KERNEL_LAUNCHER(MaxPool3d, float16, (out_d * out_h * out_w));
DEFINE_KERNEL_LAUNCHER(MaxPool3d, float, (out_d * out_h * out_w)); DEFINE_KERNEL_LAUNCHER(MaxPool3d, float, (out_d * out_h * out_w));
DEFINE_KERNEL_LAUNCHER(MaxPool3d, double, (out_d * out_h * out_w)); DEFINE_KERNEL_LAUNCHER(MaxPool3d, double, (out_d * out_h * out_w));
DEFINE_KERNEL_LAUNCHER(MaxPool3dGrad, float16, (D * H * W)); // MaxPool3dGrad
DEFINE_KERNEL_LAUNCHER(MaxPool3dGrad, float, (D * H * W)); // MaxPool3dGrad DEFINE_KERNEL_LAUNCHER(MaxPool3dGrad, float, (D * H * W)); // MaxPool3dGrad
DEFINE_KERNEL_LAUNCHER(MaxPool3dGrad, double, (D * H * W)); // MaxPool3dGrad DEFINE_KERNEL_LAUNCHER(MaxPool3dGrad, double, (D * H * W)); // MaxPool3dGrad
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
......
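The LDG(x, i) helper introduced above wraps __ldg so pooled reads go through the GPU's read-only data cache and are widened to AccT in one step. A small sketch of the __ldg part for plain float (assuming compute capability 3.5+, where __ldg is available; names are illustrative):

#include <cfloat>

#define LDG(x, i) __ldg((x) + (i))  // read-only cache load

__global__ void RowMax(const int N, const int K, const float* x, float* y) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= N) return;
  float best = -FLT_MAX;
  for (int k = 0; k < K; ++k) {
    best = fmaxf(best, LDG(x, i * K + k));
  }
  y[i] = best;
}

#undef LDG

In the real kernels the macro also converts the loaded value to AccT, which is why it is defined next to the float16-aware kernels and undefined at the end of the namespace.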
...@@ -85,7 +85,7 @@ __global__ void _RoiPoolGrad( ...@@ -85,7 +85,7 @@ __global__ void _RoiPoolGrad(
const int W, const int W,
const int out_h, const int out_h,
const int out_w, const int out_w,
const float spatial_scale, const float /* spatial_scale */,
const T* dy, const T* dy,
const float* rois, const float* rois,
const int* mask, const int* mask,
......
...@@ -11,14 +11,13 @@ void TransposeOp<Context>::DoRunWithType() { ...@@ -11,14 +11,13 @@ void TransposeOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0, {0}); auto &X = Input(0), *Y = Output(0, {0});
int num_axes, num_dims = X.ndim(); int num_axes, num_dims = X.ndim();
vec64_t X_strides(num_dims), Y_dims(num_dims);
perm(0, &num_axes); perm(0, &num_axes);
CHECK(num_axes == 0 || num_axes == num_dims) CHECK(num_axes == 0 || num_axes == num_dims)
<< "\nProviding " << num_axes << " dimensions to permute, " << "\nProviding " << num_axes << " dimensions to permute, "
<< "while Tensor(" << X.name() << ")'s dims are " << X.DimString(); << "while Tensor(" << X.name() << ")'s dims are " << X.DimString();
vec64_t new_axes(num_dims); vec64_t new_axes(num_dims), new_dims(num_dims);
for (int i = 0; i < num_dims; ++i) { for (int i = 0; i < num_dims; ++i) {
new_axes[i] = num_axes > 0 ? perm(i) : num_dims - i - 1; new_axes[i] = num_axes > 0 ? perm(i) : num_dims - i - 1;
} }
...@@ -31,13 +30,27 @@ void TransposeOp<Context>::DoRunWithType() { ...@@ -31,13 +30,27 @@ void TransposeOp<Context>::DoRunWithType() {
} }
for (int i = 0; i < num_dims; ++i) { for (int i = 0; i < num_dims; ++i) {
X_strides[i] = X.stride(new_axes[i]); new_dims[i] = X.dim(new_axes[i]);
Y_dims[i] = X.dim(new_axes[i]); }
vec64_t transpose_dims, transpose_axes;
math::utils::CollapseTransposeAxes(
num_dims,
X.dims().data(),
new_axes.data(),
transpose_dims,
transpose_axes);
Tensor X_collapse(transpose_dims);
num_dims = X_collapse.ndim();
vec64_t X_strides(num_dims), Y_dims(num_dims);
for (int i = 0; i < num_dims; ++i) {
X_strides[i] = X_collapse.stride(transpose_axes[i]);
Y_dims[i] = X_collapse.dim(transpose_axes[i]);
} }
auto* scratch = ((void*)&X == (void*)Y) auto* scratch = ((void*)&X == (void*)Y)
? ctx()->workspace()->template data<T, Context>({X.count()})[0] ? ctx()->workspace()->template data<T, Context>({X.count()})[0]
: Y->Reshape(Y_dims)->template mutable_data<T, Context>(); : Y->Reshape(new_dims)->template mutable_data<T, Context>();
kernels::Transpose( kernels::Transpose(
num_dims, num_dims,
...@@ -51,7 +64,7 @@ void TransposeOp<Context>::DoRunWithType() { ...@@ -51,7 +64,7 @@ void TransposeOp<Context>::DoRunWithType() {
math::Copy( math::Copy(
X.count(), X.count(),
scratch, scratch,
Y->Reshape(Y_dims)->template mutable_data<T, Context>(), Y->Reshape(new_dims)->template mutable_data<T, Context>(),
ctx()); ctx());
} }
} }
......
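The Transpose operator above now collapses axes that stay adjacent after permutation, so the generic kernel iterates over fewer, larger dimensions. A rough standalone illustration of that collapsing step (this is not the library's CollapseTransposeAxes signature, just the idea):

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <utility>
#include <vector>

// Merge runs of axes that remain consecutive after the permutation.
// Example: dims={2,3,4,5}, axes={2,3,0,1} -> out_dims={6,20}, out_axes={1,0}.
void CollapseTranspose(
    const std::vector<int64_t>& dims,
    const std::vector<int64_t>& axes,
    std::vector<int64_t>* out_dims,
    std::vector<int64_t>* out_axes) {
  // 1. Split the permuted axis list into maximal runs of consecutive axes.
  std::vector<std::pair<int64_t, int64_t>> runs;  // {first original axis, size}
  for (size_t i = 0; i < axes.size(); ++i) {
    if (i > 0 && axes[i] == axes[i - 1] + 1) {
      runs.back().second *= dims[axes[i]];
    } else {
      runs.push_back({axes[i], dims[axes[i]]});
    }
  }
  // 2. Collapsed input dims follow the original axis order of each run.
  std::vector<size_t> order(runs.size());
  std::iota(order.begin(), order.end(), 0);
  std::sort(order.begin(), order.end(), [&](size_t a, size_t b) {
    return runs[a].first < runs[b].first;
  });
  out_dims->resize(runs.size());
  std::vector<int64_t> rank(runs.size());  // run index -> collapsed axis
  for (size_t i = 0; i < order.size(); ++i) {
    (*out_dims)[i] = runs[order[i]].second;
    rank[order[i]] = static_cast<int64_t>(i);
  }
  // 3. The permutation over collapsed axes keeps the runs' visiting order.
  out_axes->resize(runs.size());
  for (size_t i = 0; i < runs.size(); ++i) (*out_axes)[i] = rank[i];
}

A permutation such as NHWC-to-NCHW on a 4-d tensor collapses to a 3-d (or smaller) transpose, which keeps the dimension count inside the range the fixed-D kernel dispatch below can handle.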
...@@ -107,11 +107,6 @@ void PoolOp<Context>::DoRunWithType() { ...@@ -107,11 +107,6 @@ void PoolOp<Context>::DoRunWithType() {
} }
template <class Context> template <class Context>
void PoolOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::TypesBase<float, double>>::Call(this, Input(0));
}
template <class Context>
template <typename T> template <typename T>
void PoolGradientOp<Context>::DoRunWithType() { void PoolGradientOp<Context>::DoRunWithType() {
ComputeOutShape(); ComputeOutShape();
...@@ -212,11 +207,6 @@ void PoolGradientOp<Context>::DoRunWithType() { ...@@ -212,11 +207,6 @@ void PoolGradientOp<Context>::DoRunWithType() {
} }
} }
template <class Context>
void PoolGradientOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::TypesBase<float, double>>::Call(this, Input(0));
}
DEPLOY_CPU_OPERATOR(Pool); DEPLOY_CPU_OPERATOR(Pool);
#ifdef USE_CUDA #ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(Pool); DEPLOY_CUDA_OPERATOR(Pool);
......
...@@ -27,7 +27,9 @@ class PoolOp final : public PoolOpBase<Context> { ...@@ -27,7 +27,9 @@ class PoolOp final : public PoolOpBase<Context> {
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
USE_POOL_FUNCTIONS; USE_POOL_FUNCTIONS;
void RunOnDevice() override; void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T> template <typename T>
void DoRunWithType(); void DoRunWithType();
...@@ -43,7 +45,9 @@ class PoolGradientOp final : public PoolOpBase<Context> { ...@@ -43,7 +45,9 @@ class PoolGradientOp final : public PoolOpBase<Context> {
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
USE_POOL_FUNCTIONS; USE_POOL_FUNCTIONS;
void RunOnDevice() override; void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T> template <typename T>
void DoRunWithType(); void DoRunWithType();
...@@ -70,7 +74,9 @@ class CuDNNPoolOp final : public CuDNNPoolOpBase<Context> { ...@@ -70,7 +74,9 @@ class CuDNNPoolOp final : public CuDNNPoolOpBase<Context> {
CUDNN_CHECK(cudnnDestroyPoolingDescriptor(pool_desc_)); CUDNN_CHECK(cudnnDestroyPoolingDescriptor(pool_desc_));
} }
void RunOnDevice() override; void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T> template <typename T>
void DoRunWithType(); void DoRunWithType();
...@@ -99,7 +105,9 @@ class CuDNNPoolGradientOp final : public CuDNNPoolOpBase<Context> { ...@@ -99,7 +105,9 @@ class CuDNNPoolGradientOp final : public CuDNNPoolOpBase<Context> {
CUDNN_CHECK(cudnnDestroyPoolingDescriptor(pool_desc_)); CUDNN_CHECK(cudnnDestroyPoolingDescriptor(pool_desc_));
} }
void RunOnDevice() override; void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T> template <typename T>
void DoRunWithType(); void DoRunWithType();
......
#ifdef USE_CUDNN #ifdef USE_CUDNN
#include "dragon/core/workspace.h"
#include "dragon/operators/vision/pool_op.h" #include "dragon/operators/vision/pool_op.h"
#include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -10,6 +12,56 @@ void CuDNNPoolOp<Context>::DoRunWithType() { ...@@ -10,6 +12,56 @@ void CuDNNPoolOp<Context>::DoRunWithType() {
ComputeOutShape(); ComputeOutShape();
auto &X = Input(0), *Y = Output(0); auto &X = Input(0), *Y = Output(0);
// CuDNN NHWC pooling is slow.
// Temporarily fall back to the naive implementation.
if (data_format() == "NHWC" && mode_ == "AVG") {
if (num_axes_ == 1 || num_axes_ == 2) {
kernels::AvgPool2d(
in_dims_[0],
in_dims_[1],
in_dims_[2],
num_axes_ == 1 ? 1 : in_dims_[3],
out_dims_[2],
num_axes_ == 1 ? 1 : out_dims_[3],
kshape_[0],
num_axes_ == 1 ? 1 : kshape_[1],
strides_[0],
num_axes_ == 1 ? 1 : strides_[1],
pads_begin_[0],
num_axes_ == 1 ? 0 : pads_begin_[1],
data_format(),
X.template data<T, Context>(),
Y->Reshape(out_shape_)->template mutable_data<T, Context>(),
ctx());
} else if (num_axes_ == 3) {
kernels::AvgPool3d(
in_dims_[0],
in_dims_[1],
in_dims_[2],
in_dims_[3],
in_dims_[4],
out_dims_[2],
out_dims_[3],
out_dims_[4],
kshape_[0],
kshape_[1],
kshape_[2],
strides_[0],
strides_[1],
strides_[2],
pads_begin_[0],
pads_begin_[1],
pads_begin_[2],
data_format(),
X.template data<T, Context>(),
Y->Reshape(out_shape_)->template mutable_data<T, Context>(),
ctx());
} else {
LOG(FATAL) << "AvgPool" << num_axes_ << "d is not supported.";
}
return;
}
SetPoolDesc(); SetPoolDesc();
CuDNNSetTensorDesc<T>(&input_desc_, X.dims(), data_format()); CuDNNSetTensorDesc<T>(&input_desc_, X.dims(), data_format());
CuDNNSetTensorDesc<T>(&output_desc_, out_shape_, data_format()); CuDNNSetTensorDesc<T>(&output_desc_, out_shape_, data_format());
...@@ -26,11 +78,6 @@ void CuDNNPoolOp<Context>::DoRunWithType() { ...@@ -26,11 +78,6 @@ void CuDNNPoolOp<Context>::DoRunWithType() {
} }
template <class Context> template <class Context>
void CuDNNPoolOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <class Context>
template <typename T> template <typename T>
void CuDNNPoolGradientOp<Context>::DoRunWithType() { void CuDNNPoolGradientOp<Context>::DoRunWithType() {
ComputeOutShape(); ComputeOutShape();
...@@ -56,11 +103,6 @@ void CuDNNPoolGradientOp<Context>::DoRunWithType() { ...@@ -56,11 +103,6 @@ void CuDNNPoolGradientOp<Context>::DoRunWithType() {
dX->ReshapeLike(X)->template mutable_data<T, Context>())); dX->ReshapeLike(X)->template mutable_data<T, Context>()));
} }
template <class Context>
void CuDNNPoolGradientOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
DEPLOY_CUDNN_OPERATOR(Pool); DEPLOY_CUDNN_OPERATOR(Pool);
DEPLOY_CUDNN_OPERATOR(PoolGradient); DEPLOY_CUDNN_OPERATOR(PoolGradient);
......
...@@ -60,12 +60,19 @@ void SpaceToDepthOp<Context>::DoRunWithType() { ...@@ -60,12 +60,19 @@ void SpaceToDepthOp<Context>::DoRunWithType() {
CHECK_EQ(X_reshape.count(), X.count()) CHECK_EQ(X_reshape.count(), X.count())
<< "\nCould not rearrange " << X.DimString() << " to " << "\nCould not rearrange " << X.DimString() << " to "
<< X_reshape.DimString() << " with block size " << block_size_ << "."; << X_reshape.DimString() << " with block size " << block_size_ << ".";
vec64_t transpose_dims, transpose_axes;
vec64_t X_strides(in_dims.size()); math::utils::CollapseTransposeAxes(
vec64_t Y_dims(in_dims.size()); X_reshape.ndim(),
for (int i = 0; i < X_reshape.ndim(); i++) { X_reshape.dims().data(),
X_strides[i] = X_reshape.stride(perm[i]); perm.data(),
Y_dims[i] = X_reshape.dim(perm[i]); transpose_dims,
transpose_axes);
Tensor X_collapse(transpose_dims);
num_dims = X_collapse.ndim();
vec64_t X_strides(num_dims), Y_dims(num_dims);
for (int i = 0; i < num_dims; ++i) {
X_strides[i] = X_collapse.stride(transpose_axes[i]);
Y_dims[i] = X_collapse.dim(transpose_axes[i]);
} }
auto* scratch = ((void*)&X == (void*)Y) auto* scratch = ((void*)&X == (void*)Y)
...@@ -73,7 +80,7 @@ void SpaceToDepthOp<Context>::DoRunWithType() { ...@@ -73,7 +80,7 @@ void SpaceToDepthOp<Context>::DoRunWithType() {
: Y->Reshape(out_shape)->template mutable_data<T, Context>(); : Y->Reshape(out_shape)->template mutable_data<T, Context>();
kernels::Transpose( kernels::Transpose(
X_strides.size(), num_dims,
X_strides.data(), X_strides.data(),
Y_dims.data(), Y_dims.data(),
X.template data<T, Context>(), X.template data<T, Context>(),
...@@ -135,12 +142,19 @@ void DepthToSpaceOp<Context>::DoRunWithType() { ...@@ -135,12 +142,19 @@ void DepthToSpaceOp<Context>::DoRunWithType() {
CHECK_EQ(X_reshape.count(), X.count()) CHECK_EQ(X_reshape.count(), X.count())
<< "\nCould not rearrange " << X.DimString() << " to " << "\nCould not rearrange " << X.DimString() << " to "
<< X_reshape.DimString() << " with block size " << block_size_ << "."; << X_reshape.DimString() << " with block size " << block_size_ << ".";
vec64_t transpose_dims, transpose_axes;
vec64_t X_strides(in_dims.size()); math::utils::CollapseTransposeAxes(
vec64_t Y_dims(in_dims.size()); X_reshape.ndim(),
for (int i = 0; i < in_dims.size(); i++) { X_reshape.dims().data(),
X_strides[i] = X_reshape.stride(perm[i]); perm.data(),
Y_dims[i] = X_reshape.dim(perm[i]); transpose_dims,
transpose_axes);
Tensor X_collapse(transpose_dims);
num_dims = X_collapse.ndim();
vec64_t X_strides(num_dims), Y_dims(num_dims);
for (int i = 0; i < num_dims; ++i) {
X_strides[i] = X_collapse.stride(transpose_axes[i]);
Y_dims[i] = X_collapse.dim(transpose_axes[i]);
} }
auto* scratch = ((void*)&X == (void*)Y) auto* scratch = ((void*)&X == (void*)Y)
...@@ -148,7 +162,7 @@ void DepthToSpaceOp<Context>::DoRunWithType() { ...@@ -148,7 +162,7 @@ void DepthToSpaceOp<Context>::DoRunWithType() {
: Y->Reshape(out_shape)->template mutable_data<T, Context>(); : Y->Reshape(out_shape)->template mutable_data<T, Context>();
kernels::Transpose( kernels::Transpose(
X_strides.size(), num_dims,
X_strides.data(), X_strides.data(),
Y_dims.data(), Y_dims.data(),
X.template data<T, Context>(), X.template data<T, Context>(),
......
...@@ -158,6 +158,129 @@ class CUDADeviceGuard { ...@@ -158,6 +158,129 @@ class CUDADeviceGuard {
int prev_id_; int prev_id_;
}; };
#define DISPATCH_FUNC_BY_VALUE_WITH_TYPE_1(Func, T, val, ...) \
do { \
switch (val) { \
case 1: { \
Func<T, 1>(__VA_ARGS__); \
break; \
} \
case 2: { \
Func<T, 2>(__VA_ARGS__); \
break; \
} \
case 3: { \
Func<T, 3>(__VA_ARGS__); \
break; \
} \
case 4: { \
Func<T, 4>(__VA_ARGS__); \
break; \
} \
case 5: { \
Func<T, 5>(__VA_ARGS__); \
break; \
} \
case 6: { \
Func<T, 6>(__VA_ARGS__); \
break; \
} \
case 7: { \
Func<T, 7>(__VA_ARGS__); \
break; \
} \
case 8: { \
Func<T, 8>(__VA_ARGS__); \
break; \
} \
default: { \
break; \
} \
} \
} while (false)
#define DISPATCH_FUNC_BY_VALUE_WITH_TYPE_2(Func, T1, T2, val, ...) \
do { \
switch (val) { \
case 1: { \
Func<T1, T2, 1>(__VA_ARGS__); \
break; \
} \
case 2: { \
Func<T1, T2, 2>(__VA_ARGS__); \
break; \
} \
case 3: { \
Func<T1, T2, 3>(__VA_ARGS__); \
break; \
} \
case 4: { \
Func<T1, T2, 4>(__VA_ARGS__); \
break; \
} \
case 5: { \
Func<T1, T2, 5>(__VA_ARGS__); \
break; \
} \
case 6: { \
Func<T1, T2, 6>(__VA_ARGS__); \
break; \
} \
case 7: { \
Func<T1, T2, 7>(__VA_ARGS__); \
break; \
} \
case 8: { \
Func<T1, T2, 8>(__VA_ARGS__); \
break; \
} \
default: { \
break; \
} \
} \
} while (false)
#define DISPATCH_FUNC_BY_VALUE_WITH_TYPE_3(Func, T1, T2, T3, val, ...) \
do { \
switch (val) { \
case 1: { \
Func<T1, T2, T3, 1>(__VA_ARGS__); \
break; \
} \
case 2: { \
Func<T1, T2, T3, 2>(__VA_ARGS__); \
break; \
} \
case 3: { \
Func<T1, T2, T3, 3>(__VA_ARGS__); \
break; \
} \
case 4: { \
Func<T1, T2, T3, 4>(__VA_ARGS__); \
break; \
} \
case 5: { \
Func<T1, T2, T3, 5>(__VA_ARGS__); \
break; \
} \
case 6: { \
Func<T1, T2, T3, 6>(__VA_ARGS__); \
break; \
} \
case 7: { \
Func<T1, T2, T3, 7>(__VA_ARGS__); \
break; \
} \
case 8: { \
Func<T1, T2, T3, 8>(__VA_ARGS__); \
break; \
} \
default: { \
break; \
} \
} \
} while (false)
#else #else
#define CUDA_NOT_COMPILED LOG(FATAL) << "CUDA library is not compiled with." #define CUDA_NOT_COMPILED LOG(FATAL) << "CUDA library is not compiled with."
......
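These DISPATCH_FUNC_BY_VALUE_* macros are the mechanism behind the commit title: a runtime dimension count selects a template instantiation whose dimension parameter is a compile-time constant, so loops like the #pragma unroll in _ConstPad can actually unroll. A hedged usage sketch (the kernel and launcher below are illustrative, not existing library functions; SimpleArray, CUDA_1D_KERNEL_LOOP, CUDA_BLOCKS and CUDA_THREADS are the library's own helpers):

// Kernel templated on a constant dimension count D.
template <typename T, int D>
__global__ void _StridedCopy(
    const int N,
    const SimpleArray<int, D> X_strides,
    const SimpleArray<int, D> Y_dims,
    const T* x,
    T* y) {
  CUDA_1D_KERNEL_LOOP(yi, N) {
    int xi = 0, tmp = yi;
#pragma unroll
    for (int d = D - 1; d >= 0; --d) {
      xi += (tmp % Y_dims.data[d]) * X_strides.data[d];
      tmp /= Y_dims.data[d];
    }
    y[yi] = x[xi];
  }
}

// Host launcher, also templated on D, so the macro can call it directly.
template <typename T, int D>
void _StridedCopyImpl(
    const int N,
    const int64_t* strides,
    const int64_t* dims,
    const T* x,
    T* y,
    CUDAContext* ctx) {
  SimpleArray<int, D> X_strides, Y_dims;
  for (int i = 0; i < D; ++i) {
    X_strides.data[i] = static_cast<int>(strides[i]);
    Y_dims.data[i] = static_cast<int>(dims[i]);
  }
  _StridedCopy<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
      N, X_strides, Y_dims, x, y);
}

// At the call site, a runtime num_dims (1..8) picks the instantiation:
//   DISPATCH_FUNC_BY_VALUE_WITH_TYPE_1(
//       _StridedCopyImpl, T, num_dims, N, strides, dims, x, y, ctx);

Note that the macros expand to a plain function call, not a kernel launch, so the dispatched symbol is a host wrapper like the one above rather than the __global__ kernel itself.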
...@@ -147,9 +147,9 @@ DEFINE_SCALE_FUNC(int64_t); ...@@ -147,9 +147,9 @@ DEFINE_SCALE_FUNC(int64_t);
} \ } \
if (alpha != 1.f) { \ if (alpha != 1.f) { \
T alpha_val = static_cast<T>(alpha); \ T alpha_val = static_cast<T>(alpha); \
CUBLAS_CHECK(cublasSetPointerMode( \ const auto& handle = ctx->cublas_handle(); \
ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \ CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST)); \
CUBLAS_CHECK(cublasFunc(ctx->cublas_handle(), N, &alpha_val, y, 1)); \ CUBLAS_CHECK(cublasFunc(handle, N, &alpha_val, y, 1)); \
} \ } \
} }
...@@ -169,17 +169,10 @@ DRAGON_API void Scale<float16, CUDAContext>( ...@@ -169,17 +169,10 @@ DRAGON_API void Scale<float16, CUDAContext>(
ctx->cuda_stream())); ctx->cuda_stream()));
} }
if (alpha != 1.f) { if (alpha != 1.f) {
CUBLAS_CHECK( const auto& handle = ctx->cublas_handle();
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST));
CUBLAS_CHECK(cublasScalEx( CUBLAS_CHECK(cublasScalEx(
ctx->cublas_handle(), handle, N, &alpha, CUDA_R_32F, y, CUDA_R_16F, 1, CUDA_R_32F));
N,
&alpha,
CUDA_R_32F,
y,
CUDA_R_16F,
1,
CUDA_R_32F));
} }
} }
...@@ -294,14 +287,14 @@ DEFINE_AXPY_FUNC(int); ...@@ -294,14 +287,14 @@ DEFINE_AXPY_FUNC(int);
DEFINE_AXPY_FUNC(int64_t); DEFINE_AXPY_FUNC(int64_t);
#undef DEFINE_AXPY_FUNC #undef DEFINE_AXPY_FUNC
#define DEFINE_AXPY_FUNC(T, cublasFunc) \ #define DEFINE_AXPY_FUNC(T, cublasFunc) \
template <> \ template <> \
DRAGON_API void Axpy<T, CUDAContext>( \ DRAGON_API void Axpy<T, CUDAContext>( \
const int N, const float alpha, const T* x, T* y, CUDAContext* ctx) { \ const int N, const float alpha, const T* x, T* y, CUDAContext* ctx) { \
T alpha_val = static_cast<T>(alpha); \ T alpha_val = static_cast<T>(alpha); \
CUBLAS_CHECK( \ const auto& handle = ctx->cublas_handle(); \
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \ CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST)); \
CUBLAS_CHECK(cublasFunc(ctx->cublas_handle(), N, &alpha_val, x, 1, y, 1)); \ CUBLAS_CHECK(cublasFunc(handle, N, &alpha_val, x, 1, y, 1)); \
} }
template <> template <>
...@@ -311,10 +304,10 @@ DRAGON_API void Axpy<float16, CUDAContext>( ...@@ -311,10 +304,10 @@ DRAGON_API void Axpy<float16, CUDAContext>(
const float16* x, const float16* x,
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
CUBLAS_CHECK( const auto& handle = ctx->cublas_handle();
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST));
CUBLAS_CHECK(cublasAxpyEx( CUBLAS_CHECK(cublasAxpyEx(
ctx->cublas_handle(), handle,
N, N,
&alpha, &alpha,
CUDA_R_32F, CUDA_R_32F,
...@@ -377,22 +370,22 @@ DEFINE_AXPBY_FUNC(float); ...@@ -377,22 +370,22 @@ DEFINE_AXPBY_FUNC(float);
DEFINE_AXPBY_FUNC(double); DEFINE_AXPBY_FUNC(double);
#undef DEFINE_AXPBY_FUNC #undef DEFINE_AXPBY_FUNC
#define DEFINE_DOT_FUNC(T, cublasFunc) \ #define DEFINE_DOT_FUNC(T, cublasFunc) \
template <> \ template <> \
DRAGON_API void Dot<T, CUDAContext>( \ DRAGON_API void Dot<T, CUDAContext>( \
const int N, const T* a, const T* b, T* y, CUDAContext* ctx) { \ const int N, const T* a, const T* b, T* y, CUDAContext* ctx) { \
CUBLAS_CHECK(cublasSetPointerMode( \ const auto& handle = ctx->cublas_handle(); \
ctx->cublas_handle(), CUBLAS_POINTER_MODE_DEVICE)); \ CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE)); \
CUBLAS_CHECK(cublasFunc(ctx->cublas_handle(), N, a, 1, b, 1, y)); \ CUBLAS_CHECK(cublasFunc(handle, N, a, 1, b, 1, y)); \
} \ } \
template <> \ template <> \
DRAGON_API T Dot<T, CUDAContext>( \ DRAGON_API T Dot<T, CUDAContext>( \
const int N, const T* a, const T* b, CUDAContext* ctx) { \ const int N, const T* a, const T* b, CUDAContext* ctx) { \
T ret; \ T ret; \
CUBLAS_CHECK( \ const auto& handle = ctx->cublas_handle(); \
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \ CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST)); \
CUBLAS_CHECK(cublasFunc(ctx->cublas_handle(), N, a, 1, b, 1, &ret)); \ CUBLAS_CHECK(cublasFunc(handle, N, a, 1, b, 1, &ret)); \
return ret; \ return ret; \
} }
template <> template <>
...@@ -402,10 +395,10 @@ DRAGON_API void Dot<float16, CUDAContext>( ...@@ -402,10 +395,10 @@ DRAGON_API void Dot<float16, CUDAContext>(
const float16* b, const float16* b,
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
CUBLAS_CHECK( const auto& handle = ctx->cublas_handle();
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_DEVICE)); CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE));
CUBLAS_CHECK(cublasDotEx( CUBLAS_CHECK(cublasDotEx(
ctx->cublas_handle(), handle,
N, N,
a, a,
CUDA_R_16F, CUDA_R_16F,
...@@ -422,22 +415,22 @@ DEFINE_DOT_FUNC(float, cublasSdot); ...@@ -422,22 +415,22 @@ DEFINE_DOT_FUNC(float, cublasSdot);
DEFINE_DOT_FUNC(double, cublasDdot); DEFINE_DOT_FUNC(double, cublasDdot);
#undef DEFINE_DOT_FUNC #undef DEFINE_DOT_FUNC
#define DEFINE_ASUM_FUNC(T, cublasFunc) \ #define DEFINE_ASUM_FUNC(T, cublasFunc) \
template <> \ template <> \
DRAGON_API void ASum<T, CUDAContext>( \ DRAGON_API void ASum<T, CUDAContext>( \
const int N, const T* x, T* y, CUDAContext* ctx) { \ const int N, const T* x, T* y, CUDAContext* ctx) { \
CUBLAS_CHECK(cublasSetPointerMode( \ const auto& handle = ctx->cublas_handle(); \
ctx->cublas_handle(), CUBLAS_POINTER_MODE_DEVICE)); \ CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE)); \
CUBLAS_CHECK(cublasFunc(ctx->cublas_handle(), N, x, 1, y)); \ CUBLAS_CHECK(cublasFunc(handle, N, x, 1, y)); \
} \ } \
template <> \ template <> \
DRAGON_API T ASum<T, CUDAContext>( \ DRAGON_API T ASum<T, CUDAContext>( \
const int N, const T* x, CUDAContext* ctx) { \ const int N, const T* x, CUDAContext* ctx) { \
T ret; \ T ret; \
CUBLAS_CHECK( \ const auto& handle = ctx->cublas_handle(); \
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \ CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST)); \
CUBLAS_CHECK(cublasFunc(ctx->cublas_handle(), N, x, 1, &ret)); \ CUBLAS_CHECK(cublasFunc(handle, N, x, 1, &ret)); \
return ret; \ return ret; \
} }
DEFINE_ASUM_FUNC(float, cublasSasum); DEFINE_ASUM_FUNC(float, cublasSasum);
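
Each `Dot`/`ASum` macro above defines two overloads that differ only in where the scalar result lands; the pointer-mode setting tells cuBLAS whether the output pointer refers to device or host memory. A small sketch of the two variants, assuming plain cuBLAS with no framework wrappers:

```cpp
#include <cublas_v2.h>

// Result written to device memory; the call stays asynchronous on the stream.
void AsumToDevice(cublasHandle_t handle, int N, const float* x, float* y_device) {
  cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
  cublasSasum(handle, N, x, 1, y_device);
}

// Result copied into a host variable; this implicitly synchronizes.
float AsumToHost(cublasHandle_t handle, int N, const float* x) {
  float ret = 0.f;
  cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
  cublasSasum(handle, N, x, 1, &ret);
  return ret;
}
```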
...@@ -456,15 +449,18 @@ DRAGON_API void Gemv<float16, CUDAContext>( ...@@ -456,15 +449,18 @@ DRAGON_API void Gemv<float16, CUDAContext>(
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
auto cuTransA = TransA == CblasNoTrans ? CUBLAS_OP_T : CUBLAS_OP_N; auto cuTransA = TransA == CblasNoTrans ? CUBLAS_OP_T : CUBLAS_OP_N;
int m = cuTransA == CUBLAS_OP_N ? N : M; const int m = cuTransA == CUBLAS_OP_N ? N : M;
int k = cuTransA == CUBLAS_OP_N ? M : N; const int k = cuTransA == CUBLAS_OP_N ? M : N;
int LDA = cuTransA == CUBLAS_OP_N ? m : k; const int LDA = cuTransA == CUBLAS_OP_N ? m : k;
int LDC = m; const int LDC = m;
CUBLAS_CHECK( const auto& handle = ctx->cublas_handle();
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); #if CUDA_VERSION < 11000
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
#endif
CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST));
if (TENSOR_CORE_AVAILABLE()) { if (TENSOR_CORE_AVAILABLE()) {
CUBLAS_CHECK(cublasGemmEx( CUBLAS_CHECK(cublasGemmEx(
ctx->cublas_handle(), handle,
cuTransA, cuTransA,
CUBLAS_OP_N, CUBLAS_OP_N,
m, m,
...@@ -485,7 +481,7 @@ DRAGON_API void Gemv<float16, CUDAContext>( ...@@ -485,7 +481,7 @@ DRAGON_API void Gemv<float16, CUDAContext>(
CUBLAS_GEMM_DEFAULT_TENSOR_OP)); CUBLAS_GEMM_DEFAULT_TENSOR_OP));
} else { } else {
CUBLAS_CHECK(cublasSgemmEx( CUBLAS_CHECK(cublasSgemmEx(
ctx->cublas_handle(), handle,
cuTransA, cuTransA,
CUBLAS_OP_N, CUBLAS_OP_N,
m, m,
...@@ -503,38 +499,30 @@ DRAGON_API void Gemv<float16, CUDAContext>( ...@@ -503,38 +499,30 @@ DRAGON_API void Gemv<float16, CUDAContext>(
CUDA_R_16F, CUDA_R_16F,
LDC)); LDC));
} }
#if CUDA_VERSION < 11000
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
#endif
} }
#define DEFINE_GEMV_FUNC(T, cublasFunc) \ #define DEFINE_GEMV_FUNC(T, cublasFunc) \
template <> \ template <> \
DRAGON_API void Gemv<T, CUDAContext>( \ DRAGON_API void Gemv<T, CUDAContext>( \
const CBLAS_TRANSPOSE TransA, \ const CBLAS_TRANSPOSE TransA, \
const int M, \ const int M, \
const int N, \ const int N, \
const float alpha, \ const float alpha, \
const T* A, \ const T* A, \
const T* x, \ const T* x, \
const float beta, \ const float beta, \
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
auto cuTransA = TransA == CblasNoTrans ? CUBLAS_OP_T : CUBLAS_OP_N; \ auto cuTransA = TransA == CblasNoTrans ? CUBLAS_OP_T : CUBLAS_OP_N; \
const auto alpha_val = static_cast<T>(alpha); \ const auto alpha_val = static_cast<T>(alpha); \
const auto beta_val = static_cast<T>(beta); \ const auto beta_val = static_cast<T>(beta); \
CUBLAS_CHECK( \ const auto& handle = ctx->cublas_handle(); \
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \ CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST)); \
CUBLAS_CHECK(cublasFunc( \ CUBLAS_CHECK(cublasFunc( \
ctx->cublas_handle(), \ handle, cuTransA, N, M, &alpha_val, A, N, x, 1, &beta_val, y, 1)); \
cuTransA, \
N, \
M, \
&alpha_val, \
A, \
N, \
x, \
1, \
&beta_val, \
y, \
1)); \
} }
DEFINE_GEMV_FUNC(float, cublasSgemv); DEFINE_GEMV_FUNC(float, cublasSgemv);
...@@ -558,11 +546,14 @@ DRAGON_API void Gemm<float16, CUDAContext>( ...@@ -558,11 +546,14 @@ DRAGON_API void Gemm<float16, CUDAContext>(
int ldb = (TransB == CblasNoTrans) ? N : K; int ldb = (TransB == CblasNoTrans) ? N : K;
auto cuTransA = TransA == CblasNoTrans ? CUBLAS_OP_N : CUBLAS_OP_T; auto cuTransA = TransA == CblasNoTrans ? CUBLAS_OP_N : CUBLAS_OP_T;
auto cuTransB = TransB == CblasNoTrans ? CUBLAS_OP_N : CUBLAS_OP_T; auto cuTransB = TransB == CblasNoTrans ? CUBLAS_OP_N : CUBLAS_OP_T;
CUBLAS_CHECK( const auto& handle = ctx->cublas_handle();
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); #if CUDA_VERSION < 11000
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
#endif
CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST));
if (TENSOR_CORE_AVAILABLE()) { if (TENSOR_CORE_AVAILABLE()) {
CUBLAS_CHECK(cublasGemmEx( CUBLAS_CHECK(cublasGemmEx(
ctx->cublas_handle(), handle,
cuTransB, cuTransB,
cuTransA, cuTransA,
N, N,
...@@ -583,7 +574,7 @@ DRAGON_API void Gemm<float16, CUDAContext>( ...@@ -583,7 +574,7 @@ DRAGON_API void Gemm<float16, CUDAContext>(
CUBLAS_GEMM_DEFAULT_TENSOR_OP)); CUBLAS_GEMM_DEFAULT_TENSOR_OP));
} else { } else {
CUBLAS_CHECK(cublasSgemmEx( CUBLAS_CHECK(cublasSgemmEx(
ctx->cublas_handle(), handle,
cuTransB, cuTransB,
cuTransA, cuTransA,
N, N,
...@@ -601,45 +592,48 @@ DRAGON_API void Gemm<float16, CUDAContext>( ...@@ -601,45 +592,48 @@ DRAGON_API void Gemm<float16, CUDAContext>(
CUDA_R_16F, CUDA_R_16F,
N)); N));
} }
#if CUDA_VERSION < 11000
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
#endif
} }
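
On toolkits older than CUDA 11, the fp16 `Gemv`/`Gemm` paths above enable `CUBLAS_TENSOR_OP_MATH` just for the call and restore `CUBLAS_DEFAULT_MATH` afterwards. The diff does this inline; one alternative way to package the set/restore pair, shown purely as a sketch with status checks omitted:

```cpp
#include <cuda.h>
#include <cublas_v2.h>

// Enables tensor-op math for the lifetime of the guard on pre-CUDA 11 toolkits,
// so other routines sharing the handle keep the default math mode.
struct ScopedTensorOpMath {
  explicit ScopedTensorOpMath(cublasHandle_t h) : handle_(h) {
#if CUDA_VERSION < 11000
    cublasSetMathMode(handle_, CUBLAS_TENSOR_OP_MATH);
#endif
  }
  ~ScopedTensorOpMath() {
#if CUDA_VERSION < 11000
    cublasSetMathMode(handle_, CUBLAS_DEFAULT_MATH);
#endif
  }
  cublasHandle_t handle_;
};
```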
#define DEFINE_GEMM_FUNC(T, cublasFunc) \ #define DEFINE_GEMM_FUNC(T, cublasFunc) \
template <> \ template <> \
DRAGON_API void Gemm<T, CUDAContext>( \ DRAGON_API void Gemm<T, CUDAContext>( \
const CBLAS_TRANSPOSE TransA, \ const CBLAS_TRANSPOSE TransA, \
const CBLAS_TRANSPOSE TransB, \ const CBLAS_TRANSPOSE TransB, \
const int M, \ const int M, \
const int N, \ const int N, \
const int K, \ const int K, \
const float alpha, \ const float alpha, \
const T* A, \ const T* A, \
const T* B, \ const T* B, \
const float beta, \ const float beta, \
T* C, \ T* C, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
int lda = TransA == CblasNoTrans ? K : M; \ int lda = TransA == CblasNoTrans ? K : M; \
int ldb = TransB == CblasNoTrans ? N : K; \ int ldb = TransB == CblasNoTrans ? N : K; \
auto cuTransA = TransA == CblasNoTrans ? CUBLAS_OP_N : CUBLAS_OP_T; \ auto cuTransA = TransA == CblasNoTrans ? CUBLAS_OP_N : CUBLAS_OP_T; \
auto cuTransB = TransB == CblasNoTrans ? CUBLAS_OP_N : CUBLAS_OP_T; \ auto cuTransB = TransB == CblasNoTrans ? CUBLAS_OP_N : CUBLAS_OP_T; \
const auto alpha_val = static_cast<T>(alpha); \ const auto alpha_val = static_cast<T>(alpha); \
const auto beta_val = static_cast<T>(beta); \ const auto beta_val = static_cast<T>(beta); \
CUBLAS_CHECK( \ const auto& handle = ctx->cublas_handle(); \
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \ CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST)); \
CUBLAS_CHECK(cublasFunc( \ CUBLAS_CHECK(cublasFunc( \
ctx->cublas_handle(), \ handle, \
cuTransB, \ cuTransB, \
cuTransA, \ cuTransA, \
N, \ N, \
M, \ M, \
K, \ K, \
&alpha_val, \ &alpha_val, \
B, \ B, \
ldb, \ ldb, \
A, \ A, \
lda, \ lda, \
&beta_val, \ &beta_val, \
C, \ C, \
N)); \ N)); \
} }
DEFINE_GEMM_FUNC(float, cublasSgemm); DEFINE_GEMM_FUNC(float, cublasSgemm);
...@@ -668,10 +662,13 @@ DRAGON_API void GemmBatched<float16, CUDAContext>( ...@@ -668,10 +662,13 @@ DRAGON_API void GemmBatched<float16, CUDAContext>(
thrust::device_vector<const void*> A_arr(A, A + batch_size); thrust::device_vector<const void*> A_arr(A, A + batch_size);
thrust::device_vector<const void*> B_arr(B, B + batch_size); thrust::device_vector<const void*> B_arr(B, B + batch_size);
thrust::device_vector<void*> C_arr(C, C + batch_size); thrust::device_vector<void*> C_arr(C, C + batch_size);
CUBLAS_CHECK( const auto& handle = ctx->cublas_handle();
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); #if CUDA_VERSION < 11000
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
#endif
CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST));
CUBLAS_CHECK(cublasGemmBatchedEx( CUBLAS_CHECK(cublasGemmBatchedEx(
ctx->cublas_handle(), handle,
cuTransB, cuTransB,
cuTransA, cuTransA,
N, N,
...@@ -691,51 +688,54 @@ DRAGON_API void GemmBatched<float16, CUDAContext>( ...@@ -691,51 +688,54 @@ DRAGON_API void GemmBatched<float16, CUDAContext>(
batch_size, batch_size,
CUDA_R_32F, CUDA_R_32F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP)); CUBLAS_GEMM_DEFAULT_TENSOR_OP));
#if CUDA_VERSION < 11000
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
#endif
} }
#define DEFINE_BATCHED_GEMM_FUNC(T, cublasFunc) \ #define DEFINE_BATCHED_GEMM_FUNC(T, cublasFunc) \
template <> \ template <> \
DRAGON_API void GemmBatched<T, CUDAContext>( \ DRAGON_API void GemmBatched<T, CUDAContext>( \
const CBLAS_TRANSPOSE TransA, \ const CBLAS_TRANSPOSE TransA, \
const CBLAS_TRANSPOSE TransB, \ const CBLAS_TRANSPOSE TransB, \
const int batch_size, \ const int batch_size, \
const int M, \ const int M, \
const int N, \ const int N, \
const int K, \ const int K, \
const float alpha, \ const float alpha, \
const T** A, \ const T** A, \
const T** B, \ const T** B, \
const float beta, \ const float beta, \
T** C, \ T** C, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
int lda = TransA == CblasNoTrans ? K : M; \ int lda = TransA == CblasNoTrans ? K : M; \
int ldb = TransB == CblasNoTrans ? N : K; \ int ldb = TransB == CblasNoTrans ? N : K; \
int ldc = N; \ int ldc = N; \
auto cuTransA = TransA == CblasNoTrans ? CUBLAS_OP_N : CUBLAS_OP_T; \ auto cuTransA = TransA == CblasNoTrans ? CUBLAS_OP_N : CUBLAS_OP_T; \
auto cuTransB = TransB == CblasNoTrans ? CUBLAS_OP_N : CUBLAS_OP_T; \ auto cuTransB = TransB == CblasNoTrans ? CUBLAS_OP_N : CUBLAS_OP_T; \
const auto alpha_val = static_cast<T>(alpha); \ const auto alpha_val = static_cast<T>(alpha); \
const auto beta_val = static_cast<T>(beta); \ const auto beta_val = static_cast<T>(beta); \
thrust::device_vector<const T*> A_arr(A, A + batch_size); \ thrust::device_vector<const T*> A_arr(A, A + batch_size); \
thrust::device_vector<const T*> B_arr(B, B + batch_size); \ thrust::device_vector<const T*> B_arr(B, B + batch_size); \
thrust::device_vector<T*> C_arr(C, C + batch_size); \ thrust::device_vector<T*> C_arr(C, C + batch_size); \
CUBLAS_CHECK( \ const auto& handle = ctx->cublas_handle(); \
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \ CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST)); \
CUBLAS_CHECK(cublasFunc( \ CUBLAS_CHECK(cublasFunc( \
ctx->cublas_handle(), \ handle, \
cuTransB, \ cuTransB, \
cuTransA, \ cuTransA, \
N, \ N, \
M, \ M, \
K, \ K, \
&alpha_val, \ &alpha_val, \
B_arr.data().get(), \ B_arr.data().get(), \
ldb, \ ldb, \
A_arr.data().get(), \ A_arr.data().get(), \
lda, \ lda, \
&beta_val, \ &beta_val, \
C_arr.data().get(), \ C_arr.data().get(), \
ldc, \ ldc, \
batch_size)); \ batch_size)); \
} }
DEFINE_BATCHED_GEMM_FUNC(float, cublasSgemmBatched); DEFINE_BATCHED_GEMM_FUNC(float, cublasSgemmBatched);
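
`cublasGemmBatchedEx` and `cublas*gemmBatched` expect device-resident arrays of device pointers, which is why the code above stages the host pointer arrays through `thrust::device_vector`. A hedged standalone sketch of the same staging for the fp32, no-transpose case, including the usual row-major-as-column-major argument swap:

```cpp
#include <thrust/device_vector.h>
#include <cublas_v2.h>

// Batched row-major C[i] = A[i] * B[i], with A: MxK, B: KxN, C: MxN.
void BatchedGemmSketch(cublasHandle_t handle, int batch, int M, int N, int K,
                       const float** A_host, const float** B_host, float** C_host) {
  // Copy the host pointer arrays to device memory; the batched API requires it.
  thrust::device_vector<const float*> A_arr(A_host, A_host + batch);
  thrust::device_vector<const float*> B_arr(B_host, B_host + batch);
  thrust::device_vector<float*> C_arr(C_host, C_host + batch);
  const float alpha = 1.f, beta = 0.f;
  // Row-major C = A * B is computed as column-major C^T = B^T * A^T,
  // hence B comes first and N is the leading dimension of C.
  cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha,
                     B_arr.data().get(), N, A_arr.data().get(), K, &beta,
                     C_arr.data().get(), N, batch);
}
```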
...@@ -764,10 +764,13 @@ DRAGON_API void GemmStridedBatched<float16, CUDAContext>( ...@@ -764,10 +764,13 @@ DRAGON_API void GemmStridedBatched<float16, CUDAContext>(
int ldc = N; int ldc = N;
auto cuTransA = TransA == CblasNoTrans ? CUBLAS_OP_N : CUBLAS_OP_T; auto cuTransA = TransA == CblasNoTrans ? CUBLAS_OP_N : CUBLAS_OP_T;
auto cuTransB = TransB == CblasNoTrans ? CUBLAS_OP_N : CUBLAS_OP_T; auto cuTransB = TransB == CblasNoTrans ? CUBLAS_OP_N : CUBLAS_OP_T;
CUBLAS_CHECK( const auto& handle = ctx->cublas_handle();
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); #if CUDA_VERSION < 11000
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
#endif
CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST));
CUBLAS_CHECK(cublasGemmStridedBatchedEx( CUBLAS_CHECK(cublasGemmStridedBatchedEx(
ctx->cublas_handle(), handle,
cuTransB, cuTransB,
cuTransA, cuTransA,
N, N,
...@@ -790,54 +793,57 @@ DRAGON_API void GemmStridedBatched<float16, CUDAContext>( ...@@ -790,54 +793,57 @@ DRAGON_API void GemmStridedBatched<float16, CUDAContext>(
batch_size, batch_size,
CUDA_R_32F, CUDA_R_32F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP)); CUBLAS_GEMM_DEFAULT_TENSOR_OP));
#if CUDA_VERSION < 11000
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
#endif
} }
#define DEFINE_STRIDED_BATCHED_GEMM_FUNC(T, cublasFunc) \ #define DEFINE_STRIDED_BATCHED_GEMM_FUNC(T, cublasFunc) \
template <> \ template <> \
DRAGON_API void GemmStridedBatched<T, CUDAContext>( \ DRAGON_API void GemmStridedBatched<T, CUDAContext>( \
const CBLAS_TRANSPOSE TransA, \ const CBLAS_TRANSPOSE TransA, \
const CBLAS_TRANSPOSE TransB, \ const CBLAS_TRANSPOSE TransB, \
const int batch_size, \ const int batch_size, \
const int M, \ const int M, \
const int N, \ const int N, \
const int K, \ const int K, \
const int A_stride, \ const int A_stride, \
const int B_stride, \ const int B_stride, \
const int C_stride, \ const int C_stride, \
const float alpha, \ const float alpha, \
const T* A, \ const T* A, \
const T* B, \ const T* B, \
const float beta, \ const float beta, \
T* C, \ T* C, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
int lda = TransA == CblasNoTrans ? K : M; \ int lda = TransA == CblasNoTrans ? K : M; \
int ldb = TransB == CblasNoTrans ? N : K; \ int ldb = TransB == CblasNoTrans ? N : K; \
int ldc = N; \ int ldc = N; \
auto cuTransA = TransA == CblasNoTrans ? CUBLAS_OP_N : CUBLAS_OP_T; \ auto cuTransA = TransA == CblasNoTrans ? CUBLAS_OP_N : CUBLAS_OP_T; \
auto cuTransB = TransB == CblasNoTrans ? CUBLAS_OP_N : CUBLAS_OP_T; \ auto cuTransB = TransB == CblasNoTrans ? CUBLAS_OP_N : CUBLAS_OP_T; \
const auto alpha_val = static_cast<T>(alpha); \ const auto alpha_val = static_cast<T>(alpha); \
const auto beta_val = static_cast<T>(beta); \ const auto beta_val = static_cast<T>(beta); \
CUBLAS_CHECK( \ const auto& handle = ctx->cublas_handle(); \
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \ CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST)); \
CUBLAS_CHECK(cublasFunc( \ CUBLAS_CHECK(cublasFunc( \
ctx->cublas_handle(), \ handle, \
cuTransB, \ cuTransB, \
cuTransA, \ cuTransA, \
N, \ N, \
M, \ M, \
K, \ K, \
&alpha_val, \ &alpha_val, \
B, \ B, \
ldb, \ ldb, \
B_stride, \ B_stride, \
A, \ A, \
lda, \ lda, \
A_stride, \ A_stride, \
&beta_val, \ &beta_val, \
C, \ C, \
ldc, \ ldc, \
C_stride, \ C_stride, \
batch_size)); \ batch_size)); \
} }
DEFINE_STRIDED_BATCHED_GEMM_FUNC(float, cublasSgemmStridedBatched); DEFINE_STRIDED_BATCHED_GEMM_FUNC(float, cublasSgemmStridedBatched);
......
...@@ -14,30 +14,30 @@ namespace math { ...@@ -14,30 +14,30 @@ namespace math {
namespace { namespace {
template <typename T> template <typename T>
__global__ void _RowwiseSet(const int n, const int cols, const T* x, T* y) { __global__ void _RowwiseSet(const int N, const int cols, const T* x, T* y) {
CUDA_1D_KERNEL_LOOP(i, n) { CUDA_1D_KERNEL_LOOP(i, N) {
y[i] = __ldg(x + i % cols); y[i] = __ldg(x + i % cols);
} }
} }
template <typename T> template <typename T>
__global__ void _ColwiseSet(const int n, const int cols, const T* x, T* y) { __global__ void _ColwiseSet(const int N, const int cols, const T* x, T* y) {
CUDA_1D_KERNEL_LOOP(i, n) { CUDA_1D_KERNEL_LOOP(i, N) {
y[i] = __ldg(x + i / cols); y[i] = __ldg(x + i / cols);
} }
} }
template <typename T, int D> template <typename T, int D>
__global__ void _BroadcastSet( __global__ void _BroadcastSet(
const int nthreads, const int N,
const int num_dims,
const SimpleArray<int, D> x_strides, const SimpleArray<int, D> x_strides,
const SimpleArray<int, D> y_dims, const SimpleArray<int, D> y_dims,
const T* x, const T* x,
T* y) { T* y) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) { CUDA_1D_KERNEL_LOOP(yi, N) {
int xi = 0, tmp = yi; int xi = 0, tmp = yi;
for (int d = num_dims - 1; d >= 0; --d) { #pragma unroll
for (int d = D - 1; d >= 0; --d) {
int r; int r;
FIXED_DIVISOR_DIV_MOD(y_dims.data[d], tmp, &tmp, &r); FIXED_DIVISOR_DIV_MOD(y_dims.data[d], tmp, &tmp, &r);
xi += r * x_strides.data[d]; xi += r * x_strides.data[d];
...@@ -48,13 +48,13 @@ __global__ void _BroadcastSet( ...@@ -48,13 +48,13 @@ __global__ void _BroadcastSet(
template <typename InputT, typename OutputT, class Functor, bool BroadcastA> template <typename InputT, typename OutputT, class Functor, bool BroadcastA>
__global__ void _RowwiseBinaryFunc( __global__ void _RowwiseBinaryFunc(
const int nthreads, const int N,
const int cols, const int cols,
const Functor op, const Functor op,
const InputT* a, const InputT* a,
const InputT* b, const InputT* b,
OutputT* y) { OutputT* y) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) { CUDA_1D_KERNEL_LOOP(yi, N) {
const int i = yi % cols; const int i = yi % cols;
const int ai = BroadcastA ? i : yi; const int ai = BroadcastA ? i : yi;
const int bi = BroadcastA ? yi : i; const int bi = BroadcastA ? yi : i;
...@@ -64,13 +64,13 @@ __global__ void _RowwiseBinaryFunc( ...@@ -64,13 +64,13 @@ __global__ void _RowwiseBinaryFunc(
template <typename InputT, typename OutputT, class Functor, bool BroadcastA> template <typename InputT, typename OutputT, class Functor, bool BroadcastA>
__global__ void _ColwiseBinaryFunc( __global__ void _ColwiseBinaryFunc(
const int nthreads, const int N,
const int cols, const int cols,
const Functor op, const Functor op,
const InputT* a, const InputT* a,
const InputT* b, const InputT* b,
OutputT* y) { OutputT* y) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) { CUDA_1D_KERNEL_LOOP(yi, N) {
const int i = yi / cols; const int i = yi / cols;
const int ai = BroadcastA ? i : yi; const int ai = BroadcastA ? i : yi;
const int bi = BroadcastA ? yi : i; const int bi = BroadcastA ? yi : i;
...@@ -80,8 +80,7 @@ __global__ void _ColwiseBinaryFunc( ...@@ -80,8 +80,7 @@ __global__ void _ColwiseBinaryFunc(
template <typename InputT, typename OutputT, class Functor, int D> template <typename InputT, typename OutputT, class Functor, int D>
__global__ void _BroadcastBinaryFunc( __global__ void _BroadcastBinaryFunc(
const int nthreads, const int N,
const int num_dims,
const SimpleArray<int, D> a_strides, const SimpleArray<int, D> a_strides,
const SimpleArray<int, D> b_strides, const SimpleArray<int, D> b_strides,
const SimpleArray<int, D> y_dims, const SimpleArray<int, D> y_dims,
...@@ -89,9 +88,10 @@ __global__ void _BroadcastBinaryFunc( ...@@ -89,9 +88,10 @@ __global__ void _BroadcastBinaryFunc(
const InputT* a, const InputT* a,
const InputT* b, const InputT* b,
OutputT* y) { OutputT* y) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) { CUDA_1D_KERNEL_LOOP(yi, N) {
int ai = 0, bi = 0, tmp = yi; int ai = 0, bi = 0, tmp = yi;
for (int d = num_dims - 1; d >= 0; --d) { #pragma unroll
for (int d = D - 1; d >= 0; --d) {
int r; int r;
FIXED_DIVISOR_DIV_MOD(y_dims.data[d], tmp, &tmp, &r); FIXED_DIVISOR_DIV_MOD(y_dims.data[d], tmp, &tmp, &r);
ai += r * a_strides.data[d]; ai += r * a_strides.data[d];
...@@ -103,8 +103,7 @@ __global__ void _BroadcastBinaryFunc( ...@@ -103,8 +103,7 @@ __global__ void _BroadcastBinaryFunc(
template <typename T, int D> template <typename T, int D>
__global__ void _BroadcastWhere( __global__ void _BroadcastWhere(
const int nthreads, const int N,
const int num_dims,
const SimpleArray<int, D> a_strides, const SimpleArray<int, D> a_strides,
const SimpleArray<int, D> b_strides, const SimpleArray<int, D> b_strides,
const SimpleArray<int, D> c_strides, const SimpleArray<int, D> c_strides,
...@@ -113,9 +112,10 @@ __global__ void _BroadcastWhere( ...@@ -113,9 +112,10 @@ __global__ void _BroadcastWhere(
const T* b, const T* b,
const uint8_t* c, const uint8_t* c,
T* y) { T* y) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) { CUDA_1D_KERNEL_LOOP(yi, N) {
int ai = 0, bi = 0, ci = 0, tmp = yi; int ai = 0, bi = 0, ci = 0, tmp = yi;
for (int d = num_dims - 1; d >= 0; --d) { #pragma unroll
for (int d = D - 1; d >= 0; --d) {
int r; int r;
FIXED_DIVISOR_DIV_MOD(y_dims.data[d], tmp, &tmp, &r); FIXED_DIVISOR_DIV_MOD(y_dims.data[d], tmp, &tmp, &r);
ai += r * a_strides.data[d]; ai += r * a_strides.data[d];
...@@ -126,78 +126,126 @@ __global__ void _BroadcastWhere( ...@@ -126,78 +126,126 @@ __global__ void _BroadcastWhere(
} }
} }
template <typename T, int D>
void _BroadcastSetImpl(
const int64_t* x_strides,
const int64_t* y_dims,
const T* x,
T* y,
CUDAContext* ctx) {
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_strides, Y_dims;
const auto N =
std::accumulate(y_dims, y_dims + D, 1, std::multiplies<int64_t>());
for (int i = 0; i < D; ++i) {
X_strides.data[i] = x_strides[i];
Y_dims.data[i] = y_dims[i];
}
_BroadcastSet<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, X_strides, Y_dims, x, y);
}
template <typename InputT, typename OutputT, class Functor, int D>
void _BroadcastBinaryFuncImpl(
const int64_t* a_strides,
const int64_t* b_strides,
const int64_t* y_dims,
const Functor op,
const InputT* a,
const InputT* b,
OutputT* y,
CUDAContext* ctx) {
SimpleArray<int, D> A_strides, B_strides, Y_dims;
const auto N =
std::accumulate(y_dims, y_dims + D, 1, std::multiplies<int64_t>());
for (int i = 0; i < D; ++i) {
A_strides.data[i] = a_strides[i];
B_strides.data[i] = b_strides[i];
Y_dims.data[i] = y_dims[i];
}
_BroadcastBinaryFunc<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, A_strides, B_strides, Y_dims, op, a, b, y);
}
template <typename T, int D>
void _BroadcastWhereImpl(
const int64_t* a_strides,
const int64_t* b_strides,
const int64_t* c_strides,
const int64_t* y_dims,
const T* a,
const T* b,
const uint8_t* c,
T* y,
CUDAContext* ctx) {
SimpleArray<int, D> A_strides, B_strides, C_strides;
SimpleArray<int, D> Y_dims;
const auto N =
std::accumulate(y_dims, y_dims + D, 1, std::multiplies<int64_t>());
for (int i = 0; i < D; ++i) {
A_strides.data[i] = a_strides[i];
B_strides.data[i] = b_strides[i];
C_strides.data[i] = c_strides[i];
Y_dims.data[i] = y_dims[i];
}
_BroadcastWhere<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, A_strides, B_strides, C_strides, Y_dims, a, b, c, y);
}
} // namespace } // namespace
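
The `_*Impl` helpers above take the tensor rank as the template parameter `D` rather than as a runtime argument; the `DISPATCH_FUNC_BY_VALUE_WITH_TYPE_*` macros used below map the runtime rank onto one of those instantiations. A minimal sketch of what such a macro could expand to (the real definition is not shown in this diff, and the 8-dimension cap mirrors an assumed `CUDA_TENSOR_MAX_DIMS == 8`):

```cpp
// Hypothetical sketch only: the runtime rank `val` selects a template
// instantiation whose dimension count is a compile-time constant, which is
// what lets the "#pragma unroll" loops in the kernels above fully unroll.
#define SKETCH_DISPATCH_FUNC_BY_VALUE_WITH_TYPE_1(Func, T, val, ...)        \
  switch (val) {                                                            \
    case 1: Func<T, 1>(__VA_ARGS__); break;                                 \
    case 2: Func<T, 2>(__VA_ARGS__); break;                                 \
    case 3: Func<T, 3>(__VA_ARGS__); break;                                 \
    case 4: Func<T, 4>(__VA_ARGS__); break;                                 \
    case 5: Func<T, 5>(__VA_ARGS__); break;                                 \
    case 6: Func<T, 6>(__VA_ARGS__); break;                                 \
    case 7: Func<T, 7>(__VA_ARGS__); break;                                 \
    case 8: Func<T, 8>(__VA_ARGS__); break;                                 \
    default: break; /* rank already validated by CUDA_TENSOR_DIMS_CHECK */  \
  }
```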
#define DEFINE_SET_FUNC(T, ScalarT) \ #define DEFINE_SET_FUNC(T, ScalarT) \
template <> \ template <> \
DRAGON_API void Set<T, CUDAContext>( \ DRAGON_API void Set<T, CUDAContext>( \
const int x_ndim, \ const int x_ndim, \
const int64_t* x_dims, \ const int64_t* x_dims, \
const int y_ndim, \ const int y_ndim, \
const int64_t* y_dims, \ const int64_t* y_dims, \
const T* x, \ const T* x, \
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
int rows, cols; \ int rows, cols; \
vec64_t X_dims(x_dims, x_dims + x_ndim); \ vec64_t X_dims(x_dims, x_dims + x_ndim); \
vec64_t Y_dims(y_dims, y_dims + y_ndim); \ vec64_t Y_dims(y_dims, y_dims + y_ndim); \
vec64_t X_broadcast_dims, Y_broadcast_dims; \ vec64_t X_broadcast_dims, Y_broadcast_dims; \
math::utils::ComputeBinaryBroadcastDims( \ math::utils::ComputeBinaryBroadcastDims( \
X_dims, Y_dims, X_broadcast_dims, Y_broadcast_dims); \ X_dims, Y_dims, X_broadcast_dims, Y_broadcast_dims); \
if (X_broadcast_dims == Y_broadcast_dims) { \ if (X_broadcast_dims == Y_broadcast_dims) { \
auto count = std::accumulate( \ auto count = std::accumulate( \
x_dims, x_dims + x_ndim, 1, std::multiplies<int64_t>()); \ x_dims, x_dims + x_ndim, 1, std::multiplies<int64_t>()); \
Copy(count, x, y, ctx); \ Copy(count, x, y, ctx); \
return; \ return; \
} \ } \
if (math::utils::IsRowwiseBroadcast(X_dims, Y_dims, &rows, &cols)) { \ if (math::utils::IsRowwiseBroadcast(X_dims, Y_dims, &rows, &cols)) { \
const auto nthreads = rows * cols; \ const auto N = rows * cols; \
_RowwiseSet<<< \ _RowwiseSet<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
CUDA_BLOCKS(nthreads), \ N, \
CUDA_THREADS, \ cols, \
0, \ reinterpret_cast<const ScalarT*>(x), \
ctx->cuda_stream()>>>( \ reinterpret_cast<ScalarT*>(y)); \
nthreads, \ return; \
cols, \ } \
reinterpret_cast<const ScalarT*>(x), \ if (math::utils::IsColwiseBroadcast(X_dims, Y_dims, &rows, &cols)) { \
reinterpret_cast<ScalarT*>(y)); \ const auto N = rows * cols; \
return; \ _ColwiseSet<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
} \ N, \
if (math::utils::IsColwiseBroadcast(X_dims, Y_dims, &rows, &cols)) { \ cols, \
const auto nthreads = rows * cols; \ reinterpret_cast<const ScalarT*>(x), \
_ColwiseSet<<< \ reinterpret_cast<ScalarT*>(y)); \
CUDA_BLOCKS(nthreads), \ return; \
CUDA_THREADS, \ } \
0, \ vec64_t X_broadcast_strides, _; \
ctx->cuda_stream()>>>( \ CUDA_TENSOR_DIMS_CHECK(int(Y_dims.size())); \
nthreads, \ math::utils::ComputeBinaryBroadcastStrides( \
cols, \ X_dims, Y_dims, X_broadcast_strides, _, _); \
reinterpret_cast<const ScalarT*>(x), \ DISPATCH_FUNC_BY_VALUE_WITH_TYPE_1( \
reinterpret_cast<ScalarT*>(y)); \ _BroadcastSetImpl, \
return; \ ScalarT, \
} \ int(Y_dims.size()), \
vec64_t X_broadcast_strides, _; \ X_broadcast_strides.data(), \
CUDA_TENSOR_DIMS_CHECK((int)Y_dims.size()); \ Y_dims.data(), \
math::utils::ComputeBinaryBroadcastStrides( \ reinterpret_cast<const ScalarT*>(x), \
X_dims, Y_dims, X_broadcast_strides, _, _); \ reinterpret_cast<ScalarT*>(y), \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> strides, dims; \ ctx); \
const auto nthreads = std::accumulate( \
Y_dims.begin(), Y_dims.end(), 1, std::multiplies<int64_t>()); \
for (int i = 0; i < Y_dims.size(); ++i) { \
strides.data[i] = X_broadcast_strides[i]; \
dims.data[i] = Y_dims[i]; \
} \
_BroadcastSet<<< \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
nthreads, \
Y_dims.size(), \
strides, \
dims, \
reinterpret_cast<const ScalarT*>(x), \
reinterpret_cast<ScalarT*>(y)); \
} }
DEFINE_SET_FUNC(bool, uint8_t); DEFINE_SET_FUNC(bool, uint8_t);
...@@ -210,120 +258,107 @@ DEFINE_SET_FUNC(float, float); ...@@ -210,120 +258,107 @@ DEFINE_SET_FUNC(float, float);
DEFINE_SET_FUNC(double, double); DEFINE_SET_FUNC(double, double);
#undef DEFINE_SET_FUNC #undef DEFINE_SET_FUNC
#define DEFINE_BINARY_FUNC(name, InputT, OutputT, Functor) \ #define DEFINE_BINARY_FUNC(name, InputT, OutputT, Functor) \
template <> \ template <> \
DRAGON_API void name<InputT, CUDAContext>( \ DRAGON_API void name<InputT, CUDAContext>( \
const int a_ndim, \ const int a_ndim, \
const int64_t* a_dims, \ const int64_t* a_dims, \
const int b_ndim, \ const int b_ndim, \
const int64_t* b_dims, \ const int64_t* b_dims, \
const InputT* a, \ const InputT* a, \
const InputT* b, \ const InputT* b, \
OutputT* y, \ OutputT* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
int rows, cols, broadcast_1st; \ int rows, cols, broadcast_1st; \
vec64_t A_dims(a_dims, a_dims + a_ndim); \ vec64_t A_dims(a_dims, a_dims + a_ndim); \
vec64_t B_dims(b_dims, b_dims + b_ndim); \ vec64_t B_dims(b_dims, b_dims + b_ndim); \
vec64_t A_broadcast_dims, B_broadcast_dims; \ vec64_t A_broadcast_dims, B_broadcast_dims; \
math::utils::ComputeBinaryBroadcastDims( \ math::utils::ComputeBinaryBroadcastDims( \
A_dims, B_dims, A_broadcast_dims, B_broadcast_dims); \ A_dims, B_dims, A_broadcast_dims, B_broadcast_dims); \
if (A_broadcast_dims == B_broadcast_dims) { \ if (A_broadcast_dims == B_broadcast_dims) { \
auto count = std::accumulate( \ auto count = std::accumulate( \
a_dims, a_dims + a_ndim, 1, std::multiplies<int64_t>()); \ a_dims, a_dims + a_ndim, 1, std::multiplies<int64_t>()); \
name(count, a, b, y, ctx); \ name(count, a, b, y, ctx); \
return; \ return; \
} \ } \
if (math::utils::IsRowwiseBroadcast( \ if (math::utils::IsRowwiseBroadcast( \
A_dims, B_dims, &rows, &cols, &broadcast_1st)) { \ A_dims, B_dims, &rows, &cols, &broadcast_1st)) { \
const auto nthreads = rows * cols; \ const auto N = rows * cols; \
if (broadcast_1st > 0) { \ if (broadcast_1st > 0) { \
_RowwiseBinaryFunc< \ _RowwiseBinaryFunc< \
math::ScalarType<InputT>::type, \ math::ScalarType<InputT>::type, \
math::ScalarType<OutputT>::type, \ math::ScalarType<OutputT>::type, \
Functor<math::ScalarType<InputT>::type>, \ Functor<math::ScalarType<InputT>::type>, \
true> \ true><<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ N, \
nthreads, \ cols, \
cols, \ Functor<math::ScalarType<InputT>::type>(), \
Functor<math::ScalarType<InputT>::type>(), \ reinterpret_cast<const math::ScalarType<InputT>::type*>(a), \
reinterpret_cast<const math::ScalarType<InputT>::type*>(a), \ reinterpret_cast<const math::ScalarType<InputT>::type*>(b), \
reinterpret_cast<const math::ScalarType<InputT>::type*>(b), \ reinterpret_cast<math::ScalarType<OutputT>::type*>(y)); \
reinterpret_cast<math::ScalarType<OutputT>::type*>(y)); \ } else { \
} else { \ _RowwiseBinaryFunc< \
_RowwiseBinaryFunc< \ math::ScalarType<InputT>::type, \
math::ScalarType<InputT>::type, \ math::ScalarType<OutputT>::type, \
math::ScalarType<OutputT>::type, \ Functor<math::ScalarType<InputT>::type>, \
Functor<math::ScalarType<InputT>::type>, \ false><<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
false> \ N, \
<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ cols, \
nthreads, \ Functor<math::ScalarType<InputT>::type>(), \
cols, \ reinterpret_cast<const math::ScalarType<InputT>::type*>(a), \
Functor<math::ScalarType<InputT>::type>(), \ reinterpret_cast<const math::ScalarType<InputT>::type*>(b), \
reinterpret_cast<const math::ScalarType<InputT>::type*>(a), \ reinterpret_cast<math::ScalarType<OutputT>::type*>(y)); \
reinterpret_cast<const math::ScalarType<InputT>::type*>(b), \ } \
reinterpret_cast<math::ScalarType<OutputT>::type*>(y)); \ return; \
} \ } \
return; \ if (math::utils::IsColwiseBroadcast( \
} \ A_dims, B_dims, &rows, &cols, &broadcast_1st)) { \
if (math::utils::IsColwiseBroadcast( \ const auto N = rows * cols; \
A_dims, B_dims, &rows, &cols, &broadcast_1st)) { \ if (broadcast_1st > 0) { \
const auto nthreads = rows * cols; \ _ColwiseBinaryFunc< \
if (broadcast_1st > 0) { \ math::ScalarType<InputT>::type, \
_ColwiseBinaryFunc< \ math::ScalarType<OutputT>::type, \
math::ScalarType<InputT>::type, \ Functor<math::ScalarType<InputT>::type>, \
math::ScalarType<OutputT>::type, \ true><<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
Functor<math::ScalarType<InputT>::type>, \ N, \
true> \ cols, \
<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ Functor<math::ScalarType<InputT>::type>(), \
nthreads, \ reinterpret_cast<const math::ScalarType<InputT>::type*>(a), \
cols, \ reinterpret_cast<const math::ScalarType<InputT>::type*>(b), \
Functor<math::ScalarType<InputT>::type>(), \ reinterpret_cast<math::ScalarType<OutputT>::type*>(y)); \
reinterpret_cast<const math::ScalarType<InputT>::type*>(a), \ } else { \
reinterpret_cast<const math::ScalarType<InputT>::type*>(b), \ _ColwiseBinaryFunc< \
reinterpret_cast<math::ScalarType<OutputT>::type*>(y)); \ math::ScalarType<InputT>::type, \
} else { \ math::ScalarType<OutputT>::type, \
_ColwiseBinaryFunc< \ Functor<math::ScalarType<InputT>::type>, \
math::ScalarType<InputT>::type, \ false><<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
math::ScalarType<OutputT>::type, \ N, \
Functor<math::ScalarType<InputT>::type>, \ cols, \
false> \ Functor<math::ScalarType<InputT>::type>(), \
<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ reinterpret_cast<const math::ScalarType<InputT>::type*>(a), \
nthreads, \ reinterpret_cast<const math::ScalarType<InputT>::type*>(b), \
cols, \ reinterpret_cast<math::ScalarType<OutputT>::type*>(y)); \
Functor<math::ScalarType<InputT>::type>(), \ } \
reinterpret_cast<const math::ScalarType<InputT>::type*>(a), \ return; \
reinterpret_cast<const math::ScalarType<InputT>::type*>(b), \ } \
reinterpret_cast<math::ScalarType<OutputT>::type*>(y)); \ vec64_t A_broadcast_strides, B_broadcast_strides, Y_dims; \
} \ math::utils::ComputeBinaryBroadcastStrides( \
return; \ A_dims, B_dims, A_broadcast_strides, B_broadcast_strides, Y_dims); \
} \ CUDA_TENSOR_DIMS_CHECK(int(Y_dims.size())); \
vec64_t A_broadcast_strides, B_broadcast_strides, Y_dims; \ DISPATCH_FUNC_BY_VALUE_WITH_TYPE_3( \
math::utils::ComputeBinaryBroadcastStrides( \ _BroadcastBinaryFuncImpl, \
A_dims, B_dims, A_broadcast_strides, B_broadcast_strides, Y_dims); \ math::ScalarType<InputT>::type, \
CUDA_TENSOR_DIMS_CHECK((int)Y_dims.size()); \ math::ScalarType<OutputT>::type, \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> a_strides, b_strides, y_dims; \ Functor<math::ScalarType<InputT>::type>, \
const auto nthreads = std::accumulate( \ int(Y_dims.size()), \
Y_dims.begin(), Y_dims.end(), 1, std::multiplies<int64_t>()); \ A_broadcast_strides.data(), \
for (int i = 0; i < Y_dims.size(); ++i) { \ B_broadcast_strides.data(), \
a_strides.data[i] = A_broadcast_strides[i]; \ Y_dims.data(), \
b_strides.data[i] = B_broadcast_strides[i]; \ Functor<math::ScalarType<InputT>::type>(), \
y_dims.data[i] = Y_dims[i]; \ reinterpret_cast<const math::ScalarType<InputT>::type*>(a), \
} \ reinterpret_cast<const math::ScalarType<InputT>::type*>(b), \
_BroadcastBinaryFunc< \ reinterpret_cast<math::ScalarType<OutputT>::type*>(y), \
math::ScalarType<InputT>::type, \ ctx); \
math::ScalarType<OutputT>::type, \
Functor<math::ScalarType<InputT>::type>, \
CUDA_TENSOR_MAX_DIMS> \
<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
nthreads, \
Y_dims.size(), \
a_strides, \
b_strides, \
y_dims, \
Functor<math::ScalarType<InputT>::type>(), \
reinterpret_cast<const math::ScalarType<InputT>::type*>(a), \
reinterpret_cast<const math::ScalarType<InputT>::type*>(b), \
reinterpret_cast<math::ScalarType<OutputT>::type*>(y)); \
} }
DEFINE_BINARY_FUNC(Add, uint8_t, uint8_t, math::PlusFunctor); DEFINE_BINARY_FUNC(Add, uint8_t, uint8_t, math::PlusFunctor);
...@@ -484,73 +519,61 @@ DEFINE_BINARY_FUNC(Greater, bool, bool, uint8_t, bool); ...@@ -484,73 +519,61 @@ DEFINE_BINARY_FUNC(Greater, bool, bool, uint8_t, bool);
DEFINE_BINARY_FUNC(GreaterEqual, bool, bool, uint8_t, bool); DEFINE_BINARY_FUNC(GreaterEqual, bool, bool, uint8_t, bool);
#undef DEFINE_BINARY_FUNC #undef DEFINE_BINARY_FUNC
#define DEFINE_WHERE_FUNC(T, ScalarT) \ #define DEFINE_WHERE_FUNC(T, ScalarT) \
template <> \ template <> \
DRAGON_API void Where<T, CUDAContext>( \ DRAGON_API void Where<T, CUDAContext>( \
const int a_ndim, \ const int a_ndim, \
const int64_t* a_dims, \ const int64_t* a_dims, \
const int b_ndim, \ const int b_ndim, \
const int64_t* b_dims, \ const int64_t* b_dims, \
const int c_ndim, \ const int c_ndim, \
const int64_t* c_dims, \ const int64_t* c_dims, \
const T* a, \ const T* a, \
const T* b, \ const T* b, \
const bool* c, \ const bool* c, \
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
vec64_t A_dims(a_dims, a_dims + a_ndim); \ vec64_t A_dims(a_dims, a_dims + a_ndim); \
vec64_t B_dims(b_dims, b_dims + b_ndim); \ vec64_t B_dims(b_dims, b_dims + b_ndim); \
vec64_t C_dims(c_dims, c_dims + c_ndim); \ vec64_t C_dims(c_dims, c_dims + c_ndim); \
vec64_t A_broadcast_dims, B_broadcast_dims, C_broadcast_dims; \ vec64_t A_broadcast_dims, B_broadcast_dims, C_broadcast_dims; \
vec64_t A_broadcast_strides, B_broadcast_strides, C_broadcast_strides; \ vec64_t A_broadcast_strides, B_broadcast_strides, C_broadcast_strides; \
vec64_t Y_dims, _, __; \ vec64_t Y_dims, _, __; \
math::utils::ComputeBinaryBroadcastStrides(A_dims, B_dims, _, _, __); \ math::utils::ComputeBinaryBroadcastStrides(A_dims, B_dims, _, _, __); \
math::utils::ComputeBinaryBroadcastStrides(C_dims, __, _, _, Y_dims); \ math::utils::ComputeBinaryBroadcastStrides(C_dims, __, _, _, Y_dims); \
math::utils::ComputeBinaryBroadcastDims( \ math::utils::ComputeBinaryBroadcastDims( \
A_dims, Y_dims, A_broadcast_dims, _); \ A_dims, Y_dims, A_broadcast_dims, _); \
math::utils::ComputeBinaryBroadcastDims( \ math::utils::ComputeBinaryBroadcastDims( \
B_dims, Y_dims, B_broadcast_dims, _); \ B_dims, Y_dims, B_broadcast_dims, _); \
math::utils::ComputeBinaryBroadcastDims( \ math::utils::ComputeBinaryBroadcastDims( \
C_dims, Y_dims, C_broadcast_dims, _); \ C_dims, Y_dims, C_broadcast_dims, _); \
if (A_broadcast_dims == B_broadcast_dims && \ if (A_broadcast_dims == B_broadcast_dims && \
B_broadcast_dims == C_broadcast_dims) { \ B_broadcast_dims == C_broadcast_dims) { \
auto count = std::accumulate( \ auto count = std::accumulate( \
a_dims, a_dims + a_ndim, 1, std::multiplies<int64_t>()); \ a_dims, a_dims + a_ndim, 1, std::multiplies<int64_t>()); \
Where(count, a, b, c, y, ctx); \ Where(count, a, b, c, y, ctx); \
return; \ return; \
} \ } \
CUDA_TENSOR_DIMS_CHECK((int)Y_dims.size()); \ CUDA_TENSOR_DIMS_CHECK((int)Y_dims.size()); \
math::utils::ComputeBinaryBroadcastStrides( \ math::utils::ComputeBinaryBroadcastStrides( \
A_dims, Y_dims, A_broadcast_strides, _, _); \ A_dims, Y_dims, A_broadcast_strides, _, _); \
math::utils::ComputeBinaryBroadcastStrides( \ math::utils::ComputeBinaryBroadcastStrides( \
B_dims, Y_dims, B_broadcast_strides, _, _); \ B_dims, Y_dims, B_broadcast_strides, _, _); \
math::utils::ComputeBinaryBroadcastStrides( \ math::utils::ComputeBinaryBroadcastStrides( \
C_dims, Y_dims, C_broadcast_strides, _, _); \ C_dims, Y_dims, C_broadcast_strides, _, _); \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> a_strides, b_strides, c_strides; \ DISPATCH_FUNC_BY_VALUE_WITH_TYPE_1( \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> y_dims; \ _BroadcastWhereImpl, \
const auto nthreads = std::accumulate( \ ScalarT, \
Y_dims.begin(), Y_dims.end(), 1, std::multiplies<int64_t>()); \ int(Y_dims.size()), \
for (int i = 0; i < Y_dims.size(); ++i) { \ A_broadcast_strides.data(), \
a_strides.data[i] = A_broadcast_strides[i]; \ B_broadcast_strides.data(), \
b_strides.data[i] = B_broadcast_strides[i]; \ C_broadcast_strides.data(), \
c_strides.data[i] = C_broadcast_strides[i]; \ Y_dims.data(), \
y_dims.data[i] = Y_dims[i]; \ reinterpret_cast<const ScalarT*>(a), \
} \ reinterpret_cast<const ScalarT*>(b), \
_BroadcastWhere<<< \ reinterpret_cast<const uint8_t*>(c), \
CUDA_BLOCKS(nthreads), \ reinterpret_cast<ScalarT*>(y), \
CUDA_THREADS, \ ctx); \
0, \
ctx->cuda_stream()>>>( \
nthreads, \
Y_dims.size(), \
a_strides, \
b_strides, \
c_strides, \
y_dims, \
reinterpret_cast<const ScalarT*>(a), \
reinterpret_cast<const ScalarT*>(b), \
reinterpret_cast<const uint8_t*>(c), \
reinterpret_cast<ScalarT*>(y)); \
} }
DEFINE_WHERE_FUNC(bool, uint8_t); DEFINE_WHERE_FUNC(bool, uint8_t);
......
...@@ -62,7 +62,6 @@ template <typename T, typename AccT, class Reducer, int D> ...@@ -62,7 +62,6 @@ template <typename T, typename AccT, class Reducer, int D>
__global__ void _GenericReduce( __global__ void _GenericReduce(
const int rows, const int rows,
const int cols, const int cols,
const int num_dims,
const SimpleArray<int, D> x_dims, const SimpleArray<int, D> x_dims,
const SimpleArray<int, D> x_strides, const SimpleArray<int, D> x_strides,
const Reducer reducer, const Reducer reducer,
...@@ -75,7 +74,8 @@ __global__ void _GenericReduce( ...@@ -75,7 +74,8 @@ __global__ void _GenericReduce(
AccT val = init; AccT val = init;
CUDA_2D_KERNEL_LOOP2(j, cols) { CUDA_2D_KERNEL_LOOP2(j, cols) {
int xi = 0, c = i * cols + j; int xi = 0, c = i * cols + j;
for (int d = num_dims - 1; d >= 0; --d) { #pragma unroll
for (int d = D - 1; d >= 0; --d) {
int r; int r;
FIXED_DIVISOR_DIV_MOD(x_dims.data[d], c, &c, &r); FIXED_DIVISOR_DIV_MOD(x_dims.data[d], c, &c, &r);
xi += r * x_strides.data[d]; xi += r * x_strides.data[d];
...@@ -89,66 +89,92 @@ __global__ void _GenericReduce( ...@@ -89,66 +89,92 @@ __global__ void _GenericReduce(
} }
} }
#define DEFINE_REDUCE_DISPATCHER(name) \ template <typename T, typename AccT, class Reducer, int D>
template <typename T, typename AccT, typename Reducer> \ void _GenericReduceImpl(
void _Reduce##name( \ const int* dims,
const int num_dims, \ const int num_axes,
const int* dims, \ const int* axes,
const int num_axes, \ const Reducer reducer,
const int* axes, \ const AccT init,
const Reducer reducer, \ const AccT scale,
const AccT init, \ const T* x,
const AccT scale, \ T* y,
const T* x, \ CUDAContext* ctx) {
T* y, \ SimpleArray<int, D> transpose_axes;
CUDAContext* ctx) { \ SimpleArray<int, D> transpose_strides;
int rows, cols; \ SimpleArray<int, D> transpose_dims;
vec32_t out_dims(dims, dims + num_dims); \ math::utils::TransposeAxesForReduce(D, num_axes, axes, transpose_axes.data);
for (int i = 0; i < num_axes; ++i) { \ math::utils::ComputeTransposeStrides(
out_dims[axes[i]] = 1; \ D, dims, transpose_axes.data, transpose_strides.data);
} \ int rows = 1, cols = 1;
if (math::utils::IsRowwiseReduce( \ const int pivot = D - num_axes;
num_dims, dims, out_dims.data(), &rows, &cols)) { \ for (int i = 0; i < pivot; ++i) {
_RowwiseReduce<<<cols, CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ rows *= dims[transpose_axes.data[i]];
rows, cols, reducer, init, scale, x, y); \ }
return; \ for (int i = pivot; i < D; ++i) {
} \ cols *= dims[transpose_axes.data[i]];
if (math::utils::IsColwiseReduce( \ }
num_dims, dims, out_dims.data(), &rows, &cols)) { \ for (int i = 0; i < D; ++i) {
_ColwiseReduce<<<rows, CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ transpose_dims.data[i] = dims[transpose_axes.data[i]];
rows, cols, reducer, init, scale, x, y); \ }
return; \ _GenericReduce<<<rows, CUDA_THREADS, 0, ctx->cuda_stream()>>>(
} \ rows,
CUDA_TENSOR_DIMS_CHECK(num_dims); \ cols,
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> transpose_axes; \ transpose_dims,
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> transpose_strides; \ transpose_strides,
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> transpose_dims; \ reducer,
math::utils::TransposeAxesForReduce( \ init,
num_dims, num_axes, axes, transpose_axes.data); \ scale,
math::utils::ComputeTransposeStrides( \ x,
num_dims, dims, transpose_axes.data, transpose_strides.data); \ y);
rows = cols = 1; \ }
const int pivot = num_dims - num_axes; \
for (int i = 0; i < pivot; ++i) { \ #define DEFINE_REDUCE_DISPATCHER(name) \
rows *= dims[transpose_axes.data[i]]; \ template <typename T, typename AccT, typename Reducer> \
} \ void _Reduce##name( \
for (int i = pivot; i < num_dims; ++i) { \ const int num_dims, \
cols *= dims[transpose_axes.data[i]]; \ const int* dims, \
} \ const int num_axes, \
for (int i = 0; i < num_dims; ++i) { \ const int* axes, \
transpose_dims.data[i] = dims[transpose_axes.data[i]]; \ const Reducer reducer, \
} \ const AccT init, \
_GenericReduce<<<rows, CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ const AccT scale, \
rows, \ const T* x, \
cols, \ T* y, \
num_dims, \ CUDAContext* ctx) { \
transpose_dims, \ int rows, cols; \
transpose_strides, \ vec32_t out_dims(dims, dims + num_dims); \
reducer, \ for (int i = 0; i < num_axes; ++i) { \
init, \ out_dims[axes[i]] = 1; \
scale, \ } \
x, \ if (math::utils::IsRowwiseReduce( \
y); \ num_dims, dims, out_dims.data(), &rows, &cols)) { \
_RowwiseReduce<<<cols, CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
rows, cols, reducer, init, scale, x, y); \
return; \
} \
if (math::utils::IsColwiseReduce( \
num_dims, dims, out_dims.data(), &rows, &cols)) { \
_ColwiseReduce<<<rows, CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
rows, cols, reducer, init, scale, x, y); \
return; \
} \
CUDA_TENSOR_DIMS_CHECK(num_dims); \
DISPATCH_FUNC_BY_VALUE_WITH_TYPE_3( \
_GenericReduceImpl, \
T, \
AccT, \
Reducer, \
num_dims, \
dims, \
num_axes, \
axes, \
reducer, \
init, \
scale, \
x, \
y, \
ctx); \
} }
DEFINE_REDUCE_DISPATCHER(Max); DEFINE_REDUCE_DISPATCHER(Max);
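
When neither the rowwise nor the colwise fast path applies, the dispatcher permutes the reduced axes to the innermost positions and treats the tensor as a `rows x cols` matrix. A small sketch of that split, mirroring the loop in `_GenericReduceImpl` (the worked numbers are illustrative only):

```cpp
#include <vector>

// Kept axes come first in transpose_axes and form the rows; the num_axes
// reduced axes come last and form the columns of a 2D reduction.
void SplitForReduce(const std::vector<int>& dims,
                    const std::vector<int>& transpose_axes,
                    int num_axes, int* rows, int* cols) {
  const int D = static_cast<int>(dims.size());
  const int pivot = D - num_axes;
  *rows = *cols = 1;
  for (int i = 0; i < pivot; ++i) *rows *= dims[transpose_axes[i]];
  for (int i = pivot; i < D; ++i) *cols *= dims[transpose_axes[i]];
}
// e.g. dims = {2, 3, 4, 5}, reducing axes {1, 3} with transpose_axes = {0, 2, 1, 3}
// gives rows = 2 * 4 = 8 and cols = 3 * 5 = 15: one block per row, 15 items reduced.
```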
......
...@@ -311,14 +311,41 @@ inline void ComputeTransposeStrides( ...@@ -311,14 +311,41 @@ inline void ComputeTransposeStrides(
} }
} }
template <typename DimT, typename AxisT>
inline void CollapseTransposeAxes(
const int num_dims,
const DimT* dims,
const AxisT* axes,
vector<DimT>& new_dims,
vector<AxisT>& new_axes) {
new_dims = vector<DimT>(dims, dims + num_dims);
new_axes = vector<AxisT>({axes[0]});
vector<AxisT> collapse_axes;
for (int i = 1; i < num_dims; ++i) {
if (axes[i] - 1 == axes[i - 1]) {
collapse_axes.push_back(axes[i]);
new_dims[axes[i]] *= new_dims[axes[i] - 1];
new_dims[axes[i] - 1] = -1;
} else {
new_axes.push_back(axes[i]);
}
}
const auto& erase_iter = std::remove_if(
new_dims.begin(), new_dims.end(), [](int x) { return x == -1; });
new_dims.erase(erase_iter, new_dims.end());
for (int i = 0; i < new_axes.size(); ++i) {
for (auto collapse_axis : collapse_axes) {
if (new_axes[i] > collapse_axis) new_axes[i]--;
}
}
}
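
A hedged usage sketch of `CollapseTransposeAxes`: axes that are contiguous in the input and remain adjacent in the permutation are merged, reducing the rank the transpose kernels have to handle. The `vector` alias in the header is assumed to be `std::vector`:

```cpp
#include <cstdint>
#include <vector>

// Illustrative only; CollapseTransposeAxes is the template defined above.
void CollapseExample() {
  std::vector<int64_t> dims = {2, 3, 4, 5};
  std::vector<int> axes = {0, 2, 3, 1};  // transpose {2,3,4,5} -> {2,4,5,3}
  std::vector<int64_t> new_dims;
  std::vector<int> new_axes;
  CollapseTransposeAxes(4, dims.data(), axes.data(), new_dims, new_axes);
  // Axes 2 and 3 stay adjacent after the permutation, so they merge:
  //   new_dims == {2, 3, 20}  and  new_axes == {0, 2, 1},
  // i.e. the same transpose expressed over a rank-3 tensor.
}
```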
template <typename DimT, typename IndexT> template <typename DimT, typename IndexT>
inline IndexT inline IndexT
GetIndexFromDims(const int num_dims, const DimT* dims, IndexT* index) { GetIndexFromDims(const int num_dims, const DimT* dims, IndexT* index) {
IndexT ret = 0; IndexT ret = 0;
for (int i = 0; i < num_dims; ++i) { for (int i = 0; i < num_dims; ++i) {
if (dims[i] > 1) { if (dims[i] > 1) ret = ret * dims[i] + index[i];
ret = ret * dims[i] + index[i];
}
} }
return ret; return ret;
} }
......
...@@ -267,7 +267,7 @@ def uniform_(tensor, a=0, b=1): ...@@ -267,7 +267,7 @@ def uniform_(tensor, a=0, b=1):
---------- ----------
tensor : dragon.vm.torch.Tensor tensor : dragon.vm.torch.Tensor
The input tensor. The input tensor.
a : number, optional, default=-1 a : number, optional, default=0
The value to :math:`\alpha`. The value to :math:`\alpha`.
b : number, optional, default=1 b : number, optional, default=1
The value to :math:`\beta`. The value to :math:`\beta`.
......
...@@ -390,7 +390,7 @@ class MultiheadAttention(Module): ...@@ -390,7 +390,7 @@ class MultiheadAttention(Module):
self.in_proj_bias = Parameter(Tensor(3 * embed_dim)) self.in_proj_bias = Parameter(Tensor(3 * embed_dim))
else: else:
self.register_parameter('in_proj_bias', None) self.register_parameter('in_proj_bias', None)
self.out_proj = Linear(embed_dim, embed_dim, bias=True) self.out_proj = Linear(embed_dim, embed_dim, bias=bias)
self.reset_parameters() self.reset_parameters()
def reset_parameters(self): def reset_parameters(self):
......