Commit 46feba80 by Ting PAN

Instantiate dispatch template by value for crucial CUDA kernels

Summary:
This commit instantiates CUDA kernels with constant (compile-time) dimensions,
enabling compiler optimizations such as loop unrolling.
1 parent 936c351b
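The launchers below replace the runtime "num_dims" argument with a template value D, selected through the DISPATCH_FUNC_BY_VALUE_WITH_TYPE_* macros. The definition of those macros is not part of this diff; a minimal sketch of the mechanism, assuming the same 8-dimension limit that the old Transpose launcher switched over explicitly (CUDA_TENSOR_MAX_DIMS), could look like the following (macro names and arity are illustrative only, not the repository's actual header):

// Hypothetical sketch of a by-value dispatcher: switch on the runtime
// dimension count and instantiate the implementation with a constant <D>,
// so the per-element index loops become unrollable at compile time.
#define DISPATCH_CASE_1(Func, T, D, ...) \
  case D: {                              \
    Func<T, D>(__VA_ARGS__);             \
    break;                               \
  }

#define DISPATCH_FUNC_BY_VALUE_WITH_TYPE_1(Func, T, dim, ...) \
  switch (dim) {                                              \
    DISPATCH_CASE_1(Func, T, 1, __VA_ARGS__)                  \
    DISPATCH_CASE_1(Func, T, 2, __VA_ARGS__)                  \
    DISPATCH_CASE_1(Func, T, 3, __VA_ARGS__)                  \
    DISPATCH_CASE_1(Func, T, 4, __VA_ARGS__)                  \
    DISPATCH_CASE_1(Func, T, 5, __VA_ARGS__)                  \
    DISPATCH_CASE_1(Func, T, 6, __VA_ARGS__)                  \
    DISPATCH_CASE_1(Func, T, 7, __VA_ARGS__)                  \
    DISPATCH_CASE_1(Func, T, 8, __VA_ARGS__)                  \
    default:                                                  \
      LOG(FATAL) << "Unsupported dimensions: " << dim;        \
  }

With D known at compile time, loops such as "for (d = D - 1; d >= 0; --d)" can carry "#pragma unroll", which is exactly what the kernel changes below add.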
FROM ubuntu:16.04 FROM ubuntu:18.04
RUN \ RUN \
apt-get update && apt-get install -y \ apt-get update && apt-get install -y \
...@@ -43,8 +43,8 @@ RUN \ ...@@ -43,8 +43,8 @@ RUN \
-DPYTHON_EXECUTABLE=/usr/bin/python3 \ -DPYTHON_EXECUTABLE=/usr/bin/python3 \
-DUSE_CUDA=OFF \ -DUSE_CUDA=OFF \
-DUSE_CUDNN=OFF \ -DUSE_CUDNN=OFF \
-DUSE_AVX2=OFF \ -DUSE_AVX2=ON \
-DUSE_FMA=OFF && \ -DUSE_FMA=ON && \
make install -j $(nproc) && \ make install -j $(nproc) && \
cd .. && rm -rf build && \ cd .. && rm -rf build && \
python3 setup.py install python3 setup.py install
......
FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04 FROM nvidia/cuda:10.2-cudnn8-devel-ubuntu18.04
RUN \ RUN \
rm /etc/apt/sources.list.d/cuda.list && \ rm /etc/apt/sources.list.d/cuda.list && \
...@@ -48,8 +48,8 @@ RUN \ ...@@ -48,8 +48,8 @@ RUN \
-DPYTHON_EXECUTABLE=/usr/bin/python3 \ -DPYTHON_EXECUTABLE=/usr/bin/python3 \
-DUSE_MPI=ON \ -DUSE_MPI=ON \
-DUSE_NCCL=ON \ -DUSE_NCCL=ON \
-DUSE_AVX2=OFF \ -DUSE_AVX2=ON \
-DUSE_FMA=OFF && \ -DUSE_FMA=ON && \
make install -j $(nproc) && \ make install -j $(nproc) && \
cd .. && rm -rf build && \ cd .. && rm -rf build && \
python3 setup.py install python3 setup.py install
......
...@@ -62,10 +62,6 @@ class CUDAObjects { ...@@ -62,10 +62,6 @@ class CUDAObjects {
} else { } else {
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
} }
#elif CUDA_VERSION >= 9000
if (TENSOR_CORE_AVAILABLE()) {
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
}
#endif #endif
} }
return handles[stream_id]; return handles[stream_id];
...@@ -437,7 +433,8 @@ class DRAGON_API CUDAContext { ...@@ -437,7 +433,8 @@ class DRAGON_API CUDAContext {
CUDA_NOT_COMPILED; CUDA_NOT_COMPILED;
} }
/*! \brief Switch to the device and select given stream in current thread */ /*! \brief Switch to the device and select given stream in current
* thread */
void SwitchToDevice(int stream_id) { void SwitchToDevice(int stream_id) {
CUDA_NOT_COMPILED; CUDA_NOT_COMPILED;
} }
......
...@@ -13,7 +13,6 @@ namespace { ...@@ -13,7 +13,6 @@ namespace {
template <typename T, int D> template <typename T, int D>
__global__ void _ConstPad( __global__ void _ConstPad(
const int N, const int N,
const int num_dims,
const SimpleArray<int, D> X_dims, const SimpleArray<int, D> X_dims,
const SimpleArray<int, D> X_strides, const SimpleArray<int, D> X_strides,
const SimpleArray<int, D> Y_dims, const SimpleArray<int, D> Y_dims,
...@@ -23,7 +22,8 @@ __global__ void _ConstPad( ...@@ -23,7 +22,8 @@ __global__ void _ConstPad(
T* y) { T* y) {
CUDA_1D_KERNEL_LOOP(yi, N) { CUDA_1D_KERNEL_LOOP(yi, N) {
int xi = 0, tmp = yi, d; int xi = 0, tmp = yi, d;
for (d = num_dims - 1; d >= 0; --d) { #pragma unroll
for (d = D - 1; d >= 0; --d) {
int r; int r;
FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r); FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r);
r -= X_pads.data[d]; r -= X_pads.data[d];
...@@ -37,7 +37,6 @@ __global__ void _ConstPad( ...@@ -37,7 +37,6 @@ __global__ void _ConstPad(
template <typename T, int D> template <typename T, int D>
__global__ void _ReflectPad( __global__ void _ReflectPad(
const int N, const int N,
const int num_dims,
const SimpleArray<int, D> X_dims, const SimpleArray<int, D> X_dims,
const SimpleArray<int, D> X_strides, const SimpleArray<int, D> X_strides,
const SimpleArray<int, D> Y_dims, const SimpleArray<int, D> Y_dims,
...@@ -46,7 +45,8 @@ __global__ void _ReflectPad( ...@@ -46,7 +45,8 @@ __global__ void _ReflectPad(
T* y) { T* y) {
CUDA_1D_KERNEL_LOOP(yi, N) { CUDA_1D_KERNEL_LOOP(yi, N) {
int xi = 0, tmp = yi; int xi = 0, tmp = yi;
for (int d = num_dims - 1; d >= 0; --d) { #pragma unroll
for (int d = D - 1; d >= 0; --d) {
int r; int r;
FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r); FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r);
r -= X_pads.data[d]; r -= X_pads.data[d];
...@@ -61,7 +61,6 @@ __global__ void _ReflectPad( ...@@ -61,7 +61,6 @@ __global__ void _ReflectPad(
template <typename T, int D> template <typename T, int D>
__global__ void _EdgePad( __global__ void _EdgePad(
const int N, const int N,
const int num_dims,
const SimpleArray<int, D> X_dims, const SimpleArray<int, D> X_dims,
const SimpleArray<int, D> X_strides, const SimpleArray<int, D> X_strides,
const SimpleArray<int, D> Y_dims, const SimpleArray<int, D> Y_dims,
...@@ -70,7 +69,8 @@ __global__ void _EdgePad( ...@@ -70,7 +69,8 @@ __global__ void _EdgePad(
T* y) { T* y) {
CUDA_1D_KERNEL_LOOP(yi, N) { CUDA_1D_KERNEL_LOOP(yi, N) {
int xi = 0, tmp = yi; int xi = 0, tmp = yi;
for (int d = num_dims - 1; d >= 0; --d) { #pragma unroll
for (int d = D - 1; d >= 0; --d) {
int r; int r;
FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r); FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r);
r = min(X_dims.data[d] - 1, max(r - X_pads.data[d], 0)); r = min(X_dims.data[d] - 1, max(r - X_pads.data[d], 0));
...@@ -80,13 +80,47 @@ __global__ void _EdgePad( ...@@ -80,13 +80,47 @@ __global__ void _EdgePad(
} }
} }
template <typename T, int D>
void _PadImpl(
const int64_t* x_dims,
const int64_t* x_strides,
const int64_t* y_dims,
const int64_t* pads,
const float value,
const string& mode,
const T* x,
T* y,
CUDAContext* ctx) {
SimpleArray<int, D> X_dims, X_strides, Y_dims, X_pads;
const auto N =
std::accumulate(y_dims, y_dims + D, 1, std::multiplies<int64_t>());
for (int i = 0; i < D; ++i) {
X_dims.data[i] = x_dims[i];
X_strides.data[i] = x_strides[i];
Y_dims.data[i] = y_dims[i];
X_pads.data[i] = pads[i];
}
if (mode == "ConstPad") {
_ConstPad<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, X_dims, X_strides, Y_dims, X_pads, convert::To<T>(value), x, y);
} else if (mode == "ReflectPad") {
_ReflectPad<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, X_dims, X_strides, Y_dims, X_pads, x, y);
} else if (mode == "EdgePad") {
_EdgePad<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, X_dims, X_strides, Y_dims, X_pads, x, y);
} else {
LOG(FATAL) << "Unknown Pad: " << mode << ".";
}
}
} // namespace } // namespace
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
#define DEFINE_CONST_KERNEL_LAUNCHER(T) \ #define DEFINE_KERNEL_LAUNCHER(name, T) \
template <> \ template <> \
void ConstPad<T, CUDAContext>( \ void name<T, CUDAContext>( \
const int num_dims, \ const int num_dims, \
const int64_t* x_dims, \ const int64_t* x_dims, \
const int64_t* x_strides, \ const int64_t* x_strides, \
...@@ -97,27 +131,31 @@ __global__ void _EdgePad( ...@@ -97,27 +131,31 @@ __global__ void _EdgePad(
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
CUDA_TENSOR_DIMS_CHECK(num_dims); \ CUDA_TENSOR_DIMS_CHECK(num_dims); \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_dims, X_strides, Y_dims, X_pads; \ DISPATCH_FUNC_BY_VALUE_WITH_TYPE_1( \
const auto N = std::accumulate( \ _PadImpl, \
y_dims, y_dims + num_dims, 1, std::multiplies<int64_t>()); \ T, \
for (int i = 0; i < num_dims; ++i) { \
X_dims.data[i] = x_dims[i]; \
X_strides.data[i] = x_strides[i]; \
Y_dims.data[i] = y_dims[i]; \
X_pads.data[i] = pads[i]; \
} \
_ConstPad<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
num_dims, \ num_dims, \
X_dims, \ x_dims, \
X_strides, \ x_strides, \
Y_dims, \ y_dims, \
X_pads, \ pads, \
convert::To<T>(value), \ value, \
#name, \
x, \ x, \
y); \ y, \
ctx); \
} }
DEFINE_KERNEL_LAUNCHER(ConstPad, bool);
DEFINE_KERNEL_LAUNCHER(ConstPad, uint8_t);
DEFINE_KERNEL_LAUNCHER(ConstPad, int8_t);
DEFINE_KERNEL_LAUNCHER(ConstPad, int);
DEFINE_KERNEL_LAUNCHER(ConstPad, int64_t);
DEFINE_KERNEL_LAUNCHER(ConstPad, float16);
DEFINE_KERNEL_LAUNCHER(ConstPad, float);
DEFINE_KERNEL_LAUNCHER(ConstPad, double);
#undef DEFINE_KERNEL_LAUNCHER
#define DEFINE_KERNEL_LAUNCHER(name, T) \ #define DEFINE_KERNEL_LAUNCHER(name, T) \
template <> \ template <> \
void name<T, CUDAContext>( \ void name<T, CUDAContext>( \
...@@ -130,27 +168,21 @@ __global__ void _EdgePad( ...@@ -130,27 +168,21 @@ __global__ void _EdgePad(
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
CUDA_TENSOR_DIMS_CHECK(num_dims); \ CUDA_TENSOR_DIMS_CHECK(num_dims); \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_dims, X_strides, Y_dims, X_pads; \ DISPATCH_FUNC_BY_VALUE_WITH_TYPE_1( \
const auto N = std::accumulate( \ _PadImpl, \
y_dims, y_dims + num_dims, 1, std::multiplies<int64_t>()); \ T, \
for (int i = 0; i < num_dims; ++i) { \ num_dims, \
X_dims.data[i] = x_dims[i]; \ x_dims, \
X_strides.data[i] = x_strides[i]; \ x_strides, \
Y_dims.data[i] = y_dims[i]; \ y_dims, \
X_pads.data[i] = pads[i]; \ pads, \
} \ 0.f, \
_##name<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ #name, \
N, num_dims, X_dims, X_strides, Y_dims, X_pads, x, y); \ x, \
y, \
ctx); \
} }
DEFINE_CONST_KERNEL_LAUNCHER(bool);
DEFINE_CONST_KERNEL_LAUNCHER(uint8_t);
DEFINE_CONST_KERNEL_LAUNCHER(int8_t);
DEFINE_CONST_KERNEL_LAUNCHER(int);
DEFINE_CONST_KERNEL_LAUNCHER(int64_t);
DEFINE_CONST_KERNEL_LAUNCHER(float16);
DEFINE_CONST_KERNEL_LAUNCHER(float);
DEFINE_CONST_KERNEL_LAUNCHER(double);
DEFINE_KERNEL_LAUNCHER(ReflectPad, bool); DEFINE_KERNEL_LAUNCHER(ReflectPad, bool);
DEFINE_KERNEL_LAUNCHER(ReflectPad, uint8_t); DEFINE_KERNEL_LAUNCHER(ReflectPad, uint8_t);
DEFINE_KERNEL_LAUNCHER(ReflectPad, int8_t); DEFINE_KERNEL_LAUNCHER(ReflectPad, int8_t);
...@@ -167,7 +199,6 @@ DEFINE_KERNEL_LAUNCHER(EdgePad, int64_t); ...@@ -167,7 +199,6 @@ DEFINE_KERNEL_LAUNCHER(EdgePad, int64_t);
DEFINE_KERNEL_LAUNCHER(EdgePad, float16); DEFINE_KERNEL_LAUNCHER(EdgePad, float16);
DEFINE_KERNEL_LAUNCHER(EdgePad, float); DEFINE_KERNEL_LAUNCHER(EdgePad, float);
DEFINE_KERNEL_LAUNCHER(EdgePad, double); DEFINE_KERNEL_LAUNCHER(EdgePad, double);
#undef DEFINE_CONST_KERNEL_LAUNCHER
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels } // namespace kernels
......
...@@ -10,27 +10,48 @@ namespace kernels { ...@@ -10,27 +10,48 @@ namespace kernels {
namespace { namespace {
template <typename T, int D> template <typename T, typename AccT, int D>
__global__ void _ReduceSumGrad( __global__ void _ReduceSumGrad(
const int N, const int N,
const int num_dims,
const SimpleArray<int, D> X_dims, const SimpleArray<int, D> X_dims,
const SimpleArray<int, D> Y_dims, const SimpleArray<int, D> Y_dims,
const SimpleArray<int, D> Y_strides, const SimpleArray<int, D> Y_strides,
const float scale, const AccT scale,
const T* dy, const T* dy,
T* dx) { T* dx) {
CUDA_1D_KERNEL_LOOP(xi, N) { CUDA_1D_KERNEL_LOOP(xi, N) {
int yi = 0, tmp = xi; int yi = 0, tmp = xi;
for (int d = num_dims - 1; d >= 0; --d) { #pragma unroll
for (int d = D - 1; d >= 0; --d) {
int r; int r;
FIXED_DIVISOR_DIV_MOD(X_dims.data[d], tmp, &tmp, &r); FIXED_DIVISOR_DIV_MOD(X_dims.data[d], tmp, &tmp, &r);
yi += (r % Y_dims.data[d]) * Y_strides.data[d]; yi += (r % Y_dims.data[d]) * Y_strides.data[d];
} }
dx[xi] = convert::To<T>(convert::To<float>(__ldg(dy + yi)) * scale); dx[xi] = convert::To<T>(convert::To<AccT>(__ldg(dy + yi)) * scale);
} }
} }
template <typename T, typename AccT, int D>
void _ReduceSumGradImpl(
const int64_t* x_dims,
const int64_t* y_dims,
const int64_t* y_strides,
const AccT scale,
const T* dy,
T* dx,
CUDAContext* ctx) {
SimpleArray<int, D> X_dims, Y_dims, Y_strides;
const auto N =
std::accumulate(x_dims, x_dims + D, 1, std::multiplies<int64_t>());
for (int i = 0; i < D; ++i) {
X_dims.data[i] = x_dims[i];
Y_dims.data[i] = y_dims[i];
Y_strides.data[i] = y_strides[i];
}
_ReduceSumGrad<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, X_dims, Y_dims, Y_strides, scale, dy, dx);
}
} // namespace } // namespace
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
...@@ -47,23 +68,18 @@ __global__ void _ReduceSumGrad( ...@@ -47,23 +68,18 @@ __global__ void _ReduceSumGrad(
T* dx, \ T* dx, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
CUDA_TENSOR_DIMS_CHECK(num_dims); \ CUDA_TENSOR_DIMS_CHECK(num_dims); \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_dims, Y_dims, Y_strides; \ DISPATCH_FUNC_BY_VALUE_WITH_TYPE_2( \
const auto N = std::accumulate( \ _ReduceSumGradImpl, \
x_dims, x_dims + num_dims, 1, std::multiplies<int64_t>()); \ math::ScalarType<T>::type, \
for (int i = 0; i < num_dims; ++i) { \ math::AccmulatorType<T>::type, \
X_dims.data[i] = x_dims[i]; \
Y_dims.data[i] = y_dims[i]; \
Y_strides.data[i] = y_strides[i]; \
} \
_ReduceSumGrad<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
num_dims, \ num_dims, \
X_dims, \ x_dims, \
Y_dims, \ y_dims, \
Y_strides, \ y_strides, \
scale, \ convert::To<math::AccmulatorType<T>::type>(scale), \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \ reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<math::ScalarType<T>::type*>(dx)); \ reinterpret_cast<math::ScalarType<T>::type*>(dx), \
ctx); \
} }
DEFINE_GRAD_KERNEL_LAUNCHER(float16); DEFINE_GRAD_KERNEL_LAUNCHER(float16);
......
...@@ -12,7 +12,6 @@ namespace { ...@@ -12,7 +12,6 @@ namespace {
template <typename T, int D> template <typename T, int D>
__global__ void _Roll( __global__ void _Roll(
const int N, const int N,
const int num_dims,
const SimpleArray<int, D> X_shifts, const SimpleArray<int, D> X_shifts,
const SimpleArray<int, D> X_strides, const SimpleArray<int, D> X_strides,
const SimpleArray<int, D> Y_dims, const SimpleArray<int, D> Y_dims,
...@@ -20,7 +19,8 @@ __global__ void _Roll( ...@@ -20,7 +19,8 @@ __global__ void _Roll(
T* y) { T* y) {
CUDA_1D_KERNEL_LOOP(yi, N) { CUDA_1D_KERNEL_LOOP(yi, N) {
int xi = 0, tmp = yi; int xi = 0, tmp = yi;
for (int d = num_dims - 1; d >= 0; --d) { #pragma unroll
for (int d = D - 1; d >= 0; --d) {
int r; int r;
FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r); FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r);
r -= X_shifts.data[d]; r -= X_shifts.data[d];
...@@ -31,6 +31,26 @@ __global__ void _Roll( ...@@ -31,6 +31,26 @@ __global__ void _Roll(
} }
} }
template <typename T, int D>
void _RollImpl(
const int64_t* x_shifts,
const int64_t* x_strides,
const int64_t* y_dims,
const T* x,
T* y,
CUDAContext* ctx) {
SimpleArray<int, D> X_shifts, X_strides, Y_dims;
const auto N =
std::accumulate(y_dims, y_dims + D, 1, std::multiplies<int64_t>());
for (int i = 0; i < D; ++i) {
X_shifts.data[i] = x_shifts[i];
X_strides.data[i] = x_strides[i];
Y_dims.data[i] = y_dims[i];
}
_Roll<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, X_shifts, X_strides, Y_dims, x, y);
}
} // namespace } // namespace
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
...@@ -46,18 +66,8 @@ __global__ void _Roll( ...@@ -46,18 +66,8 @@ __global__ void _Roll(
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
CUDA_TENSOR_DIMS_CHECK(num_dims); \ CUDA_TENSOR_DIMS_CHECK(num_dims); \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_shifts; \ DISPATCH_FUNC_BY_VALUE_WITH_TYPE_1( \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_strides; \ _RollImpl, T, num_dims, x_shifts, x_strides, y_dims, x, y, ctx); \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> Y_dims; \
const auto N = std::accumulate( \
y_dims, y_dims + num_dims, 1, std::multiplies<int64_t>()); \
for (int i = 0; i < num_dims; ++i) { \
X_shifts.data[i] = x_shifts[i]; \
X_strides.data[i] = x_strides[i]; \
Y_dims.data[i] = y_dims[i]; \
} \
_Roll<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, num_dims, X_shifts, X_strides, Y_dims, x, y); \
} }
DEFINE_KERNEL_LAUNCHER(bool); DEFINE_KERNEL_LAUNCHER(bool);
......
...@@ -13,7 +13,6 @@ namespace { ...@@ -13,7 +13,6 @@ namespace {
template <typename T, int D> template <typename T, int D>
__global__ void _Slice( __global__ void _Slice(
const int N, const int N,
const int num_dims,
const SimpleArray<int, D> X_strides, const SimpleArray<int, D> X_strides,
const SimpleArray<int, D> Y_dims, const SimpleArray<int, D> Y_dims,
const SimpleArray<int, D> X_starts, const SimpleArray<int, D> X_starts,
...@@ -21,7 +20,8 @@ __global__ void _Slice( ...@@ -21,7 +20,8 @@ __global__ void _Slice(
T* y) { T* y) {
CUDA_1D_KERNEL_LOOP(yi, N) { CUDA_1D_KERNEL_LOOP(yi, N) {
int xi = 0, tmp = yi; int xi = 0, tmp = yi;
for (int d = num_dims - 1; d >= 0; --d) { #pragma unroll
for (int d = D - 1; d >= 0; --d) {
int r; int r;
FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r); FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r);
xi += (r + X_starts.data[d]) * X_strides.data[d]; xi += (r + X_starts.data[d]) * X_strides.data[d];
...@@ -33,7 +33,6 @@ __global__ void _Slice( ...@@ -33,7 +33,6 @@ __global__ void _Slice(
template <typename T, int D> template <typename T, int D>
__global__ void _SliceGrad( __global__ void _SliceGrad(
const int N, const int N,
const int num_dims,
const SimpleArray<int, D> X_strides, const SimpleArray<int, D> X_strides,
const SimpleArray<int, D> Y_dims, const SimpleArray<int, D> Y_dims,
const SimpleArray<int, D> X_starts, const SimpleArray<int, D> X_starts,
...@@ -41,7 +40,8 @@ __global__ void _SliceGrad( ...@@ -41,7 +40,8 @@ __global__ void _SliceGrad(
T* dx) { T* dx) {
CUDA_1D_KERNEL_LOOP(yi, N) { CUDA_1D_KERNEL_LOOP(yi, N) {
int xi = 0, tmp = yi; int xi = 0, tmp = yi;
for (int d = num_dims - 1; d >= 0; --d) { #pragma unroll
for (int d = D - 1; d >= 0; --d) {
int r; int r;
FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r); FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r);
xi += (r + X_starts.data[d]) * X_strides.data[d]; xi += (r + X_starts.data[d]) * X_strides.data[d];
...@@ -50,6 +50,32 @@ __global__ void _SliceGrad( ...@@ -50,6 +50,32 @@ __global__ void _SliceGrad(
} }
} }
template <typename T, int D>
void _SliceImpl(
const string& routine,
const int64_t* x_strides,
const int64_t* y_dims,
const int64_t* starts,
const T* x,
T* y,
CUDAContext* ctx) {
SimpleArray<int, D> X_strides, Y_dims, X_starts;
const auto N =
std::accumulate(y_dims, y_dims + D, 1, std::multiplies<int64_t>());
for (int i = 0; i < D; ++i) {
X_strides.data[i] = x_strides[i];
Y_dims.data[i] = y_dims[i];
X_starts.data[i] = starts[i];
}
if (routine == "Slice") {
_Slice<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, X_strides, Y_dims, X_starts, x, y);
} else if (routine == "SliceGrad") {
_SliceGrad<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, X_strides, Y_dims, X_starts, x, y);
}
}
} // namespace } // namespace
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
...@@ -65,16 +91,8 @@ __global__ void _SliceGrad( ...@@ -65,16 +91,8 @@ __global__ void _SliceGrad(
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
CUDA_TENSOR_DIMS_CHECK(num_dims); \ CUDA_TENSOR_DIMS_CHECK(num_dims); \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_strides, Y_dims, X_starts; \ DISPATCH_FUNC_BY_VALUE_WITH_TYPE_1( \
const auto N = std::accumulate( \ _SliceImpl, T, num_dims, #name, x_strides, y_dims, starts, x, y, ctx); \
y_dims, y_dims + num_dims, 1, std::multiplies<int64_t>()); \
for (int i = 0; i < num_dims; ++i) { \
X_strides.data[i] = x_strides[i]; \
Y_dims.data[i] = y_dims[i]; \
X_starts.data[i] = starts[i]; \
} \
_##name<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, num_dims, X_strides, Y_dims, X_starts, x, y); \
} }
DEFINE_KERNEL_LAUNCHER(Slice, bool); DEFINE_KERNEL_LAUNCHER(Slice, bool);
......
...@@ -31,12 +31,13 @@ __global__ void _Transpose( ...@@ -31,12 +31,13 @@ __global__ void _Transpose(
template <typename T, int D> template <typename T, int D>
void _TransposeImpl( void _TransposeImpl(
const int N,
const int64_t* x_strides, const int64_t* x_strides,
const int64_t* y_dims, const int64_t* y_dims,
const T* x, const T* x,
T* y, T* y,
CUDAContext* ctx) { CUDAContext* ctx) {
const auto N =
std::accumulate(y_dims, y_dims + D, 1, std::multiplies<int64_t>());
SimpleArray<int, D> X_strides, Y_dims; SimpleArray<int, D> X_strides, Y_dims;
for (int i = 0; i < D; ++i) { for (int i = 0; i < D; ++i) {
X_strides.data[i] = x_strides[i]; X_strides.data[i] = x_strides[i];
...@@ -60,36 +61,8 @@ void _TransposeImpl( ...@@ -60,36 +61,8 @@ void _TransposeImpl(
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
CUDA_TENSOR_DIMS_CHECK(num_dims); \ CUDA_TENSOR_DIMS_CHECK(num_dims); \
const auto N = std::accumulate( \ DISPATCH_FUNC_BY_VALUE_WITH_TYPE_1( \
y_dims, y_dims + num_dims, 1, std::multiplies<int64_t>()); \ _TransposeImpl, T, num_dims, x_strides, y_dims, x, y, ctx); \
switch (num_dims) { \
case 1: \
_TransposeImpl<T, 1>(N, x_strides, y_dims, x, y, ctx); \
break; \
case 2: \
_TransposeImpl<T, 2>(N, x_strides, y_dims, x, y, ctx); \
break; \
case 3: \
_TransposeImpl<T, 3>(N, x_strides, y_dims, x, y, ctx); \
break; \
case 4: \
_TransposeImpl<T, 4>(N, x_strides, y_dims, x, y, ctx); \
break; \
case 5: \
_TransposeImpl<T, 5>(N, x_strides, y_dims, x, y, ctx); \
break; \
case 6: \
_TransposeImpl<T, 6>(N, x_strides, y_dims, x, y, ctx); \
break; \
case 7: \
_TransposeImpl<T, 7>(N, x_strides, y_dims, x, y, ctx); \
break; \
case 8: \
_TransposeImpl<T, 8>(N, x_strides, y_dims, x, y, ctx); \
break; \
default: \
break; \
} \
} }
DEFINE_KERNEL_LAUNCHER(bool); DEFINE_KERNEL_LAUNCHER(bool);
......
...@@ -82,7 +82,7 @@ __global__ void _SoftmaxCrossEntropyGrad( ...@@ -82,7 +82,7 @@ __global__ void _SoftmaxCrossEntropyGrad(
const int S, const int S,
const int C, const int C,
const int ignore_index, const int ignore_index,
const InputT* input, const InputT* /* input */,
const TargetT* target, const TargetT* target,
InputT* dx, InputT* dx,
InputT* mask) { InputT* mask) {
......
...@@ -38,7 +38,7 @@ __global__ void _NLLLossGrad( ...@@ -38,7 +38,7 @@ __global__ void _NLLLossGrad(
const int S, const int S,
const int C, const int C,
const int ignore_index, const int ignore_index,
const InputT* input, const InputT* /* input */,
const TargetT* target, const TargetT* target,
InputT* dx, InputT* dx,
InputT* mask) { InputT* mask) {
......
...@@ -67,7 +67,6 @@ template <typename T, typename AccT, int D> ...@@ -67,7 +67,6 @@ template <typename T, typename AccT, int D>
__global__ void _GenericMoments( __global__ void _GenericMoments(
const int rows, const int rows,
const int cols, const int cols,
const int num_dims,
const SimpleArray<int, D> X_dims, const SimpleArray<int, D> X_dims,
const SimpleArray<int, D> X_strides, const SimpleArray<int, D> X_strides,
const T* x, const T* x,
...@@ -80,7 +79,8 @@ __global__ void _GenericMoments( ...@@ -80,7 +79,8 @@ __global__ void _GenericMoments(
AccT m_val = AccT(0), v_val = AccT(0); AccT m_val = AccT(0), v_val = AccT(0);
CUDA_2D_KERNEL_LOOP2(j, cols) { CUDA_2D_KERNEL_LOOP2(j, cols) {
int xi = 0, c = i * cols + j; int xi = 0, c = i * cols + j;
for (int d = num_dims - 1; d >= 0; --d) { #pragma unroll
for (int d = D - 1; d >= 0; --d) {
int r; int r;
FIXED_DIVISOR_DIV_MOD(X_dims.data[d], c, &c, &r); FIXED_DIVISOR_DIV_MOD(X_dims.data[d], c, &c, &r);
xi += r * X_strides.data[d]; xi += r * X_strides.data[d];
...@@ -98,9 +98,8 @@ __global__ void _GenericMoments( ...@@ -98,9 +98,8 @@ __global__ void _GenericMoments(
} }
} }
template <typename T, typename AccT> template <typename T, typename AccT, int D>
void _Moments( void _GenericMomentsImpl(
const int num_dims,
const int* dims, const int* dims,
const int num_axes, const int num_axes,
const int* axes, const int* axes,
...@@ -108,44 +107,25 @@ void _Moments( ...@@ -108,44 +107,25 @@ void _Moments(
AccT* mean, AccT* mean,
AccT* var, AccT* var,
CUDAContext* ctx) { CUDAContext* ctx) {
int rows, cols; SimpleArray<int, D> transpose_axes;
vec32_t out_dims(dims, dims + num_dims); SimpleArray<int, D> transpose_strides;
for (int i = 0; i < num_axes; ++i) { SimpleArray<int, D> transpose_dims;
out_dims[axes[i]] = 1; math::utils::TransposeAxesForReduce(D, num_axes, axes, transpose_axes.data);
}
if (math::utils::IsRowwiseReduce(
num_dims, dims, out_dims.data(), &rows, &cols)) {
_RowwiseMoments<<<cols, CUDA_THREADS, 0, ctx->cuda_stream()>>>(
rows, cols, x, mean, var);
return;
}
if (math::utils::IsColwiseReduce(
num_dims, dims, out_dims.data(), &rows, &cols)) {
_ColwiseMoments<<<rows, CUDA_THREADS, 0, ctx->cuda_stream()>>>(
rows, cols, x, mean, var);
return;
}
CUDA_TENSOR_DIMS_CHECK(num_dims);
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> transpose_axes;
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> transpose_strides;
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> transpose_dims;
math::utils::TransposeAxesForReduce(
num_dims, num_axes, axes, transpose_axes.data);
math::utils::ComputeTransposeStrides( math::utils::ComputeTransposeStrides(
num_dims, dims, transpose_axes.data, transpose_strides.data); D, dims, transpose_axes.data, transpose_strides.data);
rows = cols = 1; int rows = 1, cols = 1;
const int pivot = num_dims - num_axes; const int pivot = D - num_axes;
for (int i = 0; i < pivot; ++i) { for (int i = 0; i < pivot; ++i) {
rows *= dims[transpose_axes.data[i]]; rows *= dims[transpose_axes.data[i]];
} }
for (int i = pivot; i < num_dims; ++i) { for (int i = pivot; i < D; ++i) {
cols *= dims[transpose_axes.data[i]]; cols *= dims[transpose_axes.data[i]];
} }
for (int i = 0; i < num_dims; ++i) { for (int i = 0; i < D; ++i) {
transpose_dims.data[i] = dims[transpose_axes.data[i]]; transpose_dims.data[i] = dims[transpose_axes.data[i]];
} }
_GenericMoments<<<rows, CUDA_THREADS, 0, ctx->cuda_stream()>>>( _GenericMoments<<<rows, CUDA_THREADS, 0, ctx->cuda_stream()>>>(
rows, cols, num_dims, transpose_dims, transpose_strides, x, mean, var); rows, cols, transpose_dims, transpose_strides, x, mean, var);
} }
} // namespace } // namespace
...@@ -163,12 +143,33 @@ void _Moments( ...@@ -163,12 +143,33 @@ void _Moments(
AccT* mean, \ AccT* mean, \
AccT* var, \ AccT* var, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_Moments( \ int rows, cols; \
vec32_t out_dims(dims, dims + num_dims); \
for (int i = 0; i < num_axes; ++i) { \
out_dims[axes[i]] = 1; \
} \
if (math::utils::IsRowwiseReduce( \
num_dims, dims, out_dims.data(), &rows, &cols)) { \
_RowwiseMoments<<<cols, CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
rows, cols, x, mean, var); \
return; \
} \
if (math::utils::IsColwiseReduce( \
num_dims, dims, out_dims.data(), &rows, &cols)) { \
_ColwiseMoments<<<rows, CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
rows, cols, x, mean, var); \
return; \
} \
CUDA_TENSOR_DIMS_CHECK(num_dims); \
DISPATCH_FUNC_BY_VALUE_WITH_TYPE_2( \
_GenericMomentsImpl, \
T, \
AccT, \
num_dims, \ num_dims, \
dims, \ dims, \
num_axes, \ num_axes, \
axes, \ axes, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \ x, \
mean, \ mean, \
var, \ var, \
ctx); \ ctx); \
......
...@@ -13,7 +13,7 @@ namespace { ...@@ -13,7 +13,7 @@ namespace {
template <typename T> template <typename T>
__global__ void _Im2Col2dNCHW( __global__ void _Im2Col2dNCHW(
const int nthreads, const int nthreads,
const int C, const int /* C */,
const int H, const int H,
const int W, const int W,
const int out_h, const int out_h,
...@@ -59,7 +59,7 @@ __global__ void _Im2Col2dNHWC( ...@@ -59,7 +59,7 @@ __global__ void _Im2Col2dNHWC(
const int C, const int C,
const int H, const int H,
const int W, const int W,
const int out_h, const int /* out_h */,
const int out_w, const int out_w,
const int kernel_h, const int kernel_h,
const int kernel_w, const int kernel_w,
...@@ -97,7 +97,7 @@ __global__ void _Im2Col2dNHWC( ...@@ -97,7 +97,7 @@ __global__ void _Im2Col2dNHWC(
template <typename T> template <typename T>
__global__ void _Col2Im2dNCHW( __global__ void _Col2Im2dNCHW(
const int nthreads, const int nthreads,
const int C, const int /* C */,
const int H, const int H,
const int W, const int W,
const int out_h, const int out_h,
...@@ -147,7 +147,7 @@ template <typename T> ...@@ -147,7 +147,7 @@ template <typename T>
__global__ void _Col2Im2dNHWC( __global__ void _Col2Im2dNHWC(
const int nthreads, const int nthreads,
const int C, const int C,
const int H, const int /* H */,
const int W, const int W,
const int out_h, const int out_h,
const int out_w, const int out_w,
......
...@@ -7,7 +7,7 @@ namespace kernels { ...@@ -7,7 +7,7 @@ namespace kernels {
namespace { namespace {
template <typename T> template <typename T, typename AccT>
void _MaxPool2dNCHW( void _MaxPool2dNCHW(
const int N, const int N,
const int C, const int C,
...@@ -29,8 +29,7 @@ void _MaxPool2dNCHW( ...@@ -29,8 +29,7 @@ void _MaxPool2dNCHW(
const auto NxCxHoxWo = N * C * out_h * out_w; const auto NxCxHoxWo = N * C * out_h * out_w;
std::array<int, 4> index = {0, 0, 0, 0}; std::array<int, 4> index = {0, 0, 0, 0};
std::array<int, 4> dims = {N, C, out_h, out_w}; std::array<int, 4> dims = {N, C, out_h, out_w};
T val; int hstart, hend, wstart, wend;
int hstart, hend, wstart, wend, xi, mask_val;
for (int i = 0; i < NxCxHoxWo; ++i) { for (int i = 0; i < NxCxHoxWo; ++i) {
hstart = index[2] * stride_h - pad_h; hstart = index[2] * stride_h - pad_h;
wstart = index[3] * stride_w - pad_w; wstart = index[3] * stride_w - pad_w;
...@@ -39,23 +38,24 @@ void _MaxPool2dNCHW( ...@@ -39,23 +38,24 @@ void _MaxPool2dNCHW(
hstart = std::max(hstart, 0); hstart = std::max(hstart, 0);
wstart = std::max(wstart, 0); wstart = std::max(wstart, 0);
const T* offset_x = x + index[0] * CxHxW + index[1] * HxW; const T* offset_x = x + index[0] * CxHxW + index[1] * HxW;
mask_val = -1; int mask_val = -1;
val = T(-FLT_MAX); AccT val = AccT(-FLT_MAX);
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
xi = h * W + w; const auto xi = h * W + w;
if (offset_x[xi] > val) { if (convert::To<AccT>(offset_x[xi]) > val) {
val = offset_x[mask_val = xi]; mask_val = xi;
val = convert::To<AccT>(offset_x[xi]);
} }
} }
} }
y[i] = val; y[i] = convert::To<T>(val);
mask[i] = mask_val; mask[i] = mask_val;
math::utils::IncreaseIndexInDims(4, dims.data(), index.data()); math::utils::IncreaseIndexInDims(4, dims.data(), index.data());
} }
} }
template <typename T> template <typename T, typename AccT>
void _MaxPool2dNHWC( void _MaxPool2dNHWC(
const int N, const int N,
const int C, const int C,
...@@ -76,8 +76,7 @@ void _MaxPool2dNHWC( ...@@ -76,8 +76,7 @@ void _MaxPool2dNHWC(
const auto NxHoxWoxC = N * C * out_h * out_w; const auto NxHoxWoxC = N * C * out_h * out_w;
std::array<int, 4> index = {0, 0, 0, 0}; std::array<int, 4> index = {0, 0, 0, 0};
std::array<int, 4> dims = {N, out_h, out_w, C}; std::array<int, 4> dims = {N, out_h, out_w, C};
T val; int hstart, hend, wstart, wend;
int hstart, hend, wstart, wend, xi, mask_val;
for (int i = 0; i < NxHoxWoxC; ++i) { for (int i = 0; i < NxHoxWoxC; ++i) {
hstart = index[1] * stride_h - pad_h; hstart = index[1] * stride_h - pad_h;
wstart = index[2] * stride_w - pad_w; wstart = index[2] * stride_w - pad_w;
...@@ -86,23 +85,24 @@ void _MaxPool2dNHWC( ...@@ -86,23 +85,24 @@ void _MaxPool2dNHWC(
hstart = std::max(hstart, 0); hstart = std::max(hstart, 0);
wstart = std::max(wstart, 0); wstart = std::max(wstart, 0);
const T* offset_x = x + index[0] * HxWxC; const T* offset_x = x + index[0] * HxWxC;
mask_val = -1; int mask_val = -1;
val = T(-FLT_MAX); AccT val = AccT(-FLT_MAX);
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
xi = (h * W + w) * C + index[3]; const auto xi = (h * W + w) * C + index[3];
if (offset_x[xi] > val) { if (convert::To<AccT>(offset_x[xi]) > val) {
val = offset_x[mask_val = xi]; mask_val = xi;
val = convert::To<AccT>(offset_x[xi]);
} }
} }
} }
y[i] = val; y[i] = convert::To<T>(val);
mask[i] = mask_val; mask[i] = mask_val;
math::utils::IncreaseIndexInDims(4, dims.data(), index.data()); math::utils::IncreaseIndexInDims(4, dims.data(), index.data());
} }
} }
template <typename T> template <typename T, typename AccT>
void _MaxPool2dGradNCHW( void _MaxPool2dGradNCHW(
const int N, const int N,
const int C, const int C,
...@@ -127,13 +127,15 @@ void _MaxPool2dGradNCHW( ...@@ -127,13 +127,15 @@ void _MaxPool2dGradNCHW(
memset(dx, 0, sizeof(T) * N * CxHxW); memset(dx, 0, sizeof(T) * N * CxHxW);
for (int i = 0; i < NxCxHoxWo; ++i) { for (int i = 0; i < NxCxHoxWo; ++i) {
if (mask[i] != -1) { if (mask[i] != -1) {
dx[index[0] * CxHxW + index[1] * HxW + mask[i]] += dy[i]; const auto xi = index[0] * CxHxW + index[1] * HxW + mask[i];
dx[xi] =
convert::To<T>(convert::To<AccT>(dx[xi]) + convert::To<AccT>(dy[i]));
} }
math::utils::IncreaseIndexInDims(3, dims.data(), index.data()); math::utils::IncreaseIndexInDims(3, dims.data(), index.data());
} }
} }
template <typename T> template <typename T, typename AccT>
void _MaxPool2dGradNHWC( void _MaxPool2dGradNHWC(
const int N, const int N,
const int C, const int C,
...@@ -157,13 +159,15 @@ void _MaxPool2dGradNHWC( ...@@ -157,13 +159,15 @@ void _MaxPool2dGradNHWC(
memset(dx, 0, sizeof(T) * N * HxWxC); memset(dx, 0, sizeof(T) * N * HxWxC);
for (int i = 0; i < NxHoxWoxC; ++i) { for (int i = 0; i < NxHoxWoxC; ++i) {
if (mask[i] != -1) { if (mask[i] != -1) {
dx[index[0] * HxWxC + mask[i]] += dy[i]; const auto xi = index[0] * HxWxC + mask[i];
dx[xi] =
convert::To<T>(convert::To<AccT>(dx[xi]) + convert::To<AccT>(dy[i]));
} }
math::utils::IncreaseIndexInDims(2, dims.data(), index.data()); math::utils::IncreaseIndexInDims(2, dims.data(), index.data());
} }
} }
template <typename T> template <typename T, typename AccT>
void _MaxPool3dNCHW( void _MaxPool3dNCHW(
const int N, const int N,
const int C, const int C,
...@@ -190,8 +194,7 @@ void _MaxPool3dNCHW( ...@@ -190,8 +194,7 @@ void _MaxPool3dNCHW(
const auto NxCxDoxHoxWo = N * C * out_d * out_h * out_w; const auto NxCxDoxHoxWo = N * C * out_d * out_h * out_w;
std::array<int, 5> index = {0, 0, 0, 0, 0}; std::array<int, 5> index = {0, 0, 0, 0, 0};
std::array<int, 5> dims = {N, C, out_d, out_h, out_w}; std::array<int, 5> dims = {N, C, out_d, out_h, out_w};
T val; int dstart, dend, hstart, hend, wstart, wend;
int dstart, dend, hstart, hend, wstart, wend, xi, mask_val;
for (int i = 0; i < NxCxDoxHoxWo; ++i) { for (int i = 0; i < NxCxDoxHoxWo; ++i) {
dstart = index[2] * stride_d - pad_d; dstart = index[2] * stride_d - pad_d;
hstart = index[3] * stride_h - pad_h; hstart = index[3] * stride_h - pad_h;
...@@ -203,25 +206,26 @@ void _MaxPool3dNCHW( ...@@ -203,25 +206,26 @@ void _MaxPool3dNCHW(
hstart = std::max(hstart, 0); hstart = std::max(hstart, 0);
wstart = std::max(wstart, 0); wstart = std::max(wstart, 0);
const T* offset_x = x + index[0] * CxDxHxW + index[1] * DxHxW; const T* offset_x = x + index[0] * CxDxHxW + index[1] * DxHxW;
mask_val = -1; int mask_val = -1;
val = T(-FLT_MAX); AccT val = AccT(-FLT_MAX);
for (int d = dstart; d < dend; ++d) { for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
xi = (d * H + h) * W + w; const auto xi = (d * H + h) * W + w;
if (offset_x[xi] > val) { if (convert::To<AccT>(offset_x[xi]) > val) {
val = offset_x[mask_val = xi]; mask_val = xi;
val = convert::To<AccT>(offset_x[xi]);
} }
} }
} }
} }
y[i] = val; y[i] = convert::To<T>(val);
mask[i] = mask_val; mask[i] = mask_val;
math::utils::IncreaseIndexInDims(5, dims.data(), index.data()); math::utils::IncreaseIndexInDims(5, dims.data(), index.data());
} }
} }
template <typename T> template <typename T, typename AccT>
void _MaxPool3dNHWC( void _MaxPool3dNHWC(
const int N, const int N,
const int C, const int C,
...@@ -247,8 +251,7 @@ void _MaxPool3dNHWC( ...@@ -247,8 +251,7 @@ void _MaxPool3dNHWC(
const auto NxDoxHoxWoxC = N * C * out_d * out_h * out_w; const auto NxDoxHoxWoxC = N * C * out_d * out_h * out_w;
std::array<int, 5> index = {0, 0, 0, 0, 0}; std::array<int, 5> index = {0, 0, 0, 0, 0};
std::array<int, 5> dims = {N, out_d, out_h, out_w, C}; std::array<int, 5> dims = {N, out_d, out_h, out_w, C};
T val; int dstart, dend, hstart, hend, wstart, wend;
int dstart, dend, hstart, hend, wstart, wend, xi, mask_val;
for (int i = 0; i < NxDoxHoxWoxC; ++i) { for (int i = 0; i < NxDoxHoxWoxC; ++i) {
dstart = index[1] * stride_d - pad_d; dstart = index[1] * stride_d - pad_d;
hstart = index[2] * stride_h - pad_h; hstart = index[2] * stride_h - pad_h;
...@@ -260,25 +263,26 @@ void _MaxPool3dNHWC( ...@@ -260,25 +263,26 @@ void _MaxPool3dNHWC(
hstart = std::max(hstart, 0); hstart = std::max(hstart, 0);
wstart = std::max(wstart, 0); wstart = std::max(wstart, 0);
const T* offset_x = x + index[0] * DxHxWxC; const T* offset_x = x + index[0] * DxHxWxC;
mask_val = -1; int mask_val = -1;
val = T(-FLT_MAX); AccT val = AccT(-FLT_MAX);
for (int d = dstart; d < dend; ++d) { for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
xi = ((d * H + h) * W + w) * C + index[4]; const auto xi = ((d * H + h) * W + w) * C + index[4];
if (offset_x[xi] > val) { if (convert::To<AccT>(offset_x[xi]) > val) {
val = offset_x[mask_val = xi]; mask_val = xi;
val = convert::To<AccT>(offset_x[xi]);
} }
} }
} }
} }
y[i] = val; y[i] = convert::To<T>(val);
mask[i] = mask_val; mask[i] = mask_val;
math::utils::IncreaseIndexInDims(5, dims.data(), index.data()); math::utils::IncreaseIndexInDims(5, dims.data(), index.data());
} }
} }
template <typename T> template <typename T, typename AccT>
void _MaxPool3dGradNCHW( void _MaxPool3dGradNCHW(
const int N, const int N,
const int C, const int C,
...@@ -308,13 +312,15 @@ void _MaxPool3dGradNCHW( ...@@ -308,13 +312,15 @@ void _MaxPool3dGradNCHW(
memset(dx, 0, sizeof(T) * N * CxDxHxW); memset(dx, 0, sizeof(T) * N * CxDxHxW);
for (int i = 0; i < NxCxDoxHoxWo; ++i) { for (int i = 0; i < NxCxDoxHoxWo; ++i) {
if (mask[i] != -1) { if (mask[i] != -1) {
dx[index[0] * CxDxHxW + index[1] * DxHxW + mask[i]] += dy[i]; const auto xi = index[0] * CxDxHxW + index[1] * DxHxW + mask[i];
dx[xi] =
convert::To<T>(convert::To<AccT>(dx[xi]) + convert::To<AccT>(dy[i]));
} }
math::utils::IncreaseIndexInDims(3, dims.data(), index.data()); math::utils::IncreaseIndexInDims(3, dims.data(), index.data());
} }
} }
template <typename T> template <typename T, typename AccT>
void _MaxPool3dGradNHWC( void _MaxPool3dGradNHWC(
const int N, const int N,
const int C, const int C,
...@@ -343,7 +349,9 @@ void _MaxPool3dGradNHWC( ...@@ -343,7 +349,9 @@ void _MaxPool3dGradNHWC(
memset(dx, 0, sizeof(T) * N * DxHxWxC); memset(dx, 0, sizeof(T) * N * DxHxWxC);
for (int i = 0; i < NxDoxHoxWoxC; ++i) { for (int i = 0; i < NxDoxHoxWoxC; ++i) {
if (mask[i] != -1) { if (mask[i] != -1) {
dx[index[0] * DxHxWxC + mask[i]] += dy[i]; const auto xi = index[0] * DxHxWxC + mask[i];
dx[xi] =
convert::To<T>(convert::To<AccT>(dx[xi]) + convert::To<AccT>(dy[i]));
} }
math::utils::IncreaseIndexInDims(2, dims.data(), index.data()); math::utils::IncreaseIndexInDims(2, dims.data(), index.data());
} }
...@@ -353,11 +361,11 @@ void _MaxPool3dGradNHWC( ...@@ -353,11 +361,11 @@ void _MaxPool3dGradNHWC(
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
#define DISPATCH_POOL_KERNEL(name, ...) \ #define DISPATCH_POOL_KERNEL(name, T, AccT, ...) \
if (data_format == "NCHW") { \ if (data_format == "NCHW") { \
name##NCHW(__VA_ARGS__); \ name##NCHW<T, AccT>(__VA_ARGS__); \
} else if (data_format == "NHWC") { \ } else if (data_format == "NHWC") { \
name##NHWC(__VA_ARGS__); \ name##NHWC<T, AccT>(__VA_ARGS__); \
} else { \ } else { \
LOG(FATAL) << "Unknown DataFormat: " << data_format; \ LOG(FATAL) << "Unknown DataFormat: " << data_format; \
} }
...@@ -384,6 +392,8 @@ void _MaxPool3dGradNHWC( ...@@ -384,6 +392,8 @@ void _MaxPool3dGradNHWC(
CPUContext* ctx) { \ CPUContext* ctx) { \
DISPATCH_POOL_KERNEL( \ DISPATCH_POOL_KERNEL( \
_##name, \ _##name, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
N, \ N, \
C, \ C, \
H, \ H, \
...@@ -401,8 +411,10 @@ void _MaxPool3dGradNHWC( ...@@ -401,8 +411,10 @@ void _MaxPool3dGradNHWC(
y); \ y); \
} }
DEFINE_KERNEL_LAUNCHER(MaxPool2d, float16);
DEFINE_KERNEL_LAUNCHER(MaxPool2d, float); DEFINE_KERNEL_LAUNCHER(MaxPool2d, float);
DEFINE_KERNEL_LAUNCHER(MaxPool2d, double); DEFINE_KERNEL_LAUNCHER(MaxPool2d, double);
DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, float16); // MaxPool2dGrad
DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, float); // MaxPool2dGrad DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, float); // MaxPool2dGrad
DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, double); // MaxPool2dGrad DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, double); // MaxPool2dGrad
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
...@@ -434,6 +446,8 @@ DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, double); // MaxPool2dGrad ...@@ -434,6 +446,8 @@ DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, double); // MaxPool2dGrad
CPUContext* ctx) { \ CPUContext* ctx) { \
DISPATCH_POOL_KERNEL( \ DISPATCH_POOL_KERNEL( \
_##name, \ _##name, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
N, \ N, \
C, \ C, \
D, \ D, \
...@@ -456,8 +470,10 @@ DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, double); // MaxPool2dGrad ...@@ -456,8 +470,10 @@ DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, double); // MaxPool2dGrad
y); \ y); \
} }
DEFINE_KERNEL_LAUNCHER(MaxPool3d, float16);
DEFINE_KERNEL_LAUNCHER(MaxPool3d, float); DEFINE_KERNEL_LAUNCHER(MaxPool3d, float);
DEFINE_KERNEL_LAUNCHER(MaxPool3d, double); DEFINE_KERNEL_LAUNCHER(MaxPool3d, double);
DEFINE_KERNEL_LAUNCHER(MaxPool3dGrad, float16); // MaxPool3dGrad
DEFINE_KERNEL_LAUNCHER(MaxPool3dGrad, float); // MaxPool3dGrad DEFINE_KERNEL_LAUNCHER(MaxPool3dGrad, float); // MaxPool3dGrad
DEFINE_KERNEL_LAUNCHER(MaxPool3dGrad, double); // MaxPool3dGrad DEFINE_KERNEL_LAUNCHER(MaxPool3dGrad, double); // MaxPool3dGrad
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
......
#ifdef USE_CUDA #ifdef USE_CUDA
#include "dragon/core/context_cuda.h" #include "dragon/core/context_cuda.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -9,7 +10,9 @@ namespace kernels { ...@@ -9,7 +10,9 @@ namespace kernels {
namespace { namespace {
template <typename T> #define LDG(x, i) convert::To<AccT>(__ldg(x + i))
template <typename T, typename AccT>
__global__ void _MaxPool2dNCHW( __global__ void _MaxPool2dNCHW(
const int nthreads, const int nthreads,
const int C, const int C,
...@@ -41,20 +44,21 @@ __global__ void _MaxPool2dNCHW( ...@@ -41,20 +44,21 @@ __global__ void _MaxPool2dNCHW(
const T* offset_x = x + (n * C + c) * H * W; const T* offset_x = x + (n * C + c) * H * W;
int mask_val = -1; int mask_val = -1;
T val = T(-FLT_MAX); AccT val = AccT(-FLT_MAX);
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
if (offset_x[h * W + w] > val) { if (LDG(offset_x, h * W + w) > val) {
val = offset_x[mask_val = h * W + w]; mask_val = h * W + w;
val = LDG(offset_x, mask_val);
} }
} }
} }
y[yi] = val; y[yi] = convert::To<T>(val);
mask[yi] = mask_val; mask[yi] = mask_val;
} }
} }
template <typename T> template <typename T, typename AccT>
__global__ void _MaxPool2dNHWC( __global__ void _MaxPool2dNHWC(
const int nthreads, const int nthreads,
const int C, const int C,
...@@ -86,21 +90,22 @@ __global__ void _MaxPool2dNHWC( ...@@ -86,21 +90,22 @@ __global__ void _MaxPool2dNHWC(
const int x_offset = n * H * W * C + c; const int x_offset = n * H * W * C + c;
int mask_val = -1; int mask_val = -1;
T val = T(-FLT_MAX); AccT val = AccT(-FLT_MAX);
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
const int xi = x_offset + (h * W + w) * C; const int xi = x_offset + (h * W + w) * C;
if (x[xi] > val) { if (LDG(x, xi) > val) {
val = x[mask_val = xi]; mask_val = xi;
val = LDG(x, xi);
} }
} }
} }
y[yi] = val; y[yi] = convert::To<T>(val);
mask[yi] = mask_val; mask[yi] = mask_val;
} }
} }
template <typename T> template <typename T, typename AccT>
__global__ void _MaxPool2dGradNCHW( __global__ void _MaxPool2dGradNCHW(
const int nthreads, const int nthreads,
const int C, const int C,
...@@ -131,20 +136,20 @@ __global__ void _MaxPool2dGradNCHW( ...@@ -131,20 +136,20 @@ __global__ void _MaxPool2dGradNCHW(
const int out_wend = min((w + pad_w) / stride_w + 1, out_w); const int out_wend = min((w + pad_w) / stride_w + 1, out_w);
const int y_offset = (n * C + c) * out_h * out_w; const int y_offset = (n * C + c) * out_h * out_w;
T val = T(0); AccT val = AccT(0);
for (int h_out = out_hstart; h_out < out_hend; ++h_out) { for (int h_out = out_hstart; h_out < out_hend; ++h_out) {
for (int w_out = out_wstart; w_out < out_wend; ++w_out) { for (int w_out = out_wstart; w_out < out_wend; ++w_out) {
const int yi = y_offset + h_out * out_w + w_out; const int yi = y_offset + h_out * out_w + w_out;
if (mask[yi] == (h * W + w)) { if (mask[yi] == (h * W + w)) {
val += dy[yi]; val += LDG(dy, yi);
} }
} }
} }
dx[xi] = val; dx[xi] = convert::To<T>(val);
} }
} }
template <typename T> template <typename T, typename AccT>
__global__ void _MaxPool2dGradNHWC( __global__ void _MaxPool2dGradNHWC(
const int nthreads, const int nthreads,
const int C, const int C,
...@@ -175,20 +180,20 @@ __global__ void _MaxPool2dGradNHWC( ...@@ -175,20 +180,20 @@ __global__ void _MaxPool2dGradNHWC(
const int out_wend = min((w + pad_w) / stride_w + 1, out_w); const int out_wend = min((w + pad_w) / stride_w + 1, out_w);
const int y_offset = n * out_h * out_w * C + c; const int y_offset = n * out_h * out_w * C + c;
T val = T(0); AccT val = AccT(0);
for (int h_out = out_hstart; h_out < out_hend; ++h_out) { for (int h_out = out_hstart; h_out < out_hend; ++h_out) {
for (int w_out = out_wstart; w_out < out_wend; ++w_out) { for (int w_out = out_wstart; w_out < out_wend; ++w_out) {
const int yi = y_offset + (h_out * out_w + w_out) * C; const int yi = y_offset + (h_out * out_w + w_out) * C;
if (mask[yi] == xi) { if (mask[yi] == xi) {
val += dy[yi]; val += LDG(dy, yi);
} }
} }
} }
dx[xi] = val; dx[xi] = convert::To<T>(val);
} }
} }
template <typename T> template <typename T, typename AccT>
__global__ void _MaxPool3dNCHW( __global__ void _MaxPool3dNCHW(
const int nthreads, const int nthreads,
const int C, const int C,
...@@ -232,23 +237,24 @@ __global__ void _MaxPool3dNCHW( ...@@ -232,23 +237,24 @@ __global__ void _MaxPool3dNCHW(
const T* offset_x = x + (n * C + c) * D * H * W; const T* offset_x = x + (n * C + c) * D * H * W;
int mask_val = -1; int mask_val = -1;
T val = T(-FLT_MAX); AccT val = AccT(-FLT_MAX);
for (int d = dstart; d < dend; ++d) { for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
tmp = (d * H + h) * W + w; tmp = (d * H + h) * W + w;
if (offset_x[tmp] > val) { if (LDG(offset_x, tmp) > val) {
val = offset_x[mask_val = tmp]; mask_val = tmp;
val = LDG(offset_x, mask_val);
} }
} }
} }
} }
y[yi] = val; y[yi] = convert::To<T>(val);
mask[yi] = mask_val; mask[yi] = mask_val;
} }
} }
template <typename T> template <typename T, typename AccT>
__global__ void _MaxPool3dNHWC( __global__ void _MaxPool3dNHWC(
const int nthreads, const int nthreads,
const int C, const int C,
...@@ -292,23 +298,24 @@ __global__ void _MaxPool3dNHWC( ...@@ -292,23 +298,24 @@ __global__ void _MaxPool3dNHWC(
const int x_offset = n * D * H * W * C + c; const int x_offset = n * D * H * W * C + c;
int mask_val = -1; int mask_val = -1;
T val = T(-FLT_MAX); AccT val = AccT(-FLT_MAX);
for (int d = dstart; d < dend; ++d) { for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
tmp = x_offset + ((d * H + h) * W + w) * C; tmp = x_offset + ((d * H + h) * W + w) * C;
if (x[tmp] > val) { if (LDG(x, tmp) > val) {
val = x[mask_val = tmp]; mask_val = tmp;
val = LDG(x, tmp);
} }
} }
} }
} }
y[yi] = val; y[yi] = convert::To<T>(val);
mask[yi] = mask_val; mask[yi] = mask_val;
} }
} }
template <typename T> template <typename T, typename AccT>
__global__ void _MaxPool3dGradNCHW( __global__ void _MaxPool3dGradNCHW(
const int nthreads, const int nthreads,
const int C, const int C,
...@@ -351,22 +358,22 @@ __global__ void _MaxPool3dGradNCHW( ...@@ -351,22 +358,22 @@ __global__ void _MaxPool3dGradNCHW(
const int out_wend = min((w + pad_w) / stride_w + 1, out_w); const int out_wend = min((w + pad_w) / stride_w + 1, out_w);
const int y_offset = (n * C + c) * out_d * out_h * out_w; const int y_offset = (n * C + c) * out_d * out_h * out_w;
T val = T(0); AccT val = AccT(0);
for (int d_out = out_dstart; d_out < out_dend; ++d_out) { for (int d_out = out_dstart; d_out < out_dend; ++d_out) {
for (int h_out = out_hstart; h_out < out_hend; ++h_out) { for (int h_out = out_hstart; h_out < out_hend; ++h_out) {
for (int w_out = out_wstart; w_out < out_wend; ++w_out) { for (int w_out = out_wstart; w_out < out_wend; ++w_out) {
tmp = y_offset + (d_out * out_h + h_out) * out_w + w_out; tmp = y_offset + (d_out * out_h + h_out) * out_w + w_out;
if (mask[tmp] == ((d * H + h) * W + w)) { if (mask[tmp] == ((d * H + h) * W + w)) {
val += dy[tmp]; val += LDG(dy, tmp);
} }
} }
} }
} }
dx[xi] = val; dx[xi] = convert::To<T>(val);
} }
} }
template <typename T> template <typename T, typename AccT>
__global__ void _MaxPool3dGradNHWC( __global__ void _MaxPool3dGradNHWC(
const int nthreads, const int nthreads,
const int C, const int C,
...@@ -409,30 +416,34 @@ __global__ void _MaxPool3dGradNHWC( ...@@ -409,30 +416,34 @@ __global__ void _MaxPool3dGradNHWC(
const int out_wend = min((w + pad_w) / stride_w + 1, out_w); const int out_wend = min((w + pad_w) / stride_w + 1, out_w);
const int y_offset = n * out_d * out_h * out_w * C + c; const int y_offset = n * out_d * out_h * out_w * C + c;
T val = T(0); AccT val = AccT(0);
for (int d_out = out_dstart; d_out < out_dend; ++d_out) { for (int d_out = out_dstart; d_out < out_dend; ++d_out) {
for (int h_out = out_hstart; h_out < out_hend; ++h_out) { for (int h_out = out_hstart; h_out < out_hend; ++h_out) {
for (int w_out = out_wstart; w_out < out_wend; ++w_out) { for (int w_out = out_wstart; w_out < out_wend; ++w_out) {
tmp = y_offset + ((d_out * out_h + h_out) * out_w + w_out) * C; tmp = y_offset + ((d_out * out_h + h_out) * out_w + w_out) * C;
if (mask[tmp] == xi) { if (mask[tmp] == xi) {
val += dy[tmp]; val += LDG(dy, tmp);
} }
} }
} }
} }
dx[xi] = val; dx[xi] = convert::To<T>(val);
} }
} }
#undef LDG
} // namespace } // namespace
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
#define DISPATCH_POOL_KERNEL(name, kBlocks, kThreads, ...) \ #define DISPATCH_POOL_KERNEL(name, T, AccT, kBlocks, kThreads, ...) \
if (data_format == "NCHW") { \ if (data_format == "NCHW") { \
name##NCHW<<<kBlocks, kThreads, 0, ctx->cuda_stream()>>>(__VA_ARGS__); \ name##NCHW<T, AccT> \
<<<kBlocks, kThreads, 0, ctx->cuda_stream()>>>(__VA_ARGS__); \
} else if (data_format == "NHWC") { \ } else if (data_format == "NHWC") { \
name##NHWC<<<kBlocks, kThreads, 0, ctx->cuda_stream()>>>(__VA_ARGS__); \ name##NHWC<T, AccT> \
<<<kBlocks, kThreads, 0, ctx->cuda_stream()>>>(__VA_ARGS__); \
} else { \ } else { \
LOG(FATAL) << "Unknown DataFormat: " << data_format; \ LOG(FATAL) << "Unknown DataFormat: " << data_format; \
} }
...@@ -460,6 +471,8 @@ __global__ void _MaxPool3dGradNHWC( ...@@ -460,6 +471,8 @@ __global__ void _MaxPool3dGradNHWC(
const int nthreads = N * C * out_dim; \ const int nthreads = N * C * out_dim; \
DISPATCH_POOL_KERNEL( \ DISPATCH_POOL_KERNEL( \
_##name, \ _##name, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
CUDA_BLOCKS(nthreads), \ CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \ CUDA_THREADS, \
nthreads, \ nthreads, \
...@@ -474,13 +487,15 @@ __global__ void _MaxPool3dGradNHWC( ...@@ -474,13 +487,15 @@ __global__ void _MaxPool3dGradNHWC(
stride_w, \ stride_w, \
pad_h, \ pad_h, \
pad_w, \ pad_w, \
x, \ reinterpret_cast<const math::ScalarType<T>::type*>(x), \
mask, \ mask, \
y); \ reinterpret_cast<math::ScalarType<T>::type*>(y)); \
} }
DEFINE_KERNEL_LAUNCHER(MaxPool2d, float16, (out_h * out_w));
DEFINE_KERNEL_LAUNCHER(MaxPool2d, float, (out_h * out_w)); DEFINE_KERNEL_LAUNCHER(MaxPool2d, float, (out_h * out_w));
DEFINE_KERNEL_LAUNCHER(MaxPool2d, double, (out_h * out_w)); DEFINE_KERNEL_LAUNCHER(MaxPool2d, double, (out_h * out_w));
DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, float16, (H * W)); // MaxPool2dGrad
DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, float, (H * W)); // MaxPool2dGrad DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, float, (H * W)); // MaxPool2dGrad
DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, double, (H * W)); // MaxPool2dGrad DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, double, (H * W)); // MaxPool2dGrad
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
...@@ -513,6 +528,8 @@ DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, double, (H * W)); // MaxPool2dGrad ...@@ -513,6 +528,8 @@ DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, double, (H * W)); // MaxPool2dGrad
const int nthreads = N * C * out_dim; \ const int nthreads = N * C * out_dim; \
DISPATCH_POOL_KERNEL( \ DISPATCH_POOL_KERNEL( \
_##name, \ _##name, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
CUDA_BLOCKS(nthreads), \ CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \ CUDA_THREADS, \
nthreads, \ nthreads, \
...@@ -532,13 +549,15 @@ DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, double, (H * W)); // MaxPool2dGrad ...@@ -532,13 +549,15 @@ DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, double, (H * W)); // MaxPool2dGrad
pad_d, \ pad_d, \
pad_h, \ pad_h, \
pad_w, \ pad_w, \
x, \ reinterpret_cast<const math::ScalarType<T>::type*>(x), \
mask, \ mask, \
y); \ reinterpret_cast<math::ScalarType<T>::type*>(y)); \
} }
DEFINE_KERNEL_LAUNCHER(MaxPool3d, float16, (out_d * out_h * out_w));
DEFINE_KERNEL_LAUNCHER(MaxPool3d, float, (out_d * out_h * out_w)); DEFINE_KERNEL_LAUNCHER(MaxPool3d, float, (out_d * out_h * out_w));
DEFINE_KERNEL_LAUNCHER(MaxPool3d, double, (out_d * out_h * out_w)); DEFINE_KERNEL_LAUNCHER(MaxPool3d, double, (out_d * out_h * out_w));
DEFINE_KERNEL_LAUNCHER(MaxPool3dGrad, float16, (D * H * W)); // MaxPool3dGrad
DEFINE_KERNEL_LAUNCHER(MaxPool3dGrad, float, (D * H * W)); // MaxPool3dGrad DEFINE_KERNEL_LAUNCHER(MaxPool3dGrad, float, (D * H * W)); // MaxPool3dGrad
DEFINE_KERNEL_LAUNCHER(MaxPool3dGrad, double, (D * H * W)); // MaxPool3dGrad DEFINE_KERNEL_LAUNCHER(MaxPool3dGrad, double, (D * H * W)); // MaxPool3dGrad
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
......
...@@ -85,7 +85,7 @@ __global__ void _RoiPoolGrad( ...@@ -85,7 +85,7 @@ __global__ void _RoiPoolGrad(
const int W, const int W,
const int out_h, const int out_h,
const int out_w, const int out_w,
const float spatial_scale, const float /* spatial_scale */,
const T* dy, const T* dy,
const float* rois, const float* rois,
const int* mask, const int* mask,
......
...@@ -11,14 +11,13 @@ void TransposeOp<Context>::DoRunWithType() { ...@@ -11,14 +11,13 @@ void TransposeOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0, {0}); auto &X = Input(0), *Y = Output(0, {0});
int num_axes, num_dims = X.ndim(); int num_axes, num_dims = X.ndim();
vec64_t X_strides(num_dims), Y_dims(num_dims);
perm(0, &num_axes); perm(0, &num_axes);
CHECK(num_axes == 0 || num_axes == num_dims) CHECK(num_axes == 0 || num_axes == num_dims)
<< "\nProviding " << num_axes << " dimensions to permute, " << "\nProviding " << num_axes << " dimensions to permute, "
<< "while Tensor(" << X.name() << ")'s dims are " << X.DimString(); << "while Tensor(" << X.name() << ")'s dims are " << X.DimString();
vec64_t new_axes(num_dims); vec64_t new_axes(num_dims), new_dims(num_dims);
for (int i = 0; i < num_dims; ++i) { for (int i = 0; i < num_dims; ++i) {
new_axes[i] = num_axes > 0 ? perm(i) : num_dims - i - 1; new_axes[i] = num_axes > 0 ? perm(i) : num_dims - i - 1;
} }
...@@ -31,13 +30,27 @@ void TransposeOp<Context>::DoRunWithType() { ...@@ -31,13 +30,27 @@ void TransposeOp<Context>::DoRunWithType() {
} }
for (int i = 0; i < num_dims; ++i) { for (int i = 0; i < num_dims; ++i) {
X_strides[i] = X.stride(new_axes[i]); new_dims[i] = X.dim(new_axes[i]);
Y_dims[i] = X.dim(new_axes[i]); }
vec64_t transpose_dims, transpose_axes;
math::utils::CollapseTransposeAxes(
num_dims,
X.dims().data(),
new_axes.data(),
transpose_dims,
transpose_axes);
Tensor X_collapse(transpose_dims);
num_dims = X_collapse.ndim();
vec64_t X_strides(num_dims), Y_dims(num_dims);
for (int i = 0; i < num_dims; ++i) {
X_strides[i] = X_collapse.stride(transpose_axes[i]);
Y_dims[i] = X_collapse.dim(transpose_axes[i]);
} }
auto* scratch = ((void*)&X == (void*)Y) auto* scratch = ((void*)&X == (void*)Y)
? ctx()->workspace()->template data<T, Context>({X.count()})[0] ? ctx()->workspace()->template data<T, Context>({X.count()})[0]
: Y->Reshape(Y_dims)->template mutable_data<T, Context>(); : Y->Reshape(new_dims)->template mutable_data<T, Context>();
kernels::Transpose( kernels::Transpose(
num_dims, num_dims,
...@@ -51,7 +64,7 @@ void TransposeOp<Context>::DoRunWithType() { ...@@ -51,7 +64,7 @@ void TransposeOp<Context>::DoRunWithType() {
math::Copy( math::Copy(
X.count(), X.count(),
scratch, scratch,
Y->Reshape(Y_dims)->template mutable_data<T, Context>(), Y->Reshape(new_dims)->template mutable_data<T, Context>(),
ctx()); ctx());
} }
} }
......
...@@ -107,11 +107,6 @@ void PoolOp<Context>::DoRunWithType() { ...@@ -107,11 +107,6 @@ void PoolOp<Context>::DoRunWithType() {
} }
template <class Context> template <class Context>
void PoolOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::TypesBase<float, double>>::Call(this, Input(0));
}
template <class Context>
template <typename T> template <typename T>
void PoolGradientOp<Context>::DoRunWithType() { void PoolGradientOp<Context>::DoRunWithType() {
ComputeOutShape(); ComputeOutShape();
...@@ -212,11 +207,6 @@ void PoolGradientOp<Context>::DoRunWithType() { ...@@ -212,11 +207,6 @@ void PoolGradientOp<Context>::DoRunWithType() {
} }
} }
template <class Context>
void PoolGradientOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::TypesBase<float, double>>::Call(this, Input(0));
}
DEPLOY_CPU_OPERATOR(Pool); DEPLOY_CPU_OPERATOR(Pool);
#ifdef USE_CUDA #ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(Pool); DEPLOY_CUDA_OPERATOR(Pool);
......
...@@ -27,7 +27,9 @@ class PoolOp final : public PoolOpBase<Context> { ...@@ -27,7 +27,9 @@ class PoolOp final : public PoolOpBase<Context> {
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
USE_POOL_FUNCTIONS; USE_POOL_FUNCTIONS;
void RunOnDevice() override; void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T> template <typename T>
void DoRunWithType(); void DoRunWithType();
...@@ -43,7 +45,9 @@ class PoolGradientOp final : public PoolOpBase<Context> { ...@@ -43,7 +45,9 @@ class PoolGradientOp final : public PoolOpBase<Context> {
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
USE_POOL_FUNCTIONS; USE_POOL_FUNCTIONS;
void RunOnDevice() override; void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T> template <typename T>
void DoRunWithType(); void DoRunWithType();
...@@ -70,7 +74,9 @@ class CuDNNPoolOp final : public CuDNNPoolOpBase<Context> { ...@@ -70,7 +74,9 @@ class CuDNNPoolOp final : public CuDNNPoolOpBase<Context> {
CUDNN_CHECK(cudnnDestroyPoolingDescriptor(pool_desc_)); CUDNN_CHECK(cudnnDestroyPoolingDescriptor(pool_desc_));
} }
void RunOnDevice() override; void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T> template <typename T>
void DoRunWithType(); void DoRunWithType();
...@@ -99,7 +105,9 @@ class CuDNNPoolGradientOp final : public CuDNNPoolOpBase<Context> { ...@@ -99,7 +105,9 @@ class CuDNNPoolGradientOp final : public CuDNNPoolOpBase<Context> {
CUDNN_CHECK(cudnnDestroyPoolingDescriptor(pool_desc_)); CUDNN_CHECK(cudnnDestroyPoolingDescriptor(pool_desc_));
} }
void RunOnDevice() override; void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T> template <typename T>
void DoRunWithType(); void DoRunWithType();
......
#ifdef USE_CUDNN #ifdef USE_CUDNN
#include "dragon/core/workspace.h"
#include "dragon/operators/vision/pool_op.h" #include "dragon/operators/vision/pool_op.h"
#include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -10,6 +12,56 @@ void CuDNNPoolOp<Context>::DoRunWithType() { ...@@ -10,6 +12,56 @@ void CuDNNPoolOp<Context>::DoRunWithType() {
ComputeOutShape(); ComputeOutShape();
auto &X = Input(0), *Y = Output(0); auto &X = Input(0), *Y = Output(0);
// CuDNN NHWC pooling is slow.
    // Temporarily fall back to the naive implementation.
if (data_format() == "NHWC" && mode_ == "AVG") {
if (num_axes_ == 1 || num_axes_ == 2) {
kernels::AvgPool2d(
in_dims_[0],
in_dims_[1],
in_dims_[2],
num_axes_ == 1 ? 1 : in_dims_[3],
out_dims_[2],
num_axes_ == 1 ? 1 : out_dims_[3],
kshape_[0],
num_axes_ == 1 ? 1 : kshape_[1],
strides_[0],
num_axes_ == 1 ? 1 : strides_[1],
pads_begin_[0],
num_axes_ == 1 ? 0 : pads_begin_[1],
data_format(),
X.template data<T, Context>(),
Y->Reshape(out_shape_)->template mutable_data<T, Context>(),
ctx());
} else if (num_axes_ == 3) {
kernels::AvgPool3d(
in_dims_[0],
in_dims_[1],
in_dims_[2],
in_dims_[3],
in_dims_[4],
out_dims_[2],
out_dims_[3],
out_dims_[4],
kshape_[0],
kshape_[1],
kshape_[2],
strides_[0],
strides_[1],
strides_[2],
pads_begin_[0],
pads_begin_[1],
pads_begin_[2],
data_format(),
X.template data<T, Context>(),
Y->Reshape(out_shape_)->template mutable_data<T, Context>(),
ctx());
} else {
LOG(FATAL) << "AvgPool" << num_axes_ << "d is not supported.";
}
return;
}
SetPoolDesc(); SetPoolDesc();
CuDNNSetTensorDesc<T>(&input_desc_, X.dims(), data_format()); CuDNNSetTensorDesc<T>(&input_desc_, X.dims(), data_format());
CuDNNSetTensorDesc<T>(&output_desc_, out_shape_, data_format()); CuDNNSetTensorDesc<T>(&output_desc_, out_shape_, data_format());
...@@ -26,11 +78,6 @@ void CuDNNPoolOp<Context>::DoRunWithType() { ...@@ -26,11 +78,6 @@ void CuDNNPoolOp<Context>::DoRunWithType() {
} }
template <class Context> template <class Context>
void CuDNNPoolOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <class Context>
template <typename T> template <typename T>
void CuDNNPoolGradientOp<Context>::DoRunWithType() { void CuDNNPoolGradientOp<Context>::DoRunWithType() {
ComputeOutShape(); ComputeOutShape();
...@@ -56,11 +103,6 @@ void CuDNNPoolGradientOp<Context>::DoRunWithType() { ...@@ -56,11 +103,6 @@ void CuDNNPoolGradientOp<Context>::DoRunWithType() {
dX->ReshapeLike(X)->template mutable_data<T, Context>())); dX->ReshapeLike(X)->template mutable_data<T, Context>()));
} }
template <class Context>
void CuDNNPoolGradientOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
DEPLOY_CUDNN_OPERATOR(Pool); DEPLOY_CUDNN_OPERATOR(Pool);
DEPLOY_CUDNN_OPERATOR(PoolGradient); DEPLOY_CUDNN_OPERATOR(PoolGradient);
......
...@@ -60,12 +60,19 @@ void SpaceToDepthOp<Context>::DoRunWithType() { ...@@ -60,12 +60,19 @@ void SpaceToDepthOp<Context>::DoRunWithType() {
CHECK_EQ(X_reshape.count(), X.count()) CHECK_EQ(X_reshape.count(), X.count())
<< "\nCould not rearrange " << X.DimString() << " to " << "\nCould not rearrange " << X.DimString() << " to "
<< X_reshape.DimString() << " with block size " << block_size_ << "."; << X_reshape.DimString() << " with block size " << block_size_ << ".";
vec64_t transpose_dims, transpose_axes;
vec64_t X_strides(in_dims.size()); math::utils::CollapseTransposeAxes(
vec64_t Y_dims(in_dims.size()); X_reshape.ndim(),
for (int i = 0; i < X_reshape.ndim(); i++) { X_reshape.dims().data(),
X_strides[i] = X_reshape.stride(perm[i]); perm.data(),
Y_dims[i] = X_reshape.dim(perm[i]); transpose_dims,
transpose_axes);
Tensor X_collapse(transpose_dims);
num_dims = X_collapse.ndim();
vec64_t X_strides(num_dims), Y_dims(num_dims);
for (int i = 0; i < num_dims; ++i) {
X_strides[i] = X_collapse.stride(transpose_axes[i]);
Y_dims[i] = X_collapse.dim(transpose_axes[i]);
} }
auto* scratch = ((void*)&X == (void*)Y) auto* scratch = ((void*)&X == (void*)Y)
...@@ -73,7 +80,7 @@ void SpaceToDepthOp<Context>::DoRunWithType() { ...@@ -73,7 +80,7 @@ void SpaceToDepthOp<Context>::DoRunWithType() {
: Y->Reshape(out_shape)->template mutable_data<T, Context>(); : Y->Reshape(out_shape)->template mutable_data<T, Context>();
kernels::Transpose( kernels::Transpose(
X_strides.size(), num_dims,
X_strides.data(), X_strides.data(),
Y_dims.data(), Y_dims.data(),
X.template data<T, Context>(), X.template data<T, Context>(),
...@@ -135,12 +142,19 @@ void DepthToSpaceOp<Context>::DoRunWithType() { ...@@ -135,12 +142,19 @@ void DepthToSpaceOp<Context>::DoRunWithType() {
CHECK_EQ(X_reshape.count(), X.count()) CHECK_EQ(X_reshape.count(), X.count())
<< "\nCould not rearrange " << X.DimString() << " to " << "\nCould not rearrange " << X.DimString() << " to "
<< X_reshape.DimString() << " with block size " << block_size_ << "."; << X_reshape.DimString() << " with block size " << block_size_ << ".";
vec64_t transpose_dims, transpose_axes;
vec64_t X_strides(in_dims.size()); math::utils::CollapseTransposeAxes(
vec64_t Y_dims(in_dims.size()); X_reshape.ndim(),
for (int i = 0; i < in_dims.size(); i++) { X_reshape.dims().data(),
X_strides[i] = X_reshape.stride(perm[i]); perm.data(),
Y_dims[i] = X_reshape.dim(perm[i]); transpose_dims,
transpose_axes);
Tensor X_collapse(transpose_dims);
num_dims = X_collapse.ndim();
vec64_t X_strides(num_dims), Y_dims(num_dims);
for (int i = 0; i < num_dims; ++i) {
X_strides[i] = X_collapse.stride(transpose_axes[i]);
Y_dims[i] = X_collapse.dim(transpose_axes[i]);
} }
auto* scratch = ((void*)&X == (void*)Y) auto* scratch = ((void*)&X == (void*)Y)
...@@ -148,7 +162,7 @@ void DepthToSpaceOp<Context>::DoRunWithType() { ...@@ -148,7 +162,7 @@ void DepthToSpaceOp<Context>::DoRunWithType() {
: Y->Reshape(out_shape)->template mutable_data<T, Context>(); : Y->Reshape(out_shape)->template mutable_data<T, Context>();
kernels::Transpose( kernels::Transpose(
X_strides.size(), num_dims,
X_strides.data(), X_strides.data(),
Y_dims.data(), Y_dims.data(),
X.template data<T, Context>(), X.template data<T, Context>(),
......
...@@ -158,6 +158,129 @@ class CUDADeviceGuard { ...@@ -158,6 +158,129 @@ class CUDADeviceGuard {
int prev_id_; int prev_id_;
}; };
#define DISPATCH_FUNC_BY_VALUE_WITH_TYPE_1(Func, T, val, ...) \
do { \
switch (val) { \
case 1: { \
Func<T, 1>(__VA_ARGS__); \
break; \
} \
case 2: { \
Func<T, 2>(__VA_ARGS__); \
break; \
} \
case 3: { \
Func<T, 3>(__VA_ARGS__); \
break; \
} \
case 4: { \
Func<T, 4>(__VA_ARGS__); \
break; \
} \
case 5: { \
Func<T, 5>(__VA_ARGS__); \
break; \
} \
case 6: { \
Func<T, 6>(__VA_ARGS__); \
break; \
} \
case 7: { \
Func<T, 7>(__VA_ARGS__); \
break; \
} \
case 8: { \
Func<T, 8>(__VA_ARGS__); \
break; \
} \
default: { \
break; \
} \
} \
} while (false)
#define DISPATCH_FUNC_BY_VALUE_WITH_TYPE_2(Func, T1, T2, val, ...) \
do { \
switch (val) { \
case 1: { \
Func<T1, T2, 1>(__VA_ARGS__); \
break; \
} \
case 2: { \
Func<T1, T2, 2>(__VA_ARGS__); \
break; \
} \
case 3: { \
Func<T1, T2, 3>(__VA_ARGS__); \
break; \
} \
case 4: { \
Func<T1, T2, 4>(__VA_ARGS__); \
break; \
} \
case 5: { \
Func<T1, T2, 5>(__VA_ARGS__); \
break; \
} \
case 6: { \
Func<T1, T2, 6>(__VA_ARGS__); \
break; \
} \
case 7: { \
Func<T1, T2, 7>(__VA_ARGS__); \
break; \
} \
case 8: { \
Func<T1, T2, 8>(__VA_ARGS__); \
break; \
} \
default: { \
break; \
} \
} \
} while (false)
#define DISPATCH_FUNC_BY_VALUE_WITH_TYPE_3(Func, T1, T2, T3, val, ...) \
do { \
switch (val) { \
case 1: { \
Func<T1, T2, T3, 1>(__VA_ARGS__); \
break; \
} \
case 2: { \
Func<T1, T2, T3, 2>(__VA_ARGS__); \
break; \
} \
case 3: { \
Func<T1, T2, T3, 3>(__VA_ARGS__); \
break; \
} \
case 4: { \
Func<T1, T2, T3, 4>(__VA_ARGS__); \
break; \
} \
case 5: { \
Func<T1, T2, T3, 5>(__VA_ARGS__); \
break; \
} \
case 6: { \
Func<T1, T2, T3, 6>(__VA_ARGS__); \
break; \
} \
case 7: { \
Func<T1, T2, T3, 7>(__VA_ARGS__); \
break; \
} \
case 8: { \
Func<T1, T2, T3, 8>(__VA_ARGS__); \
break; \
} \
default: { \
break; \
} \
} \
} while (false)
#else #else
#define CUDA_NOT_COMPILED LOG(FATAL) << "CUDA library is not compiled with." #define CUDA_NOT_COMPILED LOG(FATAL) << "CUDA library is not compiled with."
......
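For context, the dispatch macros above are the mechanism the commit summary describes: a runtime dimension count is switched into a specialization whose rank is a compile-time constant, so loops bounded by that rank can be unrolled. Below is a minimal sketch of the calling pattern, assuming a hypothetical `_AffineImpl<T, D>` helper (the helper and its arguments are illustrative, not part of this commit):

// Sketch only: forward a runtime rank to a compile-time template parameter.
template <typename T, int D>
void _AffineImpl(const int N, const int64_t* dims, const T* x, T* y) {
  // Inside the real kernels, D bounds the per-element index loops,
  // so "#pragma unroll" can remove them entirely.
}

template <typename T>
void Affine(
    const int num_dims,
    const int N,
    const int64_t* dims,
    const T* x,
    T* y) {
  CUDA_TENSOR_DIMS_CHECK(num_dims); // the switch only covers ranks 1..8
  DISPATCH_FUNC_BY_VALUE_WITH_TYPE_1(_AffineImpl, T, num_dims, N, dims, x, y);
}

Note that the macros' default case is a silent no-op, so ranks outside 1..8 must be rejected before dispatching, as the launchers in this commit do with CUDA_TENSOR_DIMS_CHECK.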
...@@ -62,7 +62,6 @@ template <typename T, typename AccT, class Reducer, int D> ...@@ -62,7 +62,6 @@ template <typename T, typename AccT, class Reducer, int D>
__global__ void _GenericReduce( __global__ void _GenericReduce(
const int rows, const int rows,
const int cols, const int cols,
const int num_dims,
const SimpleArray<int, D> x_dims, const SimpleArray<int, D> x_dims,
const SimpleArray<int, D> x_strides, const SimpleArray<int, D> x_strides,
const Reducer reducer, const Reducer reducer,
...@@ -75,7 +74,8 @@ __global__ void _GenericReduce( ...@@ -75,7 +74,8 @@ __global__ void _GenericReduce(
AccT val = init; AccT val = init;
CUDA_2D_KERNEL_LOOP2(j, cols) { CUDA_2D_KERNEL_LOOP2(j, cols) {
int xi = 0, c = i * cols + j; int xi = 0, c = i * cols + j;
for (int d = num_dims - 1; d >= 0; --d) { #pragma unroll
for (int d = D - 1; d >= 0; --d) {
int r; int r;
FIXED_DIVISOR_DIV_MOD(x_dims.data[d], c, &c, &r); FIXED_DIVISOR_DIV_MOD(x_dims.data[d], c, &c, &r);
xi += r * x_strides.data[d]; xi += r * x_strides.data[d];
...@@ -89,6 +89,46 @@ __global__ void _GenericReduce( ...@@ -89,6 +89,46 @@ __global__ void _GenericReduce(
} }
} }
template <typename T, typename AccT, class Reducer, int D>
void _GenericReduceImpl(
const int* dims,
const int num_axes,
const int* axes,
const Reducer reducer,
const AccT init,
const AccT scale,
const T* x,
T* y,
CUDAContext* ctx) {
SimpleArray<int, D> transpose_axes;
SimpleArray<int, D> transpose_strides;
SimpleArray<int, D> transpose_dims;
math::utils::TransposeAxesForReduce(D, num_axes, axes, transpose_axes.data);
math::utils::ComputeTransposeStrides(
D, dims, transpose_axes.data, transpose_strides.data);
int rows = 1, cols = 1;
const int pivot = D - num_axes;
for (int i = 0; i < pivot; ++i) {
rows *= dims[transpose_axes.data[i]];
}
for (int i = pivot; i < D; ++i) {
cols *= dims[transpose_axes.data[i]];
}
for (int i = 0; i < D; ++i) {
transpose_dims.data[i] = dims[transpose_axes.data[i]];
}
_GenericReduce<<<rows, CUDA_THREADS, 0, ctx->cuda_stream()>>>(
rows,
cols,
transpose_dims,
transpose_strides,
reducer,
init,
scale,
x,
y);
}
#define DEFINE_REDUCE_DISPATCHER(name) \ #define DEFINE_REDUCE_DISPATCHER(name) \
template <typename T, typename AccT, typename Reducer> \ template <typename T, typename AccT, typename Reducer> \
void _Reduce##name( \ void _Reduce##name( \
...@@ -120,35 +160,21 @@ __global__ void _GenericReduce( ...@@ -120,35 +160,21 @@ __global__ void _GenericReduce(
return; \ return; \
} \ } \
CUDA_TENSOR_DIMS_CHECK(num_dims); \ CUDA_TENSOR_DIMS_CHECK(num_dims); \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> transpose_axes; \ DISPATCH_FUNC_BY_VALUE_WITH_TYPE_3( \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> transpose_strides; \ _GenericReduceImpl, \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> transpose_dims; \ T, \
math::utils::TransposeAxesForReduce( \ AccT, \
num_dims, num_axes, axes, transpose_axes.data); \ Reducer, \
math::utils::ComputeTransposeStrides( \
num_dims, dims, transpose_axes.data, transpose_strides.data); \
rows = cols = 1; \
const int pivot = num_dims - num_axes; \
for (int i = 0; i < pivot; ++i) { \
rows *= dims[transpose_axes.data[i]]; \
} \
for (int i = pivot; i < num_dims; ++i) { \
cols *= dims[transpose_axes.data[i]]; \
} \
for (int i = 0; i < num_dims; ++i) { \
transpose_dims.data[i] = dims[transpose_axes.data[i]]; \
} \
_GenericReduce<<<rows, CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
rows, \
cols, \
num_dims, \ num_dims, \
transpose_dims, \ dims, \
transpose_strides, \ num_axes, \
axes, \
reducer, \ reducer, \
init, \ init, \
scale, \ scale, \
x, \ x, \
y); \ y, \
ctx); \
} }
DEFINE_REDUCE_DISPATCHER(Max); DEFINE_REDUCE_DISPATCHER(Max);
......
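The switch from the runtime `num_dims` to the template constant `D` as the loop bound in `_GenericReduce` is what makes the `#pragma unroll` above effective. As a rough, hand-written illustration (not compiler output), for `D == 2` the per-element index computation reduces to straight-line code:

// Fully unrolled form of the indexing loop when D == 2:
// no loop counter and no branch, just two divisor/modulus steps
// accumulating the strided offset.
int r;
FIXED_DIVISOR_DIV_MOD(x_dims.data[1], c, &c, &r);
xi += r * x_strides.data[1];
FIXED_DIVISOR_DIV_MOD(x_dims.data[0], c, &c, &r);
xi += r * x_strides.data[0];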
...@@ -311,14 +311,41 @@ inline void ComputeTransposeStrides( ...@@ -311,14 +311,41 @@ inline void ComputeTransposeStrides(
} }
} }
template <typename DimT, typename AxisT>
inline void CollapseTransposeAxes(
const int num_dims,
const DimT* dims,
const AxisT* axes,
vector<DimT>& new_dims,
vector<AxisT>& new_axes) {
new_dims = vector<DimT>(dims, dims + num_dims);
new_axes = vector<AxisT>({axes[0]});
vector<AxisT> collapse_axes;
for (int i = 1; i < num_dims; ++i) {
if (axes[i] - 1 == axes[i - 1]) {
collapse_axes.push_back(axes[i]);
new_dims[axes[i]] *= new_dims[axes[i] - 1];
new_dims[axes[i] - 1] = -1;
} else {
new_axes.push_back(axes[i]);
}
}
const auto& erase_iter = std::remove_if(
new_dims.begin(), new_dims.end(), [](int x) { return x == -1; });
new_dims.erase(erase_iter, new_dims.end());
for (int i = 0; i < new_axes.size(); ++i) {
for (auto collapse_axis : collapse_axes) {
if (new_axes[i] > collapse_axis) new_axes[i]--;
}
}
}
template <typename DimT, typename IndexT> template <typename DimT, typename IndexT>
inline IndexT inline IndexT
GetIndexFromDims(const int num_dims, const DimT* dims, IndexT* index) { GetIndexFromDims(const int num_dims, const DimT* dims, IndexT* index) {
IndexT ret = 0; IndexT ret = 0;
for (int i = 0; i < num_dims; ++i) { for (int i = 0; i < num_dims; ++i) {
if (dims[i] > 1) { if (dims[i] > 1) ret = ret * dims[i] + index[i];
ret = ret * dims[i] + index[i];
}
} }
return ret; return ret;
} }
......
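A small worked example of `CollapseTransposeAxes` (the shapes are chosen only for illustration): permutation axes that remain adjacent and in order are merged, which is why the transpose-based operators above can call the kernels with a lower rank.

// A (2, 3, 4, 5) tensor permuted by (0, 2, 3, 1): axes 2 and 3 stay
// adjacent in the permutation, so they collapse into one axis of 4 * 5 = 20.
std::vector<int64_t> new_dims;
std::vector<int> new_axes;
const int64_t dims[] = {2, 3, 4, 5};
const int axes[] = {0, 2, 3, 1};
math::utils::CollapseTransposeAxes(4, dims, axes, new_dims, new_axes);
// Result: new_dims == {2, 3, 20}, new_axes == {0, 2, 1}; the same
// permutation expressed over 3 axes instead of 4.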
...@@ -267,7 +267,7 @@ def uniform_(tensor, a=0, b=1): ...@@ -267,7 +267,7 @@ def uniform_(tensor, a=0, b=1):
---------- ----------
tensor : dragon.vm.torch.Tensor tensor : dragon.vm.torch.Tensor
The input tensor. The input tensor.
a : number, optional, default=-1 a : number, optional, default=0
The value to :math:`\alpha`. The value to :math:`\alpha`.
b : number, optional, default=1 b : number, optional, default=1
The value to :math:`\beta`. The value to :math:`\beta`.
......
...@@ -390,7 +390,7 @@ class MultiheadAttention(Module): ...@@ -390,7 +390,7 @@ class MultiheadAttention(Module):
self.in_proj_bias = Parameter(Tensor(3 * embed_dim)) self.in_proj_bias = Parameter(Tensor(3 * embed_dim))
else: else:
self.register_parameter('in_proj_bias', None) self.register_parameter('in_proj_bias', None)
self.out_proj = Linear(embed_dim, embed_dim, bias=True) self.out_proj = Linear(embed_dim, embed_dim, bias=bias)
self.reset_parameters() self.reset_parameters()
def reset_parameters(self): def reset_parameters(self):
......