Commit 46feba80 by Ting PAN

Instantiate dispatch template by value for crucial CUDA kernels

Summary:
This commit instantiates CUDA kernels with compile-time constant dimensions,
enabling loop unrolling and other compile-time optimizations.
1 parent 936c351b
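The launchers changed below replace hand-written dispatch (see the switch over num_dims removed from the Transpose launcher) with DISPATCH_FUNC_BY_VALUE_WITH_TYPE_1/_2 macros that map the runtime dimension count onto a compile-time template argument, so the per-dimension loops can carry #pragma unroll. The macro definition itself is not part of this diff; a minimal sketch of how such a by-value dispatcher could look, assuming up to 8 dimensions as in the removed switch (names here are illustrative, not the framework's):

// Hypothetical sketch only; the actual macro lives elsewhere in the tree.
// A runtime <num_dims> selects the case that instantiates <Func> with a
// constant template parameter D, so kernels see the dimension count at
// compile time.
#define DISPATCH_CASE_BY_VALUE(Func, T, D, ...) \
  case D:                                       \
    Func<T, D>(__VA_ARGS__);                    \
    break;

#define DISPATCH_FUNC_BY_VALUE_WITH_TYPE_1(Func, T, num_dims, ...) \
  switch (num_dims) {                                              \
    DISPATCH_CASE_BY_VALUE(Func, T, 1, __VA_ARGS__)                \
    DISPATCH_CASE_BY_VALUE(Func, T, 2, __VA_ARGS__)                \
    DISPATCH_CASE_BY_VALUE(Func, T, 3, __VA_ARGS__)                \
    DISPATCH_CASE_BY_VALUE(Func, T, 4, __VA_ARGS__)                \
    DISPATCH_CASE_BY_VALUE(Func, T, 5, __VA_ARGS__)                \
    DISPATCH_CASE_BY_VALUE(Func, T, 6, __VA_ARGS__)                \
    DISPATCH_CASE_BY_VALUE(Func, T, 7, __VA_ARGS__)                \
    DISPATCH_CASE_BY_VALUE(Func, T, 8, __VA_ARGS__)                \
    default:                                                       \
      LOG(FATAL) << "Unsupported num_dims: " << num_dims;          \
  }
// A _WITH_TYPE_2 variant would forward two type arguments (e.g. T, AccT)
// the same way ahead of the constant D.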
FROM ubuntu:16.04
FROM ubuntu:18.04
RUN \
apt-get update && apt-get install -y \
......@@ -43,8 +43,8 @@ RUN \
-DPYTHON_EXECUTABLE=/usr/bin/python3 \
-DUSE_CUDA=OFF \
-DUSE_CUDNN=OFF \
-DUSE_AVX2=OFF \
-DUSE_FMA=OFF && \
-DUSE_AVX2=ON \
-DUSE_FMA=ON && \
make install -j $(nproc) && \
cd .. && rm -rf build && \
python3 setup.py install
......
FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04
FROM nvidia/cuda:10.2-cudnn8-devel-ubuntu18.04
RUN \
rm /etc/apt/sources.list.d/cuda.list && \
......@@ -48,8 +48,8 @@ RUN \
-DPYTHON_EXECUTABLE=/usr/bin/python3 \
-DUSE_MPI=ON \
-DUSE_NCCL=ON \
-DUSE_AVX2=OFF \
-DUSE_FMA=OFF && \
-DUSE_AVX2=ON \
-DUSE_FMA=ON && \
make install -j $(nproc) && \
cd .. && rm -rf build && \
python3 setup.py install
......
......@@ -62,10 +62,6 @@ class CUDAObjects {
} else {
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
}
#elif CUDA_VERSION >= 9000
if (TENSOR_CORE_AVAILABLE()) {
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
}
#endif
}
return handles[stream_id];
......@@ -437,7 +433,8 @@ class DRAGON_API CUDAContext {
CUDA_NOT_COMPILED;
}
/*! \brief Switch to the device and select given stream in current thread */
/*! \brief Switch to the device and select given stream in current
* thread */
void SwitchToDevice(int stream_id) {
CUDA_NOT_COMPILED;
}
......
......@@ -13,7 +13,6 @@ namespace {
template <typename T, int D>
__global__ void _ConstPad(
const int N,
const int num_dims,
const SimpleArray<int, D> X_dims,
const SimpleArray<int, D> X_strides,
const SimpleArray<int, D> Y_dims,
......@@ -23,7 +22,8 @@ __global__ void _ConstPad(
T* y) {
CUDA_1D_KERNEL_LOOP(yi, N) {
int xi = 0, tmp = yi, d;
for (d = num_dims - 1; d >= 0; --d) {
#pragma unroll
for (d = D - 1; d >= 0; --d) {
int r;
FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r);
r -= X_pads.data[d];
......@@ -37,7 +37,6 @@ __global__ void _ConstPad(
template <typename T, int D>
__global__ void _ReflectPad(
const int N,
const int num_dims,
const SimpleArray<int, D> X_dims,
const SimpleArray<int, D> X_strides,
const SimpleArray<int, D> Y_dims,
......@@ -46,7 +45,8 @@ __global__ void _ReflectPad(
T* y) {
CUDA_1D_KERNEL_LOOP(yi, N) {
int xi = 0, tmp = yi;
for (int d = num_dims - 1; d >= 0; --d) {
#pragma unroll
for (int d = D - 1; d >= 0; --d) {
int r;
FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r);
r -= X_pads.data[d];
......@@ -61,7 +61,6 @@ __global__ void _ReflectPad(
template <typename T, int D>
__global__ void _EdgePad(
const int N,
const int num_dims,
const SimpleArray<int, D> X_dims,
const SimpleArray<int, D> X_strides,
const SimpleArray<int, D> Y_dims,
......@@ -70,7 +69,8 @@ __global__ void _EdgePad(
T* y) {
CUDA_1D_KERNEL_LOOP(yi, N) {
int xi = 0, tmp = yi;
for (int d = num_dims - 1; d >= 0; --d) {
#pragma unroll
for (int d = D - 1; d >= 0; --d) {
int r;
FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r);
r = min(X_dims.data[d] - 1, max(r - X_pads.data[d], 0));
......@@ -80,13 +80,47 @@ __global__ void _EdgePad(
}
}
template <typename T, int D>
void _PadImpl(
const int64_t* x_dims,
const int64_t* x_strides,
const int64_t* y_dims,
const int64_t* pads,
const float value,
const string& mode,
const T* x,
T* y,
CUDAContext* ctx) {
SimpleArray<int, D> X_dims, X_strides, Y_dims, X_pads;
const auto N =
std::accumulate(y_dims, y_dims + D, 1, std::multiplies<int64_t>());
for (int i = 0; i < D; ++i) {
X_dims.data[i] = x_dims[i];
X_strides.data[i] = x_strides[i];
Y_dims.data[i] = y_dims[i];
X_pads.data[i] = pads[i];
}
if (mode == "ConstPad") {
_ConstPad<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, X_dims, X_strides, Y_dims, X_pads, convert::To<T>(value), x, y);
} else if (mode == "ReflectPad") {
_ReflectPad<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, X_dims, X_strides, Y_dims, X_pads, x, y);
} else if (mode == "EdgePad") {
_EdgePad<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, X_dims, X_strides, Y_dims, X_pads, x, y);
} else {
LOG(FATAL) << "Unknown Pad: " << mode << ".";
}
}
} // namespace
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_CONST_KERNEL_LAUNCHER(T) \
#define DEFINE_KERNEL_LAUNCHER(name, T) \
template <> \
void ConstPad<T, CUDAContext>( \
void name<T, CUDAContext>( \
const int num_dims, \
const int64_t* x_dims, \
const int64_t* x_strides, \
......@@ -97,27 +131,31 @@ __global__ void _EdgePad(
T* y, \
CUDAContext* ctx) { \
CUDA_TENSOR_DIMS_CHECK(num_dims); \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_dims, X_strides, Y_dims, X_pads; \
const auto N = std::accumulate( \
y_dims, y_dims + num_dims, 1, std::multiplies<int64_t>()); \
for (int i = 0; i < num_dims; ++i) { \
X_dims.data[i] = x_dims[i]; \
X_strides.data[i] = x_strides[i]; \
Y_dims.data[i] = y_dims[i]; \
X_pads.data[i] = pads[i]; \
} \
_ConstPad<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
DISPATCH_FUNC_BY_VALUE_WITH_TYPE_1( \
_PadImpl, \
T, \
num_dims, \
X_dims, \
X_strides, \
Y_dims, \
X_pads, \
convert::To<T>(value), \
x_dims, \
x_strides, \
y_dims, \
pads, \
value, \
#name, \
x, \
y); \
y, \
ctx); \
}
DEFINE_KERNEL_LAUNCHER(ConstPad, bool);
DEFINE_KERNEL_LAUNCHER(ConstPad, uint8_t);
DEFINE_KERNEL_LAUNCHER(ConstPad, int8_t);
DEFINE_KERNEL_LAUNCHER(ConstPad, int);
DEFINE_KERNEL_LAUNCHER(ConstPad, int64_t);
DEFINE_KERNEL_LAUNCHER(ConstPad, float16);
DEFINE_KERNEL_LAUNCHER(ConstPad, float);
DEFINE_KERNEL_LAUNCHER(ConstPad, double);
#undef DEFINE_KERNEL_LAUNCHER
#define DEFINE_KERNEL_LAUNCHER(name, T) \
template <> \
void name<T, CUDAContext>( \
......@@ -130,27 +168,21 @@ __global__ void _EdgePad(
T* y, \
CUDAContext* ctx) { \
CUDA_TENSOR_DIMS_CHECK(num_dims); \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_dims, X_strides, Y_dims, X_pads; \
const auto N = std::accumulate( \
y_dims, y_dims + num_dims, 1, std::multiplies<int64_t>()); \
for (int i = 0; i < num_dims; ++i) { \
X_dims.data[i] = x_dims[i]; \
X_strides.data[i] = x_strides[i]; \
Y_dims.data[i] = y_dims[i]; \
X_pads.data[i] = pads[i]; \
} \
_##name<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, num_dims, X_dims, X_strides, Y_dims, X_pads, x, y); \
DISPATCH_FUNC_BY_VALUE_WITH_TYPE_1( \
_PadImpl, \
T, \
num_dims, \
x_dims, \
x_strides, \
y_dims, \
pads, \
0.f, \
#name, \
x, \
y, \
ctx); \
}
DEFINE_CONST_KERNEL_LAUNCHER(bool);
DEFINE_CONST_KERNEL_LAUNCHER(uint8_t);
DEFINE_CONST_KERNEL_LAUNCHER(int8_t);
DEFINE_CONST_KERNEL_LAUNCHER(int);
DEFINE_CONST_KERNEL_LAUNCHER(int64_t);
DEFINE_CONST_KERNEL_LAUNCHER(float16);
DEFINE_CONST_KERNEL_LAUNCHER(float);
DEFINE_CONST_KERNEL_LAUNCHER(double);
DEFINE_KERNEL_LAUNCHER(ReflectPad, bool);
DEFINE_KERNEL_LAUNCHER(ReflectPad, uint8_t);
DEFINE_KERNEL_LAUNCHER(ReflectPad, int8_t);
......@@ -167,7 +199,6 @@ DEFINE_KERNEL_LAUNCHER(EdgePad, int64_t);
DEFINE_KERNEL_LAUNCHER(EdgePad, float16);
DEFINE_KERNEL_LAUNCHER(EdgePad, float);
DEFINE_KERNEL_LAUNCHER(EdgePad, double);
#undef DEFINE_CONST_KERNEL_LAUNCHER
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels
......
......@@ -10,27 +10,48 @@ namespace kernels {
namespace {
template <typename T, int D>
template <typename T, typename AccT, int D>
__global__ void _ReduceSumGrad(
const int N,
const int num_dims,
const SimpleArray<int, D> X_dims,
const SimpleArray<int, D> Y_dims,
const SimpleArray<int, D> Y_strides,
const float scale,
const AccT scale,
const T* dy,
T* dx) {
CUDA_1D_KERNEL_LOOP(xi, N) {
int yi = 0, tmp = xi;
for (int d = num_dims - 1; d >= 0; --d) {
#pragma unroll
for (int d = D - 1; d >= 0; --d) {
int r;
FIXED_DIVISOR_DIV_MOD(X_dims.data[d], tmp, &tmp, &r);
yi += (r % Y_dims.data[d]) * Y_strides.data[d];
}
dx[xi] = convert::To<T>(convert::To<float>(__ldg(dy + yi)) * scale);
dx[xi] = convert::To<T>(convert::To<AccT>(__ldg(dy + yi)) * scale);
}
}
template <typename T, typename AccT, int D>
void _ReduceSumGradImpl(
const int64_t* x_dims,
const int64_t* y_dims,
const int64_t* y_strides,
const AccT scale,
const T* dy,
T* dx,
CUDAContext* ctx) {
SimpleArray<int, D> X_dims, Y_dims, Y_strides;
const auto N =
std::accumulate(x_dims, x_dims + D, 1, std::multiplies<int64_t>());
for (int i = 0; i < D; ++i) {
X_dims.data[i] = x_dims[i];
Y_dims.data[i] = y_dims[i];
Y_strides.data[i] = y_strides[i];
}
_ReduceSumGrad<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, X_dims, Y_dims, Y_strides, scale, dy, dx);
}
} // namespace
/* ------------------- Launcher Separator ------------------- */
......@@ -47,23 +68,18 @@ __global__ void _ReduceSumGrad(
T* dx, \
CUDAContext* ctx) { \
CUDA_TENSOR_DIMS_CHECK(num_dims); \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_dims, Y_dims, Y_strides; \
const auto N = std::accumulate( \
x_dims, x_dims + num_dims, 1, std::multiplies<int64_t>()); \
for (int i = 0; i < num_dims; ++i) { \
X_dims.data[i] = x_dims[i]; \
Y_dims.data[i] = y_dims[i]; \
Y_strides.data[i] = y_strides[i]; \
} \
_ReduceSumGrad<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
DISPATCH_FUNC_BY_VALUE_WITH_TYPE_2( \
_ReduceSumGradImpl, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
num_dims, \
X_dims, \
Y_dims, \
Y_strides, \
scale, \
x_dims, \
y_dims, \
y_strides, \
convert::To<math::AccmulatorType<T>::type>(scale), \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<math::ScalarType<T>::type*>(dx)); \
reinterpret_cast<math::ScalarType<T>::type*>(dx), \
ctx); \
}
DEFINE_GRAD_KERNEL_LAUNCHER(float16);
......
......@@ -12,7 +12,6 @@ namespace {
template <typename T, int D>
__global__ void _Roll(
const int N,
const int num_dims,
const SimpleArray<int, D> X_shifts,
const SimpleArray<int, D> X_strides,
const SimpleArray<int, D> Y_dims,
......@@ -20,7 +19,8 @@ __global__ void _Roll(
T* y) {
CUDA_1D_KERNEL_LOOP(yi, N) {
int xi = 0, tmp = yi;
for (int d = num_dims - 1; d >= 0; --d) {
#pragma unroll
for (int d = D - 1; d >= 0; --d) {
int r;
FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r);
r -= X_shifts.data[d];
......@@ -31,6 +31,26 @@ __global__ void _Roll(
}
}
template <typename T, int D>
void _RollImpl(
const int64_t* x_shifts,
const int64_t* x_strides,
const int64_t* y_dims,
const T* x,
T* y,
CUDAContext* ctx) {
SimpleArray<int, D> X_shifts, X_strides, Y_dims;
const auto N =
std::accumulate(y_dims, y_dims + D, 1, std::multiplies<int64_t>());
for (int i = 0; i < D; ++i) {
X_shifts.data[i] = x_shifts[i];
X_strides.data[i] = x_strides[i];
Y_dims.data[i] = y_dims[i];
}
_Roll<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, X_shifts, X_strides, Y_dims, x, y);
}
} // namespace
/* ------------------- Launcher Separator ------------------- */
......@@ -46,18 +66,8 @@ __global__ void _Roll(
T* y, \
CUDAContext* ctx) { \
CUDA_TENSOR_DIMS_CHECK(num_dims); \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_shifts; \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_strides; \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> Y_dims; \
const auto N = std::accumulate( \
y_dims, y_dims + num_dims, 1, std::multiplies<int64_t>()); \
for (int i = 0; i < num_dims; ++i) { \
X_shifts.data[i] = x_shifts[i]; \
X_strides.data[i] = x_strides[i]; \
Y_dims.data[i] = y_dims[i]; \
} \
_Roll<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, num_dims, X_shifts, X_strides, Y_dims, x, y); \
DISPATCH_FUNC_BY_VALUE_WITH_TYPE_1( \
_RollImpl, T, num_dims, x_shifts, x_strides, y_dims, x, y, ctx); \
}
DEFINE_KERNEL_LAUNCHER(bool);
......
......@@ -13,7 +13,6 @@ namespace {
template <typename T, int D>
__global__ void _Slice(
const int N,
const int num_dims,
const SimpleArray<int, D> X_strides,
const SimpleArray<int, D> Y_dims,
const SimpleArray<int, D> X_starts,
......@@ -21,7 +20,8 @@ __global__ void _Slice(
T* y) {
CUDA_1D_KERNEL_LOOP(yi, N) {
int xi = 0, tmp = yi;
for (int d = num_dims - 1; d >= 0; --d) {
#pragma unroll
for (int d = D - 1; d >= 0; --d) {
int r;
FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r);
xi += (r + X_starts.data[d]) * X_strides.data[d];
......@@ -33,7 +33,6 @@ __global__ void _Slice(
template <typename T, int D>
__global__ void _SliceGrad(
const int N,
const int num_dims,
const SimpleArray<int, D> X_strides,
const SimpleArray<int, D> Y_dims,
const SimpleArray<int, D> X_starts,
......@@ -41,7 +40,8 @@ __global__ void _SliceGrad(
T* dx) {
CUDA_1D_KERNEL_LOOP(yi, N) {
int xi = 0, tmp = yi;
for (int d = num_dims - 1; d >= 0; --d) {
#pragma unroll
for (int d = D - 1; d >= 0; --d) {
int r;
FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r);
xi += (r + X_starts.data[d]) * X_strides.data[d];
......@@ -50,6 +50,32 @@ __global__ void _SliceGrad(
}
}
template <typename T, int D>
void _SliceImpl(
const string& routine,
const int64_t* x_strides,
const int64_t* y_dims,
const int64_t* starts,
const T* x,
T* y,
CUDAContext* ctx) {
SimpleArray<int, D> X_strides, Y_dims, X_starts;
const auto N =
std::accumulate(y_dims, y_dims + D, 1, std::multiplies<int64_t>());
for (int i = 0; i < D; ++i) {
X_strides.data[i] = x_strides[i];
Y_dims.data[i] = y_dims[i];
X_starts.data[i] = starts[i];
}
if (routine == "Slice") {
_Slice<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, X_strides, Y_dims, X_starts, x, y);
} else if (routine == "SliceGrad") {
_SliceGrad<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, X_strides, Y_dims, X_starts, x, y);
}
}
} // namespace
/* ------------------- Launcher Separator ------------------- */
......@@ -65,16 +91,8 @@ __global__ void _SliceGrad(
T* y, \
CUDAContext* ctx) { \
CUDA_TENSOR_DIMS_CHECK(num_dims); \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_strides, Y_dims, X_starts; \
const auto N = std::accumulate( \
y_dims, y_dims + num_dims, 1, std::multiplies<int64_t>()); \
for (int i = 0; i < num_dims; ++i) { \
X_strides.data[i] = x_strides[i]; \
Y_dims.data[i] = y_dims[i]; \
X_starts.data[i] = starts[i]; \
} \
_##name<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, num_dims, X_strides, Y_dims, X_starts, x, y); \
DISPATCH_FUNC_BY_VALUE_WITH_TYPE_1( \
_SliceImpl, T, num_dims, #name, x_strides, y_dims, starts, x, y, ctx); \
}
DEFINE_KERNEL_LAUNCHER(Slice, bool);
......
......@@ -31,12 +31,13 @@ __global__ void _Transpose(
template <typename T, int D>
void _TransposeImpl(
const int N,
const int64_t* x_strides,
const int64_t* y_dims,
const T* x,
T* y,
CUDAContext* ctx) {
const auto N =
std::accumulate(y_dims, y_dims + D, 1, std::multiplies<int64_t>());
SimpleArray<int, D> X_strides, Y_dims;
for (int i = 0; i < D; ++i) {
X_strides.data[i] = x_strides[i];
......@@ -60,36 +61,8 @@ void _TransposeImpl(
T* y, \
CUDAContext* ctx) { \
CUDA_TENSOR_DIMS_CHECK(num_dims); \
const auto N = std::accumulate( \
y_dims, y_dims + num_dims, 1, std::multiplies<int64_t>()); \
switch (num_dims) { \
case 1: \
_TransposeImpl<T, 1>(N, x_strides, y_dims, x, y, ctx); \
break; \
case 2: \
_TransposeImpl<T, 2>(N, x_strides, y_dims, x, y, ctx); \
break; \
case 3: \
_TransposeImpl<T, 3>(N, x_strides, y_dims, x, y, ctx); \
break; \
case 4: \
_TransposeImpl<T, 4>(N, x_strides, y_dims, x, y, ctx); \
break; \
case 5: \
_TransposeImpl<T, 5>(N, x_strides, y_dims, x, y, ctx); \
break; \
case 6: \
_TransposeImpl<T, 6>(N, x_strides, y_dims, x, y, ctx); \
break; \
case 7: \
_TransposeImpl<T, 7>(N, x_strides, y_dims, x, y, ctx); \
break; \
case 8: \
_TransposeImpl<T, 8>(N, x_strides, y_dims, x, y, ctx); \
break; \
default: \
break; \
} \
DISPATCH_FUNC_BY_VALUE_WITH_TYPE_1( \
_TransposeImpl, T, num_dims, x_strides, y_dims, x, y, ctx); \
}
DEFINE_KERNEL_LAUNCHER(bool);
......
......@@ -82,7 +82,7 @@ __global__ void _SoftmaxCrossEntropyGrad(
const int S,
const int C,
const int ignore_index,
const InputT* input,
const InputT* /* input */,
const TargetT* target,
InputT* dx,
InputT* mask) {
......
......@@ -38,7 +38,7 @@ __global__ void _NLLLossGrad(
const int S,
const int C,
const int ignore_index,
const InputT* input,
const InputT* /* input */,
const TargetT* target,
InputT* dx,
InputT* mask) {
......
......@@ -67,7 +67,6 @@ template <typename T, typename AccT, int D>
__global__ void _GenericMoments(
const int rows,
const int cols,
const int num_dims,
const SimpleArray<int, D> X_dims,
const SimpleArray<int, D> X_strides,
const T* x,
......@@ -80,7 +79,8 @@ __global__ void _GenericMoments(
AccT m_val = AccT(0), v_val = AccT(0);
CUDA_2D_KERNEL_LOOP2(j, cols) {
int xi = 0, c = i * cols + j;
for (int d = num_dims - 1; d >= 0; --d) {
#pragma unroll
for (int d = D - 1; d >= 0; --d) {
int r;
FIXED_DIVISOR_DIV_MOD(X_dims.data[d], c, &c, &r);
xi += r * X_strides.data[d];
......@@ -98,9 +98,8 @@ __global__ void _GenericMoments(
}
}
template <typename T, typename AccT>
void _Moments(
const int num_dims,
template <typename T, typename AccT, int D>
void _GenericMomentsImpl(
const int* dims,
const int num_axes,
const int* axes,
......@@ -108,44 +107,25 @@ void _Moments(
AccT* mean,
AccT* var,
CUDAContext* ctx) {
int rows, cols;
vec32_t out_dims(dims, dims + num_dims);
for (int i = 0; i < num_axes; ++i) {
out_dims[axes[i]] = 1;
}
if (math::utils::IsRowwiseReduce(
num_dims, dims, out_dims.data(), &rows, &cols)) {
_RowwiseMoments<<<cols, CUDA_THREADS, 0, ctx->cuda_stream()>>>(
rows, cols, x, mean, var);
return;
}
if (math::utils::IsColwiseReduce(
num_dims, dims, out_dims.data(), &rows, &cols)) {
_ColwiseMoments<<<rows, CUDA_THREADS, 0, ctx->cuda_stream()>>>(
rows, cols, x, mean, var);
return;
}
CUDA_TENSOR_DIMS_CHECK(num_dims);
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> transpose_axes;
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> transpose_strides;
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> transpose_dims;
math::utils::TransposeAxesForReduce(
num_dims, num_axes, axes, transpose_axes.data);
SimpleArray<int, D> transpose_axes;
SimpleArray<int, D> transpose_strides;
SimpleArray<int, D> transpose_dims;
math::utils::TransposeAxesForReduce(D, num_axes, axes, transpose_axes.data);
math::utils::ComputeTransposeStrides(
num_dims, dims, transpose_axes.data, transpose_strides.data);
rows = cols = 1;
const int pivot = num_dims - num_axes;
D, dims, transpose_axes.data, transpose_strides.data);
int rows = 1, cols = 1;
const int pivot = D - num_axes;
for (int i = 0; i < pivot; ++i) {
rows *= dims[transpose_axes.data[i]];
}
for (int i = pivot; i < num_dims; ++i) {
for (int i = pivot; i < D; ++i) {
cols *= dims[transpose_axes.data[i]];
}
for (int i = 0; i < num_dims; ++i) {
for (int i = 0; i < D; ++i) {
transpose_dims.data[i] = dims[transpose_axes.data[i]];
}
_GenericMoments<<<rows, CUDA_THREADS, 0, ctx->cuda_stream()>>>(
rows, cols, num_dims, transpose_dims, transpose_strides, x, mean, var);
rows, cols, transpose_dims, transpose_strides, x, mean, var);
}
} // namespace
......@@ -163,12 +143,33 @@ void _Moments(
AccT* mean, \
AccT* var, \
CUDAContext* ctx) { \
_Moments( \
int rows, cols; \
vec32_t out_dims(dims, dims + num_dims); \
for (int i = 0; i < num_axes; ++i) { \
out_dims[axes[i]] = 1; \
} \
if (math::utils::IsRowwiseReduce( \
num_dims, dims, out_dims.data(), &rows, &cols)) { \
_RowwiseMoments<<<cols, CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
rows, cols, x, mean, var); \
return; \
} \
if (math::utils::IsColwiseReduce( \
num_dims, dims, out_dims.data(), &rows, &cols)) { \
_ColwiseMoments<<<rows, CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
rows, cols, x, mean, var); \
return; \
} \
CUDA_TENSOR_DIMS_CHECK(num_dims); \
DISPATCH_FUNC_BY_VALUE_WITH_TYPE_2( \
_GenericMomentsImpl, \
T, \
AccT, \
num_dims, \
dims, \
num_axes, \
axes, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
x, \
mean, \
var, \
ctx); \
......
......@@ -7,7 +7,7 @@ namespace kernels {
namespace {
template <typename T>
template <typename T, typename AccT>
void _AvgPool2dNCHW(
const int N,
const int C,
......@@ -28,31 +28,30 @@ void _AvgPool2dNCHW(
const auto NxCxHoxWo = N * C * out_h * out_w;
std::array<int, 4> index = {0, 0, 0, 0};
std::array<int, 4> dims = {N, C, out_h, out_w};
T val, area;
int hstart, hend, wstart, wend;
for (int i = 0; i < NxCxHoxWo; ++i) {
hstart = index[2] * stride_h - pad_h;
wstart = index[3] * stride_w - pad_w;
hend = std::min(hstart + kernel_h, H + pad_h);
wend = std::min(wstart + kernel_w, W + pad_w);
area = (hend - hstart) * (wend - wstart);
const AccT area = (hend - hstart) * (wend - wstart);
hend = std::min(hend, H);
wend = std::min(wend, W);
hstart = std::max(hstart, 0);
wstart = std::max(wstart, 0);
val = T(0);
AccT val = AccT(0);
const T* offset_x = x + index[0] * CxHxW + index[1] * HxW;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
val += offset_x[h * W + w];
val += convert::To<AccT>(offset_x[h * W + w]);
}
}
y[i] = val / area;
y[i] = convert::To<T>(val / area);
math::utils::IncreaseIndexInDims(4, dims.data(), index.data());
}
}
template <typename T>
template <typename T, typename AccT>
void _AvgPool2dNHWC(
const int N,
const int C,
......@@ -72,29 +71,30 @@ void _AvgPool2dNHWC(
const auto NxHoxWoxC = N * C * out_h * out_w;
std::array<int, 4> index = {0, 0, 0, 0};
std::array<int, 4> dims = {N, out_h, out_w, C};
T val, area;
int hstart, hend, wstart, wend;
for (int i = 0; i < NxHoxWoxC; ++i) {
hstart = index[1] * stride_h - pad_h;
wstart = index[2] * stride_w - pad_w;
hend = std::min(hstart + kernel_h, H + pad_h);
wend = std::min(wstart + kernel_w, W + pad_w);
area = (hend - hstart) * (wend - wstart);
const AccT area = (hend - hstart) * (wend - wstart);
hend = std::min(hend, H);
wend = std::min(wend, W);
hstart = std::max(hstart, 0);
wstart = std::max(wstart, 0);
const T* offset_x = x + index[0] * HxWxC + index[3];
val = T(0);
for (int h = hstart; h < hend; ++h)
for (int w = wstart; w < wend; ++w)
val += offset_x[(h * W + w) * C];
y[i] = val / area;
AccT val = AccT(0);
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
val += convert::To<AccT>(offset_x[(h * W + w) * C]);
}
}
y[i] = convert::To<T>(val / area);
math::utils::IncreaseIndexInDims(4, dims.data(), index.data());
}
}
template <typename T>
template <typename T, typename AccT>
void _AvgPool2dGradNCHW(
const int N,
const int C,
......@@ -115,7 +115,6 @@ void _AvgPool2dGradNCHW(
const auto NxCxHoxWo = N * C * out_h * out_w;
std::array<int, 4> index = {0, 0, 0, 0};
std::array<int, 4> dims = {N, C, out_h, out_w};
T area;
int hstart, hend, wstart, wend, xi;
memset(dx, 0, sizeof(T) * N * CxHxW);
for (int i = 0; i < NxCxHoxWo; ++i) {
......@@ -123,22 +122,24 @@ void _AvgPool2dGradNCHW(
wstart = index[3] * stride_w - pad_w;
hend = std::min(hstart + kernel_h, H + pad_h);
wend = std::min(wstart + kernel_w, W + pad_w);
area = (hend - hstart) * (wend - wstart);
const AccT area = (hend - hstart) * (wend - wstart);
hend = std::min(hend, H);
wend = std::min(wend, W);
hstart = std::max(hstart, 0);
wstart = std::max(wstart, 0);
const AccT val = convert::To<AccT>(dy[i]) / area;
T* offset_dx = dx + index[0] * CxHxW + index[1] * HxW;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
offset_dx[h * W + w] += dy[i] / area;
const auto xi = h * W + w;
offset_dx[xi] = convert::To<T>(val + convert::To<AccT>(offset_dx[xi]));
}
}
math::utils::IncreaseIndexInDims(4, dims.data(), index.data());
}
}
template <typename T>
template <typename T, typename AccT>
void _AvgPool2dGradNHWC(
const int N,
const int C,
......@@ -158,7 +159,6 @@ void _AvgPool2dGradNHWC(
const auto NxHoxWoxC = N * C * out_h * out_w;
std::array<int, 4> index = {0, 0, 0, 0};
std::array<int, 4> dims = {N, out_h, out_w, C};
T area;
int hstart, hend, wstart, wend, xi;
memset(dx, 0, sizeof(T) * N * HxWxC);
for (int i = 0; i < NxHoxWoxC; ++i) {
......@@ -166,20 +166,24 @@ void _AvgPool2dGradNHWC(
wstart = index[2] * stride_w - pad_w;
hend = std::min(hstart + kernel_h, H + pad_h);
wend = std::min(wstart + kernel_w, W + pad_w);
area = (hend - hstart) * (wend - wstart);
const AccT area = (hend - hstart) * (wend - wstart);
hend = std::min(hend, H);
wend = std::min(wend, W);
hstart = std::max(hstart, 0);
wstart = std::max(wstart, 0);
const AccT val = convert::To<AccT>(dy[i]) / area;
T* offset_dx = dx + index[0] * HxWxC + index[3];
for (int h = hstart; h < hend; ++h)
for (int w = wstart; w < wend; ++w)
offset_dx[(h * W + w) * C] += dy[i] / area;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
const auto xi = (h * W + w) * C;
offset_dx[xi] = convert::To<T>(val + convert::To<AccT>(offset_dx[xi]));
}
}
math::utils::IncreaseIndexInDims(4, dims.data(), index.data());
}
}
template <typename T>
template <typename T, typename AccT>
void _AvgPool3dNCHW(
const int N,
const int C,
......@@ -205,7 +209,6 @@ void _AvgPool3dNCHW(
const auto NxCxDoxHoxWo = N * C * out_d * out_h * out_w;
std::array<int, 5> index = {0, 0, 0, 0, 0};
std::array<int, 5> dims = {N, C, out_d, out_h, out_w};
T val, area;
int dstart, dend, hstart, hend, wstart, wend;
for (int i = 0; i < NxCxDoxHoxWo; ++i) {
dstart = index[2] * stride_d - pad_d;
......@@ -214,28 +217,28 @@ void _AvgPool3dNCHW(
dend = std::min(dstart + kernel_d, D + pad_d);
hend = std::min(hstart + kernel_h, H + pad_h);
wend = std::min(wstart + kernel_w, W + pad_w);
area = (dend - dstart) * (hend - hstart) * (wend - wstart);
const AccT area = (dend - dstart) * (hend - hstart) * (wend - wstart);
dend = std::min(dend, D);
hend = std::min(hend, H);
wend = std::min(wend, W);
dstart = std::max(dstart, 0);
hstart = std::max(hstart, 0);
wstart = std::max(wstart, 0);
val = T(0);
AccT val = AccT(0);
const T* offset_x = x + index[0] * CxDxHxW + index[1] * DxHxW;
for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
val += offset_x[(d * H + h) * W + w];
val += convert::To<AccT>(offset_x[(d * H + h) * W + w]);
}
}
}
y[i] = val / area;
y[i] = convert::To<T>(val / area);
math::utils::IncreaseIndexInDims(5, dims.data(), index.data());
}
}
template <typename T>
template <typename T, typename AccT>
void _AvgPool3dNHWC(
const int N,
const int C,
......@@ -260,7 +263,6 @@ void _AvgPool3dNHWC(
const auto NxDoxHoxWoxC = N * C * out_d * out_h * out_w;
std::array<int, 5> index = {0, 0, 0, 0, 0};
std::array<int, 5> dims = {N, out_d, out_h, out_w, C};
T val, area;
int dstart, dend, hstart, hend, wstart, wend;
for (int i = 0; i < NxDoxHoxWoxC; ++i) {
dstart = index[1] * stride_d - pad_d;
......@@ -269,7 +271,7 @@ void _AvgPool3dNHWC(
dend = std::min(dstart + kernel_d, D + pad_d);
hend = std::min(hstart + kernel_h, H + pad_h);
wend = std::min(wstart + kernel_w, W + pad_w);
area = (dend - dstart) * (hend - hstart) * (wend - wstart);
const AccT area = (dend - dstart) * (hend - hstart) * (wend - wstart);
dend = std::min(dend, D);
hend = std::min(hend, H);
wend = std::min(wend, W);
......@@ -277,20 +279,20 @@ void _AvgPool3dNHWC(
hstart = std::max(hstart, 0);
wstart = std::max(wstart, 0);
const T* offset_x = x + index[0] * DxHxWxC + index[4];
val = T(0);
AccT val = AccT(0);
for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
val += offset_x[((d * H + h) * W + w) * C];
val += convert::To<AccT>(offset_x[((d * H + h) * W + w) * C]);
}
}
}
y[i] = val / area;
y[i] = convert::To<T>(val / area);
math::utils::IncreaseIndexInDims(5, dims.data(), index.data());
}
}
template <typename T>
template <typename T, typename AccT>
void _AvgPool3dGradNCHW(
const int N,
const int C,
......@@ -316,7 +318,6 @@ void _AvgPool3dGradNCHW(
const auto NxCxDoxHoxWo = N * C * out_d * out_h * out_w;
std::array<int, 5> index = {0, 0, 0, 0, 0};
std::array<int, 5> dims = {N, C, out_d, out_h, out_w};
T area;
int dstart, dend, hstart, hend, wstart, wend, xi;
memset(dx, 0, sizeof(T) * N * CxDxHxW);
for (int i = 0; i < NxCxDoxHoxWo; ++i) {
......@@ -326,18 +327,21 @@ void _AvgPool3dGradNCHW(
dend = std::min(dstart + kernel_d, D + pad_d);
hend = std::min(hstart + kernel_h, H + pad_h);
wend = std::min(wstart + kernel_w, W + pad_w);
area = (dend - dstart) * (hend - hstart) * (wend - wstart);
const AccT area = (dend - dstart) * (hend - hstart) * (wend - wstart);
dend = std::min(dend, D);
hend = std::min(hend, H);
wend = std::min(wend, W);
dstart = std::max(dstart, 0);
hstart = std::max(hstart, 0);
wstart = std::max(wstart, 0);
const AccT val = convert::To<AccT>(dy[i]) / area;
T* offset_dx = dx + index[0] * CxDxHxW + index[1] * DxHxW;
for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
offset_dx[((d * H) + h) * W + w] += dy[i] / area;
const auto xi = ((d * H) + h) * W + w;
offset_dx[xi] =
convert::To<T>(val + convert::To<AccT>(offset_dx[xi]));
}
}
}
......@@ -345,7 +349,7 @@ void _AvgPool3dGradNCHW(
}
}
template <typename T>
template <typename T, typename AccT>
void _AvgPool3dGradNHWC(
const int N,
const int C,
......@@ -370,7 +374,6 @@ void _AvgPool3dGradNHWC(
const auto NxDoxHoxWoxC = N * C * out_d * out_h * out_w;
std::array<int, 5> index = {0, 0, 0, 0, 0};
std::array<int, 5> dims = {N, out_d, out_h, out_w, C};
T area;
int dstart, dend, hstart, hend, wstart, wend, xi;
memset(dx, 0, sizeof(T) * N * DxHxWxC);
for (int i = 0; i < NxDoxHoxWoxC; ++i) {
......@@ -380,18 +383,21 @@ void _AvgPool3dGradNHWC(
dend = std::min(dstart + kernel_d, D + pad_d);
hend = std::min(hstart + kernel_h, H + pad_h);
wend = std::min(wstart + kernel_w, W + pad_w);
area = (dend - dstart) * (hend - hstart) * (wend - wstart);
const AccT area = (dend - dstart) * (hend - hstart) * (wend - wstart);
dend = std::min(dend, D);
hend = std::min(hend, H);
wend = std::min(wend, W);
dstart = std::max(dstart, 0);
hstart = std::max(hstart, 0);
wstart = std::max(wstart, 0);
const AccT val = convert::To<AccT>(dy[i]) / area;
T* offset_dx = dx + index[0] * DxHxWxC + index[4];
for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
offset_dx[((d * H + h) * W + w) * C] += dy[i] / area;
const auto xi = ((d * H + h) * W + w) * C;
offset_dx[xi] =
convert::To<T>(val + convert::To<AccT>(offset_dx[xi]));
}
}
}
......@@ -403,11 +409,11 @@ void _AvgPool3dGradNHWC(
/* ------------------- Launcher Separator ------------------- */
#define DISPATCH_POOL_KERNEL(name, ...) \
#define DISPATCH_POOL_KERNEL(name, T, AccT, ...) \
if (data_format == "NCHW") { \
name##NCHW(__VA_ARGS__); \
name##NCHW<T, AccT>(__VA_ARGS__); \
} else if (data_format == "NHWC") { \
name##NHWC(__VA_ARGS__); \
name##NHWC<T, AccT>(__VA_ARGS__); \
} else { \
LOG(FATAL) << "Unknown DataFormat: " << data_format; \
}
......@@ -433,6 +439,8 @@ void _AvgPool3dGradNHWC(
CPUContext* ctx) { \
DISPATCH_POOL_KERNEL( \
_##name, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
N, \
C, \
H, \
......@@ -449,8 +457,10 @@ void _AvgPool3dGradNHWC(
y); \
}
DEFINE_KERNEL_LAUNCHER(AvgPool2d, float16);
DEFINE_KERNEL_LAUNCHER(AvgPool2d, float);
DEFINE_KERNEL_LAUNCHER(AvgPool2d, double);
DEFINE_KERNEL_LAUNCHER(AvgPool2dGrad, float16); // AvgPool2dGrad
DEFINE_KERNEL_LAUNCHER(AvgPool2dGrad, float); // AvgPool2dGrad
DEFINE_KERNEL_LAUNCHER(AvgPool2dGrad, double); // AvgPool2dGrad
#undef DEFINE_KERNEL_LAUNCHER
......@@ -481,6 +491,8 @@ DEFINE_KERNEL_LAUNCHER(AvgPool2dGrad, double); // AvgPool2dGrad
CPUContext* ctx) { \
DISPATCH_POOL_KERNEL( \
_##name, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
N, \
C, \
D, \
......@@ -502,8 +514,10 @@ DEFINE_KERNEL_LAUNCHER(AvgPool2dGrad, double); // AvgPool2dGrad
y); \
}
DEFINE_KERNEL_LAUNCHER(AvgPool3d, float16);
DEFINE_KERNEL_LAUNCHER(AvgPool3d, float);
DEFINE_KERNEL_LAUNCHER(AvgPool3d, double);
DEFINE_KERNEL_LAUNCHER(AvgPool3dGrad, float16); // AvgPool3dGrad
DEFINE_KERNEL_LAUNCHER(AvgPool3dGrad, float); // AvgPool3dGrad
DEFINE_KERNEL_LAUNCHER(AvgPool3dGrad, double); // AvgPool3dGrad
#undef DEFINE_KERNEL_LAUNCHER
......
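The pooling kernels above, and the CUDA versions that follow, now accumulate in math::AccmulatorType<T>::type (e.g. float when T is float16) and convert back to T only on store, which is what makes the new float16 launchers viable. A standalone sketch of this accumulate-in-a-wider-type pattern, using plain static_cast in place of the framework's convert::To (names and types here are illustrative):

#include <cstddef>

// Sketch only: sum window elements in AccT, return the average narrowed to T.
template <typename T, typename AccT>
T AverageWindow(const T* x, std::size_t count) {
  AccT val = AccT(0);
  for (std::size_t i = 0; i < count; ++i) {
    val += static_cast<AccT>(x[i]); // widen each element before summing
  }
  return static_cast<T>(val / static_cast<AccT>(count)); // narrow on store
}

// e.g. AverageWindow<float, double>(window, 9) averages a 3x3 window while
// keeping the running sum in double.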
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......@@ -9,7 +10,9 @@ namespace kernels {
namespace {
template <typename T>
#define LDG(x, i) convert::To<AccT>(__ldg(x + i))
template <typename T, typename AccT>
__global__ void _AvgPool2dNCHW(
const int nthreads,
const int C,
......@@ -35,24 +38,24 @@ __global__ void _AvgPool2dNCHW(
int wstart = w_out * stride_w - pad_w;
int hend = min(hstart + kernel_h, H + pad_h);
int wend = min(wstart + kernel_w, W + pad_w);
const T area = (hend - hstart) * (wend - wstart);
const AccT area = (hend - hstart) * (wend - wstart);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
hend = min(hend, H);
wend = min(wend, W);
const T* offset_x = x + (n * C + c) * H * W;
T val = T(0);
AccT val = AccT(0);
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
val += offset_x[h * W + w];
val += LDG(offset_x, h * W + w);
}
}
y[yi] = val / area;
y[yi] = convert::To<T>(val / area);
}
}
template <typename T>
template <typename T, typename AccT>
__global__ void _AvgPool2dNHWC(
const int nthreads,
const int C,
......@@ -78,24 +81,24 @@ __global__ void _AvgPool2dNHWC(
int wstart = w_out * stride_w - pad_w;
int hend = min(hstart + kernel_h, H + pad_h);
int wend = min(wstart + kernel_w, W + pad_w);
const T area = (hend - hstart) * (wend - wstart);
const AccT area = (hend - hstart) * (wend - wstart);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
hend = min(hend, H);
wend = min(wend, W);
const T* offset_x = x + n * H * W * C + c;
T val = T(0);
AccT val = AccT(0);
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
val += offset_x[(h * W + w) * C];
val += LDG(offset_x, (h * W + w) * C);
}
}
y[yi] = val / area;
y[yi] = convert::To<T>(val / area);
}
}
template <typename T>
template <typename T, typename AccT>
__global__ void _AvgPool2dGradNCHW(
const int nthreads,
const int C,
......@@ -123,22 +126,22 @@ __global__ void _AvgPool2dGradNCHW(
const int out_wend = min(w / stride_w + 1, out_w);
const T* offset_dy = dy + (n * C + c) * out_h * out_w;
T val = T(0);
AccT val = AccT(0);
for (int h_out = out_hstart; h_out < out_hend; ++h_out) {
const int hstart = h_out * stride_h - pad_h;
const int hend = min(hstart + kernel_h, H + pad_h);
for (int w_out = out_wstart; w_out < out_wend; ++w_out) {
const int wstart = w_out * stride_w - pad_w;
const int wend = min(wstart + kernel_w, W + pad_w);
const T area = (hend - hstart) * (wend - wstart);
val += offset_dy[h_out * out_w + w_out] / area;
const AccT area = (hend - hstart) * (wend - wstart);
val += LDG(offset_dy, h_out * out_w + w_out) / area;
}
}
dx[xi] = val;
dx[xi] = convert::To<T>(val);
}
}
template <typename T>
template <typename T, typename AccT>
__global__ void _AvgPool2dGradNHWC(
const int nthreads,
const int C,
......@@ -166,22 +169,22 @@ __global__ void _AvgPool2dGradNHWC(
const int out_wend = min(w / stride_w + 1, out_w);
const T* offset_dy = dy + n * out_h * out_w * C + c;
T val = T(0);
AccT val = AccT(0);
for (int h_out = out_hstart; h_out < out_hend; ++h_out) {
const int hstart = h_out * stride_h - pad_h;
const int hend = min(hstart + kernel_h, H + pad_h);
for (int w_out = out_wstart; w_out < out_wend; ++w_out) {
const int wstart = w_out * stride_w - pad_w;
const int wend = min(wstart + kernel_w, W + pad_w);
const T area = (hend - hstart) * (wend - wstart);
val += offset_dy[(h_out * out_w + w_out) * C] / area;
const AccT area = (hend - hstart) * (wend - wstart);
val += LDG(offset_dy, (h_out * out_w + w_out) * C) / area;
}
}
dx[xi] = val;
dx[xi] = convert::To<T>(val);
}
}
template <typename T>
template <typename T, typename AccT>
__global__ void _AvgPool3dNCHW(
const int nthreads,
const int C,
......@@ -218,7 +221,7 @@ __global__ void _AvgPool3dNCHW(
int dend = min(dstart + kernel_d, D + pad_d);
int hend = min(hstart + kernel_h, H + pad_h);
int wend = min(wstart + kernel_w, W + pad_w);
const T area = (dend - dstart) * (hend - hstart) * (wend - wstart);
const AccT area = (dend - dstart) * (hend - hstart) * (wend - wstart);
dstart = max(dstart, 0);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
......@@ -227,19 +230,19 @@ __global__ void _AvgPool3dNCHW(
wend = min(wend, W);
const T* offset_x = x + (n * C + c) * D * H * W;
T val = T(0);
AccT val = AccT(0);
for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
val += offset_x[(d * H + h) * W + w];
val += LDG(offset_x, (d * H + h) * W + w);
}
}
}
y[yi] = val / area;
y[yi] = convert::To<T>(val / area);
}
}
template <typename T>
template <typename T, typename AccT>
__global__ void _AvgPool3dNHWC(
const int nthreads,
const int C,
......@@ -276,7 +279,7 @@ __global__ void _AvgPool3dNHWC(
int dend = min(dstart + kernel_d, D + pad_d);
int hend = min(hstart + kernel_h, H + pad_h);
int wend = min(wstart + kernel_w, W + pad_w);
const T area = (dend - dstart) * (hend - hstart) * (wend - wstart);
const AccT area = (dend - dstart) * (hend - hstart) * (wend - wstart);
dstart = max(dstart, 0);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
......@@ -285,19 +288,19 @@ __global__ void _AvgPool3dNHWC(
wend = min(wend, W);
const T* offset_x = x + n * D * H * W * C + c;
T val = T(0);
AccT val = AccT(0);
for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
val += offset_x[((d * H + h) * W + w) * C];
val += LDG(offset_x, ((d * H + h) * W + w) * C);
}
}
}
y[yi] = val / area;
y[yi] = convert::To<T>(val / area);
}
}
template <typename T>
template <typename T, typename AccT>
__global__ void _AvgPool3dGradNCHW(
const int nthreads,
const int C,
......@@ -336,7 +339,7 @@ __global__ void _AvgPool3dGradNCHW(
const int out_wend = min(w / stride_w + 1, out_w);
const T* offset_dy = dy + (n * C + c) * out_d * out_h * out_w;
T val = T(0);
AccT val = AccT(0);
for (int d_out = out_dstart; d_out < out_dend; ++d_out) {
const int dstart = d_out * stride_d - pad_d;
const int dend = min(dstart + kernel_d, D + pad_d);
......@@ -346,16 +349,16 @@ __global__ void _AvgPool3dGradNCHW(
for (int w_out = out_wstart; w_out < out_wend; ++w_out) {
const int wstart = w_out * stride_w - pad_w;
const int wend = min(wstart + kernel_w, W + pad_w);
const T area = (dend - dstart) * (hend - hstart) * (wend - wstart);
val += offset_dy[(d_out * out_h + h_out) * out_w + w_out] / area;
const AccT area = (dend - dstart) * (hend - hstart) * (wend - wstart);
val += LDG(offset_dy, (d_out * out_h + h_out) * out_w + w_out) / area;
}
}
}
dx[xi] = val;
dx[xi] = convert::To<T>(val);
}
}
template <typename T>
template <typename T, typename AccT>
__global__ void _AvgPool3dGradNHWC(
const int nthreads,
const int C,
......@@ -394,7 +397,7 @@ __global__ void _AvgPool3dGradNHWC(
const int out_wend = min(w / stride_w + 1, out_w);
const T* offset_dy = dy + n * out_d * out_h * out_w * C + c;
T val = T(0);
AccT val = AccT(0);
for (int d_out = out_dstart; d_out < out_dend; ++d_out) {
const int dstart = d_out * stride_d - pad_d;
const int dend = min(dstart + kernel_d, D + pad_d);
......@@ -404,25 +407,29 @@ __global__ void _AvgPool3dGradNHWC(
for (int w_out = out_wstart; w_out < out_wend; ++w_out) {
const int wstart = w_out * stride_w - pad_w;
const int wend = min(wstart + kernel_w, W + pad_w);
const T area = (dend - dstart) * (hend - hstart) * (wend - wstart);
val +=
offset_dy[((d_out * out_h + h_out) * out_w + w_out) * C] / area;
const AccT area = (dend - dstart) * (hend - hstart) * (wend - wstart);
val += LDG(offset_dy, ((d_out * out_h + h_out) * out_w + w_out) * C) /
area;
}
}
}
dx[xi] = val;
dx[xi] = convert::To<T>(val);
}
}
#undef LDG
} // namespace
/* ------------------- Launcher Separator ------------------- */
#define DISPATCH_POOL_KERNEL(name, kBlocks, kThreads, ...) \
#define DISPATCH_POOL_KERNEL(name, T, AccT, kBlocks, kThreads, ...) \
if (data_format == "NCHW") { \
name##NCHW<<<kBlocks, kThreads, 0, ctx->cuda_stream()>>>(__VA_ARGS__); \
name##NCHW<T, AccT> \
<<<kBlocks, kThreads, 0, ctx->cuda_stream()>>>(__VA_ARGS__); \
} else if (data_format == "NHWC") { \
name##NHWC<<<kBlocks, kThreads, 0, ctx->cuda_stream()>>>(__VA_ARGS__); \
name##NHWC<T, AccT> \
<<<kBlocks, kThreads, 0, ctx->cuda_stream()>>>(__VA_ARGS__); \
} else { \
LOG(FATAL) << "Unknown DataFormat: " << data_format; \
}
......@@ -449,6 +456,8 @@ __global__ void _AvgPool3dGradNHWC(
const int nthreads = N * C * out_dim; \
DISPATCH_POOL_KERNEL( \
_##name, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
nthreads, \
......@@ -463,12 +472,14 @@ __global__ void _AvgPool3dGradNHWC(
stride_w, \
pad_h, \
pad_w, \
x, \
y); \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
}
DEFINE_KERNEL_LAUNCHER(AvgPool2d, float16, (out_h * out_w));
DEFINE_KERNEL_LAUNCHER(AvgPool2d, float, (out_h * out_w));
DEFINE_KERNEL_LAUNCHER(AvgPool2d, double, (out_h * out_w));
DEFINE_KERNEL_LAUNCHER(AvgPool2dGrad, float16, (H * W)); // AvgPool2dGrad
DEFINE_KERNEL_LAUNCHER(AvgPool2dGrad, float, (H * W)); // AvgPool2dGrad
DEFINE_KERNEL_LAUNCHER(AvgPool2dGrad, double, (H * W)); // AvgPool2dGrad
#undef DEFINE_KERNEL_LAUNCHER
......@@ -500,6 +511,8 @@ DEFINE_KERNEL_LAUNCHER(AvgPool2dGrad, double, (H * W)); // AvgPool2dGrad
const int nthreads = N * C * out_dim; \
DISPATCH_POOL_KERNEL( \
_##name, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
nthreads, \
......@@ -519,14 +532,19 @@ DEFINE_KERNEL_LAUNCHER(AvgPool2dGrad, double, (H * W)); // AvgPool2dGrad
pad_d, \
pad_h, \
pad_w, \
x, \
y); \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
}
DEFINE_KERNEL_LAUNCHER(AvgPool3d, float16, (out_d * out_h * out_w));
DEFINE_KERNEL_LAUNCHER(AvgPool3d, float, (out_d * out_h * out_w));
DEFINE_KERNEL_LAUNCHER(AvgPool3d, double, (out_d * out_h * out_w));
DEFINE_KERNEL_LAUNCHER(AvgPool3dGrad, float, (D * H * W)); // AvgPool3dGrad
DEFINE_KERNEL_LAUNCHER(AvgPool3dGrad, double, (D * H * W)); // AvgPool3dGrad
DEFINE_KERNEL_LAUNCHER(AvgPool3dGrad, float16, (D * H * W)); // AvgPool3dGrad
DEFINE_KERNEL_LAUNCHER(AvgPool3dGrad, float, (D * H * W)); // AvgPool3dGrad
DEFINE_KERNEL_LAUNCHER(AvgPool3dGrad, double, (D * H * W)); // AvgPool3dGrad
#undef DEFINE_KERNEL_LAUNCHER
#undef DISPATCH_POOL_KERNEL
......
......@@ -13,7 +13,7 @@ namespace {
template <typename T>
__global__ void _Im2Col2dNCHW(
const int nthreads,
const int C,
const int /* C */,
const int H,
const int W,
const int out_h,
......@@ -59,7 +59,7 @@ __global__ void _Im2Col2dNHWC(
const int C,
const int H,
const int W,
const int out_h,
const int /* out_h */,
const int out_w,
const int kernel_h,
const int kernel_w,
......@@ -97,7 +97,7 @@ __global__ void _Im2Col2dNHWC(
template <typename T>
__global__ void _Col2Im2dNCHW(
const int nthreads,
const int C,
const int /* C */,
const int H,
const int W,
const int out_h,
......@@ -147,7 +147,7 @@ template <typename T>
__global__ void _Col2Im2dNHWC(
const int nthreads,
const int C,
const int H,
const int /* H */,
const int W,
const int out_h,
const int out_w,
......
......@@ -7,7 +7,7 @@ namespace kernels {
namespace {
template <typename T>
template <typename T, typename AccT>
void _MaxPool2dNCHW(
const int N,
const int C,
......@@ -29,8 +29,7 @@ void _MaxPool2dNCHW(
const auto NxCxHoxWo = N * C * out_h * out_w;
std::array<int, 4> index = {0, 0, 0, 0};
std::array<int, 4> dims = {N, C, out_h, out_w};
T val;
int hstart, hend, wstart, wend, xi, mask_val;
int hstart, hend, wstart, wend;
for (int i = 0; i < NxCxHoxWo; ++i) {
hstart = index[2] * stride_h - pad_h;
wstart = index[3] * stride_w - pad_w;
......@@ -39,23 +38,24 @@ void _MaxPool2dNCHW(
hstart = std::max(hstart, 0);
wstart = std::max(wstart, 0);
const T* offset_x = x + index[0] * CxHxW + index[1] * HxW;
mask_val = -1;
val = T(-FLT_MAX);
int mask_val = -1;
AccT val = AccT(-FLT_MAX);
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
xi = h * W + w;
if (offset_x[xi] > val) {
val = offset_x[mask_val = xi];
const auto xi = h * W + w;
if (convert::To<AccT>(offset_x[xi]) > val) {
mask_val = xi;
val = convert::To<AccT>(offset_x[xi]);
}
}
}
y[i] = val;
y[i] = convert::To<T>(val);
mask[i] = mask_val;
math::utils::IncreaseIndexInDims(4, dims.data(), index.data());
}
}
template <typename T>
template <typename T, typename AccT>
void _MaxPool2dNHWC(
const int N,
const int C,
......@@ -76,8 +76,7 @@ void _MaxPool2dNHWC(
const auto NxHoxWoxC = N * C * out_h * out_w;
std::array<int, 4> index = {0, 0, 0, 0};
std::array<int, 4> dims = {N, out_h, out_w, C};
T val;
int hstart, hend, wstart, wend, xi, mask_val;
int hstart, hend, wstart, wend;
for (int i = 0; i < NxHoxWoxC; ++i) {
hstart = index[1] * stride_h - pad_h;
wstart = index[2] * stride_w - pad_w;
......@@ -86,23 +85,24 @@ void _MaxPool2dNHWC(
hstart = std::max(hstart, 0);
wstart = std::max(wstart, 0);
const T* offset_x = x + index[0] * HxWxC;
mask_val = -1;
val = T(-FLT_MAX);
int mask_val = -1;
AccT val = AccT(-FLT_MAX);
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
xi = (h * W + w) * C + index[3];
if (offset_x[xi] > val) {
val = offset_x[mask_val = xi];
const auto xi = (h * W + w) * C + index[3];
if (convert::To<AccT>(offset_x[xi]) > val) {
mask_val = xi;
val = convert::To<AccT>(offset_x[xi]);
}
}
}
y[i] = val;
y[i] = convert::To<T>(val);
mask[i] = mask_val;
math::utils::IncreaseIndexInDims(4, dims.data(), index.data());
}
}
template <typename T>
template <typename T, typename AccT>
void _MaxPool2dGradNCHW(
const int N,
const int C,
......@@ -127,13 +127,15 @@ void _MaxPool2dGradNCHW(
memset(dx, 0, sizeof(T) * N * CxHxW);
for (int i = 0; i < NxCxHoxWo; ++i) {
if (mask[i] != -1) {
dx[index[0] * CxHxW + index[1] * HxW + mask[i]] += dy[i];
const auto xi = index[0] * CxHxW + index[1] * HxW + mask[i];
dx[xi] =
convert::To<T>(convert::To<AccT>(dx[xi]) + convert::To<AccT>(dy[i]));
}
math::utils::IncreaseIndexInDims(3, dims.data(), index.data());
}
}
template <typename T>
template <typename T, typename AccT>
void _MaxPool2dGradNHWC(
const int N,
const int C,
......@@ -157,13 +159,15 @@ void _MaxPool2dGradNHWC(
memset(dx, 0, sizeof(T) * N * HxWxC);
for (int i = 0; i < NxHoxWoxC; ++i) {
if (mask[i] != -1) {
dx[index[0] * HxWxC + mask[i]] += dy[i];
const auto xi = index[0] * HxWxC + mask[i];
dx[xi] =
convert::To<T>(convert::To<AccT>(dx[xi]) + convert::To<AccT>(dy[i]));
}
math::utils::IncreaseIndexInDims(2, dims.data(), index.data());
}
}
template <typename T>
template <typename T, typename AccT>
void _MaxPool3dNCHW(
const int N,
const int C,
......@@ -190,8 +194,7 @@ void _MaxPool3dNCHW(
const auto NxCxDoxHoxWo = N * C * out_d * out_h * out_w;
std::array<int, 5> index = {0, 0, 0, 0, 0};
std::array<int, 5> dims = {N, C, out_d, out_h, out_w};
T val;
int dstart, dend, hstart, hend, wstart, wend, xi, mask_val;
int dstart, dend, hstart, hend, wstart, wend;
for (int i = 0; i < NxCxDoxHoxWo; ++i) {
dstart = index[2] * stride_d - pad_d;
hstart = index[3] * stride_h - pad_h;
......@@ -203,25 +206,26 @@ void _MaxPool3dNCHW(
hstart = std::max(hstart, 0);
wstart = std::max(wstart, 0);
const T* offset_x = x + index[0] * CxDxHxW + index[1] * DxHxW;
mask_val = -1;
val = T(-FLT_MAX);
int mask_val = -1;
AccT val = AccT(-FLT_MAX);
for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
xi = (d * H + h) * W + w;
if (offset_x[xi] > val) {
val = offset_x[mask_val = xi];
const auto xi = (d * H + h) * W + w;
if (convert::To<AccT>(offset_x[xi]) > val) {
mask_val = xi;
val = convert::To<AccT>(offset_x[xi]);
}
}
}
}
y[i] = val;
y[i] = convert::To<T>(val);
mask[i] = mask_val;
math::utils::IncreaseIndexInDims(5, dims.data(), index.data());
}
}
template <typename T>
template <typename T, typename AccT>
void _MaxPool3dNHWC(
const int N,
const int C,
......@@ -247,8 +251,7 @@ void _MaxPool3dNHWC(
const auto NxDoxHoxWoxC = N * C * out_d * out_h * out_w;
std::array<int, 5> index = {0, 0, 0, 0, 0};
std::array<int, 5> dims = {N, out_d, out_h, out_w, C};
T val;
int dstart, dend, hstart, hend, wstart, wend, xi, mask_val;
int dstart, dend, hstart, hend, wstart, wend;
for (int i = 0; i < NxDoxHoxWoxC; ++i) {
dstart = index[1] * stride_d - pad_d;
hstart = index[2] * stride_h - pad_h;
......@@ -260,25 +263,26 @@ void _MaxPool3dNHWC(
hstart = std::max(hstart, 0);
wstart = std::max(wstart, 0);
const T* offset_x = x + index[0] * DxHxWxC;
mask_val = -1;
val = T(-FLT_MAX);
int mask_val = -1;
AccT val = AccT(-FLT_MAX);
for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
xi = ((d * H + h) * W + w) * C + index[4];
if (offset_x[xi] > val) {
val = offset_x[mask_val = xi];
const auto xi = ((d * H + h) * W + w) * C + index[4];
if (convert::To<AccT>(offset_x[xi]) > val) {
mask_val = xi;
val = convert::To<AccT>(offset_x[xi]);
}
}
}
}
y[i] = val;
y[i] = convert::To<T>(val);
mask[i] = mask_val;
math::utils::IncreaseIndexInDims(5, dims.data(), index.data());
}
}
template <typename T>
template <typename T, typename AccT>
void _MaxPool3dGradNCHW(
const int N,
const int C,
......@@ -308,13 +312,15 @@ void _MaxPool3dGradNCHW(
memset(dx, 0, sizeof(T) * N * CxDxHxW);
for (int i = 0; i < NxCxDoxHoxWo; ++i) {
if (mask[i] != -1) {
dx[index[0] * CxDxHxW + index[1] * DxHxW + mask[i]] += dy[i];
const auto xi = index[0] * CxDxHxW + index[1] * DxHxW + mask[i];
dx[xi] =
convert::To<T>(convert::To<AccT>(dx[xi]) + convert::To<AccT>(dy[i]));
}
math::utils::IncreaseIndexInDims(3, dims.data(), index.data());
}
}
template <typename T>
template <typename T, typename AccT>
void _MaxPool3dGradNHWC(
const int N,
const int C,
......@@ -343,7 +349,9 @@ void _MaxPool3dGradNHWC(
memset(dx, 0, sizeof(T) * N * DxHxWxC);
for (int i = 0; i < NxDoxHoxWoxC; ++i) {
if (mask[i] != -1) {
dx[index[0] * DxHxWxC + mask[i]] += dy[i];
const auto xi = index[0] * DxHxWxC + mask[i];
dx[xi] =
convert::To<T>(convert::To<AccT>(dx[xi]) + convert::To<AccT>(dy[i]));
}
math::utils::IncreaseIndexInDims(2, dims.data(), index.data());
}
......@@ -353,11 +361,11 @@ void _MaxPool3dGradNHWC(
/* ------------------- Launcher Separator ------------------- */
#define DISPATCH_POOL_KERNEL(name, ...) \
#define DISPATCH_POOL_KERNEL(name, T, AccT, ...) \
if (data_format == "NCHW") { \
name##NCHW(__VA_ARGS__); \
name##NCHW<T, AccT>(__VA_ARGS__); \
} else if (data_format == "NHWC") { \
name##NHWC(__VA_ARGS__); \
name##NHWC<T, AccT>(__VA_ARGS__); \
} else { \
LOG(FATAL) << "Unknown DataFormat: " << data_format; \
}
......@@ -384,6 +392,8 @@ void _MaxPool3dGradNHWC(
CPUContext* ctx) { \
DISPATCH_POOL_KERNEL( \
_##name, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
N, \
C, \
H, \
......@@ -401,8 +411,10 @@ void _MaxPool3dGradNHWC(
y); \
}
DEFINE_KERNEL_LAUNCHER(MaxPool2d, float16);
DEFINE_KERNEL_LAUNCHER(MaxPool2d, float);
DEFINE_KERNEL_LAUNCHER(MaxPool2d, double);
DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, float16); // MaxPool2dGrad
DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, float); // MaxPool2dGrad
DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, double); // MaxPool2dGrad
#undef DEFINE_KERNEL_LAUNCHER
......@@ -434,6 +446,8 @@ DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, double); // MaxPool2dGrad
CPUContext* ctx) { \
DISPATCH_POOL_KERNEL( \
_##name, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
N, \
C, \
D, \
......@@ -456,8 +470,10 @@ DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, double); // MaxPool2dGrad
y); \
}
DEFINE_KERNEL_LAUNCHER(MaxPool3d, float16);
DEFINE_KERNEL_LAUNCHER(MaxPool3d, float);
DEFINE_KERNEL_LAUNCHER(MaxPool3d, double);
DEFINE_KERNEL_LAUNCHER(MaxPool3dGrad, float16); // MaxPool3dGrad
DEFINE_KERNEL_LAUNCHER(MaxPool3dGrad, float); // MaxPool3dGrad
DEFINE_KERNEL_LAUNCHER(MaxPool3dGrad, double); // MaxPool3dGrad
#undef DEFINE_KERNEL_LAUNCHER
......
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......@@ -9,7 +10,9 @@ namespace kernels {
namespace {
template <typename T>
#define LDG(x, i) convert::To<AccT>(__ldg(x + i))
template <typename T, typename AccT>
__global__ void _MaxPool2dNCHW(
const int nthreads,
const int C,
......@@ -41,20 +44,21 @@ __global__ void _MaxPool2dNCHW(
const T* offset_x = x + (n * C + c) * H * W;
int mask_val = -1;
T val = T(-FLT_MAX);
AccT val = AccT(-FLT_MAX);
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
if (offset_x[h * W + w] > val) {
val = offset_x[mask_val = h * W + w];
if (LDG(offset_x, h * W + w) > val) {
mask_val = h * W + w;
val = LDG(offset_x, mask_val);
}
}
}
y[yi] = val;
y[yi] = convert::To<T>(val);
mask[yi] = mask_val;
}
}
template <typename T>
template <typename T, typename AccT>
__global__ void _MaxPool2dNHWC(
const int nthreads,
const int C,
......@@ -86,21 +90,22 @@ __global__ void _MaxPool2dNHWC(
const int x_offset = n * H * W * C + c;
int mask_val = -1;
T val = T(-FLT_MAX);
AccT val = AccT(-FLT_MAX);
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
const int xi = x_offset + (h * W + w) * C;
if (x[xi] > val) {
val = x[mask_val = xi];
if (LDG(x, xi) > val) {
mask_val = xi;
val = LDG(x, xi);
}
}
}
y[yi] = val;
y[yi] = convert::To<T>(val);
mask[yi] = mask_val;
}
}
template <typename T>
template <typename T, typename AccT>
__global__ void _MaxPool2dGradNCHW(
const int nthreads,
const int C,
......@@ -131,20 +136,20 @@ __global__ void _MaxPool2dGradNCHW(
const int out_wend = min((w + pad_w) / stride_w + 1, out_w);
const int y_offset = (n * C + c) * out_h * out_w;
T val = T(0);
AccT val = AccT(0);
for (int h_out = out_hstart; h_out < out_hend; ++h_out) {
for (int w_out = out_wstart; w_out < out_wend; ++w_out) {
const int yi = y_offset + h_out * out_w + w_out;
if (mask[yi] == (h * W + w)) {
val += dy[yi];
val += LDG(dy, yi);
}
}
}
dx[xi] = val;
dx[xi] = convert::To<T>(val);
}
}
template <typename T>
template <typename T, typename AccT>
__global__ void _MaxPool2dGradNHWC(
const int nthreads,
const int C,
......@@ -175,20 +180,20 @@ __global__ void _MaxPool2dGradNHWC(
const int out_wend = min((w + pad_w) / stride_w + 1, out_w);
const int y_offset = n * out_h * out_w * C + c;
T val = T(0);
AccT val = AccT(0);
for (int h_out = out_hstart; h_out < out_hend; ++h_out) {
for (int w_out = out_wstart; w_out < out_wend; ++w_out) {
const int yi = y_offset + (h_out * out_w + w_out) * C;
if (mask[yi] == xi) {
val += dy[yi];
val += LDG(dy, yi);
}
}
}
dx[xi] = val;
dx[xi] = convert::To<T>(val);
}
}
template <typename T>
template <typename T, typename AccT>
__global__ void _MaxPool3dNCHW(
const int nthreads,
const int C,
......@@ -232,23 +237,24 @@ __global__ void _MaxPool3dNCHW(
const T* offset_x = x + (n * C + c) * D * H * W;
int mask_val = -1;
T val = T(-FLT_MAX);
AccT val = AccT(-FLT_MAX);
for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
tmp = (d * H + h) * W + w;
if (offset_x[tmp] > val) {
val = offset_x[mask_val = tmp];
if (LDG(offset_x, tmp) > val) {
mask_val = tmp;
val = LDG(offset_x, mask_val);
}
}
}
}
y[yi] = val;
y[yi] = convert::To<T>(val);
mask[yi] = mask_val;
}
}
template <typename T>
template <typename T, typename AccT>
__global__ void _MaxPool3dNHWC(
const int nthreads,
const int C,
......@@ -292,23 +298,24 @@ __global__ void _MaxPool3dNHWC(
const int x_offset = n * D * H * W * C + c;
int mask_val = -1;
T val = T(-FLT_MAX);
AccT val = AccT(-FLT_MAX);
for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
tmp = x_offset + ((d * H + h) * W + w) * C;
if (x[tmp] > val) {
val = x[mask_val = tmp];
if (LDG(x, tmp) > val) {
mask_val = tmp;
val = LDG(x, tmp);
}
}
}
}
y[yi] = val;
y[yi] = convert::To<T>(val);
mask[yi] = mask_val;
}
}
template <typename T>
template <typename T, typename AccT>
__global__ void _MaxPool3dGradNCHW(
const int nthreads,
const int C,
......@@ -351,22 +358,22 @@ __global__ void _MaxPool3dGradNCHW(
const int out_wend = min((w + pad_w) / stride_w + 1, out_w);
const int y_offset = (n * C + c) * out_d * out_h * out_w;
T val = T(0);
AccT val = AccT(0);
for (int d_out = out_dstart; d_out < out_dend; ++d_out) {
for (int h_out = out_hstart; h_out < out_hend; ++h_out) {
for (int w_out = out_wstart; w_out < out_wend; ++w_out) {
tmp = y_offset + (d_out * out_h + h_out) * out_w + w_out;
if (mask[tmp] == ((d * H + h) * W + w)) {
val += dy[tmp];
val += LDG(dy, tmp);
}
}
}
}
dx[xi] = val;
dx[xi] = convert::To<T>(val);
}
}
template <typename T>
template <typename T, typename AccT>
__global__ void _MaxPool3dGradNHWC(
const int nthreads,
const int C,
......@@ -409,30 +416,34 @@ __global__ void _MaxPool3dGradNHWC(
const int out_wend = min((w + pad_w) / stride_w + 1, out_w);
const int y_offset = n * out_d * out_h * out_w * C + c;
T val = T(0);
AccT val = AccT(0);
for (int d_out = out_dstart; d_out < out_dend; ++d_out) {
for (int h_out = out_hstart; h_out < out_hend; ++h_out) {
for (int w_out = out_wstart; w_out < out_wend; ++w_out) {
tmp = y_offset + ((d_out * out_h + h_out) * out_w + w_out) * C;
if (mask[tmp] == xi) {
val += dy[tmp];
val += LDG(dy, tmp);
}
}
}
}
dx[xi] = val;
dx[xi] = convert::To<T>(val);
}
}
#undef LDG
} // namespace
/* ------------------- Launcher Separator ------------------- */
#define DISPATCH_POOL_KERNEL(name, kBlocks, kThreads, ...) \
#define DISPATCH_POOL_KERNEL(name, T, AccT, kBlocks, kThreads, ...) \
if (data_format == "NCHW") { \
name##NCHW<<<kBlocks, kThreads, 0, ctx->cuda_stream()>>>(__VA_ARGS__); \
name##NCHW<T, AccT> \
<<<kBlocks, kThreads, 0, ctx->cuda_stream()>>>(__VA_ARGS__); \
} else if (data_format == "NHWC") { \
name##NHWC<<<kBlocks, kThreads, 0, ctx->cuda_stream()>>>(__VA_ARGS__); \
name##NHWC<T, AccT> \
<<<kBlocks, kThreads, 0, ctx->cuda_stream()>>>(__VA_ARGS__); \
} else { \
LOG(FATAL) << "Unknown DataFormat: " << data_format; \
}
......@@ -460,6 +471,8 @@ __global__ void _MaxPool3dGradNHWC(
const int nthreads = N * C * out_dim; \
DISPATCH_POOL_KERNEL( \
_##name, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
nthreads, \
......@@ -474,13 +487,15 @@ __global__ void _MaxPool3dGradNHWC(
stride_w, \
pad_h, \
pad_w, \
x, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
mask, \
y); \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
}
DEFINE_KERNEL_LAUNCHER(MaxPool2d, float16, (out_h * out_w));
DEFINE_KERNEL_LAUNCHER(MaxPool2d, float, (out_h * out_w));
DEFINE_KERNEL_LAUNCHER(MaxPool2d, double, (out_h * out_w));
DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, float16, (H * W)); // MaxPool2dGrad
DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, float, (H * W)); // MaxPool2dGrad
DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, double, (H * W)); // MaxPool2dGrad
#undef DEFINE_KERNEL_LAUNCHER
......@@ -513,6 +528,8 @@ DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, double, (H * W)); // MaxPool2dGrad
const int nthreads = N * C * out_dim; \
DISPATCH_POOL_KERNEL( \
_##name, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
nthreads, \
......@@ -532,13 +549,15 @@ DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, double, (H * W)); // MaxPool2dGrad
pad_d, \
pad_h, \
pad_w, \
x, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
mask, \
y); \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
}
DEFINE_KERNEL_LAUNCHER(MaxPool3d, float16, (out_d * out_h * out_w));
DEFINE_KERNEL_LAUNCHER(MaxPool3d, float, (out_d * out_h * out_w));
DEFINE_KERNEL_LAUNCHER(MaxPool3d, double, (out_d * out_h * out_w));
DEFINE_KERNEL_LAUNCHER(MaxPool3dGrad, float16, (D * H * W)); // MaxPool3dGrad
DEFINE_KERNEL_LAUNCHER(MaxPool3dGrad, float, (D * H * W)); // MaxPool3dGrad
DEFINE_KERNEL_LAUNCHER(MaxPool3dGrad, double, (D * H * W)); // MaxPool3dGrad
#undef DEFINE_KERNEL_LAUNCHER
......
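The pooling kernels above load through an LDG(...) helper and accumulate in AccT instead of the storage type T, so float16 pooling can sum in float32 and only convert back when writing the result. The following standalone sketch (illustrative names only, not the project's kernels) shows the same storage-type/accumulator-type split together with a read-only-cache load:

#include <cstdio>
#include <cuda_runtime.h>

// Accumulate n values of storage type T in the wider type AccT, then cast
// back to T for the output. __ldg routes the loads through the read-only cache.
template <typename T, typename AccT>
__global__ void SumKernel(const int n, const T* x, T* y) {
  if (blockIdx.x == 0 && threadIdx.x == 0) {
    AccT acc = AccT(0);
    for (int i = 0; i < n; ++i) {
      acc += static_cast<AccT>(__ldg(x + i));
    }
    *y = static_cast<T>(acc);  // convert back to the storage type
  }
}

int main() {
  const int n = 8;
  float h_x[n] = {1, 2, 3, 4, 5, 6, 7, 8}, h_y = 0.f;
  float *d_x, *d_y;
  cudaMalloc(&d_x, n * sizeof(float));
  cudaMalloc(&d_y, sizeof(float));
  cudaMemcpy(d_x, h_x, n * sizeof(float), cudaMemcpyHostToDevice);
  SumKernel<float, double><<<1, 1>>>(n, d_x, d_y);  // accumulate in double
  cudaMemcpy(&h_y, d_y, sizeof(float), cudaMemcpyDeviceToHost);
  printf("sum = %.1f\n", h_y);  // expects 36.0
  cudaFree(d_x);
  cudaFree(d_y);
  return 0;
}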
......@@ -85,7 +85,7 @@ __global__ void _RoiPoolGrad(
const int W,
const int out_h,
const int out_w,
const float spatial_scale,
const float /* spatial_scale */,
const T* dy,
const float* rois,
const int* mask,
......
......@@ -11,14 +11,13 @@ void TransposeOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0, {0});
int num_axes, num_dims = X.ndim();
vec64_t X_strides(num_dims), Y_dims(num_dims);
perm(0, &num_axes);
CHECK(num_axes == 0 || num_axes == num_dims)
<< "\nProviding " << num_axes << " dimensions to permute, "
<< "while Tensor(" << X.name() << ")'s dims are " << X.DimString();
vec64_t new_axes(num_dims);
vec64_t new_axes(num_dims), new_dims(num_dims);
for (int i = 0; i < num_dims; ++i) {
new_axes[i] = num_axes > 0 ? perm(i) : num_dims - i - 1;
}
......@@ -31,13 +30,27 @@ void TransposeOp<Context>::DoRunWithType() {
}
for (int i = 0; i < num_dims; ++i) {
X_strides[i] = X.stride(new_axes[i]);
Y_dims[i] = X.dim(new_axes[i]);
new_dims[i] = X.dim(new_axes[i]);
}
vec64_t transpose_dims, transpose_axes;
math::utils::CollapseTransposeAxes(
num_dims,
X.dims().data(),
new_axes.data(),
transpose_dims,
transpose_axes);
Tensor X_collapse(transpose_dims);
num_dims = X_collapse.ndim();
vec64_t X_strides(num_dims), Y_dims(num_dims);
for (int i = 0; i < num_dims; ++i) {
X_strides[i] = X_collapse.stride(transpose_axes[i]);
Y_dims[i] = X_collapse.dim(transpose_axes[i]);
}
auto* scratch = ((void*)&X == (void*)Y)
? ctx()->workspace()->template data<T, Context>({X.count()})[0]
: Y->Reshape(Y_dims)->template mutable_data<T, Context>();
: Y->Reshape(new_dims)->template mutable_data<T, Context>();
kernels::Transpose(
num_dims,
......@@ -51,7 +64,7 @@ void TransposeOp<Context>::DoRunWithType() {
math::Copy(
X.count(),
scratch,
Y->Reshape(Y_dims)->template mutable_data<T, Context>(),
Y->Reshape(new_dims)->template mutable_data<T, Context>(),
ctx());
}
}
......
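The transpose launcher above writes into a scratch buffer whenever the output aliases the input ((void*)&X == (void*)Y) and copies the result back afterwards. A host-side sketch of that aliasing pattern, with illustrative names and a plain 2-D transpose standing in for the kernel:

#include <algorithm>
#include <vector>

// Transpose a rows x cols matrix; when dst aliases src, stage the result in a
// scratch buffer first and copy it back, mirroring the aliasing check above.
void Transpose2D(int rows, int cols, const float* src, float* dst) {
  const bool in_place = (src == dst);
  std::vector<float> scratch;
  float* out = dst;
  if (in_place) {
    scratch.resize(size_t(rows) * cols);
    out = scratch.data();
  }
  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < cols; ++j) {
      out[size_t(j) * rows + i] = src[size_t(i) * cols + j];
    }
  }
  if (in_place) std::copy(scratch.begin(), scratch.end(), dst);
}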
......@@ -107,11 +107,6 @@ void PoolOp<Context>::DoRunWithType() {
}
template <class Context>
void PoolOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::TypesBase<float, double>>::Call(this, Input(0));
}
template <class Context>
template <typename T>
void PoolGradientOp<Context>::DoRunWithType() {
ComputeOutShape();
......@@ -212,11 +207,6 @@ void PoolGradientOp<Context>::DoRunWithType() {
}
}
template <class Context>
void PoolGradientOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::TypesBase<float, double>>::Call(this, Input(0));
}
DEPLOY_CPU_OPERATOR(Pool);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(Pool);
......
......@@ -27,7 +27,9 @@ class PoolOp final : public PoolOpBase<Context> {
USE_OPERATOR_FUNCTIONS;
USE_POOL_FUNCTIONS;
void RunOnDevice() override;
void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T>
void DoRunWithType();
......@@ -43,7 +45,9 @@ class PoolGradientOp final : public PoolOpBase<Context> {
USE_OPERATOR_FUNCTIONS;
USE_POOL_FUNCTIONS;
void RunOnDevice() override;
void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T>
void DoRunWithType();
......@@ -70,7 +74,9 @@ class CuDNNPoolOp final : public CuDNNPoolOpBase<Context> {
CUDNN_CHECK(cudnnDestroyPoolingDescriptor(pool_desc_));
}
void RunOnDevice() override;
void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T>
void DoRunWithType();
......@@ -99,7 +105,9 @@ class CuDNNPoolGradientOp final : public CuDNNPoolOpBase<Context> {
CUDNN_CHECK(cudnnDestroyPoolingDescriptor(pool_desc_));
}
void RunOnDevice() override;
void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T>
void DoRunWithType();
......
#ifdef USE_CUDNN
#include "dragon/core/workspace.h"
#include "dragon/operators/vision/pool_op.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......@@ -10,6 +12,56 @@ void CuDNNPoolOp<Context>::DoRunWithType() {
ComputeOutShape();
auto &X = Input(0), *Y = Output(0);
// CuDNN NHWC pooling is slow.
// Temporarily fall back to the naive implementation.
if (data_format() == "NHWC" && mode_ == "AVG") {
if (num_axes_ == 1 || num_axes_ == 2) {
kernels::AvgPool2d(
in_dims_[0],
in_dims_[1],
in_dims_[2],
num_axes_ == 1 ? 1 : in_dims_[3],
out_dims_[2],
num_axes_ == 1 ? 1 : out_dims_[3],
kshape_[0],
num_axes_ == 1 ? 1 : kshape_[1],
strides_[0],
num_axes_ == 1 ? 1 : strides_[1],
pads_begin_[0],
num_axes_ == 1 ? 0 : pads_begin_[1],
data_format(),
X.template data<T, Context>(),
Y->Reshape(out_shape_)->template mutable_data<T, Context>(),
ctx());
} else if (num_axes_ == 3) {
kernels::AvgPool3d(
in_dims_[0],
in_dims_[1],
in_dims_[2],
in_dims_[3],
in_dims_[4],
out_dims_[2],
out_dims_[3],
out_dims_[4],
kshape_[0],
kshape_[1],
kshape_[2],
strides_[0],
strides_[1],
strides_[2],
pads_begin_[0],
pads_begin_[1],
pads_begin_[2],
data_format(),
X.template data<T, Context>(),
Y->Reshape(out_shape_)->template mutable_data<T, Context>(),
ctx());
} else {
LOG(FATAL) << "AvgPool" << num_axes_ << "d is not supported.";
}
return;
}
SetPoolDesc();
CuDNNSetTensorDesc<T>(&input_desc_, X.dims(), data_format());
CuDNNSetTensorDesc<T>(&output_desc_, out_shape_, data_format());
......@@ -26,11 +78,6 @@ void CuDNNPoolOp<Context>::DoRunWithType() {
}
template <class Context>
void CuDNNPoolOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <class Context>
template <typename T>
void CuDNNPoolGradientOp<Context>::DoRunWithType() {
ComputeOutShape();
......@@ -56,11 +103,6 @@ void CuDNNPoolGradientOp<Context>::DoRunWithType() {
dX->ReshapeLike(X)->template mutable_data<T, Context>()));
}
template <class Context>
void CuDNNPoolGradientOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
DEPLOY_CUDNN_OPERATOR(Pool);
DEPLOY_CUDNN_OPERATOR(PoolGradient);
......
......@@ -60,12 +60,19 @@ void SpaceToDepthOp<Context>::DoRunWithType() {
CHECK_EQ(X_reshape.count(), X.count())
<< "\nCould not rearrange " << X.DimString() << " to "
<< X_reshape.DimString() << " with block size " << block_size_ << ".";
vec64_t X_strides(in_dims.size());
vec64_t Y_dims(in_dims.size());
for (int i = 0; i < X_reshape.ndim(); i++) {
X_strides[i] = X_reshape.stride(perm[i]);
Y_dims[i] = X_reshape.dim(perm[i]);
vec64_t transpose_dims, transpose_axes;
math::utils::CollapseTransposeAxes(
X_reshape.ndim(),
X_reshape.dims().data(),
perm.data(),
transpose_dims,
transpose_axes);
Tensor X_collapse(transpose_dims);
num_dims = X_collapse.ndim();
vec64_t X_strides(num_dims), Y_dims(num_dims);
for (int i = 0; i < num_dims; ++i) {
X_strides[i] = X_collapse.stride(transpose_axes[i]);
Y_dims[i] = X_collapse.dim(transpose_axes[i]);
}
auto* scratch = ((void*)&X == (void*)Y)
......@@ -73,7 +80,7 @@ void SpaceToDepthOp<Context>::DoRunWithType() {
: Y->Reshape(out_shape)->template mutable_data<T, Context>();
kernels::Transpose(
X_strides.size(),
num_dims,
X_strides.data(),
Y_dims.data(),
X.template data<T, Context>(),
......@@ -135,12 +142,19 @@ void DepthToSpaceOp<Context>::DoRunWithType() {
CHECK_EQ(X_reshape.count(), X.count())
<< "\nCould not rearrange " << X.DimString() << " to "
<< X_reshape.DimString() << " with block size " << block_size_ << ".";
vec64_t X_strides(in_dims.size());
vec64_t Y_dims(in_dims.size());
for (int i = 0; i < in_dims.size(); i++) {
X_strides[i] = X_reshape.stride(perm[i]);
Y_dims[i] = X_reshape.dim(perm[i]);
vec64_t transpose_dims, transpose_axes;
math::utils::CollapseTransposeAxes(
X_reshape.ndim(),
X_reshape.dims().data(),
perm.data(),
transpose_dims,
transpose_axes);
Tensor X_collapse(transpose_dims);
num_dims = X_collapse.ndim();
vec64_t X_strides(num_dims), Y_dims(num_dims);
for (int i = 0; i < num_dims; ++i) {
X_strides[i] = X_collapse.stride(transpose_axes[i]);
Y_dims[i] = X_collapse.dim(transpose_axes[i]);
}
auto* scratch = ((void*)&X == (void*)Y)
......@@ -148,7 +162,7 @@ void DepthToSpaceOp<Context>::DoRunWithType() {
: Y->Reshape(out_shape)->template mutable_data<T, Context>();
kernels::Transpose(
X_strides.size(),
num_dims,
X_strides.data(),
Y_dims.data(),
X.template data<T, Context>(),
......
......@@ -158,6 +158,129 @@ class CUDADeviceGuard {
int prev_id_;
};
#define DISPATCH_FUNC_BY_VALUE_WITH_TYPE_1(Func, T, val, ...) \
do { \
switch (val) { \
case 1: { \
Func<T, 1>(__VA_ARGS__); \
break; \
} \
case 2: { \
Func<T, 2>(__VA_ARGS__); \
break; \
} \
case 3: { \
Func<T, 3>(__VA_ARGS__); \
break; \
} \
case 4: { \
Func<T, 4>(__VA_ARGS__); \
break; \
} \
case 5: { \
Func<T, 5>(__VA_ARGS__); \
break; \
} \
case 6: { \
Func<T, 6>(__VA_ARGS__); \
break; \
} \
case 7: { \
Func<T, 7>(__VA_ARGS__); \
break; \
} \
case 8: { \
Func<T, 8>(__VA_ARGS__); \
break; \
} \
default: { \
break; \
} \
} \
} while (false)
#define DISPATCH_FUNC_BY_VALUE_WITH_TYPE_2(Func, T1, T2, val, ...) \
do { \
switch (val) { \
case 1: { \
Func<T1, T2, 1>(__VA_ARGS__); \
break; \
} \
case 2: { \
Func<T1, T2, 2>(__VA_ARGS__); \
break; \
} \
case 3: { \
Func<T1, T2, 3>(__VA_ARGS__); \
break; \
} \
case 4: { \
Func<T1, T2, 4>(__VA_ARGS__); \
break; \
} \
case 5: { \
Func<T1, T2, 5>(__VA_ARGS__); \
break; \
} \
case 6: { \
Func<T1, T2, 6>(__VA_ARGS__); \
break; \
} \
case 7: { \
Func<T1, T2, 7>(__VA_ARGS__); \
break; \
} \
case 8: { \
Func<T1, T2, 8>(__VA_ARGS__); \
break; \
} \
default: { \
break; \
} \
} \
} while (false)
#define DISPATCH_FUNC_BY_VALUE_WITH_TYPE_3(Func, T1, T2, T3, val, ...) \
do { \
switch (val) { \
case 1: { \
Func<T1, T2, T3, 1>(__VA_ARGS__); \
break; \
} \
case 2: { \
Func<T1, T2, T3, 2>(__VA_ARGS__); \
break; \
} \
case 3: { \
Func<T1, T2, T3, 3>(__VA_ARGS__); \
break; \
} \
case 4: { \
Func<T1, T2, T3, 4>(__VA_ARGS__); \
break; \
} \
case 5: { \
Func<T1, T2, T3, 5>(__VA_ARGS__); \
break; \
} \
case 6: { \
Func<T1, T2, T3, 6>(__VA_ARGS__); \
break; \
} \
case 7: { \
Func<T1, T2, T3, 7>(__VA_ARGS__); \
break; \
} \
case 8: { \
Func<T1, T2, T3, 8>(__VA_ARGS__); \
break; \
} \
default: { \
break; \
} \
} \
} while (false)
#else
#define CUDA_NOT_COMPILED LOG(FATAL) << "CUDA library is not compiled with."
......
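The DISPATCH_FUNC_BY_VALUE_WITH_TYPE_* macros above are the mechanism behind this commit: a value known only at run time (typically the tensor rank) selects a template instantiation in which that value is a compile-time constant, so loops such as "for (d = D - 1; d >= 0; --d)" can be unrolled and folded by the compiler. A minimal host-side sketch of the same pattern, with hypothetical names:

#include <cstdio>

// Func<D>() sees D as a compile-time constant, so loops over D-sized arrays
// can be unrolled inside real kernels instantiated this way.
template <int D>
void PrintRank() {
  printf("instantiated with D = %d\n", D);
}

#define DISPATCH_BY_VALUE(Func, val) \
  do {                               \
    switch (val) {                   \
      case 1: Func<1>(); break;      \
      case 2: Func<2>(); break;      \
      case 3: Func<3>(); break;      \
      case 4: Func<4>(); break;      \
      default: break;                \
    }                                \
  } while (false)

int main() {
  int runtime_rank = 3;                        // known only at run time
  DISPATCH_BY_VALUE(PrintRank, runtime_rank);  // calls PrintRank<3>()
  return 0;
}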
......@@ -147,9 +147,9 @@ DEFINE_SCALE_FUNC(int64_t);
} \
if (alpha != 1.f) { \
T alpha_val = static_cast<T>(alpha); \
CUBLAS_CHECK(cublasSetPointerMode( \
ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \
CUBLAS_CHECK(cublasFunc(ctx->cublas_handle(), N, &alpha_val, y, 1)); \
const auto& handle = ctx->cublas_handle(); \
CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST)); \
CUBLAS_CHECK(cublasFunc(handle, N, &alpha_val, y, 1)); \
} \
}
......@@ -169,17 +169,10 @@ DRAGON_API void Scale<float16, CUDAContext>(
ctx->cuda_stream()));
}
if (alpha != 1.f) {
CUBLAS_CHECK(
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST));
const auto& handle = ctx->cublas_handle();
CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST));
CUBLAS_CHECK(cublasScalEx(
ctx->cublas_handle(),
N,
&alpha,
CUDA_R_32F,
y,
CUDA_R_16F,
1,
CUDA_R_32F));
handle, N, &alpha, CUDA_R_32F, y, CUDA_R_16F, 1, CUDA_R_32F));
}
}
......@@ -299,9 +292,9 @@ DEFINE_AXPY_FUNC(int64_t);
DRAGON_API void Axpy<T, CUDAContext>( \
const int N, const float alpha, const T* x, T* y, CUDAContext* ctx) { \
T alpha_val = static_cast<T>(alpha); \
CUBLAS_CHECK( \
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \
CUBLAS_CHECK(cublasFunc(ctx->cublas_handle(), N, &alpha_val, x, 1, y, 1)); \
const auto& handle = ctx->cublas_handle(); \
CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST)); \
CUBLAS_CHECK(cublasFunc(handle, N, &alpha_val, x, 1, y, 1)); \
}
template <>
......@@ -311,10 +304,10 @@ DRAGON_API void Axpy<float16, CUDAContext>(
const float16* x,
float16* y,
CUDAContext* ctx) {
CUBLAS_CHECK(
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST));
const auto& handle = ctx->cublas_handle();
CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST));
CUBLAS_CHECK(cublasAxpyEx(
ctx->cublas_handle(),
handle,
N,
&alpha,
CUDA_R_32F,
......@@ -381,17 +374,17 @@ DEFINE_AXPBY_FUNC(double);
template <> \
DRAGON_API void Dot<T, CUDAContext>( \
const int N, const T* a, const T* b, T* y, CUDAContext* ctx) { \
CUBLAS_CHECK(cublasSetPointerMode( \
ctx->cublas_handle(), CUBLAS_POINTER_MODE_DEVICE)); \
CUBLAS_CHECK(cublasFunc(ctx->cublas_handle(), N, a, 1, b, 1, y)); \
const auto& handle = ctx->cublas_handle(); \
CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE)); \
CUBLAS_CHECK(cublasFunc(handle, N, a, 1, b, 1, y)); \
} \
template <> \
DRAGON_API T Dot<T, CUDAContext>( \
const int N, const T* a, const T* b, CUDAContext* ctx) { \
T ret; \
CUBLAS_CHECK( \
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \
CUBLAS_CHECK(cublasFunc(ctx->cublas_handle(), N, a, 1, b, 1, &ret)); \
const auto& handle = ctx->cublas_handle(); \
CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST)); \
CUBLAS_CHECK(cublasFunc(handle, N, a, 1, b, 1, &ret)); \
return ret; \
}
......@@ -402,10 +395,10 @@ DRAGON_API void Dot<float16, CUDAContext>(
const float16* b,
float16* y,
CUDAContext* ctx) {
CUBLAS_CHECK(
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_DEVICE));
const auto& handle = ctx->cublas_handle();
CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE));
CUBLAS_CHECK(cublasDotEx(
ctx->cublas_handle(),
handle,
N,
a,
CUDA_R_16F,
......@@ -426,17 +419,17 @@ DEFINE_DOT_FUNC(double, cublasDdot);
template <> \
DRAGON_API void ASum<T, CUDAContext>( \
const int N, const T* x, T* y, CUDAContext* ctx) { \
CUBLAS_CHECK(cublasSetPointerMode( \
ctx->cublas_handle(), CUBLAS_POINTER_MODE_DEVICE)); \
CUBLAS_CHECK(cublasFunc(ctx->cublas_handle(), N, x, 1, y)); \
const auto& handle = ctx->cublas_handle(); \
CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE)); \
CUBLAS_CHECK(cublasFunc(handle, N, x, 1, y)); \
} \
template <> \
DRAGON_API T ASum<T, CUDAContext>( \
const int N, const T* x, CUDAContext* ctx) { \
T ret; \
CUBLAS_CHECK( \
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \
CUBLAS_CHECK(cublasFunc(ctx->cublas_handle(), N, x, 1, &ret)); \
const auto& handle = ctx->cublas_handle(); \
CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST)); \
CUBLAS_CHECK(cublasFunc(handle, N, x, 1, &ret)); \
return ret; \
}
......@@ -456,15 +449,18 @@ DRAGON_API void Gemv<float16, CUDAContext>(
float16* y,
CUDAContext* ctx) {
auto cuTransA = TransA == CblasNoTrans ? CUBLAS_OP_T : CUBLAS_OP_N;
int m = cuTransA == CUBLAS_OP_N ? N : M;
int k = cuTransA == CUBLAS_OP_N ? M : N;
int LDA = cuTransA == CUBLAS_OP_N ? m : k;
int LDC = m;
CUBLAS_CHECK(
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST));
const int m = cuTransA == CUBLAS_OP_N ? N : M;
const int k = cuTransA == CUBLAS_OP_N ? M : N;
const int LDA = cuTransA == CUBLAS_OP_N ? m : k;
const int LDC = m;
const auto& handle = ctx->cublas_handle();
#if CUDA_VERSION < 11000
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
#endif
CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST));
if (TENSOR_CORE_AVAILABLE()) {
CUBLAS_CHECK(cublasGemmEx(
ctx->cublas_handle(),
handle,
cuTransA,
CUBLAS_OP_N,
m,
......@@ -485,7 +481,7 @@ DRAGON_API void Gemv<float16, CUDAContext>(
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
} else {
CUBLAS_CHECK(cublasSgemmEx(
ctx->cublas_handle(),
handle,
cuTransA,
CUBLAS_OP_N,
m,
......@@ -503,6 +499,9 @@ DRAGON_API void Gemv<float16, CUDAContext>(
CUDA_R_16F,
LDC));
}
#if CUDA_VERSION < 11000
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
#endif
}
#define DEFINE_GEMV_FUNC(T, cublasFunc) \
......@@ -520,21 +519,10 @@ DRAGON_API void Gemv<float16, CUDAContext>(
auto cuTransA = TransA == CblasNoTrans ? CUBLAS_OP_T : CUBLAS_OP_N; \
const auto alpha_val = static_cast<T>(alpha); \
const auto beta_val = static_cast<T>(beta); \
CUBLAS_CHECK( \
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \
const auto& handle = ctx->cublas_handle(); \
CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST)); \
CUBLAS_CHECK(cublasFunc( \
ctx->cublas_handle(), \
cuTransA, \
N, \
M, \
&alpha_val, \
A, \
N, \
x, \
1, \
&beta_val, \
y, \
1)); \
handle, cuTransA, N, M, &alpha_val, A, N, x, 1, &beta_val, y, 1)); \
}
DEFINE_GEMV_FUNC(float, cublasSgemv);
......@@ -558,11 +546,14 @@ DRAGON_API void Gemm<float16, CUDAContext>(
int ldb = (TransB == CblasNoTrans) ? N : K;
auto cuTransA = TransA == CblasNoTrans ? CUBLAS_OP_N : CUBLAS_OP_T;
auto cuTransB = TransB == CblasNoTrans ? CUBLAS_OP_N : CUBLAS_OP_T;
CUBLAS_CHECK(
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST));
const auto& handle = ctx->cublas_handle();
#if CUDA_VERSION < 11000
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
#endif
CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST));
if (TENSOR_CORE_AVAILABLE()) {
CUBLAS_CHECK(cublasGemmEx(
ctx->cublas_handle(),
handle,
cuTransB,
cuTransA,
N,
......@@ -583,7 +574,7 @@ DRAGON_API void Gemm<float16, CUDAContext>(
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
} else {
CUBLAS_CHECK(cublasSgemmEx(
ctx->cublas_handle(),
handle,
cuTransB,
cuTransA,
N,
......@@ -601,6 +592,9 @@ DRAGON_API void Gemm<float16, CUDAContext>(
CUDA_R_16F,
N));
}
#if CUDA_VERSION < 11000
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
#endif
}
#define DEFINE_GEMM_FUNC(T, cublasFunc) \
......@@ -623,10 +617,10 @@ DRAGON_API void Gemm<float16, CUDAContext>(
auto cuTransB = TransB == CblasNoTrans ? CUBLAS_OP_N : CUBLAS_OP_T; \
const auto alpha_val = static_cast<T>(alpha); \
const auto beta_val = static_cast<T>(beta); \
CUBLAS_CHECK( \
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \
const auto& handle = ctx->cublas_handle(); \
CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST)); \
CUBLAS_CHECK(cublasFunc( \
ctx->cublas_handle(), \
handle, \
cuTransB, \
cuTransA, \
N, \
......@@ -668,10 +662,13 @@ DRAGON_API void GemmBatched<float16, CUDAContext>(
thrust::device_vector<const void*> A_arr(A, A + batch_size);
thrust::device_vector<const void*> B_arr(B, B + batch_size);
thrust::device_vector<void*> C_arr(C, C + batch_size);
CUBLAS_CHECK(
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST));
const auto& handle = ctx->cublas_handle();
#if CUDA_VERSION < 11000
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
#endif
CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST));
CUBLAS_CHECK(cublasGemmBatchedEx(
ctx->cublas_handle(),
handle,
cuTransB,
cuTransA,
N,
......@@ -691,6 +688,9 @@ DRAGON_API void GemmBatched<float16, CUDAContext>(
batch_size,
CUDA_R_32F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
#if CUDA_VERSION < 11000
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
#endif
}
#define DEFINE_BATCHED_GEMM_FUNC(T, cublasFunc) \
......@@ -718,10 +718,10 @@ DRAGON_API void GemmBatched<float16, CUDAContext>(
thrust::device_vector<const T*> A_arr(A, A + batch_size); \
thrust::device_vector<const T*> B_arr(B, B + batch_size); \
thrust::device_vector<T*> C_arr(C, C + batch_size); \
CUBLAS_CHECK( \
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \
const auto& handle = ctx->cublas_handle(); \
CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST)); \
CUBLAS_CHECK(cublasFunc( \
ctx->cublas_handle(), \
handle, \
cuTransB, \
cuTransA, \
N, \
......@@ -764,10 +764,13 @@ DRAGON_API void GemmStridedBatched<float16, CUDAContext>(
int ldc = N;
auto cuTransA = TransA == CblasNoTrans ? CUBLAS_OP_N : CUBLAS_OP_T;
auto cuTransB = TransB == CblasNoTrans ? CUBLAS_OP_N : CUBLAS_OP_T;
CUBLAS_CHECK(
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST));
const auto& handle = ctx->cublas_handle();
#if CUDA_VERSION < 11000
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
#endif
CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST));
CUBLAS_CHECK(cublasGemmStridedBatchedEx(
ctx->cublas_handle(),
handle,
cuTransB,
cuTransA,
N,
......@@ -790,6 +793,9 @@ DRAGON_API void GemmStridedBatched<float16, CUDAContext>(
batch_size,
CUDA_R_32F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
#if CUDA_VERSION < 11000
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
#endif
}
#define DEFINE_STRIDED_BATCHED_GEMM_FUNC(T, cublasFunc) \
......@@ -817,10 +823,10 @@ DRAGON_API void GemmStridedBatched<float16, CUDAContext>(
auto cuTransB = TransB == CblasNoTrans ? CUBLAS_OP_N : CUBLAS_OP_T; \
const auto alpha_val = static_cast<T>(alpha); \
const auto beta_val = static_cast<T>(beta); \
CUBLAS_CHECK( \
cublasSetPointerMode(ctx->cublas_handle(), CUBLAS_POINTER_MODE_HOST)); \
const auto& handle = ctx->cublas_handle(); \
CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST)); \
CUBLAS_CHECK(cublasFunc( \
ctx->cublas_handle(), \
handle, \
cuTransB, \
cuTransA, \
N, \
......
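The cuBLAS launchers above now cache ctx->cublas_handle() in a local reference and, on toolkits older than CUDA 11.0, temporarily switch the handle to CUBLAS_TENSOR_OP_MATH before restoring CUBLAS_DEFAULT_MATH. A minimal sketch of that save/restore pattern, wrapped around a plain SGEMM for brevity (the real call sites use GemmEx with float16 data); names and the omitted error checks are illustrative:

#include <cublas_v2.h>
#include <cuda_runtime.h>

// C = A * B in column-major layout, with the math mode temporarily widened
// to allow Tensor Core paths on CUDA < 11 and restored afterwards.
void GemmSketch(cublasHandle_t handle,
                int m, int n, int k,
                const float* A, const float* B, float* C) {
  const float alpha = 1.f, beta = 0.f;
#if CUDA_VERSION < 11000
  cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);  // opt in per call
#endif
  cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
  cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
              m, n, k, &alpha, A, m, B, k, &beta, C, m);
#if CUDA_VERSION < 11000
  cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH);  // restore the default
#endif
}

int main() {
  const int m = 2, n = 2, k = 2;
  const float hA[m * k] = {1, 2, 3, 4}, hB[k * n] = {5, 6, 7, 8};
  float hC[m * n] = {0};
  float *dA, *dB, *dC;
  cudaMalloc(&dA, sizeof(hA));
  cudaMalloc(&dB, sizeof(hB));
  cudaMalloc(&dC, sizeof(hC));
  cudaMemcpy(dA, hA, sizeof(hA), cudaMemcpyHostToDevice);
  cudaMemcpy(dB, hB, sizeof(hB), cudaMemcpyHostToDevice);
  cublasHandle_t handle;
  cublasCreate(&handle);
  GemmSketch(handle, m, n, k, dA, dB, dC);
  cudaMemcpy(hC, dC, sizeof(hC), cudaMemcpyDeviceToHost);
  cublasDestroy(handle);
  cudaFree(dA);
  cudaFree(dB);
  cudaFree(dC);
  return 0;
}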
......@@ -14,30 +14,30 @@ namespace math {
namespace {
template <typename T>
__global__ void _RowwiseSet(const int n, const int cols, const T* x, T* y) {
CUDA_1D_KERNEL_LOOP(i, n) {
__global__ void _RowwiseSet(const int N, const int cols, const T* x, T* y) {
CUDA_1D_KERNEL_LOOP(i, N) {
y[i] = __ldg(x + i % cols);
}
}
template <typename T>
__global__ void _ColwiseSet(const int n, const int cols, const T* x, T* y) {
CUDA_1D_KERNEL_LOOP(i, n) {
__global__ void _ColwiseSet(const int N, const int cols, const T* x, T* y) {
CUDA_1D_KERNEL_LOOP(i, N) {
y[i] = __ldg(x + i / cols);
}
}
template <typename T, int D>
__global__ void _BroadcastSet(
const int nthreads,
const int num_dims,
const int N,
const SimpleArray<int, D> x_strides,
const SimpleArray<int, D> y_dims,
const T* x,
T* y) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
CUDA_1D_KERNEL_LOOP(yi, N) {
int xi = 0, tmp = yi;
for (int d = num_dims - 1; d >= 0; --d) {
#pragma unroll
for (int d = D - 1; d >= 0; --d) {
int r;
FIXED_DIVISOR_DIV_MOD(y_dims.data[d], tmp, &tmp, &r);
xi += r * x_strides.data[d];
......@@ -48,13 +48,13 @@ __global__ void _BroadcastSet(
template <typename InputT, typename OutputT, class Functor, bool BroadcastA>
__global__ void _RowwiseBinaryFunc(
const int nthreads,
const int N,
const int cols,
const Functor op,
const InputT* a,
const InputT* b,
OutputT* y) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
CUDA_1D_KERNEL_LOOP(yi, N) {
const int i = yi % cols;
const int ai = BroadcastA ? i : yi;
const int bi = BroadcastA ? yi : i;
......@@ -64,13 +64,13 @@ __global__ void _RowwiseBinaryFunc(
template <typename InputT, typename OutputT, class Functor, bool BroadcastA>
__global__ void _ColwiseBinaryFunc(
const int nthreads,
const int N,
const int cols,
const Functor op,
const InputT* a,
const InputT* b,
OutputT* y) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
CUDA_1D_KERNEL_LOOP(yi, N) {
const int i = yi / cols;
const int ai = BroadcastA ? i : yi;
const int bi = BroadcastA ? yi : i;
......@@ -80,8 +80,7 @@ __global__ void _ColwiseBinaryFunc(
template <typename InputT, typename OutputT, class Functor, int D>
__global__ void _BroadcastBinaryFunc(
const int nthreads,
const int num_dims,
const int N,
const SimpleArray<int, D> a_strides,
const SimpleArray<int, D> b_strides,
const SimpleArray<int, D> y_dims,
......@@ -89,9 +88,10 @@ __global__ void _BroadcastBinaryFunc(
const InputT* a,
const InputT* b,
OutputT* y) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
CUDA_1D_KERNEL_LOOP(yi, N) {
int ai = 0, bi = 0, tmp = yi;
for (int d = num_dims - 1; d >= 0; --d) {
#pragma unroll
for (int d = D - 1; d >= 0; --d) {
int r;
FIXED_DIVISOR_DIV_MOD(y_dims.data[d], tmp, &tmp, &r);
ai += r * a_strides.data[d];
......@@ -103,8 +103,7 @@ __global__ void _BroadcastBinaryFunc(
template <typename T, int D>
__global__ void _BroadcastWhere(
const int nthreads,
const int num_dims,
const int N,
const SimpleArray<int, D> a_strides,
const SimpleArray<int, D> b_strides,
const SimpleArray<int, D> c_strides,
......@@ -113,9 +112,10 @@ __global__ void _BroadcastWhere(
const T* b,
const uint8_t* c,
T* y) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
CUDA_1D_KERNEL_LOOP(yi, N) {
int ai = 0, bi = 0, ci = 0, tmp = yi;
for (int d = num_dims - 1; d >= 0; --d) {
#pragma unroll
for (int d = D - 1; d >= 0; --d) {
int r;
FIXED_DIVISOR_DIV_MOD(y_dims.data[d], tmp, &tmp, &r);
ai += r * a_strides.data[d];
......@@ -126,6 +126,71 @@ __global__ void _BroadcastWhere(
}
}
template <typename T, int D>
void _BroadcastSetImpl(
const int64_t* x_strides,
const int64_t* y_dims,
const T* x,
T* y,
CUDAContext* ctx) {
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_strides, Y_dims;
const auto N =
std::accumulate(y_dims, y_dims + D, 1, std::multiplies<int64_t>());
for (int i = 0; i < D; ++i) {
X_strides.data[i] = x_strides[i];
Y_dims.data[i] = y_dims[i];
}
_BroadcastSet<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, X_strides, Y_dims, x, y);
}
template <typename InputT, typename OutputT, class Functor, int D>
void _BroadcastBinaryFuncImpl(
const int64_t* a_strides,
const int64_t* b_strides,
const int64_t* y_dims,
const Functor op,
const InputT* a,
const InputT* b,
OutputT* y,
CUDAContext* ctx) {
SimpleArray<int, D> A_strides, B_strides, Y_dims;
const auto N =
std::accumulate(y_dims, y_dims + D, 1, std::multiplies<int64_t>());
for (int i = 0; i < D; ++i) {
A_strides.data[i] = a_strides[i];
B_strides.data[i] = b_strides[i];
Y_dims.data[i] = y_dims[i];
}
_BroadcastBinaryFunc<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, A_strides, B_strides, Y_dims, op, a, b, y);
}
template <typename T, int D>
void _BroadcastWhereImpl(
const int64_t* a_strides,
const int64_t* b_strides,
const int64_t* c_strides,
const int64_t* y_dims,
const T* a,
const T* b,
const uint8_t* c,
T* y,
CUDAContext* ctx) {
SimpleArray<int, D> A_strides, B_strides, C_strides;
SimpleArray<int, D> Y_dims;
const auto N =
std::accumulate(y_dims, y_dims + D, 1, std::multiplies<int64_t>());
for (int i = 0; i < D; ++i) {
A_strides.data[i] = a_strides[i];
B_strides.data[i] = b_strides[i];
C_strides.data[i] = c_strides[i];
Y_dims.data[i] = y_dims[i];
}
_BroadcastWhere<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, A_strides, B_strides, C_strides, Y_dims, a, b, c, y);
}
} // namespace
#define DEFINE_SET_FUNC(T, ScalarT) \
......@@ -151,53 +216,36 @@ __global__ void _BroadcastWhere(
return; \
} \
if (math::utils::IsRowwiseBroadcast(X_dims, Y_dims, &rows, &cols)) { \
const auto nthreads = rows * cols; \
_RowwiseSet<<< \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
nthreads, \
const auto N = rows * cols; \
_RowwiseSet<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
cols, \
reinterpret_cast<const ScalarT*>(x), \
reinterpret_cast<ScalarT*>(y)); \
return; \
} \
if (math::utils::IsColwiseBroadcast(X_dims, Y_dims, &rows, &cols)) { \
const auto nthreads = rows * cols; \
_ColwiseSet<<< \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
nthreads, \
const auto N = rows * cols; \
_ColwiseSet<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
cols, \
reinterpret_cast<const ScalarT*>(x), \
reinterpret_cast<ScalarT*>(y)); \
return; \
} \
vec64_t X_broadcast_strides, _; \
CUDA_TENSOR_DIMS_CHECK((int)Y_dims.size()); \
CUDA_TENSOR_DIMS_CHECK(int(Y_dims.size())); \
math::utils::ComputeBinaryBroadcastStrides( \
X_dims, Y_dims, X_broadcast_strides, _, _); \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> strides, dims; \
const auto nthreads = std::accumulate( \
Y_dims.begin(), Y_dims.end(), 1, std::multiplies<int64_t>()); \
for (int i = 0; i < Y_dims.size(); ++i) { \
strides.data[i] = X_broadcast_strides[i]; \
dims.data[i] = Y_dims[i]; \
} \
_BroadcastSet<<< \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
nthreads, \
Y_dims.size(), \
strides, \
dims, \
DISPATCH_FUNC_BY_VALUE_WITH_TYPE_1( \
_BroadcastSetImpl, \
ScalarT, \
int(Y_dims.size()), \
X_broadcast_strides.data(), \
Y_dims.data(), \
reinterpret_cast<const ScalarT*>(x), \
reinterpret_cast<ScalarT*>(y)); \
reinterpret_cast<ScalarT*>(y), \
ctx); \
}
DEFINE_SET_FUNC(bool, uint8_t);
......@@ -235,15 +283,14 @@ DEFINE_SET_FUNC(double, double);
} \
if (math::utils::IsRowwiseBroadcast( \
A_dims, B_dims, &rows, &cols, &broadcast_1st)) { \
const auto nthreads = rows * cols; \
const auto N = rows * cols; \
if (broadcast_1st > 0) { \
_RowwiseBinaryFunc< \
math::ScalarType<InputT>::type, \
math::ScalarType<OutputT>::type, \
Functor<math::ScalarType<InputT>::type>, \
true> \
<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
nthreads, \
true><<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
cols, \
Functor<math::ScalarType<InputT>::type>(), \
reinterpret_cast<const math::ScalarType<InputT>::type*>(a), \
......@@ -254,9 +301,8 @@ DEFINE_SET_FUNC(double, double);
math::ScalarType<InputT>::type, \
math::ScalarType<OutputT>::type, \
Functor<math::ScalarType<InputT>::type>, \
false> \
<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
nthreads, \
false><<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
cols, \
Functor<math::ScalarType<InputT>::type>(), \
reinterpret_cast<const math::ScalarType<InputT>::type*>(a), \
......@@ -267,15 +313,14 @@ DEFINE_SET_FUNC(double, double);
} \
if (math::utils::IsColwiseBroadcast( \
A_dims, B_dims, &rows, &cols, &broadcast_1st)) { \
const auto nthreads = rows * cols; \
const auto N = rows * cols; \
if (broadcast_1st > 0) { \
_ColwiseBinaryFunc< \
math::ScalarType<InputT>::type, \
math::ScalarType<OutputT>::type, \
Functor<math::ScalarType<InputT>::type>, \
true> \
<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
nthreads, \
true><<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
cols, \
Functor<math::ScalarType<InputT>::type>(), \
reinterpret_cast<const math::ScalarType<InputT>::type*>(a), \
......@@ -286,9 +331,8 @@ DEFINE_SET_FUNC(double, double);
math::ScalarType<InputT>::type, \
math::ScalarType<OutputT>::type, \
Functor<math::ScalarType<InputT>::type>, \
false> \
<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
nthreads, \
false><<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
cols, \
Functor<math::ScalarType<InputT>::type>(), \
reinterpret_cast<const math::ScalarType<InputT>::type*>(a), \
......@@ -300,30 +344,21 @@ DEFINE_SET_FUNC(double, double);
vec64_t A_broadcast_strides, B_broadcast_strides, Y_dims; \
math::utils::ComputeBinaryBroadcastStrides( \
A_dims, B_dims, A_broadcast_strides, B_broadcast_strides, Y_dims); \
CUDA_TENSOR_DIMS_CHECK((int)Y_dims.size()); \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> a_strides, b_strides, y_dims; \
const auto nthreads = std::accumulate( \
Y_dims.begin(), Y_dims.end(), 1, std::multiplies<int64_t>()); \
for (int i = 0; i < Y_dims.size(); ++i) { \
a_strides.data[i] = A_broadcast_strides[i]; \
b_strides.data[i] = B_broadcast_strides[i]; \
y_dims.data[i] = Y_dims[i]; \
} \
_BroadcastBinaryFunc< \
CUDA_TENSOR_DIMS_CHECK(int(Y_dims.size())); \
DISPATCH_FUNC_BY_VALUE_WITH_TYPE_3( \
_BroadcastBinaryFuncImpl, \
math::ScalarType<InputT>::type, \
math::ScalarType<OutputT>::type, \
Functor<math::ScalarType<InputT>::type>, \
CUDA_TENSOR_MAX_DIMS> \
<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
nthreads, \
Y_dims.size(), \
a_strides, \
b_strides, \
y_dims, \
int(Y_dims.size()), \
A_broadcast_strides.data(), \
B_broadcast_strides.data(), \
Y_dims.data(), \
Functor<math::ScalarType<InputT>::type>(), \
reinterpret_cast<const math::ScalarType<InputT>::type*>(a), \
reinterpret_cast<const math::ScalarType<InputT>::type*>(b), \
reinterpret_cast<math::ScalarType<OutputT>::type*>(y)); \
reinterpret_cast<math::ScalarType<OutputT>::type*>(y), \
ctx); \
}
DEFINE_BINARY_FUNC(Add, uint8_t, uint8_t, math::PlusFunctor);
......@@ -526,31 +561,19 @@ DEFINE_BINARY_FUNC(GreaterEqual, bool, bool, uint8_t, bool);
B_dims, Y_dims, B_broadcast_strides, _, _); \
math::utils::ComputeBinaryBroadcastStrides( \
C_dims, Y_dims, C_broadcast_strides, _, _); \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> a_strides, b_strides, c_strides; \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> y_dims; \
const auto nthreads = std::accumulate( \
Y_dims.begin(), Y_dims.end(), 1, std::multiplies<int64_t>()); \
for (int i = 0; i < Y_dims.size(); ++i) { \
a_strides.data[i] = A_broadcast_strides[i]; \
b_strides.data[i] = B_broadcast_strides[i]; \
c_strides.data[i] = C_broadcast_strides[i]; \
y_dims.data[i] = Y_dims[i]; \
} \
_BroadcastWhere<<< \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
nthreads, \
Y_dims.size(), \
a_strides, \
b_strides, \
c_strides, \
y_dims, \
DISPATCH_FUNC_BY_VALUE_WITH_TYPE_1( \
_BroadcastWhereImpl, \
ScalarT, \
int(Y_dims.size()), \
A_broadcast_strides.data(), \
B_broadcast_strides.data(), \
C_broadcast_strides.data(), \
Y_dims.data(), \
reinterpret_cast<const ScalarT*>(a), \
reinterpret_cast<const ScalarT*>(b), \
reinterpret_cast<const uint8_t*>(c), \
reinterpret_cast<ScalarT*>(y)); \
reinterpret_cast<ScalarT*>(y), \
ctx); \
}
DEFINE_WHERE_FUNC(bool, uint8_t);
......
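All of the broadcast kernels above share one piece of index arithmetic: the flat output index yi is peeled into per-dimension coordinates using the output dims, and each coordinate is multiplied by the input's broadcast stride (zero along broadcast dimensions) to recover the flat input index. A host-side sketch of that mapping, using ordinary division in place of FIXED_DIVISOR_DIV_MOD and made-up shapes:

#include <cstdio>

// Map a flat output index to a flat input index under broadcasting.
// x_strides[d] is 0 for dimensions of size 1 in the input.
int BroadcastIndex(const int num_dims,
                   const int* y_dims,
                   const int* x_strides,
                   int yi) {
  int xi = 0;
  for (int d = num_dims - 1; d >= 0; --d) {
    const int r = yi % y_dims[d];  // coordinate along dimension d
    yi /= y_dims[d];
    xi += r * x_strides[d];
  }
  return xi;
}

int main() {
  // Broadcasting an input of shape (1, 3) over an output of shape (2, 3):
  // the input stride along the broadcast dimension is 0.
  const int y_dims[2] = {2, 3};
  const int x_strides[2] = {0, 1};
  for (int yi = 0; yi < 6; ++yi) {
    printf("yi=%d -> xi=%d\n", yi, BroadcastIndex(2, y_dims, x_strides, yi));
  }
  // prints xi = 0, 1, 2, 0, 1, 2
  return 0;
}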
......@@ -62,7 +62,6 @@ template <typename T, typename AccT, class Reducer, int D>
__global__ void _GenericReduce(
const int rows,
const int cols,
const int num_dims,
const SimpleArray<int, D> x_dims,
const SimpleArray<int, D> x_strides,
const Reducer reducer,
......@@ -75,7 +74,8 @@ __global__ void _GenericReduce(
AccT val = init;
CUDA_2D_KERNEL_LOOP2(j, cols) {
int xi = 0, c = i * cols + j;
for (int d = num_dims - 1; d >= 0; --d) {
#pragma unroll
for (int d = D - 1; d >= 0; --d) {
int r;
FIXED_DIVISOR_DIV_MOD(x_dims.data[d], c, &c, &r);
xi += r * x_strides.data[d];
......@@ -89,6 +89,46 @@ __global__ void _GenericReduce(
}
}
template <typename T, typename AccT, class Reducer, int D>
void _GenericReduceImpl(
const int* dims,
const int num_axes,
const int* axes,
const Reducer reducer,
const AccT init,
const AccT scale,
const T* x,
T* y,
CUDAContext* ctx) {
SimpleArray<int, D> transpose_axes;
SimpleArray<int, D> transpose_strides;
SimpleArray<int, D> transpose_dims;
math::utils::TransposeAxesForReduce(D, num_axes, axes, transpose_axes.data);
math::utils::ComputeTransposeStrides(
D, dims, transpose_axes.data, transpose_strides.data);
int rows = 1, cols = 1;
const int pivot = D - num_axes;
for (int i = 0; i < pivot; ++i) {
rows *= dims[transpose_axes.data[i]];
}
for (int i = pivot; i < D; ++i) {
cols *= dims[transpose_axes.data[i]];
}
for (int i = 0; i < D; ++i) {
transpose_dims.data[i] = dims[transpose_axes.data[i]];
}
_GenericReduce<<<rows, CUDA_THREADS, 0, ctx->cuda_stream()>>>(
rows,
cols,
transpose_dims,
transpose_strides,
reducer,
init,
scale,
x,
y);
}
#define DEFINE_REDUCE_DISPATCHER(name) \
template <typename T, typename AccT, typename Reducer> \
void _Reduce##name( \
......@@ -120,35 +160,21 @@ __global__ void _GenericReduce(
return; \
} \
CUDA_TENSOR_DIMS_CHECK(num_dims); \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> transpose_axes; \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> transpose_strides; \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> transpose_dims; \
math::utils::TransposeAxesForReduce( \
num_dims, num_axes, axes, transpose_axes.data); \
math::utils::ComputeTransposeStrides( \
num_dims, dims, transpose_axes.data, transpose_strides.data); \
rows = cols = 1; \
const int pivot = num_dims - num_axes; \
for (int i = 0; i < pivot; ++i) { \
rows *= dims[transpose_axes.data[i]]; \
} \
for (int i = pivot; i < num_dims; ++i) { \
cols *= dims[transpose_axes.data[i]]; \
} \
for (int i = 0; i < num_dims; ++i) { \
transpose_dims.data[i] = dims[transpose_axes.data[i]]; \
} \
_GenericReduce<<<rows, CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
rows, \
cols, \
DISPATCH_FUNC_BY_VALUE_WITH_TYPE_3( \
_GenericReduceImpl, \
T, \
AccT, \
Reducer, \
num_dims, \
transpose_dims, \
transpose_strides, \
dims, \
num_axes, \
axes, \
reducer, \
init, \
scale, \
x, \
y); \
y, \
ctx); \
}
DEFINE_REDUCE_DISPATCHER(Max);
......
......@@ -311,14 +311,41 @@ inline void ComputeTransposeStrides(
}
}
template <typename DimT, typename AxisT>
inline void CollapseTransposeAxes(
const int num_dims,
const DimT* dims,
const AxisT* axes,
vector<DimT>& new_dims,
vector<AxisT>& new_axes) {
new_dims = vector<DimT>(dims, dims + num_dims);
new_axes = vector<AxisT>({axes[0]});
vector<AxisT> collapse_axes;
for (int i = 1; i < num_dims; ++i) {
if (axes[i] - 1 == axes[i - 1]) {
collapse_axes.push_back(axes[i]);
new_dims[axes[i]] *= new_dims[axes[i] - 1];
new_dims[axes[i] - 1] = -1;
} else {
new_axes.push_back(axes[i]);
}
}
const auto& erase_iter = std::remove_if(
new_dims.begin(), new_dims.end(), [](int x) { return x == -1; });
new_dims.erase(erase_iter, new_dims.end());
for (int i = 0; i < new_axes.size(); ++i) {
for (auto collapse_axis : collapse_axes) {
if (new_axes[i] > collapse_axis) new_axes[i]--;
}
}
}
template <typename DimT, typename IndexT>
inline IndexT
GetIndexFromDims(const int num_dims, const DimT* dims, IndexT* index) {
IndexT ret = 0;
for (int i = 0; i < num_dims; ++i) {
if (dims[i] > 1) {
ret = ret * dims[i] + index[i];
}
if (dims[i] > 1) ret = ret * dims[i] + index[i];
}
return ret;
}
......
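CollapseTransposeAxes, shown above, merges axes that stay adjacent under the permutation, so the transpose kernels are launched with the smallest possible rank and hence the smallest D instantiation. As a worked example under the assumptions of the sketch below: dims (2, 3, 4, 5) permuted by (0, 2, 3, 1) collapse to dims (2, 3, 20) with axes (0, 2, 1), because axes 2 and 3 move together. The sketch is a host-side restatement of the rule, not the library function itself:

#include <algorithm>
#include <cstdio>
#include <vector>

// Fuse consecutive axes of a permutation into a single dimension.
void Collapse(const std::vector<int>& dims,
              const std::vector<int>& axes,
              std::vector<int>& new_dims,
              std::vector<int>& new_axes) {
  new_dims = dims;
  new_axes = {axes[0]};
  std::vector<int> collapsed;
  for (size_t i = 1; i < dims.size(); ++i) {
    if (axes[i] - 1 == axes[i - 1]) {
      collapsed.push_back(axes[i]);
      new_dims[axes[i]] *= new_dims[axes[i] - 1];  // fuse with the previous axis
      new_dims[axes[i] - 1] = -1;                  // mark for removal
    } else {
      new_axes.push_back(axes[i]);
    }
  }
  new_dims.erase(
      std::remove(new_dims.begin(), new_dims.end(), -1), new_dims.end());
  for (auto& axis : new_axes) {
    for (int c : collapsed) {
      if (axis > c) --axis;  // re-index axes after the removed ones
    }
  }
}

int main() {
  std::vector<int> new_dims, new_axes;
  Collapse({2, 3, 4, 5}, {0, 2, 3, 1}, new_dims, new_axes);
  for (int d : new_dims) printf("%d ", d);  // 2 3 20
  printf("| ");
  for (int a : new_axes) printf("%d ", a);  // 0 2 1
  printf("\n");
  return 0;
}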
......@@ -267,7 +267,7 @@ def uniform_(tensor, a=0, b=1):
----------
tensor : dragon.vm.torch.Tensor
The input tensor.
a : number, optional, default=-1
a : number, optional, default=0
The value to :math:`\alpha`.
b : number, optional, default=1
The value to :math:`\beta`.
......
......@@ -390,7 +390,7 @@ class MultiheadAttention(Module):
self.in_proj_bias = Parameter(Tensor(3 * embed_dim))
else:
self.register_parameter('in_proj_bias', None)
self.out_proj = Linear(embed_dim, embed_dim, bias=True)
self.out_proj = Linear(embed_dim, embed_dim, bias=bias)
self.reset_parameters()
def reset_parameters(self):
......