Commit d56e67d1 by Ting PAN

Use FP32 accumulator for FP16 ReduceSum

Summary:
This commit adds an FP32-accumulator fallback for FP16 ReduceSum,
so that small values are not rounded away during accumulation.
In addition, nearly all FP16 kernels are now available on architectures below sm_53 (__CUDA_ARCH__ < 530).
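For reference, the idea behind the FP32 accumulator is sketched below. This is a minimal standalone illustration, not the actual Dragon ReduceSum kernel; the kernel name and the fixed block size are assumptions. Each FP16 value is widened to FP32 before it is added, and the sum is narrowed back to FP16 only once at the end.

// Minimal sketch (not the Dragon kernel): sum a half tensor with an FP32
// accumulator. "ReduceSumHalfSketch" and the block size are illustrative.
// Assumes the kernel is launched with kBlockThreads threads per block.
#include <cuda_fp16.h>

constexpr int kBlockThreads = 256;

__global__ void
ReduceSumHalfSketch(const int n, const half* x, half* block_sums) {
  __shared__ float partial[kBlockThreads];
  float val = 0.f;
  // Grid-stride loop: widen every element to FP32 before accumulating.
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    val += __half2float(x[i]);
  }
  partial[threadIdx.x] = val;
  __syncthreads();
  // Shared-memory tree reduction, still in FP32.
  for (int stride = kBlockThreads / 2; stride > 0; stride >>= 1) {
    if (threadIdx.x < stride) {
      partial[threadIdx.x] += partial[threadIdx.x + stride];
    }
    __syncthreads();
  }
  // Narrow back to FP16 only once, when writing the per-block result.
  if (threadIdx.x == 0) {
    block_sums[blockIdx.x] = __float2half(partial[0]);
  }
}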
1 parent 9ca4b60f
Showing with 846 additions and 782 deletions
...@@ -47,7 +47,7 @@ class Dropout(Layer): ...@@ -47,7 +47,7 @@ class Dropout(Layer):
param = layer_param.dropout_param param = layer_param.dropout_param
if not param.scale_train: if not param.scale_train:
raise ValueError('Unscaled dropout is not supported.') raise ValueError('Unscaled dropout is not supported.')
self.arguments = {'prob': param.dropout_ratio} self.arguments = {'ratio': param.dropout_ratio}
def __call__(self, bottom): def __call__(self, bottom):
return activation_ops.dropout(bottom, **self.arguments) return activation_ops.dropout(bottom, **self.arguments)
......
...@@ -22,27 +22,28 @@ __global__ void _DropPath( ...@@ -22,27 +22,28 @@ __global__ void _DropPath(
T* y) { T* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 350 #if __CUDA_ARCH__ >= 350
y[i] = x[i] * (T)(__ldg(mask + (i / cols)) > thresh) * scale; y[i] = x[i] * T(__ldg(mask + (i / cols)) > thresh) * scale;
#else #else
y[i] = x[i] * (T)(mask[i / cols] > thresh) * scale; y[i] = x[i] * T(mask[i / cols] > thresh) * scale;
#endif #endif
} }
} }
template <> __global__ void _DropPath(
__global__ void _DropPath<half>(
const int nthreads, const int nthreads,
const int cols, const int cols,
const float thresh, const float thresh,
const half scale, const float scale,
const half* x, const half* x,
const float* mask, const float* mask,
half* y) { half* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 350
y[i] = __hmul( y[i] = __float2half(
__hmul(x[i], scale), __half2float(x[i]) * float(__ldg(mask + (i / cols)) > thresh) * scale);
__float2half((float)(__ldg(mask + (i / cols)) > thresh))); #else
y[i] = __float2half(
__half2float(x[i]) * float(mask[i / cols] > thresh) * scale);
#endif #endif
} }
} }
...@@ -60,13 +61,13 @@ void DropPath<float16, CUDAContext>( ...@@ -60,13 +61,13 @@ void DropPath<float16, CUDAContext>(
const float* mask, const float* mask,
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
auto nthreads = rows * cols; const auto nthreads = rows * cols;
auto thresh = 1.f - (1.f / scale); const auto thresh = 1.f - (1.f / scale);
_DropPath<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( _DropPath<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
nthreads, nthreads,
cols, cols,
thresh, thresh,
cast::to<half>(scale), scale,
reinterpret_cast<const half*>(x), reinterpret_cast<const half*>(x),
mask, mask,
reinterpret_cast<half*>(y)); reinterpret_cast<half*>(y));
...@@ -82,8 +83,8 @@ void DropPath<float16, CUDAContext>( ...@@ -82,8 +83,8 @@ void DropPath<float16, CUDAContext>(
const float* mask, \ const float* mask, \
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
auto nthreads = rows * cols; \ const auto nthreads = rows * cols; \
auto thresh = 1.f - (1.f / scale); \ const auto thresh = 1.f - (1.f / scale); \
_DropPath<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ _DropPath<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
nthreads, cols, thresh, cast::to<T>(scale), x, mask, y); \ nthreads, cols, thresh, cast::to<T>(scale), x, mask, y); \
} }
......
...@@ -37,20 +37,20 @@ void _ApplyMask<float16>( ...@@ -37,20 +37,20 @@ void _ApplyMask<float16>(
template <typename T> template <typename T>
void _Dropout( void _Dropout(
const int count, const int count,
const T prob, const T ratio,
const T scale, const T scale,
const T* x, const T* x,
uint8_t* mask, uint8_t* mask,
T* y, T* y,
CPUContext* ctx) { CPUContext* ctx) {
math::RandomBernoulli(count, T(1) - prob, mask, ctx); math::RandomBernoulli(count, T(1) - ratio, mask, ctx);
_ApplyMask(count, scale, x, mask, y); _ApplyMask(count, scale, x, mask, y);
} }
template <> template <>
void _Dropout<float16>( void _Dropout<float16>(
const int count, const int count,
const float16 prob, const float16 ratio,
const float16 scale, const float16 scale,
const float16* x, const float16* x,
uint8_t* mask, uint8_t* mask,
...@@ -77,14 +77,14 @@ void _Dropout<float16>( ...@@ -77,14 +77,14 @@ void _Dropout<float16>(
template <> \ template <> \
void Dropout<T, CPUContext>( \ void Dropout<T, CPUContext>( \
const int count, \ const int count, \
const float prob, \ const float ratio, \
const float scale, \ const float scale, \
const T* x, \ const T* x, \
uint8_t* mask, \ uint8_t* mask, \
T* y, \ T* y, \
uint32_t* r, \ uint32_t* r, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_Dropout(count, cast::to<T>(prob), cast::to<T>(scale), x, mask, y, ctx); \ _Dropout(count, cast::to<T>(ratio), cast::to<T>(scale), x, mask, y, ctx); \
} }
DEFINE_KERNEL_LAUNCHER(float16); DEFINE_KERNEL_LAUNCHER(float16);
......
...@@ -23,17 +23,14 @@ __global__ void _ApplyMask( ...@@ -23,17 +23,14 @@ __global__ void _ApplyMask(
} }
} }
template <> __global__ void _ApplyMask(
__global__ void _ApplyMask<half>(
const int nthreads, const int nthreads,
const half scale, const float scale,
const half* x, const half* x,
const uint8_t* mask, const uint8_t* mask,
half* y) { half* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 y[i] = __float2half(__half2float(x[i]) * (float)mask[i] * scale);
y[i] = __hmul(__hmul(x[i], scale), __float2half((float)mask[i]));
#endif
} }
} }
...@@ -47,25 +44,21 @@ __global__ void _Dropout( ...@@ -47,25 +44,21 @@ __global__ void _Dropout(
uint8_t* mask, uint8_t* mask,
T* y) { T* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = x[i] * (T)(mask[i] = (r[i] > threshold)) * scale; y[i] = x[i] * T(mask[i] = (r[i] > threshold)) * scale;
} }
} }
template <> __global__ void _Dropout(
__global__ void _Dropout<half>(
const int nthreads, const int nthreads,
const uint32_t threshold, const uint32_t threshold,
const half scale, const float scale,
const half* x, const half* x,
const uint32_t* r, const uint32_t* r,
uint8_t* mask, uint8_t* mask,
half* y) { half* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 y[i] = __float2half(
y[i] = __hmul( __half2float(x[i]) * float(mask[i] = (r[i] > threshold)) * scale);
__hmul(x[i], scale),
__float2half((float)(mask[i] = (r[i] > threshold))));
#endif
} }
} }
...@@ -83,7 +76,7 @@ void ApplyMask<float16, CUDAContext>( ...@@ -83,7 +76,7 @@ void ApplyMask<float16, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
_ApplyMask<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( _ApplyMask<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
count, count,
cast::to<half>(scale), scale,
reinterpret_cast<const half*>(x), reinterpret_cast<const half*>(x),
mask, mask,
reinterpret_cast<half*>(y)); reinterpret_cast<half*>(y));
...@@ -92,7 +85,7 @@ void ApplyMask<float16, CUDAContext>( ...@@ -92,7 +85,7 @@ void ApplyMask<float16, CUDAContext>(
template <> template <>
void Dropout<float16, CUDAContext>( void Dropout<float16, CUDAContext>(
const int count, const int count,
const float prob, const float ratio,
const float scale, const float scale,
const float16* x, const float16* x,
uint8_t* mask, uint8_t* mask,
...@@ -102,8 +95,8 @@ void Dropout<float16, CUDAContext>( ...@@ -102,8 +95,8 @@ void Dropout<float16, CUDAContext>(
math::Random(count, r, ctx); math::Random(count, r, ctx);
_Dropout<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( _Dropout<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
count, count,
static_cast<uint32_t>(UINT_MAX * prob), static_cast<uint32_t>(UINT_MAX * ratio),
cast::to<half>(scale), scale,
reinterpret_cast<const half*>(x), reinterpret_cast<const half*>(x),
r, r,
mask, mask,
...@@ -125,7 +118,7 @@ void Dropout<float16, CUDAContext>( ...@@ -125,7 +118,7 @@ void Dropout<float16, CUDAContext>(
template <> \ template <> \
void Dropout<T, CUDAContext>( \ void Dropout<T, CUDAContext>( \
const int count, \ const int count, \
const float prob, \ const float ratio, \
const float scale, \ const float scale, \
const T* x, \ const T* x, \
uint8_t* mask, \ uint8_t* mask, \
...@@ -133,7 +126,7 @@ void Dropout<float16, CUDAContext>( ...@@ -133,7 +126,7 @@ void Dropout<float16, CUDAContext>(
uint32_t* r, \ uint32_t* r, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
math::Random(count, r, ctx); \ math::Random(count, r, ctx); \
auto threshold = static_cast<uint32_t>(UINT_MAX * prob); \ auto threshold = static_cast<uint32_t>(UINT_MAX * ratio); \
_Dropout<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ _Dropout<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, threshold, cast::to<T>(scale), x, r, mask, y); \ count, threshold, cast::to<T>(scale), x, r, mask, y); \
} }
......
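The FP16 dropout kernels above all follow the same fallback: the scale stays a plain float at the launcher (no more cast::to<half>(scale)), and the kernel widens each element with __half2float, does the arithmetic in FP32, and narrows the result with __float2half, so no half intrinsics and no sm_53 guard are needed. A minimal self-contained sketch of that pattern (the kernel name is illustrative):

// Sketch of the masking pattern above; the multiply happens in FP32, so the
// kernel also runs on architectures below sm_53.
#include <cuda_fp16.h>
#include <stdint.h>

__global__ void ApplyMaskHalfSketch(
    const int n,
    const float scale,
    const half* x,
    const uint8_t* mask,
    half* y) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    y[i] = __float2half(__half2float(x[i]) * (float)mask[i] * scale);
  }
}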
...@@ -24,11 +24,15 @@ __global__ void _PRelu(const int nthreads, const T* x, const T* w, T* y) { ...@@ -24,11 +24,15 @@ __global__ void _PRelu(const int nthreads, const T* x, const T* w, T* y) {
template <> template <>
__global__ void __global__ void
_PRelu<half>(const int nthreads, const half* x, const half* w, half* y) { _PRelu<half>(const int nthreads, const half* x, const half* w, half* y) {
const half kZero = __float2half(0.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 350
y[i] = __hgt(__ldg(x + i), kZero) ? __ldg(x + i) y[i] = __half2float(__ldg(x + i)) > 0.f
: __hmul(__ldg(x + i), __ldg(w)); ? __ldg(x + i)
: __float2half(__half2float(__ldg(x + i)) * __half2float(__ldg(w)));
#else
y[i] = __half2float(x[i]) > 0.f
? x[i]
: __float2half(__half2float(x[i]) * __half2float(w[0]));
#endif #endif
} }
} }
...@@ -59,12 +63,17 @@ __global__ void _PReluNCHW<half>( ...@@ -59,12 +63,17 @@ __global__ void _PReluNCHW<half>(
const half* x, const half* x,
const half* w, const half* w,
half* y) { half* y) {
const half kZero = __float2half(0.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 350
y[i] = __hgt(__ldg(x + i), kZero) y[i] = __half2float(__ldg(x + i)) > 0.f
? __ldg(x + i) ? __ldg(x + i)
: __hmul(__ldg(x + i), __ldg(w + ((i / S) % C))); : __float2half(
__half2float(__ldg(x + i)) *
__half2float(__ldg(w + ((i / S) % C))));
#else
y[i] = __half2float(x[i]) > 0.f
? x[i]
: __float2half(__half2float(x[i]) * __half2float(w[(i / S) % C]));
#endif #endif
} }
} }
...@@ -89,12 +98,16 @@ __global__ void _PReluNHWC<half>( ...@@ -89,12 +98,16 @@ __global__ void _PReluNHWC<half>(
const half* x, const half* x,
const half* w, const half* w,
half* y) { half* y) {
const half kZero = __float2half(0.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 350
y[i] = __hgt(__ldg(x + i), kZero) y[i] = __half2float(__ldg(x + i)) > 0.f
? __ldg(x + i) ? __ldg(x + i)
: __hmul(__ldg(x + i), __ldg(w + (i % C))); : __float2half(
__half2float(__ldg(x + i)) * __half2float(__ldg(w + (i % C))));
#else
y[i] = __half2float(x[i]) > 0.f
? x[i]
: __float2half(__half2float(x[i]) * __half2float(w[i % C]));
#endif #endif
} }
} }
...@@ -118,11 +131,15 @@ __global__ void _PReluGrad<half>( ...@@ -118,11 +131,15 @@ __global__ void _PReluGrad<half>(
const half* x, const half* x,
const half* w, const half* w,
half* dx) { half* dx) {
const half kOne = __float2half(1.f);
const half kZero = __float2half(0.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 350
dx[i] = __hmul(dy[i], (__hgt(__ldg(x + i), kZero) ? kOne : __ldg(w))); dx[i] = __float2half(
__half2float(dy[i]) *
(__half2float(x[i]) > 0.f ? 1.f : __half2float(__ldg(w))));
#else
dx[i] = __float2half(
__half2float(dy[i]) *
(__half2float(x[i]) > 0.f ? 1.f : __half2float(w[0])));
#endif #endif
} }
} }
...@@ -154,12 +171,16 @@ __global__ void _PReluGradNCHW<half>( ...@@ -154,12 +171,16 @@ __global__ void _PReluGradNCHW<half>(
const half* x, const half* x,
const half* w, const half* w,
half* dx) { half* dx) {
const half kOne = __float2half(1.f);
const half kZero = __float2half(0.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 350
dx[i] = dx[i] = __float2half(
dy[i] * (__hgt(__ldg(x + i), kZero) ? kOne : __ldg(w + ((i / S) % C))); __half2float(dy[i]) *
(__half2float(x[i]) > 0.f ? 1.f
: __half2float(__ldg(w + ((i / S) % C)))));
#else
dx[i] = __float2half(
__half2float(dy[i]) *
(__half2float(x[i]) > 0.f ? 1.f : __half2float(w[(i / S) % C])));
#endif #endif
} }
} }
...@@ -189,11 +210,15 @@ __global__ void _PReluGradNHWC<half>( ...@@ -189,11 +210,15 @@ __global__ void _PReluGradNHWC<half>(
const half* x, const half* x,
const half* w, const half* w,
half* dx) { half* dx) {
const half kOne = __float2half(1.f);
const half kZero = __float2half(0.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 350
dx[i] = dy[i] * (__hgt(__ldg(x + i), kZero) ? kOne : __ldg(w + (i % C))); dx[i] = __float2half(
__half2float(dy[i]) *
(__half2float(x[i]) > 0.f ? 1.f : __half2float(__ldg(w + (i % C)))));
#else
dx[i] = __float2half(
__half2float(dy[i]) *
(__half2float(x[i]) > 0.f ? 1.f : __half2float(w[i % C])));
#endif #endif
} }
} }
...@@ -201,12 +226,14 @@ __global__ void _PReluGradNHWC<half>( ...@@ -201,12 +226,14 @@ __global__ void _PReluGradNHWC<half>(
template <typename T> template <typename T>
__global__ void _PReluWGrad(const int N, const T* dy, const T* x, T* dw) { __global__ void _PReluWGrad(const int N, const T* dy, const T* x, T* dw) {
__shared__ typename BlockReduce<T>::TempStorage storage; __shared__ typename BlockReduce<T>::TempStorage storage;
T val = T(0); T val = T(0);
CUDA_2D_KERNEL_LOOP2(i, N) { CUDA_2D_KERNEL_LOOP2(i, N) {
#if __CUDA_ARCH__ >= 350
val += __ldg(x + i) < T(0) ? dy[i] * __ldg(x + i) : T(0);
#else
val += x[i] < T(0) ? dy[i] * x[i] : T(0); val += x[i] < T(0) ? dy[i] * x[i] : T(0);
#endif
} }
val = BlockReduce<T>(storage).Sum(val); val = BlockReduce<T>(storage).Sum(val);
if (threadIdx.x == 0) *dw = val; if (threadIdx.x == 0) *dw = val;
} }
...@@ -214,16 +241,18 @@ __global__ void _PReluWGrad(const int N, const T* dy, const T* x, T* dw) { ...@@ -214,16 +241,18 @@ __global__ void _PReluWGrad(const int N, const T* dy, const T* x, T* dw) {
template <> template <>
__global__ void __global__ void
_PReluWGrad<half>(const int N, const half* dy, const half* x, half* dw) { _PReluWGrad<half>(const int N, const half* dy, const half* x, half* dw) {
const half kZero = __float2half(0.f);
__shared__ typename BlockReduce<float>::TempStorage storage; __shared__ typename BlockReduce<float>::TempStorage storage;
float val = 0.f; float val = 0.f;
CUDA_2D_KERNEL_LOOP2(i, N) { CUDA_2D_KERNEL_LOOP2(i, N) {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 350
val += __hlt(x[i], kZero) ? __half2float(__hmul(dy[i], x[i])) : 0.f; val += __half2float(__ldg(x + i)) < 0.f
? __half2float(dy[i]) * __half2float(__ldg(x + i))
: 0.f;
#else
val += __half2float(x[i]) < 0.f ? __half2float(dy[i]) * __half2float(x[i])
: 0.f;
#endif #endif
} }
val = BlockReduce<float>(storage).Sum(val); val = BlockReduce<float>(storage).Sum(val);
if (threadIdx.x == 0) *dw = __float2half(val); if (threadIdx.x == 0) *dw = __float2half(val);
} }
...@@ -241,7 +270,11 @@ __global__ void _PReluWGradNCHW( ...@@ -241,7 +270,11 @@ __global__ void _PReluWGradNCHW(
T val = T(0); T val = T(0);
CUDA_2D_KERNEL_LOOP2(j, NS) { CUDA_2D_KERNEL_LOOP2(j, NS) {
const int yi = ((j / S) * C + i) * S + j % S; const int yi = ((j / S) * C + i) * S + j % S;
#if __CUDA_ARCH__ >= 350
val += __ldg(x + yi) < T(0) ? dy[yi] * __ldg(x + yi) : T(0);
#else
val += x[yi] < T(0) ? dy[yi] * x[yi] : T(0); val += x[yi] < T(0) ? dy[yi] * x[yi] : T(0);
#endif
} }
val = BlockReduce<T>(storage).Sum(val); val = BlockReduce<T>(storage).Sum(val);
if (threadIdx.x == 0) dw[i] = val; if (threadIdx.x == 0) dw[i] = val;
...@@ -256,14 +289,19 @@ __global__ void _PReluWGradNCHW<half>( ...@@ -256,14 +289,19 @@ __global__ void _PReluWGradNCHW<half>(
const half* dy, const half* dy,
const half* x, const half* x,
half* dw) { half* dw) {
const half kZero = __float2half(0.f);
__shared__ typename BlockReduce<float>::TempStorage storage; __shared__ typename BlockReduce<float>::TempStorage storage;
CUDA_2D_KERNEL_LOOP1(i, C) { CUDA_2D_KERNEL_LOOP1(i, C) {
float val = 0.f; float val = 0.f;
CUDA_2D_KERNEL_LOOP2(j, NS) { CUDA_2D_KERNEL_LOOP2(j, NS) {
#if __CUDA_ARCH__ >= 530
const int yi = ((j / S) * C + i) * S + j % S; const int yi = ((j / S) * C + i) * S + j % S;
val += __hlt(x[yi], kZero) ? __half2float(__hmul(dy[yi], x[yi])) : 0.f; #if __CUDA_ARCH__ >= 350
val += __half2float(__ldg(x + yi)) < 0.f
? __half2float(dy[yi]) * __half2float(__ldg(x + yi))
: 0.f;
#else
val += __half2float(x[yi]) < 0.f
? __half2float(dy[yi]) * __half2float(x[yi])
: 0.f;
#endif #endif
} }
val = BlockReduce<float>(storage).Sum(val); val = BlockReduce<float>(storage).Sum(val);
...@@ -279,7 +317,11 @@ _PReluWGradNHWC(const int NS, const int C, const T* dy, const T* x, T* dw) { ...@@ -279,7 +317,11 @@ _PReluWGradNHWC(const int NS, const int C, const T* dy, const T* x, T* dw) {
T val = T(0); T val = T(0);
CUDA_2D_KERNEL_LOOP2(j, NS) { CUDA_2D_KERNEL_LOOP2(j, NS) {
const int yi = j * C + i; const int yi = j * C + i;
#if __CUDA_ARCH__ >= 350
val += __ldg(x + yi) < 0 ? dy[yi] * __ldg(x + yi) : T(0);
#else
val += x[yi] < 0 ? dy[yi] * x[yi] : T(0); val += x[yi] < 0 ? dy[yi] * x[yi] : T(0);
#endif
} }
val = BlockReduce<T>(storage).Sum(val); val = BlockReduce<T>(storage).Sum(val);
if (threadIdx.x == 0) dw[i] = val; if (threadIdx.x == 0) dw[i] = val;
...@@ -298,9 +340,15 @@ __global__ void _PReluWGradNHWC<half>( ...@@ -298,9 +340,15 @@ __global__ void _PReluWGradNHWC<half>(
CUDA_2D_KERNEL_LOOP1(i, C) { CUDA_2D_KERNEL_LOOP1(i, C) {
float val = 0.f; float val = 0.f;
CUDA_2D_KERNEL_LOOP2(j, NS) { CUDA_2D_KERNEL_LOOP2(j, NS) {
#if __CUDA_ARCH__ >= 530
const int yi = j * C + i; const int yi = j * C + i;
val += __hlt(x[yi], kZero) ? __half2float(__hmul(dy[yi], x[yi])) : 0.f; #if __CUDA_ARCH__ >= 350
val += __half2float(__ldg(x + yi)) < 0.f
? __half2float(dy[yi]) * __half2float(__ldg(x + yi))
: 0.f;
#else
val += __half2float(x[yi]) < 0.f
? __half2float(dy[yi]) * __half2float(x[yi])
: 0.f;
#endif #endif
} }
val = BlockReduce<float>(storage).Sum(val); val = BlockReduce<float>(storage).Sum(val);
......
...@@ -25,9 +25,12 @@ template <> ...@@ -25,9 +25,12 @@ template <>
__global__ void __global__ void
_Relu<half>(const int nthreads, const float alpha, const half* x, half* y) { _Relu<half>(const int nthreads, const float alpha, const half* x, half* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 350
const float val = __half2float(__ldg(x + i)); const float val = __half2float(__ldg(x + i));
y[i] = val > 0.f ? __ldg(x + i) : __float2half(val * alpha); y[i] = val > 0.f ? __ldg(x + i) : __float2half(val * alpha);
#else
const float val = __half2float(x[i]);
y[i] = val > 0.f ? x[i] : __float2half(val * alpha);
#endif #endif
} }
} }
...@@ -36,12 +39,10 @@ template <> ...@@ -36,12 +39,10 @@ template <>
__global__ void __global__ void
_Relu<half2>(const int nthreads, const float alpha, const half2* x, half2* y) { _Relu<half2>(const int nthreads, const float alpha, const half2* x, half2* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530
const float2 val = __half22float2(x[i]); const float2 val = __half22float2(x[i]);
y[i] = __floats2half2_rn( y[i] = __floats2half2_rn(
val.x > 0.f ? val.x : val.x * alpha, val.x > 0.f ? val.x : val.x * alpha,
val.y > 0.f ? val.y : val.y * alpha); val.y > 0.f ? val.y : val.y * alpha);
#endif
} }
} }
...@@ -63,13 +64,25 @@ template <> ...@@ -63,13 +64,25 @@ template <>
__global__ void __global__ void
_ReluN<half>(const int nthreads, const half max_value, const half* x, half* y) { _ReluN<half>(const int nthreads, const half max_value, const half* x, half* y) {
const half kZero = __float2half(0.f); const half kZero = __float2half(0.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = __hgt(__ldg(x + i), kZero) y[i] = __hgt(__ldg(x + i), kZero)
? (__hlt(__ldg(x + i), max_value) ? __ldg(x + i) : max_value) ? (__hlt(__ldg(x + i), max_value) ? __ldg(x + i) : max_value)
: kZero; : kZero;
#endif
} }
#elif __CUDA_ARCH__ >= 350
const float kMax = __half2float(max_value);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
const float val = __half2float(__ldg(x + i));
y[i] = val > 0.f ? ((val < kMax) ? __ldg(x + i) : max_value) : kZero;
}
#else
const float kMax = __half2float(max_value);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
const float val = __half2float(x[i]);
y[i] = val > 0.f ? ((val < kMax) ? x[i] : max_value) : kZero;
}
#endif
} }
__global__ void _ReluNHalf2( __global__ void _ReluNHalf2(
...@@ -91,6 +104,25 @@ __global__ void _ReluNHalf2( ...@@ -91,6 +104,25 @@ __global__ void _ReluNHalf2(
? __high2half(__ldg(x + i)) ? __high2half(__ldg(x + i))
: max_value) : max_value)
: kZero); : kZero);
#elif __CUDA_ARCH__ >= 350
const float kMax = __half2float(max_value);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
const float2 val = __half22float2(__ldg(x + i));
y[i] = __halves2half2(
val.x > 0.f ? ((val.x < kMax) ? __low2half(__ldg(x + i)) : max_value)
: kZero,
val.y > 0.f ? ((val.y < kMax) ? __high2half(__ldg(x + i)) : max_value)
: kZero);
}
#else
const float kMax = __half2float(max_value);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
const float2 val = __half22float2(x[i]);
y[i] = __halves2half2(
val.x > 0.f ? ((val.x < kMax) ? __low2half(x[i]) : max_value) : kZero,
val.y > 0.f ? ((val.y < kMax) ? __high2half(x[i]) : max_value)
: kZero);
}
#endif #endif
} }
} }
...@@ -103,7 +135,7 @@ __global__ void _ReluGrad( ...@@ -103,7 +135,7 @@ __global__ void _ReluGrad(
const T* y, const T* y,
T* dx) { T* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 350
dx[i] = __ldg(dy + i) * ((__ldg(y + i) > 0) + alpha * (__ldg(y + i) <= 0)); dx[i] = __ldg(dy + i) * ((__ldg(y + i) > 0) + alpha * (__ldg(y + i) <= 0));
#else #else
dx[i] = dy[i] * ((y[i] > 0) + alpha * (y[i] <= 0)); dx[i] = dy[i] * ((y[i] > 0) + alpha * (y[i] <= 0));
...@@ -118,14 +150,10 @@ __global__ void _ReluGrad<half>( ...@@ -118,14 +150,10 @@ __global__ void _ReluGrad<half>(
const half* dy, const half* dy,
const half* y, const half* y,
half* dx) { half* dx) {
const half kZero = __float2half(0.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 const float val = __half2float(y[i]);
dx[i] = __hmul( dx[i] = __float2half(
dy[i], __half2float(dy[i]) * ((val > 0.f) + alpha * (val <= 0.f)));
__float2half(
__hgt(__ldg(y + i), kZero) + __hle(__ldg(y + i), kZero) * alpha));
#endif
} }
} // ReluGrad } // ReluGrad
...@@ -136,17 +164,12 @@ __global__ void _ReluGrad<half2>( ...@@ -136,17 +164,12 @@ __global__ void _ReluGrad<half2>(
const half2* dy, const half2* dy,
const half2* y, const half2* y,
half2* dx) { half2* dx) {
const half kZero = __float2half(0.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 const float2 val = __half22float2(y[i]);
dx[i] = __hmul2( const float2 grad = __half22float2(dy[i]);
dy[i], dx[i] = __floats2half2_rn(
__floats2half2_rn( grad.x * ((val.x > 0.f) + alpha * (val.x <= 0.f)),
__hgt(__low2half(__ldg(y + i)), kZero) + grad.y * ((val.y > 0.f) + alpha * (val.y <= 0.f)));
__hle(__low2half(__ldg(y + i)), kZero) * alpha,
__hgt(__high2half(__ldg(y + i)), kZero) +
__hle(__high2half(__ldg(y + i)), kZero) * alpha));
#endif
} }
} // ReluGrad } // ReluGrad
...@@ -158,7 +181,7 @@ __global__ void _ReluNGrad( ...@@ -158,7 +181,7 @@ __global__ void _ReluNGrad(
const T* y, const T* y,
T* dx) { T* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 350
dx[i] = (__ldg(y + i) > 0 && __ldg(y + i) < max_value) ? dy[i] : T(0); dx[i] = (__ldg(y + i) > 0 && __ldg(y + i) < max_value) ? dy[i] : T(0);
#else #else
dx[i] = (y[i] > 0 && y[i] < max_value) ? dy[i] : T(0); dx[i] = (y[i] > 0 && y[i] < max_value) ? dy[i] : T(0);
...@@ -174,14 +197,20 @@ __global__ void _ReluNGrad<half>( ...@@ -174,14 +197,20 @@ __global__ void _ReluNGrad<half>(
const half* y, const half* y,
half* dx) { half* dx) {
const half kZero = __float2half(0.f); const half kZero = __float2half(0.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx[i] = (__hgt(__ldg(y + i), kZero) && __hlt(__ldg(y + i), max_value)) dx[i] = (__hgt(__ldg(y + i), kZero) && __hlt(__ldg(y + i), max_value))
? dy[i] ? dy[i]
: kZero; : kZero;
#endif
} }
} // ReluNGrad #else
const float kMax = __half2float(max_value);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
const float val = __half2float(y[i]);
dx[i] = (val > 0.f && val < kMax) ? dy[i] : kZero;
}
#endif
}
template <> template <>
__global__ void _ReluNGrad<half2>( __global__ void _ReluNGrad<half2>(
...@@ -190,15 +219,33 @@ __global__ void _ReluNGrad<half2>( ...@@ -190,15 +219,33 @@ __global__ void _ReluNGrad<half2>(
const half2* dy, const half2* dy,
const half2* y, const half2* y,
half2* dx) { half2* dx) {
#if __CUDA_ARCH__ >= 530
const half2 kZero = __float2half2_rn(0.f); const half2 kZero = __float2half2_rn(0.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530
dx[i] = __hmul2( dx[i] = __hmul2(
__hmul2(__hgt2(__ldg(y + i), kZero), __hlt2(__ldg(y + i), max_value)), __hmul2(__hgt2(__ldg(y + i), kZero), __hlt2(__ldg(y + i), max_value)),
dy[i]); dy[i]);
#endif
} }
} // ReluNGrad #elif __CUDA_ARCH__ >= 350
const half kZero = __float2half(0.f);
const float kMax = __half2float(__low2half(max_value));
CUDA_1D_KERNEL_LOOP(i, nthreads) {
const float2 val = __half22float2(y[i]);
dx[i] = __halves2half2(
(val.x > 0.f && val.x < kMax) ? __low2half(__ldg(dy + i)) : kZero,
(val.y > 0.f && val.y < kMax) ? __high2half(__ldg(dy + i)) : kZero);
}
#else
const half kZero = __float2half(0.f);
const float kMax = __half2float(__low2half(max_value));
CUDA_1D_KERNEL_LOOP(i, nthreads) {
const float2 val = __half22float2(y[i]);
dx[i] = __halves2half2(
(val.x > 0.f && val.x < kMax) ? __low2half(dy[i]) : kZero,
(val.y > 0.f && val.y < kMax) ? __high2half(dy[i]) : kZero);
}
#endif
}
} // namespace } // namespace
......
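The half2 variants above use the same trick in packed form: unpack each half2 into a float2 with __half22float2, apply the op lane by lane in FP32, and repack with __floats2half2_rn. A minimal sketch (leaky ReLU; the name is illustrative):

// Sketch of the half2 fallback used above: unpack -> FP32 math -> repack.
#include <cuda_fp16.h>

__global__ void LeakyReluHalf2Sketch(
    const int n, const float alpha, const half2* x, half2* y) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    const float2 val = __half22float2(x[i]);
    y[i] = __floats2half2_rn(
        val.x > 0.f ? val.x : val.x * alpha,
        val.y > 0.f ? val.y : val.y * alpha);
  }
}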
...@@ -35,11 +35,9 @@ __global__ void _Selu<half>( ...@@ -35,11 +35,9 @@ __global__ void _Selu<half>(
const half* x, const half* x,
half* y) { half* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530
const float val = __half2float(x[i]); const float val = __half2float(x[i]);
y[i] = y[i] =
__float2half(val > 0.f ? gamma * val : alphaXgamma * (exp(val) - 1.f)); __float2half(val > 0.f ? gamma * val : alphaXgamma * (exp(val) - 1.f));
#endif
} }
} }
...@@ -51,12 +49,10 @@ __global__ void _Selu<half2>( ...@@ -51,12 +49,10 @@ __global__ void _Selu<half2>(
const half2* x, const half2* x,
half2* y) { half2* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530
const float2 val = __half22float2(x[i]); const float2 val = __half22float2(x[i]);
y[i] = __floats2half2_rn( y[i] = __floats2half2_rn(
val.x > 0.f ? gamma * val.x : alphaXgamma * (exp(val.x) - 1.f), val.x > 0.f ? gamma * val.x : alphaXgamma * (exp(val.x) - 1.f),
val.y > 0.f ? gamma * val.y : alphaXgamma * (exp(val.y) - 1.f)); val.y > 0.f ? gamma * val.y : alphaXgamma * (exp(val.y) - 1.f));
#endif
} }
} }
...@@ -86,11 +82,9 @@ __global__ void _SeluGrad<half>( ...@@ -86,11 +82,9 @@ __global__ void _SeluGrad<half>(
const half* y, const half* y,
half* dx) { half* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530
const float val = __half2float(y[i]); const float val = __half2float(y[i]);
dx[i] = dx[i] = __float2half(
__hmul(dy[i], __float2half(val > 0.f ? gamma : (alphaXgamma + val))); __half2float(dy[i]) * (val > 0.f ? gamma : (alphaXgamma + val)));
#endif
} }
} // SeluGrad } // SeluGrad
...@@ -103,14 +97,11 @@ __global__ void _SeluGrad<half2>( ...@@ -103,14 +97,11 @@ __global__ void _SeluGrad<half2>(
const half2* y, const half2* y,
half2* dx) { half2* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530
const float2 val = __half22float2(y[i]); const float2 val = __half22float2(y[i]);
dx[i] = __hmul2( const float2 grad = __half22float2(dy[i]);
dy[i], dx[i] = __floats2half2_rn(
__floats2half2_rn( grad.x * (val.x > 0.f ? gamma : (alphaXgamma + val.x)),
val.x > 0.f ? gamma : (alphaXgamma + val.x), grad.y * (val.y > 0.f ? gamma : (alphaXgamma + val.y)));
val.y > 0.f ? gamma : (alphaXgamma + val.y)));
#endif
} }
} // SeluGrad } // SeluGrad
......
...@@ -18,21 +18,17 @@ __global__ void _Sigmoid(const int nthreads, const T* x, T* y) { ...@@ -18,21 +18,17 @@ __global__ void _Sigmoid(const int nthreads, const T* x, T* y) {
template <> template <>
__global__ void _Sigmoid<half>(const int nthreads, const half* x, half* y) { __global__ void _Sigmoid<half>(const int nthreads, const half* x, half* y) {
const half kOne = __float2half(1.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 y[i] = __float2half(1.f / (1.f + exp(-__half2float(x[i]))));
y[i] = __hdiv(kOne, __hadd(kOne, hexp(__hneg(x[i]))));
#endif
} }
} }
template <> template <>
__global__ void _Sigmoid<half2>(const int nthreads, const half2* x, half2* y) { __global__ void _Sigmoid<half2>(const int nthreads, const half2* x, half2* y) {
const half2 kOne = __float2half2_rn(1.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 const float2 val = __half22float2(x[i]);
y[i] = __h2div(kOne, __hadd2(kOne, h2exp(__hneg2(x[i])))); y[i] =
#endif __floats2half2_rn(1.f / (1.f + exp(-val.x)), 1.f / (1.f + exp(-val.y)));
} }
} }
...@@ -54,11 +50,9 @@ __global__ void _SigmoidGrad<half>( ...@@ -54,11 +50,9 @@ __global__ void _SigmoidGrad<half>(
const half* dy, const half* dy,
const half* y, const half* y,
half* dx) { half* dx) {
const half kOne = __float2half(1.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 const float val = __half2float(y[i]);
dx[i] = __hmul(dy[i], __hmul(__ldg(y + i), __hsub(kOne, __ldg(y + i)))); dx[i] = __float2half(__half2float(dy[i]) * val * (1.f - val));
#endif
} }
} // SigmoidGrad } // SigmoidGrad
...@@ -68,11 +62,11 @@ __global__ void _SigmoidGrad<half2>( ...@@ -68,11 +62,11 @@ __global__ void _SigmoidGrad<half2>(
const half2* dy, const half2* dy,
const half2* y, const half2* y,
half2* dx) { half2* dx) {
const half2 kOne = __float2half2_rn(1.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 const float2 val = __half22float2(y[i]);
dx[i] = __hmul2(dy[i], __hmul2(__ldg(y + i), __hsub2(kOne, __ldg(y + i)))); const float2 grad = __half22float2(dy[i]);
#endif dx[i] = __floats2half2_rn(
grad.x * val.x * (1.f - val.x), grad.y * val.y * (1.f - val.y));
} }
} // SigmoidGrad } // SigmoidGrad
......
...@@ -71,7 +71,6 @@ __global__ void _Softmax<half>( ...@@ -71,7 +71,6 @@ __global__ void _Softmax<half>(
const half lowest, const half lowest,
const half* x, const half* x,
half* y) { half* y) {
#if __CUDA_ARCH__ >= 530
__shared__ float block_val; __shared__ float block_val;
__shared__ typename BlockReduce<float>::TempStorage storage; __shared__ typename BlockReduce<float>::TempStorage storage;
CUDA_2D_KERNEL_LOOP1(i, rows) { CUDA_2D_KERNEL_LOOP1(i, rows) {
...@@ -105,7 +104,6 @@ __global__ void _Softmax<half>( ...@@ -105,7 +104,6 @@ __global__ void _Softmax<half>(
y[yi] = __float2half(__half2float(y[yi]) / block_val); y[yi] = __float2half(__half2float(y[yi]) / block_val);
} }
} }
#endif
} }
template <typename T> template <typename T>
...@@ -153,7 +151,6 @@ __global__ void _SoftmaxGrad<half>( ...@@ -153,7 +151,6 @@ __global__ void _SoftmaxGrad<half>(
const half* dy, const half* dy,
const half* y, const half* y,
half* dx) { half* dx) {
#if __CUDA_ARCH__ >= 530
__shared__ float block_val; __shared__ float block_val;
__shared__ typename BlockReduce<float>::TempStorage storage; __shared__ typename BlockReduce<float>::TempStorage storage;
CUDA_2D_KERNEL_LOOP1(i, rows) { CUDA_2D_KERNEL_LOOP1(i, rows) {
...@@ -162,7 +159,11 @@ __global__ void _SoftmaxGrad<half>( ...@@ -162,7 +159,11 @@ __global__ void _SoftmaxGrad<half>(
float val = 0.f; float val = 0.f;
CUDA_2D_KERNEL_LOOP2(j, cols) { CUDA_2D_KERNEL_LOOP2(j, cols) {
const int yi = c + j * inner_dim; const int yi = c + j * inner_dim;
#if __CUDA_ARCH__ >= 350
val += __half2float(__ldg(dy + yi)) * __half2float(__ldg(y + yi)); val += __half2float(__ldg(dy + yi)) * __half2float(__ldg(y + yi));
#else
val += __half2float(dy[yi]) * __half2float(y[yi]);
#endif
} }
val = BlockReduce<float>(storage).Sum(val); val = BlockReduce<float>(storage).Sum(val);
if (threadIdx.x == 0) block_val = val; if (threadIdx.x == 0) block_val = val;
...@@ -170,12 +171,16 @@ __global__ void _SoftmaxGrad<half>( ...@@ -170,12 +171,16 @@ __global__ void _SoftmaxGrad<half>(
CUDA_2D_KERNEL_LOOP2(j, cols) { CUDA_2D_KERNEL_LOOP2(j, cols) {
const int yi = c + j * inner_dim; const int yi = c + j * inner_dim;
#if __CUDA_ARCH__ >= 350
dx[yi] = __float2half( dx[yi] = __float2half(
(__half2float(__ldg(dy + yi)) - block_val) * (__half2float(__ldg(dy + yi)) - block_val) *
__half2float(__ldg(y + yi))); __half2float(__ldg(y + yi)));
#else
dx[yi] = __float2half(
(__half2float(dy[yi]) - block_val) * __half2float(y[yi]));
#endif
} }
} }
#endif
} // SoftmaxGrad } // SoftmaxGrad
} // namespace } // namespace
......
...@@ -20,22 +20,15 @@ __global__ void _Tanh(const int nthreads, const T* x, T* y) { ...@@ -20,22 +20,15 @@ __global__ void _Tanh(const int nthreads, const T* x, T* y) {
template <> template <>
__global__ void _Tanh<half>(const int nthreads, const half* x, half* y) { __global__ void _Tanh<half>(const int nthreads, const half* x, half* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 y[i] = __float2half(tanh(__half2float(x[i])));
const half a = hexp(__ldg(x + i));
const half b = hexp(__hneg(__ldg(x + i)));
y[i] = __hdiv(__hsub(a, b), __hadd(a, b));
#endif
} }
} }
template <> template <>
__global__ void _Tanh<half2>(const int nthreads, const half2* x, half2* y) { __global__ void _Tanh<half2>(const int nthreads, const half2* x, half2* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 const float2 val = __half22float2(x[i]);
const half2 a = h2exp(__ldg(x + i)); y[i] = __floats2half2_rn(tanh(val.x), tanh(val.y));
const half2 b = h2exp(__hneg2(__ldg(x + i)));
y[i] = __h2div(__hsub2(a, b), __hadd2(a, b));
#endif
} }
} }
...@@ -49,11 +42,9 @@ __global__ void _TanhGrad(const int nthreads, const T* dy, const T* y, T* dx) { ...@@ -49,11 +42,9 @@ __global__ void _TanhGrad(const int nthreads, const T* dy, const T* y, T* dx) {
template <> template <>
__global__ void __global__ void
_TanhGrad<half>(const int nthreads, const half* dy, const half* y, half* dx) { _TanhGrad<half>(const int nthreads, const half* dy, const half* y, half* dx) {
const half kOne = __float2half(1.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 dx[i] = __float2half(
dx[i] = __hmul(dy[i], __hsub(kOne, utils::math::Square(y[i]))); __half2float(dy[i]) * (1.f - utils::math::Square(__half2float(y[i]))));
#endif
} }
} }
...@@ -63,11 +54,12 @@ __global__ void _TanhGrad<half2>( ...@@ -63,11 +54,12 @@ __global__ void _TanhGrad<half2>(
const half2* dy, const half2* dy,
const half2* y, const half2* y,
half2* dx) { half2* dx) {
const half2 kOne = __float2half2_rn(1.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 const float2 val = __half22float2(y[i]);
dx[i] = __hmul2(dy[i], __hsub2(kOne, utils::math::Square(y[i]))); const float2 grad = __half22float2(dy[i]);
#endif dx[i] = __floats2half2_rn(
grad.x * (1.f - utils::math::Square(val.x)),
grad.y * (1.f - utils::math::Square(val.y)));
} }
} }
......
#ifdef USE_CUDA #ifdef USE_CUDA
#include "dragon/core/context_cuda.h" #include "dragon/core/context_cuda.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -22,7 +23,7 @@ __global__ void _CumSum( ...@@ -22,7 +23,7 @@ __global__ void _CumSum(
y[c] = exclusive ? T(0) : x[c]; y[c] = exclusive ? T(0) : x[c];
for (int j = 1; j < cols; ++j) { for (int j = 1; j < cols; ++j) {
const int yi = c + inner_dim; const int yi = c + inner_dim;
y[yi] = y[c] + x[exclusive ? c : yi]; y[yi] = math::PlusFunctor<T>()(y[c], x[exclusive ? c : yi]);
c = yi; c = yi;
} }
} }
...@@ -38,15 +39,13 @@ __global__ void _CumSum<half>( ...@@ -38,15 +39,13 @@ __global__ void _CumSum<half>(
half* y) { half* y) {
const half kZero = __float2half(0.f); const half kZero = __float2half(0.f);
CUDA_1D_KERNEL_LOOP(i, rows) { CUDA_1D_KERNEL_LOOP(i, rows) {
#if __CUDA_ARCH__ >= 530
int c = (i / inner_dim) * cols * inner_dim + (i % inner_dim); int c = (i / inner_dim) * cols * inner_dim + (i % inner_dim);
y[c] = exclusive ? kZero : x[c]; y[c] = exclusive ? kZero : x[c];
for (int j = 1; j < cols; ++j) { for (int j = 1; j < cols; ++j) {
const int yi = c + inner_dim; const int yi = c + inner_dim;
y[yi] = __hadd(y[c], x[exclusive ? c : yi]); y[yi] = math::PlusFunctor<half>()(y[c], x[exclusive ? c : yi]);
c = yi; c = yi;
} }
#endif
} }
} }
...@@ -63,7 +62,7 @@ __global__ void _CumSumReverse( ...@@ -63,7 +62,7 @@ __global__ void _CumSumReverse(
y[c] = exclusive ? T(0) : x[c]; y[c] = exclusive ? T(0) : x[c];
for (int j = cols - 2; j >= 0; --j) { for (int j = cols - 2; j >= 0; --j) {
const int yi = c - inner_dim; const int yi = c - inner_dim;
y[yi] = y[c] + x[exclusive ? c : yi]; y[yi] = math::PlusFunctor<T>()(y[c], x[exclusive ? c : yi]);
c = yi; c = yi;
} }
} }
...@@ -79,15 +78,13 @@ __global__ void _CumSumReverse<half>( ...@@ -79,15 +78,13 @@ __global__ void _CumSumReverse<half>(
half* y) { half* y) {
const half kZero = __float2half(0.f); const half kZero = __float2half(0.f);
CUDA_1D_KERNEL_LOOP(i, rows) { CUDA_1D_KERNEL_LOOP(i, rows) {
#if __CUDA_ARCH__ >= 530
int c = ((i / inner_dim) * cols + (cols - 1)) * inner_dim + (i % inner_dim); int c = ((i / inner_dim) * cols + (cols - 1)) * inner_dim + (i % inner_dim);
y[c] = exclusive ? kZero : x[c]; y[c] = exclusive ? kZero : x[c];
for (int j = cols - 2; j >= 0; --j) { for (int j = cols - 2; j >= 0; --j) {
const int yi = c - inner_dim; const int yi = c - inner_dim;
y[yi] = __hadd(y[c], x[exclusive ? c : yi]); y[yi] = math::PlusFunctor<half>()(y[c], x[exclusive ? c : yi]);
c = yi; c = yi;
} }
#endif
} }
} }
......
#ifdef USE_CUDA #ifdef USE_CUDA
#include "dragon/core/context_cuda.h" #include "dragon/core/context_cuda.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -52,40 +53,8 @@ __global__ void _IndexSelectGrad( ...@@ -52,40 +53,8 @@ __global__ void _IndexSelectGrad(
int pos = index[k]; int pos = index[k];
#endif #endif
pos = pos >= 0 ? pos : pos + axis_dim; pos = pos >= 0 ? pos : pos + axis_dim;
dx[x_offset + pos * inner_dim] += (*offset_dy);
offset_dy += inner_dim;
}
}
}
template <>
__global__ void _IndexSelectGrad<half>(
const int nthreads,
const int inner_dim,
const int axis_dim,
const int select_dim,
const int64_t* index,
const half* dy,
half* dx) {
CUDA_1D_KERNEL_LOOP(ti, nthreads) {
const int i = ti / inner_dim;
const int j = ti % inner_dim;
const int x_offset = i * axis_dim * inner_dim + j;
const half* offset_dy = dy + i * select_dim * inner_dim + j;
for (int k = 0; k < select_dim; ++k) {
#if __CUDA_ARCH__ >= 350
int pos = __ldg(index + k);
#else
int pos = index[k];
#endif
pos = pos >= 0 ? pos : pos + axis_dim;
pos = x_offset + pos * inner_dim; pos = x_offset + pos * inner_dim;
#if __CUDA_ARCH__ >= 530 dx[pos] = math::PlusFunctor<T>()(dx[pos], *(offset_dy));
dx[pos] = __hadd(dx[pos], *(offset_dy));
#else
dx[pos] =
__float2half(__half2float(dx[pos]) + __half2float(*(offset_dy)));
#endif
offset_dy += inner_dim; offset_dy += inner_dim;
} }
} }
...@@ -95,31 +64,6 @@ __global__ void _IndexSelectGrad<half>( ...@@ -95,31 +64,6 @@ __global__ void _IndexSelectGrad<half>(
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
template <>
void IndexSelectGrad<float16, CUDAContext>(
const int outer_dim,
const int inner_dim,
const int axis_dim,
const int select_dim,
const int64_t* index,
const float16* dy,
float16* dx,
CUDAContext* ctx) {
const int nthreads = outer_dim * inner_dim;
_IndexSelectGrad<<<
CUDA_BLOCKS(nthreads),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(
nthreads,
inner_dim,
axis_dim,
select_dim,
index,
reinterpret_cast<const half*>(dy),
reinterpret_cast<half*>(dx));
} // IndexSelectGrad
#define DEFINE_KERNEL_LAUNCHER(T) \ #define DEFINE_KERNEL_LAUNCHER(T) \
template <> \ template <> \
void IndexSelect<T, CUDAContext>( \ void IndexSelect<T, CUDAContext>( \
...@@ -169,6 +113,7 @@ DEFINE_KERNEL_LAUNCHER(float16); ...@@ -169,6 +113,7 @@ DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double); DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float16);
DEFINE_GRAD_KERNEL_LAUNCHER(float); DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double); DEFINE_GRAD_KERNEL_LAUNCHER(double);
......
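The CumSum and IndexSelectGrad kernels above route the addition through math::PlusFunctor instead of operator+ or __hadd, which removes the __hadd call and its sm_53 guard; in the IndexSelectGrad case the dedicated half kernel and launcher are dropped entirely (note the new DEFINE_GRAD_KERNEL_LAUNCHER(float16)). A simplified stand-in for such a functor, not Dragon's actual math::PlusFunctor, could widen half operands to FP32:

// Simplified stand-in for a plus functor (Dragon's math::PlusFunctor is not
// reproduced here): the generic case uses operator+, while the half
// specialization widens to FP32, so it needs no __hadd and no sm_53 guard.
#include <cuda_fp16.h>

template <typename T>
struct PlusFunctorSketch {
  __device__ T operator()(const T& a, const T& b) const {
    return a + b;
  }
};

template <>
struct PlusFunctorSketch<half> {
  __device__ half operator()(const half& a, const half& b) const {
    return __float2half(__half2float(a) + __half2float(b));
  }
};

Used the same way as in the kernels above, e.g. y[yi] = PlusFunctorSketch<half>()(y[c], x[xi]).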
...@@ -41,8 +41,9 @@ __global__ void _RepeatGrad( ...@@ -41,8 +41,9 @@ __global__ void _RepeatGrad(
const int i = xi / x_inner_dim / axis_dim; const int i = xi / x_inner_dim / axis_dim;
const T* offset_dy = dy + ((i * axis_dim + j) * y_inner_dim + k); const T* offset_dy = dy + ((i * axis_dim + j) * y_inner_dim + k);
T val = T(0); T val = T(0);
for (int r = 0; r < repeats; ++r) for (int r = 0; r < repeats; ++r) {
val += offset_dy[r * x_inner_dim]; val += offset_dy[r * x_inner_dim];
}
dx[xi] = val; dx[xi] = val;
} }
} }
...@@ -57,16 +58,15 @@ __global__ void _RepeatGrad<half>( ...@@ -57,16 +58,15 @@ __global__ void _RepeatGrad<half>(
const half* dy, const half* dy,
half* dx) { half* dx) {
CUDA_1D_KERNEL_LOOP(xi, nthreads) { CUDA_1D_KERNEL_LOOP(xi, nthreads) {
#if __CUDA_ARCH__ >= 530
const int k = xi % x_inner_dim; const int k = xi % x_inner_dim;
const int j = (xi / x_inner_dim) % axis_dim; const int j = (xi / x_inner_dim) % axis_dim;
const int i = xi / x_inner_dim / axis_dim; const int i = xi / x_inner_dim / axis_dim;
const half* offset_dy = dy + ((i * axis_dim + j) * y_inner_dim + k); const half* offset_dy = dy + ((i * axis_dim + j) * y_inner_dim + k);
float val = 0.f; float val = 0.f;
for (int r = 0; r < repeats; ++r) for (int r = 0; r < repeats; ++r) {
val += __half2float(offset_dy[r * x_inner_dim]); val += __half2float(offset_dy[r * x_inner_dim]);
}
dx[xi] = __float2half(val); dx[xi] = __float2half(val);
#endif
} }
} }
......
...@@ -42,15 +42,12 @@ __global__ void _TileGrad( ...@@ -42,15 +42,12 @@ __global__ void _TileGrad(
const int i = xi / x_cols; const int i = xi / x_cols;
const int j = xi % x_cols; const int j = xi % x_cols;
const T* offset_dy = dy + i * y_cols + j; const T* offset_dy = dy + i * y_cols + j;
T val = (*offset_dy); T val = (*offset_dy);
offset_dy += x_cols; offset_dy += x_cols;
for (int k = 1; k < multiple; ++k) { for (int k = 1; k < multiple; ++k) {
val += (*offset_dy); val += (*offset_dy);
offset_dy += x_cols; offset_dy += x_cols;
} }
dx[xi] = val; dx[xi] = val;
} }
} }
...@@ -64,21 +61,16 @@ __global__ void _TileGrad<half>( ...@@ -64,21 +61,16 @@ __global__ void _TileGrad<half>(
const half* dy, const half* dy,
half* dx) { half* dx) {
CUDA_1D_KERNEL_LOOP(xi, nthreads) { CUDA_1D_KERNEL_LOOP(xi, nthreads) {
#if __CUDA_ARCH__ >= 530
const int i = xi / x_cols; const int i = xi / x_cols;
const int j = xi % x_cols; const int j = xi % x_cols;
const half* offset_dy = dy + i * y_cols + j; const half* offset_dy = dy + i * y_cols + j;
float val = __half2float(*offset_dy);
half val = (*offset_dy);
offset_dy += x_cols; offset_dy += x_cols;
for (int k = 1; k < multiple; ++k) { for (int k = 1; k < multiple; ++k) {
val += __hadd((*offset_dy), val); val += __half2float(*offset_dy);
offset_dy += x_cols; offset_dy += x_cols;
} }
dx[xi] = __float2half(val);
dx[xi] = val;
#endif
} }
} }
......
...@@ -25,12 +25,19 @@ __global__ void _Clip<half>( ...@@ -25,12 +25,19 @@ __global__ void _Clip<half>(
const half high, const half high,
const half* x, const half* x,
half* y) { half* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
const half val = __hlt(__ldg(x + i), high) ? __ldg(x + i) : high; CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = __hgt(val, low) ? val : low; y[i] = __hlt(__ldg(x + i), high)
#endif ? (__hgt(__ldg(x + i), low) ? __ldg(x + i) : low)
: high;
}
#else
const float kLow = __half2float(low);
const float kHigh = __half2float(high);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = __float2half(max(kLow, min(__half2float(x[i]), kHigh)));
} }
#endif
} }
template <typename T> template <typename T>
...@@ -59,12 +66,28 @@ __global__ void _ClipGrad<half>( ...@@ -59,12 +66,28 @@ __global__ void _ClipGrad<half>(
const half* x, const half* x,
half* dx) { half* dx) {
const half kZero = __float2half(0.f); const half kZero = __float2half(0.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 #if __CUDA_ARCH__ >= 530
CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx[i] = dx[i] =
__hlt(__ldg(x + i), low) || __hgt(__ldg(x + i), high) ? kZero : dy[i]; (__hlt(__ldg(x + i), low) || __hgt(__ldg(x + i), high)) ? kZero : dy[i];
#endif }
#elif __CUDA_ARCH__ >= 350
const float kLow = __half2float(low);
const float kHigh = __half2float(high);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx[i] = (__half2float(__ldg(x + i)) < kLow ||
__half2float(__ldg(x + i)) > kHigh)
? kZero
: dy[i];
} }
#else
const float kLow = __half2float(low);
const float kHigh = __half2float(high);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx[i] = (__half2float(x[i]) < kLow || __half2float(x[i]) > kHigh) ? kZero
: dy[i];
}
#endif
} }
} // namespace } // namespace
......
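The Clip kernels above show the dispatch kept where a real half intrinsic is still worthwhile: native half comparisons on sm_53+, __ldg loads with FP32 math on sm_35+ (as in ClipGrad), and plain loads below that. A condensed sketch of the same structure (the kernel name and grid-stride loop are illustrative stand-ins for CUDA_1D_KERNEL_LOOP):

// Condensed sketch of the architecture dispatch used above: native half
// comparisons on sm_53+, __ldg with FP32 math on sm_35+, plain loads below.
#include <cuda_fp16.h>

__global__ void ClipHalfSketch(
    const int n, const half low, const half high, const half* x, half* y) {
#if __CUDA_ARCH__ >= 530
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    const half val = __hlt(__ldg(x + i), high) ? __ldg(x + i) : high;
    y[i] = __hgt(val, low) ? val : low;
  }
#else
  const float lo = __half2float(low);
  const float hi = __half2float(high);
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
#if __CUDA_ARCH__ >= 350
    y[i] = __float2half(fmaxf(lo, fminf(__half2float(__ldg(x + i)), hi)));
#else
    y[i] = __float2half(fmaxf(lo, fminf(__half2float(x[i]), hi)));
#endif
  }
#endif
}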
...@@ -20,11 +20,8 @@ __global__ void _CosGrad(const int nthreads, const T* dy, const T* x, T* dx) { ...@@ -20,11 +20,8 @@ __global__ void _CosGrad(const int nthreads, const T* dy, const T* x, T* dx) {
template <> template <>
__global__ void __global__ void
_CosGrad<half>(const int nthreads, const half* dy, const half* x, half* dx) { _CosGrad<half>(const int nthreads, const half* dy, const half* x, half* dx) {
const half kFactor = __float2half(-1.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 dx[i] = __float2half(-__half2float(dy[i]) * sin(__half2float(x[i])));
dx[i] = __hmul(__hmul(dy[i], kFactor), hsin(x[i]));
#endif
} }
} }
...@@ -34,11 +31,10 @@ __global__ void _CosGrad<half2>( ...@@ -34,11 +31,10 @@ __global__ void _CosGrad<half2>(
const half2* dy, const half2* dy,
const half2* x, const half2* x,
half2* dx) { half2* dx) {
const half2 kFactor = __float2half2_rn(-1.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 const float2 val = __half22float2(x[i]);
dx[i] = __hmul2(__hmul2(dy[i], kFactor), h2sin(x[i])); const float2 grad = __half22float2(dy[i]);
#endif dx[i] = __floats2half2_rn(-grad.x * sin(val.x), -grad.y * sin(val.y));
} }
} }
...@@ -53,9 +49,7 @@ template <> ...@@ -53,9 +49,7 @@ template <>
__global__ void __global__ void
_SinGrad<half>(const int nthreads, const half* dy, const half* x, half* dx) { _SinGrad<half>(const int nthreads, const half* dy, const half* x, half* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 dx[i] = __float2half(__half2float(dy[i]) * cos(__half2float(x[i])));
dx[i] = __hmul(dy[i], hcos(x[i]));
#endif
} }
} }
...@@ -66,9 +60,9 @@ __global__ void _SinGrad<half2>( ...@@ -66,9 +60,9 @@ __global__ void _SinGrad<half2>(
const half2* x, const half2* x,
half2* dx) { half2* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 const float2 val = __half22float2(x[i]);
dx[i] = __hmul2(dy[i], h2cos(x[i])); const float2 grad = __half22float2(dy[i]);
#endif dx[i] = __floats2half2_rn(grad.x * cos(val.x), grad.y * cos(val.y));
} }
} }
...@@ -86,11 +80,9 @@ __global__ void _ReciprocalGrad<half>( ...@@ -86,11 +80,9 @@ __global__ void _ReciprocalGrad<half>(
const half* dy, const half* dy,
const half* y, const half* y,
half* dx) { half* dx) {
const half c = __float2half(-1.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 dx[i] = __float2half(
dx[i] = __hmul(__hmul(c, dy[i]), utils::math::Square(y[i])); -__half2float(dy[i]) * utils::math::Square(__half2float(y[i])));
#endif
} }
} }
...@@ -100,11 +92,11 @@ __global__ void _ReciprocalGrad<half2>( ...@@ -100,11 +92,11 @@ __global__ void _ReciprocalGrad<half2>(
const half2* dy, const half2* dy,
const half2* y, const half2* y,
half2* dx) { half2* dx) {
const half2 c = __float2half2_rn(-1.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 const float2 val = __half22float2(y[i]);
dx[i] = __hmul2(__hmul2(c, dy[i]), utils::math::Square(y[i])); const float2 grad = __half22float2(dy[i]);
#endif dx[i] =
__floats2half2_rn(-grad.x * (val.x * val.x), -grad.y * (val.y * val.y));
} }
} }
...@@ -118,11 +110,9 @@ __global__ void _RsqrtGrad(const int nthreads, const T* dy, const T* y, T* dx) { ...@@ -118,11 +110,9 @@ __global__ void _RsqrtGrad(const int nthreads, const T* dy, const T* y, T* dx) {
template <> template <>
__global__ void __global__ void
_RsqrtGrad<half>(const int nthreads, const half* dy, const half* y, half* dx) { _RsqrtGrad<half>(const int nthreads, const half* dy, const half* y, half* dx) {
const half c = __float2half(-0.5f);
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 dx[i] = __float2half(
dx[i] = __hmul(__hmul(c, dy[i]), utils::math::Cube(y[i])); -0.5f * __half2float(dy[i]) * utils::math::Cube(__half2float(y[i])));
#endif
} }
} }
...@@ -132,11 +122,12 @@ __global__ void _RsqrtGrad<half2>( ...@@ -132,11 +122,12 @@ __global__ void _RsqrtGrad<half2>(
const half2* dy, const half2* dy,
const half2* y, const half2* y,
half2* dx) { half2* dx) {
const half2 c = __float2half2_rn(-0.5f);
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530 const float2 val = __half22float2(y[i]);
dx[i] = __hmul2(__hmul2(c, dy[i]), utils::math::Cube(y[i])); const float2 grad = __half22float2(dy[i]);
#endif dx[i] = __floats2half2_rn(
-0.5f * grad.x * (val.x * val.x * val.x),
-0.5f * grad.y * (val.y * val.y * val.y));
} }
} }
......
...@@ -51,7 +51,6 @@ __global__ void _RowwiseMoments<half, float>( ...@@ -51,7 +51,6 @@ __global__ void _RowwiseMoments<half, float>(
const half* x, const half* x,
float* mean, float* mean,
float* var) { float* var) {
#if __CUDA_ARCH__ >= 530
__shared__ typename BlockReduce<float>::TempStorage m_storage; __shared__ typename BlockReduce<float>::TempStorage m_storage;
__shared__ typename BlockReduce<float>::TempStorage v_storage; __shared__ typename BlockReduce<float>::TempStorage v_storage;
const float scale = 1.f / (float)rows; const float scale = 1.f / (float)rows;
...@@ -70,7 +69,6 @@ __global__ void _RowwiseMoments<half, float>( ...@@ -70,7 +69,6 @@ __global__ void _RowwiseMoments<half, float>(
var[i] = v_val * scale - mu * mu; var[i] = v_val * scale - mu * mu;
} }
} }
#endif
} }
template <typename Tx, typename Ty> template <typename Tx, typename Ty>
...@@ -112,7 +110,6 @@ __global__ void _ColwiseMoments<half, float>( ...@@ -112,7 +110,6 @@ __global__ void _ColwiseMoments<half, float>(
const half* x, const half* x,
float* mean, float* mean,
float* var) { float* var) {
#if __CUDA_ARCH__ >= 530
__shared__ typename BlockReduce<float>::TempStorage m_storage; __shared__ typename BlockReduce<float>::TempStorage m_storage;
__shared__ typename BlockReduce<float>::TempStorage v_storage; __shared__ typename BlockReduce<float>::TempStorage v_storage;
const float scale = 1.f / (float)cols; const float scale = 1.f / (float)cols;
...@@ -131,7 +128,6 @@ __global__ void _ColwiseMoments<half, float>( ...@@ -131,7 +128,6 @@ __global__ void _ColwiseMoments<half, float>(
var[i] = v_val * scale - mu * mu; var[i] = v_val * scale - mu * mu;
} }
} }
#endif
} }
template <typename Tx, typename Ty, int D> template <typename Tx, typename Ty, int D>
......
...@@ -10,9 +10,9 @@ namespace dragon { ...@@ -10,9 +10,9 @@ namespace dragon {
namespace kernel { namespace kernel {
#if __CUDA_ARCH__ >= 350 #if __CUDA_ARCH__ >= 350
#define L(x, i) __ldg(x + i) #define LOAD(x, i) __ldg(x + i)
#else #else
#define L(x, i) x[i] #define LOAD(x, i) x[i]
#endif #endif
namespace { namespace {
...@@ -34,8 +34,8 @@ __global__ void _BatchNormExpectation( ...@@ -34,8 +34,8 @@ __global__ void _BatchNormExpectation(
CUDA_2D_KERNEL_LOOP2(j, outer_dim) { CUDA_2D_KERNEL_LOOP2(j, outer_dim) {
const int xi = kOrder == StorageOrder::NCHW ? (j / S * C + i) * S + j % S const int xi = kOrder == StorageOrder::NCHW ? (j / S * C + i) * S + j % S
: j * C + i; : j * C + i;
ex_val += L(x, xi); ex_val += LOAD(x, xi);
ex2_val += utils::math::Square(L(x, xi)); ex2_val += utils::math::Square(LOAD(x, xi));
} }
ex_val = BlockReduce<Tp>(ex_storage).Reduce(ex_val, cub::Sum()); ex_val = BlockReduce<Tp>(ex_storage).Reduce(ex_val, cub::Sum());
ex2_val = BlockReduce<Tp>(ex2_storage).Reduce(ex2_val, cub::Sum()); ex2_val = BlockReduce<Tp>(ex2_storage).Reduce(ex2_val, cub::Sum());
...@@ -66,8 +66,8 @@ __global__ void _BatchNormInternalGrad( ...@@ -66,8 +66,8 @@ __global__ void _BatchNormInternalGrad(
CUDA_2D_KERNEL_LOOP2(j, outer_dim) { CUDA_2D_KERNEL_LOOP2(j, outer_dim) {
const int xi = kOrder == StorageOrder::NCHW ? (j / S * C + i) * S + j % S const int xi = kOrder == StorageOrder::NCHW ? (j / S * C + i) * S + j % S
: j * C + i; : j * C + i;
dg_val += L(dy, xi) * (L(x, xi) - L(mu, i)) * L(rsig, i); dg_val += LOAD(dy, xi) * (LOAD(x, xi) - LOAD(mu, i)) * LOAD(rsig, i);
db_val += L(dy, xi); db_val += LOAD(dy, xi);
} }
dg_val = BlockReduce<Tp>(dg_storage).Reduce(dg_val, cub::Sum()); dg_val = BlockReduce<Tp>(dg_storage).Reduce(dg_val, cub::Sum());
db_val = BlockReduce<Tp>(db_storage).Reduce(db_val, cub::Sum()); db_val = BlockReduce<Tp>(db_storage).Reduce(db_val, cub::Sum());
...@@ -95,9 +95,9 @@ __global__ void _BatchNormTrainingGrad( ...@@ -95,9 +95,9 @@ __global__ void _BatchNormTrainingGrad(
const Tp denom = Tp(1) / Tp(N * S); const Tp denom = Tp(1) / Tp(N * S);
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
const int pi = kOrder == StorageOrder::NCHW ? (i / S) % C : i % C; const int pi = kOrder == StorageOrder::NCHW ? (i / S) % C : i % C;
const Tp x_norm = (L(x, i) - L(mu, pi)) * L(rsig, pi); const Tp x_norm = (LOAD(x, i) - LOAD(mu, pi)) * LOAD(rsig, pi);
dx[i] = L(gamma, pi) * L(rsig, pi) * dx[i] = LOAD(gamma, pi) * LOAD(rsig, pi) *
(L(dy, i) - fma(x_norm, L(dgamma, pi), L(dbeta, pi)) * denom); (LOAD(dy, i) - fma(x_norm, LOAD(dgamma, pi), LOAD(dbeta, pi)) * denom);
} }
} }
...@@ -120,8 +120,8 @@ __global__ void _BatchNormWGrad( ...@@ -120,8 +120,8 @@ __global__ void _BatchNormWGrad(
CUDA_2D_KERNEL_LOOP2(j, outer_dim) { CUDA_2D_KERNEL_LOOP2(j, outer_dim) {
const int xi = kOrder == StorageOrder::NCHW ? (j / S * C + i) * S + j % S const int xi = kOrder == StorageOrder::NCHW ? (j / S * C + i) * S + j % S
: j * C + i; : j * C + i;
dg_val += L(dy, xi) * (L(x, xi) - L(mu, i)) * L(rsig, i); dg_val += LOAD(dy, xi) * (LOAD(x, xi) - LOAD(mu, i)) * LOAD(rsig, i);
db_val += L(dy, xi); db_val += LOAD(dy, xi);
} }
dg_val = BlockReduce<Tp>(dg_storage).Reduce(dg_val, cub::Sum()); dg_val = BlockReduce<Tp>(dg_storage).Reduce(dg_val, cub::Sum());
db_val = BlockReduce<Tp>(db_storage).Reduce(db_val, cub::Sum()); db_val = BlockReduce<Tp>(db_storage).Reduce(db_val, cub::Sum());
...@@ -143,10 +143,12 @@ __global__ void _BatchNormInferenceGrad( ...@@ -143,10 +143,12 @@ __global__ void _BatchNormInferenceGrad(
Tx* dx) { Tx* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
const int pi = kOrder == StorageOrder::NCHW ? (i / S) % C : i % C; const int pi = kOrder == StorageOrder::NCHW ? (i / S) % C : i % C;
dx[i] = L(gamma, pi) * L(dy, i) * L(rsig, pi); dx[i] = LOAD(gamma, pi) * LOAD(dy, i) * LOAD(rsig, pi);
} }
} }
#undef LOAD
} // namespace } // namespace
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
...@@ -294,7 +296,6 @@ __global__ void _BatchNormInferenceGrad( ...@@ -294,7 +296,6 @@ __global__ void _BatchNormInferenceGrad(
DEFINE_GRAD_KERNEL_LAUNCHER(float, float); DEFINE_GRAD_KERNEL_LAUNCHER(float, float);
#undef L
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
} // namespace kernel } // namespace kernel
......
...@@ -10,11 +10,11 @@ namespace dragon { ...@@ -10,11 +10,11 @@ namespace dragon {
namespace kernel { namespace kernel {
#if __CUDA_ARCH__ >= 350 #if __CUDA_ARCH__ >= 350
#define L(x, i) __ldg(x + i) #define LOAD(x, i) __ldg(x + i)
#define LF(x, i) __half2float(__ldg(x + i)) #define LOADF(x, i) __half2float(__ldg(x + i))
#else #else
#define L(x, i) x[i] #define LOAD(x, i) x[i]
#define LF(x, i) __half2float(x[i]) #define LOADF(x, i) __half2float(x[i])
#endif #endif
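// LOAD reads through the read-only data cache via __ldg when the arch allows it;
// LOADF additionally converts a loaded half to float so the arithmetic stays in FP32.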
namespace { namespace {
...@@ -33,14 +33,14 @@ __global__ void _GroupNormFusedParams( ...@@ -33,14 +33,14 @@ __global__ void _GroupNormFusedParams(
const int outer_dim = N * G; const int outer_dim = N * G;
CUDA_2D_KERNEL_LOOP1(i, outer_dim) { CUDA_2D_KERNEL_LOOP1(i, outer_dim) {
const int g = i % G; const int g = i % G;
const T mu_val = L(mu, i); const T mu_val = LOAD(mu, i);
const T rsig_val = L(rsig, i); const T rsig_val = LOAD(rsig, i);
CUDA_2D_KERNEL_LOOP2(j, D) { CUDA_2D_KERNEL_LOOP2(j, D) {
const int wi = i * D + j; const int wi = i * D + j;
const int gi = g * D + j; const int gi = g * D + j;
const T w = L(gamma, gi) * rsig_val; const T w = LOAD(gamma, gi) * rsig_val;
scale[wi] = w; scale[wi] = w;
bias[wi] = fma(-w, mu_val, L(beta, gi)); bias[wi] = fma(-w, mu_val, LOAD(beta, gi));
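// i.e. y = x * (gamma * rsig) + (beta - gamma * rsig * mu), so the forward
// kernels below apply the whole normalization as a single fma per element.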
} }
} }
} }
...@@ -56,11 +56,11 @@ __global__ void _GroupNormForwardNCHW( ...@@ -56,11 +56,11 @@ __global__ void _GroupNormForwardNCHW(
Tx* y) { Tx* y) {
const int outer_dim = N * C; const int outer_dim = N * C;
CUDA_2D_KERNEL_LOOP1(i, outer_dim) { CUDA_2D_KERNEL_LOOP1(i, outer_dim) {
const Tp w = L(scale, i); const Tp w = LOAD(scale, i);
const Tp b = L(bias, i); const Tp b = LOAD(bias, i);
CUDA_2D_KERNEL_LOOP2(j, S) { CUDA_2D_KERNEL_LOOP2(j, S) {
const int xi = i * S + j; const int xi = i * S + j;
y[xi] = fma(L(x, xi), w, b); y[xi] = fma(LOAD(x, xi), w, b);
} }
} }
} }
...@@ -76,11 +76,11 @@ __global__ void _GroupNormForwardNCHW<half, float>( ...@@ -76,11 +76,11 @@ __global__ void _GroupNormForwardNCHW<half, float>(
half* y) { half* y) {
const int outer_dim = N * C; const int outer_dim = N * C;
CUDA_2D_KERNEL_LOOP1(i, outer_dim) { CUDA_2D_KERNEL_LOOP1(i, outer_dim) {
const float w = L(scale, i); const float w = LOAD(scale, i);
const float b = L(bias, i); const float b = LOAD(bias, i);
CUDA_2D_KERNEL_LOOP2(j, S) { CUDA_2D_KERNEL_LOOP2(j, S) {
const int xi = i * S + j; const int xi = i * S + j;
y[xi] = __float2half(fmaf(LF(x, xi), w, b)); y[xi] = __float2half(fmaf(LOADF(x, xi), w, b));
} }
} }
} }
...@@ -100,7 +100,7 @@ __global__ void _GroupNormForwardNHWC( ...@@ -100,7 +100,7 @@ __global__ void _GroupNormForwardNHWC(
CUDA_2D_KERNEL_LOOP2(j, C) { CUDA_2D_KERNEL_LOOP2(j, C) {
const int xi = i * C + j; const int xi = i * C + j;
const int wi = n * C + j; const int wi = n * C + j;
y[xi] = fma(L(x, xi), L(scale, wi), L(bias, wi)); y[xi] = fma(LOAD(x, xi), LOAD(scale, wi), LOAD(bias, wi));
} }
} }
} }
...@@ -120,7 +120,7 @@ __global__ void _GroupNormForwardNHWC<half, float>( ...@@ -120,7 +120,7 @@ __global__ void _GroupNormForwardNHWC<half, float>(
CUDA_2D_KERNEL_LOOP2(j, C) { CUDA_2D_KERNEL_LOOP2(j, C) {
const int xi = i * C + j; const int xi = i * C + j;
const int wi = n * C + j; const int wi = n * C + j;
y[xi] = __float2half(fmaf(LF(x, xi), L(scale, wi), L(bias, wi))); y[xi] = __float2half(fmaf(LOADF(x, xi), LOAD(scale, wi), LOAD(bias, wi)));
} }
} }
} }
...@@ -149,8 +149,8 @@ __global__ void _GroupNormWGrad( ...@@ -149,8 +149,8 @@ __global__ void _GroupNormWGrad(
? (n * outer_dim + i) * S + j % S ? (n * outer_dim + i) * S + j % S
: j * outer_dim + i; : j * outer_dim + i;
const int mi = n * G + i / D; const int mi = n * G + i / D;
dg_val += L(dy, xi) * (L(x, xi) - L(mu, mi)) * L(rsig, mi); dg_val += LOAD(dy, xi) * (LOAD(x, xi) - LOAD(mu, mi)) * LOAD(rsig, mi);
db_val += L(dy, xi); db_val += LOAD(dy, xi);
} }
dg_val = BlockReduce<Tp>(dg_storage).Reduce(dg_val, cub::Sum()); dg_val = BlockReduce<Tp>(dg_storage).Reduce(dg_val, cub::Sum());
db_val = BlockReduce<Tp>(db_storage).Reduce(db_val, cub::Sum()); db_val = BlockReduce<Tp>(db_storage).Reduce(db_val, cub::Sum());
...@@ -185,8 +185,8 @@ __global__ void _GroupNormWGradHalf( ...@@ -185,8 +185,8 @@ __global__ void _GroupNormWGradHalf(
? (n * outer_dim + i) * S + j % S ? (n * outer_dim + i) * S + j % S
: j * outer_dim + i; : j * outer_dim + i;
const int mi = n * G + i / D; const int mi = n * G + i / D;
dg_val += LF(dy, xi) * (LF(x, xi) - L(mu, mi)) * L(rsig, mi); dg_val += LOADF(dy, xi) * (LOADF(x, xi) - LOAD(mu, mi)) * LOAD(rsig, mi);
db_val += LF(dy, xi); db_val += LOADF(dy, xi);
} }
dg_val = BlockReduce<float>(dg_storage).Reduce(dg_val, cub::Sum()); dg_val = BlockReduce<float>(dg_storage).Reduce(dg_val, cub::Sum());
db_val = BlockReduce<float>(db_storage).Reduce(db_val, cub::Sum()); db_val = BlockReduce<float>(db_storage).Reduce(db_val, cub::Sum());
...@@ -219,8 +219,8 @@ __global__ void _GroupNormInternalGrad( ...@@ -219,8 +219,8 @@ __global__ void _GroupNormInternalGrad(
const int xi = kOrder == StorageOrder::NCHW const int xi = kOrder == StorageOrder::NCHW
? i * inner_dim + j ? i * inner_dim + j
: (i / G * S + j % S) * G * D + gi; : (i / G * S + j % S) * G * D + gi;
ds_val += L(gamma, gi) * L(dy, xi) * L(x, xi); ds_val += LOAD(gamma, gi) * LOAD(dy, xi) * LOAD(x, xi);
db_val += L(gamma, gi) * L(dy, xi); db_val += LOAD(gamma, gi) * LOAD(dy, xi);
} }
ds_val = BlockReduce<Tp>(ds_storage).Reduce(ds_val, cub::Sum()); ds_val = BlockReduce<Tp>(ds_storage).Reduce(ds_val, cub::Sum());
db_val = BlockReduce<Tp>(db_storage).Reduce(db_val, cub::Sum()); db_val = BlockReduce<Tp>(db_storage).Reduce(db_val, cub::Sum());
...@@ -253,8 +253,8 @@ __global__ void _GroupNormInternalGradHalf( ...@@ -253,8 +253,8 @@ __global__ void _GroupNormInternalGradHalf(
const int xi = kOrder == StorageOrder::NCHW const int xi = kOrder == StorageOrder::NCHW
? i * inner_dim + j ? i * inner_dim + j
: (i / G * S + j % S) * G * D + gi; : (i / G * S + j % S) * G * D + gi;
ds_val += L(gamma, gi) * LF(dy, xi) * LF(x, xi); ds_val += LOAD(gamma, gi) * LOADF(dy, xi) * LOADF(x, xi);
db_val += L(gamma, gi) * LF(dy, xi); db_val += LOAD(gamma, gi) * LOADF(dy, xi);
} }
ds_val = BlockReduce<float>(ds_storage).Reduce(ds_val, cub::Sum()); ds_val = BlockReduce<float>(ds_storage).Reduce(ds_val, cub::Sum());
db_val = BlockReduce<float>(db_storage).Reduce(db_val, cub::Sum()); db_val = BlockReduce<float>(db_storage).Reduce(db_val, cub::Sum());
...@@ -285,10 +285,10 @@ __global__ void _GroupNormGrad( ...@@ -285,10 +285,10 @@ __global__ void _GroupNormGrad(
const int mi = kOrder == StorageOrder::NCHW ? i / (D * S) const int mi = kOrder == StorageOrder::NCHW ? i / (D * S)
: i / (C * S) * G + (i / D % G); : i / (C * S) * G + (i / D % G);
const int gi = kOrder == StorageOrder::NCHW ? (i / S) % C : i % C; const int gi = kOrder == StorageOrder::NCHW ? (i / S) % C : i % C;
const Tp u = fma(L(db, mi), L(mu, mi), -L(ds, mi)) * (L(x, i) - L(mu, mi)) * const Tp u = fma(LOAD(db, mi), LOAD(mu, mi), -LOAD(ds, mi)) *
utils::math::Cube(L(rsig, mi)); (LOAD(x, i) - LOAD(mu, mi)) * utils::math::Cube(LOAD(rsig, mi));
const Tp v = L(db, mi) * L(rsig, mi); const Tp v = LOAD(db, mi) * LOAD(rsig, mi);
dx[i] = L(gamma, gi) * L(dy, i) * L(rsig, mi) + (u - v) * denom; dx[i] = LOAD(gamma, gi) * LOAD(dy, i) * LOAD(rsig, mi) + (u - v) * denom;
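// ds and db are the per-group sums of gamma * dy * x and gamma * dy from
// _GroupNormInternalGrad; u is the gradient contribution through the group
// variance and v the contribution through the group mean.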
} }
} }
...@@ -312,14 +312,17 @@ __global__ void _GroupNormGradHalf( ...@@ -312,14 +312,17 @@ __global__ void _GroupNormGradHalf(
const int mi = kOrder == StorageOrder::NCHW ? i / (D * S) const int mi = kOrder == StorageOrder::NCHW ? i / (D * S)
: i / (C * S) * G + (i / D % G); : i / (C * S) * G + (i / D % G);
const int gi = kOrder == StorageOrder::NCHW ? (i / S) % C : i % C; const int gi = kOrder == StorageOrder::NCHW ? (i / S) % C : i % C;
const float u = fmaf(L(db, mi), L(mu, mi), -L(ds, mi)) * const float u = fmaf(LOAD(db, mi), LOAD(mu, mi), -LOAD(ds, mi)) *
(LF(x, i) - L(mu, mi)) * utils::math::Cube(L(rsig, mi)); (LOADF(x, i) - LOAD(mu, mi)) * utils::math::Cube(LOAD(rsig, mi));
const float v = L(db, mi) * L(rsig, mi); const float v = LOAD(db, mi) * LOAD(rsig, mi);
dx[i] = dx[i] = __float2half(
__float2half(L(gamma, gi) * LF(dy, i) * L(rsig, mi) + (u - v) * denom); LOAD(gamma, gi) * LOADF(dy, i) * LOAD(rsig, mi) + (u - v) * denom);
} }
} }
#undef LOAD
#undef LOADF
} // namespace } // namespace
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
...@@ -543,8 +546,6 @@ void GroupNormBackward<float16, float, CUDAContext>( ...@@ -543,8 +546,6 @@ void GroupNormBackward<float16, float, CUDAContext>(
DEFINE_KERNEL_LAUNCHER(float, float); DEFINE_KERNEL_LAUNCHER(float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(float, float); DEFINE_GRAD_KERNEL_LAUNCHER(float, float);
#undef L
#undef LF
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
#ifdef USE_CUDA #ifdef USE_CUDA
#include "dragon/core/context_cuda.h" #include "dragon/core/context_cuda.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -14,23 +15,9 @@ __global__ void ...@@ -14,23 +15,9 @@ __global__ void
_BiasAdd(const int nthreads, const int axis_dim, const T* x, const T* b, T* y) { _BiasAdd(const int nthreads, const int axis_dim, const T* x, const T* b, T* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 350 #if __CUDA_ARCH__ >= 350
y[i] = x[i] + __ldg(b + i % axis_dim); y[i] = math::PlusFunctor<T>()(x[i], __ldg(b + i % axis_dim));
#else #else
y[i] = x[i] + b[i % axis_dim]; y[i] = math::PlusFunctor<T>()(x[i], b[i % axis_dim]);
#endif
}
}
template <>
__global__ void _BiasAdd<half>(
const int nthreads,
const int axis_dim,
const half* x,
const half* b,
half* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530
y[i] = __hadd(x[i], __ldg(b + i % axis_dim));
#endif #endif
} }
} }
...@@ -45,24 +32,9 @@ __global__ void _BiasAdd( ...@@ -45,24 +32,9 @@ __global__ void _BiasAdd(
T* y) { T* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 350 #if __CUDA_ARCH__ >= 350
y[i] = x[i] + __ldg(b + (i / inner_dim) % axis_dim); y[i] = math::PlusFunctor<T>()(x[i], __ldg(b + (i / inner_dim) % axis_dim));
#else #else
y[i] = x[i] + b[(i / inner_dim) % axis_dim]; y[i] = math::PlusFunctor<T>()(x[i], b[(i / inner_dim) % axis_dim]);
#endif
}
}
template <>
__global__ void _BiasAdd<half>(
const int nthreads,
const int inner_dim,
const int axis_dim,
const half* x,
const half* b,
half* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530
y[i] = __hadd(x[i], __ldg(b + (i / inner_dim) % axis_dim));
#endif #endif
} }
} }
......
...@@ -10,6 +10,14 @@ namespace kernel { ...@@ -10,6 +10,14 @@ namespace kernel {
namespace { namespace {
#if __CUDA_ARCH__ >= 350
#define LOAD(x, i) __ldg(x + i)
#define LOADF(x, i) __half2float(__ldg(x + i))
#else
#define LOAD(x, i) x[i]
#define LOADF(x, i) __half2float(x[i])
#endif
template <typename T> template <typename T>
float ComputeScale(T in_size, T out_size, bool align_corners) { float ComputeScale(T in_size, T out_size, bool align_corners) {
if (align_corners) { if (align_corners) {
...@@ -62,17 +70,10 @@ __global__ void _ResizeLinearNCHW( ...@@ -62,17 +70,10 @@ __global__ void _ResizeLinearNCHW(
const float u = w_in - li; const float u = w_in - li;
const int offset = (n * C + c) * H; const int offset = (n * C + c) * H;
#if __CUDA_ARCH__ >= 350 const float tl = LOAD(x, ((offset + ti) * W + li));
const float tl = __ldg(x + ((offset + ti) * W + li)); const float tr = LOAD(x, ((offset + ti) * W + ri));
const float tr = __ldg(x + ((offset + ti) * W + ri)); const float bl = LOAD(x, ((offset + bi) * W + li));
const float bl = __ldg(x + ((offset + bi) * W + li)); const float br = LOAD(x, ((offset + bi) * W + ri));
const float br = __ldg(x + ((offset + bi) * W + ri));
#else
const float tl = x[(offset + ti) * W + li];
const float tr = x[(offset + ti) * W + ri];
const float bl = x[(offset + bi) * W + li];
const float br = x[(offset + bi) * W + ri];
#endif
const float t = tl + (tr - tl) * u; const float t = tl + (tr - tl) * u;
const float b = bl + (br - bl) * u; const float b = bl + (br - bl) * u;
y[yi] = (T)(t + (b - t) * v); y[yi] = (T)(t + (b - t) * v);
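// Bilinear interpolation: blend the top and bottom neighbor pairs along W by u,
// then blend the two results along H by v.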
...@@ -93,7 +94,6 @@ __global__ void _ResizeLinearNCHW<half>( ...@@ -93,7 +94,6 @@ __global__ void _ResizeLinearNCHW<half>(
const half* x, const half* x,
half* y) { half* y) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) { CUDA_1D_KERNEL_LOOP(yi, nthreads) {
#if __CUDA_ARCH__ >= 530
const int w = yi % out_w; const int w = yi % out_w;
const int h = (yi / out_w) % out_h; const int h = (yi / out_w) % out_h;
const int c = (yi / out_w / out_h) % C; const int c = (yi / out_w / out_h) % C;
...@@ -110,15 +110,13 @@ __global__ void _ResizeLinearNCHW<half>( ...@@ -110,15 +110,13 @@ __global__ void _ResizeLinearNCHW<half>(
const float u = w_in - li; const float u = w_in - li;
const int offset = (n * C + c) * H; const int offset = (n * C + c) * H;
const float tl = __half2float(__ldg(x + ((offset + ti) * W + li))); const float tl = LOADF(x, ((offset + ti) * W + li));
const float tr = __half2float(__ldg(x + ((offset + ti) * W + ri))); const float tr = LOADF(x, ((offset + ti) * W + ri));
const float bl = __half2float(__ldg(x + ((offset + bi) * W + li))); const float bl = LOADF(x, ((offset + bi) * W + li));
const float br = __half2float(__ldg(x + ((offset + bi) * W + ri))); const float br = LOADF(x, ((offset + bi) * W + ri));
const float t = tl + (tr - tl) * u; const float t = tl + (tr - tl) * u;
const float b = bl + (br - bl) * u; const float b = bl + (br - bl) * u;
y[yi] = __float2half(t + (b - t) * v); y[yi] = __float2half(t + (b - t) * v);
#endif
} }
} }
...@@ -152,17 +150,10 @@ __global__ void _ResizeLinearNHWC( ...@@ -152,17 +150,10 @@ __global__ void _ResizeLinearNHWC(
const float u = w_in - li; const float u = w_in - li;
const int offset = n * H; const int offset = n * H;
#if __CUDA_ARCH__ >= 350 const float tl = LOAD(x, (((offset + ti) * W + li) * C + c));
const float tl = __ldg(x + (((offset + ti) * W + li) * C + c)); const float tr = LOAD(x, (((offset + ti) * W + ri) * C + c));
const float tr = __ldg(x + (((offset + ti) * W + ri) * C + c)); const float bl = LOAD(x, (((offset + bi) * W + li) * C + c));
const float bl = __ldg(x + (((offset + bi) * W + li) * C + c)); const float br = LOAD(x, (((offset + bi) * W + ri) * C + c));
const float br = __ldg(x + (((offset + bi) * W + ri) * C + c));
#else
const float tl = x[((offset + ti) * W + li) * C + c];
const float tr = x[((offset + ti) * W + ri) * C + c];
const float bl = x[((offset + bi) * W + li) * C + c];
const float br = x[((offset + bi) * W + ri) * C + c];
#endif
const float t = tl + (tr - tl) * u; const float t = tl + (tr - tl) * u;
const float b = bl + (br - bl) * u; const float b = bl + (br - bl) * u;
y[yi] = (T)(t + (b - t) * v); y[yi] = (T)(t + (b - t) * v);
...@@ -183,7 +174,6 @@ __global__ void _ResizeLinearNHWC<half>( ...@@ -183,7 +174,6 @@ __global__ void _ResizeLinearNHWC<half>(
const half* x, const half* x,
half* y) { half* y) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) { CUDA_1D_KERNEL_LOOP(yi, nthreads) {
#if __CUDA_ARCH__ >= 530
const int c = yi % C; const int c = yi % C;
const int w = (yi / C) % out_w; const int w = (yi / C) % out_w;
const int h = (yi / C / out_w) % out_h; const int h = (yi / C / out_w) % out_h;
...@@ -200,19 +190,13 @@ __global__ void _ResizeLinearNHWC<half>( ...@@ -200,19 +190,13 @@ __global__ void _ResizeLinearNHWC<half>(
const float u = w_in - li; const float u = w_in - li;
const int offset = n * H; const int offset = n * H;
const float tl = const float tl = LOADF(x, (((offset + ti) * W + li) * C + c));
__half2float(__ldg(x + (((offset + ti) * W + li) * C + c))); const float tr = LOADF(x, (((offset + ti) * W + ri) * C + c));
const float tr = const float bl = LOADF(x, (((offset + bi) * W + li) * C + c));
__half2float(__ldg(x + (((offset + ti) * W + ri) * C + c))); const float br = LOADF(x, (((offset + bi) * W + ri) * C + c));
const float bl =
__half2float(__ldg(x + (((offset + bi) * W + li) * C + c)));
const float br =
__half2float(__ldg(x + (((offset + bi) * W + ri) * C + c)));
const float t = tl + (tr - tl) * u; const float t = tl + (tr - tl) * u;
const float b = bl + (br - bl) * u; const float b = bl + (br - bl) * u;
y[yi] = __float2half(t + (b - t) * v); y[yi] = __float2half(t + (b - t) * v);
#endif
} }
} }
...@@ -245,13 +229,8 @@ __global__ void _ResizeLinearGradNCHW( ...@@ -245,13 +229,8 @@ __global__ void _ResizeLinearGradNCHW(
const int ri = (w_in < W - 1) ? ceilf(w_in) : W - 1; const int ri = (w_in < W - 1) ? ceilf(w_in) : W - 1;
const float u = w_in - li; const float u = w_in - li;
#if __CUDA_ARCH__ >= 350 const float dt = (1.f - v) * LOAD(dy, yi);
const float dt = (1.f - v) * ((float)__ldg(dy + yi)); const float db = v * LOAD(dy, yi);
const float db = v * ((float)__ldg(dy + yi));
#else
const float dt = (1.f - v) * ((float)dy[yi]);
const float db = v * ((float)dy[yi]);
#endif
const int offset = (n * C + c) * H; const int offset = (n * C + c) * H;
atomicAdd(&dx[(offset + ti) * W + li], (1.f - u) * dt); atomicAdd(&dx[(offset + ti) * W + li], (1.f - u) * dt);
...@@ -275,7 +254,6 @@ __global__ void _ResizeLinearGradNCHW<half>( ...@@ -275,7 +254,6 @@ __global__ void _ResizeLinearGradNCHW<half>(
const half* dy, const half* dy,
float* dx) { float* dx) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) { CUDA_1D_KERNEL_LOOP(yi, nthreads) {
#if __CUDA_ARCH__ >= 530
const int w = yi % out_w; const int w = yi % out_w;
const int h = (yi / out_w) % out_h; const int h = (yi / out_w) % out_h;
const int c = (yi / out_w / out_h) % C; const int c = (yi / out_w / out_h) % C;
...@@ -291,15 +269,14 @@ __global__ void _ResizeLinearGradNCHW<half>( ...@@ -291,15 +269,14 @@ __global__ void _ResizeLinearGradNCHW<half>(
const int ri = (w_in < W - 1) ? ceilf(w_in) : W - 1; const int ri = (w_in < W - 1) ? ceilf(w_in) : W - 1;
const float u = w_in - li; const float u = w_in - li;
const float dt = (1.f - v) * __half2float(__ldg(dy + yi)); const float dt = (1.f - v) * LOADF(dy, yi);
const float db = v * __half2float(__ldg(dy + yi)); const float db = v * LOADF(dy, yi);
const int offset = (n * C + c) * H; const int offset = (n * C + c) * H;
atomicAdd(&dx[(offset + ti) * W + li], (1.f - u) * dt); atomicAdd(&dx[(offset + ti) * W + li], (1.f - u) * dt);
atomicAdd(&dx[(offset + ti) * W + ri], u * dt); atomicAdd(&dx[(offset + ti) * W + ri], u * dt);
atomicAdd(&dx[(offset + bi) * W + li], (1.f - u) * db); atomicAdd(&dx[(offset + bi) * W + li], (1.f - u) * db);
atomicAdd(&dx[(offset + bi) * W + ri], u * db); atomicAdd(&dx[(offset + bi) * W + ri], u * db);
#endif
} }
} }
...@@ -332,13 +309,8 @@ __global__ void _ResizeLinearGradNHWC( ...@@ -332,13 +309,8 @@ __global__ void _ResizeLinearGradNHWC(
const int ri = (w_in < W - 1) ? ceilf(w_in) : W - 1; const int ri = (w_in < W - 1) ? ceilf(w_in) : W - 1;
const float u = w_in - li; const float u = w_in - li;
#if __CUDA_ARCH__ >= 350 const float dt = (1.f - v) * LOAD(dy, yi);
const float dt = (1.f - v) * ((float)__ldg(dy + yi)); const float db = v * LOAD(dy, yi);
const float db = v * ((float)__ldg(dy + yi));
#else
const float dt = (1.f - v) * ((float)dy[yi]);
const float db = v * ((float)dy[yi]);
#endif
const int offset = n * H; const int offset = n * H;
atomicAdd(&dx[((offset + ti) * W + li) * C + c], (1.f - u) * dt); atomicAdd(&dx[((offset + ti) * W + li) * C + c], (1.f - u) * dt);
...@@ -348,6 +320,49 @@ __global__ void _ResizeLinearGradNHWC( ...@@ -348,6 +320,49 @@ __global__ void _ResizeLinearGradNHWC(
} }
} }
template <>
__global__ void _ResizeLinearGradNHWC<half>(
const int nthreads,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const float scale_h,
const float scale_w,
const bool align_corners,
const half* dy,
float* dx) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int c = yi % C;
const int w = (yi / C) % out_w;
const int h = (yi / C / out_w) % out_h;
const int n = yi / C / out_w / out_h;
const float h_in = TransformCoordinate(h, scale_h, align_corners);
const int ti = floorf(h_in);
const int bi = (h_in < H - 1) ? ceilf(h_in) : H - 1;
const float v = h_in - ti;
const float w_in = TransformCoordinate(w, scale_w, align_corners);
const int li = floorf(w_in);
const int ri = (w_in < W - 1) ? ceilf(w_in) : W - 1;
const float u = w_in - li;
const float dt = (1.f - v) * LOADF(dy, yi);
const float db = v * LOADF(dy, yi);
const int offset = n * H;
atomicAdd(&dx[((offset + ti) * W + li) * C + c], (1.f - u) * dt);
atomicAdd(&dx[((offset + ti) * W + ri) * C + c], u * dt);
atomicAdd(&dx[((offset + bi) * W + li) * C + c], (1.f - u) * db);
atomicAdd(&dx[((offset + bi) * W + ri) * C + c], u * db);
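// The incoming gradient is scattered to the four neighboring inputs with the
// same bilinear weights as the forward pass; dx is an FP32 buffer, so the
// atomics accumulate the half gradients in full precision.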
}
}
#undef LOAD
#undef LOADF
} // namespace } // namespace
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
......
...@@ -104,15 +104,17 @@ __global__ void _ResizeNearestGradNCHW<half>( ...@@ -104,15 +104,17 @@ __global__ void _ResizeNearestGradNCHW<half>(
const half* dy, const half* dy,
float* dx) { float* dx) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) { CUDA_1D_KERNEL_LOOP(yi, nthreads) {
#if __CUDA_ARCH__ >= 530
const int w = yi % out_w; const int w = yi % out_w;
const int h = (yi / out_w) % out_h; const int h = (yi / out_w) % out_h;
const int c = (yi / out_w / out_h) % C; const int c = (yi / out_w / out_h) % C;
const int n = yi / out_w / out_h / C; const int n = yi / out_w / out_h / C;
const int h_in = min(int(h * scale_h), H - 1); const int h_in = min(int(h * scale_h), H - 1);
const int w_in = min(int(w * scale_w), W - 1); const int w_in = min(int(w * scale_w), W - 1);
#if __CUDA_ARCH__ >= 350
atomicAdd( atomicAdd(
&dx[((n * C + c) * H + h_in) * W + w_in], __half2float(__ldg(dy + yi))); &dx[((n * C + c) * H + h_in) * W + w_in], __half2float(__ldg(dy + yi)));
#else
atomicAdd(&dx[((n * C + c) * H + h_in) * W + w_in], __half2float(dy[yi]));
#endif #endif
} }
} }
...@@ -157,15 +159,17 @@ __global__ void _ResizeNearestGradNHWC<half>( ...@@ -157,15 +159,17 @@ __global__ void _ResizeNearestGradNHWC<half>(
const half* dy, const half* dy,
float* dx) { float* dx) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) { CUDA_1D_KERNEL_LOOP(yi, nthreads) {
#if __CUDA_ARCH__ >= 530
const int c = yi % C; const int c = yi % C;
const int w = (yi / C) % out_w; const int w = (yi / C) % out_w;
const int h = (yi / C / out_w) % out_h; const int h = (yi / C / out_w) % out_h;
const int n = yi / C / out_w / out_h; const int n = yi / C / out_w / out_h;
const int h_in = min(int(h * scale_h), H - 1); const int h_in = min(int(h * scale_h), H - 1);
const int w_in = min(int(w * scale_w), W - 1); const int w_in = min(int(w * scale_w), W - 1);
#if __CUDA_ARCH__ >= 350
atomicAdd( atomicAdd(
&dx[((n * H + h_in) * W + w_in) * C + c], __half2float(__ldg(dy + yi))); &dx[((n * H + h_in) * W + w_in) * C + c], __half2float(__ldg(dy + yi)));
#else
atomicAdd(&dx[((n * H + h_in) * W + w_in) * C + c], __half2float(dy[yi]));
#endif #endif
} }
} }
......
...@@ -9,6 +9,14 @@ namespace kernel { ...@@ -9,6 +9,14 @@ namespace kernel {
namespace { namespace {
#if __CUDA_ARCH__ >= 350
#define LOAD(x, i) __ldg(x + i)
#define LOADF(x, i) __half2float(__ldg(x + i))
#else
#define LOAD(x, i) x[i]
#define LOADF(x, i) __half2float(x[i])
#endif
template <typename T> template <typename T>
__device__ float __device__ float
_RoiAlignIntp(const int H, const int W, float h, float w, const T* x) { _RoiAlignIntp(const int H, const int W, float h, float w, const T* x) {
...@@ -34,17 +42,10 @@ _RoiAlignIntp(const int H, const int W, float h, float w, const T* x) { ...@@ -34,17 +42,10 @@ _RoiAlignIntp(const int H, const int W, float h, float w, const T* x) {
w = (float)li; w = (float)li;
} }
#if __CUDA_ARCH__ >= 350 const float tl = LOAD(x, (ti * W + li));
const float tl = __ldg(x + (ti * W + li)); const float tr = LOAD(x, (ti * W + ri));
const float tr = __ldg(x + (ti * W + ri)); const float bl = LOAD(x, (bi * W + li));
const float bl = __ldg(x + (bi * W + li)); const float br = LOAD(x, (bi * W + ri));
const float br = __ldg(x + (bi * W + ri));
#else
const float tl = x[ti * W + li];
const float tr = x[ti * W + ri];
const float bl = x[bi * W + li];
const float br = x[bi * W + ri];
#endif
const float v = h - ti; const float v = h - ti;
const float u = w - li; const float u = w - li;
...@@ -79,17 +80,10 @@ _RoiAlignIntp<half>(const int H, const int W, float h, float w, const half* x) { ...@@ -79,17 +80,10 @@ _RoiAlignIntp<half>(const int H, const int W, float h, float w, const half* x) {
w = (float)li; w = (float)li;
} }
#if __CUDA_ARCH__ >= 350 const float tl = LOADF(x, (ti * W + li));
const float tl = __half2float(__ldg(x + (ti * W + li))); const float tr = LOADF(x, (ti * W + ri));
const float tr = __half2float(__ldg(x + (ti * W + ri))); const float bl = LOADF(x, (bi * W + li));
const float bl = __half2float(__ldg(x + (bi * W + li))); const float br = LOADF(x, (bi * W + ri));
const float br = __half2float(__ldg(x + (bi * W + ri)));
#else
const float tl = __half2float(x[ti * W + li]);
const float tr = __half2float(x[ti * W + ri]);
const float bl = __half2float(x[bi * W + li]);
const float br = __half2float(x[bi * W + ri]);
#endif
const float v = h - ti; const float v = h - ti;
const float u = w - li; const float u = w - li;
...@@ -389,6 +383,9 @@ __global__ void _RoiAlignGrad<half>( ...@@ -389,6 +383,9 @@ __global__ void _RoiAlignGrad<half>(
} }
} }
#undef LOAD
#undef LOADF
} // namespace } // namespace
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
......
...@@ -58,30 +58,29 @@ __global__ void _RoiPool( ...@@ -58,30 +58,29 @@ __global__ void _RoiPool(
wend = min(max(wend + roi_start_w, 0), W); wend = min(max(wend + roi_start_w, 0), W);
const bool empty = (hend <= hstart) || (wend <= wstart); const bool empty = (hend <= hstart) || (wend <= wstart);
int max_idx = empty ? -1 : 0;
const T* offset_x = x + (batch_ind * C + c) * H * W; const T* offset_x = x + (batch_ind * C + c) * H * W;
int maxi = empty ? -1 : 0;
T val = empty ? T(0) : offset_x[0]; T val = empty ? T(0) : offset_x[0];
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
int xi = h * W + w; const int xi = h * W + w;
#if __CUDA_ARCH__ >= 350 #if __CUDA_ARCH__ >= 350
if (__ldg(offset_x + xi) > val) { if (__ldg(offset_x + xi) > val) {
maxi = xi;
val = __ldg(offset_x + xi); val = __ldg(offset_x + xi);
max_idx = xi;
} }
#else #else
if (x[xi] > val) { if (offset_x[xi] > val) {
maxi = xi;
val = offset_x[xi]; val = offset_x[xi];
max_idx = xi;
} }
#endif #endif
} }
} }
y[yi] = val; y[yi] = val;
mask[yi] = maxi; mask[yi] = max_idx;
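// mask stores the flat argmax offset within the (batch_ind, c) plane, or -1
// for an empty RoI; the backward kernel routes dy through this index.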
} }
} }
...@@ -98,9 +97,7 @@ __global__ void _RoiPool<half>( ...@@ -98,9 +97,7 @@ __global__ void _RoiPool<half>(
const float* rois, const float* rois,
int* mask, int* mask,
half* y) { half* y) {
const half kZero = __float2half(0.f);
CUDA_1D_KERNEL_LOOP(yi, nthreads) { CUDA_1D_KERNEL_LOOP(yi, nthreads) {
#if __CUDA_ARCH__ >= 530
const int ow = yi % out_w; const int ow = yi % out_w;
const int oh = (yi / out_w) % out_h; const int oh = (yi / out_w) % out_h;
const int c = (yi / out_w / out_h) % C; const int c = (yi / out_w / out_h) % C;
...@@ -136,24 +133,42 @@ __global__ void _RoiPool<half>( ...@@ -136,24 +133,42 @@ __global__ void _RoiPool<half>(
wend = min(max(wend + roi_start_w, 0), W); wend = min(max(wend + roi_start_w, 0), W);
const bool empty = (hend <= hstart) || (wend <= wstart); const bool empty = (hend <= hstart) || (wend <= wstart);
int max_idx = empty ? -1 : 0;
const half* offset_x = x + ((batch_ind * C + c) * H * W); const half* offset_x = x + ((batch_ind * C + c) * H * W);
#if __CUDA_ARCH__ >= 530
int maxi = empty ? -1 : 0; half val = empty ? __float2half(0.f) : __ldg(offset_x);
half val = empty ? kZero : __ldg(offset_x); #else
float val = empty ? 0.f : __half2float(*offset_x);
#endif
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
int xi = h * W + w; const int xi = h * W + w;
#if __CUDA_ARCH__ >= 530
if (__hgt(__ldg(offset_x + xi), val)) { if (__hgt(__ldg(offset_x + xi), val)) {
maxi = xi;
val = __ldg(offset_x + xi); val = __ldg(offset_x + xi);
max_idx = xi;
}
#elif __CUDA_ARCH__ >= 350
if (__half2float(__ldg(offset_x + xi)) > val) {
val = __half2float(__ldg(offset_x + xi));
max_idx = xi;
} }
#else
if (__half2float(offset_x[xi]) > val) {
val = __half2float(offset_x[xi]);
max_idx = xi;
}
#endif
} }
} }
#if __CUDA_ARCH__ >= 530
y[yi] = val; y[yi] = val;
mask[yi] = maxi; #else
y[yi] = __float2half(val);
#endif #endif
mask[yi] = max_idx;
} }
} }
...@@ -205,19 +220,22 @@ __global__ void _RoiPoolGrad<half>( ...@@ -205,19 +220,22 @@ __global__ void _RoiPoolGrad<half>(
const int* mask, const int* mask,
float* dx) { float* dx) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) { CUDA_1D_KERNEL_LOOP(yi, nthreads) {
#if __CUDA_ARCH__ >= 530
const int c = (yi / out_w / out_h) % C; const int c = (yi / out_w / out_h) % C;
const int n = yi / out_w / out_h / C; const int n = yi / out_w / out_h / C;
const float* roi = rois + n * 5; const float* roi = rois + n * 5;
const int batch_ind = roi[0]; const int batch_ind = roi[0];
if (batch_ind < 0) continue; if (batch_ind < 0) continue;
float* offset_dx = dx + (batch_ind * C + c) * H * W; float* offset_dx = dx + (batch_ind * C + c) * H * W;
#if __CUDA_ARCH__ >= 350
if (__ldg(mask + yi) != -1) { if (__ldg(mask + yi) != -1) {
atomicAdd(offset_dx + __ldg(mask + yi), __half2float(dy[yi])); atomicAdd(offset_dx + __ldg(mask + yi), __half2float(dy[yi]));
} }
#else
if (mask[yi] != -1) {
atomicAdd(offset_dx + mask[yi], __half2float(dy[yi]));
}
#endif #endif
} }
} }
......
...@@ -12,27 +12,20 @@ void DropBlock2dOp<Context>::DoRunWithType() { ...@@ -12,27 +12,20 @@ void DropBlock2dOp<Context>::DoRunWithType() {
if (phase() == "TEST") { if (phase() == "TEST") {
Y->ReshapeLike(X)->CopyFrom(X, ctx()); Y->ReshapeLike(X)->CopyFrom(X, ctx());
} else if (phase() == "TRAIN") { } else if (phase() == "TRAIN") {
int64_t feat_h, feat_w, seed_h, seed_w; int64_t feature_h, feature_w, seed_h, seed_w;
if (data_format() == "NCHW") { if (data_format() == "NCHW") {
feat_h = X.dim(2), feat_w = X.dim(3); feature_h = X.dim(2), feature_w = X.dim(3);
} else if (data_format() == "NHWC") { } else if (data_format() == "NHWC") {
feat_h = X.dim(1), feat_w = X.dim(2); feature_h = X.dim(1), feature_w = X.dim(2);
} else { } else {
LOG(FATAL) << "Unknown DataFormat: " << data_format(); LOG(FATAL) << "Unknown DataFormat: " << data_format();
} }
seed_h = feat_h - block_size_ + 1; seed_h = feature_h - block_size_ + 1;
seed_w = feat_w - block_size_ + 1; seed_w = feature_w - block_size_ + 1;
CHECK(seed_h > 0 && seed_w > 0) << "\nExcepted block_size <= feat_size."; CHECK(seed_h > 0 && seed_w > 0) << "\nExpected block_size <= feature_size.";
// Schedule the keep ratio
auto kp = keep_prob();
if (decrement_ > 0.f && prob_ > kp) {
prob_ -= decrement_;
} else {
prob_ = kp; // Fixed to the limit value
}
// Compute the drop ratio // Compute the drop ratio
float gamma = (1.f - prob_) / std::pow(block_size_, 2); float gamma = ratio() / std::pow(block_size_, 2);
gamma *= (alpha_ * (feat_h * feat_w) / (seed_h * seed_w)); gamma *= (float(feature_h * feature_w) / float(seed_h * seed_w));
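// For example (illustrative values): ratio = 0.1, block_size = 7 and a 28x28
// feature map give seed_h = seed_w = 22, so
// gamma = 0.1 / 49 * (784 / 484), i.e. about 0.0033 of the seed positions
// start a dropped block.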
// Prepare buffers // Prepare buffers
auto* mask = Buffer("mask") auto* mask = Buffer("mask")
->ReshapeLike(X) ->ReshapeLike(X)
...@@ -50,8 +43,8 @@ void DropBlock2dOp<Context>::DoRunWithType() { ...@@ -50,8 +43,8 @@ void DropBlock2dOp<Context>::DoRunWithType() {
kernel::DropBlock2d( kernel::DropBlock2d(
X.dim(0), X.dim(0),
data_format() == "NCHW" ? X.dim(1) : X.dim(-1), data_format() == "NCHW" ? X.dim(1) : X.dim(-1),
feat_h, feature_h,
feat_w, feature_w,
seed_h, seed_h,
seed_w, seed_w,
block_size_, block_size_,
......
...@@ -22,10 +22,8 @@ class DropBlock2dOp final : public Operator<Context> { ...@@ -22,10 +22,8 @@ class DropBlock2dOp final : public Operator<Context> {
public: public:
DropBlock2dOp(const OperatorDef& def, Workspace* ws) DropBlock2dOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws), : Operator<Context>(def, ws),
block_size_(OP_SINGLE_ARG(int64_t, "block_size", 7)), block_size_(OP_SINGLE_ARG(int64_t, "block_size", 7)) {
alpha_(OP_SINGLE_ARG(float, "alpha", 1.f)), INIT_OP_SINGLE_ARG_WITH_DESC(float, ratio, 0.1f);
decrement_(OP_SINGLE_ARG(float, "decrement", 0.f)) {
INIT_OP_SINGLE_ARG_WITH_DESC(float, keep_prob, 0.9f);
} }
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
...@@ -36,8 +34,7 @@ class DropBlock2dOp final : public Operator<Context> { ...@@ -36,8 +34,7 @@ class DropBlock2dOp final : public Operator<Context> {
protected: protected:
int64_t block_size_; int64_t block_size_;
float alpha_, decrement_, prob_ = 1.; DECLARE_OP_SINGLE_ARG_WITH_DESC(float, ratio);
DECLARE_OP_SINGLE_ARG_WITH_DESC(float, keep_prob);
}; };
template <class Context> template <class Context>
...@@ -52,7 +49,7 @@ class DropBlock2dGradientOp final : public Operator<Context> { ...@@ -52,7 +49,7 @@ class DropBlock2dGradientOp final : public Operator<Context> {
void DoRunWithType(); void DoRunWithType();
}; };
DEFINE_OP_SINGLE_ARG_WITH_DESC(float, DropBlock2dOp, keep_prob); DEFINE_OP_SINGLE_ARG_WITH_DESC(float, DropBlock2dOp, ratio);
} // namespace dragon } // namespace dragon
......
...@@ -12,27 +12,19 @@ void DropPathOp<Context>::DoRunWithType() { ...@@ -12,27 +12,19 @@ void DropPathOp<Context>::DoRunWithType() {
if (phase() == "TEST") { if (phase() == "TEST") {
Y->ReshapeLike(X)->CopyFrom(X, ctx()); Y->ReshapeLike(X)->CopyFrom(X, ctx());
} else if (phase() == "TRAIN") { } else if (phase() == "TRAIN") {
// Schedule the drop ratio
auto dp = prob();
if (inc_ > 0.f && drop_prob_ < dp) {
drop_prob_ += inc_;
} else {
drop_prob_ = dp; // Fixed to the limit value
}
auto* mask = Buffer("mask") auto* mask = Buffer("mask")
->Reshape({X.dim(0)}) ->Reshape({X.dim(0)})
->template mutable_data<float, Context>(); ->template mutable_data<float, Context>();
auto* scale = Buffer("scale") auto* scale = Buffer("scale")
->Reshape({}) ->Reshape({})
->template mutable_data<float, CPUContext>(); ->template mutable_data<float, CPUContext>();
scale[0] = 1.f / (1.f - drop_prob_);
// Generate mask for each example // Generate mask for each example
math::RandomUniform(X.dim(0), 0.f, 1.f, mask, ctx()); math::RandomUniform(X.dim(0), 0.f, 1.f, mask, ctx());
// Apply mask to the feature // Apply mask to the feature
kernel::DropPath( kernel::DropPath(
X.dim(0), X.dim(0),
X.stride(0), X.stride(0),
scale[0], 1.f / (1.f - ratio()),
X.template data<T, Context>(), X.template data<T, Context>(),
mask, mask,
Y->ReshapeLike(X)->template mutable_data<T, Context>(), Y->ReshapeLike(X)->template mutable_data<T, Context>(),
...@@ -57,7 +49,7 @@ void DropPathGradientOp<Context>::DoRunWithType() { ...@@ -57,7 +49,7 @@ void DropPathGradientOp<Context>::DoRunWithType() {
kernel::DropPath( kernel::DropPath(
dY.dim(0), dY.dim(0),
dY.stride(0), dY.stride(0),
Buffer("scale")->template data<float, CPUContext>()[0], 1.f / (1.f - ratio()),
dY.template data<T, Context>(), dY.template data<T, Context>(),
Buffer("mask")->template data<float, Context>(), Buffer("mask")->template data<float, Context>(),
dX->ReshapeLike(dY)->template mutable_data<T, Context>(), dX->ReshapeLike(dY)->template mutable_data<T, Context>(),
......
...@@ -10,8 +10,8 @@ ...@@ -10,8 +10,8 @@
* ------------------------------------------------------------ * ------------------------------------------------------------
*/ */
#ifndef DRAGON_OPERATORS_ACTIVATION_DROPPATH_OP_H_ #ifndef DRAGON_OPERATORS_ACTIVATION_DROP_PATH_OP_H_
#define DRAGON_OPERATORS_ACTIVATION_DROPPATH_OP_H_ #define DRAGON_OPERATORS_ACTIVATION_DROP_PATH_OP_H_
#include "dragon/core/operator.h" #include "dragon/core/operator.h"
...@@ -21,9 +21,8 @@ template <class Context> ...@@ -21,9 +21,8 @@ template <class Context>
class DropPathOp final : public Operator<Context> { class DropPathOp final : public Operator<Context> {
public: public:
DropPathOp(const OperatorDef& def, Workspace* ws) DropPathOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws), : Operator<Context>(def, ws) {
inc_(OP_SINGLE_ARG(float, "increment", 0.f)) { INIT_OP_SINGLE_ARG_WITH_DESC(float, ratio, 0.2f);
INIT_OP_SINGLE_ARG_WITH_DESC(float, prob, 0.2f);
} }
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
...@@ -33,24 +32,30 @@ class DropPathOp final : public Operator<Context> { ...@@ -33,24 +32,30 @@ class DropPathOp final : public Operator<Context> {
void DoRunWithType(); void DoRunWithType();
protected: protected:
float inc_, drop_prob_ = 0.f; DECLARE_OP_SINGLE_ARG_WITH_DESC(float, ratio);
DECLARE_OP_SINGLE_ARG_WITH_DESC(float, prob);
}; };
template <class Context> template <class Context>
class DropPathGradientOp final : public Operator<Context> { class DropPathGradientOp final : public Operator<Context> {
public: public:
SIMPLE_CTOR_DTOR(DropPathGradientOp); DropPathGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws) {
INIT_OP_SINGLE_ARG_WITH_DESC(float, ratio, 0.5f);
}
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override; void RunOnDevice() override;
template <typename T> template <typename T>
void DoRunWithType(); void DoRunWithType();
protected:
DECLARE_OP_SINGLE_ARG_WITH_DESC(float, ratio);
}; };
DEFINE_OP_SINGLE_ARG_WITH_DESC(float, DropPathOp, prob); DEFINE_OP_SINGLE_ARG_WITH_DESC(float, DropPathOp, ratio);
DEFINE_OP_SINGLE_ARG_WITH_DESC(float, DropPathGradientOp, ratio);
} // namespace dragon } // namespace dragon
#endif // DRAGON_OPERATORS_ACTIVATION_DROPPATH_OP_H_ #endif // DRAGON_OPERATORS_ACTIVATION_DROP_PATH_OP_H_
...@@ -15,8 +15,8 @@ void DropoutOp<Context>::DoRunWithType() { ...@@ -15,8 +15,8 @@ void DropoutOp<Context>::DoRunWithType() {
Buffer("mask")->ReshapeLike(X); Buffer("mask")->ReshapeLike(X);
kernel::Dropout( kernel::Dropout(
X.count(), X.count(),
prob(), ratio(),
1.f / (1.f - prob()), 1.f / (1.f - ratio()),
X.template data<T, Context>(), X.template data<T, Context>(),
Buffer("mask")->template mutable_data<uint8_t, Context>(), Buffer("mask")->template mutable_data<uint8_t, Context>(),
Y->ReshapeLike(X)->template mutable_data<T, Context>(), Y->ReshapeLike(X)->template mutable_data<T, Context>(),
...@@ -41,7 +41,7 @@ void DropoutGradientOp<Context>::DoRunWithType() { ...@@ -41,7 +41,7 @@ void DropoutGradientOp<Context>::DoRunWithType() {
} else if (phase() == "TRAIN") { } else if (phase() == "TRAIN") {
kernel::ApplyMask( kernel::ApplyMask(
dY.count(), dY.count(),
1.f / (1.f - prob()), 1.f / (1.f - ratio()),
dY.template data<T, Context>(), dY.template data<T, Context>(),
Buffer("mask")->template data<uint8_t, Context>(), Buffer("mask")->template data<uint8_t, Context>(),
dX->ReshapeLike(dY)->template mutable_data<T, Context>(), dX->ReshapeLike(dY)->template mutable_data<T, Context>(),
......
...@@ -22,7 +22,7 @@ class DropoutOp : public Operator<Context> { ...@@ -22,7 +22,7 @@ class DropoutOp : public Operator<Context> {
public: public:
DropoutOp(const OperatorDef& def, Workspace* ws) DropoutOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws) { : Operator<Context>(def, ws) {
INIT_OP_SINGLE_ARG_WITH_DESC(float, prob, 0.5f); INIT_OP_SINGLE_ARG_WITH_DESC(float, ratio, 0.5f);
} }
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
...@@ -32,7 +32,7 @@ class DropoutOp : public Operator<Context> { ...@@ -32,7 +32,7 @@ class DropoutOp : public Operator<Context> {
void DoRunWithType(); void DoRunWithType();
protected: protected:
DECLARE_OP_SINGLE_ARG_WITH_DESC(float, prob); DECLARE_OP_SINGLE_ARG_WITH_DESC(float, ratio);
}; };
template <class Context> template <class Context>
...@@ -40,7 +40,7 @@ class DropoutGradientOp : public Operator<Context> { ...@@ -40,7 +40,7 @@ class DropoutGradientOp : public Operator<Context> {
public: public:
DropoutGradientOp(const OperatorDef& def, Workspace* ws) DropoutGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws) { : Operator<Context>(def, ws) {
INIT_OP_SINGLE_ARG_WITH_DESC(float, prob, 0.5f); INIT_OP_SINGLE_ARG_WITH_DESC(float, ratio, 0.5f);
} }
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
...@@ -50,11 +50,11 @@ class DropoutGradientOp : public Operator<Context> { ...@@ -50,11 +50,11 @@ class DropoutGradientOp : public Operator<Context> {
void DoRunWithType(); void DoRunWithType();
protected: protected:
DECLARE_OP_SINGLE_ARG_WITH_DESC(float, prob); DECLARE_OP_SINGLE_ARG_WITH_DESC(float, ratio);
}; };
DEFINE_OP_SINGLE_ARG_WITH_DESC(float, DropoutOp, prob); DEFINE_OP_SINGLE_ARG_WITH_DESC(float, DropoutOp, ratio);
DEFINE_OP_SINGLE_ARG_WITH_DESC(float, DropoutGradientOp, prob); DEFINE_OP_SINGLE_ARG_WITH_DESC(float, DropoutGradientOp, ratio);
#ifdef USE_CUDNN #ifdef USE_CUDNN
......
...@@ -28,7 +28,7 @@ void CuDNNDropoutOp<Context>::DoRunWithType() { ...@@ -28,7 +28,7 @@ void CuDNNDropoutOp<Context>::DoRunWithType() {
CUDNN_CHECK(cudnnRestoreDropoutDescriptor( CUDNN_CHECK(cudnnRestoreDropoutDescriptor(
dropout_desc_, dropout_desc_,
ctx()->cudnn_handle(), ctx()->cudnn_handle(),
this->prob(), this->ratio(),
X_states->template mutable_data<uint8_t, Context>(), X_states->template mutable_data<uint8_t, Context>(),
states_size, states_size,
rng_seed_)); rng_seed_));
...@@ -37,7 +37,7 @@ void CuDNNDropoutOp<Context>::DoRunWithType() { ...@@ -37,7 +37,7 @@ void CuDNNDropoutOp<Context>::DoRunWithType() {
CUDNN_CHECK(cudnnSetDropoutDescriptor( CUDNN_CHECK(cudnnSetDropoutDescriptor(
dropout_desc_, dropout_desc_,
ctx()->cudnn_handle(), ctx()->cudnn_handle(),
this->prob(), this->ratio(),
X_states->template mutable_data<uint8_t, Context>(), X_states->template mutable_data<uint8_t, Context>(),
states_size, states_size,
rng_seed_)); rng_seed_));
...@@ -86,7 +86,7 @@ void CuDNNDropoutGradientOp<Context>::DoRunWithType() { ...@@ -86,7 +86,7 @@ void CuDNNDropoutGradientOp<Context>::DoRunWithType() {
CUDNN_CHECK(cudnnRestoreDropoutDescriptor( CUDNN_CHECK(cudnnRestoreDropoutDescriptor(
dropout_desc_, dropout_desc_,
ctx()->cudnn_handle(), ctx()->cudnn_handle(),
this->prob(), this->ratio(),
X_states->template mutable_data<uint8_t, Context>(), X_states->template mutable_data<uint8_t, Context>(),
states_size, states_size,
rng_seed_)); rng_seed_));
......
...@@ -24,14 +24,14 @@ from dragon.core.ops.utils import parse_args ...@@ -24,14 +24,14 @@ from dragon.core.ops.utils import parse_args
@OpSchema.num_inputs(1) @OpSchema.num_inputs(1)
@ArgHelper.desc('prob', as_target=False) @ArgHelper.desc('ratio', as_target=False)
def dropout(inputs, prob=0.5, scale=True, **kwargs): def dropout(inputs, ratio=0.5, **kwargs):
r"""Set the elements of the input to zero randomly. r"""Set the elements of the input to zero randomly.
`[Srivastava et.al, 2014] <http://jmlr.org/papers/v15/srivastava14a.html>`_. `[Srivastava et.al, 2014] <http://jmlr.org/papers/v15/srivastava14a.html>`_.
The **Dropout** function is defined as: The **Dropout** function is defined as:
.. math:: \text{Dropout}(x) = x * (r \sim \mathcal{B}(1, 1 - \text{prob})) .. math:: \text{Dropout}(x) = x * (r \sim \mathcal{B}(1, 1 - \text{ratio}))
Examples: Examples:
...@@ -44,10 +44,8 @@ def dropout(inputs, prob=0.5, scale=True, **kwargs): ...@@ -44,10 +44,8 @@ def dropout(inputs, prob=0.5, scale=True, **kwargs):
---------- ----------
inputs : dragon.Tensor inputs : dragon.Tensor
The input tensor. The input tensor.
prob : Union[float, dragon.Tensor], optional, default=0.2 ratio : Union[float, dragon.Tensor], optional, default=0.5
The dropping probability. The dropping ratio.
scale : bool, optional, default=True
Whether to scale the output during training.
Returns Returns
------- -------
...@@ -56,56 +54,39 @@ def dropout(inputs, prob=0.5, scale=True, **kwargs): ...@@ -56,56 +54,39 @@ def dropout(inputs, prob=0.5, scale=True, **kwargs):
""" """
args = parse_args(locals()) args = parse_args(locals())
args['prob'] = float(prob)
inplace = args.pop('inplace') if 'inplace' in args else False inplace = args.pop('inplace') if 'inplace' in args else False
op_lib = activation_ops_lib.Dropout op_lib = activation_ops_lib.Dropout
if context.executing_eagerly(): if context.executing_eagerly():
return op_lib \ return op_lib \
.instantiate(prob=args['prob'], scale=scale) \ .instantiate() \
.apply([inputs], inplace=inplace) .apply([inputs], ratio, inplace=inplace)
else: else:
return op_lib.blend(**args) return op_lib.blend(**args)
@OpSchema.num_inputs(1) @OpSchema.num_inputs(1)
@ArgHelper.desc('keep_prob', as_target=False) @ArgHelper.desc('ratio', as_target=False)
def drop_block2d( def drop_block2d(inputs, ratio=0.1, block_size=7, data_format='NCHW', **kwargs):
inputs,
block_size=7,
keep_prob=0.9,
alpha=1.,
decrement=0.,
data_format='NCHW',
**kwargs
):
r"""Set the spatial blocks over input to zero randomly. r"""Set the spatial blocks over input to zero randomly.
`[Ghiasi et.al, 2018] <https://arxiv.org/abs/1810.12890>`_. `[Ghiasi et.al, 2018] <https://arxiv.org/abs/1810.12890>`_.
The **DropBlock** function is defined as: The **DropBlock** function is defined as:
.. math:: .. math::
\text{DropBlock}(x_{ijk} = \text{DropBlock}(x_{ijk}) =
x_{ijk} * (r_{ik} \sim \mathcal{B}(1, \alpha\gamma)) \\ \quad \\ x_{ijk} * (r_{ik} \sim \mathcal{B}(1, 1 - \gamma)) \\ \quad \\
\text{where}\quad \gamma = \text{where}\quad \gamma =
\frac{\text{keep\_prob}}{\text{block\_size}^{n}} \frac{\text{ratio}}{\text{block\_size}^{n}}
\frac{\text{feat\_size}^{n}}{(\text{feat\_size} - \text{block\_size} + 1)^n} \frac{\text{feat\_size}^{n}}{(\text{feat\_size} - \text{block\_size} + 1)^n}
Set the ``decrement`` to schedule ``keep_prob`` from **1.0**.
Set the ``alpha`` to decrease :math:`\gamma` for different stages.
Parameters Parameters
---------- ----------
inputs : dragon.Tensor inputs : dragon.Tensor
The input tensor. The input tensor.
ratio : Union[float, dragon.Tensor], optional, default=0.1
The dropping ratio.
block_size : int, optional, default=7 block_size : int, optional, default=7
The size of a spatial block. The spatial block size.
keep_prob : Union[float, dragon.Tensor], optional, default=0.9
The keeping prob.
alpha : float, optional, default=1.
The value to :math:`\gamma`.
decrement : float, optional, default=0.
The decrement value to ``keep_prob``.
data_format : {'NCHW', 'NHWC'}, optional data_format : {'NCHW', 'NHWC'}, optional
The optional data format. The optional data format.
...@@ -116,43 +97,34 @@ def drop_block2d( ...@@ -116,43 +97,34 @@ def drop_block2d(
""" """
args = parse_args(locals()) args = parse_args(locals())
args['alpha'] = float(alpha) inplace = args.pop('inplace') if 'inplace' in args else False
args['decrement'] = float(decrement)
op_lib = activation_ops_lib.DropBlock2d op_lib = activation_ops_lib.DropBlock2d
if context.executing_eagerly(): if context.executing_eagerly():
return op_lib \ return op_lib \
.instantiate( .instantiate(
block_size=block_size, block_size=block_size,
keep_prob=float(args['keep_prob']),
alpha=args['alpha'],
decrement=args['decrement'],
data_format=data_format, data_format=data_format,
slot=args.get('slot', None), ).apply([inputs], ratio, inplace=inplace)
).apply([inputs], args.get('inplace', False))
else: else:
return op_lib.blend(**args) return op_lib.blend(**args)
@OpSchema.num_inputs(1) @OpSchema.num_inputs(1)
@ArgHelper.desc('prob', as_target=False) @ArgHelper.desc('ratio', as_target=False)
def drop_path(inputs, prob=0.2, increment=0., **kwargs): def drop_path(inputs, ratio=0.2, **kwargs):
r"""Set the examples over the input to zero randomly. r"""Set the examples over the input to zero randomly.
`[Larsson et.al, 2016] <https://arxiv.org/abs/1605.07648>`_. `[Larsson et.al, 2016] <https://arxiv.org/abs/1605.07648>`_.
The **DropPath** function is defined as: The **DropPath** function is defined as:
.. math:: \text{DropPath}(x_{ij}) = x_{ij} * (r_{i} \sim \mathcal{B}(1, 1 - \text{prob})) .. math:: \text{DropPath}(x_{ij}) = x_{ij} * (r_{i} \sim \mathcal{B}(1, 1 - \text{ratio}))
Set the ``increment`` to schedule ``prob`` from **0.0** after each run.
Parameters Parameters
---------- ----------
inputs : dragon.Tensor inputs : dragon.Tensor
The input tensor. The input tensor.
prob : Union[float, dragon.Tensor], optional, default=0.2 ratio : Union[float, dragon.Tensor], optional, default=0.2
The dropping prob. The dropping ratio.
increment : float, optional, default=0.0
The increment ``prob``.
Returns Returns
------- -------
...@@ -161,23 +133,18 @@ def drop_path(inputs, prob=0.2, increment=0., **kwargs): ...@@ -161,23 +133,18 @@ def drop_path(inputs, prob=0.2, increment=0., **kwargs):
""" """
args = parse_args(locals()) args = parse_args(locals())
args['prob'] = float(prob)
args['increment'] = float(increment)
inplace = args.pop('inplace') if 'inplace' in args else False inplace = args.pop('inplace') if 'inplace' in args else False
op_lib = activation_ops_lib.DropPath op_lib = activation_ops_lib.DropPath
if context.executing_eagerly(): if context.executing_eagerly():
return op_lib \ return op_lib \
.instantiate( .instantiate() \
prob=args['prob'], .apply([inputs], ratio, inplace=inplace)
increment=args['increment'],
slot=args.get('slot', None),
).apply([inputs], inplace=inplace)
else: else:
return op_lib.blend(**args) return op_lib.blend(**args)
@OpSchema.num_inputs(1) @OpSchema.num_inputs(1)
def elu(inputs, alpha=1., **kwargs): def elu(inputs, alpha=1.0, **kwargs):
r"""Apply the exponential linear unit. r"""Apply the exponential linear unit.
`[Clevert et.al, 2015] <https://arxiv.org/abs/1511.07289>`_. `[Clevert et.al, 2015] <https://arxiv.org/abs/1511.07289>`_.
...@@ -201,7 +168,7 @@ def elu(inputs, alpha=1., **kwargs): ...@@ -201,7 +168,7 @@ def elu(inputs, alpha=1., **kwargs):
---------- ----------
inputs : dragon.Tensor inputs : dragon.Tensor
The input tensor. The input tensor.
alpha : float, optional, default=1. alpha : float, optional, default=1.0
The value to :math:`\alpha`. The value to :math:`\alpha`.
Returns Returns
......
...@@ -32,63 +32,57 @@ class Activation(Operator): ...@@ -32,63 +32,57 @@ class Activation(Operator):
return self.dispatch(inputs, outputs) return self.dispatch(inputs, outputs)
class Dropout(Activation): class Dropout(Operator):
"""Dropout operator.""" """Dropout operator."""
def __init__(self, key, dev, **kwargs): def __init__(self, key, dev, **kwargs):
super(Dropout, self).__init__(key, dev, **kwargs) super(Dropout, self).__init__(key, dev, **kwargs)
self.prob = kwargs.get('prob', 0.5)
self.scale = kwargs.get('scale', True)
def attributes(self): def attributes(self):
return { return {
'op_type': 'Dropout', 'op_type': 'Dropout',
'arguments': { 'arguments': {'ratio_desc': '${HANDLE}/ratio'},
'prob': self.prob,
'scale': self.scale,
}
} }
def feed(self, ws, handle, ratio):
self.feed_arg(ws, '{}/ratio'.format(handle), ratio, 'float32')
def forward(self, inputs, ratio, inplace=False):
outputs = [self.alloc(inputs[0]) if inplace else self.alloc()]
return self.dispatch(inputs, outputs,
callback=lambda ws, handle:
self.feed(ws, handle, ratio))
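# Note: 'ratio' is fed at call time through the '${HANDLE}/ratio' descriptor
# above, so it may be a python float or a dragon.Tensor; DropBlock2d and
# DropPath below inherit this feed path.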
class DropBlock2d(Activation): class DropBlock2d(Dropout):
"""DropBlock2d operator.""" """DropBlock2d operator."""
def __init__(self, key, dev, **kwargs): def __init__(self, key, dev, **kwargs):
super(DropBlock2d, self).__init__(key, dev, **kwargs) super(DropBlock2d, self).__init__(key, dev, **kwargs)
self.block_size = kwargs.get('block_size', 7) self.block_size = kwargs.get('block_size', 7)
self.keep_prob = kwargs.get('keep_prob', 0.9)
self.alpha = kwargs.get('alpha', 1.)
self.decrement = kwargs.get('decrement', 0.)
self.data_format = kwargs.get('data_format', 'NCHW') self.data_format = kwargs.get('data_format', 'NCHW')
def attributes(self): def attributes(self):
return { return {
'op_type': 'DropBlock2d', 'op_type': 'DropBlock2d',
'arguments': { 'arguments': {
'ratio_desc': '${HANDLE}/ratio',
'block_size': self.block_size, 'block_size': self.block_size,
'keep_prob': self.keep_prob,
'alpha': self.alpha,
'decrement': self.decrement,
'data_format': self.data_format, 'data_format': self.data_format,
}, },
} }
class DropPath(Activation): class DropPath(Dropout):
"""DropPath operator.""" """DropPath operator."""
def __init__(self, key, dev, **kwargs): def __init__(self, key, dev, **kwargs):
super(DropPath, self).__init__(key, dev, **kwargs) super(DropPath, self).__init__(key, dev, **kwargs)
self.prob = kwargs.get('prob', 0.2)
self.increment = kwargs.get('increment', 0.)
def attributes(self): def attributes(self):
return { return {
'op_type': 'DropPath', 'op_type': 'DropPath',
'arguments': { 'arguments': {'ratio_desc': '${HANDLE}/ratio'},
'prob': self.prob,
'increment': self.increment,
}
} }
......
...@@ -36,6 +36,28 @@ __global__ void _RowwiseReduce( ...@@ -36,6 +36,28 @@ __global__ void _RowwiseReduce(
} }
} }
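// The half overloads below read FP16 inputs but accumulate and block-reduce
// in FP32, then convert the scaled result back to half.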
template <class Reducer>
__global__ void _RowwiseReduce(
const int rows,
const int cols,
const Reducer reducer,
const float init,
const float scale,
const half* x,
half* y) {
__shared__ typename BlockReduce<float>::TempStorage storage;
CUDA_2D_KERNEL_LOOP1(i, cols) {
float val = init;
CUDA_2D_KERNEL_LOOP2(j, rows) {
val = reducer(val, __half2float(x[j * cols + i]));
}
val = BlockReduce<float>(storage).Reduce(val, reducer);
if (threadIdx.x == 0) {
y[i] = __float2half(val * scale);
}
}
}
template <typename T, class Reducer> template <typename T, class Reducer>
__global__ void _ColwiseReduce( __global__ void _ColwiseReduce(
const int rows, const int rows,
...@@ -58,6 +80,28 @@ __global__ void _ColwiseReduce( ...@@ -58,6 +80,28 @@ __global__ void _ColwiseReduce(
} }
} }
template <class Reducer>
__global__ void _ColwiseReduce(
const int rows,
const int cols,
const Reducer reducer,
const float init,
const float scale,
const half* x,
half* y) {
__shared__ typename BlockReduce<float>::TempStorage storage;
CUDA_2D_KERNEL_LOOP1(i, rows) {
float val = init;
CUDA_2D_KERNEL_LOOP2(j, cols) {
val = reducer(val, __half2float(x[i * cols + j]));
}
val = BlockReduce<float>(storage).Reduce(val, reducer);
if (threadIdx.x == 0) {
y[i] = __float2half(val * scale);
}
}
}
template <typename T, class Reducer, int D> template <typename T, class Reducer, int D>
__global__ void _GenericReduce( __global__ void _GenericReduce(
const int rows, const int rows,
...@@ -89,6 +133,37 @@ __global__ void _GenericReduce( ...@@ -89,6 +133,37 @@ __global__ void _GenericReduce(
} }
} }
template <class Reducer, int D>
__global__ void _GenericReduce(
const int rows,
const int cols,
const int num_dims,
const SimpleArray<int, D> x_dims,
const SimpleArray<int, D> x_strides,
const Reducer reducer,
const float init,
const float scale,
const half* x,
half* y) {
__shared__ typename BlockReduce<float>::TempStorage storage;
CUDA_2D_KERNEL_LOOP1(i, rows) {
float val = init;
CUDA_2D_KERNEL_LOOP2(j, cols) {
int xi = 0, c = i * cols + j;
for (int d = num_dims - 1; d >= 0; --d) {
int r;
FIXED_DIVISOR_DIV_MOD(x_dims.data[d], c, &c, &r);
xi += r * x_strides.data[d];
}
val = reducer(val, __half2float(x[xi]));
}
val = BlockReduce<float>(storage).Reduce(val, reducer);
if (threadIdx.x == 0) {
y[i] = __float2half(val * scale);
}
}
}
#define DEFINE_REDUCE_FUNCTION(name) \ #define DEFINE_REDUCE_FUNCTION(name) \
template <typename T, typename Reducer> \ template <typename T, typename Reducer> \
int _Reduce##name( \ int _Reduce##name( \
...@@ -127,12 +202,12 @@ __global__ void _GenericReduce( ...@@ -127,12 +202,12 @@ __global__ void _GenericReduce(
return 0; \ return 0; \
} \ } \
int rows, cols; \ int rows, cols; \
vec32_t y_dims(dims, dims + num_dims); \ vec32_t out_dims(dims, dims + num_dims); \
for (int i = 0; i < num_axes; ++i) \ for (int i = 0; i < num_axes; ++i) { \
y_dims[axes[i]] = 1; \ out_dims[axes[i]] = 1; \
/*! Case #1: Rowwise Reduce */ \ } \
if (utils::math::IsRowwiseReduce( \ if (utils::math::IsRowwiseReduce( \
num_dims, dims, y_dims.data(), &rows, &cols)) { \ num_dims, dims, out_dims.data(), &rows, &cols)) { \
_RowwiseReduce<<< \ _RowwiseReduce<<< \
CUDA_2D_BLOCKS(cols), \ CUDA_2D_BLOCKS(cols), \
CUDA_THREADS, \ CUDA_THREADS, \
...@@ -141,9 +216,8 @@ __global__ void _GenericReduce( ...@@ -141,9 +216,8 @@ __global__ void _GenericReduce(
rows, cols, reducer, init, cast::to<T>(scale), x, y); \ rows, cols, reducer, init, cast::to<T>(scale), x, y); \
return 1; \ return 1; \
} \ } \
/*! Case #2: Colwise Reduce */ \
if (utils::math::IsColwiseReduce( \ if (utils::math::IsColwiseReduce( \
num_dims, dims, y_dims.data(), &rows, &cols)) { \ num_dims, dims, out_dims.data(), &rows, &cols)) { \
_ColwiseReduce<<< \ _ColwiseReduce<<< \
CUDA_2D_BLOCKS(rows), \ CUDA_2D_BLOCKS(rows), \
CUDA_THREADS, \ CUDA_THREADS, \
...@@ -152,20 +226,25 @@ __global__ void _GenericReduce( ...@@ -152,20 +226,25 @@ __global__ void _GenericReduce(
rows, cols, reducer, init, cast::to<T>(scale), x, y); \ rows, cols, reducer, init, cast::to<T>(scale), x, y); \
return 2; \ return 2; \
} \ } \
/*! Case #3: Generic Reduce */ \
CUDA_TENSOR_DIMS_CHECK(num_dims); \ CUDA_TENSOR_DIMS_CHECK(num_dims); \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> axesT, stridesT, dimsT; \ SimpleArray<int, CUDA_TENSOR_MAX_DIMS> transpose_axes; \
utils::math::TransposeAxesForReduce(num_dims, num_axes, axes, axesT.data); \ SimpleArray<int, CUDA_TENSOR_MAX_DIMS> transpose_strides; \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> transpose_dims; \
utils::math::TransposeAxesForReduce( \
num_dims, num_axes, axes, transpose_axes.data); \
utils::math::ComputeTransposeStrides( \ utils::math::ComputeTransposeStrides( \
num_dims, dims, axesT.data, stridesT.data); \ num_dims, dims, transpose_axes.data, transpose_strides.data); \
rows = cols = 1; \ rows = cols = 1; \
const int pivot = num_dims - num_axes; \ const int pivot = num_dims - num_axes; \
for (int i = 0; i < pivot; ++i) \ for (int i = 0; i < pivot; ++i) { \
rows *= dims[axesT.data[i]]; \ rows *= dims[transpose_axes.data[i]]; \
for (int i = pivot; i < num_dims; ++i) \ } \
cols *= dims[axesT.data[i]]; \ for (int i = pivot; i < num_dims; ++i) { \
for (int i = 0; i < num_dims; ++i) \ cols *= dims[transpose_axes.data[i]]; \
dimsT.data[i] = dims[axesT.data[i]]; \ } \
for (int i = 0; i < num_dims; ++i) { \
transpose_dims.data[i] = dims[transpose_axes.data[i]]; \
} \
_GenericReduce<<< \ _GenericReduce<<< \
CUDA_2D_BLOCKS(rows), \ CUDA_2D_BLOCKS(rows), \
CUDA_THREADS, \ CUDA_THREADS, \
...@@ -174,8 +253,8 @@ __global__ void _GenericReduce( ...@@ -174,8 +253,8 @@ __global__ void _GenericReduce(
rows, \ rows, \
cols, \ cols, \
num_dims, \ num_dims, \
dimsT, \ transpose_dims, \
stridesT, \ transpose_strides, \
reducer, \ reducer, \
init, \ init, \
cast::to<T>(scale), \ cast::to<T>(scale), \
...@@ -193,6 +272,91 @@ DEFINE_REDUCE_FUNCTION(Sum); ...@@ -193,6 +272,91 @@ DEFINE_REDUCE_FUNCTION(Sum);
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
// Fall back to an FP32 accumulator instead of FP16.
// Empirically, an FP16 accumulator drops too many small values.
template <>
void ReduceSum<float16, CUDAContext>(
const int num_dims,
const int* dims,
const int num_axes,
const int* axes,
const float scale,
const float16* x,
float16* y,
CUDAContext* ctx) {
  // NB: A device-wide reduce cannot be dispatched with an FP32
  // accumulator for the FP16 input type, so the block-reduce kernels
  // above are used instead. Performance may drop in some cases.
int rows, cols;
vec32_t out_dims(dims, dims + num_dims);
for (int i = 0; i < num_axes; ++i) {
out_dims[axes[i]] = 1;
}
if (utils::math::IsRowwiseReduce(
num_dims, dims, out_dims.data(), &rows, &cols)) {
_RowwiseReduce<<<
CUDA_2D_BLOCKS(cols),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(
rows,
cols,
math::PlusFunctor<float>(),
0.f,
scale,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
return;
}
if (utils::math::IsColwiseReduce(
num_dims, dims, out_dims.data(), &rows, &cols)) {
_ColwiseReduce<<<
CUDA_2D_BLOCKS(rows),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(
rows,
cols,
math::PlusFunctor<float>(),
0.f,
scale,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
return;
}
CUDA_TENSOR_DIMS_CHECK(num_dims);
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> transpose_axes;
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> transpose_strides;
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> transpose_dims;
utils::math::TransposeAxesForReduce(
num_dims, num_axes, axes, transpose_axes.data);
utils::math::ComputeTransposeStrides(
num_dims, dims, transpose_axes.data, transpose_strides.data);
rows = cols = 1;
const int pivot = num_dims - num_axes;
for (int i = 0; i < pivot; ++i) {
rows *= dims[transpose_axes.data[i]];
}
for (int i = pivot; i < num_dims; ++i) {
cols *= dims[transpose_axes.data[i]];
}
for (int i = 0; i < num_dims; ++i) {
transpose_dims.data[i] = dims[transpose_axes.data[i]];
}
_GenericReduce<<<CUDA_2D_BLOCKS(rows), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
rows,
cols,
num_dims,
transpose_dims,
transpose_strides,
math::PlusFunctor<float>(),
0.f,
scale,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
}
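Editor's note: a host-side NumPy illustration (not Dragon code) of why the FP32 accumulator matters. Sequentially accumulating many small FP16 values in FP16 stalls once the running sum's half-ULP exceeds the addend, while an FP32 accumulator preserves them:

import numpy as np

x = np.full(10000, 1e-3, dtype=np.float16)   # true sum is about 10.0
acc16 = np.float16(0)
for v in x:
    acc16 = np.float16(acc16 + v)            # FP16 accumulator: stalls far below 10
acc32 = np.float32(0)
for v in x:
    acc32 += np.float32(v)                   # FP32 accumulator: close to 10
print(float(acc16), float(acc32))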
#define DEFINE_KERNEL_LAUNCHER(name, T, Reducer, kInit) \ #define DEFINE_KERNEL_LAUNCHER(name, T, Reducer, kInit) \
template <> \ template <> \
void Reduce##name<T, CUDAContext>( \ void Reduce##name<T, CUDAContext>( \
...@@ -294,7 +458,6 @@ DEFINE_KERNEL_LAUNCHER(Sum, int8_t, math::PlusFunctor, int8_t(0)); ...@@ -294,7 +458,6 @@ DEFINE_KERNEL_LAUNCHER(Sum, int8_t, math::PlusFunctor, int8_t(0));
DEFINE_KERNEL_LAUNCHER(Sum, uint8_t, math::PlusFunctor, uint8_t(0)); DEFINE_KERNEL_LAUNCHER(Sum, uint8_t, math::PlusFunctor, uint8_t(0));
DEFINE_KERNEL_LAUNCHER(Sum, int, math::PlusFunctor, int(0)); DEFINE_KERNEL_LAUNCHER(Sum, int, math::PlusFunctor, int(0));
DEFINE_KERNEL_LAUNCHER(Sum, int64_t, math::PlusFunctor, int64_t(0)); DEFINE_KERNEL_LAUNCHER(Sum, int64_t, math::PlusFunctor, int64_t(0));
DEFINE_KERNEL_LAUNCHER(Sum, float16, math::PlusFunctor, cast::to<float16>(0.f));
DEFINE_KERNEL_LAUNCHER(Sum, float, math::PlusFunctor, 0.f); DEFINE_KERNEL_LAUNCHER(Sum, float, math::PlusFunctor, 0.f);
DEFINE_KERNEL_LAUNCHER(Sum, double, math::PlusFunctor, 0.); DEFINE_KERNEL_LAUNCHER(Sum, double, math::PlusFunctor, 0.);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
......
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
 * along with the software. If not, see,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef DRAGON_UTILS_MATH_SORT_H_
#define DRAGON_UTILS_MATH_SORT_H_
#endif // DRAGON_UTILS_MATH_SORT_H_
...@@ -33,7 +33,7 @@ void ApplyMask( ...@@ -33,7 +33,7 @@ void ApplyMask(
template <typename T, class Context> template <typename T, class Context>
void Dropout( void Dropout(
const int count, const int count,
const float prob, const float ratio,
const float scale, const float scale,
const T* x, const T* x,
uint8_t* mask, uint8_t* mask,
......
...@@ -133,7 +133,7 @@ class Dropout(Layer): ...@@ -133,7 +133,7 @@ class Dropout(Layer):
The **Dropout** function is defined as: The **Dropout** function is defined as:
.. math:: \text{Dropout}(x) = x * \text{Bernoulli}(p=1 - prob) .. math:: \text{Dropout}(x) = x * (r \sim \mathcal{B}(1, 1 - \text{rate}))
Examples: Examples:
...@@ -150,7 +150,7 @@ class Dropout(Layer): ...@@ -150,7 +150,7 @@ class Dropout(Layer):
Parameters Parameters
---------- ----------
rate : Union[float, dragon.Tensor] rate : Union[float, dragon.Tensor]
The dropping probability. The dropping ratio.
""" """
super(Dropout, self).__init__(**kwargs) super(Dropout, self).__init__(**kwargs)
...@@ -161,7 +161,7 @@ class Dropout(Layer): ...@@ -161,7 +161,7 @@ class Dropout(Layer):
if self.trainable: if self.trainable:
return activation_ops.dropout( return activation_ops.dropout(
inputs, inputs,
prob=self.rate, ratio=self.rate,
inplace=self.inplace, inplace=self.inplace,
) )
return inputs return inputs
...@@ -238,8 +238,7 @@ class Permute(Layer): ...@@ -238,8 +238,7 @@ class Permute(Layer):
if sorted(dims) != list(range(1, len(dims) + 1)): if sorted(dims) != list(range(1, len(dims) + 1)):
raise ValueError( raise ValueError(
'Argument <dims> should be consecutive and start from 1.\n' 'Argument <dims> should be consecutive and start from 1.\n'
'Got {}'.format(str(dims)) 'Got {}'.format(str(dims)))
)
self.input_spec = InputSpec(ndim=len(self.dims) + 1) self.input_spec = InputSpec(ndim=len(self.dims) + 1)
def call(self, inputs): def call(self, inputs):
......
...@@ -408,7 +408,7 @@ def dropout(x, rate, name=None, **kwargs): ...@@ -408,7 +408,7 @@ def dropout(x, rate, name=None, **kwargs):
x : dragon.Tensor x : dragon.Tensor
The tensor :math:`x`. The tensor :math:`x`.
rate : Union[float, dragon.Tensor] rate : Union[float, dragon.Tensor]
The dropping probability. The dropping ratio.
name : str, optional name : str, optional
The operation name. The operation name.
......
...@@ -97,14 +97,14 @@ class TestActivationOps(OpTestCase): ...@@ -97,14 +97,14 @@ class TestActivationOps(OpTestCase):
self.cudnn_ws = dragon.Workspace() self.cudnn_ws = dragon.Workspace()
def test_dropout(self): def test_dropout(self):
prob = 0. ratio = 0.
for execution in ('EAGER_MODE', 'GRAPH_MODE'): for execution in ('EAGER_MODE', 'GRAPH_MODE'):
with execution_context().mode(execution): with execution_context().mode(execution):
data = uniform((2, 3)) data = uniform((2, 3))
x = new_tensor(data) x = new_tensor(data)
with dragon.GradientTape() as tape: with dragon.GradientTape() as tape:
tape.watch(x) tape.watch(x)
y = dragon.nn.dropout(x, prob=prob) y = dragon.nn.dropout(x, ratio=ratio)
dx = tape.gradient(y, [x], output_gradients=[x])[0] dx = tape.gradient(y, [x], output_gradients=[x])[0]
self.assertEqual([y, dx], [data, data]) self.assertEqual([y, dx], [data, data])
...@@ -121,7 +121,7 @@ class TestActivationOps(OpTestCase): ...@@ -121,7 +121,7 @@ class TestActivationOps(OpTestCase):
self.test_dropout() self.test_dropout()
def test_drop_block2d(self): def test_drop_block2d(self):
keep_prob, block_size = 1., 2 ratio, block_size = 0., 2
for execution in ('EAGER_MODE', 'GRAPH_MODE'): for execution in ('EAGER_MODE', 'GRAPH_MODE'):
with execution_context().mode(execution): with execution_context().mode(execution):
for data_format in ('NCHW', 'NHWC'): for data_format in ('NCHW', 'NHWC'):
...@@ -132,8 +132,8 @@ class TestActivationOps(OpTestCase): ...@@ -132,8 +132,8 @@ class TestActivationOps(OpTestCase):
tape.watch(x) tape.watch(x)
y = dragon.nn.drop_block2d( y = dragon.nn.drop_block2d(
x, x,
ratio=ratio,
block_size=block_size, block_size=block_size,
keep_prob=keep_prob,
data_format=data_format) data_format=data_format)
dx = tape.gradient(y, [x], output_gradients=[x])[0] dx = tape.gradient(y, [x], output_gradients=[x])[0]
self.assertEqual([y, dx], [data, data]) self.assertEqual([y, dx], [data, data])
...@@ -145,14 +145,14 @@ class TestActivationOps(OpTestCase): ...@@ -145,14 +145,14 @@ class TestActivationOps(OpTestCase):
self.test_drop_block2d() self.test_drop_block2d()
def test_drop_path(self): def test_drop_path(self):
prob = 0. ratio = 0.
for execution in ('EAGER_MODE', 'GRAPH_MODE'): for execution in ('EAGER_MODE', 'GRAPH_MODE'):
with execution_context().mode(execution): with execution_context().mode(execution):
data = uniform((2, 3)) data = uniform((2, 3))
x = new_tensor(data) x = new_tensor(data)
with dragon.GradientTape() as tape: with dragon.GradientTape() as tape:
tape.watch(x) tape.watch(x)
y = dragon.nn.drop_path(x, prob=prob) y = dragon.nn.drop_path(x, ratio=ratio)
dx = tape.gradient(y, [x], output_gradients=[x])[0] dx = tape.gradient(y, [x], output_gradients=[x])[0]
self.assertEqual([y, dx], [data, data]) self.assertEqual([y, dx], [data, data])
......
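Editor's note: a minimal usage sketch of the renamed keyword arguments exercised by these tests; uniform and new_tensor are helpers from this test module, and the shapes are hypothetical:

import dragon

x = new_tensor(uniform((2, 3)))
y = dragon.nn.dropout(x, ratio=0.5)                        # was: prob=0.5
y = dragon.nn.drop_path(x, ratio=0.2)                      # was: prob=0.2
x4d = new_tensor(uniform((2, 3, 4, 4)))
y = dragon.nn.drop_block2d(x4d, ratio=0.1, block_size=2)   # was: keep_prob=0.9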
...@@ -35,6 +35,10 @@ TESTS_AND_SOURCES = [ ...@@ -35,6 +35,10 @@ TESTS_AND_SOURCES = [
('torch/test_torch', 'dragon.vm.torch.core'), ('torch/test_torch', 'dragon.vm.torch.core'),
] ]
DISTRIBUTED_BLOCKLIST = [
'dragon/test_distributed',
]
TESTS = [t[0] for t in TESTS_AND_SOURCES] TESTS = [t[0] for t in TESTS_AND_SOURCES]
SOURCES = [t[1] for t in TESTS_AND_SOURCES] SOURCES = [t[1] for t in TESTS_AND_SOURCES]
...@@ -66,6 +70,10 @@ def parse_args(): ...@@ -66,6 +70,10 @@ def parse_args():
metavar='TESTS', metavar='TESTS',
default=[], default=[],
help='select a set of tests to exclude') help='select a set of tests to exclude')
parser.add_argument(
'--ignore-distributed-blocklist',
action='store_true',
help='always run blocklisted distributed tests')
return parser.parse_args() return parser.parse_args()
...@@ -95,6 +103,9 @@ def main(): ...@@ -95,6 +103,9 @@ def main():
base_command = get_base_command(args) base_command = get_base_command(args)
tests, sources = get_selected_tests(args, TESTS, SOURCES) tests, sources = get_selected_tests(args, TESTS, SOURCES)
for i, test in enumerate(tests): for i, test in enumerate(tests):
if (test in DISTRIBUTED_BLOCKLIST and
not args.ignore_distributed_blocklist):
continue
command = base_command[:] command = base_command[:]
if args.coverage: if args.coverage:
if sources[i]: if sources[i]:
......
...@@ -292,26 +292,26 @@ class TestModules(OpTestCase): ...@@ -292,26 +292,26 @@ class TestModules(OpTestCase):
self.assertEqual(y, result) self.assertEqual(y, result)
def test_dropout(self): def test_dropout(self):
prob = 0. p = 0.
data = uniform((2, 3)) data = uniform((2, 3))
x = new_tensor(data) x = new_tensor(data)
m = torch.nn.Dropout(p=prob, inplace=True) m = torch.nn.Dropout(p, inplace=True)
y, _ = m(x), repr(m) y, _ = m(x), repr(m)
self.assertEqual(y, data) self.assertEqual(y, data)
def test_drop_block2d(self): def test_drop_block2d(self):
keep_prob = 1. p = 0.
data = uniform((2, 3, 4, 4)) data = uniform((2, 3, 4, 4))
x = new_tensor(data) x = new_tensor(data)
m = torch.nn.DropBlock2d(kp=keep_prob, block_size=2, inplace=True) m = torch.nn.DropBlock2d(p, block_size=2, inplace=True)
y, _ = m(x), repr(m) y, _ = m(x), repr(m)
self.assertEqual(y, data) self.assertEqual(y, data)
def test_drop_path(self): def test_drop_path(self):
prob = 0. p = 0.
data = uniform((2, 3)) data = uniform((2, 3))
x = new_tensor(data) x = new_tensor(data)
m = torch.nn.DropPath(p=prob, inplace=True) m = torch.nn.DropPath(p, inplace=True)
y, _ = m(x), repr(m) y, _ = m(x), repr(m)
self.assertEqual(y, data) self.assertEqual(y, data)
......
...@@ -427,16 +427,16 @@ def dropout(input, p=0.5, training=True, inplace=False): ...@@ -427,16 +427,16 @@ def dropout(input, p=0.5, training=True, inplace=False):
The **Dropout** function is defined as: The **Dropout** function is defined as:
.. math:: \text{Dropout}(x) = x * \text{Bernoulli}(p=1 - prob) .. math:: \text{Dropout}(x) = x * (r \sim \mathcal{B}(1, 1 - p))
Parameters Parameters
---------- ----------
input : dragon.vm.torch.Tensor input : dragon.vm.torch.Tensor
The input tensor. The input tensor.
p : float, optional, default=0.5 p : float, optional, default=0.5
The dropping prob. The dropping ratio.
training : bool, optional, default=True training : bool, optional, default=True
The training flag. Apply dropping if **True**.
inplace : bool, optional, default=False inplace : bool, optional, default=False
Whether to do the operation in-place. Whether to do the operation in-place.
...@@ -453,50 +453,40 @@ def dropout(input, p=0.5, training=True, inplace=False): ...@@ -453,50 +453,40 @@ def dropout(input, p=0.5, training=True, inplace=False):
if not training: if not training:
return input return input
return _functions.Dropout \ return _functions.Dropout \
.instantiate( .instantiate(input.device) \
input.device, .apply(input, p, inplace=inplace)
p=p,
).apply(input, inplace=inplace)
def drop_block2d( def drop_block2d(
input, input,
kp=0.9, p=0.1,
block_size=7, block_size=7,
alpha=1.,
decrement=0.,
training=True, training=True,
inplace=False, inplace=False,
slot=None,
): ):
r"""Set the spatial blocks over input to zero randomly. r"""Set the spatial blocks over input to zero randomly.
The **DropBlock** function is defined as: The **DropBlock** function is defined as:
.. math:: .. math::
\text{DropBlock}(x) = x \cdot \text{Bernoulli}(\alpha\cdot\gamma) \\ \text{DropBlock}(x_{ijk}) =
\quad \\ \text{where}\quad \gamma = x_{ijk} * (r_{ik} \sim \mathcal{B}(1, 1 - \gamma)) \\ \quad \\
\frac{keep\_prob}{block\_size^{n}} \text{where}\quad \gamma =
\frac{feat\_size^{n}}{(feat\_size - block\_size + 1)^n} \frac{p}{\text{block\_size}^{n}}
\frac{\text{feat\_size}^{n}}{(\text{feat\_size} - \text{block\_size} + 1)^n}
Parameters Parameters
---------- ----------
input : dragon.vm.torch.Tensor input : dragon.vm.torch.Tensor
The input tensor. The input tensor.
kp : float, optional, default=0.9 p : float, optional, default=0.1
The keeping prob. The dropping ratio.
block_size : int, optional, default=7 block_size : int, optional, default=7
The size of a spatial block. The spatial block size.
alpha : float, optional, default=1.
The scale factor to :math:`\gamma`.
decrement : float, optional, default=0.
The decrement value to ``kp``.
training : bool, optional, default=True training : bool, optional, default=True
The training flag. Apply dropping if **True**.
inplace : bool, optional, default=False inplace : bool, optional, default=False
Whether to do the operation in-place. Whether to do the operation in-place.
slot : int, optional
The optional slot index.
Returns Returns
------- -------
...@@ -513,28 +503,17 @@ def drop_block2d( ...@@ -513,28 +503,17 @@ def drop_block2d(
return _functions.DropBlock2d \ return _functions.DropBlock2d \
.instantiate( .instantiate(
input.device, input.device,
keep_prob=kp,
block_size=block_size, block_size=block_size,
alpha=alpha, ).apply(input, p, inplace=inplace)
decrement=decrement,
slot=slot,
).apply(input, inplace=inplace)
def drop_path( def drop_path(input, p=0.2, training=True, inplace=False):
input,
p=0.2,
increment=0.,
training=True,
inplace=False,
slot=None,
):
r"""Set the examples over input to zero randomly. r"""Set the examples over input to zero randomly.
    `[Larsson et al., 2016] <https://arxiv.org/abs/1605.07648>`_. `[Larsson et al., 2016] <https://arxiv.org/abs/1605.07648>`_.
The **DropPath** function is defined as: The **DropPath** function is defined as:
.. math:: \text{DropPath}(x) = x * \text{Bernoulli}(p=1 - prob) .. math:: \text{DropPath}(x_{ij}) = x_{ij} * (r_{i} \sim \mathcal{B}(1, 1 - p))
Parameters Parameters
---------- ----------
...@@ -542,14 +521,10 @@ def drop_path( ...@@ -542,14 +521,10 @@ def drop_path(
The input tensor. The input tensor.
p : float, optional, default=0.2 p : float, optional, default=0.2
The dropping prob. The dropping prob.
increment : float, optional, default=0.
The increment value to ``p``.
training : bool, optional, default=True training : bool, optional, default=True
The training flag. Apply dropping if **True**.
inplace : bool, optional, default=False inplace : bool, optional, default=False
Whether to do the operation in-place. Whether to do the operation in-place.
slot : int, optional
The optional slot index.
Returns Returns
------- -------
...@@ -564,12 +539,8 @@ def drop_path( ...@@ -564,12 +539,8 @@ def drop_path(
if not training: if not training:
return input return input
return _functions.DropPath \ return _functions.DropPath \
.instantiate( .instantiate(input.device) \
input.device, .apply(input, p, inplace=inplace)
p=p,
increment=increment,
slot=slot,
).apply(input, inplace=inplace)
def elu(input, alpha=1., inplace=False): def elu(input, alpha=1., inplace=False):
......
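Editor's note: a minimal sketch of the simplified functional signatures after this change; the import path is assumed from the vm.torch layout, and x / x4d stand for dragon.vm.torch tensors created elsewhere:

from dragon.vm.torch.nn import functional as F

y = F.dropout(x, p=0.5, training=True)
y = F.drop_path(x, p=0.2, training=True)                      # increment/slot removed
y = F.drop_block2d(x4d, p=0.1, block_size=7, training=True)   # keep_prob/alpha/decrement removed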
...@@ -169,55 +169,56 @@ class DepthwiseConv2d(_ConvNd): ...@@ -169,55 +169,56 @@ class DepthwiseConv2d(_ConvNd):
super(DepthwiseConv2d, self).__init__(key, dev, **kwargs) super(DepthwiseConv2d, self).__init__(key, dev, **kwargs)
class DropBlock2d(_Activation): class Dropout(function.Function):
"""Dropout function."""
def __init__(self, key, dev, **kwargs):
super(Dropout, self).__init__(key, dev, **kwargs)
def attributes(self):
return {
'op_type': 'Dropout',
'arguments': {'ratio_desc': '${HANDLE}/ratio'},
}
def feed(self, ws, handle, ratio):
self.feed_arg(ws, '{}/ratio'.format(handle), ratio, 'float32')
def forward(self, input, ratio, inplace=False):
out = input if inplace else self.alloc()
return self.dispatch([input], [out],
callback=lambda ws, handle:
self.feed(ws, handle, ratio))
class DropBlock2d(Dropout):
"""DropBlock2d function.""" """DropBlock2d function."""
def __init__(self, key, dev, **kwargs): def __init__(self, key, dev, **kwargs):
super(DropBlock2d, self).__init__(key, dev, **kwargs) super(DropBlock2d, self).__init__(key, dev, **kwargs)
self.block_size = kwargs.get('block_size', 7) self.block_size = kwargs.get('block_size', 7)
self.keep_prob = kwargs.get('keep_prob', 0.9)
self.alpha = kwargs.get('alpha', 1.)
self.decrement = kwargs.get('decrement', 0.)
def attributes(self): def attributes(self):
return { return {
'op_type': 'DropBlock2d', 'op_type': 'DropBlock2d',
'arguments': { 'arguments': {
'ratio_desc': '${HANDLE}/ratio',
'block_size': self.block_size, 'block_size': self.block_size,
'keep_prob': self.keep_prob,
'alpha': self.alpha,
'decrement': self.decrement,
'data_format': 'NCHW', 'data_format': 'NCHW',
} }
} }
class Dropout(_Activation): class DropPath(Dropout):
"""Dropout function."""
def __init__(self, key, dev, **kwargs):
super(Dropout, self).__init__(key, dev, **kwargs)
self.p = kwargs.get('p', 0.5)
def attributes(self):
return {'op_type': 'Dropout', 'arguments': {'prob': self.p}}
class DropPath(_Activation):
"""DropPath function.""" """DropPath function."""
def __init__(self, key, dev, **kwargs): def __init__(self, key, dev, **kwargs):
super(DropPath, self).__init__(key, dev, **kwargs) super(DropPath, self).__init__(key, dev, **kwargs)
self.p = kwargs.get('p', 0.2)
self.increment = kwargs.get('increment', 0.)
def attributes(self): def attributes(self):
return { return {
'op_type': 'DropPath', 'op_type': 'DropPath',
'arguments': { 'arguments': {'ratio_desc': '${HANDLE}/ratio'},
'prob': self.p,
'increment': self.increment,
}
} }
......
...@@ -25,10 +25,10 @@ class DropBlock2d(Module): ...@@ -25,10 +25,10 @@ class DropBlock2d(Module):
The **DropBlock** function is defined as: The **DropBlock** function is defined as:
.. math:: .. math::
\text{DropBlock}(x_{ijk} = \text{DropBlock}(x_{ijk}) =
x_{ijk} * (r_{ik} \sim \mathcal{B}(1, \alpha\gamma)) \\ \quad \\ x_{ijk} * (r_{ik} \sim \mathcal{B}(1, 1 - \gamma)) \\ \quad \\
\text{where}\quad \gamma = \text{where}\quad \gamma =
\frac{\text{keep\_prob}}{\text{block\_size}^{n}} \frac{p}{\text{block\_size}^{n}}
\frac{\text{feat\_size}^{n}}{(\text{feat\_size} - \text{block\_size} + 1)^n} \frac{\text{feat\_size}^{n}}{(\text{feat\_size} - \text{block\_size} + 1)^n}
Examples: Examples:
...@@ -45,56 +45,35 @@ class DropBlock2d(Module): ...@@ -45,56 +45,35 @@ class DropBlock2d(Module):
""" """
# Store the global unique slot index def __init__(self, p=0.1, block_size=7, inplace=False):
_DEFAULT_UNIQUE_SLOT_ID = 0
def __init__(
self,
kp=0.9,
block_size=7,
alpha=1.,
decrement=0.,
inplace=False,
):
r"""Create a ``DropBlock2d`` module. r"""Create a ``DropBlock2d`` module.
Parameters Parameters
---------- ----------
kp : float, optional, default=0.9 p : float, optional, default=0.1
The keeping prob. The dropping ratio.
block_size : int, optional, default=7 block_size : int, optional, default=7
The size of a spatial block. The size of a spatial block.
alpha : float, optional, default=1.
The scale factor to :math:`\gamma`.
decrement : float, optional, default=0.
The decrement value to ``kp``.
inplace : bool, optional, default=False inplace : bool, optional, default=False
Whether to do the operation in-place. Whether to do the operation in-place.
""" """
super(DropBlock2d, self).__init__() super(DropBlock2d, self).__init__()
self.kp = kp self.p = p
self.block_size = block_size self.block_size = block_size
self.alpha = alpha
self.decrement = decrement
self.inplace = inplace self.inplace = inplace
DropBlock2d._DEFAULT_UNIQUE_SLOT_ID += 1
self.slot = DropBlock2d._DEFAULT_UNIQUE_SLOT_ID
def extra_repr(self): def extra_repr(self):
inplace_str = ', inplace' if self.inplace else '' inplace_str = ', inplace' if self.inplace else ''
return 'block_size={}, kp={}{}'.format(self.block_size, self.kp, inplace_str) return 'p={}, block_size={}{}' \
.format(self.p, self.block_size, inplace_str)
def forward(self, input): def forward(self, input):
return F.drop_block2d( return F.drop_block2d(
input, input, self.p,
kp=self.kp,
block_size=self.block_size, block_size=self.block_size,
alpha=self.alpha,
decrement=self.decrement,
training=self.training, training=self.training,
inplace=self.inplace, inplace=self.inplace,
slot=self.slot,
) )
...@@ -104,7 +83,7 @@ class Dropout(Module): ...@@ -104,7 +83,7 @@ class Dropout(Module):
The **Dropout** function is defined as: The **Dropout** function is defined as:
.. math:: \text{Dropout}(x) = x * (r \sim \mathcal{B}(1, 1 - \text{prob})) .. math:: \text{Dropout}(x) = x * (r \sim \mathcal{B}(1, 1 - p))
Examples: Examples:
...@@ -131,7 +110,7 @@ class Dropout(Module): ...@@ -131,7 +110,7 @@ class Dropout(Module):
Parameters Parameters
---------- ----------
p : float, optional, default=0.5 p : float, optional, default=0.5
The dropping prob. The dropping ratio.
inplace : bool, optional, default=False inplace : bool, optional, default=False
Whether to do the operation in-place. Whether to do the operation in-place.
...@@ -154,7 +133,7 @@ class DropPath(Module): ...@@ -154,7 +133,7 @@ class DropPath(Module):
The **DropPath** function is defined as: The **DropPath** function is defined as:
.. math:: \text{DropPath}(x_{ij}) = x_{ij} * (r_{i} \sim \mathcal{B}(1, 1 - \text{prob})) .. math:: \text{DropPath}(x_{ij}) = x_{ij} * (r_{i} \sim \mathcal{B}(1, 1 - p))
Examples: Examples:
...@@ -170,39 +149,24 @@ class DropPath(Module): ...@@ -170,39 +149,24 @@ class DropPath(Module):
""" """
# Store the global unique slot index def __init__(self, p=0.2, inplace=False):
_DEFAULT_UNIQUE_SLOT_ID = 0
def __init__(self, p=0.2, increment=0., inplace=False):
"""Create a ``DropPath`` module. """Create a ``DropPath`` module.
Parameters Parameters
---------- ----------
p : float, optional, default=0.2 p : float, optional, default=0.2
The dropping prob. The dropping ratio.
increment : float, optional, default=0.
The increment value to ``p``.
inplace : bool, optional, default=False inplace : bool, optional, default=False
Whether to do the operation in-place. Whether to do the operation in-place.
""" """
super(DropPath, self).__init__() super(DropPath, self).__init__()
self.p = p self.p = p
self.increment = increment
self.inplace = inplace self.inplace = inplace
DropPath._DEFAULT_UNIQUE_SLOT_ID += 1
self.slot = DropPath._DEFAULT_UNIQUE_SLOT_ID
def extra_repr(self): def extra_repr(self):
inplace_str = ', inplace' if self.inplace else '' inplace_str = ', inplace' if self.inplace else ''
return 'p={}{}'.format(self.p, inplace_str) return 'p={}{}'.format(self.p, inplace_str)
def forward(self, input): def forward(self, input):
return F.drop_path( return F.drop_path(input, self.p, self.training, self.inplace)
input,
p=self.p,
increment=self.increment,
training=self.training,
inplace=self.inplace,
slot=self.slot,
)
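Editor's note: a minimal sketch of the new module constructors; the import path is assumed, and x4d stands for a 4-D dragon.vm.torch tensor created elsewhere:

from dragon.vm import torch

m = torch.nn.Dropout(0.5)                    # p=0.5
m = torch.nn.DropPath(0.2)                   # increment/slot arguments removed
m = torch.nn.DropBlock2d(0.1, block_size=7)  # keep_prob/alpha/decrement removed
y = m(x4d)                                   # returns the input unchanged when not m.training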