Commit 746f2cbb by Ting PAN

Add FP16 support for DepthwiseConv2d && SyncBN Operator

Summary:
This commit adds pseudo-FP16 kernels (FP16 storage with the arithmetic performed in FP32
via explicit conversions) for the DepthwiseConv2d and SyncBN operators.
1 parent d56e67d1
Showing with 705 additions and 765 deletions
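
For orientation before the hunks: below is a minimal sketch of the "pseudo FP16" pattern the commit applies. Values live in a 16-bit type, are widened to float for the arithmetic, and are narrowed back on store. The 16-bit type and the ToFloat/FromFloat helpers are bfloat16-style placeholders, not Dragon's float16 or convert::To<T>; they only stand in for the idea the real kernels implement.

```cpp
// Pseudo-FP16 sketch: 16-bit storage, FP32 compute, 16-bit store.
// The conversions here are a truncating (bfloat16-like) placeholder for the
// IEEE half conversions that Dragon's convert::To<T> performs.
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

using half_t = std::uint16_t; // placeholder 16-bit storage type

static float ToFloat(half_t h) { // widen: 16-bit -> 32-bit
  std::uint32_t bits = static_cast<std::uint32_t>(h) << 16;
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

static half_t FromFloat(float f) { // narrow: 32-bit -> 16-bit (truncate)
  std::uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  return static_cast<half_t>(bits >> 16);
}

// A leaky-ReLU kernel written in the pseudo-FP16 style used throughout this diff.
void ReluHalf(const std::vector<half_t>& x, float alpha, std::vector<half_t>* y) {
  y->resize(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    const float v = ToFloat(x[i]);                // 16-bit -> FP32
    (*y)[i] = FromFloat(v > 0.f ? v : alpha * v); // compute in FP32, narrow to 16-bit
  }
}

int main() {
  std::vector<half_t> x = {FromFloat(-2.f), FromFloat(0.5f), FromFloat(3.f)}, y;
  ReluHalf(x, 0.1f, &y);
  for (half_t v : y) std::cout << ToFloat(v) << " "; // ~ -0.199 0.5 3
  std::cout << "\n";
}
```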
......@@ -81,6 +81,9 @@ dragon
`function(...) <dragon/function.html>`_
: Compile a function and return an executable.
`get_num_threads(...) <dragon/get_num_threads.html>`_
: Return the number of threads for CPU parallelism.
`get_workspace(...) <dragon/get_workspace.html>`_
: Return the current default workspace.
......@@ -138,6 +141,9 @@ dragon
`reshape(...) <dragon/reshape.html>`_
: Change the dimensions of input.
`set_num_threads(...) <dragon/set_num_threads.html>`_
: Set the number of threads for CPU parallelism.
`shape(...) <dragon/shape.html>`_
: Return the shape of input.
......@@ -204,6 +210,7 @@ dragon
dragon/fill
dragon/flatten
dragon/function
dragon/get_num_threads
dragon/get_workspace
dragon/gradients
dragon/graph_mode
......@@ -223,6 +230,7 @@ dragon
dragon/repeat
dragon/reset_workspace
dragon/reshape
dragon/set_num_threads
dragon/shape
dragon/slice
dragon/sort
......
get_num_threads
===============

.. autofunction:: dragon.get_num_threads

.. raw:: html

  <style>
  h1:before {
    content: "dragon.";
    color: #103d3e;
  }
  </style>

set_num_threads
===============

.. autofunction:: dragon.set_num_threads

.. raw:: html

  <style>
  h1:before {
    content: "dragon.";
    color: #103d3e;
  }
  </style>
......@@ -18,7 +18,7 @@
#include "dragon/core/operator_schema.h"
#include "dragon/core/registry.h"
#include "dragon/core/tensor.h"
#include "dragon/utils/cast.h"
#include "dragon/utils/conversions.h"
namespace dragon {
......
......@@ -19,6 +19,11 @@
#include "dragon/core/typeid.h"
#ifndef HFLT_MAX
#define HFLT_MAX 65504.F
#define HFLT_MIN 6.10e-5F
#endif
namespace dragon {
typedef std::vector<int> vec32_t;
......
......@@ -34,7 +34,7 @@ void _DropBlock2dNCHW(
}
} // Share the mask between channels
}
utils::math::IncreaseIndexInDims(3, dims.data(), idx.data());
math::utils::IncreaseIndexInDims(3, dims.data(), idx.data());
}
}
......@@ -65,7 +65,7 @@ void _DropBlock2dNHWC(
}
} // Share the mask between channels
}
utils::math::IncreaseIndexInDims(3, dims.data(), idx.data());
math::utils::IncreaseIndexInDims(3, dims.data(), idx.data());
}
}
......
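
Many of the renamed call sites in this and the following hunks drive a flat loop with math::utils::IncreaseIndexInDims. As a reading aid, here is a hypothetical standalone reimplementation of that odometer-style increment (not Dragon's actual helper):

```cpp
// Odometer-style index increment: advance the last dimension and carry into
// earlier dimensions whenever one wraps. Hypothetical stand-in for
// math::utils::IncreaseIndexInDims as used at the call sites above.
#include <cstdint>
#include <iostream>

void IncreaseIndexInDims(int num_dims, const std::int64_t* dims, std::int64_t* index) {
  for (int d = num_dims - 1; d >= 0; --d) {
    if (++index[d] < dims[d]) return; // no carry needed
    index[d] = 0;                     // wrap this dimension, carry into d - 1
  }
}

int main() {
  std::int64_t dims[3] = {2, 2, 3}, idx[3] = {0, 0, 0};
  for (int i = 0; i < 2 * 2 * 3; ++i) {
    std::cout << idx[0] << idx[1] << idx[2] << " "; // 000 001 002 010 ... 112
    IncreaseIndexInDims(3, dims, idx);
  }
  std::cout << "\n";
}
```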
#include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/omp_utils.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/cast.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
......@@ -86,7 +85,7 @@ void DropPath<float16, CUDAContext>(
const auto nthreads = rows * cols; \
const auto thresh = 1.f - (1.f / scale); \
_DropPath<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
nthreads, cols, thresh, cast::to<T>(scale), x, mask, y); \
nthreads, cols, thresh, convert::To<T>(scale), x, mask, y); \
}
DEFINE_KERNEL_LAUNCHER(float);
......
#include "dragon/utils/cast.h"
#include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/omp_utils.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......@@ -63,28 +62,29 @@ void _Dropout<float16>(
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void ApplyMask<T, CPUContext>( \
const int count, \
const float scale, \
const T* x, \
const uint8_t* mask, \
T* y, \
CPUContext* ctx) { \
_ApplyMask(count, cast::to<T>(scale), x, mask, y); \
} \
template <> \
void Dropout<T, CPUContext>( \
const int count, \
const float ratio, \
const float scale, \
const T* x, \
uint8_t* mask, \
T* y, \
uint32_t* r, \
CPUContext* ctx) { \
_Dropout(count, cast::to<T>(ratio), cast::to<T>(scale), x, mask, y, ctx); \
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void ApplyMask<T, CPUContext>( \
const int count, \
const float scale, \
const T* x, \
const uint8_t* mask, \
T* y, \
CPUContext* ctx) { \
_ApplyMask(count, convert::To<T>(scale), x, mask, y); \
} \
template <> \
void Dropout<T, CPUContext>( \
const int count, \
const float ratio, \
const float scale, \
const T* x, \
uint8_t* mask, \
T* y, \
uint32_t* r, \
CPUContext* ctx) { \
_Dropout( \
count, convert::To<T>(ratio), convert::To<T>(scale), x, mask, y, ctx); \
}
DEFINE_KERNEL_LAUNCHER(float16);
......
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/cast.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
......@@ -113,7 +112,7 @@ void Dropout<float16, CUDAContext>(
T* y, \
CUDAContext* ctx) { \
_ApplyMask<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, cast::to<T>(scale), x, mask, y); \
count, convert::To<T>(scale), x, mask, y); \
} \
template <> \
void Dropout<T, CUDAContext>( \
......@@ -128,7 +127,7 @@ void Dropout<float16, CUDAContext>(
math::Random(count, r, ctx); \
auto threshold = static_cast<uint32_t>(UINT_MAX * ratio); \
_Dropout<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, threshold, cast::to<T>(scale), x, r, mask, y); \
count, threshold, convert::To<T>(scale), x, r, mask, y); \
}
DEFINE_KERNEL_LAUNCHER(float);
......
#include "dragon/utils/cast.h"
#include "dragon/utils/eigen_utils.h"
#include "dragon/utils/conversions.h"
#include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......@@ -50,19 +50,19 @@ void _EluGrad<float16>(
template <> \
void Elu<T, CPUContext>( \
const int count, const float alpha, const T* x, T* y, CPUContext* ctx) { \
_Elu(count, cast::to<T>(alpha), x, y); \
_Elu(count, convert::To<T>(alpha), x, y); \
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void EluGrad<T, CPUContext>( \
const int count, \
const float alpha, \
const T* dy, \
const T* y, \
T* dx, \
CPUContext* ctx) { \
_EluGrad(count, cast::to<T>(alpha), dy, y, dx); \
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void EluGrad<T, CPUContext>( \
const int count, \
const float alpha, \
const T* dy, \
const T* y, \
T* dx, \
CPUContext* ctx) { \
_EluGrad(count, convert::To<T>(alpha), dy, y, dx); \
}
DEFINE_KERNEL_LAUNCHER(float16);
......
#include "dragon/utils/cast.h"
#include "dragon/utils/eigen_utils.h"
#include "dragon/utils/conversions.h"
#include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......@@ -56,28 +56,28 @@ void _HardSigmoidGrad<float16>(
} // namespace
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void HardSigmoid<T, CPUContext>( \
const int count, \
const float alpha, \
const float beta, \
const T* x, \
T* y, \
CPUContext* ctx) { \
_HardSigmoid(count, cast::to<T>(alpha), cast::to<T>(beta), x, y); \
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void HardSigmoid<T, CPUContext>( \
const int count, \
const float alpha, \
const float beta, \
const T* x, \
T* y, \
CPUContext* ctx) { \
_HardSigmoid(count, convert::To<T>(alpha), convert::To<T>(beta), x, y); \
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void HardSigmoidGrad<T, CPUContext>( \
const int count, \
const float alpha, \
const T* dy, \
const T* y, \
T* dx, \
CPUContext* ctx) { \
_HardSigmoidGrad(count, cast::to<T>(alpha), dy, y, dx); \
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void HardSigmoidGrad<T, CPUContext>( \
const int count, \
const float alpha, \
const T* dy, \
const T* y, \
T* dx, \
CPUContext* ctx) { \
_HardSigmoidGrad(count, convert::To<T>(alpha), dy, y, dx); \
}
DEFINE_KERNEL_LAUNCHER(float16);
......
#include "dragon/utils/cast.h"
#include "dragon/utils/eigen_utils.h"
#include "dragon/utils/conversions.h"
#include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......@@ -59,29 +59,30 @@ void _HardSwishGrad<float16>(
} // namespace
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void HardSwish<T, CPUContext>( \
const int count, \
const float alpha, \
const float beta, \
const T* x, \
T* y, \
CPUContext* ctx) { \
_HardSwish(count, cast::to<T>(alpha), cast::to<T>(beta), x, y); \
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void HardSwish<T, CPUContext>( \
const int count, \
const float alpha, \
const float beta, \
const T* x, \
T* y, \
CPUContext* ctx) { \
_HardSwish(count, convert::To<T>(alpha), convert::To<T>(beta), x, y); \
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void HardSwishGrad<T, CPUContext>( \
const int count, \
const float alpha, \
const float beta, \
const T* dy, \
const T* x, \
T* dx, \
CPUContext* ctx) { \
_HardSwishGrad(count, cast::to<T>(alpha), cast::to<T>(beta), dy, x, dx); \
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void HardSwishGrad<T, CPUContext>( \
const int count, \
const float alpha, \
const float beta, \
const T* dy, \
const T* x, \
T* dx, \
CPUContext* ctx) { \
_HardSwishGrad( \
count, convert::To<T>(alpha), convert::To<T>(beta), dy, x, dx); \
}
DEFINE_KERNEL_LAUNCHER(float16);
......
#include "dragon/utils/eigen_utils.h"
#include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
......
#include "dragon/utils/cast.h"
#include "dragon/utils/eigen_utils.h"
#include "dragon/utils/conversions.h"
#include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......@@ -87,7 +87,7 @@ void _ReluNGrad<float16>(
template <> \
void Relu<T, CPUContext>( \
const int count, const float alpha, const T* x, T* y, CPUContext* ctx) { \
_Relu(count, cast::to<T>(alpha), x, y); \
_Relu(count, convert::To<T>(alpha), x, y); \
} \
template <> \
void ReluN<T, CPUContext>( \
......@@ -96,29 +96,29 @@ void _ReluNGrad<float16>(
const T* x, \
T* y, \
CPUContext* ctx) { \
_ReluN(count, cast::to<T>(max_value), x, y); \
_ReluN(count, convert::To<T>(max_value), x, y); \
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void ReluGrad<T, CPUContext>( \
const int count, \
const float alpha, \
const T* dy, \
const T* y, \
T* dx, \
CPUContext* ctx) { \
_ReluGrad(count, cast::to<T>(alpha), dy, y, dx); \
} \
template <> \
void ReluNGrad<T, CPUContext>( \
const int count, \
const float max_value, \
const T* dy, \
const T* y, \
T* dx, \
CPUContext* ctx) { \
_ReluNGrad(count, cast::to<T>(max_value), dy, y, dx); \
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void ReluGrad<T, CPUContext>( \
const int count, \
const float alpha, \
const T* dy, \
const T* y, \
T* dx, \
CPUContext* ctx) { \
_ReluGrad(count, convert::To<T>(alpha), dy, y, dx); \
} \
template <> \
void ReluNGrad<T, CPUContext>( \
const int count, \
const float max_value, \
const T* dy, \
const T* y, \
T* dx, \
CPUContext* ctx) { \
_ReluNGrad(count, convert::To<T>(max_value), dy, y, dx); \
}
DEFINE_KERNEL_LAUNCHER(float16);
......
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/cast.h"
#include "dragon/utils/conversions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......@@ -287,13 +287,13 @@ void ReluN<float16, CUDAContext>(
0,
ctx->cuda_stream()>>>(
count >> 1,
cast::to<half>(max_value),
convert::To<half>(max_value),
reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
} else {
_ReluN<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
count,
cast::to<half>(max_value),
convert::To<half>(max_value),
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
}
......@@ -339,14 +339,14 @@ void ReluNGrad<float16, CUDAContext>(
0,
ctx->cuda_stream()>>>(
count >> 1,
cast::to<half2>(max_value),
convert::To<half2>(max_value),
reinterpret_cast<const half2*>(dy),
reinterpret_cast<const half2*>(y),
reinterpret_cast<half2*>(dx));
} else {
_ReluNGrad<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
count,
cast::to<half>(max_value),
convert::To<half>(max_value),
reinterpret_cast<const half*>(dy),
reinterpret_cast<const half*>(y),
reinterpret_cast<half*>(dx));
......@@ -362,7 +362,7 @@ void ReluNGrad<float16, CUDAContext>(
T* y, \
CUDAContext* ctx) { \
_Relu<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, cast::to<T>(alpha), x, y); \
count, convert::To<T>(alpha), x, y); \
} \
template <> \
void ReluN<T, CUDAContext>( \
......@@ -372,7 +372,7 @@ void ReluNGrad<float16, CUDAContext>(
T* y, \
CUDAContext* ctx) { \
_ReluN<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, cast::to<T>(max_value), x, y); \
count, convert::To<T>(max_value), x, y); \
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
......@@ -385,7 +385,7 @@ void ReluNGrad<float16, CUDAContext>(
T* dx, \
CUDAContext* ctx) { \
_ReluGrad<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, cast::to<T>(alpha), dy, y, dx); \
count, convert::To<T>(alpha), dy, y, dx); \
} \
template <> \
void ReluNGrad<T, CUDAContext>( \
......@@ -396,7 +396,7 @@ void ReluNGrad<float16, CUDAContext>(
T* dx, \
CUDAContext* ctx) { \
_ReluNGrad<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, cast::to<T>(max_value), dy, y, dx); \
count, convert::To<T>(max_value), dy, y, dx); \
}
DEFINE_KERNEL_LAUNCHER(float);
......
#include "dragon/utils/cast.h"
#include "dragon/utils/eigen_utils.h"
#include "dragon/utils/conversions.h"
#include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......@@ -57,29 +57,29 @@ void _SeluGrad<float16>(
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void Selu<T, CPUContext>( \
const int count, \
const float alpha, \
const float gamma, \
const T* x, \
T* y, \
CPUContext* ctx) { \
_Selu(count, cast::to<T>(alpha), cast::to<T>(gamma), x, y); \
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void Selu<T, CPUContext>( \
const int count, \
const float alpha, \
const float gamma, \
const T* x, \
T* y, \
CPUContext* ctx) { \
_Selu(count, convert::To<T>(alpha), convert::To<T>(gamma), x, y); \
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void SeluGrad<T, CPUContext>( \
const int count, \
const float alpha, \
const float gamma, \
const T* dy, \
const T* y, \
T* dx, \
CPUContext* tx) { \
_SeluGrad(count, cast::to<T>(alpha), cast::to<T>(gamma), dy, y, dx); \
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void SeluGrad<T, CPUContext>( \
const int count, \
const float alpha, \
const float gamma, \
const T* dy, \
const T* y, \
T* dx, \
CPUContext* tx) { \
_SeluGrad(count, convert::To<T>(alpha), convert::To<T>(gamma), dy, y, dx); \
}
DEFINE_KERNEL_LAUNCHER(float16);
......
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/cast.h"
#include "dragon/utils/conversions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......
#include "dragon/utils/eigen_utils.h"
#include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/cast.h"
#include "dragon/utils/device/common_cub.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
......@@ -200,7 +199,7 @@ void Softmax<float16, CUDAContext>(
rows,
cols,
inner_dim,
cast::to<half>(std::numeric_limits<float>::lowest()),
convert::To<half>(std::numeric_limits<float>::lowest()),
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
}
......
#include "dragon/utils/eigen_utils.h"
#include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......
#include "dragon/utils/eigen_utils.h"
#include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......
......@@ -35,7 +35,7 @@ __global__ void _Tanh<half2>(const int nthreads, const half2* x, half2* y) {
template <typename T>
__global__ void _TanhGrad(const int nthreads, const T* dy, const T* y, T* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx[i] = dy[i] * (T(1) - utils::math::Square(y[i]));
dx[i] = dy[i] * (T(1) - math::utils::Square(y[i]));
}
}
......@@ -44,7 +44,7 @@ __global__ void
_TanhGrad<half>(const int nthreads, const half* dy, const half* y, half* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx[i] = __float2half(
__half2float(dy[i]) * (1.f - utils::math::Square(__half2float(y[i]))));
__half2float(dy[i]) * (1.f - math::utils::Square(__half2float(y[i]))));
}
}
......@@ -58,8 +58,8 @@ __global__ void _TanhGrad<half2>(
const float2 val = __half22float2(y[i]);
const float2 grad = __half22float2(dy[i]);
dx[i] = __floats2half2_rn(
grad.x * (1.f - utils::math::Square(val.x)),
grad.y * (1.f - utils::math::Square(val.y)));
grad.x * (1.f - math::utils::Square(val.x)),
grad.y * (1.f - math::utils::Square(val.y)));
}
}
......
#include "dragon/utils/eigen_utils.h"
#include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......
......@@ -28,7 +28,7 @@ void _ChannelNormalize(
if (d == axis) wi = idx[d];
}
y[yi] = ((Ty)x[xi] - (Ty)mean[wi]) / (Ty)std[wi];
utils::math::IncreaseIndexInDims(num_dims, y_dims, idx.data());
math::utils::IncreaseIndexInDims(num_dims, y_dims, idx.data());
}
}
......
......@@ -26,7 +26,7 @@ void _CumSum(
} else {
y[i] = exclusive ? T(0) : x[i];
}
utils::math::IncreaseIndexInDims(3, dims.data(), idx.data());
math::utils::IncreaseIndexInDims(3, dims.data(), idx.data());
}
}
......
#include "dragon/utils/cast.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
......@@ -11,7 +10,7 @@ namespace {
template <typename T>
void _SetEye(const int n, const int m, const int k, T* y) {
for (int i = 0; i < n; ++i) {
y[i * m + k + i] = cast::to<T>(1.f);
y[i * m + k + i] = convert::To<T>(1.f);
}
}
......@@ -23,7 +22,7 @@ void _SetEye(const int n, const int m, const int k, T* y) {
template <> \
void Eye<T, CPUContext>( \
const int n, const int m, const int k, T* y, CPUContext* ctx) { \
math::Set(n* m, cast::to<T>(0.f), y, ctx); \
math::Set(n* m, convert::To<T>(0.f), y, ctx); \
if (k > 0) { \
if (m - k > 0) _SetEye(m - k, m, k, y); \
} else { \
......
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/cast.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
......@@ -37,7 +36,7 @@ void Eye<float16, CUDAContext>(
const int k,
float16* y,
CUDAContext* ctx) {
math::Set(n * m, cast::to<float16>(0.f), y, ctx);
math::Set(n * m, convert::To<float16>(0.f), y, ctx);
if (k > 0) {
if (m - k > 0) {
_SetEye<<<CUDA_BLOCKS(m - k), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
......
#include "dragon/utils/cast.h"
#include "dragon/utils/conversions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......@@ -16,12 +16,12 @@ void _RowwiseLinSpace(
T* y) {
for (int i = 0; i < cols; ++i) {
const auto delta = (stop[i] - start[i]) / double(rows - 1);
y[i] = cast::to<T>(start[i]);
y[i] = convert::To<T>(start[i]);
if (rows > 1) {
y[i + (rows - 1) * cols] = cast::to<T>(stop[i]);
y[i + (rows - 1) * cols] = convert::To<T>(stop[i]);
}
for (int j = 1; j < rows - 1; ++j) {
y[i + j * cols] = cast::to<T>(start[i] + double(j) * delta);
y[i + j * cols] = convert::To<T>(start[i] + double(j) * delta);
}
}
}
......@@ -36,12 +36,12 @@ void _ColwiseLinSpace(
for (int i = 0; i < rows; ++i) {
const auto delta = (stop[i] - start[i]) / double(cols - 1);
auto* offset_y = y + i * cols;
offset_y[0] = cast::to<T>(start[i]);
offset_y[0] = convert::To<T>(start[i]);
if (cols > 1) {
offset_y[cols - 1] = cast::to<T>(stop[i]);
offset_y[cols - 1] = convert::To<T>(stop[i]);
}
for (int j = 1; j < cols - 1; ++j) {
offset_y[j] = cast::to<T>(start[i] + double(j) * delta);
offset_y[j] = convert::To<T>(start[i] + double(j) * delta);
}
}
}
......
#include "dragon/utils/cast.h"
#include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/omp_utils.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......@@ -52,17 +51,17 @@ void _MaskedSelectGrad(
_MaskedSelect(num_selected, index, x, y); \
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(IndexType, ValueType) \
template <> \
void MaskedSelectGrad<IndexType, ValueType, CPUContext>( \
const int count, \
const int num_selected, \
const IndexType* index, \
const ValueType* dy, \
ValueType* dx, \
CPUContext* ctx) { \
math::Set(count, cast::to<ValueType>(0.f), dx, ctx); \
_MaskedSelectGrad(num_selected, index, dy, dx); \
#define DEFINE_GRAD_KERNEL_LAUNCHER(IndexType, ValueType) \
template <> \
void MaskedSelectGrad<IndexType, ValueType, CPUContext>( \
const int count, \
const int num_selected, \
const IndexType* index, \
const ValueType* dy, \
ValueType* dx, \
CPUContext* ctx) { \
math::Set(count, convert::To<ValueType>(0.f), dx, ctx); \
_MaskedSelectGrad(num_selected, index, dy, dx); \
}
DEFINE_KERNEL_LAUNCHER(int, bool);
......
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/cast.h"
#include "dragon/utils/conversions.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
......@@ -61,7 +61,7 @@ __global__ void _MaskedSelectGrad(
const ValueType* dy, \
ValueType* dx, \
CUDAContext* ctx) { \
math::Set(count, cast::to<ValueType>(0.f), dx, ctx); \
math::Set(count, convert::To<ValueType>(0.f), dx, ctx); \
_MaskedSelectGrad<<< \
CUDA_BLOCKS(num_selected), \
CUDA_THREADS, \
......
#include "dragon/utils/omp_utils.h"
#include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......
#include "dragon/utils/cast.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
......@@ -30,7 +29,7 @@ void _ConstPad(
xi += r * x_strides[d];
}
y[yi] = d >= 0 ? value : x[xi];
utils::math::IncreaseIndexInDims(num_dims, y_dims, index.data());
math::utils::IncreaseIndexInDims(num_dims, y_dims, index.data());
}
}
......@@ -56,7 +55,7 @@ void _ReflectPad(
xi += r * x_strides[d];
}
y[yi] = x[xi];
utils::math::IncreaseIndexInDims(num_dims, y_dims, index.data());
math::utils::IncreaseIndexInDims(num_dims, y_dims, index.data());
}
}
......@@ -80,7 +79,7 @@ void _EdgePad(
xi += r * x_strides[d];
}
y[yi] = x[xi];
utils::math::IncreaseIndexInDims(num_dims, y_dims, index.data());
math::utils::IncreaseIndexInDims(num_dims, y_dims, index.data());
}
}
......@@ -102,20 +101,27 @@ void _EdgePad(
_##name(num_dims, x_dims, x_strides, y_dims, pads, x, y); \
}
#define DEFINE_CONST_KERNEL_LAUNCHER(T) \
template <> \
void ConstPad<T, CPUContext>( \
const int num_dims, \
const int64_t* x_dims, \
const int64_t* x_strides, \
const int64_t* y_dims, \
const int64_t* pads, \
const float value, \
const T* x, \
T* y, \
CPUContext* ctx) { \
_ConstPad( \
num_dims, x_dims, x_strides, y_dims, pads, cast::to<T>(value), x, y); \
#define DEFINE_CONST_KERNEL_LAUNCHER(T) \
template <> \
void ConstPad<T, CPUContext>( \
const int num_dims, \
const int64_t* x_dims, \
const int64_t* x_strides, \
const int64_t* y_dims, \
const int64_t* pads, \
const float value, \
const T* x, \
T* y, \
CPUContext* ctx) { \
_ConstPad( \
num_dims, \
x_dims, \
x_strides, \
y_dims, \
pads, \
convert::To<T>(value), \
x, \
y); \
}
DEFINE_CONST_KERNEL_LAUNCHER(bool);
......
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/cast.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
......@@ -114,7 +113,7 @@ __global__ void _EdgePad(
X_strides, \
Y_dims, \
X_pads, \
cast::to<T>(value), \
convert::To<T>(value), \
x, \
y); \
}
......
#include "dragon/utils/cast.h"
#include "dragon/utils/omp_utils.h"
#include "dragon/utils/conversions.h"
#include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......@@ -14,7 +14,7 @@ void _Range(const int count, const double start, const double delta, T* y) {
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
y[i] = cast::to<T>(start + double(i) * delta);
y[i] = convert::To<T>(start + double(i) * delta);
}
}
......
......@@ -26,7 +26,7 @@ void _ReduceSumGrad(
yi += (index[d] % y_dims[d]) * y_strides[d];
}
dx[xi] = dy[yi] * scale;
utils::math::IncreaseIndexInDims(num_dims, x_dims, index.data());
math::utils::IncreaseIndexInDims(num_dims, x_dims, index.data());
}
}
......
#include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/omp_utils.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......
......@@ -25,7 +25,7 @@ void _Slice(
xi += (index[d] + starts[d]) * x_strides[d];
}
y[yi] = x[xi];
utils::math::IncreaseIndexInDims(num_dims, y_dims, index.data());
math::utils::IncreaseIndexInDims(num_dims, y_dims, index.data());
}
}
......@@ -47,7 +47,7 @@ void _SliceGrad(
xi += (index[d] + starts[d]) * x_strides[d];
}
dx[xi] = dy[yi];
utils::math::IncreaseIndexInDims(num_dims, y_dims, index.data());
math::utils::IncreaseIndexInDims(num_dims, y_dims, index.data());
}
}
......
......@@ -25,7 +25,7 @@ void _Tile(
xi += (index[d] % x_dims[d]) * x_strides[d];
}
y[i] = x[xi];
utils::math::IncreaseIndexInDims(num_dims, y_dims, index.data());
math::utils::IncreaseIndexInDims(num_dims, y_dims, index.data());
}
}
......
......@@ -162,7 +162,7 @@ __global__ void _SelectViaDeviceSort(
/* ------------------- Launcher Separator ------------------- */
#define PLACE_BLOCK_SORT_CASE(T, items_per_thread) \
#define BLOCKSORT_KERNEL(T, items_per_thread) \
_SelectViaBlockSort<T, items_per_thread> \
<<<CUDA_2D_BLOCKS(rows), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
rows, \
......@@ -175,15 +175,15 @@ __global__ void _SelectViaDeviceSort(
reinterpret_cast<T*>(value), \
index)
#define PLACE_BLOCK_SORT_CASES(T) \
#define DISPATCH_BLOCKSORT_KERNEL(T) \
if (cols <= CUDA_THREADS) { \
PLACE_BLOCK_SORT_CASE(T, 1); \
BLOCKSORT_KERNEL(T, 1); \
} else if (cols <= CUDA_THREADS * 2) { \
PLACE_BLOCK_SORT_CASE(T, 2); \
BLOCKSORT_KERNEL(T, 2); \
} else if (cols <= CUDA_THREADS * 4) { \
PLACE_BLOCK_SORT_CASE(T, 4); \
BLOCKSORT_KERNEL(T, 4); \
} else if (cols <= CUDA_THREADS * 8) { \
PLACE_BLOCK_SORT_CASE(T, 8); \
BLOCKSORT_KERNEL(T, 8); \
} else { \
LOG(FATAL) << "Too large dimension (> " << CUDA_THREADS * 8 \
<< ") to launch the CUDA kernel"; \
......@@ -238,7 +238,7 @@ __global__ void _SelectViaDeviceSort(
return; \
} \
T2 init = largest > 0 ? kLowest : kMax; \
PLACE_BLOCK_SORT_CASES(T2); \
DISPATCH_BLOCKSORT_KERNEL(T2); \
}
DEFINE_KERNEL_LAUNCHER(
......@@ -277,8 +277,8 @@ DEFINE_KERNEL_LAUNCHER(
std::numeric_limits<double>::lowest(),
std::numeric_limits<double>::max());
#undef PLACE_BLOCK_SORT_CASE
#undef PLACE_BLOCK_SORT_CASES
#undef BLOCKSORT_KERNEL
#undef DISPATCH_BLOCKSORT_KERNEL
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -24,7 +24,7 @@ void _Transpose(
xi += index[d] * x_strides[d];
}
y[yi] = x[xi];
utils::math::IncreaseIndexInDims(num_dims, y_dims, index.data());
math::utils::IncreaseIndexInDims(num_dims, y_dims, index.data());
}
}
......@@ -45,7 +45,7 @@ void _TransposeGrad(
xi += index[d] * x_strides[d];
}
dx[xi] = dy[yi];
utils::math::IncreaseIndexInDims(num_dims, y_dims, index.data());
math::utils::IncreaseIndexInDims(num_dims, y_dims, index.data());
}
}
......
......@@ -25,7 +25,7 @@ void _Assign(
yi += (index[d] + starts[d]) * y_strides[d];
}
y[yi] = x[i];
utils::math::IncreaseIndexInDims(num_dims, x_dims, index.data());
math::utils::IncreaseIndexInDims(num_dims, x_dims, index.data());
}
}
......
......@@ -19,7 +19,7 @@ void _BroadcastLossGrad(
const int count = outer_dim * axis_dim * inner_dim;
for (int i = 0; i < count; ++i) {
dx[i] *= dy[idx[0] * inner_dim + idx[2]];
utils::math::IncreaseIndexInDims(3, dims.data(), idx.data());
math::utils::IncreaseIndexInDims(3, dims.data(), idx.data());
}
}
......@@ -78,32 +78,32 @@ void BroadcastLossGrad<float16, CPUContext>(
y[0] = math::Sum(count, 1.f / inv_scale, x, ctx); \
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void ReduceLossGrad<T, CPUContext>( \
const int count, \
const int num_masks, \
const float normalizer, \
const T* dy, \
const T* mask, \
T* dx, \
CPUContext* ctx) { \
float inv_scale = std::max( \
0.5f, \
num_masks > 0 && normalizer < 0.f \
? (float)math::Sum(num_masks, 1.f, mask, ctx) \
: normalizer); \
math::Scale(count, cast::to<float>(dy[0]) / inv_scale, dx, dx, ctx); \
} \
template <> \
void BroadcastLossGrad<T, CPUContext>( \
const int outer_dim, \
const int inner_dim, \
const int axis_dim, \
const T* dy, \
T* dx, \
CPUContext* ctx) { \
_BroadcastLossGrad(outer_dim, inner_dim, axis_dim, dy, dx); \
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void ReduceLossGrad<T, CPUContext>( \
const int count, \
const int num_masks, \
const float normalizer, \
const T* dy, \
const T* mask, \
T* dx, \
CPUContext* ctx) { \
float inv_scale = std::max( \
0.5f, \
num_masks > 0 && normalizer < 0.f \
? (float)math::Sum(num_masks, 1.f, mask, ctx) \
: normalizer); \
math::Scale(count, convert::To<float>(dy[0]) / inv_scale, dx, dx, ctx); \
} \
template <> \
void BroadcastLossGrad<T, CPUContext>( \
const int outer_dim, \
const int inner_dim, \
const int axis_dim, \
const T* dy, \
T* dx, \
CPUContext* ctx) { \
_BroadcastLossGrad(outer_dim, inner_dim, axis_dim, dy, dx); \
}
DEFINE_KERNEL_LAUNCHER(float);
......
......@@ -28,7 +28,7 @@ void _NLLLoss(
k = (idx[0] * axis_dim + label) * inner_dim + idx[1];
loss[i] = -logit[k], mask[i] = LogitType(1);
}
utils::math::IncreaseIndexInDims(2, dims.data(), idx.data());
math::utils::IncreaseIndexInDims(2, dims.data(), idx.data());
}
}
......@@ -53,7 +53,7 @@ void _NLLLossGrad(
k = (idx[0] * axis_dim + label) * inner_dim + idx[1];
dlogit[k] = LogitType(-1), mask[i] = LogitType(1);
}
utils::math::IncreaseIndexInDims(2, dims.data(), idx.data());
math::utils::IncreaseIndexInDims(2, dims.data(), idx.data());
}
}
......
#include "dragon/utils/omp_utils.h"
#include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......
......@@ -48,7 +48,7 @@ void _SigmoidFocalLoss(
loss[i] += -c2 * neg_term * neg_alpha;
mask[i] = c1;
utils::math::IncreaseIndexInDims(3, dims.data(), idx.data());
math::utils::IncreaseIndexInDims(3, dims.data(), idx.data());
}
}
......@@ -96,7 +96,7 @@ void _SigmoidFocalLossGrad(
dx[i] += -c2 * neg_term * neg_alpha;
mask[i] = c1;
utils::math::IncreaseIndexInDims(3, dims.data(), idx.data());
math::utils::IncreaseIndexInDims(3, dims.data(), idx.data());
}
}
......
#include "dragon/utils/eigen_utils.h"
#include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......
#include "dragon/utils/eigen_utils.h"
#include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......
......@@ -29,7 +29,7 @@ void _SparseSoftmaxCrossEntropy(
loss[i] = -std::log(std::max(prob[k], LogitType(FLT_MIN)));
mask[i] = LogitType(1);
}
utils::math::IncreaseIndexInDims(2, dims.data(), idx.data());
math::utils::IncreaseIndexInDims(2, dims.data(), idx.data());
}
}
......@@ -60,7 +60,7 @@ void _SparseSoftmaxCrossEntropyGrad(
dx[k] -= LogitType(1);
mask[i] = LogitType(1);
}
utils::math::IncreaseIndexInDims(2, dims.data(), idx.data());
math::utils::IncreaseIndexInDims(2, dims.data(), idx.data());
}
}
......
#include "dragon/utils/cast.h"
#include "dragon/utils/eigen_utils.h"
#include "dragon/utils/omp_utils.h"
#include "dragon/utils/conversions.h"
#include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......@@ -22,15 +22,15 @@ void _Clip<float16>(
const float16 high,
const float16* x,
float16* y) {
auto lowf = cast::to<float>(low);
auto highf = cast::to<float>(high);
auto lowf = convert::To<float>(low);
auto highf = convert::To<float>(high);
#ifdef USE_OPENMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
auto val = cast::to<float>(x[i]);
auto val = convert::To<float>(x[i]);
val = std::max(lowf, std::min(val, highf));
y[i] = cast::to<float16>(val);
y[i] = convert::To<float16>(val);
}
}
......@@ -56,14 +56,14 @@ void _ClipGrad<float16>(
const float16* dy,
const float16* x,
float16* dx) {
auto lowf = cast::to<float>(low);
auto highf = cast::to<float>(high);
auto kZero = cast::to<float16>(0.f);
auto lowf = convert::To<float>(low);
auto highf = convert::To<float>(high);
auto kZero = convert::To<float16>(0.f);
#ifdef USE_OPENMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
auto val = cast::to<float>(x[i]);
auto val = convert::To<float>(x[i]);
dx[i] = (val < lowf || val > highf) ? kZero : dy[i];
}
} // ClipGrad
......@@ -72,29 +72,29 @@ void _ClipGrad<float16>(
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void Clip<T, CPUContext>( \
const int count, \
const float low, \
const float high, \
const T* x, \
T* y, \
CPUContext* ctx) { \
_Clip(count, cast::to<T>(low), cast::to<T>(high), x, y); \
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void Clip<T, CPUContext>( \
const int count, \
const float low, \
const float high, \
const T* x, \
T* y, \
CPUContext* ctx) { \
_Clip(count, convert::To<T>(low), convert::To<T>(high), x, y); \
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void ClipGrad<T, CPUContext>( \
const int count, \
const float low, \
const float high, \
const T* dy, \
const T* x, \
T* dx, \
CPUContext* ctx) { \
_ClipGrad(count, cast::to<T>(low), cast::to<T>(high), dy, x, dx); \
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void ClipGrad<T, CPUContext>( \
const int count, \
const float low, \
const float high, \
const T* dy, \
const T* x, \
T* dx, \
CPUContext* ctx) { \
_ClipGrad(count, convert::To<T>(low), convert::To<T>(high), dy, x, dx); \
}
DEFINE_KERNEL_LAUNCHER(int8_t);
......
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/cast.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......@@ -104,8 +103,8 @@ void Clip<float16, CUDAContext>(
CUDAContext* ctx) {
_Clip<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
count,
cast::to<half>(low),
cast::to<half>(high),
convert::To<half>(low),
convert::To<half>(high),
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
}
......@@ -121,8 +120,8 @@ void ClipGrad<float16, CUDAContext>(
CUDAContext* ctx) {
_ClipGrad<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
count,
cast::to<half>(low),
cast::to<half>(high),
convert::To<half>(low),
convert::To<half>(high),
reinterpret_cast<const half*>(dy),
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(dx));
......@@ -138,7 +137,7 @@ void ClipGrad<float16, CUDAContext>(
T* y, \
CUDAContext* ctx) { \
_Clip<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, cast::to<T>(low), cast::to<T>(high), x, y); \
count, convert::To<T>(low), convert::To<T>(high), x, y); \
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
......@@ -152,7 +151,7 @@ void ClipGrad<float16, CUDAContext>(
T* dx, \
CUDAContext* ctx) { \
_ClipGrad<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, cast::to<T>(low), cast::to<T>(high), dy, x, dx); \
count, convert::To<T>(low), convert::To<T>(high), dy, x, dx); \
}
DEFINE_KERNEL_LAUNCHER(int8_t);
......
#include "dragon/utils/eigen_utils.h"
#include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......
......@@ -70,7 +70,7 @@ template <typename T>
__global__ void
_ReciprocalGrad(const int nthreads, const T* dy, const T* y, T* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx[i] = -dy[i] * utils::math::Square(y[i]);
dx[i] = -dy[i] * math::utils::Square(y[i]);
}
}
......@@ -82,7 +82,7 @@ __global__ void _ReciprocalGrad<half>(
half* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx[i] = __float2half(
-__half2float(dy[i]) * utils::math::Square(__half2float(y[i])));
-__half2float(dy[i]) * math::utils::Square(__half2float(y[i])));
}
}
......@@ -103,7 +103,7 @@ __global__ void _ReciprocalGrad<half2>(
template <typename T>
__global__ void _RsqrtGrad(const int nthreads, const T* dy, const T* y, T* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx[i] = T(-0.5) * dy[i] * utils::math::Cube(y[i]);
dx[i] = T(-0.5) * dy[i] * math::utils::Cube(y[i]);
}
}
......@@ -112,7 +112,7 @@ __global__ void
_RsqrtGrad<half>(const int nthreads, const half* dy, const half* y, half* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx[i] = __float2half(
-0.5f * __half2float(dy[i]) * utils::math::Cube(__half2float(y[i])));
-0.5f * __half2float(dy[i]) * math::utils::Cube(__half2float(y[i])));
}
}
......
#include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/omp_utils.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......@@ -106,14 +106,14 @@ void _Moments(
y_dims[axes[i]] = 1;
// Case #1: Rowwise Reduce
if (utils::math::IsRowwiseReduce(
if (math::utils::IsRowwiseReduce(
num_dims, dims, y_dims.data(), &rows, &cols)) {
_RowwiseMoments(rows, cols, x, mean, var);
return;
}
// Case #2: Colwise Reduce
if (utils::math::IsColwiseReduce(
if (math::utils::IsColwiseReduce(
num_dims, dims, y_dims.data(), &rows, &cols)) {
_ColwiseMoments(rows, cols, x, mean, var);
return;
......@@ -121,8 +121,8 @@ void _Moments(
// Case #3: Generic Reduce
vec32_t axesT(num_dims), stridesT(num_dims), dimsT(num_dims);
utils::math::TransposeAxesForReduce(num_dims, num_axes, axes, axesT.data());
utils::math::ComputeTransposeStrides(
math::utils::TransposeAxesForReduce(num_dims, num_axes, axes, axesT.data());
math::utils::ComputeTransposeStrides(
num_dims, dims, axesT.data(), stridesT.data());
rows = cols = 1;
......
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/cast.h"
#include "dragon/utils/conversions.h"
#include "dragon/utils/device/common_cub.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
......@@ -28,10 +28,10 @@ __global__ void _RowwiseMoments(
const int xi = j * cols + i;
#if __CUDA_ARCH__ >= 350
m_val += __ldg(x + xi);
v_val += utils::math::Square(__ldg(x + xi));
v_val += math::utils::Square(__ldg(x + xi));
#else
m_val += x[xi];
v_val += utils::math::Square(x[xi]);
v_val += math::utils::Square(x[xi]);
#endif
}
m_val = BlockReduce<Ty>(m_storage).Sum(m_val);
......@@ -59,7 +59,7 @@ __global__ void _RowwiseMoments<half, float>(
CUDA_2D_KERNEL_LOOP2(j, rows) {
const int xi = j * cols + i;
m_val += __half2float(__ldg(x + xi));
v_val += utils::math::Square(__half2float(__ldg(x + xi)));
v_val += math::utils::Square(__half2float(__ldg(x + xi)));
}
m_val = BlockReduce<float>(m_storage).Sum(m_val);
v_val = BlockReduce<float>(v_storage).Sum(v_val);
......@@ -87,10 +87,10 @@ __global__ void _ColwiseMoments(
const int xi = i * cols + j;
#if __CUDA_ARCH__ >= 350
m_val += __ldg(x + xi);
v_val += utils::math::Square(__ldg(x + xi));
v_val += math::utils::Square(__ldg(x + xi));
#else
m_val += x[xi];
v_val += utils::math::Square(x[xi]);
v_val += math::utils::Square(x[xi]);
#endif
}
m_val = BlockReduce<Ty>(m_storage).Sum(m_val);
......@@ -118,7 +118,7 @@ __global__ void _ColwiseMoments<half, float>(
CUDA_2D_KERNEL_LOOP2(j, cols) {
const int xi = i * cols + j;
m_val += __half2float(__ldg(x + xi));
v_val += utils::math::Square(__half2float(__ldg(x + xi)));
v_val += math::utils::Square(__half2float(__ldg(x + xi)));
}
m_val = BlockReduce<float>(m_storage).Sum(m_val);
v_val = BlockReduce<float>(v_storage).Sum(v_val);
......@@ -154,10 +154,10 @@ __global__ void _GenericMoments(
}
#if __CUDA_ARCH__ >= 350
m_val += __ldg(x + xi);
v_val += utils::math::Square(__ldg(x + xi));
v_val += math::utils::Square(__ldg(x + xi));
#else
m_val += x[xi];
v_val += utils::math::Square(x[xi]);
v_val += math::utils::Square(x[xi]);
#endif
}
m_val = BlockReduce<Ty>(m_storage).Sum(m_val);
......@@ -194,10 +194,10 @@ __global__ void _GenericMoments(
}
#if __CUDA_ARCH__ >= 350
m_val += __half2float(__ldg(x + xi));
v_val += utils::math::Square(__half2float(__ldg(x + xi)));
v_val += math::utils::Square(__half2float(__ldg(x + xi)));
#else
m_val += __half2float(x[xi]);
v_val += utils::math::Square(__half2float(x[xi]));
v_val += math::utils::Square(__half2float(x[xi]));
#endif
}
m_val = BlockReduce<float>(m_storage).Sum(m_val);
......@@ -226,7 +226,7 @@ void _Moments(
y_dims[axes[i]] = 1;
/*! Case #1: Rowwise Reduce */
if (utils::math::IsRowwiseReduce(
if (math::utils::IsRowwiseReduce(
num_dims, dims, y_dims.data(), &rows, &cols)) {
_RowwiseMoments<<<
CUDA_2D_BLOCKS(cols),
......@@ -237,7 +237,7 @@ void _Moments(
}
/*! Case #2: Colwise Reduce */
if (utils::math::IsColwiseReduce(
if (math::utils::IsColwiseReduce(
num_dims, dims, y_dims.data(), &rows, &cols)) {
_ColwiseMoments<<<
CUDA_2D_BLOCKS(rows),
......@@ -250,8 +250,8 @@ void _Moments(
/*! Case #3: Generic Reduce */
CUDA_TENSOR_DIMS_CHECK(num_dims);
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> axesT, stridesT, dimsT;
utils::math::TransposeAxesForReduce(num_dims, num_axes, axes, axesT.data);
utils::math::ComputeTransposeStrides(
math::utils::TransposeAxesForReduce(num_dims, num_axes, axes, axesT.data);
math::utils::ComputeTransposeStrides(
num_dims, dims, axesT.data, stridesT.data);
rows = cols = 1;
......
#include "dragon/utils/eigen_utils.h"
#include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......
#include "dragon/utils/omp_utils.h"
#include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......
#include "dragon/utils/omp_utils.h"
#include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......
#include "dragon/utils/omp_utils.h"
#include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......
#include "dragon/utils/omp_utils.h"
#include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......
#include "dragon/utils/omp_utils.h"
#include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......
#include "dragon/utils/cast.h"
#include "dragon/utils/conversions.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
......@@ -47,7 +47,7 @@ void _AvgPool2dNCHW(
for (int w = wstart; w < wend; ++w)
val += offset_x[h * W + w];
y[i] = val / area;
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data());
math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
}
}
......@@ -89,7 +89,7 @@ void _AvgPool2dNHWC(
for (int w = wstart; w < wend; ++w)
val += offset_x[(h * W + w) * C];
y[i] = val / area;
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data());
math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
}
}
......@@ -130,7 +130,7 @@ void _AvgPool2dGradNCHW(
for (int h = hstart; h < hend; ++h)
for (int w = wstart; w < wend; ++w)
offset_dx[h * W + w] += dy[i] / area;
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data());
math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
}
}
......@@ -170,7 +170,7 @@ void _AvgPool2dGradNHWC(
for (int h = hstart; h < hend; ++h)
for (int w = wstart; w < wend; ++w)
offset_dx[(h * W + w) * C] += dy[i] / area;
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data());
math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
}
}
......@@ -253,7 +253,7 @@ void _AvgPool2dGradNHWC(
const T* dy, \
T* dx, \
CPUContext* ctx) { \
math::Set(N* C* H* W, cast::to<T>(0.f), dx, ctx); \
math::Set(N* C* H* W, convert::To<T>(0.f), dx, ctx); \
if (data_format == "NCHW") { \
_AvgPool2dGradNCHW( \
N, \
......
#include "dragon/utils/eigen_utils.h"
#include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......
#include "dragon/utils/cast.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
......
......@@ -117,70 +117,17 @@ void _DepthwiseConv2dNHWC(
/* ------------------- Launcher Separator ------------------- */
template <>
void DepthwiseConv2d<float, CPUContext>(
const int N,
const int C,
const int H,
const int W,
const int out_h,
const int out_w,
const int kernel_h,
const int kernel_w,
const int stride_h,
const int stride_w,
const int pad_h,
const int pad_w,
const int dilation_h,
const int dilation_w,
const string& data_format,
const float* x,
const float* w,
float* y,
CPUContext* ctx) {
if (data_format == "NCHW") {
_DepthwiseConv2dNCHW(
N,
C,
H,
W,
out_h,
out_w,
kernel_h,
kernel_w,
stride_h,
stride_w,
pad_h,
pad_w,
dilation_h,
dilation_w,
x,
w,
y);
} else {
_DepthwiseConv2dNHWC(
N,
C,
H,
W,
out_h,
out_w,
kernel_h,
kernel_w,
stride_h,
stride_w,
pad_h,
pad_w,
dilation_h,
dilation_w,
x,
w,
y);
#define DISPATCH_DATA_KERNEL(name, ...) \
if (data_format == "NCHW") { \
name##NCHW(__VA_ARGS__); \
} else if (data_format == "NHWC") { \
name##NHWC(__VA_ARGS__); \
} else { \
LOG(FATAL) << "Unknown DataFormat: " << data_format; \
}
}
template <>
void DepthwiseConv2dGrad<float, CPUContext>(
void DepthwiseConv2d<float16, CPUContext>(
const int N,
const int C,
const int H,
......@@ -196,15 +143,15 @@ void DepthwiseConv2dGrad<float, CPUContext>(
const int dilation_h,
const int dilation_w,
const string& data_format,
const float* dy,
const float* w,
float* dx,
const float16* x,
const float16* w,
float16* y,
CPUContext* ctx) {
NOT_IMPLEMENTED;
} // DepthwiseConv2dGrad
CPU_FP16_NOT_SUPPORTED;
}
template <>
void DepthwiseConv2dWGrad<float, CPUContext>(
void DepthwiseConv2d<float, CPUContext>(
const int N,
const int C,
const int H,
......@@ -220,12 +167,82 @@ void DepthwiseConv2dWGrad<float, CPUContext>(
const int dilation_h,
const int dilation_w,
const string& data_format,
const float* dy,
const float* x,
float* dw,
const float* w,
float* y,
CPUContext* ctx) {
NOT_IMPLEMENTED;
} // DepthwiseConv2dWGrad
DISPATCH_DATA_KERNEL(
_DepthwiseConv2d,
N,
C,
H,
W,
out_h,
out_w,
kernel_h,
kernel_w,
stride_h,
stride_w,
pad_h,
pad_w,
dilation_h,
dilation_w,
x,
w,
y);
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void DepthwiseConv2dGrad<T, CPUContext>( \
const int N, \
const int C, \
const int H, \
const int W, \
const int out_h, \
const int out_w, \
const int kernel_h, \
const int kernel_w, \
const int stride_h, \
const int stride_w, \
const int pad_h, \
const int pad_w, \
const int dilation_h, \
const int dilation_w, \
const string& data_format, \
const T* dy, \
const T* w, \
T* dx, \
CPUContext* ctx) { \
NOT_IMPLEMENTED; \
} \
template <> \
void DepthwiseConv2dWGrad<T, CPUContext>( \
const int N, \
const int C, \
const int H, \
const int W, \
const int out_h, \
const int out_w, \
const int kernel_h, \
const int kernel_w, \
const int stride_h, \
const int stride_w, \
const int pad_h, \
const int pad_w, \
const int dilation_h, \
const int dilation_w, \
const string& data_format, \
const T* dy, \
const T* x, \
T* dw, \
CPUContext* ctx) { \
NOT_IMPLEMENTED; \
}
DEFINE_GRAD_KERNEL_LAUNCHER(float16);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
#undef DEFINE_GRAD_KERNEL_LAUNCHER
} // namespace kernel
......
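
The float launcher above now folds the hand-written NCHW/NHWC branches into the token-pasting DISPATCH_DATA_KERNEL macro. Below is a self-contained toy illustration of the same dispatch technique; the _Scale kernels and the std::cerr fallback are stand-ins, not Dragon code:

```cpp
// Token-pasting dispatch by data format, mirroring DISPATCH_DATA_KERNEL above.
// _ScaleNCHW/_ScaleNHWC are toy kernels; std::cerr replaces LOG(FATAL).
#include <iostream>
#include <string>

static void _ScaleNCHW(int n, float s) { std::cout << "NCHW " << n * s << "\n"; }
static void _ScaleNHWC(int n, float s) { std::cout << "NHWC " << n * s << "\n"; }

#define DISPATCH_DATA_KERNEL(name, ...)                 \
  if (data_format == "NCHW") {                          \
    name##NCHW(__VA_ARGS__);                            \
  } else if (data_format == "NHWC") {                   \
    name##NHWC(__VA_ARGS__);                            \
  } else {                                              \
    std::cerr << "Unknown DataFormat: " << data_format; \
  }

int main() {
  std::string data_format = "NCHW";
  DISPATCH_DATA_KERNEL(_Scale, 4, 0.5f); // expands to _ScaleNCHW(4, 0.5f);
}
```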
#include "dragon/utils/cast.h"
#include "dragon/utils/conversions.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
......@@ -52,7 +52,7 @@ void _MaxPool2dNCHW(
}
y[i] = val;
mask[i] = mxi;
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data());
math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
}
}
......@@ -99,7 +99,7 @@ void _MaxPool2dNHWC(
}
y[i] = val;
mask[i] = mxi;
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data());
math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
}
}
......@@ -129,7 +129,7 @@ void _MaxPool2dGradNCHW(
if (mask[i] != -1) {
dx[idx[0] * CHW + idx[1] * HW + mask[i]] += dy[i];
}
utils::math::IncreaseIndexInDims(3, dims.data(), idx.data());
math::utils::IncreaseIndexInDims(3, dims.data(), idx.data());
}
}
......@@ -158,7 +158,7 @@ void _MaxPool2dGradNHWC(
if (mask[i] != -1) {
dx[idx[0] * HWC + mask[i]] += dy[i];
}
utils::math::IncreaseIndexInDims(2, dims.data(), idx.data());
math::utils::IncreaseIndexInDims(2, dims.data(), idx.data());
}
}
......@@ -245,7 +245,7 @@ void _MaxPool2dGradNHWC(
const int* mask, \
T* dx, \
CPUContext* ctx) { \
math::Set(N* C* H* W, cast::to<T>(0.f), dx, ctx); \
math::Set(N* C* H* W, convert::To<T>(0.f), dx, ctx); \
if (data_format == "NCHW") { \
_MaxPool2dGradNCHW( \
N, \
......
......@@ -62,7 +62,7 @@ void _ResizeLinearNCHW(
t = tl + (tr - tl) * u;
b = bl + (br - bl) * u;
y[i] = static_cast<T>(t + (b - t) * v);
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data());
math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
}
}
......@@ -99,7 +99,7 @@ void _ResizeLinearNHWC(
t = tl + (tr - tl) * u;
b = bl + (br - bl) * u;
y[i] = static_cast<T>(t + (b - t) * v);
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data());
math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
}
}
......@@ -135,7 +135,7 @@ void _ResizeLinearGradNCHW(
dx[(offset + ti) * W + ri] += u * dt; // tr
dx[(offset + bi) * W + li] += (1.f - u) * db; // bl
dx[(offset + bi) * W + ri] += u * db; // br
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data());
math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
}
}
......@@ -171,7 +171,7 @@ void _ResizeLinearGradNHWC(
dx[((offset + ti) * W + ri) * C + idx[3]] += u * dt; // tr
dx[((offset + bi) * W + li) * C + idx[3]] += (1.f - u) * db; // bl
dx[((offset + bi) * W + ri) * C + idx[3]] += u * db; // br
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data());
math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
}
}
......
......@@ -27,7 +27,7 @@ void _ResizeNearestNCHW(
h_in = std::min(int(idx[2] * scale_h), h_max);
w_in = std::min(int(idx[3] * scale_w), w_max);
y[i] = x[(((idx[0] * C) + idx[1]) * H + h_in) * W + w_in];
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data());
math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
}
}
......@@ -52,7 +52,7 @@ void _ResizeNearestNHWC(
w_in = std::min(int(idx[2] * scale_w), w_max);
memcpy(
y + i * C, x + (((idx[0] * H) + h_in) * W + w_in) * C, C * sizeof(T));
utils::math::IncreaseIndexInDims(3, dims.data(), idx.data());
math::utils::IncreaseIndexInDims(3, dims.data(), idx.data());
}
}
......@@ -76,7 +76,7 @@ void _ResizeNearestGradNCHW(
h_in = std::min(int(idx[2] * scale_h), h_max);
w_in = std::min(int(idx[3] * scale_w), w_max);
dx[(((idx[0] * C) + idx[1]) * H + h_in) * W + w_in] += (float)dy[i];
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data());
math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
}
}
......@@ -100,7 +100,7 @@ void _ResizeNearestGradNHWC(
h_in = std::min(int(idx[1] * scale_h), h_max);
w_in = std::min(int(idx[2] * scale_w), w_max);
dx[(((idx[0] * H) + h_in) * W + w_in) * C + idx[3]] += (float)dy[i];
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data());
math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
}
}
......
......@@ -14,6 +14,7 @@
#define DRAGON_MODULES_PYTHON_CONFIG_H_
#include "dragon/modules/python/common.h"
#include "dragon/utils/device/common_eigen.h"
namespace dragon {
......@@ -22,9 +23,16 @@ namespace python {
namespace config {
void RegisterModule(py::module& m) {
/*! \brief Set the logging severity */
m.def("SetLoggingLevel", [](const string& severity) {
SetLogDestination(severity);
});
/*! \brief Set the number of threads for CPU parallelism */
m.def("SetNumThreads", [](int num) { Eigen::setNbThreads(num); });
/*! \brief Return the number of threads for CPU parallelism */
m.def("GetNumThreads", []() { return Eigen::nbThreads(); });
}
} // namespace config
......
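
The new SetNumThreads/GetNumThreads bindings above simply forward to Eigen's global thread controls. A minimal standalone sketch of those same Eigen calls follows (requires Eigen; without OpenMP, Eigen reports a single thread):

```cpp
// Direct use of the Eigen thread controls that the bindings above wrap.
// Eigen::setNbThreads(0) restores Eigen's automatic default.
#include <Eigen/Core>
#include <iostream>

int main() {
  Eigen::setNbThreads(4); // cap Eigen's CPU parallelism at 4 threads
  std::cout << "threads: " << Eigen::nbThreads() << "\n";
  Eigen::setNbThreads(0); // back to the automatic default
  std::cout << "threads: " << Eigen::nbThreads() << "\n";
}
```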
......@@ -14,7 +14,6 @@
#define DRAGON_MODULES_PYTHON_OPERATOR_H_
#include "dragon/modules/python/common.h"
#include "dragon/utils/eigen_utils.h"
namespace dragon {
......
......@@ -19,7 +19,7 @@ void ExpandOp<Context>::DoRunWithType() {
// Store for the gradient calculation
STORE_INPUT_SPEC(0);
if (utils::math::IsBinaryBroadcast(X.dims(), X_dims, Y_dims)) {
if (math::utils::IsBinaryBroadcast(X.dims(), X_dims, Y_dims)) {
math::Set(
X.ndim(),
X.dims().data(),
......@@ -47,7 +47,7 @@ void ExpandGradientOp<Context>::DoRunWithType() {
vec32_t X_broadcast_axes, _;
vec32_t Y_dims(dY.dims().begin(), dY.dims().end());
utils::math::ComputeBinaryBroadcastAxes(
math::utils::ComputeBinaryBroadcastAxes(
dX->dims(), dY.dims(), dY.dims(), X_broadcast_axes, _);
if (X_broadcast_axes.empty()) {
......
......@@ -62,7 +62,7 @@ void IndexSelectGradientOp<Context>::DoRunWithType() {
// Reset the accumulating gradient
math::Set(
dX->count(),
cast::to<T>(0.f),
convert::To<T>(0.f),
dX->template mutable_data<T, Context>(),
ctx());
......
......@@ -46,7 +46,7 @@ template <class Context>
template <typename T>
void FillOp<Context>::DoRunWithType() {
auto* y = Output(0)->template mutable_data<T, Context>();
math::Set(Output(0)->count(), cast::to<T>(value_), y, ctx());
math::Set(Output(0)->count(), convert::To<T>(value_), y, ctx());
}
template <class Context>
......
......@@ -15,7 +15,7 @@ void OneHotOp<Context>::DoRunWithType() {
// Brush the off-value over all
math::Set(
X.count() * depth_,
cast::to<T>((float)off_value_),
convert::To<T>((float)off_value_),
Y->Reshape(Y_dims)->template mutable_data<T, Context>(),
ctx());
......
......@@ -87,7 +87,7 @@ void SliceGradientOp<Context>::DoRunWithType() {
// Zero the redundant gradients
auto* dx = dX->template mutable_data<T, Context>();
math::Set(dX->count(), cast::to<T>(0.f), dx, ctx());
math::Set(dX->count(), convert::To<T>(0.f), dx, ctx());
// Copy the dY to the right positions
kernel::SliceGrad(
......
......@@ -75,7 +75,7 @@ void SplitGradientOp<Context>::DoRunWithType() {
if (!Input(i).has_name()) {
math::Set(
dX->count(),
cast::to<T>(0.f),
convert::To<T>(0.f),
dX->template mutable_data<T, Context>(),
ctx());
break;
......
......@@ -14,8 +14,8 @@ void WhereOp<Context>::DoRunWithType() {
<< "\nExpected bool or uint8 condition tensor.";
vec64_t AB_dims, Y_dims;
if (utils::math::IsBinaryBroadcast(A.dims(), B.dims(), AB_dims) &&
utils::math::IsBinaryBroadcast(AB_dims, C.dims(), Y_dims)) {
if (math::utils::IsBinaryBroadcast(A.dims(), B.dims(), AB_dims) &&
math::utils::IsBinaryBroadcast(AB_dims, C.dims(), Y_dims)) {
math::Where(
A.ndim(),
A.dims().data(),
......@@ -50,7 +50,7 @@ void WhereGradientOp<Context>::DoRunWithType() {
vec32_t A_broadcast_axes, B_broadcast_axes;
vec32_t Y_dims(dY.dims().begin(), dY.dims().end());
utils::math::ComputeBinaryBroadcastAxes(
math::utils::ComputeBinaryBroadcastAxes(
A.dims(), B.dims(), dY.dims(), A_broadcast_axes, B_broadcast_axes);
// Temporary space to store the intermediate gradient and zeros
......@@ -68,7 +68,7 @@ void WhereGradientOp<Context>::DoRunWithType() {
if (scratch_size > 0) {
scratch = ctx()->workspace()->template data<T, Context>({scratch_size})[0];
zeros = scratch + (scratch_size - 1);
math::Set(1, cast::to<T>(0.f), zeros, ctx());
math::Set(1, convert::To<T>(0.f), zeros, ctx());
}
if (dA->has_name()) {
......
......@@ -43,11 +43,11 @@ void AssignOp<Context>::DoRunWithType() {
if (X.dims() != X_dims) {
vec64_t dims1, dims2;
if (utils::math::IsBinaryBroadcast(X.dims(), X_dims, dims1)) {
if (math::utils::IsBinaryBroadcast(X.dims(), X_dims, dims1)) {
CHECK(X_dims == dims1)
<< "\nCould not assign with shapes " << X.DimString() << " "
<< Tensor::DimString(X_dims);
utils::math::ComputeBinaryBroadcastDims(X.dims(), X_dims, dims1, dims2);
math::utils::ComputeBinaryBroadcastDims(X.dims(), X_dims, dims1, dims2);
if (dims1 != dims2) {
auto* scratch = ctx()->workspace()->template data<T, Context>(
{X_broadcast.count()})[0];
......
......@@ -14,8 +14,8 @@ void MaskedAssignOp<Context>::DoRunWithType() {
<< "\nExpected bool or uint8 mask.";
vec64_t X_dims, Y_dims;
if (utils::math::IsBinaryBroadcast(X.dims(), X_mask.dims(), X_dims) &&
utils::math::IsBinaryBroadcast(X_dims, Y->dims(), Y_dims) &&
if (math::utils::IsBinaryBroadcast(X.dims(), X_mask.dims(), X_dims) &&
math::utils::IsBinaryBroadcast(X_dims, Y->dims(), Y_dims) &&
Y_dims == Y->dims()) {
math::Where(
X.ndim(),
......
......@@ -13,7 +13,7 @@ void GradientGenerateOp<Context>::DoRunWithType() {
Y->ReshapeLike(Input(i));
math::Set(
Y->count(),
cast::to<T>(defaults_[i]),
convert::To<T>(defaults_[i]),
Y->template mutable_data<T, Context>(),
ctx());
}
......
......@@ -105,7 +105,7 @@ void NLLLossGradientOp<Context>::DoRunWithType() {
auto* dx = dX->template mutable_data<LogitType, Context>();
auto* mask =
ctx()->workspace()->template data<LogitType, Context>({num_preds + 1})[0];
math::Set(dX->count(), cast::to<LogitType>(0.f), dx, ctx());
math::Set(dX->count(), convert::To<LogitType>(0.f), dx, ctx());
kernel::NLLLossGrad(
outer_dim,
......
......@@ -21,7 +21,7 @@ void AddOp<Context>::DoRunWithType() {
B.template data<T, Context>(),
Output(0, {0, 1})->Reshape(Y_dims)->template mutable_data<T, Context>(),
ctx());
} else if (utils::math::IsBinaryBroadcast(A.dims(), B.dims(), Y_dims)) {
} else if (math::utils::IsBinaryBroadcast(A.dims(), B.dims(), Y_dims)) {
auto* Y = Output(0, CheckOutputAliases(A, B, Output(0), Y_dims));
math::Add(
A.ndim(),
......@@ -51,7 +51,7 @@ void AddGradientOp<Context>::DoRunWithType() {
vec32_t A_broadcast_axes, B_broadcast_axes;
vec32_t Y_dims(dY.dims().begin(), dY.dims().end());
utils::math::ComputeBinaryBroadcastAxes(
math::utils::ComputeBinaryBroadcastAxes(
A.dims(), B.dims(), dY.dims(), A_broadcast_axes, B_broadcast_axes);
if (dA->has_name()) {
......
......@@ -21,7 +21,7 @@ void DivOp<Context>::DoRunWithType() {
B.template data<T, Context>(),
Output(0, {0, 1})->Reshape(Y_dims)->template mutable_data<T, Context>(),
ctx());
} else if (utils::math::IsBinaryBroadcast(A.dims(), B.dims(), Y_dims)) {
} else if (math::utils::IsBinaryBroadcast(A.dims(), B.dims(), Y_dims)) {
auto* Y = Output(0, CheckOutputAliases(A, B, Output(0), Y_dims));
math::Div(
A.ndim(),
......@@ -52,7 +52,7 @@ void DivGradientOp<Context>::DoRunWithType() {
vec32_t A_broadcast_axes, B_broadcast_axes;
vec32_t Y_dims(dY.dims().begin(), dY.dims().end());
utils::math::ComputeBinaryBroadcastAxes(
math::utils::ComputeBinaryBroadcastAxes(
A_ref.dims(),
B_ref.dims(),
dY.dims(),
......
......@@ -93,7 +93,7 @@ DEFINE_INPLACE_UNARY_OP_IMPL(Invert, T);
B.template data<T, Context>(), \
Y->Reshape(Y_dims)->template mutable_data<TOut, Context>(), \
ctx()); \
} else if (utils::math::IsBinaryBroadcast(A.dims(), B.dims(), Y_dims)) { \
} else if (math::utils::IsBinaryBroadcast(A.dims(), B.dims(), Y_dims)) { \
math::name( \
A.ndim(), \
A.dims().data(), \
......
......@@ -13,7 +13,7 @@ void MaximumGradientOp<Context>::DoRunWithType() {
vec32_t A_broadcast_axes, B_broadcast_axes;
vec32_t Y_dims(dY.dims().begin(), dY.dims().end());
utils::math::ComputeBinaryBroadcastAxes(
math::utils::ComputeBinaryBroadcastAxes(
A.dims(), B.dims(), dY.dims(), A_broadcast_axes, B_broadcast_axes);
  // Temporary space to store the intermediate gradient
......
......@@ -13,7 +13,7 @@ void MinimumGradientOp<Context>::DoRunWithType() {
vec32_t A_broadcast_axes, B_broadcast_axes;
vec32_t Y_dims(dY.dims().begin(), dY.dims().end());
utils::math::ComputeBinaryBroadcastAxes(
math::utils::ComputeBinaryBroadcastAxes(
A.dims(), B.dims(), dY.dims(), A_broadcast_axes, B_broadcast_axes);
  // Temporary space to store the intermediate gradient
......
......@@ -40,7 +40,7 @@ void MomentsOp<Context>::DoRunWithType() {
ctx());
math::Set(
1,
cast::to<Ty>(0.f),
convert::To<Ty>(0.f),
Y2->Reshape(Y_shape)->template mutable_data<Ty, Context>(),
ctx());
} else {
......
......@@ -21,7 +21,7 @@ void MulOp<Context>::DoRunWithType() {
B.template data<T, Context>(),
Output(0, {0, 1})->Reshape(Y_dims)->template mutable_data<T, Context>(),
ctx());
} else if (utils::math::IsBinaryBroadcast(A.dims(), B.dims(), Y_dims)) {
} else if (math::utils::IsBinaryBroadcast(A.dims(), B.dims(), Y_dims)) {
auto* Y = Output(0, CheckOutputAliases(A, B, Output(0), Y_dims));
math::Mul(
A.ndim(),
......@@ -52,7 +52,7 @@ void MulGradientOp<Context>::DoRunWithType() {
vec32_t A_broadcast_axes, B_broadcast_axes;
vec32_t Y_dims(dY.dims().begin(), dY.dims().end());
utils::math::ComputeBinaryBroadcastAxes(
math::utils::ComputeBinaryBroadcastAxes(
A_ref.dims(),
B_ref.dims(),
dY.dims(),
......
......@@ -12,7 +12,7 @@ void PowGradientOp<Context>::DoRunWithType() {
vec32_t A_broadcast_axes, B_broadcast_axes;
vec32_t Y_dims(dY.dims().begin(), dY.dims().end());
utils::math::ComputeBinaryBroadcastAxes(
math::utils::ComputeBinaryBroadcastAxes(
A.dims(), B.dims(), dY.dims(), A_broadcast_axes, B_broadcast_axes);
  // Temporary space to store the intermediate gradient
......@@ -99,7 +99,7 @@ void PowGradientOp<Context>::DoRunWithType() {
ctx());
math::ReplaceNaN(
A.count(),
cast::to<T>(0.f),
convert::To<T>(0.f),
dA->template data<T, Context>(),
dA->template mutable_data<T, Context>(),
ctx());
......@@ -141,7 +141,7 @@ void PowGradientOp<Context>::DoRunWithType() {
A.template data<T, Context>(),
scratch,
ctx());
math::ReplaceNaN(Y.count(), cast::to<T>(0.f), scratch, scratch, ctx());
math::ReplaceNaN(Y.count(), convert::To<T>(0.f), scratch, scratch, ctx());
if (B_broadcast_axes.empty()) {
math::Mul(
Y.count(), scratch, B.template data<T, Context>(), scratch, ctx());
......
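For context on the `math::ReplaceNaN` calls above: the pow gradients involve terms such as B * A^(B-1) and A^B * ln(A), which can go NaN at boundary inputs like A = 0 (e.g. 0 * ln(0)), and the convention here is to zero those entries. A tiny standalone illustration of that guard, assuming nothing beyond standard C++ (this is not `math::ReplaceNaN` itself):
// Why the pow gradient needs a NaN guard, and the replace-with-zero behavior.
#include <cmath>
#include <cstdio>
int main() {
  const float a = 0.f, b = 2.f;
  // d(a^b)/db = a^b * ln(a) -> 0 * (-inf) = NaN when a == 0.
  float db = std::pow(a, b) * std::log(a);
  if (std::isnan(db)) db = 0.f;  // Same effect as replacing NaN entries with 0.
  std::printf("db = %f\n", db);  // Prints 0.000000.
  return 0;
}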
......@@ -9,7 +9,7 @@ void SignGradientOp<Context>::DoRunWithType() {
auto &dY = Input(0), *dX = Output(0);
math::Set(
dY.count(),
cast::to<T>(0.f),
convert::To<T>(0.f),
dX->ReshapeLike(dY)->template mutable_data<T, Context>(),
ctx());
}
......
......@@ -21,7 +21,7 @@ void SubOp<Context>::DoRunWithType() {
B.template data<T, Context>(),
Output(0, {0, 1})->Reshape(Y_dims)->template mutable_data<T, Context>(),
ctx());
} else if (utils::math::IsBinaryBroadcast(A.dims(), B.dims(), Y_dims)) {
} else if (math::utils::IsBinaryBroadcast(A.dims(), B.dims(), Y_dims)) {
auto* Y = Output(0, CheckOutputAliases(A, B, Output(0), Y_dims));
math::Sub(
A.ndim(),
......@@ -51,7 +51,7 @@ void SubGradientOp<Context>::DoRunWithType() {
vec32_t A_broadcast_axes, B_broadcast_axes;
vec32_t Y_dims(dY.dims().begin(), dY.dims().end());
utils::math::ComputeBinaryBroadcastAxes(
math::utils::ComputeBinaryBroadcastAxes(
A.dims(), B.dims(), dY.dims(), A_broadcast_axes, B_broadcast_axes);
if (dA->has_name()) {
......
......@@ -19,47 +19,92 @@ void BatchNormOp<Context>::TrainingImpl() {
auto* X_bias = Buffer("X_bias")->Reshape({C_});
auto* x = Input(0).template data<InputType, Context>();
auto* gamma = Input(1).template data<ParamType, Context>();
auto* beta = Input(2).template data<ParamType, Context>();
auto* rm = Input(3).template mutable_data<ParamType, Context>();
auto* rv = Input(4).template mutable_data<ParamType, Context>();
auto* mu = X_mu->template mutable_data<ParamType, Context>();
auto* rsig = X_rsig->template mutable_data<ParamType, Context>();
auto* scale = X_scale->template mutable_data<ParamType, Context>();
auto* bias = X_bias->template mutable_data<ParamType, Context>();
auto* y = Output(0)->template mutable_data<InputType, Context>();
// Compute moments
if (data_format() == "NCHW") {
vec32_t dims = {(int)N_, (int)C_, (int)S_};
vec32_t axes = {0, 2};
kernel::Moments(3, dims.data(), 2, axes.data(), x, mu, rsig, ctx());
} else if (data_format() == "NHWC") {
vec32_t dims = {(int)(N_ * S_), (int)C_};
vec32_t axes = {0};
kernel::Moments(2, dims.data(), 1, axes.data(), x, mu, rsig, ctx());
if (sync_stats_ > 0) {
#ifdef USE_MPI
// Compute E(X) and E(X^2)
kernel::BatchNormExpectation(
N_,
C_,
S_,
ParamType(1) / (N_ * comm_size_ * S_),
data_format(),
x,
mu,
rsig,
ctx());
// Compute D(X) = E(X^2) - E(X)^2
ctx()->FinishDeviceComputation();
if (enable_nccl_) {
#ifdef USE_NCCL
auto nccl_comm_ = this->nccl_comm();
auto nccl_dtype_ = this->template nccl_dtype<ParamType>();
NCCL_CHECK(ncclAllReduce(
(void*)mu,
(void*)mu,
C_,
nccl_dtype_,
ncclSum,
nccl_comm_,
((CUDAContext*)ctx())->cuda_stream()));
NCCL_CHECK(ncclAllReduce(
(void*)rsig,
(void*)rsig,
C_,
nccl_dtype_,
ncclSum,
nccl_comm_,
((CUDAContext*)ctx())->cuda_stream()));
#endif // USE_NCCL
} else {
AllReduce(mu, mu, C_);
AllReduce(rsig, rsig, C_);
}
math::Square(C_, mu, scale, ctx());
math::Sub(C_, rsig, scale, rsig, ctx());
#endif // USE_MPI
} else {
if (data_format() == "NCHW") {
vec32_t dims = {(int)N_, (int)C_, (int)S_};
vec32_t axes = {0, 2};
kernel::Moments(3, dims.data(), 2, axes.data(), x, mu, rsig, ctx());
} else if (data_format() == "NHWC") {
vec32_t dims = {(int)(N_ * S_), (int)C_};
vec32_t axes = {0};
kernel::Moments(2, dims.data(), 1, axes.data(), x, mu, rsig, ctx());
}
}
// Compute running statistics
if (is_recomputing_ == 0) {
// Running(X) = (1 - momentum) * Cur(X) + momentum * Running(X)
math::Axpby(C_, 1.f - momentum_, mu, momentum_, rm, ctx());
math::Axpby(C_, 1.f - momentum_, rsig, momentum_, rv, ctx());
}
// Fuse parameters along channel axis
// [mu, rsig, alpha, beta] => [scale, bias]
// Inverse stddev from variance
math::InvStd(C_, epsilon_, rsig, rsig, ctx());
math::Mul(C_, gamma, rsig, scale, ctx());
math::Mul(C_, scale, mu, bias, ctx());
math::Sub(C_, beta, bias, bias, ctx());
// Compute affine transformation
if (data_format() == "NCHW") {
kernel::ChannelAffine(N_, S_, C_, x, scale, bias, y, ctx());
} else if (data_format() == "NHWC") {
kernel::ChannelAffine(N_ * S_, 1, C_, x, scale, bias, y, ctx());
}
// Fuse parameters to compute affine transformation
kernel::BatchNorm(
N_,
C_,
S_,
data_format(),
x,
mu,
rsig,
Input(1).template data<ParamType, Context>(), // gamma
Input(2).template data<ParamType, Context>(), // beta
scale,
X_bias->template mutable_data<ParamType, Context>(),
Output(0)->template mutable_data<InputType, Context>(),
ctx());
}
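For reference, the synchronized path above first produces per-replica E(X) and E(X^2) via `kernel::BatchNormExpectation`, all-reduces both across replicas, and then recovers the variance as D(X) = E(X^2) - E(X)^2. Below is a minimal single-process sketch of that identity; `GlobalMoments` and `ChannelMoments` are hypothetical names, the all-reduce is modeled as a plain sum over replica partials, and this is not the Dragon kernel.
// Sketch: global per-channel mean/variance from replica-local sums of x, x^2.
#include <cstdint>
#include <vector>
struct ChannelMoments {
  std::vector<float> mean, var;
};
ChannelMoments GlobalMoments(
    const std::vector<std::vector<float>>& sum_x,   // [replica][channel]
    const std::vector<std::vector<float>>& sum_xx,  // [replica][channel]
    int64_t count_per_replica) {                    // N * S on each replica
  const size_t R = sum_x.size(), C = sum_x[0].size();
  const double total = double(count_per_replica) * double(R);
  ChannelMoments m{std::vector<float>(C), std::vector<float>(C)};
  for (size_t c = 0; c < C; ++c) {
    double ex = 0., exx = 0.;
    for (size_t r = 0; r < R; ++r) {
      ex += sum_x[r][c];    // "AllReduce" of the E(X) partial sums
      exx += sum_xx[r][c];  // "AllReduce" of the E(X^2) partial sums
    }
    ex /= total, exx /= total;
    m.mean[c] = float(ex);
    m.var[c] = float(exx - ex * ex);  // D(X) = E(X^2) - E(X)^2
  }
  return m;
}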
template <class Context>
......@@ -70,31 +115,30 @@ void BatchNormOp<Context>::InferenceImpl() {
TENSOR_FILL_WITH_TYPE(Input(3), vec64_t({C_}), ParamType);
TENSOR_FILL_WITH_TYPE(Input(4), vec64_t({C_}), ParamType);
auto* X_rsig = Buffer("X_rsig")->Reshape({C_});
auto* X_scale = Buffer("X_scale")->Reshape({C_});
auto* X_bias = Buffer("X_bias")->Reshape({C_});
auto* x = Input(0).template data<InputType, Context>();
auto* gamma = Input(1).template data<ParamType, Context>();
auto* beta = Input(2).template data<ParamType, Context>();
auto* rm = Input(3).template data<ParamType, Context>();
auto* rv = Input(4).template data<ParamType, Context>();
auto* scale = X_scale->template mutable_data<ParamType, Context>();
auto* bias = X_bias->template mutable_data<ParamType, Context>();
auto* y = Output(0)->template mutable_data<InputType, Context>();
// Fuse parameters along channel axis
// [mu, rsig, alpha, beta] => [scale, bias]
math::InvStd(C_, epsilon_, rv, bias, ctx());
math::Mul(C_, gamma, bias, scale, ctx());
math::Mul(C_, scale, rm, bias, ctx());
math::Sub(C_, beta, bias, bias, ctx());
// Compute affine transformation
if (data_format() == "NCHW") {
kernel::ChannelAffine(N_, S_, C_, x, scale, bias, y, ctx());
} else if (data_format() == "NHWC") {
kernel::ChannelAffine(N_ * S_, 1, C_, x, scale, bias, y, ctx());
}
auto* rsig = X_rsig->template mutable_data<ParamType, Context>();
// Inverse stddev from variance
math::InvStd(C_, epsilon_, rv, rsig, ctx());
// Fuse parameters to compute affine transformation
kernel::BatchNorm(
N_,
C_,
S_,
data_format(),
Input(0).template data<InputType, Context>(),
Input(3).template data<ParamType, Context>(),
rsig,
Input(1).template data<ParamType, Context>(), // gamma
Input(2).template data<ParamType, Context>(), // beta
X_scale->template mutable_data<ParamType, Context>(),
X_bias->template mutable_data<ParamType, Context>(),
Output(0)->template mutable_data<InputType, Context>(),
ctx());
}
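Both the training and inference paths end with the same fused per-channel affine: with rsig = 1/sqrt(var + epsilon), the normalization y = gamma * (x - mu) * rsig + beta folds into y = scale * x + bias, where scale = gamma * rsig and bias = beta - scale * mu. A minimal NCHW sketch of that fusion follows; `FusedBatchNormNCHW` is an illustrative name and the real work happens inside `kernel::BatchNorm`.
// Sketch of the fused batch-norm affine for NCHW data (illustrative only).
#include <cmath>
#include <cstdint>
#include <vector>
void FusedBatchNormNCHW(
    int64_t N, int64_t C, int64_t S,  // batch, channels, spatial size
    const float* x, const float* mu, const float* var,
    const float* gamma, const float* beta, float eps, float* y) {
  std::vector<float> scale(C), bias(C);
  for (int64_t c = 0; c < C; ++c) {
    const float rsig = 1.f / std::sqrt(var[c] + eps);  // inverse stddev
    scale[c] = gamma[c] * rsig;                        // scale = gamma * rsig
    bias[c] = beta[c] - scale[c] * mu[c];              // bias = beta - scale * mu
  }
  for (int64_t n = 0; n < N; ++n)
    for (int64_t c = 0; c < C; ++c)
      for (int64_t s = 0; s < S; ++s) {
        const int64_t i = (n * C + c) * S + s;
        y[i] = scale[c] * x[i] + bias[c];  // fused per-channel affine
      }
}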
template <class Context>
......@@ -113,9 +157,15 @@ void BatchNormOp<Context>::RunOnDevice() {
} else {
InferenceImpl<float, float>();
}
} else if (Input(0).template IsType<float16>()) {
if (is_training_) {
TrainingImpl<float16, float>();
} else {
InferenceImpl<float16, float>();
}
} else {
LOG(FATAL) << MessageForUnsupported(
types::to_string(Input(0).meta()), {"float32"});
types::to_string(Input(0).meta()), {"float16", "float32"});
}
}
......@@ -124,21 +174,71 @@ template <typename InputType, typename ParamType>
void BatchNormGradientOp<Context>::TrainingImpl() {
auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
auto *X_mu = Buffer("X_mu"), *X_rsig = Buffer("X_rsig");
auto *X_scale = Buffer("X_scale"), *X_bias = Buffer("X_bias");
// Gradient w.r.t. gamma, beta and input
kernel::BatchNormBackwardTraining(
auto* x = Input(0).template data<InputType, Context>();
auto* gamma = Input(1).template data<ParamType, Context>();
auto* dy = Input(4).template data<InputType, Context>();
auto* mu = X_mu->template data<ParamType, Context>();
auto* rsig = X_rsig->template data<ParamType, Context>();
auto* scale = X_scale->template mutable_data<ParamType, Context>();
auto* bias = X_bias->template mutable_data<ParamType, Context>();
auto* dgamma = dW->Reshape({C_})->template mutable_data<ParamType, Context>();
auto* dbeta = dB->Reshape({C_})->template mutable_data<ParamType, Context>();
// Gradient w.r.t. gamma and beta
kernel::BatchNormInternalGrad(
N_, C_, S_, data_format(), x, mu, rsig, gamma, dy, dgamma, dbeta, ctx());
if (sync_stats_ > 0) {
#ifdef USE_MPI
ctx()->FinishDeviceComputation();
if (enable_nccl_) {
#ifdef USE_NCCL
auto nccl_comm_ = this->nccl_comm();
auto nccl_dtype_ = this->template nccl_dtype<ParamType>();
NCCL_CHECK(ncclAllReduce(
(void*)dgamma,
(void*)scale,
C_,
nccl_dtype_,
ncclSum,
nccl_comm_,
((CUDAContext*)ctx())->cuda_stream()));
NCCL_CHECK(ncclAllReduce(
(void*)dbeta,
(void*)bias,
C_,
nccl_dtype_,
ncclSum,
nccl_comm_,
((CUDAContext*)ctx())->cuda_stream()));
#endif // USE_NCCL
} else {
AllReduce(dgamma, scale, C_);
AllReduce(dbeta, bias, C_);
}
math::Scale(C_, ParamType(1) / comm_size_, scale, scale, ctx());
math::Scale(C_, ParamType(1) / comm_size_, bias, bias, ctx());
#endif // USE_MPI
} else {
scale = dgamma, bias = dbeta;
}
// Gradient w.r.t. input
kernel::BatchNormTrainingGrad(
N_,
C_,
S_,
data_format(),
Input(0).template data<InputType, Context>(), // x
X_mu->template data<ParamType, Context>(), // mu
X_rsig->template data<ParamType, Context>(), // rsig
Input(1).template data<ParamType, Context>(), // gamma
Input(4).template data<InputType, Context>(), // dy
Output(0)->template mutable_data<InputType, Context>(), // dx
dW->Reshape({C_})->template mutable_data<ParamType, Context>(), // dgamma
dB->Reshape({C_})->template mutable_data<ParamType, Context>(), // dbeta
x,
mu,
rsig,
gamma,
scale,
bias,
dy,
Output(0)->template mutable_data<InputType, Context>(),
ctx());
}
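The `scale`/`bias` buffers passed to `kernel::BatchNormTrainingGrad` above hold dgamma and dbeta (averaged across replicas when sync is enabled), and the input gradient presumably follows the standard batch-norm backward identity dx = gamma * rsig * (dy - (dbeta + x_hat * dgamma) / (N * S)) with x_hat = (x - mu) * rsig. A minimal NCHW sketch of that identity, under those assumptions; `BatchNormTrainingGradSketch` is a hypothetical name, not the Dragon kernel.
// Standard batch-norm training gradient w.r.t. the input, NCHW layout.
#include <cstdint>
void BatchNormTrainingGradSketch(
    int64_t N, int64_t C, int64_t S,
    const float* x, const float* mu, const float* rsig,  // rsig = 1/sqrt(var+eps)
    const float* gamma, const float* dgamma, const float* dbeta,
    const float* dy, float* dx) {
  const float denom = float(N * S);  // reduction size per channel
  for (int64_t n = 0; n < N; ++n)
    for (int64_t c = 0; c < C; ++c)
      for (int64_t s = 0; s < S; ++s) {
        const int64_t i = (n * C + c) * S + s;
        const float x_hat = (x[i] - mu[c]) * rsig[c];
        dx[i] = gamma[c] * rsig[c] *
            (dy[i] - (dbeta[c] + x_hat * dgamma[c]) / denom);
      }
}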
......@@ -158,11 +258,11 @@ void BatchNormGradientOp<Context>::InferenceImpl() {
dbeta = dB->Reshape({C_})->template mutable_data<ParamType, Context>();
}
// Restore inverse stddev from variance
// Inverse stddev from variance
math::InvStd(C_, epsilon_, rv, rsig, ctx());
// Gradient w.r.t. gamma, beta and input
kernel::BatchNormBackwardInference(
kernel::BatchNormInferenceGrad(
N_,
C_,
S_,
......@@ -172,9 +272,9 @@ void BatchNormGradientOp<Context>::InferenceImpl() {
rsig,
Input(1).template data<ParamType, Context>(), // gamma
Input(4).template data<InputType, Context>(), // dy
dX->template mutable_data<InputType, Context>(),
dgamma,
dbeta,
dX->template mutable_data<InputType, Context>(),
ctx());
}
......@@ -190,9 +290,15 @@ void BatchNormGradientOp<Context>::RunOnDevice() {
} else {
InferenceImpl<float, float>();
}
} else if (Input(0).template IsType<float16>()) {
if (is_training_ > 0) {
TrainingImpl<float16, float>();
} else {
InferenceImpl<float16, float>();
}
} else {
LOG(FATAL) << MessageForUnsupported(
types::to_string(Input(0).meta()), {"float32"});
types::to_string(Input(0).meta()), {"float16", "float32"});
}
}
......
......@@ -35,7 +35,8 @@ class BatchNormOpBase : public GenericOpBase<Context> {
: GenericOpBase<Context>(def, ws),
momentum_(OP_SINGLE_ARG(float, "momentum", 0.9f)),
epsilon_(OP_SINGLE_ARG(double, "epsilon", 1e-5)),
use_stats_(OP_SINGLE_ARG(int64_t, "use_stats", -1)) {}
use_stats_(OP_SINGLE_ARG(int64_t, "use_stats", -1)),
sync_stats_(OP_SINGLE_ARG(int64_t, "comm", 0) > 0 ? 1 : 0) {}
USE_OPERATOR_FUNCTIONS;
void DetermineBaseArguments() {
......@@ -58,7 +59,8 @@ class BatchNormOpBase : public GenericOpBase<Context> {
protected:
float momentum_;
double epsilon_;
int64_t use_stats_, N_, C_, S_;
int64_t N_, C_, S_;
int64_t use_stats_, sync_stats_;
int64_t is_training_, is_recomputing_;
};
......@@ -69,6 +71,7 @@ class BatchNormOpBase : public GenericOpBase<Context> {
using BatchNormOpBase<Context>::momentum_; \
using BatchNormOpBase<Context>::epsilon_; \
using BatchNormOpBase<Context>::use_stats_; \
using BatchNormOpBase<Context>::sync_stats_; \
using BatchNormOpBase<Context>::N_; \
using BatchNormOpBase<Context>::C_; \
using BatchNormOpBase<Context>::S_; \
......@@ -82,6 +85,9 @@ class BatchNormOp : public BatchNormOpBase<Context> {
: BatchNormOpBase<Context>(def, ws) {}
USE_OPERATOR_FUNCTIONS;
USE_BATCHNORM_FUNCTIONS;
#ifdef USE_MPI
USE_COLLECTIVE_FUNCTIONS;
#endif
void RunOnDevice() override;
......@@ -99,50 +105,19 @@ class BatchNormGradientOp : public BatchNormOpBase<Context> {
: BatchNormOpBase<Context>(def, ws) {}
USE_OPERATOR_FUNCTIONS;
USE_BATCHNORM_FUNCTIONS;
void RunOnDevice() override;
template <typename InputType, typename ParamType>
void TrainingImpl();
template <typename InputType, typename ParamType>
void InferenceImpl();
};
#ifdef USE_MPI
template <class Context>
class SyncBatchNormOp : public BatchNormOp<Context> {
public:
SyncBatchNormOp(const OperatorDef& def, Workspace* ws)
: BatchNormOp<Context>(def, ws) {}
USE_OPERATOR_FUNCTIONS;
USE_BATCHNORM_FUNCTIONS;
USE_COLLECTIVE_FUNCTIONS;
#endif
void RunOnDevice() override;
template <typename InputType, typename ParamType>
void TrainingImpl();
};
template <class Context>
class SyncBatchNormGradientOp : public BatchNormGradientOp<Context> {
public:
SyncBatchNormGradientOp(const OperatorDef& def, Workspace* ws)
: BatchNormGradientOp<Context>(def, ws) {}
USE_OPERATOR_FUNCTIONS;
USE_BATCHNORM_FUNCTIONS;
USE_COLLECTIVE_FUNCTIONS;
void RunOnDevice() override;
template <typename InputType, typename ParamType>
void TrainingImpl();
void InferenceImpl();
};
#endif // USE_MPI
#ifdef USE_CUDNN
template <class Context>
......
#ifdef USE_MPI
#include "dragon/core/workspace.h"
#include "dragon/operators/normalization/batch_norm_op.h"
#include "dragon/utils/filler.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
template <class Context>
template <typename InputType, typename ParamType>
void SyncBatchNormOp<Context>::TrainingImpl() {
TENSOR_FILL_WITH_TYPE(Input(1), vec64_t({C_}), ParamType);
TENSOR_FILL_WITH_TYPE(Input(2), vec64_t({C_}), ParamType);
TENSOR_FILL_WITH_TYPE(Input(3), vec64_t({C_}), ParamType);
TENSOR_FILL_WITH_TYPE(Input(4), vec64_t({C_}), ParamType);
auto* X_mu = Buffer("X_mu")->Reshape({C_});
auto* X_rsig = Buffer("X_rsig")->Reshape({C_});
auto* X_scale = Buffer("X_scale")->Reshape({C_});
auto* X_bias = Buffer("X_bias")->Reshape({C_});
auto* x = Input(0).template data<InputType, Context>();
auto* gamma = Input(1).template data<ParamType, Context>();
auto* beta = Input(2).template data<ParamType, Context>();
auto* rm = Input(3).template mutable_data<ParamType, Context>();
auto* rv = Input(4).template mutable_data<ParamType, Context>();
auto* mu = X_mu->template mutable_data<ParamType, Context>();
auto* rsig = X_rsig->template mutable_data<ParamType, Context>();
auto* scale = X_scale->template mutable_data<ParamType, Context>();
auto* bias = X_bias->template mutable_data<ParamType, Context>();
auto* y = Output(0)->template mutable_data<InputType, Context>();
// Compute E(X) and E(X^2)
kernel::BatchNormExpectation(
N_,
C_,
S_,
ParamType(1) / (N_ * comm_size_ * S_),
data_format(),
x,
mu,
rsig,
ctx());
// Compute D(X) = E(X^2) - E(X)^2
ctx()->FinishDeviceComputation();
if (enable_nccl_) {
#ifdef USE_NCCL
auto nccl_comm_ = this->nccl_comm();
auto nccl_dtype_ = this->template nccl_dtype<ParamType>();
NCCL_CHECK(ncclAllReduce(
(void*)mu,
(void*)mu,
C_,
nccl_dtype_,
ncclSum,
nccl_comm_,
((CUDAContext*)ctx())->cuda_stream()));
NCCL_CHECK(ncclAllReduce(
(void*)rsig,
(void*)rsig,
C_,
nccl_dtype_,
ncclSum,
nccl_comm_,
((CUDAContext*)ctx())->cuda_stream()));
#endif
} else {
AllReduce(mu, mu, C_);
AllReduce(rsig, rsig, C_);
}
math::Square(C_, mu, y, ctx());
math::Sub(C_, rsig, y, rsig, ctx());
// Compute running statistics
if (is_recomputing_ == 0) {
// Running(X) = (1 - momentum) * Cur(X) + momentum * Running(X)
math::Axpby(C_, 1.f - momentum_, mu, momentum_, rm, ctx());
math::Axpby(C_, 1.f - momentum_, rsig, momentum_, rv, ctx());
}
// Fuse parameters along channel axis
// [mu, rsig, alpha, beta] => [scale, bias]
math::InvStd(C_, epsilon_, rsig, rsig, ctx());
math::Mul(C_, gamma, rsig, scale, ctx());
math::Mul(C_, scale, mu, bias, ctx());
math::Sub(C_, beta, bias, bias, ctx());
// Compute affine transformation
if (data_format() == "NCHW") {
kernel::ChannelAffine(N_, S_, C_, x, scale, bias, y, ctx());
} else if (data_format() == "NHWC") {
kernel::ChannelAffine(N_ * S_, 1, C_, x, scale, bias, y, ctx());
}
}
template <class Context>
void SyncBatchNormOp<Context>::RunOnDevice() {
DetermineBaseArguments();
// Get the recomputing flag
auto* flag = workspace()->GetTensor("/share/flag/recomputing");
is_recomputing_ = flag->template data<bool, CPUContext>()[0] ? 1 : 0;
// Dispatch the training or inference impl
Output(0)->ReshapeLike(Input(0));
if (Input(0).template IsType<float>()) {
if (is_training_ > 0) {
TrainingImpl<float, float>();
} else {
this->template InferenceImpl<float, float>();
}
} else {
LOG(FATAL) << MessageForUnsupported(
types::to_string(Input(0).meta()), {"float32"});
}
}
template <class Context>
template <typename InputType, typename ParamType>
void SyncBatchNormGradientOp<Context>::TrainingImpl() {
auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
auto *X_mu = Buffer("X_mu"), *X_rsig = Buffer("X_rsig");
auto *X_scale = Buffer("X_scale"), *X_bias = Buffer("X_bias");
auto* x = Input(0).template data<InputType, Context>();
auto* gamma = Input(1).template data<ParamType, Context>();
auto* dy = Input(4).template data<InputType, Context>();
auto* mu = X_mu->template data<ParamType, Context>();
auto* rsig = X_rsig->template data<ParamType, Context>();
auto* scale = X_scale->template mutable_data<ParamType, Context>();
auto* bias = X_bias->template mutable_data<ParamType, Context>();
auto* dgamma = dW->Reshape({C_})->template mutable_data<ParamType, Context>();
auto* dbeta = dB->Reshape({C_})->template mutable_data<ParamType, Context>();
// Gradient w.r.t. gamma and beta of local batch
kernel::BatchNormInternalGrad(
N_, C_, S_, data_format(), x, mu, rsig, gamma, dy, dgamma, dbeta, ctx());
// Gradient w.r.t. gamma and beta of global batch
ctx()->FinishDeviceComputation();
if (enable_nccl_) {
#ifdef USE_NCCL
auto nccl_comm_ = this->nccl_comm();
auto nccl_dtype_ = this->template nccl_dtype<ParamType>();
NCCL_CHECK(ncclAllReduce(
(void*)dgamma,
(void*)scale,
C_,
nccl_dtype_,
ncclSum,
nccl_comm_,
((CUDAContext*)ctx())->cuda_stream()));
NCCL_CHECK(ncclAllReduce(
(void*)dbeta,
(void*)bias,
C_,
nccl_dtype_,
ncclSum,
nccl_comm_,
((CUDAContext*)ctx())->cuda_stream()));
#endif
} else {
AllReduce(dgamma, scale, C_);
AllReduce(dbeta, bias, C_);
}
math::Scale(C_, ParamType(1) / comm_size_, scale, scale, ctx());
math::Scale(C_, ParamType(1) / comm_size_, bias, bias, ctx());
// Gradient w.r.t. input
kernel::BatchNormTrainingGrad(
N_,
C_,
S_,
data_format(),
x,
mu,
rsig,
gamma,
scale,
bias,
dy,
Output(0)->template mutable_data<InputType, Context>(),
ctx());
}
template <class Context>
void SyncBatchNormGradientOp<Context>::RunOnDevice() {
DetermineBaseArguments();
// Dispatch the training or inference impl
Output(0)->ReshapeLike(Input(0));
if (Input(0).template IsType<float>()) {
if (is_training_ > 0) {
TrainingImpl<float, float>();
} else {
this->template InferenceImpl<float, float>();
}
} else {
LOG(FATAL) << MessageForUnsupported(
types::to_string(Input(0).meta()), {"float32"});
}
}
DEPLOY_CPU_OPERATOR(SyncBatchNorm);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(SyncBatchNorm);
#endif
DEPLOY_CPU_OPERATOR(SyncBatchNormGradient);
REGISTER_CPU_OPERATOR(SyncBatchNorm, BatchNormOp<CPUContext>);
REGISTER_CPU_OPERATOR(SyncBatchNormGradient, BatchNormGradientOp<CPUContext>);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(SyncBatchNormGradient);
REGISTER_CUDA_OPERATOR(SyncBatchNorm, BatchNormOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(SyncBatchNormGradient, BatchNormGradientOp<CUDAContext>);
#endif
OPERATOR_SCHEMA(SyncBatchNorm)
......