Commit 746f2cbb by Ting PAN

Add FP16 support for the DepthwiseConv2d and SyncBN operators

Summary:
This commit adds pseudo FP16 kernels with FP32 conversions
for the DepthwiseConv2d and SyncBN operators.
1 parent d56e67d1
Showing with 705 additions and 765 deletions
...@@ -81,6 +81,9 @@ dragon ...@@ -81,6 +81,9 @@ dragon
`function(...) <dragon/function.html>`_ `function(...) <dragon/function.html>`_
: Compile a function and return an executable. : Compile a function and return an executable.
`get_num_threads(...) <dragon/get_num_threads.html>`_
: Return the number of threads for cpu parallelism.
`get_workspace(...) <dragon/get_workspace.html>`_ `get_workspace(...) <dragon/get_workspace.html>`_
: Return the current default workspace. : Return the current default workspace.
...@@ -138,6 +141,9 @@ dragon ...@@ -138,6 +141,9 @@ dragon
`reshape(...) <dragon/reshape.html>`_ `reshape(...) <dragon/reshape.html>`_
: Change the dimensions of input. : Change the dimensions of input.
`set_num_threads(...) <dragon/set_num_threads.html>`_
: Set the number of threads for cpu parallelism.
`shape(...) <dragon/shape.html>`_ `shape(...) <dragon/shape.html>`_
: Return the shape of input. : Return the shape of input.
...@@ -204,6 +210,7 @@ dragon ...@@ -204,6 +210,7 @@ dragon
dragon/fill dragon/fill
dragon/flatten dragon/flatten
dragon/function dragon/function
dragon/get_num_threads
dragon/get_workspace dragon/get_workspace
dragon/gradients dragon/gradients
dragon/graph_mode dragon/graph_mode
...@@ -223,6 +230,7 @@ dragon ...@@ -223,6 +230,7 @@ dragon
dragon/repeat dragon/repeat
dragon/reset_workspace dragon/reset_workspace
dragon/reshape dragon/reshape
dragon/set_num_threads
dragon/shape dragon/shape
dragon/slice dragon/slice
dragon/sort dragon/sort
......
get_num_threads
===============
.. autofunction:: dragon.get_num_threads
.. raw:: html
<style>
h1:before {
content: "dragon.";
color: #103d3e;
}
</style>
set_num_threads
===============
.. autofunction:: dragon.set_num_threads
.. raw:: html
<style>
h1:before {
content: "dragon.";
color: #103d3e;
}
</style>
...@@ -18,7 +18,7 @@ ...@@ -18,7 +18,7 @@
#include "dragon/core/operator_schema.h" #include "dragon/core/operator_schema.h"
#include "dragon/core/registry.h" #include "dragon/core/registry.h"
#include "dragon/core/tensor.h" #include "dragon/core/tensor.h"
#include "dragon/utils/cast.h" #include "dragon/utils/conversions.h"
namespace dragon { namespace dragon {
......
...@@ -19,6 +19,11 @@ ...@@ -19,6 +19,11 @@
#include "dragon/core/typeid.h" #include "dragon/core/typeid.h"
#ifndef HFLT_MAX
#define HFLT_MAX 65504.F
#define HFLT_MIN 6.10e-5F
#endif
namespace dragon { namespace dragon {
typedef std::vector<int> vec32_t; typedef std::vector<int> vec32_t;
......
...@@ -34,7 +34,7 @@ void _DropBlock2dNCHW( ...@@ -34,7 +34,7 @@ void _DropBlock2dNCHW(
} }
} // Share the mask between channels } // Share the mask between channels
} }
utils::math::IncreaseIndexInDims(3, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(3, dims.data(), idx.data());
} }
} }
...@@ -65,7 +65,7 @@ void _DropBlock2dNHWC( ...@@ -65,7 +65,7 @@ void _DropBlock2dNHWC(
} }
} // Share the mask between channels } // Share the mask between channels
} }
utils::math::IncreaseIndexInDims(3, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(3, dims.data(), idx.data());
} }
} }
......
#include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/omp_utils.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
#ifdef USE_CUDA #ifdef USE_CUDA
#include "dragon/core/context_cuda.h" #include "dragon/core/context_cuda.h"
#include "dragon/utils/cast.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
...@@ -86,7 +85,7 @@ void DropPath<float16, CUDAContext>( ...@@ -86,7 +85,7 @@ void DropPath<float16, CUDAContext>(
const auto nthreads = rows * cols; \ const auto nthreads = rows * cols; \
const auto thresh = 1.f - (1.f / scale); \ const auto thresh = 1.f - (1.f / scale); \
_DropPath<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ _DropPath<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
nthreads, cols, thresh, cast::to<T>(scale), x, mask, y); \ nthreads, cols, thresh, convert::To<T>(scale), x, mask, y); \
} }
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float);
......
#include "dragon/utils/cast.h" #include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/omp_utils.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -63,28 +62,29 @@ void _Dropout<float16>( ...@@ -63,28 +62,29 @@ void _Dropout<float16>(
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \ #define DEFINE_KERNEL_LAUNCHER(T) \
template <> \ template <> \
void ApplyMask<T, CPUContext>( \ void ApplyMask<T, CPUContext>( \
const int count, \ const int count, \
const float scale, \ const float scale, \
const T* x, \ const T* x, \
const uint8_t* mask, \ const uint8_t* mask, \
T* y, \ T* y, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_ApplyMask(count, cast::to<T>(scale), x, mask, y); \ _ApplyMask(count, convert::To<T>(scale), x, mask, y); \
} \ } \
template <> \ template <> \
void Dropout<T, CPUContext>( \ void Dropout<T, CPUContext>( \
const int count, \ const int count, \
const float ratio, \ const float ratio, \
const float scale, \ const float scale, \
const T* x, \ const T* x, \
uint8_t* mask, \ uint8_t* mask, \
T* y, \ T* y, \
uint32_t* r, \ uint32_t* r, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_Dropout(count, cast::to<T>(ratio), cast::to<T>(scale), x, mask, y, ctx); \ _Dropout( \
count, convert::To<T>(ratio), convert::To<T>(scale), x, mask, y, ctx); \
} }
DEFINE_KERNEL_LAUNCHER(float16); DEFINE_KERNEL_LAUNCHER(float16);
......
#ifdef USE_CUDA #ifdef USE_CUDA
#include "dragon/core/context_cuda.h" #include "dragon/core/context_cuda.h"
#include "dragon/utils/cast.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
...@@ -113,7 +112,7 @@ void Dropout<float16, CUDAContext>( ...@@ -113,7 +112,7 @@ void Dropout<float16, CUDAContext>(
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_ApplyMask<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ _ApplyMask<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, cast::to<T>(scale), x, mask, y); \ count, convert::To<T>(scale), x, mask, y); \
} \ } \
template <> \ template <> \
void Dropout<T, CUDAContext>( \ void Dropout<T, CUDAContext>( \
...@@ -128,7 +127,7 @@ void Dropout<float16, CUDAContext>( ...@@ -128,7 +127,7 @@ void Dropout<float16, CUDAContext>(
math::Random(count, r, ctx); \ math::Random(count, r, ctx); \
auto threshold = static_cast<uint32_t>(UINT_MAX * ratio); \ auto threshold = static_cast<uint32_t>(UINT_MAX * ratio); \
_Dropout<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ _Dropout<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, threshold, cast::to<T>(scale), x, r, mask, y); \ count, threshold, convert::To<T>(scale), x, r, mask, y); \
} }
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float);
......
#include "dragon/utils/cast.h" #include "dragon/utils/conversions.h"
#include "dragon/utils/eigen_utils.h" #include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -50,19 +50,19 @@ void _EluGrad<float16>( ...@@ -50,19 +50,19 @@ void _EluGrad<float16>(
template <> \ template <> \
void Elu<T, CPUContext>( \ void Elu<T, CPUContext>( \
const int count, const float alpha, const T* x, T* y, CPUContext* ctx) { \ const int count, const float alpha, const T* x, T* y, CPUContext* ctx) { \
_Elu(count, cast::to<T>(alpha), x, y); \ _Elu(count, convert::To<T>(alpha), x, y); \
} }
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \ template <> \
void EluGrad<T, CPUContext>( \ void EluGrad<T, CPUContext>( \
const int count, \ const int count, \
const float alpha, \ const float alpha, \
const T* dy, \ const T* dy, \
const T* y, \ const T* y, \
T* dx, \ T* dx, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_EluGrad(count, cast::to<T>(alpha), dy, y, dx); \ _EluGrad(count, convert::To<T>(alpha), dy, y, dx); \
} }
DEFINE_KERNEL_LAUNCHER(float16); DEFINE_KERNEL_LAUNCHER(float16);
......
#include "dragon/utils/cast.h" #include "dragon/utils/conversions.h"
#include "dragon/utils/eigen_utils.h" #include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -56,28 +56,28 @@ void _HardSigmoidGrad<float16>( ...@@ -56,28 +56,28 @@ void _HardSigmoidGrad<float16>(
} // namespace } // namespace
#define DEFINE_KERNEL_LAUNCHER(T) \ #define DEFINE_KERNEL_LAUNCHER(T) \
template <> \ template <> \
void HardSigmoid<T, CPUContext>( \ void HardSigmoid<T, CPUContext>( \
const int count, \ const int count, \
const float alpha, \ const float alpha, \
const float beta, \ const float beta, \
const T* x, \ const T* x, \
T* y, \ T* y, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_HardSigmoid(count, cast::to<T>(alpha), cast::to<T>(beta), x, y); \ _HardSigmoid(count, convert::To<T>(alpha), convert::To<T>(beta), x, y); \
} }
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \ template <> \
void HardSigmoidGrad<T, CPUContext>( \ void HardSigmoidGrad<T, CPUContext>( \
const int count, \ const int count, \
const float alpha, \ const float alpha, \
const T* dy, \ const T* dy, \
const T* y, \ const T* y, \
T* dx, \ T* dx, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_HardSigmoidGrad(count, cast::to<T>(alpha), dy, y, dx); \ _HardSigmoidGrad(count, convert::To<T>(alpha), dy, y, dx); \
} }
DEFINE_KERNEL_LAUNCHER(float16); DEFINE_KERNEL_LAUNCHER(float16);
......
#include "dragon/utils/cast.h" #include "dragon/utils/conversions.h"
#include "dragon/utils/eigen_utils.h" #include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -59,29 +59,30 @@ void _HardSwishGrad<float16>( ...@@ -59,29 +59,30 @@ void _HardSwishGrad<float16>(
} // namespace } // namespace
#define DEFINE_KERNEL_LAUNCHER(T) \ #define DEFINE_KERNEL_LAUNCHER(T) \
template <> \ template <> \
void HardSwish<T, CPUContext>( \ void HardSwish<T, CPUContext>( \
const int count, \ const int count, \
const float alpha, \ const float alpha, \
const float beta, \ const float beta, \
const T* x, \ const T* x, \
T* y, \ T* y, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_HardSwish(count, cast::to<T>(alpha), cast::to<T>(beta), x, y); \ _HardSwish(count, convert::To<T>(alpha), convert::To<T>(beta), x, y); \
} }
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \ template <> \
void HardSwishGrad<T, CPUContext>( \ void HardSwishGrad<T, CPUContext>( \
const int count, \ const int count, \
const float alpha, \ const float alpha, \
const float beta, \ const float beta, \
const T* dy, \ const T* dy, \
const T* x, \ const T* x, \
T* dx, \ T* dx, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_HardSwishGrad(count, cast::to<T>(alpha), cast::to<T>(beta), dy, x, dx); \ _HardSwishGrad( \
count, convert::To<T>(alpha), convert::To<T>(beta), dy, x, dx); \
} }
DEFINE_KERNEL_LAUNCHER(float16); DEFINE_KERNEL_LAUNCHER(float16);
......
#include "dragon/utils/eigen_utils.h" #include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
......
#include "dragon/utils/cast.h" #include "dragon/utils/conversions.h"
#include "dragon/utils/eigen_utils.h" #include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -87,7 +87,7 @@ void _ReluNGrad<float16>( ...@@ -87,7 +87,7 @@ void _ReluNGrad<float16>(
template <> \ template <> \
void Relu<T, CPUContext>( \ void Relu<T, CPUContext>( \
const int count, const float alpha, const T* x, T* y, CPUContext* ctx) { \ const int count, const float alpha, const T* x, T* y, CPUContext* ctx) { \
_Relu(count, cast::to<T>(alpha), x, y); \ _Relu(count, convert::To<T>(alpha), x, y); \
} \ } \
template <> \ template <> \
void ReluN<T, CPUContext>( \ void ReluN<T, CPUContext>( \
...@@ -96,29 +96,29 @@ void _ReluNGrad<float16>( ...@@ -96,29 +96,29 @@ void _ReluNGrad<float16>(
const T* x, \ const T* x, \
T* y, \ T* y, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_ReluN(count, cast::to<T>(max_value), x, y); \ _ReluN(count, convert::To<T>(max_value), x, y); \
} }
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \ template <> \
void ReluGrad<T, CPUContext>( \ void ReluGrad<T, CPUContext>( \
const int count, \ const int count, \
const float alpha, \ const float alpha, \
const T* dy, \ const T* dy, \
const T* y, \ const T* y, \
T* dx, \ T* dx, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_ReluGrad(count, cast::to<T>(alpha), dy, y, dx); \ _ReluGrad(count, convert::To<T>(alpha), dy, y, dx); \
} \ } \
template <> \ template <> \
void ReluNGrad<T, CPUContext>( \ void ReluNGrad<T, CPUContext>( \
const int count, \ const int count, \
const float max_value, \ const float max_value, \
const T* dy, \ const T* dy, \
const T* y, \ const T* y, \
T* dx, \ T* dx, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_ReluNGrad(count, cast::to<T>(max_value), dy, y, dx); \ _ReluNGrad(count, convert::To<T>(max_value), dy, y, dx); \
} }
DEFINE_KERNEL_LAUNCHER(float16); DEFINE_KERNEL_LAUNCHER(float16);
......
#ifdef USE_CUDA #ifdef USE_CUDA
#include "dragon/core/context_cuda.h" #include "dragon/core/context_cuda.h"
#include "dragon/utils/cast.h" #include "dragon/utils/conversions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -287,13 +287,13 @@ void ReluN<float16, CUDAContext>( ...@@ -287,13 +287,13 @@ void ReluN<float16, CUDAContext>(
0, 0,
ctx->cuda_stream()>>>( ctx->cuda_stream()>>>(
count >> 1, count >> 1,
cast::to<half>(max_value), convert::To<half>(max_value),
reinterpret_cast<const half2*>(x), reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y)); reinterpret_cast<half2*>(y));
} else { } else {
_ReluN<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( _ReluN<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
count, count,
cast::to<half>(max_value), convert::To<half>(max_value),
reinterpret_cast<const half*>(x), reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y)); reinterpret_cast<half*>(y));
} }
...@@ -339,14 +339,14 @@ void ReluNGrad<float16, CUDAContext>( ...@@ -339,14 +339,14 @@ void ReluNGrad<float16, CUDAContext>(
0, 0,
ctx->cuda_stream()>>>( ctx->cuda_stream()>>>(
count >> 1, count >> 1,
cast::to<half2>(max_value), convert::To<half2>(max_value),
reinterpret_cast<const half2*>(dy), reinterpret_cast<const half2*>(dy),
reinterpret_cast<const half2*>(y), reinterpret_cast<const half2*>(y),
reinterpret_cast<half2*>(dx)); reinterpret_cast<half2*>(dx));
} else { } else {
_ReluNGrad<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( _ReluNGrad<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
count, count,
cast::to<half>(max_value), convert::To<half>(max_value),
reinterpret_cast<const half*>(dy), reinterpret_cast<const half*>(dy),
reinterpret_cast<const half*>(y), reinterpret_cast<const half*>(y),
reinterpret_cast<half*>(dx)); reinterpret_cast<half*>(dx));
...@@ -362,7 +362,7 @@ void ReluNGrad<float16, CUDAContext>( ...@@ -362,7 +362,7 @@ void ReluNGrad<float16, CUDAContext>(
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_Relu<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ _Relu<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, cast::to<T>(alpha), x, y); \ count, convert::To<T>(alpha), x, y); \
} \ } \
template <> \ template <> \
void ReluN<T, CUDAContext>( \ void ReluN<T, CUDAContext>( \
...@@ -372,7 +372,7 @@ void ReluNGrad<float16, CUDAContext>( ...@@ -372,7 +372,7 @@ void ReluNGrad<float16, CUDAContext>(
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_ReluN<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ _ReluN<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, cast::to<T>(max_value), x, y); \ count, convert::To<T>(max_value), x, y); \
} }
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
...@@ -385,7 +385,7 @@ void ReluNGrad<float16, CUDAContext>( ...@@ -385,7 +385,7 @@ void ReluNGrad<float16, CUDAContext>(
T* dx, \ T* dx, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_ReluGrad<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ _ReluGrad<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, cast::to<T>(alpha), dy, y, dx); \ count, convert::To<T>(alpha), dy, y, dx); \
} \ } \
template <> \ template <> \
void ReluNGrad<T, CUDAContext>( \ void ReluNGrad<T, CUDAContext>( \
...@@ -396,7 +396,7 @@ void ReluNGrad<float16, CUDAContext>( ...@@ -396,7 +396,7 @@ void ReluNGrad<float16, CUDAContext>(
T* dx, \ T* dx, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_ReluNGrad<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ _ReluNGrad<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, cast::to<T>(max_value), dy, y, dx); \ count, convert::To<T>(max_value), dy, y, dx); \
} }
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float);
......
#include "dragon/utils/cast.h" #include "dragon/utils/conversions.h"
#include "dragon/utils/eigen_utils.h" #include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -57,29 +57,29 @@ void _SeluGrad<float16>( ...@@ -57,29 +57,29 @@ void _SeluGrad<float16>(
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \ #define DEFINE_KERNEL_LAUNCHER(T) \
template <> \ template <> \
void Selu<T, CPUContext>( \ void Selu<T, CPUContext>( \
const int count, \ const int count, \
const float alpha, \ const float alpha, \
const float gamma, \ const float gamma, \
const T* x, \ const T* x, \
T* y, \ T* y, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_Selu(count, cast::to<T>(alpha), cast::to<T>(gamma), x, y); \ _Selu(count, convert::To<T>(alpha), convert::To<T>(gamma), x, y); \
} }
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \ template <> \
void SeluGrad<T, CPUContext>( \ void SeluGrad<T, CPUContext>( \
const int count, \ const int count, \
const float alpha, \ const float alpha, \
const float gamma, \ const float gamma, \
const T* dy, \ const T* dy, \
const T* y, \ const T* y, \
T* dx, \ T* dx, \
CPUContext* tx) { \ CPUContext* tx) { \
_SeluGrad(count, cast::to<T>(alpha), cast::to<T>(gamma), dy, y, dx); \ _SeluGrad(count, convert::To<T>(alpha), convert::To<T>(gamma), dy, y, dx); \
} }
DEFINE_KERNEL_LAUNCHER(float16); DEFINE_KERNEL_LAUNCHER(float16);
......
#ifdef USE_CUDA #ifdef USE_CUDA
#include "dragon/core/context_cuda.h" #include "dragon/core/context_cuda.h"
#include "dragon/utils/cast.h" #include "dragon/utils/conversions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
#include "dragon/utils/eigen_utils.h" #include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
#ifdef USE_CUDA #ifdef USE_CUDA
#include "dragon/core/context_cuda.h" #include "dragon/core/context_cuda.h"
#include "dragon/utils/cast.h"
#include "dragon/utils/device/common_cub.h" #include "dragon/utils/device/common_cub.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
...@@ -200,7 +199,7 @@ void Softmax<float16, CUDAContext>( ...@@ -200,7 +199,7 @@ void Softmax<float16, CUDAContext>(
rows, rows,
cols, cols,
inner_dim, inner_dim,
cast::to<half>(std::numeric_limits<float>::lowest()), convert::To<half>(std::numeric_limits<float>::lowest()),
reinterpret_cast<const half*>(x), reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y)); reinterpret_cast<half*>(y));
} }
......
#include "dragon/utils/eigen_utils.h" #include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
#include "dragon/utils/eigen_utils.h" #include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
...@@ -35,7 +35,7 @@ __global__ void _Tanh<half2>(const int nthreads, const half2* x, half2* y) { ...@@ -35,7 +35,7 @@ __global__ void _Tanh<half2>(const int nthreads, const half2* x, half2* y) {
template <typename T> template <typename T>
__global__ void _TanhGrad(const int nthreads, const T* dy, const T* y, T* dx) { __global__ void _TanhGrad(const int nthreads, const T* dy, const T* y, T* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx[i] = dy[i] * (T(1) - utils::math::Square(y[i])); dx[i] = dy[i] * (T(1) - math::utils::Square(y[i]));
} }
} }
...@@ -44,7 +44,7 @@ __global__ void ...@@ -44,7 +44,7 @@ __global__ void
_TanhGrad<half>(const int nthreads, const half* dy, const half* y, half* dx) { _TanhGrad<half>(const int nthreads, const half* dy, const half* y, half* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx[i] = __float2half( dx[i] = __float2half(
__half2float(dy[i]) * (1.f - utils::math::Square(__half2float(y[i])))); __half2float(dy[i]) * (1.f - math::utils::Square(__half2float(y[i]))));
} }
} }
...@@ -58,8 +58,8 @@ __global__ void _TanhGrad<half2>( ...@@ -58,8 +58,8 @@ __global__ void _TanhGrad<half2>(
const float2 val = __half22float2(y[i]); const float2 val = __half22float2(y[i]);
const float2 grad = __half22float2(dy[i]); const float2 grad = __half22float2(dy[i]);
dx[i] = __floats2half2_rn( dx[i] = __floats2half2_rn(
grad.x * (1.f - utils::math::Square(val.x)), grad.x * (1.f - math::utils::Square(val.x)),
grad.y * (1.f - utils::math::Square(val.y))); grad.y * (1.f - math::utils::Square(val.y)));
} }
} }
......
#include "dragon/utils/eigen_utils.h" #include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
...@@ -28,7 +28,7 @@ void _ChannelNormalize( ...@@ -28,7 +28,7 @@ void _ChannelNormalize(
if (d == axis) wi = idx[d]; if (d == axis) wi = idx[d];
} }
y[yi] = ((Ty)x[xi] - (Ty)mean[wi]) / (Ty)std[wi]; y[yi] = ((Ty)x[xi] - (Ty)mean[wi]) / (Ty)std[wi];
utils::math::IncreaseIndexInDims(num_dims, y_dims, idx.data()); math::utils::IncreaseIndexInDims(num_dims, y_dims, idx.data());
} }
} }
......
...@@ -26,7 +26,7 @@ void _CumSum( ...@@ -26,7 +26,7 @@ void _CumSum(
} else { } else {
y[i] = exclusive ? T(0) : x[i]; y[i] = exclusive ? T(0) : x[i];
} }
utils::math::IncreaseIndexInDims(3, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(3, dims.data(), idx.data());
} }
} }
......
#include "dragon/utils/cast.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
...@@ -11,7 +10,7 @@ namespace { ...@@ -11,7 +10,7 @@ namespace {
template <typename T> template <typename T>
void _SetEye(const int n, const int m, const int k, T* y) { void _SetEye(const int n, const int m, const int k, T* y) {
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
y[i * m + k + i] = cast::to<T>(1.f); y[i * m + k + i] = convert::To<T>(1.f);
} }
} }
...@@ -23,7 +22,7 @@ void _SetEye(const int n, const int m, const int k, T* y) { ...@@ -23,7 +22,7 @@ void _SetEye(const int n, const int m, const int k, T* y) {
template <> \ template <> \
void Eye<T, CPUContext>( \ void Eye<T, CPUContext>( \
const int n, const int m, const int k, T* y, CPUContext* ctx) { \ const int n, const int m, const int k, T* y, CPUContext* ctx) { \
math::Set(n* m, cast::to<T>(0.f), y, ctx); \ math::Set(n* m, convert::To<T>(0.f), y, ctx); \
if (k > 0) { \ if (k > 0) { \
if (m - k > 0) _SetEye(m - k, m, k, y); \ if (m - k > 0) _SetEye(m - k, m, k, y); \
} else { \ } else { \
......
#ifdef USE_CUDA #ifdef USE_CUDA
#include "dragon/core/context_cuda.h" #include "dragon/core/context_cuda.h"
#include "dragon/utils/cast.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
...@@ -37,7 +36,7 @@ void Eye<float16, CUDAContext>( ...@@ -37,7 +36,7 @@ void Eye<float16, CUDAContext>(
const int k, const int k,
float16* y, float16* y,
CUDAContext* ctx) { CUDAContext* ctx) {
math::Set(n * m, cast::to<float16>(0.f), y, ctx); math::Set(n * m, convert::To<float16>(0.f), y, ctx);
if (k > 0) { if (k > 0) {
if (m - k > 0) { if (m - k > 0) {
_SetEye<<<CUDA_BLOCKS(m - k), CUDA_THREADS, 0, ctx->cuda_stream()>>>( _SetEye<<<CUDA_BLOCKS(m - k), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
......
#include "dragon/utils/cast.h" #include "dragon/utils/conversions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -16,12 +16,12 @@ void _RowwiseLinSpace( ...@@ -16,12 +16,12 @@ void _RowwiseLinSpace(
T* y) { T* y) {
for (int i = 0; i < cols; ++i) { for (int i = 0; i < cols; ++i) {
const auto delta = (stop[i] - start[i]) / double(rows - 1); const auto delta = (stop[i] - start[i]) / double(rows - 1);
y[i] = cast::to<T>(start[i]); y[i] = convert::To<T>(start[i]);
if (rows > 1) { if (rows > 1) {
y[i + (rows - 1) * cols] = cast::to<T>(stop[i]); y[i + (rows - 1) * cols] = convert::To<T>(stop[i]);
} }
for (int j = 1; j < rows - 1; ++j) { for (int j = 1; j < rows - 1; ++j) {
y[i + j * cols] = cast::to<T>(start[i] + double(j) * delta); y[i + j * cols] = convert::To<T>(start[i] + double(j) * delta);
} }
} }
} }
...@@ -36,12 +36,12 @@ void _ColwiseLinSpace( ...@@ -36,12 +36,12 @@ void _ColwiseLinSpace(
for (int i = 0; i < rows; ++i) { for (int i = 0; i < rows; ++i) {
const auto delta = (stop[i] - start[i]) / double(cols - 1); const auto delta = (stop[i] - start[i]) / double(cols - 1);
auto* offset_y = y + i * cols; auto* offset_y = y + i * cols;
offset_y[0] = cast::to<T>(start[i]); offset_y[0] = convert::To<T>(start[i]);
if (cols > 1) { if (cols > 1) {
offset_y[cols - 1] = cast::to<T>(stop[i]); offset_y[cols - 1] = convert::To<T>(stop[i]);
} }
for (int j = 1; j < cols - 1; ++j) { for (int j = 1; j < cols - 1; ++j) {
offset_y[j] = cast::to<T>(start[i] + double(j) * delta); offset_y[j] = convert::To<T>(start[i] + double(j) * delta);
} }
} }
} }
......
#include "dragon/utils/cast.h" #include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/omp_utils.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -52,17 +51,17 @@ void _MaskedSelectGrad( ...@@ -52,17 +51,17 @@ void _MaskedSelectGrad(
_MaskedSelect(num_selected, index, x, y); \ _MaskedSelect(num_selected, index, x, y); \
} }
#define DEFINE_GRAD_KERNEL_LAUNCHER(IndexType, ValueType) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(IndexType, ValueType) \
template <> \ template <> \
void MaskedSelectGrad<IndexType, ValueType, CPUContext>( \ void MaskedSelectGrad<IndexType, ValueType, CPUContext>( \
const int count, \ const int count, \
const int num_selected, \ const int num_selected, \
const IndexType* index, \ const IndexType* index, \
const ValueType* dy, \ const ValueType* dy, \
ValueType* dx, \ ValueType* dx, \
CPUContext* ctx) { \ CPUContext* ctx) { \
math::Set(count, cast::to<ValueType>(0.f), dx, ctx); \ math::Set(count, convert::To<ValueType>(0.f), dx, ctx); \
_MaskedSelectGrad(num_selected, index, dy, dx); \ _MaskedSelectGrad(num_selected, index, dy, dx); \
} }
DEFINE_KERNEL_LAUNCHER(int, bool); DEFINE_KERNEL_LAUNCHER(int, bool);
......
#ifdef USE_CUDA #ifdef USE_CUDA
#include "dragon/core/context_cuda.h" #include "dragon/core/context_cuda.h"
#include "dragon/utils/cast.h" #include "dragon/utils/conversions.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
...@@ -61,7 +61,7 @@ __global__ void _MaskedSelectGrad( ...@@ -61,7 +61,7 @@ __global__ void _MaskedSelectGrad(
const ValueType* dy, \ const ValueType* dy, \
ValueType* dx, \ ValueType* dx, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
math::Set(count, cast::to<ValueType>(0.f), dx, ctx); \ math::Set(count, convert::To<ValueType>(0.f), dx, ctx); \
_MaskedSelectGrad<<< \ _MaskedSelectGrad<<< \
CUDA_BLOCKS(num_selected), \ CUDA_BLOCKS(num_selected), \
CUDA_THREADS, \ CUDA_THREADS, \
......
#include "dragon/utils/omp_utils.h" #include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
#include "dragon/utils/cast.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
...@@ -30,7 +29,7 @@ void _ConstPad( ...@@ -30,7 +29,7 @@ void _ConstPad(
xi += r * x_strides[d]; xi += r * x_strides[d];
} }
y[yi] = d >= 0 ? value : x[xi]; y[yi] = d >= 0 ? value : x[xi];
utils::math::IncreaseIndexInDims(num_dims, y_dims, index.data()); math::utils::IncreaseIndexInDims(num_dims, y_dims, index.data());
} }
} }
...@@ -56,7 +55,7 @@ void _ReflectPad( ...@@ -56,7 +55,7 @@ void _ReflectPad(
xi += r * x_strides[d]; xi += r * x_strides[d];
} }
y[yi] = x[xi]; y[yi] = x[xi];
utils::math::IncreaseIndexInDims(num_dims, y_dims, index.data()); math::utils::IncreaseIndexInDims(num_dims, y_dims, index.data());
} }
} }
...@@ -80,7 +79,7 @@ void _EdgePad( ...@@ -80,7 +79,7 @@ void _EdgePad(
xi += r * x_strides[d]; xi += r * x_strides[d];
} }
y[yi] = x[xi]; y[yi] = x[xi];
utils::math::IncreaseIndexInDims(num_dims, y_dims, index.data()); math::utils::IncreaseIndexInDims(num_dims, y_dims, index.data());
} }
} }
...@@ -102,20 +101,27 @@ void _EdgePad( ...@@ -102,20 +101,27 @@ void _EdgePad(
_##name(num_dims, x_dims, x_strides, y_dims, pads, x, y); \ _##name(num_dims, x_dims, x_strides, y_dims, pads, x, y); \
} }
#define DEFINE_CONST_KERNEL_LAUNCHER(T) \ #define DEFINE_CONST_KERNEL_LAUNCHER(T) \
template <> \ template <> \
void ConstPad<T, CPUContext>( \ void ConstPad<T, CPUContext>( \
const int num_dims, \ const int num_dims, \
const int64_t* x_dims, \ const int64_t* x_dims, \
const int64_t* x_strides, \ const int64_t* x_strides, \
const int64_t* y_dims, \ const int64_t* y_dims, \
const int64_t* pads, \ const int64_t* pads, \
const float value, \ const float value, \
const T* x, \ const T* x, \
T* y, \ T* y, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_ConstPad( \ _ConstPad( \
num_dims, x_dims, x_strides, y_dims, pads, cast::to<T>(value), x, y); \ num_dims, \
x_dims, \
x_strides, \
y_dims, \
pads, \
convert::To<T>(value), \
x, \
y); \
} }
DEFINE_CONST_KERNEL_LAUNCHER(bool); DEFINE_CONST_KERNEL_LAUNCHER(bool);
......
#ifdef USE_CUDA #ifdef USE_CUDA
#include "dragon/core/context_cuda.h" #include "dragon/core/context_cuda.h"
#include "dragon/utils/cast.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
...@@ -114,7 +113,7 @@ __global__ void _EdgePad( ...@@ -114,7 +113,7 @@ __global__ void _EdgePad(
X_strides, \ X_strides, \
Y_dims, \ Y_dims, \
X_pads, \ X_pads, \
cast::to<T>(value), \ convert::To<T>(value), \
x, \ x, \
y); \ y); \
} }
......
#include "dragon/utils/cast.h" #include "dragon/utils/conversions.h"
#include "dragon/utils/omp_utils.h" #include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -14,7 +14,7 @@ void _Range(const int count, const double start, const double delta, T* y) { ...@@ -14,7 +14,7 @@ void _Range(const int count, const double start, const double delta, T* y) {
#pragma omp parallel for num_threads(OMP_THREADS(count)) #pragma omp parallel for num_threads(OMP_THREADS(count))
#endif #endif
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
y[i] = cast::to<T>(start + double(i) * delta); y[i] = convert::To<T>(start + double(i) * delta);
} }
} }
......
...@@ -26,7 +26,7 @@ void _ReduceSumGrad( ...@@ -26,7 +26,7 @@ void _ReduceSumGrad(
yi += (index[d] % y_dims[d]) * y_strides[d]; yi += (index[d] % y_dims[d]) * y_strides[d];
} }
dx[xi] = dy[yi] * scale; dx[xi] = dy[yi] * scale;
utils::math::IncreaseIndexInDims(num_dims, x_dims, index.data()); math::utils::IncreaseIndexInDims(num_dims, x_dims, index.data());
} }
} }
......
#include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/omp_utils.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
...@@ -25,7 +25,7 @@ void _Slice( ...@@ -25,7 +25,7 @@ void _Slice(
xi += (index[d] + starts[d]) * x_strides[d]; xi += (index[d] + starts[d]) * x_strides[d];
} }
y[yi] = x[xi]; y[yi] = x[xi];
utils::math::IncreaseIndexInDims(num_dims, y_dims, index.data()); math::utils::IncreaseIndexInDims(num_dims, y_dims, index.data());
} }
} }
...@@ -47,7 +47,7 @@ void _SliceGrad( ...@@ -47,7 +47,7 @@ void _SliceGrad(
xi += (index[d] + starts[d]) * x_strides[d]; xi += (index[d] + starts[d]) * x_strides[d];
} }
dx[xi] = dy[yi]; dx[xi] = dy[yi];
utils::math::IncreaseIndexInDims(num_dims, y_dims, index.data()); math::utils::IncreaseIndexInDims(num_dims, y_dims, index.data());
} }
} }
......
...@@ -25,7 +25,7 @@ void _Tile( ...@@ -25,7 +25,7 @@ void _Tile(
xi += (index[d] % x_dims[d]) * x_strides[d]; xi += (index[d] % x_dims[d]) * x_strides[d];
} }
y[i] = x[xi]; y[i] = x[xi];
utils::math::IncreaseIndexInDims(num_dims, y_dims, index.data()); math::utils::IncreaseIndexInDims(num_dims, y_dims, index.data());
} }
} }
......
...@@ -162,7 +162,7 @@ __global__ void _SelectViaDeviceSort( ...@@ -162,7 +162,7 @@ __global__ void _SelectViaDeviceSort(
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
#define PLACE_BLOCK_SORT_CASE(T, items_per_thread) \ #define BLOCKSORT_KERNEL(T, items_per_thread) \
_SelectViaBlockSort<T, items_per_thread> \ _SelectViaBlockSort<T, items_per_thread> \
<<<CUDA_2D_BLOCKS(rows), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ <<<CUDA_2D_BLOCKS(rows), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
rows, \ rows, \
...@@ -175,15 +175,15 @@ __global__ void _SelectViaDeviceSort( ...@@ -175,15 +175,15 @@ __global__ void _SelectViaDeviceSort(
reinterpret_cast<T*>(value), \ reinterpret_cast<T*>(value), \
index) index)
#define PLACE_BLOCK_SORT_CASES(T) \ #define DISPATCH_BLOCKSORT_KERNEL(T) \
if (cols <= CUDA_THREADS) { \ if (cols <= CUDA_THREADS) { \
PLACE_BLOCK_SORT_CASE(T, 1); \ BLOCKSORT_KERNEL(T, 1); \
} else if (cols <= CUDA_THREADS * 2) { \ } else if (cols <= CUDA_THREADS * 2) { \
PLACE_BLOCK_SORT_CASE(T, 2); \ BLOCKSORT_KERNEL(T, 2); \
} else if (cols <= CUDA_THREADS * 4) { \ } else if (cols <= CUDA_THREADS * 4) { \
PLACE_BLOCK_SORT_CASE(T, 4); \ BLOCKSORT_KERNEL(T, 4); \
} else if (cols <= CUDA_THREADS * 8) { \ } else if (cols <= CUDA_THREADS * 8) { \
PLACE_BLOCK_SORT_CASE(T, 8); \ BLOCKSORT_KERNEL(T, 8); \
} else { \ } else { \
LOG(FATAL) << "Too larger dimension (> " << CUDA_THREADS * 8 \ LOG(FATAL) << "Too larger dimension (> " << CUDA_THREADS * 8 \
<< ") to launch the cuda kernel"; \ << ") to launch the cuda kernel"; \
...@@ -238,7 +238,7 @@ __global__ void _SelectViaDeviceSort( ...@@ -238,7 +238,7 @@ __global__ void _SelectViaDeviceSort(
return; \ return; \
} \ } \
T2 init = largest > 0 ? kLowest : kMax; \ T2 init = largest > 0 ? kLowest : kMax; \
PLACE_BLOCK_SORT_CASES(T2); \ DISPATCH_BLOCKSORT_KERNEL(T2); \
} }
DEFINE_KERNEL_LAUNCHER( DEFINE_KERNEL_LAUNCHER(
...@@ -277,8 +277,8 @@ DEFINE_KERNEL_LAUNCHER( ...@@ -277,8 +277,8 @@ DEFINE_KERNEL_LAUNCHER(
std::numeric_limits<double>::lowest(), std::numeric_limits<double>::lowest(),
std::numeric_limits<double>::max()); std::numeric_limits<double>::max());
#undef PLACE_BLOCK_SORT_CASE #undef BLOCK_SORTKERNEL
#undef PLACE_BLOCK_SORT_CASES #undef DISPATCH_BLOCKSORT_KERNEL
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel } // namespace kernel
......
...@@ -24,7 +24,7 @@ void _Transpose( ...@@ -24,7 +24,7 @@ void _Transpose(
xi += index[d] * x_strides[d]; xi += index[d] * x_strides[d];
} }
y[yi] = x[xi]; y[yi] = x[xi];
utils::math::IncreaseIndexInDims(num_dims, y_dims, index.data()); math::utils::IncreaseIndexInDims(num_dims, y_dims, index.data());
} }
} }
...@@ -45,7 +45,7 @@ void _TransposeGrad( ...@@ -45,7 +45,7 @@ void _TransposeGrad(
xi += index[d] * x_strides[d]; xi += index[d] * x_strides[d];
} }
dx[xi] = dy[yi]; dx[xi] = dy[yi];
utils::math::IncreaseIndexInDims(num_dims, y_dims, index.data()); math::utils::IncreaseIndexInDims(num_dims, y_dims, index.data());
} }
} }
......
...@@ -25,7 +25,7 @@ void _Assign( ...@@ -25,7 +25,7 @@ void _Assign(
yi += (index[d] + starts[d]) * y_strides[d]; yi += (index[d] + starts[d]) * y_strides[d];
} }
y[yi] = x[i]; y[yi] = x[i];
utils::math::IncreaseIndexInDims(num_dims, x_dims, index.data()); math::utils::IncreaseIndexInDims(num_dims, x_dims, index.data());
} }
} }
......
...@@ -19,7 +19,7 @@ void _BroadcastLossGrad( ...@@ -19,7 +19,7 @@ void _BroadcastLossGrad(
const int count = outer_dim * axis_dim * inner_dim; const int count = outer_dim * axis_dim * inner_dim;
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
dx[i] *= dy[idx[0] * inner_dim + idx[2]]; dx[i] *= dy[idx[0] * inner_dim + idx[2]];
utils::math::IncreaseIndexInDims(3, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(3, dims.data(), idx.data());
} }
} }
...@@ -78,32 +78,32 @@ void BroadcastLossGrad<float16, CPUContext>( ...@@ -78,32 +78,32 @@ void BroadcastLossGrad<float16, CPUContext>(
y[0] = math::Sum(count, 1.f / inv_scale, x, ctx); \ y[0] = math::Sum(count, 1.f / inv_scale, x, ctx); \
} }
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \ template <> \
void ReduceLossGrad<T, CPUContext>( \ void ReduceLossGrad<T, CPUContext>( \
const int count, \ const int count, \
const int num_masks, \ const int num_masks, \
const float normalizer, \ const float normalizer, \
const T* dy, \ const T* dy, \
const T* mask, \ const T* mask, \
T* dx, \ T* dx, \
CPUContext* ctx) { \ CPUContext* ctx) { \
float inv_scale = std::max( \ float inv_scale = std::max( \
0.5f, \ 0.5f, \
num_masks > 0 && normalizer < 0.f \ num_masks > 0 && normalizer < 0.f \
? (float)math::Sum(num_masks, 1.f, mask, ctx) \ ? (float)math::Sum(num_masks, 1.f, mask, ctx) \
: normalizer); \ : normalizer); \
math::Scale(count, cast::to<float>(dy[0]) / inv_scale, dx, dx, ctx); \ math::Scale(count, convert::To<float>(dy[0]) / inv_scale, dx, dx, ctx); \
} \ } \
template <> \ template <> \
void BroadcastLossGrad<T, CPUContext>( \ void BroadcastLossGrad<T, CPUContext>( \
const int outer_dim, \ const int outer_dim, \
const int inner_dim, \ const int inner_dim, \
const int axis_dim, \ const int axis_dim, \
const T* dy, \ const T* dy, \
T* dx, \ T* dx, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_BroadcastLossGrad(outer_dim, inner_dim, axis_dim, dy, dx); \ _BroadcastLossGrad(outer_dim, inner_dim, axis_dim, dy, dx); \
} }
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(float);
......
...@@ -28,7 +28,7 @@ void _NLLLoss( ...@@ -28,7 +28,7 @@ void _NLLLoss(
k = (idx[0] * axis_dim + label) * inner_dim + idx[1]; k = (idx[0] * axis_dim + label) * inner_dim + idx[1];
loss[i] = -logit[k], mask[i] = LogitType(1); loss[i] = -logit[k], mask[i] = LogitType(1);
} }
utils::math::IncreaseIndexInDims(2, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(2, dims.data(), idx.data());
} }
} }
...@@ -53,7 +53,7 @@ void _NLLLossGrad( ...@@ -53,7 +53,7 @@ void _NLLLossGrad(
k = (idx[0] * axis_dim + label) * inner_dim + idx[1]; k = (idx[0] * axis_dim + label) * inner_dim + idx[1];
dlogit[k] = LogitType(-1), mask[i] = LogitType(1); dlogit[k] = LogitType(-1), mask[i] = LogitType(1);
} }
utils::math::IncreaseIndexInDims(2, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(2, dims.data(), idx.data());
} }
} }
......
#include "dragon/utils/omp_utils.h" #include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
...@@ -48,7 +48,7 @@ void _SigmoidFocalLoss( ...@@ -48,7 +48,7 @@ void _SigmoidFocalLoss(
loss[i] += -c2 * neg_term * neg_alpha; loss[i] += -c2 * neg_term * neg_alpha;
mask[i] = c1; mask[i] = c1;
utils::math::IncreaseIndexInDims(3, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(3, dims.data(), idx.data());
} }
} }
...@@ -96,7 +96,7 @@ void _SigmoidFocalLossGrad( ...@@ -96,7 +96,7 @@ void _SigmoidFocalLossGrad(
dx[i] += -c2 * neg_term * neg_alpha; dx[i] += -c2 * neg_term * neg_alpha;
mask[i] = c1; mask[i] = c1;
utils::math::IncreaseIndexInDims(3, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(3, dims.data(), idx.data());
} }
} }
......
#include "dragon/utils/eigen_utils.h" #include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
#include "dragon/utils/eigen_utils.h" #include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
...@@ -29,7 +29,7 @@ void _SparseSoftmaxCrossEntropy( ...@@ -29,7 +29,7 @@ void _SparseSoftmaxCrossEntropy(
loss[i] = -std::log(std::max(prob[k], LogitType(FLT_MIN))); loss[i] = -std::log(std::max(prob[k], LogitType(FLT_MIN)));
mask[i] = LogitType(1); mask[i] = LogitType(1);
} }
utils::math::IncreaseIndexInDims(2, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(2, dims.data(), idx.data());
} }
} }
...@@ -60,7 +60,7 @@ void _SparseSoftmaxCrossEntropyGrad( ...@@ -60,7 +60,7 @@ void _SparseSoftmaxCrossEntropyGrad(
dx[k] -= LogitType(1); dx[k] -= LogitType(1);
mask[i] = LogitType(1); mask[i] = LogitType(1);
} }
utils::math::IncreaseIndexInDims(2, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(2, dims.data(), idx.data());
} }
} }
......
#include "dragon/utils/cast.h" #include "dragon/utils/conversions.h"
#include "dragon/utils/eigen_utils.h" #include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/omp_utils.h" #include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -22,15 +22,15 @@ void _Clip<float16>( ...@@ -22,15 +22,15 @@ void _Clip<float16>(
const float16 high, const float16 high,
const float16* x, const float16* x,
float16* y) { float16* y) {
auto lowf = cast::to<float>(low); auto lowf = convert::To<float>(low);
auto highf = cast::to<float>(high); auto highf = convert::To<float>(high);
#ifdef USE_OPENMP #ifdef USE_OPENMP
#pragma omp parallel for num_threads(OMP_THREADS(count)) #pragma omp parallel for num_threads(OMP_THREADS(count))
#endif #endif
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
auto val = cast::to<float>(x[i]); auto val = convert::To<float>(x[i]);
val = std::max(lowf, std::min(val, highf)); val = std::max(lowf, std::min(val, highf));
y[i] = cast::to<float16>(val); y[i] = convert::To<float16>(val);
} }
} }
...@@ -56,14 +56,14 @@ void _ClipGrad<float16>( ...@@ -56,14 +56,14 @@ void _ClipGrad<float16>(
const float16* dy, const float16* dy,
const float16* x, const float16* x,
float16* dx) { float16* dx) {
auto lowf = cast::to<float>(low); auto lowf = convert::To<float>(low);
auto highf = cast::to<float>(high); auto highf = convert::To<float>(high);
auto kZero = cast::to<float16>(0.f); auto kZero = convert::To<float16>(0.f);
#ifdef USE_OPENMP #ifdef USE_OPENMP
#pragma omp parallel for num_threads(OMP_THREADS(count)) #pragma omp parallel for num_threads(OMP_THREADS(count))
#endif #endif
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
auto val = cast::to<float>(x[i]); auto val = convert::To<float>(x[i]);
dx[i] = (val < lowf || val > highf) ? kZero : dy[i]; dx[i] = (val < lowf || val > highf) ? kZero : dy[i];
} }
} // ClipGrad } // ClipGrad
...@@ -72,29 +72,29 @@ void _ClipGrad<float16>( ...@@ -72,29 +72,29 @@ void _ClipGrad<float16>(
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \ #define DEFINE_KERNEL_LAUNCHER(T) \
template <> \ template <> \
void Clip<T, CPUContext>( \ void Clip<T, CPUContext>( \
const int count, \ const int count, \
const float low, \ const float low, \
const float high, \ const float high, \
const T* x, \ const T* x, \
T* y, \ T* y, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_Clip(count, cast::to<T>(low), cast::to<T>(high), x, y); \ _Clip(count, convert::To<T>(low), convert::To<T>(high), x, y); \
} }
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \ template <> \
void ClipGrad<T, CPUContext>( \ void ClipGrad<T, CPUContext>( \
const int count, \ const int count, \
const float low, \ const float low, \
const float high, \ const float high, \
const T* dy, \ const T* dy, \
const T* x, \ const T* x, \
T* dx, \ T* dx, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_ClipGrad(count, cast::to<T>(low), cast::to<T>(high), dy, x, dx); \ _ClipGrad(count, convert::To<T>(low), convert::To<T>(high), dy, x, dx); \
} }
DEFINE_KERNEL_LAUNCHER(int8_t); DEFINE_KERNEL_LAUNCHER(int8_t);
......
#ifdef USE_CUDA #ifdef USE_CUDA
#include "dragon/core/context_cuda.h" #include "dragon/core/context_cuda.h"
#include "dragon/utils/cast.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -104,8 +103,8 @@ void Clip<float16, CUDAContext>( ...@@ -104,8 +103,8 @@ void Clip<float16, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
_Clip<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( _Clip<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
count, count,
cast::to<half>(low), convert::To<half>(low),
cast::to<half>(high), convert::To<half>(high),
reinterpret_cast<const half*>(x), reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y)); reinterpret_cast<half*>(y));
} }
...@@ -121,8 +120,8 @@ void ClipGrad<float16, CUDAContext>( ...@@ -121,8 +120,8 @@ void ClipGrad<float16, CUDAContext>(
CUDAContext* ctx) { CUDAContext* ctx) {
_ClipGrad<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( _ClipGrad<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
count, count,
cast::to<half>(low), convert::To<half>(low),
cast::to<half>(high), convert::To<half>(high),
reinterpret_cast<const half*>(dy), reinterpret_cast<const half*>(dy),
reinterpret_cast<const half*>(x), reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(dx)); reinterpret_cast<half*>(dx));
...@@ -138,7 +137,7 @@ void ClipGrad<float16, CUDAContext>( ...@@ -138,7 +137,7 @@ void ClipGrad<float16, CUDAContext>(
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_Clip<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ _Clip<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, cast::to<T>(low), cast::to<T>(high), x, y); \ count, convert::To<T>(low), convert::To<T>(high), x, y); \
} }
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
...@@ -152,7 +151,7 @@ void ClipGrad<float16, CUDAContext>( ...@@ -152,7 +151,7 @@ void ClipGrad<float16, CUDAContext>(
T* dx, \ T* dx, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_ClipGrad<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ _ClipGrad<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, cast::to<T>(low), cast::to<T>(high), dy, x, dx); \ count, convert::To<T>(low), convert::To<T>(high), dy, x, dx); \
} }
DEFINE_KERNEL_LAUNCHER(int8_t); DEFINE_KERNEL_LAUNCHER(int8_t);
......
#include "dragon/utils/eigen_utils.h" #include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
...@@ -70,7 +70,7 @@ template <typename T> ...@@ -70,7 +70,7 @@ template <typename T>
__global__ void __global__ void
_ReciprocalGrad(const int nthreads, const T* dy, const T* y, T* dx) { _ReciprocalGrad(const int nthreads, const T* dy, const T* y, T* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx[i] = -dy[i] * utils::math::Square(y[i]); dx[i] = -dy[i] * math::utils::Square(y[i]);
} }
} }
...@@ -82,7 +82,7 @@ __global__ void _ReciprocalGrad<half>( ...@@ -82,7 +82,7 @@ __global__ void _ReciprocalGrad<half>(
half* dx) { half* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx[i] = __float2half( dx[i] = __float2half(
-__half2float(dy[i]) * utils::math::Square(__half2float(y[i]))); -__half2float(dy[i]) * math::utils::Square(__half2float(y[i])));
} }
} }
...@@ -103,7 +103,7 @@ __global__ void _ReciprocalGrad<half2>( ...@@ -103,7 +103,7 @@ __global__ void _ReciprocalGrad<half2>(
template <typename T> template <typename T>
__global__ void _RsqrtGrad(const int nthreads, const T* dy, const T* y, T* dx) { __global__ void _RsqrtGrad(const int nthreads, const T* dy, const T* y, T* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx[i] = T(-0.5) * dy[i] * utils::math::Cube(y[i]); dx[i] = T(-0.5) * dy[i] * math::utils::Cube(y[i]);
} }
} }
...@@ -112,7 +112,7 @@ __global__ void ...@@ -112,7 +112,7 @@ __global__ void
_RsqrtGrad<half>(const int nthreads, const half* dy, const half* y, half* dx) { _RsqrtGrad<half>(const int nthreads, const half* dy, const half* y, half* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) { CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx[i] = __float2half( dx[i] = __float2half(
-0.5f * __half2float(dy[i]) * utils::math::Cube(__half2float(y[i]))); -0.5f * __half2float(dy[i]) * math::utils::Cube(__half2float(y[i])));
} }
} }
......
#include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/omp_utils.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -106,14 +106,14 @@ void _Moments( ...@@ -106,14 +106,14 @@ void _Moments(
y_dims[axes[i]] = 1; y_dims[axes[i]] = 1;
// Case #1: Rowwise Reduce // Case #1: Rowwise Reduce
if (utils::math::IsRowwiseReduce( if (math::utils::IsRowwiseReduce(
num_dims, dims, y_dims.data(), &rows, &cols)) { num_dims, dims, y_dims.data(), &rows, &cols)) {
_RowwiseMoments(rows, cols, x, mean, var); _RowwiseMoments(rows, cols, x, mean, var);
return; return;
} }
// Case #2: Colwise Reduce // Case #2: Colwise Reduce
if (utils::math::IsColwiseReduce( if (math::utils::IsColwiseReduce(
num_dims, dims, y_dims.data(), &rows, &cols)) { num_dims, dims, y_dims.data(), &rows, &cols)) {
_ColwiseMoments(rows, cols, x, mean, var); _ColwiseMoments(rows, cols, x, mean, var);
return; return;
...@@ -121,8 +121,8 @@ void _Moments( ...@@ -121,8 +121,8 @@ void _Moments(
// Case #3: Generic Reduce // Case #3: Generic Reduce
vec32_t axesT(num_dims), stridesT(num_dims), dimsT(num_dims); vec32_t axesT(num_dims), stridesT(num_dims), dimsT(num_dims);
utils::math::TransposeAxesForReduce(num_dims, num_axes, axes, axesT.data()); math::utils::TransposeAxesForReduce(num_dims, num_axes, axes, axesT.data());
utils::math::ComputeTransposeStrides( math::utils::ComputeTransposeStrides(
num_dims, dims, axesT.data(), stridesT.data()); num_dims, dims, axesT.data(), stridesT.data());
rows = cols = 1; rows = cols = 1;
......
#ifdef USE_CUDA #ifdef USE_CUDA
#include "dragon/core/context_cuda.h" #include "dragon/core/context_cuda.h"
#include "dragon/utils/cast.h" #include "dragon/utils/conversions.h"
#include "dragon/utils/device/common_cub.h" #include "dragon/utils/device/common_cub.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
...@@ -28,10 +28,10 @@ __global__ void _RowwiseMoments( ...@@ -28,10 +28,10 @@ __global__ void _RowwiseMoments(
const int xi = j * cols + i; const int xi = j * cols + i;
#if __CUDA_ARCH__ >= 350 #if __CUDA_ARCH__ >= 350
m_val += __ldg(x + xi); m_val += __ldg(x + xi);
v_val += utils::math::Square(__ldg(x + xi)); v_val += math::utils::Square(__ldg(x + xi));
#else #else
m_val += x[xi]; m_val += x[xi];
v_val += utils::math::Square(x[xi]); v_val += math::utils::Square(x[xi]);
#endif #endif
} }
m_val = BlockReduce<Ty>(m_storage).Sum(m_val); m_val = BlockReduce<Ty>(m_storage).Sum(m_val);
...@@ -59,7 +59,7 @@ __global__ void _RowwiseMoments<half, float>( ...@@ -59,7 +59,7 @@ __global__ void _RowwiseMoments<half, float>(
CUDA_2D_KERNEL_LOOP2(j, rows) { CUDA_2D_KERNEL_LOOP2(j, rows) {
const int xi = j * cols + i; const int xi = j * cols + i;
m_val += __half2float(__ldg(x + xi)); m_val += __half2float(__ldg(x + xi));
v_val += utils::math::Square(__half2float(__ldg(x + xi))); v_val += math::utils::Square(__half2float(__ldg(x + xi)));
} }
m_val = BlockReduce<float>(m_storage).Sum(m_val); m_val = BlockReduce<float>(m_storage).Sum(m_val);
v_val = BlockReduce<float>(v_storage).Sum(v_val); v_val = BlockReduce<float>(v_storage).Sum(v_val);
...@@ -87,10 +87,10 @@ __global__ void _ColwiseMoments( ...@@ -87,10 +87,10 @@ __global__ void _ColwiseMoments(
const int xi = i * cols + j; const int xi = i * cols + j;
#if __CUDA_ARCH__ >= 350 #if __CUDA_ARCH__ >= 350
m_val += __ldg(x + xi); m_val += __ldg(x + xi);
v_val += utils::math::Square(__ldg(x + xi)); v_val += math::utils::Square(__ldg(x + xi));
#else #else
m_val += x[xi]; m_val += x[xi];
v_val += utils::math::Square(x[xi]); v_val += math::utils::Square(x[xi]);
#endif #endif
} }
m_val = BlockReduce<Ty>(m_storage).Sum(m_val); m_val = BlockReduce<Ty>(m_storage).Sum(m_val);
...@@ -118,7 +118,7 @@ __global__ void _ColwiseMoments<half, float>( ...@@ -118,7 +118,7 @@ __global__ void _ColwiseMoments<half, float>(
CUDA_2D_KERNEL_LOOP2(j, cols) { CUDA_2D_KERNEL_LOOP2(j, cols) {
const int xi = i * cols + j; const int xi = i * cols + j;
m_val += __half2float(__ldg(x + xi)); m_val += __half2float(__ldg(x + xi));
v_val += utils::math::Square(__half2float(__ldg(x + xi))); v_val += math::utils::Square(__half2float(__ldg(x + xi)));
} }
m_val = BlockReduce<float>(m_storage).Sum(m_val); m_val = BlockReduce<float>(m_storage).Sum(m_val);
v_val = BlockReduce<float>(v_storage).Sum(v_val); v_val = BlockReduce<float>(v_storage).Sum(v_val);
...@@ -154,10 +154,10 @@ __global__ void _GenericMoments( ...@@ -154,10 +154,10 @@ __global__ void _GenericMoments(
} }
#if __CUDA_ARCH__ >= 350 #if __CUDA_ARCH__ >= 350
m_val += __ldg(x + xi); m_val += __ldg(x + xi);
v_val += utils::math::Square(__ldg(x + xi)); v_val += math::utils::Square(__ldg(x + xi));
#else #else
m_val += x[xi]; m_val += x[xi];
v_val += utils::math::Square(x[xi]); v_val += math::utils::Square(x[xi]);
#endif #endif
} }
m_val = BlockReduce<Ty>(m_storage).Sum(m_val); m_val = BlockReduce<Ty>(m_storage).Sum(m_val);
...@@ -194,10 +194,10 @@ __global__ void _GenericMoments( ...@@ -194,10 +194,10 @@ __global__ void _GenericMoments(
} }
#if __CUDA_ARCH__ >= 350 #if __CUDA_ARCH__ >= 350
m_val += __half2float(__ldg(x + xi)); m_val += __half2float(__ldg(x + xi));
v_val += utils::math::Square(__half2float(__ldg(x + xi))); v_val += math::utils::Square(__half2float(__ldg(x + xi)));
#else #else
m_val += __half2float(x[xi]); m_val += __half2float(x[xi]);
v_val += utils::math::Square(__half2float(x[xi])); v_val += math::utils::Square(__half2float(x[xi]));
#endif #endif
} }
m_val = BlockReduce<float>(m_storage).Sum(m_val); m_val = BlockReduce<float>(m_storage).Sum(m_val);
...@@ -226,7 +226,7 @@ void _Moments( ...@@ -226,7 +226,7 @@ void _Moments(
y_dims[axes[i]] = 1; y_dims[axes[i]] = 1;
/*! Case #1: Rowwise Reduce */ /*! Case #1: Rowwise Reduce */
if (utils::math::IsRowwiseReduce( if (math::utils::IsRowwiseReduce(
num_dims, dims, y_dims.data(), &rows, &cols)) { num_dims, dims, y_dims.data(), &rows, &cols)) {
_RowwiseMoments<<< _RowwiseMoments<<<
CUDA_2D_BLOCKS(cols), CUDA_2D_BLOCKS(cols),
...@@ -237,7 +237,7 @@ void _Moments( ...@@ -237,7 +237,7 @@ void _Moments(
} }
/*! Case #2: Colwise Reduce */ /*! Case #2: Colwise Reduce */
if (utils::math::IsColwiseReduce( if (math::utils::IsColwiseReduce(
num_dims, dims, y_dims.data(), &rows, &cols)) { num_dims, dims, y_dims.data(), &rows, &cols)) {
_ColwiseMoments<<< _ColwiseMoments<<<
CUDA_2D_BLOCKS(rows), CUDA_2D_BLOCKS(rows),
...@@ -250,8 +250,8 @@ void _Moments( ...@@ -250,8 +250,8 @@ void _Moments(
/*! Case #3: Generic Reduce */ /*! Case #3: Generic Reduce */
CUDA_TENSOR_DIMS_CHECK(num_dims); CUDA_TENSOR_DIMS_CHECK(num_dims);
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> axesT, stridesT, dimsT; SimpleArray<int, CUDA_TENSOR_MAX_DIMS> axesT, stridesT, dimsT;
utils::math::TransposeAxesForReduce(num_dims, num_axes, axes, axesT.data); math::utils::TransposeAxesForReduce(num_dims, num_axes, axes, axesT.data);
utils::math::ComputeTransposeStrides( math::utils::ComputeTransposeStrides(
num_dims, dims, axesT.data, stridesT.data); num_dims, dims, axesT.data, stridesT.data);
rows = cols = 1; rows = cols = 1;
......
#include "dragon/utils/eigen_utils.h" #include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
#include "dragon/utils/omp_utils.h" #include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
#include "dragon/utils/omp_utils.h" #include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
#include "dragon/utils/omp_utils.h" #include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
#include "dragon/utils/omp_utils.h" #include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
#include "dragon/utils/omp_utils.h" #include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
#include "dragon/utils/cast.h" #include "dragon/utils/conversions.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
...@@ -47,7 +47,7 @@ void _AvgPool2dNCHW( ...@@ -47,7 +47,7 @@ void _AvgPool2dNCHW(
for (int w = wstart; w < wend; ++w) for (int w = wstart; w < wend; ++w)
val += offset_x[h * W + w]; val += offset_x[h * W + w];
y[i] = val / area; y[i] = val / area;
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
} }
} }
...@@ -89,7 +89,7 @@ void _AvgPool2dNHWC( ...@@ -89,7 +89,7 @@ void _AvgPool2dNHWC(
for (int w = wstart; w < wend; ++w) for (int w = wstart; w < wend; ++w)
val += offset_x[(h * W + w) * C]; val += offset_x[(h * W + w) * C];
y[i] = val / area; y[i] = val / area;
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
} }
} }
...@@ -130,7 +130,7 @@ void _AvgPool2dGradNCHW( ...@@ -130,7 +130,7 @@ void _AvgPool2dGradNCHW(
for (int h = hstart; h < hend; ++h) for (int h = hstart; h < hend; ++h)
for (int w = wstart; w < wend; ++w) for (int w = wstart; w < wend; ++w)
offset_dx[h * W + w] += dy[i] / area; offset_dx[h * W + w] += dy[i] / area;
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
} }
} }
...@@ -170,7 +170,7 @@ void _AvgPool2dGradNHWC( ...@@ -170,7 +170,7 @@ void _AvgPool2dGradNHWC(
for (int h = hstart; h < hend; ++h) for (int h = hstart; h < hend; ++h)
for (int w = wstart; w < wend; ++w) for (int w = wstart; w < wend; ++w)
offset_dx[(h * W + w) * C] += dy[i] / area; offset_dx[(h * W + w) * C] += dy[i] / area;
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
} }
} }
...@@ -253,7 +253,7 @@ void _AvgPool2dGradNHWC( ...@@ -253,7 +253,7 @@ void _AvgPool2dGradNHWC(
const T* dy, \ const T* dy, \
T* dx, \ T* dx, \
CPUContext* ctx) { \ CPUContext* ctx) { \
math::Set(N* C* H* W, cast::to<T>(0.f), dx, ctx); \ math::Set(N* C* H* W, convert::To<T>(0.f), dx, ctx); \
if (data_format == "NCHW") { \ if (data_format == "NCHW") { \
_AvgPool2dGradNCHW( \ _AvgPool2dGradNCHW( \
N, \ N, \
......
#include "dragon/utils/eigen_utils.h" #include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
......
#include "dragon/utils/cast.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
......
...@@ -117,70 +117,17 @@ void _DepthwiseConv2dNHWC( ...@@ -117,70 +117,17 @@ void _DepthwiseConv2dNHWC(
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
template <> #define DISPATCH_DATA_KERNEL(name, ...) \
void DepthwiseConv2d<float, CPUContext>( if (data_format == "NCHW") { \
const int N, name##NCHW(__VA_ARGS__); \
const int C, } else if (data_format == "NHWC") { \
const int H, name##NHWC(__VA_ARGS__); \
const int W, } else { \
const int out_h, LOG(FATAL) << "Unknown DataFormat: " << data_format; \
const int out_w,
const int kernel_h,
const int kernel_w,
const int stride_h,
const int stride_w,
const int pad_h,
const int pad_w,
const int dilation_h,
const int dilation_w,
const string& data_format,
const float* x,
const float* w,
float* y,
CPUContext* ctx) {
if (data_format == "NCHW") {
_DepthwiseConv2dNCHW(
N,
C,
H,
W,
out_h,
out_w,
kernel_h,
kernel_w,
stride_h,
stride_w,
pad_h,
pad_w,
dilation_h,
dilation_w,
x,
w,
y);
} else {
_DepthwiseConv2dNHWC(
N,
C,
H,
W,
out_h,
out_w,
kernel_h,
kernel_w,
stride_h,
stride_w,
pad_h,
pad_w,
dilation_h,
dilation_w,
x,
w,
y);
} }
}
template <> template <>
void DepthwiseConv2dGrad<float, CPUContext>( void DepthwiseConv2d<float16, CPUContext>(
const int N, const int N,
const int C, const int C,
const int H, const int H,
...@@ -196,15 +143,15 @@ void DepthwiseConv2dGrad<float, CPUContext>( ...@@ -196,15 +143,15 @@ void DepthwiseConv2dGrad<float, CPUContext>(
const int dilation_h, const int dilation_h,
const int dilation_w, const int dilation_w,
const string& data_format, const string& data_format,
const float* dy, const float16* x,
const float* w, const float16* w,
float* dx, float16* y,
CPUContext* ctx) { CPUContext* ctx) {
NOT_IMPLEMENTED; CPU_FP16_NOT_SUPPORTED;
} // DepthwiseConv2dGrad }
template <> template <>
void DepthwiseConv2dWGrad<float, CPUContext>( void DepthwiseConv2d<float, CPUContext>(
const int N, const int N,
const int C, const int C,
const int H, const int H,
...@@ -220,12 +167,82 @@ void DepthwiseConv2dWGrad<float, CPUContext>( ...@@ -220,12 +167,82 @@ void DepthwiseConv2dWGrad<float, CPUContext>(
const int dilation_h, const int dilation_h,
const int dilation_w, const int dilation_w,
const string& data_format, const string& data_format,
const float* dy,
const float* x, const float* x,
float* dw, const float* w,
float* y,
CPUContext* ctx) { CPUContext* ctx) {
NOT_IMPLEMENTED; DISPATCH_DATA_KERNEL(
} // DepthwiseConv2dWGrad _DepthwiseConv2d,
N,
C,
H,
W,
out_h,
out_w,
kernel_h,
kernel_w,
stride_h,
stride_w,
pad_h,
pad_w,
dilation_h,
dilation_w,
x,
w,
y);
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void DepthwiseConv2dGrad<T, CPUContext>( \
const int N, \
const int C, \
const int H, \
const int W, \
const int out_h, \
const int out_w, \
const int kernel_h, \
const int kernel_w, \
const int stride_h, \
const int stride_w, \
const int pad_h, \
const int pad_w, \
const int dilation_h, \
const int dilation_w, \
const string& data_format, \
const T* dy, \
const T* w, \
T* dx, \
CPUContext* ctx) { \
NOT_IMPLEMENTED; \
} \
template <> \
void DepthwiseConv2dWGrad<T, CPUContext>( \
const int N, \
const int C, \
const int H, \
const int W, \
const int out_h, \
const int out_w, \
const int kernel_h, \
const int kernel_w, \
const int stride_h, \
const int stride_w, \
const int pad_h, \
const int pad_w, \
const int dilation_h, \
const int dilation_w, \
const string& data_format, \
const T* dy, \
const T* x, \
T* dw, \
CPUContext* ctx) { \
NOT_IMPLEMENTED; \
}
DEFINE_GRAD_KERNEL_LAUNCHER(float16);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
#undef DEFINE_GRAD_KERNEL_LAUNCHER
} // namespace kernel } // namespace kernel
......
#include "dragon/utils/cast.h" #include "dragon/utils/conversions.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
...@@ -52,7 +52,7 @@ void _MaxPool2dNCHW( ...@@ -52,7 +52,7 @@ void _MaxPool2dNCHW(
} }
y[i] = val; y[i] = val;
mask[i] = mxi; mask[i] = mxi;
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
} }
} }
...@@ -99,7 +99,7 @@ void _MaxPool2dNHWC( ...@@ -99,7 +99,7 @@ void _MaxPool2dNHWC(
} }
y[i] = val; y[i] = val;
mask[i] = mxi; mask[i] = mxi;
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
} }
} }
...@@ -129,7 +129,7 @@ void _MaxPool2dGradNCHW( ...@@ -129,7 +129,7 @@ void _MaxPool2dGradNCHW(
if (mask[i] != -1) { if (mask[i] != -1) {
dx[idx[0] * CHW + idx[1] * HW + mask[i]] += dy[i]; dx[idx[0] * CHW + idx[1] * HW + mask[i]] += dy[i];
} }
utils::math::IncreaseIndexInDims(3, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(3, dims.data(), idx.data());
} }
} }
...@@ -158,7 +158,7 @@ void _MaxPool2dGradNHWC( ...@@ -158,7 +158,7 @@ void _MaxPool2dGradNHWC(
if (mask[i] != -1) { if (mask[i] != -1) {
dx[idx[0] * HWC + mask[i]] += dy[i]; dx[idx[0] * HWC + mask[i]] += dy[i];
} }
utils::math::IncreaseIndexInDims(2, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(2, dims.data(), idx.data());
} }
} }
...@@ -245,7 +245,7 @@ void _MaxPool2dGradNHWC( ...@@ -245,7 +245,7 @@ void _MaxPool2dGradNHWC(
const int* mask, \ const int* mask, \
T* dx, \ T* dx, \
CPUContext* ctx) { \ CPUContext* ctx) { \
math::Set(N* C* H* W, cast::to<T>(0.f), dx, ctx); \ math::Set(N* C* H* W, convert::To<T>(0.f), dx, ctx); \
if (data_format == "NCHW") { \ if (data_format == "NCHW") { \
_MaxPool2dGradNCHW( \ _MaxPool2dGradNCHW( \
N, \ N, \
......
...@@ -62,7 +62,7 @@ void _ResizeLinearNCHW( ...@@ -62,7 +62,7 @@ void _ResizeLinearNCHW(
t = tl + (tr - tl) * u; t = tl + (tr - tl) * u;
b = bl + (br - bl) * u; b = bl + (br - bl) * u;
y[i] = static_cast<T>(t + (b - t) * v); y[i] = static_cast<T>(t + (b - t) * v);
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
} }
} }
...@@ -99,7 +99,7 @@ void _ResizeLinearNHWC( ...@@ -99,7 +99,7 @@ void _ResizeLinearNHWC(
t = tl + (tr - tl) * u; t = tl + (tr - tl) * u;
b = bl + (br - bl) * u; b = bl + (br - bl) * u;
y[i] = static_cast<T>(t + (b - t) * v); y[i] = static_cast<T>(t + (b - t) * v);
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
} }
} }
...@@ -135,7 +135,7 @@ void _ResizeLinearGradNCHW( ...@@ -135,7 +135,7 @@ void _ResizeLinearGradNCHW(
dx[(offset + ti) * W + ri] += u * dt; // tr dx[(offset + ti) * W + ri] += u * dt; // tr
dx[(offset + bi) * W + li] += (1.f - u) * db; // bl dx[(offset + bi) * W + li] += (1.f - u) * db; // bl
dx[(offset + bi) * W + ri] += u * db; // br dx[(offset + bi) * W + ri] += u * db; // br
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
} }
} }
...@@ -171,7 +171,7 @@ void _ResizeLinearGradNHWC( ...@@ -171,7 +171,7 @@ void _ResizeLinearGradNHWC(
dx[((offset + ti) * W + ri) * C + idx[3]] += u * dt; // tr dx[((offset + ti) * W + ri) * C + idx[3]] += u * dt; // tr
dx[((offset + bi) * W + li) * C + idx[3]] += (1.f - u) * db; // bl dx[((offset + bi) * W + li) * C + idx[3]] += (1.f - u) * db; // bl
dx[((offset + bi) * W + ri) * C + idx[3]] += u * db; // br dx[((offset + bi) * W + ri) * C + idx[3]] += u * db; // br
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
} }
} }
......
...@@ -27,7 +27,7 @@ void _ResizeNearestNCHW( ...@@ -27,7 +27,7 @@ void _ResizeNearestNCHW(
h_in = std::min(int(idx[2] * scale_h), h_max); h_in = std::min(int(idx[2] * scale_h), h_max);
w_in = std::min(int(idx[3] * scale_w), w_max); w_in = std::min(int(idx[3] * scale_w), w_max);
y[i] = x[(((idx[0] * C) + idx[1]) * H + h_in) * W + w_in]; y[i] = x[(((idx[0] * C) + idx[1]) * H + h_in) * W + w_in];
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
} }
} }
...@@ -52,7 +52,7 @@ void _ResizeNearestNHWC( ...@@ -52,7 +52,7 @@ void _ResizeNearestNHWC(
w_in = std::min(int(idx[2] * scale_w), w_max); w_in = std::min(int(idx[2] * scale_w), w_max);
memcpy( memcpy(
y + i * C, x + (((idx[0] * H) + h_in) * W + w_in) * C, C * sizeof(T)); y + i * C, x + (((idx[0] * H) + h_in) * W + w_in) * C, C * sizeof(T));
utils::math::IncreaseIndexInDims(3, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(3, dims.data(), idx.data());
} }
} }
...@@ -76,7 +76,7 @@ void _ResizeNearestGradNCHW( ...@@ -76,7 +76,7 @@ void _ResizeNearestGradNCHW(
h_in = std::min(int(idx[2] * scale_h), h_max); h_in = std::min(int(idx[2] * scale_h), h_max);
w_in = std::min(int(idx[3] * scale_w), w_max); w_in = std::min(int(idx[3] * scale_w), w_max);
dx[(((idx[0] * C) + idx[1]) * H + h_in) * W + w_in] += (float)dy[i]; dx[(((idx[0] * C) + idx[1]) * H + h_in) * W + w_in] += (float)dy[i];
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
} }
} }
...@@ -100,7 +100,7 @@ void _ResizeNearestGradNHWC( ...@@ -100,7 +100,7 @@ void _ResizeNearestGradNHWC(
h_in = std::min(int(idx[1] * scale_h), h_max); h_in = std::min(int(idx[1] * scale_h), h_max);
w_in = std::min(int(idx[2] * scale_w), w_max); w_in = std::min(int(idx[2] * scale_w), w_max);
dx[(((idx[0] * H) + h_in) * W + w_in) * C + idx[3]] += (float)dy[i]; dx[(((idx[0] * H) + h_in) * W + w_in) * C + idx[3]] += (float)dy[i];
utils::math::IncreaseIndexInDims(4, dims.data(), idx.data()); math::utils::IncreaseIndexInDims(4, dims.data(), idx.data());
} }
} }
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#define DRAGON_MODULES_PYTHON_CONFIG_H_ #define DRAGON_MODULES_PYTHON_CONFIG_H_
#include "dragon/modules/python/common.h" #include "dragon/modules/python/common.h"
#include "dragon/utils/device/common_eigen.h"
namespace dragon { namespace dragon {
...@@ -22,9 +23,16 @@ namespace python { ...@@ -22,9 +23,16 @@ namespace python {
namespace config { namespace config {
void RegisterModule(py::module& m) { void RegisterModule(py::module& m) {
/*! \brief Set the logging severity */
m.def("SetLoggingLevel", [](const string& severity) { m.def("SetLoggingLevel", [](const string& severity) {
SetLogDestination(severity); SetLogDestination(severity);
}); });
/*! \brief Set the number of threads for cpu parallelism */
m.def("SetNumThreads", [](int num) { Eigen::setNbThreads(num); });
/*! \brief Return the number of threads for cpu parallelism */
m.def("GetNumThreads", []() { return Eigen::nbThreads(); });
} }
} // namespace config } // namespace config
......
...@@ -14,7 +14,6 @@ ...@@ -14,7 +14,6 @@
#define DRAGON_MODULES_PYTHON_OPERATOR_H_ #define DRAGON_MODULES_PYTHON_OPERATOR_H_
#include "dragon/modules/python/common.h" #include "dragon/modules/python/common.h"
#include "dragon/utils/eigen_utils.h"
namespace dragon { namespace dragon {
......
...@@ -19,7 +19,7 @@ void ExpandOp<Context>::DoRunWithType() { ...@@ -19,7 +19,7 @@ void ExpandOp<Context>::DoRunWithType() {
// Store for the gradient calculation // Store for the gradient calculation
STORE_INPUT_SPEC(0); STORE_INPUT_SPEC(0);
if (utils::math::IsBinaryBroadcast(X.dims(), X_dims, Y_dims)) { if (math::utils::IsBinaryBroadcast(X.dims(), X_dims, Y_dims)) {
math::Set( math::Set(
X.ndim(), X.ndim(),
X.dims().data(), X.dims().data(),
...@@ -47,7 +47,7 @@ void ExpandGradientOp<Context>::DoRunWithType() { ...@@ -47,7 +47,7 @@ void ExpandGradientOp<Context>::DoRunWithType() {
vec32_t X_broadcast_axes, _; vec32_t X_broadcast_axes, _;
vec32_t Y_dims(dY.dims().begin(), dY.dims().end()); vec32_t Y_dims(dY.dims().begin(), dY.dims().end());
utils::math::ComputeBinaryBroadcastAxes( math::utils::ComputeBinaryBroadcastAxes(
dX->dims(), dY.dims(), dY.dims(), X_broadcast_axes, _); dX->dims(), dY.dims(), dY.dims(), X_broadcast_axes, _);
if (X_broadcast_axes.empty()) { if (X_broadcast_axes.empty()) {
......
...@@ -62,7 +62,7 @@ void IndexSelectGradientOp<Context>::DoRunWithType() { ...@@ -62,7 +62,7 @@ void IndexSelectGradientOp<Context>::DoRunWithType() {
// Reset the accumulating gradient // Reset the accumulating gradient
math::Set( math::Set(
dX->count(), dX->count(),
cast::to<T>(0.f), convert::To<T>(0.f),
dX->template mutable_data<T, Context>(), dX->template mutable_data<T, Context>(),
ctx()); ctx());
......
...@@ -46,7 +46,7 @@ template <class Context> ...@@ -46,7 +46,7 @@ template <class Context>
template <typename T> template <typename T>
void FillOp<Context>::DoRunWithType() { void FillOp<Context>::DoRunWithType() {
auto* y = Output(0)->template mutable_data<T, Context>(); auto* y = Output(0)->template mutable_data<T, Context>();
math::Set(Output(0)->count(), cast::to<T>(value_), y, ctx()); math::Set(Output(0)->count(), convert::To<T>(value_), y, ctx());
} }
template <class Context> template <class Context>
......
...@@ -15,7 +15,7 @@ void OneHotOp<Context>::DoRunWithType() { ...@@ -15,7 +15,7 @@ void OneHotOp<Context>::DoRunWithType() {
// Brush the off-value over all // Brush the off-value over all
math::Set( math::Set(
X.count() * depth_, X.count() * depth_,
cast::to<T>((float)off_value_), convert::To<T>((float)off_value_),
Y->Reshape(Y_dims)->template mutable_data<T, Context>(), Y->Reshape(Y_dims)->template mutable_data<T, Context>(),
ctx()); ctx());
......
...@@ -87,7 +87,7 @@ void SliceGradientOp<Context>::DoRunWithType() { ...@@ -87,7 +87,7 @@ void SliceGradientOp<Context>::DoRunWithType() {
// Zero the redundant gradients // Zero the redundant gradients
auto* dx = dX->template mutable_data<T, Context>(); auto* dx = dX->template mutable_data<T, Context>();
math::Set(dX->count(), cast::to<T>(0.f), dx, ctx()); math::Set(dX->count(), convert::To<T>(0.f), dx, ctx());
// Copy the dY to the right positions // Copy the dY to the right positions
kernel::SliceGrad( kernel::SliceGrad(
......
...@@ -75,7 +75,7 @@ void SplitGradientOp<Context>::DoRunWithType() { ...@@ -75,7 +75,7 @@ void SplitGradientOp<Context>::DoRunWithType() {
if (!Input(i).has_name()) { if (!Input(i).has_name()) {
math::Set( math::Set(
dX->count(), dX->count(),
cast::to<T>(0.f), convert::To<T>(0.f),
dX->template mutable_data<T, Context>(), dX->template mutable_data<T, Context>(),
ctx()); ctx());
break; break;
......
...@@ -14,8 +14,8 @@ void WhereOp<Context>::DoRunWithType() { ...@@ -14,8 +14,8 @@ void WhereOp<Context>::DoRunWithType() {
<< "\nExcepted bool or uint8 condition tensor."; << "\nExcepted bool or uint8 condition tensor.";
vec64_t AB_dims, Y_dims; vec64_t AB_dims, Y_dims;
if (utils::math::IsBinaryBroadcast(A.dims(), B.dims(), AB_dims) && if (math::utils::IsBinaryBroadcast(A.dims(), B.dims(), AB_dims) &&
utils::math::IsBinaryBroadcast(AB_dims, C.dims(), Y_dims)) { math::utils::IsBinaryBroadcast(AB_dims, C.dims(), Y_dims)) {
math::Where( math::Where(
A.ndim(), A.ndim(),
A.dims().data(), A.dims().data(),
...@@ -50,7 +50,7 @@ void WhereGradientOp<Context>::DoRunWithType() { ...@@ -50,7 +50,7 @@ void WhereGradientOp<Context>::DoRunWithType() {
vec32_t A_broadcast_axes, B_broadcast_axes; vec32_t A_broadcast_axes, B_broadcast_axes;
vec32_t Y_dims(dY.dims().begin(), dY.dims().end()); vec32_t Y_dims(dY.dims().begin(), dY.dims().end());
utils::math::ComputeBinaryBroadcastAxes( math::utils::ComputeBinaryBroadcastAxes(
A.dims(), B.dims(), dY.dims(), A_broadcast_axes, B_broadcast_axes); A.dims(), B.dims(), dY.dims(), A_broadcast_axes, B_broadcast_axes);
// Temporal space to store the intermediate gradient and zeros // Temporal space to store the intermediate gradient and zeros
...@@ -68,7 +68,7 @@ void WhereGradientOp<Context>::DoRunWithType() { ...@@ -68,7 +68,7 @@ void WhereGradientOp<Context>::DoRunWithType() {
if (scratch_size > 0) { if (scratch_size > 0) {
scratch = ctx()->workspace()->template data<T, Context>({scratch_size})[0]; scratch = ctx()->workspace()->template data<T, Context>({scratch_size})[0];
zeros = scratch + (scratch_size - 1); zeros = scratch + (scratch_size - 1);
math::Set(1, cast::to<T>(0.f), zeros, ctx()); math::Set(1, convert::To<T>(0.f), zeros, ctx());
} }
if (dA->has_name()) { if (dA->has_name()) {
......
...@@ -43,11 +43,11 @@ void AssignOp<Context>::DoRunWithType() { ...@@ -43,11 +43,11 @@ void AssignOp<Context>::DoRunWithType() {
if (X.dims() != X_dims) { if (X.dims() != X_dims) {
vec64_t dims1, dims2; vec64_t dims1, dims2;
if (utils::math::IsBinaryBroadcast(X.dims(), X_dims, dims1)) { if (math::utils::IsBinaryBroadcast(X.dims(), X_dims, dims1)) {
CHECK(X_dims == dims1) CHECK(X_dims == dims1)
<< "\nCould not assign with shapes " << X.DimString() << " " << "\nCould not assign with shapes " << X.DimString() << " "
<< Tensor::DimString(X_dims); << Tensor::DimString(X_dims);
utils::math::ComputeBinaryBroadcastDims(X.dims(), X_dims, dims1, dims2); math::utils::ComputeBinaryBroadcastDims(X.dims(), X_dims, dims1, dims2);
if (dims1 != dims2) { if (dims1 != dims2) {
auto* scratch = ctx()->workspace()->template data<T, Context>( auto* scratch = ctx()->workspace()->template data<T, Context>(
{X_broadcast.count()})[0]; {X_broadcast.count()})[0];
......
...@@ -14,8 +14,8 @@ void MaskedAssignOp<Context>::DoRunWithType() { ...@@ -14,8 +14,8 @@ void MaskedAssignOp<Context>::DoRunWithType() {
<< "\nExcepted bool or uint8 mask."; << "\nExcepted bool or uint8 mask.";
vec64_t X_dims, Y_dims; vec64_t X_dims, Y_dims;
if (utils::math::IsBinaryBroadcast(X.dims(), X_mask.dims(), X_dims) && if (math::utils::IsBinaryBroadcast(X.dims(), X_mask.dims(), X_dims) &&
utils::math::IsBinaryBroadcast(X_dims, Y->dims(), Y_dims) && math::utils::IsBinaryBroadcast(X_dims, Y->dims(), Y_dims) &&
Y_dims == Y->dims()) { Y_dims == Y->dims()) {
math::Where( math::Where(
X.ndim(), X.ndim(),
......
...@@ -13,7 +13,7 @@ void GradientGenerateOp<Context>::DoRunWithType() { ...@@ -13,7 +13,7 @@ void GradientGenerateOp<Context>::DoRunWithType() {
Y->ReshapeLike(Input(i)); Y->ReshapeLike(Input(i));
math::Set( math::Set(
Y->count(), Y->count(),
cast::to<T>(defaults_[i]), convert::To<T>(defaults_[i]),
Y->template mutable_data<T, Context>(), Y->template mutable_data<T, Context>(),
ctx()); ctx());
} }
......
...@@ -105,7 +105,7 @@ void NLLLossGradientOp<Context>::DoRunWithType() { ...@@ -105,7 +105,7 @@ void NLLLossGradientOp<Context>::DoRunWithType() {
auto* dx = dX->template mutable_data<LogitType, Context>(); auto* dx = dX->template mutable_data<LogitType, Context>();
auto* mask = auto* mask =
ctx()->workspace()->template data<LogitType, Context>({num_preds + 1})[0]; ctx()->workspace()->template data<LogitType, Context>({num_preds + 1})[0];
math::Set(dX->count(), cast::to<LogitType>(0.f), dx, ctx()); math::Set(dX->count(), convert::To<LogitType>(0.f), dx, ctx());
kernel::NLLLossGrad( kernel::NLLLossGrad(
outer_dim, outer_dim,
......
...@@ -21,7 +21,7 @@ void AddOp<Context>::DoRunWithType() { ...@@ -21,7 +21,7 @@ void AddOp<Context>::DoRunWithType() {
B.template data<T, Context>(), B.template data<T, Context>(),
Output(0, {0, 1})->Reshape(Y_dims)->template mutable_data<T, Context>(), Output(0, {0, 1})->Reshape(Y_dims)->template mutable_data<T, Context>(),
ctx()); ctx());
} else if (utils::math::IsBinaryBroadcast(A.dims(), B.dims(), Y_dims)) { } else if (math::utils::IsBinaryBroadcast(A.dims(), B.dims(), Y_dims)) {
auto* Y = Output(0, CheckOutputAliases(A, B, Output(0), Y_dims)); auto* Y = Output(0, CheckOutputAliases(A, B, Output(0), Y_dims));
math::Add( math::Add(
A.ndim(), A.ndim(),
...@@ -51,7 +51,7 @@ void AddGradientOp<Context>::DoRunWithType() { ...@@ -51,7 +51,7 @@ void AddGradientOp<Context>::DoRunWithType() {
vec32_t A_broadcast_axes, B_broadcast_axes; vec32_t A_broadcast_axes, B_broadcast_axes;
vec32_t Y_dims(dY.dims().begin(), dY.dims().end()); vec32_t Y_dims(dY.dims().begin(), dY.dims().end());
utils::math::ComputeBinaryBroadcastAxes( math::utils::ComputeBinaryBroadcastAxes(
A.dims(), B.dims(), dY.dims(), A_broadcast_axes, B_broadcast_axes); A.dims(), B.dims(), dY.dims(), A_broadcast_axes, B_broadcast_axes);
if (dA->has_name()) { if (dA->has_name()) {
......
...@@ -21,7 +21,7 @@ void DivOp<Context>::DoRunWithType() { ...@@ -21,7 +21,7 @@ void DivOp<Context>::DoRunWithType() {
B.template data<T, Context>(), B.template data<T, Context>(),
Output(0, {0, 1})->Reshape(Y_dims)->template mutable_data<T, Context>(), Output(0, {0, 1})->Reshape(Y_dims)->template mutable_data<T, Context>(),
ctx()); ctx());
} else if (utils::math::IsBinaryBroadcast(A.dims(), B.dims(), Y_dims)) { } else if (math::utils::IsBinaryBroadcast(A.dims(), B.dims(), Y_dims)) {
auto* Y = Output(0, CheckOutputAliases(A, B, Output(0), Y_dims)); auto* Y = Output(0, CheckOutputAliases(A, B, Output(0), Y_dims));
math::Div( math::Div(
A.ndim(), A.ndim(),
...@@ -52,7 +52,7 @@ void DivGradientOp<Context>::DoRunWithType() { ...@@ -52,7 +52,7 @@ void DivGradientOp<Context>::DoRunWithType() {
vec32_t A_broadcast_axes, B_broadcast_axes; vec32_t A_broadcast_axes, B_broadcast_axes;
vec32_t Y_dims(dY.dims().begin(), dY.dims().end()); vec32_t Y_dims(dY.dims().begin(), dY.dims().end());
utils::math::ComputeBinaryBroadcastAxes( math::utils::ComputeBinaryBroadcastAxes(
A_ref.dims(), A_ref.dims(),
B_ref.dims(), B_ref.dims(),
dY.dims(), dY.dims(),
......
...@@ -93,7 +93,7 @@ DEFINE_INPLACE_UNARY_OP_IMPL(Invert, T); ...@@ -93,7 +93,7 @@ DEFINE_INPLACE_UNARY_OP_IMPL(Invert, T);
B.template data<T, Context>(), \ B.template data<T, Context>(), \
Y->Reshape(Y_dims)->template mutable_data<TOut, Context>(), \ Y->Reshape(Y_dims)->template mutable_data<TOut, Context>(), \
ctx()); \ ctx()); \
} else if (utils::math::IsBinaryBroadcast(A.dims(), B.dims(), Y_dims)) { \ } else if (math::utils::IsBinaryBroadcast(A.dims(), B.dims(), Y_dims)) { \
math::name( \ math::name( \
A.ndim(), \ A.ndim(), \
A.dims().data(), \ A.dims().data(), \
......
...@@ -13,7 +13,7 @@ void MaximumGradientOp<Context>::DoRunWithType() { ...@@ -13,7 +13,7 @@ void MaximumGradientOp<Context>::DoRunWithType() {
vec32_t A_broadcast_axes, B_broadcast_axes; vec32_t A_broadcast_axes, B_broadcast_axes;
vec32_t Y_dims(dY.dims().begin(), dY.dims().end()); vec32_t Y_dims(dY.dims().begin(), dY.dims().end());
utils::math::ComputeBinaryBroadcastAxes( math::utils::ComputeBinaryBroadcastAxes(
A.dims(), B.dims(), dY.dims(), A_broadcast_axes, B_broadcast_axes); A.dims(), B.dims(), dY.dims(), A_broadcast_axes, B_broadcast_axes);
// Temporal space to store the intermediate gradient // Temporal space to store the intermediate gradient
......
...@@ -13,7 +13,7 @@ void MinimumGradientOp<Context>::DoRunWithType() { ...@@ -13,7 +13,7 @@ void MinimumGradientOp<Context>::DoRunWithType() {
vec32_t A_broadcast_axes, B_broadcast_axes; vec32_t A_broadcast_axes, B_broadcast_axes;
vec32_t Y_dims(dY.dims().begin(), dY.dims().end()); vec32_t Y_dims(dY.dims().begin(), dY.dims().end());
utils::math::ComputeBinaryBroadcastAxes( math::utils::ComputeBinaryBroadcastAxes(
A.dims(), B.dims(), dY.dims(), A_broadcast_axes, B_broadcast_axes); A.dims(), B.dims(), dY.dims(), A_broadcast_axes, B_broadcast_axes);
// Temporal space to store the intermediate gradient // Temporal space to store the intermediate gradient
......
...@@ -40,7 +40,7 @@ void MomentsOp<Context>::DoRunWithType() { ...@@ -40,7 +40,7 @@ void MomentsOp<Context>::DoRunWithType() {
ctx()); ctx());
math::Set( math::Set(
1, 1,
cast::to<Ty>(0.f), convert::To<Ty>(0.f),
Y2->Reshape(Y_shape)->template mutable_data<Ty, Context>(), Y2->Reshape(Y_shape)->template mutable_data<Ty, Context>(),
ctx()); ctx());
} else { } else {
......
...@@ -21,7 +21,7 @@ void MulOp<Context>::DoRunWithType() { ...@@ -21,7 +21,7 @@ void MulOp<Context>::DoRunWithType() {
B.template data<T, Context>(), B.template data<T, Context>(),
Output(0, {0, 1})->Reshape(Y_dims)->template mutable_data<T, Context>(), Output(0, {0, 1})->Reshape(Y_dims)->template mutable_data<T, Context>(),
ctx()); ctx());
} else if (utils::math::IsBinaryBroadcast(A.dims(), B.dims(), Y_dims)) { } else if (math::utils::IsBinaryBroadcast(A.dims(), B.dims(), Y_dims)) {
auto* Y = Output(0, CheckOutputAliases(A, B, Output(0), Y_dims)); auto* Y = Output(0, CheckOutputAliases(A, B, Output(0), Y_dims));
math::Mul( math::Mul(
A.ndim(), A.ndim(),
...@@ -52,7 +52,7 @@ void MulGradientOp<Context>::DoRunWithType() { ...@@ -52,7 +52,7 @@ void MulGradientOp<Context>::DoRunWithType() {
vec32_t A_broadcast_axes, B_broadcast_axes; vec32_t A_broadcast_axes, B_broadcast_axes;
vec32_t Y_dims(dY.dims().begin(), dY.dims().end()); vec32_t Y_dims(dY.dims().begin(), dY.dims().end());
utils::math::ComputeBinaryBroadcastAxes( math::utils::ComputeBinaryBroadcastAxes(
A_ref.dims(), A_ref.dims(),
B_ref.dims(), B_ref.dims(),
dY.dims(), dY.dims(),
......
...@@ -12,7 +12,7 @@ void PowGradientOp<Context>::DoRunWithType() { ...@@ -12,7 +12,7 @@ void PowGradientOp<Context>::DoRunWithType() {
vec32_t A_broadcast_axes, B_broadcast_axes; vec32_t A_broadcast_axes, B_broadcast_axes;
vec32_t Y_dims(dY.dims().begin(), dY.dims().end()); vec32_t Y_dims(dY.dims().begin(), dY.dims().end());
utils::math::ComputeBinaryBroadcastAxes( math::utils::ComputeBinaryBroadcastAxes(
A.dims(), B.dims(), dY.dims(), A_broadcast_axes, B_broadcast_axes); A.dims(), B.dims(), dY.dims(), A_broadcast_axes, B_broadcast_axes);
// Temporal space to store the intermediate gradient // Temporal space to store the intermediate gradient
...@@ -99,7 +99,7 @@ void PowGradientOp<Context>::DoRunWithType() { ...@@ -99,7 +99,7 @@ void PowGradientOp<Context>::DoRunWithType() {
ctx()); ctx());
math::ReplaceNaN( math::ReplaceNaN(
A.count(), A.count(),
cast::to<T>(0.f), convert::To<T>(0.f),
dA->template data<T, Context>(), dA->template data<T, Context>(),
dA->template mutable_data<T, Context>(), dA->template mutable_data<T, Context>(),
ctx()); ctx());
...@@ -141,7 +141,7 @@ void PowGradientOp<Context>::DoRunWithType() { ...@@ -141,7 +141,7 @@ void PowGradientOp<Context>::DoRunWithType() {
A.template data<T, Context>(), A.template data<T, Context>(),
scratch, scratch,
ctx()); ctx());
math::ReplaceNaN(Y.count(), cast::to<T>(0.f), scratch, scratch, ctx()); math::ReplaceNaN(Y.count(), convert::To<T>(0.f), scratch, scratch, ctx());
if (B_broadcast_axes.empty()) { if (B_broadcast_axes.empty()) {
math::Mul( math::Mul(
Y.count(), scratch, B.template data<T, Context>(), scratch, ctx()); Y.count(), scratch, B.template data<T, Context>(), scratch, ctx());
......
...@@ -9,7 +9,7 @@ void SignGradientOp<Context>::DoRunWithType() { ...@@ -9,7 +9,7 @@ void SignGradientOp<Context>::DoRunWithType() {
auto &dY = Input(0), *dX = Output(0); auto &dY = Input(0), *dX = Output(0);
math::Set( math::Set(
dY.count(), dY.count(),
cast::to<T>(0.f), convert::To<T>(0.f),
dX->ReshapeLike(dY)->template mutable_data<T, Context>(), dX->ReshapeLike(dY)->template mutable_data<T, Context>(),
ctx()); ctx());
} }
......
...@@ -21,7 +21,7 @@ void SubOp<Context>::DoRunWithType() { ...@@ -21,7 +21,7 @@ void SubOp<Context>::DoRunWithType() {
B.template data<T, Context>(), B.template data<T, Context>(),
Output(0, {0, 1})->Reshape(Y_dims)->template mutable_data<T, Context>(), Output(0, {0, 1})->Reshape(Y_dims)->template mutable_data<T, Context>(),
ctx()); ctx());
} else if (utils::math::IsBinaryBroadcast(A.dims(), B.dims(), Y_dims)) { } else if (math::utils::IsBinaryBroadcast(A.dims(), B.dims(), Y_dims)) {
auto* Y = Output(0, CheckOutputAliases(A, B, Output(0), Y_dims)); auto* Y = Output(0, CheckOutputAliases(A, B, Output(0), Y_dims));
math::Sub( math::Sub(
A.ndim(), A.ndim(),
...@@ -51,7 +51,7 @@ void SubGradientOp<Context>::DoRunWithType() { ...@@ -51,7 +51,7 @@ void SubGradientOp<Context>::DoRunWithType() {
vec32_t A_broadcast_axes, B_broadcast_axes; vec32_t A_broadcast_axes, B_broadcast_axes;
vec32_t Y_dims(dY.dims().begin(), dY.dims().end()); vec32_t Y_dims(dY.dims().begin(), dY.dims().end());
utils::math::ComputeBinaryBroadcastAxes( math::utils::ComputeBinaryBroadcastAxes(
A.dims(), B.dims(), dY.dims(), A_broadcast_axes, B_broadcast_axes); A.dims(), B.dims(), dY.dims(), A_broadcast_axes, B_broadcast_axes);
if (dA->has_name()) { if (dA->has_name()) {
......
...@@ -19,47 +19,92 @@ void BatchNormOp<Context>::TrainingImpl() { ...@@ -19,47 +19,92 @@ void BatchNormOp<Context>::TrainingImpl() {
auto* X_bias = Buffer("X_bias")->Reshape({C_}); auto* X_bias = Buffer("X_bias")->Reshape({C_});
auto* x = Input(0).template data<InputType, Context>(); auto* x = Input(0).template data<InputType, Context>();
auto* gamma = Input(1).template data<ParamType, Context>();
auto* beta = Input(2).template data<ParamType, Context>();
auto* rm = Input(3).template mutable_data<ParamType, Context>(); auto* rm = Input(3).template mutable_data<ParamType, Context>();
auto* rv = Input(4).template mutable_data<ParamType, Context>(); auto* rv = Input(4).template mutable_data<ParamType, Context>();
auto* mu = X_mu->template mutable_data<ParamType, Context>(); auto* mu = X_mu->template mutable_data<ParamType, Context>();
auto* rsig = X_rsig->template mutable_data<ParamType, Context>(); auto* rsig = X_rsig->template mutable_data<ParamType, Context>();
auto* scale = X_scale->template mutable_data<ParamType, Context>(); auto* scale = X_scale->template mutable_data<ParamType, Context>();
auto* bias = X_bias->template mutable_data<ParamType, Context>();
auto* y = Output(0)->template mutable_data<InputType, Context>();
// Compute moments // Compute moments
if (data_format() == "NCHW") { if (sync_stats_ > 0) {
vec32_t dims = {(int)N_, (int)C_, (int)S_}; #ifdef USE_MPI
vec32_t axes = {0, 2}; // Compute E(X) and E(X^2)
kernel::Moments(3, dims.data(), 2, axes.data(), x, mu, rsig, ctx()); kernel::BatchNormExpectation(
} else if (data_format() == "NHWC") { N_,
vec32_t dims = {(int)(N_ * S_), (int)C_}; C_,
vec32_t axes = {0}; S_,
kernel::Moments(2, dims.data(), 1, axes.data(), x, mu, rsig, ctx()); ParamType(1) / (N_ * comm_size_ * S_),
data_format(),
x,
mu,
rsig,
ctx());
// Compute D(X) = E(X^2) - E(X)^2
ctx()->FinishDeviceComputation();
if (enable_nccl_) {
#ifdef USE_NCCL
auto nccl_comm_ = this->nccl_comm();
auto nccl_dtype_ = this->template nccl_dtype<ParamType>();
NCCL_CHECK(ncclAllReduce(
(void*)mu,
(void*)mu,
C_,
nccl_dtype_,
ncclSum,
nccl_comm_,
((CUDAContext*)ctx())->cuda_stream()));
NCCL_CHECK(ncclAllReduce(
(void*)rsig,
(void*)rsig,
C_,
nccl_dtype_,
ncclSum,
nccl_comm_,
((CUDAContext*)ctx())->cuda_stream()));
#endif // USE_NCCL
} else {
AllReduce(mu, mu, C_);
AllReduce(rsig, rsig, C_);
}
math::Square(C_, mu, scale, ctx());
math::Sub(C_, rsig, scale, rsig, ctx());
#endif // USE_MPI
} else {
if (data_format() == "NCHW") {
vec32_t dims = {(int)N_, (int)C_, (int)S_};
vec32_t axes = {0, 2};
kernel::Moments(3, dims.data(), 2, axes.data(), x, mu, rsig, ctx());
} else if (data_format() == "NHWC") {
vec32_t dims = {(int)(N_ * S_), (int)C_};
vec32_t axes = {0};
kernel::Moments(2, dims.data(), 1, axes.data(), x, mu, rsig, ctx());
}
} }
// Compute running statistics // Compute running statistics
if (is_recomputing_ == 0) { if (is_recomputing_ == 0) {
// Running(X) = (1 - momentum) * Cur(X) + momentum * Running(X)
math::Axpby(C_, 1.f - momentum_, mu, momentum_, rm, ctx()); math::Axpby(C_, 1.f - momentum_, mu, momentum_, rm, ctx());
math::Axpby(C_, 1.f - momentum_, rsig, momentum_, rv, ctx()); math::Axpby(C_, 1.f - momentum_, rsig, momentum_, rv, ctx());
} }
// Fuse parameters along channel axis // Inverse stddev from variance
// [mu, rsig, alpha, beta] => [scale, bias]
math::InvStd(C_, epsilon_, rsig, rsig, ctx()); math::InvStd(C_, epsilon_, rsig, rsig, ctx());
math::Mul(C_, gamma, rsig, scale, ctx());
math::Mul(C_, scale, mu, bias, ctx()); // Fuse parameters to compute affine transformation
math::Sub(C_, beta, bias, bias, ctx()); kernel::BatchNorm(
N_,
// Compute affine transformation C_,
if (data_format() == "NCHW") { S_,
kernel::ChannelAffine(N_, S_, C_, x, scale, bias, y, ctx()); data_format(),
} else if (data_format() == "NHWC") { x,
kernel::ChannelAffine(N_ * S_, 1, C_, x, scale, bias, y, ctx()); mu,
} rsig,
Input(1).template data<ParamType, Context>(), // gamma
Input(2).template data<ParamType, Context>(), // beta
scale,
X_bias->template mutable_data<ParamType, Context>(),
Output(0)->template mutable_data<InputType, Context>(),
ctx());
} }
template <class Context> template <class Context>
...@@ -70,31 +115,30 @@ void BatchNormOp<Context>::InferenceImpl() { ...@@ -70,31 +115,30 @@ void BatchNormOp<Context>::InferenceImpl() {
TENSOR_FILL_WITH_TYPE(Input(3), vec64_t({C_}), ParamType); TENSOR_FILL_WITH_TYPE(Input(3), vec64_t({C_}), ParamType);
TENSOR_FILL_WITH_TYPE(Input(4), vec64_t({C_}), ParamType); TENSOR_FILL_WITH_TYPE(Input(4), vec64_t({C_}), ParamType);
auto* X_rsig = Buffer("X_rsig")->Reshape({C_});
auto* X_scale = Buffer("X_scale")->Reshape({C_}); auto* X_scale = Buffer("X_scale")->Reshape({C_});
auto* X_bias = Buffer("X_bias")->Reshape({C_}); auto* X_bias = Buffer("X_bias")->Reshape({C_});
auto* x = Input(0).template data<InputType, Context>();
auto* gamma = Input(1).template data<ParamType, Context>();
auto* beta = Input(2).template data<ParamType, Context>();
auto* rm = Input(3).template data<ParamType, Context>();
auto* rv = Input(4).template data<ParamType, Context>(); auto* rv = Input(4).template data<ParamType, Context>();
auto* scale = X_scale->template mutable_data<ParamType, Context>(); auto* rsig = X_rsig->template mutable_data<ParamType, Context>();
auto* bias = X_bias->template mutable_data<ParamType, Context>();
auto* y = Output(0)->template mutable_data<InputType, Context>(); // Inverse stddev from variance
math::InvStd(C_, epsilon_, rv, rsig, ctx());
// Fuse parameters along channel axis
// [mu, rsig, alpha, beta] => [scale, bias] // Fuse parameters to compute affine transformation
math::InvStd(C_, epsilon_, rv, bias, ctx()); kernel::BatchNorm(
math::Mul(C_, gamma, bias, scale, ctx()); N_,
math::Mul(C_, scale, rm, bias, ctx()); C_,
math::Sub(C_, beta, bias, bias, ctx()); S_,
data_format(),
// Compute affine transformation Input(0).template data<InputType, Context>(),
if (data_format() == "NCHW") { Input(3).template data<ParamType, Context>(),
kernel::ChannelAffine(N_, S_, C_, x, scale, bias, y, ctx()); rsig,
} else if (data_format() == "NHWC") { Input(1).template data<ParamType, Context>(), // gamma
kernel::ChannelAffine(N_ * S_, 1, C_, x, scale, bias, y, ctx()); Input(2).template data<ParamType, Context>(), // beta
} X_scale->template mutable_data<ParamType, Context>(),
X_bias->template mutable_data<ParamType, Context>(),
Output(0)->template mutable_data<InputType, Context>(),
ctx());
} }
template <class Context> template <class Context>
...@@ -113,9 +157,15 @@ void BatchNormOp<Context>::RunOnDevice() { ...@@ -113,9 +157,15 @@ void BatchNormOp<Context>::RunOnDevice() {
} else { } else {
InferenceImpl<float, float>(); InferenceImpl<float, float>();
} }
} else if (Input(0).template IsType<float16>()) {
if (is_training_) {
TrainingImpl<float16, float>();
} else {
InferenceImpl<float16, float>();
}
} else { } else {
LOG(FATAL) << MessageForUnsupported( LOG(FATAL) << MessageForUnsupported(
types::to_string(Input(0).meta()), {"float32"}); types::to_string(Input(0).meta()), {"float16", "float32"});
} }
} }
...@@ -124,21 +174,71 @@ template <typename InputType, typename ParamType> ...@@ -124,21 +174,71 @@ template <typename InputType, typename ParamType>
void BatchNormGradientOp<Context>::TrainingImpl() { void BatchNormGradientOp<Context>::TrainingImpl() {
auto *dX = Output(0), *dW = Output(1), *dB = Output(2); auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
auto *X_mu = Buffer("X_mu"), *X_rsig = Buffer("X_rsig"); auto *X_mu = Buffer("X_mu"), *X_rsig = Buffer("X_rsig");
auto *X_scale = Buffer("X_scale"), *X_bias = Buffer("X_bias");
// Gradient w.r.t. gamma, beta and input auto* x = Input(0).template data<InputType, Context>();
kernel::BatchNormBackwardTraining( auto* gamma = Input(1).template data<ParamType, Context>();
auto* dy = Input(4).template data<InputType, Context>();
auto* mu = X_mu->template data<ParamType, Context>();
auto* rsig = X_rsig->template data<ParamType, Context>();
auto* scale = X_scale->template mutable_data<ParamType, Context>();
auto* bias = X_bias->template mutable_data<ParamType, Context>();
auto* dgamma = dW->Reshape({C_})->template mutable_data<ParamType, Context>();
auto* dbeta = dB->Reshape({C_})->template mutable_data<ParamType, Context>();
// Gradient w.r.t. gamma and beta
kernel::BatchNormInternalGrad(
N_, C_, S_, data_format(), x, mu, rsig, gamma, dy, dgamma, dbeta, ctx());
if (sync_stats_ > 0) {
#ifdef USE_MPI
ctx()->FinishDeviceComputation();
if (enable_nccl_) {
#ifdef USE_NCCL
auto nccl_comm_ = this->nccl_comm();
auto nccl_dtype_ = this->template nccl_dtype<ParamType>();
NCCL_CHECK(ncclAllReduce(
(void*)dgamma,
(void*)scale,
C_,
nccl_dtype_,
ncclSum,
nccl_comm_,
((CUDAContext*)ctx())->cuda_stream()));
NCCL_CHECK(ncclAllReduce(
(void*)dbeta,
(void*)bias,
C_,
nccl_dtype_,
ncclSum,
nccl_comm_,
((CUDAContext*)ctx())->cuda_stream()));
#endif // USE_NCCL
} else {
AllReduce(dgamma, scale, C_);
AllReduce(dbeta, bias, C_);
}
math::Scale(C_, ParamType(1) / comm_size_, scale, scale, ctx());
math::Scale(C_, ParamType(1) / comm_size_, bias, bias, ctx());
#endif // USE_MPI
} else {
scale = dgamma, bias = dbeta;
}
// Gradient w.r.t. input
kernel::BatchNormTrainingGrad(
N_, N_,
C_, C_,
S_, S_,
data_format(), data_format(),
Input(0).template data<InputType, Context>(), // x x,
X_mu->template data<ParamType, Context>(), // mu mu,
X_rsig->template data<ParamType, Context>(), // rsig rsig,
Input(1).template data<ParamType, Context>(), // gamma gamma,
Input(4).template data<InputType, Context>(), // dy scale,
Output(0)->template mutable_data<InputType, Context>(), // dx bias,
dW->Reshape({C_})->template mutable_data<ParamType, Context>(), // dgamma dy,
dB->Reshape({C_})->template mutable_data<ParamType, Context>(), // dbeta Output(0)->template mutable_data<InputType, Context>(),
ctx()); ctx());
} }
...@@ -158,11 +258,11 @@ void BatchNormGradientOp<Context>::InferenceImpl() { ...@@ -158,11 +258,11 @@ void BatchNormGradientOp<Context>::InferenceImpl() {
dbeta = dB->Reshape({C_})->template mutable_data<ParamType, Context>(); dbeta = dB->Reshape({C_})->template mutable_data<ParamType, Context>();
} }
// Restore inverse stddev from variance // Inverse stddev from variance
math::InvStd(C_, epsilon_, rv, rsig, ctx()); math::InvStd(C_, epsilon_, rv, rsig, ctx());
// Gradient w.r.t. gamma, beta and input // Gradient w.r.t. gamma, beta and input
kernel::BatchNormBackwardInference( kernel::BatchNormInferenceGrad(
N_, N_,
C_, C_,
S_, S_,
...@@ -172,9 +272,9 @@ void BatchNormGradientOp<Context>::InferenceImpl() { ...@@ -172,9 +272,9 @@ void BatchNormGradientOp<Context>::InferenceImpl() {
rsig, rsig,
Input(1).template data<ParamType, Context>(), // gamma Input(1).template data<ParamType, Context>(), // gamma
Input(4).template data<InputType, Context>(), // dy Input(4).template data<InputType, Context>(), // dy
dX->template mutable_data<InputType, Context>(),
dgamma, dgamma,
dbeta, dbeta,
dX->template mutable_data<InputType, Context>(),
ctx()); ctx());
} }
...@@ -190,9 +290,15 @@ void BatchNormGradientOp<Context>::RunOnDevice() { ...@@ -190,9 +290,15 @@ void BatchNormGradientOp<Context>::RunOnDevice() {
} else { } else {
InferenceImpl<float, float>(); InferenceImpl<float, float>();
} }
} else if (Input(0).template IsType<float16>()) {
if (is_training_ > 0) {
TrainingImpl<float16, float>();
} else {
InferenceImpl<float16, float>();
}
} else { } else {
LOG(FATAL) << MessageForUnsupported( LOG(FATAL) << MessageForUnsupported(
types::to_string(Input(0).meta()), {"float32"}); types::to_string(Input(0).meta()), {"float16", "float32"});
} }
} }
......
...@@ -35,7 +35,8 @@ class BatchNormOpBase : public GenericOpBase<Context> { ...@@ -35,7 +35,8 @@ class BatchNormOpBase : public GenericOpBase<Context> {
: GenericOpBase<Context>(def, ws), : GenericOpBase<Context>(def, ws),
momentum_(OP_SINGLE_ARG(float, "momentum", 0.9f)), momentum_(OP_SINGLE_ARG(float, "momentum", 0.9f)),
epsilon_(OP_SINGLE_ARG(double, "epsilon", 1e-5)), epsilon_(OP_SINGLE_ARG(double, "epsilon", 1e-5)),
use_stats_(OP_SINGLE_ARG(int64_t, "use_stats", -1)) {} use_stats_(OP_SINGLE_ARG(int64_t, "use_stats", -1)),
sync_stats_(OP_SINGLE_ARG(int64_t, "comm", 0) > 0 ? 1 : 0) {}
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
void DetermineBaseArguments() { void DetermineBaseArguments() {
...@@ -58,7 +59,8 @@ class BatchNormOpBase : public GenericOpBase<Context> { ...@@ -58,7 +59,8 @@ class BatchNormOpBase : public GenericOpBase<Context> {
protected: protected:
float momentum_; float momentum_;
double epsilon_; double epsilon_;
int64_t use_stats_, N_, C_, S_; int64_t N_, C_, S_;
int64_t use_stats_, sync_stats_;
int64_t is_training_, is_recomputing_; int64_t is_training_, is_recomputing_;
}; };
...@@ -69,6 +71,7 @@ class BatchNormOpBase : public GenericOpBase<Context> { ...@@ -69,6 +71,7 @@ class BatchNormOpBase : public GenericOpBase<Context> {
using BatchNormOpBase<Context>::momentum_; \ using BatchNormOpBase<Context>::momentum_; \
using BatchNormOpBase<Context>::epsilon_; \ using BatchNormOpBase<Context>::epsilon_; \
using BatchNormOpBase<Context>::use_stats_; \ using BatchNormOpBase<Context>::use_stats_; \
using BatchNormOpBase<Context>::sync_stats_; \
using BatchNormOpBase<Context>::N_; \ using BatchNormOpBase<Context>::N_; \
using BatchNormOpBase<Context>::C_; \ using BatchNormOpBase<Context>::C_; \
using BatchNormOpBase<Context>::S_; \ using BatchNormOpBase<Context>::S_; \
...@@ -82,6 +85,9 @@ class BatchNormOp : public BatchNormOpBase<Context> { ...@@ -82,6 +85,9 @@ class BatchNormOp : public BatchNormOpBase<Context> {
: BatchNormOpBase<Context>(def, ws) {} : BatchNormOpBase<Context>(def, ws) {}
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
USE_BATCHNORM_FUNCTIONS; USE_BATCHNORM_FUNCTIONS;
#ifdef USE_MPI
USE_COLLECTIVE_FUNCTIONS;
#endif
void RunOnDevice() override; void RunOnDevice() override;
...@@ -99,50 +105,19 @@ class BatchNormGradientOp : public BatchNormOpBase<Context> { ...@@ -99,50 +105,19 @@ class BatchNormGradientOp : public BatchNormOpBase<Context> {
: BatchNormOpBase<Context>(def, ws) {} : BatchNormOpBase<Context>(def, ws) {}
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
USE_BATCHNORM_FUNCTIONS; USE_BATCHNORM_FUNCTIONS;
void RunOnDevice() override;
template <typename InputType, typename ParamType>
void TrainingImpl();
template <typename InputType, typename ParamType>
void InferenceImpl();
};
#ifdef USE_MPI #ifdef USE_MPI
template <class Context>
class SyncBatchNormOp : public BatchNormOp<Context> {
public:
SyncBatchNormOp(const OperatorDef& def, Workspace* ws)
: BatchNormOp<Context>(def, ws) {}
USE_OPERATOR_FUNCTIONS;
USE_BATCHNORM_FUNCTIONS;
USE_COLLECTIVE_FUNCTIONS; USE_COLLECTIVE_FUNCTIONS;
#endif
void RunOnDevice() override; void RunOnDevice() override;
template <typename InputType, typename ParamType> template <typename InputType, typename ParamType>
void TrainingImpl(); void TrainingImpl();
};
template <class Context>
class SyncBatchNormGradientOp : public BatchNormGradientOp<Context> {
public:
SyncBatchNormGradientOp(const OperatorDef& def, Workspace* ws)
: BatchNormGradientOp<Context>(def, ws) {}
USE_OPERATOR_FUNCTIONS;
USE_BATCHNORM_FUNCTIONS;
USE_COLLECTIVE_FUNCTIONS;
void RunOnDevice() override;
template <typename InputType, typename ParamType> template <typename InputType, typename ParamType>
void TrainingImpl(); void InferenceImpl();
}; };
#endif // USE_MPI
#ifdef USE_CUDNN #ifdef USE_CUDNN
template <class Context> template <class Context>
......
#ifdef USE_MPI #ifdef USE_MPI
#include "dragon/core/workspace.h"
#include "dragon/operators/normalization/batch_norm_op.h" #include "dragon/operators/normalization/batch_norm_op.h"
#include "dragon/utils/filler.h"
#include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
template <class Context> REGISTER_CPU_OPERATOR(SyncBatchNorm, BatchNormOp<CPUContext>);
template <typename InputType, typename ParamType> REGISTER_CPU_OPERATOR(SyncBatchNormGradient, BatchNormGradientOp<CPUContext>);
void SyncBatchNormOp<Context>::TrainingImpl() {
TENSOR_FILL_WITH_TYPE(Input(1), vec64_t({C_}), ParamType);
TENSOR_FILL_WITH_TYPE(Input(2), vec64_t({C_}), ParamType);
TENSOR_FILL_WITH_TYPE(Input(3), vec64_t({C_}), ParamType);
TENSOR_FILL_WITH_TYPE(Input(4), vec64_t({C_}), ParamType);
auto* X_mu = Buffer("X_mu")->Reshape({C_});
auto* X_rsig = Buffer("X_rsig")->Reshape({C_});
auto* X_scale = Buffer("X_scale")->Reshape({C_});
auto* X_bias = Buffer("X_bias")->Reshape({C_});
auto* x = Input(0).template data<InputType, Context>();
auto* gamma = Input(1).template data<ParamType, Context>();
auto* beta = Input(2).template data<ParamType, Context>();
auto* rm = Input(3).template mutable_data<ParamType, Context>();
auto* rv = Input(4).template mutable_data<ParamType, Context>();
auto* mu = X_mu->template mutable_data<ParamType, Context>();
auto* rsig = X_rsig->template mutable_data<ParamType, Context>();
auto* scale = X_scale->template mutable_data<ParamType, Context>();
auto* bias = X_bias->template mutable_data<ParamType, Context>();
auto* y = Output(0)->template mutable_data<InputType, Context>();
// Compute E(X) and E(X^2)
kernel::BatchNormExpectation(
N_,
C_,
S_,
ParamType(1) / (N_ * comm_size_ * S_),
data_format(),
x,
mu,
rsig,
ctx());
// Compute D(X) = E(X^2) - E(X)^2
ctx()->FinishDeviceComputation();
if (enable_nccl_) {
#ifdef USE_NCCL
auto nccl_comm_ = this->nccl_comm();
auto nccl_dtype_ = this->template nccl_dtype<ParamType>();
NCCL_CHECK(ncclAllReduce(
(void*)mu,
(void*)mu,
C_,
nccl_dtype_,
ncclSum,
nccl_comm_,
((CUDAContext*)ctx())->cuda_stream()));
NCCL_CHECK(ncclAllReduce(
(void*)rsig,
(void*)rsig,
C_,
nccl_dtype_,
ncclSum,
nccl_comm_,
((CUDAContext*)ctx())->cuda_stream()));
#endif
} else {
AllReduce(mu, mu, C_);
AllReduce(rsig, rsig, C_);
}
math::Square(C_, mu, y, ctx());
math::Sub(C_, rsig, y, rsig, ctx());
// Compute running statistics
if (is_recomputing_ == 0) {
// Running(X) = (1 - momentum) * Cur(X) + momentum * Running(X)
math::Axpby(C_, 1.f - momentum_, mu, momentum_, rm, ctx());
math::Axpby(C_, 1.f - momentum_, rsig, momentum_, rv, ctx());
}
// Fuse parameters along channel axis
// [mu, rsig, alpha, beta] => [scale, bias]
math::InvStd(C_, epsilon_, rsig, rsig, ctx());
math::Mul(C_, gamma, rsig, scale, ctx());
math::Mul(C_, scale, mu, bias, ctx());
math::Sub(C_, beta, bias, bias, ctx());
// Compute affine transformation
if (data_format() == "NCHW") {
kernel::ChannelAffine(N_, S_, C_, x, scale, bias, y, ctx());
} else if (data_format() == "NHWC") {
kernel::ChannelAffine(N_ * S_, 1, C_, x, scale, bias, y, ctx());
}
}
template <class Context>
void SyncBatchNormOp<Context>::RunOnDevice() {
DetermineBaseArguments();
// Get the recomputing flag
auto* flag = workspace()->GetTensor("/share/flag/recomputing");
is_recomputing_ = flag->template data<bool, CPUContext>()[0] ? 1 : 0;
// Dispatch the training or inference impl
Output(0)->ReshapeLike(Input(0));
if (Input(0).template IsType<float>()) {
if (is_training_ > 0) {
TrainingImpl<float, float>();
} else {
this->template InferenceImpl<float, float>();
}
} else {
LOG(FATAL) << MessageForUnsupported(
types::to_string(Input(0).meta()), {"float32"});
}
}
template <class Context>
template <typename InputType, typename ParamType>
void SyncBatchNormGradientOp<Context>::TrainingImpl() {
auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
auto *X_mu = Buffer("X_mu"), *X_rsig = Buffer("X_rsig");
auto *X_scale = Buffer("X_scale"), *X_bias = Buffer("X_bias");
auto* x = Input(0).template data<InputType, Context>();
auto* gamma = Input(1).template data<ParamType, Context>();
auto* dy = Input(4).template data<InputType, Context>();
auto* mu = X_mu->template data<ParamType, Context>();
auto* rsig = X_rsig->template data<ParamType, Context>();
auto* scale = X_scale->template mutable_data<ParamType, Context>();
auto* bias = X_bias->template mutable_data<ParamType, Context>();
auto* dgamma = dW->Reshape({C_})->template mutable_data<ParamType, Context>();
auto* dbeta = dB->Reshape({C_})->template mutable_data<ParamType, Context>();
// Gradient w.r.t. gamma and beta of local batch
kernel::BatchNormInternalGrad(
N_, C_, S_, data_format(), x, mu, rsig, gamma, dy, dgamma, dbeta, ctx());
// Gradient w.r.t. gamma and beta of global batch
ctx()->FinishDeviceComputation();
if (enable_nccl_) {
#ifdef USE_NCCL
auto nccl_comm_ = this->nccl_comm();
auto nccl_dtype_ = this->template nccl_dtype<ParamType>();
NCCL_CHECK(ncclAllReduce(
(void*)dgamma,
(void*)scale,
C_,
nccl_dtype_,
ncclSum,
nccl_comm_,
((CUDAContext*)ctx())->cuda_stream()));
NCCL_CHECK(ncclAllReduce(
(void*)dbeta,
(void*)bias,
C_,
nccl_dtype_,
ncclSum,
nccl_comm_,
((CUDAContext*)ctx())->cuda_stream()));
#endif
} else {
AllReduce(dgamma, scale, C_);
AllReduce(dbeta, bias, C_);
}
math::Scale(C_, ParamType(1) / comm_size_, scale, scale, ctx());
math::Scale(C_, ParamType(1) / comm_size_, bias, bias, ctx());
// Gradient w.r.t. input
kernel::BatchNormTrainingGrad(
N_,
C_,
S_,
data_format(),
x,
mu,
rsig,
gamma,
scale,
bias,
dy,
Output(0)->template mutable_data<InputType, Context>(),
ctx());
}
template <class Context>
void SyncBatchNormGradientOp<Context>::RunOnDevice() {
DetermineBaseArguments();
// Dispatch the training or inference impl
Output(0)->ReshapeLike(Input(0));
if (Input(0).template IsType<float>()) {
if (is_training_ > 0) {
TrainingImpl<float, float>();
} else {
this->template InferenceImpl<float, float>();
}
} else {
LOG(FATAL) << MessageForUnsupported(
types::to_string(Input(0).meta()), {"float32"});
}
}
DEPLOY_CPU_OPERATOR(SyncBatchNorm);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(SyncBatchNorm);
#endif
DEPLOY_CPU_OPERATOR(SyncBatchNormGradient);
#ifdef USE_CUDA #ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(SyncBatchNormGradient); REGISTER_CUDA_OPERATOR(SyncBatchNorm, BatchNormOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(SyncBatchNormGradient, BatchNormGradientOp<CUDAContext>);
#endif #endif
OPERATOR_SCHEMA(SyncBatchNorm) OPERATOR_SCHEMA(SyncBatchNorm)
......
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!