Commit a79a3bba by Ting PAN

Refactor the python distribution script

Summary:
This commit refactors setup.py to collect the Python package and the
compiled dynamic libraries through the standard setuptools commands
(build_py, install, bdist_wheel) rather than a custom configure step.
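A minimal usage sketch, assuming the native libraries were already built into ../targets/native and setup.py is run from its own directory:

    python setup.py build_py      # copy sources, headers and pre-built libraries into dragon/
    python setup.py bdist_wheel   # package a non-pure wheel
    python setup.py install       # old-style install (no egg)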
1 parent 494774d3
Showing with 404 additions and 449 deletions
...@@ -62,6 +62,7 @@ if (USE_MPI) ...@@ -62,6 +62,7 @@ if (USE_MPI)
list(APPEND THIRD_PARTY_LIBRARY_DIRS ${THIRD_PARTY_DIR}/mpi/lib) list(APPEND THIRD_PARTY_LIBRARY_DIRS ${THIRD_PARTY_DIR}/mpi/lib)
endif() endif()
if (USE_CUDNN) if (USE_CUDNN)
list(APPEND THIRD_PARTY_LIBRARY_DIRS ${THIRD_PARTY_DIR}/cudnn/lib)
list(APPEND THIRD_PARTY_LIBRARY_DIRS ${THIRD_PARTY_DIR}/cudnn/lib64) list(APPEND THIRD_PARTY_LIBRARY_DIRS ${THIRD_PARTY_DIR}/cudnn/lib64)
list(APPEND THIRD_PARTY_LIBRARY_DIRS ${THIRD_PARTY_DIR}/cudnn/lib/x64) list(APPEND THIRD_PARTY_LIBRARY_DIRS ${THIRD_PARTY_DIR}/cudnn/lib/x64)
endif() endif()
......
...@@ -147,7 +147,7 @@ class DRAGON_API Tensor { ...@@ -147,7 +147,7 @@ class DRAGON_API Tensor {
} else { } else {
auto* new_memory = other->memory(); auto* new_memory = other->memory();
if (new_memory != nullptr) { if (new_memory != nullptr) {
CHECK_LE(size_, new_memory->size()) CHECK_LE(size_ * meta_.itemsize(), new_memory->size())
<< "\nMap from a memory with smaller capacity."; << "\nMap from a memory with smaller capacity.";
mapped_memory_ = new_memory; mapped_memory_ = new_memory;
capacity_ = new_memory->size(); capacity_ = new_memory->size();
......
...@@ -25,33 +25,33 @@ void _DropPath( ...@@ -25,33 +25,33 @@ void _DropPath(
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \ #define DEFINE_KERNEL_LAUNCHER(T) \
template <> \ template <> \
void DropPath<T, CPUContext>( \ void DropPath<T, CPUContext>( \
const int N, \ const int N, \
const int C, \ const int C, \
const float ratio, \ const float ratio, \
const float scale, \ const float scale, \
const T* x, \ const T* x, \
T* y, \ T* y, \
uint8_t* mask, \ uint8_t* mask, \
uint32_t* /* r */, \ uint32_t* /* r */, \
CPUContext* ctx) { \ CPUContext* ctx) { \
math::RandomBernoulli(N, 1.f - ratio, mask, ctx); \ math::RandomBernoulli(N, 1.f - ratio, mask, ctx); \
_DropPath(N, C, math::AccmulatorType<T>::type(scale), mask, x, y); \ _DropPath(N, C, math::AccumulatorType<T>::type(scale), mask, x, y); \
} }
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \ template <> \
void DropPathGrad<T, CPUContext>( \ void DropPathGrad<T, CPUContext>( \
const int N, \ const int N, \
const int C, \ const int C, \
const float scale, \ const float scale, \
const uint8_t* mask, \ const uint8_t* mask, \
const T* dy, \ const T* dy, \
T* dx, \ T* dx, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_DropPath(N, C, math::AccmulatorType<T>::type(scale), mask, dy, dx); \ _DropPath(N, C, math::AccumulatorType<T>::type(scale), mask, dy, dx); \
} }
DEFINE_KERNEL_LAUNCHER(float16); DEFINE_KERNEL_LAUNCHER(float16);
......
...@@ -62,7 +62,7 @@ __global__ void _DropPathGrad( ...@@ -62,7 +62,7 @@ __global__ void _DropPathGrad(
_DropPath<<<CUDA_BLOCKS(NxC), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ _DropPath<<<CUDA_BLOCKS(NxC), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxC, \ NxC, \
C, \ C, \
convert::To<math::AccmulatorType<T>::type>(scale), \ convert::To<math::AccumulatorType<T>::type>(scale), \
static_cast<uint32_t>(UINT_MAX * ratio), \ static_cast<uint32_t>(UINT_MAX * ratio), \
r, \ r, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \ reinterpret_cast<const math::ScalarType<T>::type*>(x), \
...@@ -84,7 +84,7 @@ __global__ void _DropPathGrad( ...@@ -84,7 +84,7 @@ __global__ void _DropPathGrad(
_DropPathGrad<<<CUDA_BLOCKS(NxC), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ _DropPathGrad<<<CUDA_BLOCKS(NxC), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxC, \ NxC, \
C, \ C, \
convert::To<math::AccmulatorType<T>::type>(scale), \ convert::To<math::AccumulatorType<T>::type>(scale), \
mask, \ mask, \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \ reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<math::ScalarType<T>::type*>(dx)); \ reinterpret_cast<math::ScalarType<T>::type*>(dx)); \
......
...@@ -43,7 +43,7 @@ __global__ void _Dropout( ...@@ -43,7 +43,7 @@ __global__ void _Dropout(
math::Random(N, r, ctx); \ math::Random(N, r, ctx); \
_Dropout<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ _Dropout<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \ N, \
convert::To<math::AccmulatorType<T>::type>(scale), \ convert::To<math::AccumulatorType<T>::type>(scale), \
static_cast<uint32_t>(UINT_MAX * ratio), \ static_cast<uint32_t>(UINT_MAX * ratio), \
r, \ r, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \ reinterpret_cast<const math::ScalarType<T>::type*>(x), \
......
...@@ -64,7 +64,7 @@ __global__ void _ApproxGeluGrad(const int N, const T* dy, const T* x, T* dx) { ...@@ -64,7 +64,7 @@ __global__ void _ApproxGeluGrad(const int N, const T* dy, const T* x, T* dx) {
#define DEFINE_KERNEL_LAUNCHER(name, T) \ #define DEFINE_KERNEL_LAUNCHER(name, T) \
template <> \ template <> \
void name<T, CUDAContext>(const int N, const T* x, T* y, CUDAContext* ctx) { \ void name<T, CUDAContext>(const int N, const T* x, T* y, CUDAContext* ctx) { \
_##name<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \ _##name<math::ScalarType<T>::type, math::AccumulatorType<T>::type> \
<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ <<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \ N, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \ reinterpret_cast<const math::ScalarType<T>::type*>(x), \
...@@ -75,7 +75,7 @@ __global__ void _ApproxGeluGrad(const int N, const T* dy, const T* x, T* dx) { ...@@ -75,7 +75,7 @@ __global__ void _ApproxGeluGrad(const int N, const T* dy, const T* x, T* dx) {
template <> \ template <> \
void name<T, CUDAContext>( \ void name<T, CUDAContext>( \
const int N, const T* dy, const T* x, T* dx, CUDAContext* ctx) { \ const int N, const T* dy, const T* x, T* dx, CUDAContext* ctx) { \
_##name<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \ _##name<math::ScalarType<T>::type, math::AccumulatorType<T>::type> \
<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ <<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \ N, \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \ reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
......
...@@ -49,8 +49,8 @@ __global__ void _HardSigmoidGrad( ...@@ -49,8 +49,8 @@ __global__ void _HardSigmoidGrad(
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_HardSigmoid<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ _HardSigmoid<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \ N, \
convert::To<math::AccmulatorType<T>::type>(alpha), \ convert::To<math::AccumulatorType<T>::type>(alpha), \
convert::To<math::AccmulatorType<T>::type>(beta), \ convert::To<math::AccumulatorType<T>::type>(beta), \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \ reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \ reinterpret_cast<math::ScalarType<T>::type*>(y)); \
} }
...@@ -66,7 +66,7 @@ __global__ void _HardSigmoidGrad( ...@@ -66,7 +66,7 @@ __global__ void _HardSigmoidGrad(
CUDAContext* ctx) { \ CUDAContext* ctx) { \
_HardSigmoidGrad<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ _HardSigmoidGrad<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \ N, \
convert::To<math::AccmulatorType<T>::type>(alpha), \ convert::To<math::AccumulatorType<T>::type>(alpha), \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \ reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<const math::ScalarType<T>::type*>(y), \ reinterpret_cast<const math::ScalarType<T>::type*>(y), \
reinterpret_cast<math::ScalarType<T>::type*>(dx)); \ reinterpret_cast<math::ScalarType<T>::type*>(dx)); \
......
...@@ -36,27 +36,27 @@ __global__ void _HardSwishGrad(const int N, const T* dy, const T* x, T* dx) { ...@@ -36,27 +36,27 @@ __global__ void _HardSwishGrad(const int N, const T* dy, const T* x, T* dx) {
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \ #define DEFINE_KERNEL_LAUNCHER(T) \
template <> \ template <> \
void HardSwish<T, CUDAContext>( \ void HardSwish<T, CUDAContext>( \
const int N, const T* x, T* y, CUDAContext* ctx) { \ const int N, const T* x, T* y, CUDAContext* ctx) { \
_HardSwish<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \ _HardSwish<math::ScalarType<T>::type, math::AccumulatorType<T>::type> \
<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ <<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \ N, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \ reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \ reinterpret_cast<math::ScalarType<T>::type*>(y)); \
} }
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \ template <> \
void HardSwishGrad<T, CUDAContext>( \ void HardSwishGrad<T, CUDAContext>( \
const int N, const T* dy, const T* x, T* dx, CUDAContext* ctx) { \ const int N, const T* dy, const T* x, T* dx, CUDAContext* ctx) { \
_HardSwishGrad<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \ _HardSwishGrad<math::ScalarType<T>::type, math::AccumulatorType<T>::type> \
<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ <<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \ N, \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \ reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \ reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(dx)); \ reinterpret_cast<math::ScalarType<T>::type*>(dx)); \
} }
DEFINE_KERNEL_LAUNCHER(float16); DEFINE_KERNEL_LAUNCHER(float16);
......
...@@ -38,40 +38,40 @@ void _SeluGrad( ...@@ -38,40 +38,40 @@ void _SeluGrad(
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \ #define DEFINE_KERNEL_LAUNCHER(T) \
template <> \ template <> \
void Selu<T, CPUContext>( \ void Selu<T, CPUContext>( \
const int N, \ const int N, \
const float alpha, \ const float alpha, \
const float gamma, \ const float gamma, \
const T* x, \ const T* x, \
T* y, \ T* y, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_Selu( \ _Selu( \
N, \ N, \
convert::To<math::AccmulatorType<T>::type>(alpha), \ convert::To<math::AccumulatorType<T>::type>(alpha), \
convert::To<math::AccmulatorType<T>::type>(gamma), \ convert::To<math::AccumulatorType<T>::type>(gamma), \
x, \ x, \
y); \ y); \
} }
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \ template <> \
void SeluGrad<T, CPUContext>( \ void SeluGrad<T, CPUContext>( \
const int N, \ const int N, \
const float alpha, \ const float alpha, \
const float gamma, \ const float gamma, \
const T* dy, \ const T* dy, \
const T* y, \ const T* y, \
T* dx, \ T* dx, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_SeluGrad( \ _SeluGrad( \
N, \ N, \
convert::To<math::AccmulatorType<T>::type>(alpha), \ convert::To<math::AccumulatorType<T>::type>(alpha), \
convert::To<math::AccmulatorType<T>::type>(gamma), \ convert::To<math::AccumulatorType<T>::type>(gamma), \
dy, \ dy, \
y, \ y, \
dx); \ dx); \
} }
DEFINE_KERNEL_LAUNCHER(float16); DEFINE_KERNEL_LAUNCHER(float16);
......
...@@ -35,23 +35,23 @@ __global__ void _SiluGrad(const int N, const T* dy, const T* x, T* dx) { ...@@ -35,23 +35,23 @@ __global__ void _SiluGrad(const int N, const T* dy, const T* x, T* dx) {
#define DEFINE_KERNEL_LAUNCHER(T) \ #define DEFINE_KERNEL_LAUNCHER(T) \
template <> \ template <> \
void Silu<T, CUDAContext>(const int N, const T* x, T* y, CUDAContext* ctx) { \ void Silu<T, CUDAContext>(const int N, const T* x, T* y, CUDAContext* ctx) { \
_Silu<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \ _Silu<math::ScalarType<T>::type, math::AccumulatorType<T>::type> \
<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ <<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \ N, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \ reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \ reinterpret_cast<math::ScalarType<T>::type*>(y)); \
} }
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \ template <> \
void SiluGrad<T, CUDAContext>( \ void SiluGrad<T, CUDAContext>( \
const int N, const T* dy, const T* x, T* dx, CUDAContext* ctx) { \ const int N, const T* dy, const T* x, T* dx, CUDAContext* ctx) { \
_SiluGrad<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \ _SiluGrad<math::ScalarType<T>::type, math::AccumulatorType<T>::type> \
<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ <<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \ N, \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \ reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \ reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(dx)); \ reinterpret_cast<math::ScalarType<T>::type*>(dx)); \
} }
DEFINE_KERNEL_LAUNCHER(float16); DEFINE_KERNEL_LAUNCHER(float16);
......
...@@ -285,7 +285,7 @@ __global__ void _LogSoftmaxGradViaBlockReduce( ...@@ -285,7 +285,7 @@ __global__ void _LogSoftmaxGradViaBlockReduce(
const auto nblocks = math::utils::DivUp<int>(NxS, WARP_ITEMS); \ const auto nblocks = math::utils::DivUp<int>(NxS, WARP_ITEMS); \
_##name##ViaWarpReduce< \ _##name##ViaWarpReduce< \
math::ScalarType<T>::type, \ math::ScalarType<T>::type, \
math::AccmulatorType<T>::type> \ math::AccumulatorType<T>::type> \
<<<nblocks, \ <<<nblocks, \
dim3(CUDA_WARP_SIZE, WARP_ITEMS), \ dim3(CUDA_WARP_SIZE, WARP_ITEMS), \
0, \ 0, \
...@@ -299,7 +299,7 @@ __global__ void _LogSoftmaxGradViaBlockReduce( ...@@ -299,7 +299,7 @@ __global__ void _LogSoftmaxGradViaBlockReduce(
} \ } \
_##name##ViaBlockReduce< \ _##name##ViaBlockReduce< \
math::ScalarType<T>::type, \ math::ScalarType<T>::type, \
math::AccmulatorType<T>::type> \ math::AccumulatorType<T>::type> \
<<<NxS, BLOCK_THREADS, 0, ctx->cuda_stream()>>>( \ <<<NxS, BLOCK_THREADS, 0, ctx->cuda_stream()>>>( \
NxS, \ NxS, \
S, \ S, \
...@@ -323,7 +323,7 @@ __global__ void _LogSoftmaxGradViaBlockReduce( ...@@ -323,7 +323,7 @@ __global__ void _LogSoftmaxGradViaBlockReduce(
const auto nblocks = math::utils::DivUp<int>(NxS, WARP_ITEMS); \ const auto nblocks = math::utils::DivUp<int>(NxS, WARP_ITEMS); \
_##name##ViaWarpReduce< \ _##name##ViaWarpReduce< \
math::ScalarType<T>::type, \ math::ScalarType<T>::type, \
math::AccmulatorType<T>::type> \ math::AccumulatorType<T>::type> \
<<<nblocks, \ <<<nblocks, \
dim3(CUDA_WARP_SIZE, WARP_ITEMS), \ dim3(CUDA_WARP_SIZE, WARP_ITEMS), \
0, \ 0, \
...@@ -338,7 +338,7 @@ __global__ void _LogSoftmaxGradViaBlockReduce( ...@@ -338,7 +338,7 @@ __global__ void _LogSoftmaxGradViaBlockReduce(
} \ } \
_##name##ViaBlockReduce< \ _##name##ViaBlockReduce< \
math::ScalarType<T>::type, \ math::ScalarType<T>::type, \
math::AccmulatorType<T>::type> \ math::AccumulatorType<T>::type> \
<<<NxS, BLOCK_THREADS, 0, ctx->cuda_stream()>>>( \ <<<NxS, BLOCK_THREADS, 0, ctx->cuda_stream()>>>( \
NxS, \ NxS, \
S, \ S, \
......
...@@ -69,26 +69,26 @@ __global__ void _RepeatGrad( ...@@ -69,26 +69,26 @@ __global__ void _RepeatGrad(
NxCxS2, C, S, S2, x, y); \ NxCxS2, C, S, S2, x, y); \
} }
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \ template <> \
void RepeatGrad<T, CUDAContext>( \ void RepeatGrad<T, CUDAContext>( \
const int N, \ const int N, \
const int S, \ const int S, \
const int C, \ const int C, \
const int repeats, \ const int repeats, \
const T* dy, \ const T* dy, \
T* dx, \ T* dx, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
const auto S2 = S * repeats; \ const auto S2 = S * repeats; \
const auto NxCxS = N * C * S; \ const auto NxCxS = N * C * S; \
_RepeatGrad<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \ _RepeatGrad<math::ScalarType<T>::type, math::AccumulatorType<T>::type> \
<<<CUDA_BLOCKS(NxCxS), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ <<<CUDA_BLOCKS(NxCxS), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxCxS, \ NxCxS, \
C, \ C, \
S, \ S, \
S2, \ S2, \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \ reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<math::ScalarType<T>::type*>(dx)); \ reinterpret_cast<math::ScalarType<T>::type*>(dx)); \
} }
DEFINE_KERNEL_LAUNCHER(bool); DEFINE_KERNEL_LAUNCHER(bool);
......
...@@ -77,7 +77,7 @@ __global__ void _BroadcastLossGrad( ...@@ -77,7 +77,7 @@ __global__ void _BroadcastLossGrad(
T* dx, \ T* dx, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
using ScalarT = math::ScalarType<T>::type; \ using ScalarT = math::ScalarType<T>::type; \
using AccT = math::AccmulatorType<T>::type; \ using AccT = math::AccumulatorType<T>::type; \
if (num_masks > 0 && normalizer < 0.f) { \ if (num_masks > 0 && normalizer < 0.f) { \
auto* num_valid = const_cast<T*>(mask + num_masks); \ auto* num_valid = const_cast<T*>(mask + num_masks); \
math::Sum(num_masks, 1.f, mask, num_valid, ctx); \ math::Sum(num_masks, 1.f, mask, num_valid, ctx); \
...@@ -105,7 +105,7 @@ __global__ void _BroadcastLossGrad( ...@@ -105,7 +105,7 @@ __global__ void _BroadcastLossGrad(
T* dx, \ T* dx, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
using ScalarT = math::ScalarType<T>::type; \ using ScalarT = math::ScalarType<T>::type; \
using AccT = math::AccmulatorType<T>::type; \ using AccT = math::AccumulatorType<T>::type; \
const auto CxS = C * S; \ const auto CxS = C * S; \
const auto NxCxS = N * CxS; \ const auto NxCxS = N * CxS; \
_BroadcastLossGrad<ScalarT, AccT> \ _BroadcastLossGrad<ScalarT, AccT> \
......
...@@ -42,7 +42,7 @@ __global__ void _SmoothL1Grad(const int N, const AccT beta, const T* x, T* y) { ...@@ -42,7 +42,7 @@ __global__ void _SmoothL1Grad(const int N, const AccT beta, const T* x, T* y) {
const int N, const float beta, const T* x, T* y, CUDAContext* ctx) { \ const int N, const float beta, const T* x, T* y, CUDAContext* ctx) { \
_##name<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ _##name<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \ N, \
convert::To<math::AccmulatorType<T>::type>(beta), \ convert::To<math::AccumulatorType<T>::type>(beta), \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \ reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \ reinterpret_cast<math::ScalarType<T>::type*>(y)); \
} }
......
...@@ -70,12 +70,12 @@ void _ReduceSumGradImpl( ...@@ -70,12 +70,12 @@ void _ReduceSumGradImpl(
DISPATCH_FUNC_BY_VALUE_WITH_TYPE_2( \ DISPATCH_FUNC_BY_VALUE_WITH_TYPE_2( \
_ReduceSumGradImpl, \ _ReduceSumGradImpl, \
math::ScalarType<T>::type, \ math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \ math::AccumulatorType<T>::type, \
num_dims, \ num_dims, \
x_dims, \ x_dims, \
y_dims, \ y_dims, \
y_strides, \ y_strides, \
convert::To<math::AccmulatorType<T>::type>(scale), \ convert::To<math::AccumulatorType<T>::type>(scale), \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \ reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<math::ScalarType<T>::type*>(dx), \ reinterpret_cast<math::ScalarType<T>::type*>(dx), \
ctx); \ ctx); \
......
...@@ -440,7 +440,7 @@ void _AvgPool3dGradNHWC( ...@@ -440,7 +440,7 @@ void _AvgPool3dGradNHWC(
DISPATCH_POOL_KERNEL( \ DISPATCH_POOL_KERNEL( \
_##name, \ _##name, \
math::ScalarType<T>::type, \ math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \ math::AccumulatorType<T>::type, \
N, \ N, \
C, \ C, \
H, \ H, \
...@@ -492,7 +492,7 @@ DEFINE_KERNEL_LAUNCHER(AvgPool2dGrad, double); // AvgPool2dGrad ...@@ -492,7 +492,7 @@ DEFINE_KERNEL_LAUNCHER(AvgPool2dGrad, double); // AvgPool2dGrad
DISPATCH_POOL_KERNEL( \ DISPATCH_POOL_KERNEL( \
_##name, \ _##name, \
math::ScalarType<T>::type, \ math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \ math::AccumulatorType<T>::type, \
N, \ N, \
C, \ C, \
D, \ D, \
......
...@@ -457,7 +457,7 @@ __global__ void _AvgPool3dGradNHWC( ...@@ -457,7 +457,7 @@ __global__ void _AvgPool3dGradNHWC(
DISPATCH_POOL_KERNEL( \ DISPATCH_POOL_KERNEL( \
_##name, \ _##name, \
math::ScalarType<T>::type, \ math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \ math::AccumulatorType<T>::type, \
CUDA_BLOCKS(nthreads), \ CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \ CUDA_THREADS, \
nthreads, \ nthreads, \
...@@ -512,7 +512,7 @@ DEFINE_KERNEL_LAUNCHER(AvgPool2dGrad, double, (H * W)); // AvgPool2dGrad ...@@ -512,7 +512,7 @@ DEFINE_KERNEL_LAUNCHER(AvgPool2dGrad, double, (H * W)); // AvgPool2dGrad
DISPATCH_POOL_KERNEL( \ DISPATCH_POOL_KERNEL( \
_##name, \ _##name, \
math::ScalarType<T>::type, \ math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \ math::AccumulatorType<T>::type, \
CUDA_BLOCKS(nthreads), \ CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \ CUDA_THREADS, \
nthreads, \ nthreads, \
......
...@@ -37,35 +37,35 @@ __global__ void _BiasAdd( ...@@ -37,35 +37,35 @@ __global__ void _BiasAdd(
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \ #define DEFINE_KERNEL_LAUNCHER(T) \
template <> \ template <> \
void BiasAdd<T, CUDAContext>( \ void BiasAdd<T, CUDAContext>( \
const int N, \ const int N, \
const int S, \ const int S, \
const int C, \ const int C, \
const T* x, \ const T* x, \
const T* bias, \ const T* bias, \
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
const auto NxCxS = N * C * S; \ const auto NxCxS = N * C * S; \
if (S == 1) { \ if (S == 1) { \
_BiasAdd<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \ _BiasAdd<math::ScalarType<T>::type, math::AccumulatorType<T>::type> \
<<<CUDA_BLOCKS(NxCxS), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ <<<CUDA_BLOCKS(NxCxS), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxCxS, \ NxCxS, \
C, \ C, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \ reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<const math::ScalarType<T>::type*>(bias), \ reinterpret_cast<const math::ScalarType<T>::type*>(bias), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \ reinterpret_cast<math::ScalarType<T>::type*>(y)); \
} else { \ } else { \
_BiasAdd<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \ _BiasAdd<math::ScalarType<T>::type, math::AccumulatorType<T>::type> \
<<<CUDA_BLOCKS(NxCxS), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ <<<CUDA_BLOCKS(NxCxS), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxCxS, \ NxCxS, \
S, \ S, \
C, \ C, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \ reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<const math::ScalarType<T>::type*>(bias), \ reinterpret_cast<const math::ScalarType<T>::type*>(bias), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \ reinterpret_cast<math::ScalarType<T>::type*>(y)); \
} \ } \
} }
DEFINE_KERNEL_LAUNCHER(uint8_t); DEFINE_KERNEL_LAUNCHER(uint8_t);
......
...@@ -393,7 +393,7 @@ void _MaxPool3dGradNHWC( ...@@ -393,7 +393,7 @@ void _MaxPool3dGradNHWC(
DISPATCH_POOL_KERNEL( \ DISPATCH_POOL_KERNEL( \
_##name, \ _##name, \
math::ScalarType<T>::type, \ math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \ math::AccumulatorType<T>::type, \
N, \ N, \
C, \ C, \
H, \ H, \
...@@ -447,7 +447,7 @@ DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, double); // MaxPool2dGrad ...@@ -447,7 +447,7 @@ DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, double); // MaxPool2dGrad
DISPATCH_POOL_KERNEL( \ DISPATCH_POOL_KERNEL( \
_##name, \ _##name, \
math::ScalarType<T>::type, \ math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \ math::AccumulatorType<T>::type, \
N, \ N, \
C, \ C, \
D, \ D, \
......
...@@ -472,7 +472,7 @@ __global__ void _MaxPool3dGradNHWC( ...@@ -472,7 +472,7 @@ __global__ void _MaxPool3dGradNHWC(
DISPATCH_POOL_KERNEL( \ DISPATCH_POOL_KERNEL( \
_##name, \ _##name, \
math::ScalarType<T>::type, \ math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \ math::AccumulatorType<T>::type, \
CUDA_BLOCKS(nthreads), \ CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \ CUDA_THREADS, \
nthreads, \ nthreads, \
...@@ -529,7 +529,7 @@ DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, double, (H * W)); // MaxPool2dGrad ...@@ -529,7 +529,7 @@ DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, double, (H * W)); // MaxPool2dGrad
DISPATCH_POOL_KERNEL( \ DISPATCH_POOL_KERNEL( \
_##name, \ _##name, \
math::ScalarType<T>::type, \ math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \ math::AccumulatorType<T>::type, \
CUDA_BLOCKS(nthreads), \ CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \ CUDA_THREADS, \
nthreads, \ nthreads, \
......
...@@ -49,7 +49,7 @@ __global__ void _ResizeLinear2dNCHW( ...@@ -49,7 +49,7 @@ __global__ void _ResizeLinear2dNCHW(
const int w_out = yi % out_w; const int w_out = yi % out_w;
const int h_out = (yi / out_w) % out_h; const int h_out = (yi / out_w) % out_h;
const int c = (yi / out_w / out_h) % C; const int c = (yi / out_w / out_h) % C;
const int n = yi / out_w / out_w / C; const int n = yi / out_w / out_h / C;
const float h = TransformCoordinate(h_out, scale_h, align_corners); const float h = TransformCoordinate(h_out, scale_h, align_corners);
const float w = TransformCoordinate(w_out, scale_w, align_corners); const float w = TransformCoordinate(w_out, scale_w, align_corners);
...@@ -129,7 +129,7 @@ __global__ void _ResizeLinear2dGradNCHW( ...@@ -129,7 +129,7 @@ __global__ void _ResizeLinear2dGradNCHW(
const int w_out = yi % out_w; const int w_out = yi % out_w;
const int h_out = (yi / out_w) % out_h; const int h_out = (yi / out_w) % out_h;
const int c = (yi / out_w / out_h) % C; const int c = (yi / out_w / out_h) % C;
const int n = yi / out_w / out_w / C; const int n = yi / out_w / out_h / C;
const float h = TransformCoordinate(h_out, scale_h, align_corners); const float h = TransformCoordinate(h_out, scale_h, align_corners);
const float w = TransformCoordinate(w_out, scale_w, align_corners); const float w = TransformCoordinate(w_out, scale_w, align_corners);
......
...@@ -54,23 +54,22 @@ class NumpyWrapper { ...@@ -54,23 +54,22 @@ class NumpyWrapper {
} }
return py::reinterpret_steal<py::object>(array); return py::reinterpret_steal<py::object>(array);
} }
auto* array = PyArray_SimpleNewFromData( return py::reinterpret_steal<py::object>(PyArray_SimpleNewFromData(
dims.size(), dims.size(),
dims.data(), dims.data(),
dtypes::to_npy(meta), dtypes::to_npy(meta),
const_cast<void*>(tensor_->raw_data<CPUContext>())); const_cast<void*>(tensor_->raw_data<CPUContext>())));
return py::reinterpret_steal<py::object>(array);
} }
Tensor* From(py::object obj, bool copy) { Tensor* From(py::object obj, bool copy) {
auto* array = auto* array = PyArray_GETCONTIGUOUS((PyArrayObject*)obj.ptr());
PyArray_GETCONTIGUOUS(reinterpret_cast<PyArrayObject*>(obj.ptr()));
const auto& meta = dtypes::from_npy(PyArray_TYPE(array)); const auto& meta = dtypes::from_npy(PyArray_TYPE(array));
CHECK(meta.id() != 0) << "\nUnsupported numpy array type."; CHECK(meta.id() != 0) << "\nUnsupported numpy array type.";
auto* npy_dims = PyArray_DIMS(array); auto* npy_dims = PyArray_DIMS(array);
auto* data = static_cast<void*>(PyArray_DATA(array)); auto* data = static_cast<void*>(PyArray_DATA(array));
vector<int64_t> dims(npy_dims, npy_dims + PyArray_NDIM(array)); vector<int64_t> dims(npy_dims, npy_dims + PyArray_NDIM(array));
auto* memory = tensor_->set_meta(meta)->Reshape(dims)->memory(); tensor_->set_meta(meta)->Reshape(dims);
auto* memory = tensor_->MapFrom(nullptr)->memory();
if (copy) { if (copy) {
auto device_type = memory ? memory->info()["device_type"] : "cpu"; auto device_type = memory ? memory->info()["device_type"] : "cpu";
if (device_type == "cuda") { if (device_type == "cuda") {
......
...@@ -5,64 +5,33 @@ namespace dragon { ...@@ -5,64 +5,33 @@ namespace dragon {
template <class Context> template <class Context>
template <typename T> template <typename T>
pair<float, float> ClipOp<Context>::ComputeBoundsWithType() {
auto meta = TypeMeta::Make<T>();
if (meta.template Match<int8_t>()) {
return std::make_pair(std::max(low_, -128.f), std::min(high_, 127.f));
} else if (meta.template Match<uint8_t>()) {
return std::make_pair(std::max(low_, 0.f), std::min(high_, 255.f));
} else if (meta.template Match<int>()) {
return std::make_pair(
std::max(low_, -214748e4f), std::min(high_, 214748e4f));
} else if (meta.template Match<int64_t>()) {
return std::make_pair(
std::max(low_, -922337e13f), std::min(high_, 922337e13f));
} else if (meta.template Match<float16>()) {
return std::make_pair(std::max(low_, -65505.f), std::min(high_, 65504.f));
} else {
return std::make_pair(std::max(low_, -FLT_MAX), std::min(high_, FLT_MAX));
}
}
template <class Context>
template <typename T>
void ClipOp<Context>::DoRunWithType() { void ClipOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0); auto &X = Input(0), *Y = Output(0);
auto bounds = ComputeBoundsWithType<T>(); auto limits = this->template GetLimits<T>();
kernels::Clip( kernels::Clip(
X.count(), X.count(),
bounds.first, limits.first,
bounds.second, limits.second,
X.template data<T, Context>(), X.template data<T, Context>(),
Y->ReshapeLike(X)->template mutable_data<T, Context>(), Y->ReshapeLike(X)->template mutable_data<T, Context>(),
ctx()); ctx());
} }
template <class Context> template <class Context>
void ClipOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Numerical>::Call(this, Input(0));
}
template <class Context>
template <typename T> template <typename T>
void ClipGradientOp<Context>::DoRunWithType() { void ClipGradientOp<Context>::DoRunWithType() {
auto &X = Input(0), &dY = Input(1), *dX = Output(0); auto &X = Input(0), &dY = Input(1), *dX = Output(0);
auto bounds = this->template ComputeBoundsWithType<T>(); auto limits = this->template GetLimits<T>();
kernels::ClipGrad( kernels::ClipGrad(
X.count(), X.count(),
bounds.first, limits.first,
bounds.second, limits.second,
dY.template data<T, Context>(), dY.template data<T, Context>(),
X.template data<T, Context>(), X.template data<T, Context>(),
dX->ReshapeLike(X)->template mutable_data<T, Context>(), dX->ReshapeLike(X)->template mutable_data<T, Context>(),
ctx()); ctx());
} }
template <class Context>
void ClipGradientOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
DEPLOY_CPU_OPERATOR(Clip); DEPLOY_CPU_OPERATOR(Clip);
#ifdef USE_CUDA #ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(Clip); DEPLOY_CUDA_OPERATOR(Clip);
...@@ -82,7 +51,7 @@ OPERATOR_SCHEMA(Clip) ...@@ -82,7 +51,7 @@ OPERATOR_SCHEMA(Clip)
OPERATOR_SCHEMA(ClipGradient) OPERATOR_SCHEMA(ClipGradient)
/* X, dY */ /* X, dY */
.NumInputs(2) .NumInputs(2)
/* X, dX */ /* dX */
.NumOutputs(1); .NumOutputs(1);
REGISTER_GRADIENT(Clip, GenericGradientMaker); REGISTER_GRADIENT(Clip, GenericGradientMaker);
......
...@@ -26,10 +26,31 @@ class ClipOp : public Operator<Context> { ...@@ -26,10 +26,31 @@ class ClipOp : public Operator<Context> {
high_(OP_SINGLE_ARG(float, "high", FLT_MAX)) {} high_(OP_SINGLE_ARG(float, "high", FLT_MAX)) {}
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override; void RunOnDevice() override {
DispatchHelper<dtypes::Numerical>::Call(this, Input(0));
}
template <typename T> template <typename T>
pair<float, float> ComputeBoundsWithType(); std::pair<float, float> GetLimits() {
float type_min, type_max;
const auto meta = TypeMeta::Make<T>();
if (meta.template Match<uint8_t>()) {
type_min = float(std::numeric_limits<uint8_t>::min());
type_max = float(std::numeric_limits<uint8_t>::max());
} else if (meta.template Match<int8_t>()) {
type_min = float(std::numeric_limits<int8_t>::min());
type_max = float(std::numeric_limits<int8_t>::max());
} else if (meta.template Match<int>()) {
type_min = float(std::numeric_limits<int>::min());
type_max = float(std::numeric_limits<int>::max());
} else if (meta.template Match<float16>()) {
type_min = -65505.f, type_max = 65504.f;
} else {
type_min = std::numeric_limits<float>::min();
type_max = std::numeric_limits<float>::max();
}
return std::make_pair(std::max(low_, type_min), std::min(high_, type_max));
}
template <typename T> template <typename T>
void DoRunWithType(); void DoRunWithType();
...@@ -45,7 +66,9 @@ class ClipGradientOp final : public ClipOp<Context> { ...@@ -45,7 +66,9 @@ class ClipGradientOp final : public ClipOp<Context> {
: ClipOp<Context>(def, ws) {} : ClipOp<Context>(def, ws) {}
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override; void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T> template <typename T>
void DoRunWithType(); void DoRunWithType();
......
...@@ -7,7 +7,7 @@ namespace dragon { ...@@ -7,7 +7,7 @@ namespace dragon {
template <class Context> template <class Context>
template <typename T> template <typename T>
void MomentsOp<Context>::DoRunWithType() { void MomentsOp<Context>::DoRunWithType() {
using OutputT = typename math::AccmulatorType<T>::type; using OutputT = typename math::AccumulatorType<T>::type;
auto &X = Input(0), *Y1 = Output(0), *Y2 = Output(1); auto &X = Input(0), *Y1 = Output(0), *Y2 = Output(1);
// Compute reduce axes. // Compute reduce axes.
......
...@@ -8,7 +8,7 @@ namespace dragon { ...@@ -8,7 +8,7 @@ namespace dragon {
template <class Context> template <class Context>
template <typename T> template <typename T>
void BatchNormOp<Context>::RunTraining() { void BatchNormOp<Context>::RunTraining() {
using ParamT = typename math::AccmulatorType<T>::type; using ParamT = typename math::AccumulatorType<T>::type;
INITIALIZE_TENSOR_VIA_SPEC(Input(1), vec64_t({C_}), ParamT); INITIALIZE_TENSOR_VIA_SPEC(Input(1), vec64_t({C_}), ParamT);
INITIALIZE_TENSOR_VIA_SPEC(Input(2), vec64_t({C_}), ParamT); INITIALIZE_TENSOR_VIA_SPEC(Input(2), vec64_t({C_}), ParamT);
INITIALIZE_TENSOR_VIA_SPEC(Input(3), vec64_t({C_}), ParamT); INITIALIZE_TENSOR_VIA_SPEC(Input(3), vec64_t({C_}), ParamT);
...@@ -99,7 +99,7 @@ void BatchNormOp<Context>::RunTraining() { ...@@ -99,7 +99,7 @@ void BatchNormOp<Context>::RunTraining() {
template <class Context> template <class Context>
template <typename T> template <typename T>
void BatchNormOp<Context>::RunInference() { void BatchNormOp<Context>::RunInference() {
using ParamT = typename math::AccmulatorType<T>::type; using ParamT = typename math::AccumulatorType<T>::type;
INITIALIZE_TENSOR_VIA_SPEC(Input(1), vec64_t({C_}), ParamT); INITIALIZE_TENSOR_VIA_SPEC(Input(1), vec64_t({C_}), ParamT);
INITIALIZE_TENSOR_VIA_SPEC(Input(2), vec64_t({C_}), ParamT); INITIALIZE_TENSOR_VIA_SPEC(Input(2), vec64_t({C_}), ParamT);
INITIALIZE_TENSOR_VIA_SPEC(Input(3), vec64_t({C_}), ParamT); INITIALIZE_TENSOR_VIA_SPEC(Input(3), vec64_t({C_}), ParamT);
...@@ -132,7 +132,7 @@ void BatchNormOp<Context>::RunInference() { ...@@ -132,7 +132,7 @@ void BatchNormOp<Context>::RunInference() {
template <class Context> template <class Context>
template <typename T> template <typename T>
void BatchNormGradientOp<Context>::RunTraining() { void BatchNormGradientOp<Context>::RunTraining() {
using ParamT = typename math::AccmulatorType<T>::type; using ParamT = typename math::AccumulatorType<T>::type;
auto *dX = Output(0), *dW = Output(1), *dB = Output(2); auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
auto &X_mu = Input("X_mu"), &X_rsig = Input("X_rsig"); auto &X_mu = Input("X_mu"), &X_rsig = Input("X_rsig");
auto* X_params = Output("X_params")->Reshape({C_ * 2}); auto* X_params = Output("X_params")->Reshape({C_ * 2});
...@@ -213,7 +213,7 @@ void BatchNormGradientOp<Context>::RunTraining() { ...@@ -213,7 +213,7 @@ void BatchNormGradientOp<Context>::RunTraining() {
template <class Context> template <class Context>
template <typename T> template <typename T>
void BatchNormGradientOp<Context>::RunInference() { void BatchNormGradientOp<Context>::RunInference() {
using ParamT = typename math::AccmulatorType<T>::type; using ParamT = typename math::AccumulatorType<T>::type;
auto *dX = Output(0), *dW = Output(1), *dB = Output(2); auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
auto* X_params = Output("X_params")->Reshape({C_}); auto* X_params = Output("X_params")->Reshape({C_});
......
...@@ -8,7 +8,7 @@ namespace dragon { ...@@ -8,7 +8,7 @@ namespace dragon {
template <class Context> template <class Context>
template <typename T> template <typename T>
void GroupNormOp<Context>::DoRunWithType() { void GroupNormOp<Context>::DoRunWithType() {
using ParamT = typename math::AccmulatorType<T>::type; using ParamT = typename math::AccumulatorType<T>::type;
auto &X = Input(0), *Y = Output(0); auto &X = Input(0), *Y = Output(0);
auto &W = Input(1), &B = Input(2); auto &W = Input(1), &B = Input(2);
GetBaseArguments(); GetBaseArguments();
...@@ -55,7 +55,7 @@ void GroupNormOp<Context>::DoRunWithType() { ...@@ -55,7 +55,7 @@ void GroupNormOp<Context>::DoRunWithType() {
template <class Context> template <class Context>
template <typename T> template <typename T>
void GroupNormGradientOp<Context>::DoRunWithType() { void GroupNormGradientOp<Context>::DoRunWithType() {
using ParamT = typename math::AccmulatorType<T>::type; using ParamT = typename math::AccumulatorType<T>::type;
auto &X = Input(0), &W = Input(1), &dY = Input(2); auto &X = Input(0), &W = Input(1), &dY = Input(2);
auto &X_mu = Input("X_mu"), &X_rsig = Input("X_rsig"); auto &X_mu = Input("X_mu"), &X_rsig = Input("X_rsig");
auto *dX = Output(0), *dW = Output(1), *dB = Output(2); auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
......
...@@ -7,7 +7,7 @@ namespace dragon { ...@@ -7,7 +7,7 @@ namespace dragon {
template <class Context> template <class Context>
template <typename T> template <typename T>
void LayerNormOp<Context>::DoRunWithType() { void LayerNormOp<Context>::DoRunWithType() {
using ParamT = typename math::AccmulatorType<T>::type; using ParamT = typename math::AccumulatorType<T>::type;
auto &X = Input(0), *Y = Output(0); auto &X = Input(0), *Y = Output(0);
auto &W = Input(1), &B = Input(2); auto &W = Input(1), &B = Input(2);
GET_OP_AXIS_ARG(axis, X.ndim(), -1); GET_OP_AXIS_ARG(axis, X.ndim(), -1);
......
...@@ -16,6 +16,7 @@ from __future__ import print_function as _print_function ...@@ -16,6 +16,7 @@ from __future__ import print_function as _print_function
from dragon.core.distributed.backend import is_initialized from dragon.core.distributed.backend import is_initialized
from dragon.core.distributed.backend import is_mpi_available from dragon.core.distributed.backend import is_mpi_available
from dragon.core.distributed.backend import is_nccl_available from dragon.core.distributed.backend import is_nccl_available
from dragon.core.distributed.backend import finalize
from dragon.core.distributed.backend import get_backend from dragon.core.distributed.backend import get_backend
from dragon.core.distributed.backend import get_group from dragon.core.distributed.backend import get_group
from dragon.core.distributed.backend import get_rank from dragon.core.distributed.backend import get_rank
......
...@@ -213,11 +213,6 @@ class OpLib(object): ...@@ -213,11 +213,6 @@ class OpLib(object):
for output in outputs: for output in outputs:
output._requires_grad = False output._requires_grad = False
# Ensure the named operator for the tracing graph.
if hasattr(graph_tape, '_tracing') and not op_name:
op_def.name = op_name = execute_ws.create_handle(op_def.type)
graph_tape.add_handle(op_name)
# Emit to dispatch this execution. # Emit to dispatch this execution.
for feed_key, value_type in run_config['feed_dict'].items(): for feed_key, value_type in run_config['feed_dict'].items():
dest = execute_ws.create_tensor(op_name + '/' + feed_key) dest = execute_ws.create_tensor(op_name + '/' + feed_key)
......
...@@ -14,6 +14,7 @@ from __future__ import division ...@@ -14,6 +14,7 @@ from __future__ import division
from __future__ import print_function from __future__ import print_function
from dragon.core.distributed.backend import is_initialized from dragon.core.distributed.backend import is_initialized
from dragon.core.distributed.backend import finalize
from dragon.core.distributed.backend import get_backend from dragon.core.distributed.backend import get_backend
from dragon.core.distributed.backend import get_group from dragon.core.distributed.backend import get_group
from dragon.core.distributed.backend import get_rank from dragon.core.distributed.backend import get_rank
......
...@@ -169,6 +169,12 @@ def is_nccl_available(): ...@@ -169,6 +169,12 @@ def is_nccl_available():
return _b.ncclIsAvailable() return _b.ncclIsAvailable()
def finalize():
"""Finalize the distributed environment."""
global _GLOBAL_MPI_CONTEXT
_GLOBAL_MPI_CONTEXT = None
def get_backend(group): def get_backend(group):
"""Return the backend of given process group. """Return the backend of given process group.
......
...@@ -58,12 +58,16 @@ class DataReader(multiprocessing.Process): ...@@ -58,12 +58,16 @@ class DataReader(multiprocessing.Process):
""" """
class PartBoundaries(object): class BufferBound(object):
"""Record the boundary of current part.""" """Record the boundary of current buffer."""
def __init__(self, start, end): def __init__(self, start, end):
self.start, self.end = start, end self.start, self.end = start, end
@property
def is_depleted(self):
return self.start == self.end
def __init__(self, **kwargs): def __init__(self, **kwargs):
"""Create a ``DataReader``. """Create a ``DataReader``.
...@@ -91,13 +95,14 @@ class DataReader(multiprocessing.Process): ...@@ -91,13 +95,14 @@ class DataReader(multiprocessing.Process):
self._part_idx = kwargs.get('part_idx', 0) self._part_idx = kwargs.get('part_idx', 0)
self._num_parts = kwargs.get('num_parts', 1) self._num_parts = kwargs.get('num_parts', 1)
self._shuffle = kwargs.get('shuffle', False) self._shuffle = kwargs.get('shuffle', False)
self._initial_fill = kwargs.get('initial_fill', 1024) if self._shuffle else 1 self._initial_fill = kwargs.get('initial_fill', 1024)
self._seed = kwargs.get('seed', config.config().random_seed) self._seed = kwargs.get('seed', config.config().random_seed)
self._stick_to_part = kwargs.get('stick_to_part', True)
self._first, self._cursor, self._last = 0, 0, 0 self._first, self._cursor, self._last = 0, 0, 0
self._part_size = 0 self._part_size = 0
self._num_examples = 0 self._num_examples = 0
self._example_buffer = [] self._buffer_seq = []
self._parts = [] self._buffer_bounds = []
self._reader_queue = None self._reader_queue = None
def before_first(self): def before_first(self):
...@@ -110,43 +115,45 @@ class DataReader(multiprocessing.Process): ...@@ -110,43 +115,45 @@ class DataReader(multiprocessing.Process):
self._cursor += 1 self._cursor += 1
return self._dataset.get() return self._dataset.get()
def reset(self, stick_to_part=False): def reset(self):
"""Reset the environment of dataset.""" """Reset the environment of dataset."""
# Redirect to the adjacent part if available. # Redirect to the adjacent part if available.
if not stick_to_part: if not self._stick_to_part:
self._part_idx = (self._part_idx + 1) % self._num_parts self._part_idx = (self._part_idx + 1) % self._num_parts
self._first = self._part_idx * self._part_size self._first = self._part_idx * self._part_size
self._last = min(self._first + self._part_size, self._num_examples) self._last = min(self._first + self._part_size, self._num_examples)
self.before_first() self.before_first()
# Use the new boundaries to avoid sampling duplicates # Use new boundary to avoid sampling duplicates
# when buffer size is greater than dataset size. # when buffer size is greater than dataset size.
counter = self._parts[-1].end counter = self._buffer_bounds[-1].end
self._parts.append(DataReader.PartBoundaries(counter, counter)) self._buffer_bounds.append(self.BufferBound(counter, counter))
def run(self): def run(self):
"""Start the process.""" """Start the process."""
self._init_dataset() self._init_dataset()
# Persist a loop to read examples. # Persist a loop to read examples.
while True: while True:
# Pop the depleted part if necessary. # Pop the depleted buffer if necessary.
if self._parts[0].start == self._parts[0].end: if self._buffer_bounds[0].is_depleted:
self._parts.pop(0) self._buffer_bounds.pop(0)
offset = 0 pop_bound = self._buffer_bounds[0]
push_bound = self._buffer_bounds[-1]
pop_offset = 0
if self._shuffle: if self._shuffle:
# Sample a random offset if shuffle required. # Sample a random offset.
offset = self._parts[0].end - self._parts[0].start pop_range = pop_bound.end - pop_bound.start
offset = int(numpy.random.uniform(high=offset)) pop_offset = numpy.random.randint(0, pop_range)
# Choose a loaded example from the buffer. # Pop an example from the buffer.
i = self._parts[0].start % len(self._example_buffer) i = pop_bound.start % len(self._buffer_seq)
j = (self._parts[0].start + offset) % len(self._example_buffer) j = (pop_bound.start + pop_offset) % len(self._buffer_seq)
self._reader_queue.put(self._example_buffer[j]) self._reader_queue.put(self._buffer_seq[j])
self._example_buffer[j] = self._example_buffer[i] self._buffer_seq[j] = self._buffer_seq[i]
# Load and push back a new example into the buffer. # Push an example into the buffer.
k = self._parts[-1].end % len(self._example_buffer) k = push_bound.end % len(self._buffer_seq)
self._example_buffer[k] = self.next_example() self._buffer_seq[k] = self.next_example()
# Increase the part boundaries. # Increase the buffer boundary.
self._parts[-1].end += 1 push_bound.end += 1
self._parts[0].start += 1 pop_bound.start += 1
# Reset the cursor if necessary. # Reset the cursor if necessary.
if self._cursor >= self._last: if self._cursor >= self._last:
self.reset() self.reset()
...@@ -162,12 +169,12 @@ class DataReader(multiprocessing.Process): ...@@ -162,12 +169,12 @@ class DataReader(multiprocessing.Process):
# Determine the part specification. # Determine the part specification.
self._num_examples = self._dataset.size self._num_examples = self._dataset.size
self._part_size = (self._num_examples + self._num_parts - 1) // self._num_parts self._part_size = (self._num_examples + self._num_parts - 1) // self._num_parts
self._parts.append(DataReader.PartBoundaries(0, 0))
# Fill the initial buffer to support random sampling. # Fill the initial buffer to support random sampling.
self.reset(stick_to_part=True) self._buffer_bounds.append(self.BufferBound(0, 0))
for i in range(self._initial_fill): self.reset()
self._example_buffer.append(self.next_example()) for _ in range(self._initial_fill):
self._parts[-1].end += 1 self._buffer_bounds[-1].end += 1
self._buffer_seq.append(self.next_example())
if self._cursor >= self._last: if self._cursor >= self._last:
self.reset() self.reset()
...@@ -117,20 +117,18 @@ def export( ...@@ -117,20 +117,18 @@ def export(
if input_names is not None: if input_names is not None:
raise ValueError( raise ValueError(
'Excepted the input names from <args>.\n' 'Excepted the input names from <args>.\n'
'You should set the <input_names> to None.' 'You should set the <input_names> to None.')
)
inputs, input_names, args = \ inputs, input_names, args = \
list(args.values()), list(args.keys()), [args] list(args.values()), list(args.keys()), [args]
else: else:
inputs = args = nest.flatten(args) inputs = args = nest.flatten(args)
# Run the model to get the outputs. # Run the model to get the outputs.
execute_ws = workspace.Workspace() graph_tape = tapes.Tape()
execute_ws.merge_from(workspace.get_workspace()) graph_tape._tracing = True # Enable tracing.
with execute_ws.as_default(): graph_tape._exporting = True # Enable exporting.
with tapes.Tape() as model_tape: with graph_tape:
model_tape._exporting = True outputs = model(*args)
outputs = model(*args)
# Process the outputs # Process the outputs
if isinstance(outputs, dict): if isinstance(outputs, dict):
...@@ -159,7 +157,7 @@ def export( ...@@ -159,7 +157,7 @@ def export(
graph_def.output.extend([output_names[i]]) graph_def.output.extend([output_names[i]])
# Add operators. # Add operators.
for op_def in model_tape.get_elements(): for op_def in graph_tape.get_elements():
ops_def.append(dragon_pb2.OperatorDef()) ops_def.append(dragon_pb2.OperatorDef())
ops_def[-1].ParseFromString(op_def.SerializeAs()) ops_def[-1].ParseFromString(op_def.SerializeAs())
graph_def.op.extend(ops_def) graph_def.op.extend(ops_def)
...@@ -176,17 +174,16 @@ def export( ...@@ -176,17 +174,16 @@ def export(
constants[k] = v constants[k] = v
# Export. # Export.
with execute_ws.as_default(): model = graph_def_to_onnx_model(
model = graph_def_to_onnx_model( graph_def=graph_def,
graph_def=graph_def, input_names=input_names,
input_names=input_names, output_names=output_names,
output_names=output_names, input_shapes=input_shapes,
input_shapes=input_shapes, constants=constants,
constants=constants, value_info=value_info,
value_info=value_info, opset_version=opset_version,
opset_version=opset_version, workspace=workspace.get_workspace(),
workspace=execute_ws, verbose=verbose,
verbose=verbose, enable_onnx_checker=enable_onnx_checker,
enable_onnx_checker=enable_onnx_checker, )
) serialization.save_bytes(serialization.serialize_proto(model), f)
serialization.save_bytes(serialization.serialize_proto(model), f)
...@@ -14,90 +14,48 @@ from __future__ import division ...@@ -14,90 +14,48 @@ from __future__ import division
from __future__ import print_function from __future__ import print_function
import os import os
import setuptools
import setuptools.command.install
import shutil import shutil
import subprocess import subprocess
import sys import sys
import setuptools
import setuptools.command.build_py
import setuptools.command.install
try: try:
# Override a non-pure "wheel" for pybind distributions # Override a non-pure "wheel" for pybind distributions.
from wheel.bdist_wheel import bdist_wheel as _bdist_wheel from wheel.bdist_wheel import bdist_wheel as _bdist_wheel
class bdist_wheel(_bdist_wheel): class bdist_wheel(_bdist_wheel):
def finalize_options(self): def finalize_options(self):
_bdist_wheel.finalize_options(self) super(bdist_wheel, self).finalize_options()
self.root_is_pure = False self.root_is_pure = False
except ImportError: except ImportError:
bdist_wheel = None bdist_wheel = None
# Read the current version info version = git_version = None
with open('version.txt', 'r') as f: with open('version.txt', 'r') as f:
version = f.read().strip() version = f.read().strip()
try: if os.path.exists('.git'):
git_version = subprocess.check_output( try:
['git', 'rev-parse', 'HEAD'], cwd='../').decode('ascii').strip() git_version = subprocess.check_output(
except (OSError, subprocess.CalledProcessError): ['git', 'rev-parse', 'HEAD'], cwd='../')
git_version = None git_version = git_version.decode('ascii').strip()
except (OSError, subprocess.CalledProcessError):
pass
def clean(): def clean_builds():
"""Remove the work directories.""" """Clean the builds."""
if os.path.exists('dragon/version.py'): if os.path.exists('dragon/version.py'):
shutil.rmtree('dragon') shutil.rmtree('dragon')
if os.path.exists('build/lib'):
shutil.rmtree('build/lib')
if os.path.exists('seeta_dragon.egg-info'): if os.path.exists('seeta_dragon.egg-info'):
shutil.rmtree('seeta_dragon.egg-info') shutil.rmtree('seeta_dragon.egg-info')
def configure():
"""Prepare the package files."""
clean()
# Create a temporary site-package directory.
shutil.copytree('python', 'dragon')
# Copy headers.
shutil.copytree('../targets/native/include', 'dragon/include')
# Copy "caffe" => "dragon.vm.caffe"
shutil.copytree('../caffe', 'dragon/vm/caffe')
# Copy "dali" => "dragon.vm.dali"
shutil.copytree('../dali', 'dragon/vm/dali')
# Copy "tensorflow" => "dragon.vm.tensorflow"
shutil.copytree('../tensorflow', 'dragon/vm/tensorflow')
# Copy "tensorlayer" => "dragon.vm.tensorlayer"
shutil.copytree('../tensorlayer', 'dragon/vm/tensorlayer')
# Copy "tensorrt/python" => "dragon.vm.tensorrt"
shutil.copytree('../tensorrt/python', 'dragon/vm/tensorrt')
# Copy "torch" => "dragon.vm.torch"
shutil.copytree('../torch', 'dragon/vm/torch')
# Copy "torchvision" => "dragon.vm.torchvision"
shutil.copytree('../torchvision', 'dragon/vm/torchvision')
# Copy the pre-built libraries.
if not os.path.exists('dragon/lib'):
os.makedirs('dragon/lib')
for src, dest in find_libraries().items():
if os.path.exists(src):
shutil.copy(src, dest)
else:
print('ERROR: Unable to find the library at <%s>.\n'
'Build it before installing to package.' % src)
shutil.rmtree('dragon')
sys.exit()
# Write the version file.
with open('dragon/version.py', 'w') as f:
f.write("from __future__ import absolute_import\n"
"from __future__ import division\n"
"from __future__ import print_function\n\n"
"version = '{}'\n"
"git_version = '{}'\n".format(version, git_version))
class install(setuptools.command.install.install):
"""Old-style command to prevent from installing egg."""
def run(self):
setuptools.command.install.install.run(self)
def find_libraries(): def find_libraries():
"""Return the pre-built libraries.""" """Return the pre-built libraries."""
in_prefix = '' if sys.platform == 'win32' else 'lib' in_prefix = '' if sys.platform == 'win32' else 'lib'
...@@ -110,7 +68,7 @@ def find_libraries(): ...@@ -110,7 +68,7 @@ def find_libraries():
'../targets/native/lib/{}dragon{}'.format(in_prefix, in_suffix): '../targets/native/lib/{}dragon{}'.format(in_prefix, in_suffix):
'dragon/lib/{}dragon{}'.format(in_prefix, in_suffix), 'dragon/lib/{}dragon{}'.format(in_prefix, in_suffix),
'../targets/native/lib/{}dragon_python{}'.format(in_prefix, in_suffix): '../targets/native/lib/{}dragon_python{}'.format(in_prefix, in_suffix):
'dragon/lib/libdragon_python{}'.format(out_suffix) 'dragon/lib/libdragon_python{}'.format(out_suffix),
} }
if sys.platform == 'win32': if sys.platform == 'win32':
libraries['../targets/native/lib/dragon.lib'] = 'dragon/lib/dragon.lib' libraries['../targets/native/lib/dragon.lib'] = 'dragon/lib/dragon.lib'
...@@ -118,30 +76,74 @@ def find_libraries(): ...@@ -118,30 +76,74 @@ def find_libraries():
return libraries return libraries
def find_packages(): def find_packages(top):
"""Return the python sources installed to package.""" """Return the python sources installed to package."""
packages = [] packages = []
for root, _, files in os.walk('dragon'): for root, _, _ in os.walk(top):
if os.path.exists(os.path.join(root, '__init__.py')): if os.path.exists(os.path.join(root, '__init__.py')):
packages.append(root) packages.append(root)
return packages return packages
def find_package_data(): def find_package_data(top):
"""Return the external data installed to package.""" """Return the external data installed to package."""
headers, libraries = [], [] headers, libraries = [], []
for root, _, files in os.walk('dragon/include'): for root, _, files in os.walk(top + '/include'):
root = root[len('dragon/'):] root = root[len(top + '/'):]
for file in files: for file in files:
headers.append(os.path.join(root, file)) headers.append(os.path.join(root, file))
for root, _, files in os.walk('dragon/lib'): for root, _, files in os.walk(top + '/lib'):
root = root[len('dragon/'):] root = root[len(top + '/'):]
for file in files: for file in files:
libraries.append(os.path.join(root, file)) libraries.append(os.path.join(root, file))
return headers + libraries return headers + libraries
class BuildPyCommand(setuptools.command.build_py.build_py):
    """Enhanced 'build_py' command."""

    def build_packages(self):
        clean_builds()
        shutil.copytree('python', 'dragon')
        shutil.copytree('../caffe', 'dragon/vm/caffe')
        shutil.copytree('../dali', 'dragon/vm/dali')
        shutil.copytree('../tensorflow', 'dragon/vm/tensorflow')
        shutil.copytree('../tensorlayer', 'dragon/vm/tensorlayer')
        shutil.copytree('../tensorrt/python', 'dragon/vm/tensorrt')
        shutil.copytree('../torch', 'dragon/vm/torch')
        shutil.copytree('../torchvision', 'dragon/vm/torchvision')
        with open('dragon/version.py', 'w') as f:
            f.write("from __future__ import absolute_import\n"
                    "from __future__ import division\n"
                    "from __future__ import print_function\n\n"
                    "version = '{}'\n"
                    "git_version = '{}'\n".format(version, git_version))
        self.packages = find_packages('dragon')
        super(BuildPyCommand, self).build_packages()

    def build_package_data(self):
        shutil.copytree('../targets/native/include', 'dragon/include')
        if not os.path.exists('dragon/lib'):
            os.makedirs('dragon/lib')
        for src, dest in find_libraries().items():
            if os.path.exists(src):
                shutil.copy(src, dest)
            else:
                print('ERROR: Unable to find the library at <%s>.\n'
                      'Build it before installing to package.' % src)
                sys.exit(1)
        self.package_data = {'dragon': find_package_data('dragon')}
        super(BuildPyCommand, self).build_package_data()
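With hypothetical version strings, the dragon/version.py generated by build_packages above would read as follows (the real values come from the version and git_version variables defined earlier in the script):

# Hypothetical contents of the generated dragon/version.py.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

version = '0.0.0'
git_version = '0000000000000000000000000000000000000000'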
class InstallCommand(setuptools.command.install.install):
    """Enhanced 'install' command."""

    def run(self):
        # Old-style install instead of egg.
        super(InstallCommand, self).run()
setuptools.setup(
    name='seeta-dragon',
    version=version,
@@ -150,31 +152,30 @@ setuptools.setup(
    url='https://github.com/seetaresearch/dragon',
    author='SeetaTech',
    license='BSD 2-Clause',
    packages=find_packages('python'),
    package_dir={'dragon': 'dragon'},
    cmdclass={'bdist_wheel': bdist_wheel,
              'build_py': BuildPyCommand,
              'install': InstallCommand},
    python_requires='>=3.6',
    install_requires=['numpy', 'protobuf', 'kpl-dataset'],
    classifiers=['Development Status :: 5 - Production/Stable',
                 'Intended Audience :: Developers',
                 'Intended Audience :: Education',
                 'Intended Audience :: Science/Research',
                 'License :: OSI Approved :: BSD License',
                 'Programming Language :: C++',
                 'Programming Language :: Python :: 3',
                 'Programming Language :: Python :: 3 :: Only',
                 'Programming Language :: Python :: 3.6',
                 'Programming Language :: Python :: 3.7',
                 'Programming Language :: Python :: 3.8',
                 'Programming Language :: Python :: 3.9',
                 'Topic :: Scientific/Engineering',
                 'Topic :: Scientific/Engineering :: Mathematics',
                 'Topic :: Scientific/Engineering :: Artificial Intelligence',
                 'Topic :: Software Development',
                 'Topic :: Software Development :: Libraries',
                 'Topic :: Software Development :: Libraries :: Python Modules'],
)
clean_builds()
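A plausible way to drive the refactored commands from the directory containing the script (assuming it is the usual setup.py, and that the compiled libraries already exist under ../targets/native/lib, since build_package_data aborts when they are missing):

# Sketch: produce a wheel; 'bdist_wheel' runs BuildPyCommand through the
# cmdclass mapping above, which assembles the 'dragon' tree first.
import subprocess
import sys

subprocess.check_call([sys.executable, 'setup.py', 'bdist_wheel'])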
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef DRAGON_UTILS_MATH_SORT_H_
#define DRAGON_UTILS_MATH_SORT_H_
#include "dragon/core/context.h"
namespace dragon {
namespace math {
/*
* Sort Utilities.
*/
namespace utils {
template <typename T>
inline void ArgPartition(
    const int count,
    const int kth,
    const bool descend,
    const T* v,
    vec64_t& indices) {
  indices.resize(count);
  std::iota(indices.begin(), indices.end(), 0);
  if (descend) {
    std::nth_element(
        indices.begin(),
        indices.begin() + kth,
        indices.end(),
        [&v](int64_t lhs, int64_t rhs) { return v[lhs] > v[rhs]; });
  } else {
    std::nth_element(
        indices.begin(),
        indices.begin() + kth,
        indices.end(),
        [&v](int64_t lhs, int64_t rhs) { return v[lhs] < v[rhs]; });
  }
}
} // namespace utils
} // namespace math
} // namespace dragon
#endif // DRAGON_UTILS_MATH_SORT_H_
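For intuition only, the selection ArgPartition performs (the kth element lands in its sorted position and everything before it compares no worse, with no further ordering guaranteed) matches numpy.argpartition; a small Python sketch, separate from the C++ header:

import numpy as np

v = np.array([5.0, 1.0, 9.0, 3.0, 7.0])
kth = 2

# Ascending partition: positions 0..kth hold indices of the kth+1 smallest values.
asc = np.argpartition(v, kth)
print(v[asc[:kth + 1]])   # the values 1.0, 3.0, 5.0 in some order

# Descending partition (the 'descend' branch): negate the keys.
desc = np.argpartition(-v, kth)
print(v[desc[:kth + 1]])  # the values 9.0, 7.0, 5.0 in some order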
@@ -54,10 +54,14 @@ void _AffineImpl(
    const T* scale,
    const T* bias,
    T* y) {
  if (num_dims == 1 && num_axes == 1 && axes[0] == 0) {
    _AffineChannel(1, dims[0], x, scale, bias, y); // [NxC]
  } else if (num_dims == 2 && num_axes == 1 && axes[0] == 1) {
    _AffineChannel(dims[0], dims[1], x, scale, bias, y); // [N, C]
  } else if (num_dims == 2 && num_axes == 1 && axes[0] == 0) {
    _AffineChannel(1, dims[0], dims[1], x, scale, bias, y); // [NxC, S]
  } else if (num_dims == 3 && num_axes == 1 && axes[0] == 1) {
    _AffineChannel(dims[0], dims[1], dims[2], x, scale, bias, y); // [N, C, S]
  } else {
    LOG(FATAL) << "Unsupported affine dimensions.";
  }
...
@@ -65,12 +65,18 @@ void _AffineImpl(
    T* y,
    CUDAContext* ctx) {
  const auto N = math::utils::Prod(num_dims, dims);
  if (num_dims == 1 && num_axes == 1 && axes[0] == 0) {
    _AffineChannel<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
        N, dims[0], x, scale, bias, y); // [NxC]
  } else if (num_dims == 2 && num_axes == 1 && axes[0] == 1) {
    _AffineChannel<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
        N, dims[1], x, scale, bias, y); // [N, C]
  } else if (num_dims == 2 && num_axes == 1 && axes[0] == 0) {
    _AffineChannel<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
        N, dims[0], dims[1], x, scale, bias, y); // [NxC, S]
  } else if (num_dims == 3 && num_axes == 1 && axes[0] == 1) {
    _AffineChannel<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
        N, dims[1], dims[2], x, scale, bias, y); // [N, C, S]
  } else {
    LOG(FATAL) << "Unsupported affine dimensions.";
  }
...
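The supported layouts all amount to broadcasting a per-channel scale and bias over the input; a rough NumPy sketch of the same arithmetic (array names here are illustrative, not the kernel API):

import numpy as np

N, C, S = 2, 3, 4
scale = np.random.rand(C).astype('float32')
bias = np.random.rand(C).astype('float32')

# [N, C]: axes == [1] on a 2-D input, channel is the innermost axis.
x2 = np.random.rand(N, C).astype('float32')
y2 = x2 * scale + bias

# [N, C, S]: axes == [1] on a 3-D input, a spatial axis follows the channels.
x3 = np.random.rand(N, C, S).astype('float32')
y3 = x3 * scale[None, :, None] + bias[None, :, None]

# axes == [0]: the whole leading dimension acts as the channel axis,
# so scale/bias have length dims[0].
x0 = np.random.rand(N * C, S).astype('float32')
s0 = np.random.rand(N * C).astype('float32')
b0 = np.random.rand(N * C).astype('float32')
y0 = x0 * s0[:, None] + b0[:, None]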
@@ -40,19 +40,19 @@ class ScalarType<float16> {
#endif

template <typename T>
class AccumulatorType {
 public:
  typedef float type;
};

template <>
class AccumulatorType<int64_t> {
 public:
  typedef double type;
};

template <>
class AccumulatorType<double> {
 public:
  typedef double type;
};
...
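Accumulating in a wider floating type than the storage type (e.g. float for float16 inputs) keeps long reductions accurate; a small Python illustration of the float16 case, independent of the traits above:

import numpy as np

x = np.full(10000, 0.1, dtype=np.float16)

acc16 = np.float16(0)
for v in x:            # naive serial accumulation in float16
    acc16 = np.float16(acc16 + v)

acc32 = np.float32(0)
for v in x:            # same loop, accumulating in float32
    acc32 += np.float32(v)

print(acc16)  # stalls around 256.0 once the running sum outgrows float16 spacing
print(acc32)  # about 999.76 (each float16 input rounds 0.1 to roughly 0.09998)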
@@ -20,7 +20,6 @@
#include "dragon/utils/math/functional.h"
#include "dragon/utils/math/random.h"
#include "dragon/utils/math/reduce.h"
#include "dragon/utils/math/sort.h"
#include "dragon/utils/math/transform.h" #include "dragon/utils/math/transform.h"
#include "dragon/utils/math/transpose.h" #include "dragon/utils/math/transpose.h"
#include "dragon/utils/math/types.h" #include "dragon/utils/math/types.h"
......
@@ -46,6 +46,10 @@ class TestBackend(unittest.TestCase):
        with group.as_default():
            self.assertEqual(dragon.distributed.get_rank(group), 0)

    @unittest.skipIf(not TEST_MPI, 'MPI unavailable')
    def test_finalize(self):
        dragon.distributed.finalize()


if __name__ == '__main__':
    run_tests()
@@ -39,13 +39,14 @@ class FunctionGuard(function_lib.FunctionGuard):
            if not isinstance(input, Tensor) and input_spec is None:
                inputs.append(input)
                continue
            input_spec = input_spec or {}
            for k in ('shape', 'dtype', 'device'):
                input_spec[k] = getattr(input, k, input_spec.get(k, None))
            inputs.append(Tensor(*input_spec['shape'],
                                 dtype=input_spec['dtype'],
                                 device=input_spec['device']))
            if isinstance(input, Tensor):
                inputs[-1].copy_(input)
        with tapes.Tape() as function_tape:
            function_tape._tracing = True
            attributes['inputs'] = inputs
...
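The input_spec handling above folds the concrete input's attributes into the declared spec before the symbolic Tensor is created; the merge itself can be sketched with plain Python objects (names here are illustrative stand-ins, not the framework's types):

def merge_spec(input, input_spec):
    """Prefer the concrete input's attributes, fall back to the declared spec."""
    input_spec = dict(input_spec or {})
    for k in ('shape', 'dtype', 'device'):
        input_spec[k] = getattr(input, k, input_spec.get(k, None))
    return input_spec

class FakeTensor(object):
    shape, dtype, device = (2, 3), 'float32', 'cuda:0'

print(merge_spec(FakeTensor(), None))
# {'shape': (2, 3), 'dtype': 'float32', 'device': 'cuda:0'}
print(merge_spec(object(), {'shape': (None, 3), 'dtype': 'float32', 'device': 'cpu'}))
# {'shape': (None, 3), 'dtype': 'float32', 'device': 'cpu'}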
@@ -3299,7 +3299,6 @@ class Tensor(object):

    def __del__(self):
        if self._deleter:
            self._deleter.release(self._impl.name)

    def __eq__(self, other):
...