Commit a79a3bba by Ting PAN

Refactor the python distribution script

Summary:
This commit collects the Python package and the compiled dynamic libraries
through the setuptools distribution commands ('build_py' and 'install'),
replacing the ad-hoc configure() step.
1 parent 494774d3
Showing with 476 additions and 521 deletions
......@@ -62,6 +62,7 @@ if (USE_MPI)
list(APPEND THIRD_PARTY_LIBRARY_DIRS ${THIRD_PARTY_DIR}/mpi/lib)
endif()
if (USE_CUDNN)
list(APPEND THIRD_PARTY_LIBRARY_DIRS ${THIRD_PARTY_DIR}/cudnn/lib)
list(APPEND THIRD_PARTY_LIBRARY_DIRS ${THIRD_PARTY_DIR}/cudnn/lib64)
list(APPEND THIRD_PARTY_LIBRARY_DIRS ${THIRD_PARTY_DIR}/cudnn/lib/x64)
endif()
......
......@@ -147,7 +147,7 @@ class DRAGON_API Tensor {
} else {
auto* new_memory = other->memory();
if (new_memory != nullptr) {
CHECK_LE(size_, new_memory->size())
CHECK_LE(size_ * meta_.itemsize(), new_memory->size())
<< "\nMap from a memory with smaller capacity.";
mapped_memory_ = new_memory;
capacity_ = new_memory->size();
......
......@@ -25,33 +25,33 @@ void _DropPath(
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void DropPath<T, CPUContext>( \
const int N, \
const int C, \
const float ratio, \
const float scale, \
const T* x, \
T* y, \
uint8_t* mask, \
uint32_t* /* r */, \
CPUContext* ctx) { \
math::RandomBernoulli(N, 1.f - ratio, mask, ctx); \
_DropPath(N, C, math::AccmulatorType<T>::type(scale), mask, x, y); \
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void DropPath<T, CPUContext>( \
const int N, \
const int C, \
const float ratio, \
const float scale, \
const T* x, \
T* y, \
uint8_t* mask, \
uint32_t* /* r */, \
CPUContext* ctx) { \
math::RandomBernoulli(N, 1.f - ratio, mask, ctx); \
_DropPath(N, C, math::AccumulatorType<T>::type(scale), mask, x, y); \
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void DropPathGrad<T, CPUContext>( \
const int N, \
const int C, \
const float scale, \
const uint8_t* mask, \
const T* dy, \
T* dx, \
CPUContext* ctx) { \
_DropPath(N, C, math::AccmulatorType<T>::type(scale), mask, dy, dx); \
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void DropPathGrad<T, CPUContext>( \
const int N, \
const int C, \
const float scale, \
const uint8_t* mask, \
const T* dy, \
T* dx, \
CPUContext* ctx) { \
_DropPath(N, C, math::AccumulatorType<T>::type(scale), mask, dy, dx); \
}
DEFINE_KERNEL_LAUNCHER(float16);
......
......@@ -62,7 +62,7 @@ __global__ void _DropPathGrad(
_DropPath<<<CUDA_BLOCKS(NxC), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxC, \
C, \
convert::To<math::AccmulatorType<T>::type>(scale), \
convert::To<math::AccumulatorType<T>::type>(scale), \
static_cast<uint32_t>(UINT_MAX * ratio), \
r, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
......@@ -84,7 +84,7 @@ __global__ void _DropPathGrad(
_DropPathGrad<<<CUDA_BLOCKS(NxC), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxC, \
C, \
convert::To<math::AccmulatorType<T>::type>(scale), \
convert::To<math::AccumulatorType<T>::type>(scale), \
mask, \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<math::ScalarType<T>::type*>(dx)); \
......
......@@ -43,7 +43,7 @@ __global__ void _Dropout(
math::Random(N, r, ctx); \
_Dropout<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
convert::To<math::AccmulatorType<T>::type>(scale), \
convert::To<math::AccumulatorType<T>::type>(scale), \
static_cast<uint32_t>(UINT_MAX * ratio), \
r, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
......
......@@ -64,7 +64,7 @@ __global__ void _ApproxGeluGrad(const int N, const T* dy, const T* x, T* dx) {
#define DEFINE_KERNEL_LAUNCHER(name, T) \
template <> \
void name<T, CUDAContext>(const int N, const T* x, T* y, CUDAContext* ctx) { \
_##name<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \
_##name<math::ScalarType<T>::type, math::AccumulatorType<T>::type> \
<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
......@@ -75,7 +75,7 @@ __global__ void _ApproxGeluGrad(const int N, const T* dy, const T* x, T* dx) {
template <> \
void name<T, CUDAContext>( \
const int N, const T* dy, const T* x, T* dx, CUDAContext* ctx) { \
_##name<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \
_##name<math::ScalarType<T>::type, math::AccumulatorType<T>::type> \
<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
......
......@@ -49,8 +49,8 @@ __global__ void _HardSigmoidGrad(
CUDAContext* ctx) { \
_HardSigmoid<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
convert::To<math::AccmulatorType<T>::type>(alpha), \
convert::To<math::AccmulatorType<T>::type>(beta), \
convert::To<math::AccumulatorType<T>::type>(alpha), \
convert::To<math::AccumulatorType<T>::type>(beta), \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
}
......@@ -66,7 +66,7 @@ __global__ void _HardSigmoidGrad(
CUDAContext* ctx) { \
_HardSigmoidGrad<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
convert::To<math::AccmulatorType<T>::type>(alpha), \
convert::To<math::AccumulatorType<T>::type>(alpha), \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<const math::ScalarType<T>::type*>(y), \
reinterpret_cast<math::ScalarType<T>::type*>(dx)); \
......
......@@ -36,27 +36,27 @@ __global__ void _HardSwishGrad(const int N, const T* dy, const T* x, T* dx) {
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void HardSwish<T, CUDAContext>( \
const int N, const T* x, T* y, CUDAContext* ctx) { \
_HardSwish<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \
<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void HardSwish<T, CUDAContext>( \
const int N, const T* x, T* y, CUDAContext* ctx) { \
_HardSwish<math::ScalarType<T>::type, math::AccumulatorType<T>::type> \
<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void HardSwishGrad<T, CUDAContext>( \
const int N, const T* dy, const T* x, T* dx, CUDAContext* ctx) { \
_HardSwishGrad<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \
<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(dx)); \
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void HardSwishGrad<T, CUDAContext>( \
const int N, const T* dy, const T* x, T* dx, CUDAContext* ctx) { \
_HardSwishGrad<math::ScalarType<T>::type, math::AccumulatorType<T>::type> \
<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(dx)); \
}
DEFINE_KERNEL_LAUNCHER(float16);
......
......@@ -138,7 +138,7 @@ __global__ void _PReluWGrad(
DISPATCH_CWISE_PRELU_KERNEL( \
_PRelu, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
math::AccumulatorType<T>::type, \
CUDA_BLOCKS(NxCxS), \
CUDA_THREADS, \
NxCxS, \
......@@ -148,7 +148,7 @@ __global__ void _PReluWGrad(
reinterpret_cast<const math::ScalarType<T>::type*>(w), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
} else { \
_PRelu<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \
_PRelu<math::ScalarType<T>::type, math::AccumulatorType<T>::type> \
<<<CUDA_BLOCKS(NxCxS), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxCxS, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
......@@ -157,76 +157,76 @@ __global__ void _PReluWGrad(
} \
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void PReluGrad<T, CUDAContext>( \
const int N, \
const int S, \
const int C, \
const string& data_format, \
const T* dy, \
const T* x, \
const T* w, \
T* dx, \
CUDAContext* ctx) { \
const auto NxCxS = N * C * S; \
if (C > 1) { \
DISPATCH_CWISE_PRELU_KERNEL( \
_PReluGrad, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
CUDA_BLOCKS(NxCxS), \
CUDA_THREADS, \
NxCxS, \
S, \
C, \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<const math::ScalarType<T>::type*>(w), \
reinterpret_cast<math::ScalarType<T>::type*>(dx)); \
} else { \
_PReluGrad<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \
<<<CUDA_BLOCKS(NxCxS), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxCxS, \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<const math::ScalarType<T>::type*>(w), \
reinterpret_cast<math::ScalarType<T>::type*>(dx)); \
} \
} \
template <> \
void PReluWGrad<T, CUDAContext>( \
const int N, \
const int S, \
const int C, \
const string& data_format, \
const T* dy, \
const T* x, \
T* dw, \
CUDAContext* ctx) { \
const auto NxS = N * S; \
const auto NxCxS = NxS * C; \
if (C > 1) { \
DISPATCH_CWISE_PRELU_KERNEL( \
_PReluWGrad, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
C, \
CUDA_THREADS, \
NxS, \
S, \
C, \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(dw)); \
} else { \
_PReluWGrad<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \
<<<1, CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxCxS, \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(dw)); \
} \
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void PReluGrad<T, CUDAContext>( \
const int N, \
const int S, \
const int C, \
const string& data_format, \
const T* dy, \
const T* x, \
const T* w, \
T* dx, \
CUDAContext* ctx) { \
const auto NxCxS = N * C * S; \
if (C > 1) { \
DISPATCH_CWISE_PRELU_KERNEL( \
_PReluGrad, \
math::ScalarType<T>::type, \
math::AccumulatorType<T>::type, \
CUDA_BLOCKS(NxCxS), \
CUDA_THREADS, \
NxCxS, \
S, \
C, \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<const math::ScalarType<T>::type*>(w), \
reinterpret_cast<math::ScalarType<T>::type*>(dx)); \
} else { \
_PReluGrad<math::ScalarType<T>::type, math::AccumulatorType<T>::type> \
<<<CUDA_BLOCKS(NxCxS), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxCxS, \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<const math::ScalarType<T>::type*>(w), \
reinterpret_cast<math::ScalarType<T>::type*>(dx)); \
} \
} \
template <> \
void PReluWGrad<T, CUDAContext>( \
const int N, \
const int S, \
const int C, \
const string& data_format, \
const T* dy, \
const T* x, \
T* dw, \
CUDAContext* ctx) { \
const auto NxS = N * S; \
const auto NxCxS = NxS * C; \
if (C > 1) { \
DISPATCH_CWISE_PRELU_KERNEL( \
_PReluWGrad, \
math::ScalarType<T>::type, \
math::AccumulatorType<T>::type, \
C, \
CUDA_THREADS, \
NxS, \
S, \
C, \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(dw)); \
} else { \
_PReluWGrad<math::ScalarType<T>::type, math::AccumulatorType<T>::type> \
<<<1, CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxCxS, \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(dw)); \
} \
}
DEFINE_KERNEL_LAUNCHER(float16);
......
......@@ -38,40 +38,40 @@ void _SeluGrad(
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void Selu<T, CPUContext>( \
const int N, \
const float alpha, \
const float gamma, \
const T* x, \
T* y, \
CPUContext* ctx) { \
_Selu( \
N, \
convert::To<math::AccmulatorType<T>::type>(alpha), \
convert::To<math::AccmulatorType<T>::type>(gamma), \
x, \
y); \
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void Selu<T, CPUContext>( \
const int N, \
const float alpha, \
const float gamma, \
const T* x, \
T* y, \
CPUContext* ctx) { \
_Selu( \
N, \
convert::To<math::AccumulatorType<T>::type>(alpha), \
convert::To<math::AccumulatorType<T>::type>(gamma), \
x, \
y); \
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void SeluGrad<T, CPUContext>( \
const int N, \
const float alpha, \
const float gamma, \
const T* dy, \
const T* y, \
T* dx, \
CPUContext* ctx) { \
_SeluGrad( \
N, \
convert::To<math::AccmulatorType<T>::type>(alpha), \
convert::To<math::AccmulatorType<T>::type>(gamma), \
dy, \
y, \
dx); \
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void SeluGrad<T, CPUContext>( \
const int N, \
const float alpha, \
const float gamma, \
const T* dy, \
const T* y, \
T* dx, \
CPUContext* ctx) { \
_SeluGrad( \
N, \
convert::To<math::AccumulatorType<T>::type>(alpha), \
convert::To<math::AccumulatorType<T>::type>(gamma), \
dy, \
y, \
dx); \
}
DEFINE_KERNEL_LAUNCHER(float16);
......
......@@ -35,23 +35,23 @@ __global__ void _SiluGrad(const int N, const T* dy, const T* x, T* dx) {
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void Silu<T, CUDAContext>(const int N, const T* x, T* y, CUDAContext* ctx) { \
_Silu<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \
_Silu<math::ScalarType<T>::type, math::AccumulatorType<T>::type> \
<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void SiluGrad<T, CUDAContext>( \
const int N, const T* dy, const T* x, T* dx, CUDAContext* ctx) { \
_SiluGrad<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \
<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(dx)); \
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void SiluGrad<T, CUDAContext>( \
const int N, const T* dy, const T* x, T* dx, CUDAContext* ctx) { \
_SiluGrad<math::ScalarType<T>::type, math::AccumulatorType<T>::type> \
<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(dx)); \
}
DEFINE_KERNEL_LAUNCHER(float16);
......
......@@ -285,7 +285,7 @@ __global__ void _LogSoftmaxGradViaBlockReduce(
const auto nblocks = math::utils::DivUp<int>(NxS, WARP_ITEMS); \
_##name##ViaWarpReduce< \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type> \
math::AccumulatorType<T>::type> \
<<<nblocks, \
dim3(CUDA_WARP_SIZE, WARP_ITEMS), \
0, \
......@@ -299,7 +299,7 @@ __global__ void _LogSoftmaxGradViaBlockReduce(
} \
_##name##ViaBlockReduce< \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type> \
math::AccumulatorType<T>::type> \
<<<NxS, BLOCK_THREADS, 0, ctx->cuda_stream()>>>( \
NxS, \
S, \
......@@ -323,7 +323,7 @@ __global__ void _LogSoftmaxGradViaBlockReduce(
const auto nblocks = math::utils::DivUp<int>(NxS, WARP_ITEMS); \
_##name##ViaWarpReduce< \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type> \
math::AccumulatorType<T>::type> \
<<<nblocks, \
dim3(CUDA_WARP_SIZE, WARP_ITEMS), \
0, \
......@@ -338,7 +338,7 @@ __global__ void _LogSoftmaxGradViaBlockReduce(
} \
_##name##ViaBlockReduce< \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type> \
math::AccumulatorType<T>::type> \
<<<NxS, BLOCK_THREADS, 0, ctx->cuda_stream()>>>( \
NxS, \
S, \
......
......@@ -69,26 +69,26 @@ __global__ void _RepeatGrad(
NxCxS2, C, S, S2, x, y); \
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void RepeatGrad<T, CUDAContext>( \
const int N, \
const int S, \
const int C, \
const int repeats, \
const T* dy, \
T* dx, \
CUDAContext* ctx) { \
const auto S2 = S * repeats; \
const auto NxCxS = N * C * S; \
_RepeatGrad<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \
<<<CUDA_BLOCKS(NxCxS), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxCxS, \
C, \
S, \
S2, \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<math::ScalarType<T>::type*>(dx)); \
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void RepeatGrad<T, CUDAContext>( \
const int N, \
const int S, \
const int C, \
const int repeats, \
const T* dy, \
T* dx, \
CUDAContext* ctx) { \
const auto S2 = S * repeats; \
const auto NxCxS = N * C * S; \
_RepeatGrad<math::ScalarType<T>::type, math::AccumulatorType<T>::type> \
<<<CUDA_BLOCKS(NxCxS), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxCxS, \
C, \
S, \
S2, \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<math::ScalarType<T>::type*>(dx)); \
}
DEFINE_KERNEL_LAUNCHER(bool);
......
......@@ -77,7 +77,7 @@ __global__ void _BroadcastLossGrad(
T* dx, \
CUDAContext* ctx) { \
using ScalarT = math::ScalarType<T>::type; \
using AccT = math::AccmulatorType<T>::type; \
using AccT = math::AccumulatorType<T>::type; \
if (num_masks > 0 && normalizer < 0.f) { \
auto* num_valid = const_cast<T*>(mask + num_masks); \
math::Sum(num_masks, 1.f, mask, num_valid, ctx); \
......@@ -105,7 +105,7 @@ __global__ void _BroadcastLossGrad(
T* dx, \
CUDAContext* ctx) { \
using ScalarT = math::ScalarType<T>::type; \
using AccT = math::AccmulatorType<T>::type; \
using AccT = math::AccumulatorType<T>::type; \
const auto CxS = C * S; \
const auto NxCxS = N * CxS; \
_BroadcastLossGrad<ScalarT, AccT> \
......
......@@ -42,7 +42,7 @@ __global__ void _SmoothL1Grad(const int N, const AccT beta, const T* x, T* y) {
const int N, const float beta, const T* x, T* y, CUDAContext* ctx) { \
_##name<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
convert::To<math::AccmulatorType<T>::type>(beta), \
convert::To<math::AccumulatorType<T>::type>(beta), \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
}
......
......@@ -70,12 +70,12 @@ void _ReduceSumGradImpl(
DISPATCH_FUNC_BY_VALUE_WITH_TYPE_2( \
_ReduceSumGradImpl, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
math::AccumulatorType<T>::type, \
num_dims, \
x_dims, \
y_dims, \
y_strides, \
convert::To<math::AccmulatorType<T>::type>(scale), \
convert::To<math::AccumulatorType<T>::type>(scale), \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<math::ScalarType<T>::type*>(dx), \
ctx); \
......
......@@ -440,7 +440,7 @@ void _AvgPool3dGradNHWC(
DISPATCH_POOL_KERNEL( \
_##name, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
math::AccumulatorType<T>::type, \
N, \
C, \
H, \
......@@ -492,7 +492,7 @@ DEFINE_KERNEL_LAUNCHER(AvgPool2dGrad, double); // AvgPool2dGrad
DISPATCH_POOL_KERNEL( \
_##name, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
math::AccumulatorType<T>::type, \
N, \
C, \
D, \
......
......@@ -457,7 +457,7 @@ __global__ void _AvgPool3dGradNHWC(
DISPATCH_POOL_KERNEL( \
_##name, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
math::AccumulatorType<T>::type, \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
nthreads, \
......@@ -512,7 +512,7 @@ DEFINE_KERNEL_LAUNCHER(AvgPool2dGrad, double, (H * W)); // AvgPool2dGrad
DISPATCH_POOL_KERNEL( \
_##name, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
math::AccumulatorType<T>::type, \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
nthreads, \
......
......@@ -37,35 +37,35 @@ __global__ void _BiasAdd(
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void BiasAdd<T, CUDAContext>( \
const int N, \
const int S, \
const int C, \
const T* x, \
const T* bias, \
T* y, \
CUDAContext* ctx) { \
const auto NxCxS = N * C * S; \
if (S == 1) { \
_BiasAdd<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \
<<<CUDA_BLOCKS(NxCxS), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxCxS, \
C, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<const math::ScalarType<T>::type*>(bias), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
} else { \
_BiasAdd<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \
<<<CUDA_BLOCKS(NxCxS), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxCxS, \
S, \
C, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<const math::ScalarType<T>::type*>(bias), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
} \
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void BiasAdd<T, CUDAContext>( \
const int N, \
const int S, \
const int C, \
const T* x, \
const T* bias, \
T* y, \
CUDAContext* ctx) { \
const auto NxCxS = N * C * S; \
if (S == 1) { \
_BiasAdd<math::ScalarType<T>::type, math::AccumulatorType<T>::type> \
<<<CUDA_BLOCKS(NxCxS), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxCxS, \
C, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<const math::ScalarType<T>::type*>(bias), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
} else { \
_BiasAdd<math::ScalarType<T>::type, math::AccumulatorType<T>::type> \
<<<CUDA_BLOCKS(NxCxS), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxCxS, \
S, \
C, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<const math::ScalarType<T>::type*>(bias), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
} \
}
DEFINE_KERNEL_LAUNCHER(uint8_t);
......
......@@ -393,7 +393,7 @@ void _MaxPool3dGradNHWC(
DISPATCH_POOL_KERNEL( \
_##name, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
math::AccumulatorType<T>::type, \
N, \
C, \
H, \
......@@ -447,7 +447,7 @@ DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, double); // MaxPool2dGrad
DISPATCH_POOL_KERNEL( \
_##name, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
math::AccumulatorType<T>::type, \
N, \
C, \
D, \
......
......@@ -472,7 +472,7 @@ __global__ void _MaxPool3dGradNHWC(
DISPATCH_POOL_KERNEL( \
_##name, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
math::AccumulatorType<T>::type, \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
nthreads, \
......@@ -529,7 +529,7 @@ DEFINE_KERNEL_LAUNCHER(MaxPool2dGrad, double, (H * W)); // MaxPool2dGrad
DISPATCH_POOL_KERNEL( \
_##name, \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \
math::AccumulatorType<T>::type, \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
nthreads, \
......
......@@ -49,7 +49,7 @@ __global__ void _ResizeLinear2dNCHW(
const int w_out = yi % out_w;
const int h_out = (yi / out_w) % out_h;
const int c = (yi / out_w / out_h) % C;
const int n = yi / out_w / out_w / C;
const int n = yi / out_w / out_h / C;
const float h = TransformCoordinate(h_out, scale_h, align_corners);
const float w = TransformCoordinate(w_out, scale_w, align_corners);
......@@ -129,7 +129,7 @@ __global__ void _ResizeLinear2dGradNCHW(
const int w_out = yi % out_w;
const int h_out = (yi / out_w) % out_h;
const int c = (yi / out_w / out_h) % C;
const int n = yi / out_w / out_w / C;
const int n = yi / out_w / out_h / C;
const float h = TransformCoordinate(h_out, scale_h, align_corners);
const float w = TransformCoordinate(w_out, scale_w, align_corners);
......
......@@ -54,23 +54,22 @@ class NumpyWrapper {
}
return py::reinterpret_steal<py::object>(array);
}
auto* array = PyArray_SimpleNewFromData(
return py::reinterpret_steal<py::object>(PyArray_SimpleNewFromData(
dims.size(),
dims.data(),
dtypes::to_npy(meta),
const_cast<void*>(tensor_->raw_data<CPUContext>()));
return py::reinterpret_steal<py::object>(array);
const_cast<void*>(tensor_->raw_data<CPUContext>())));
}
Tensor* From(py::object obj, bool copy) {
auto* array =
PyArray_GETCONTIGUOUS(reinterpret_cast<PyArrayObject*>(obj.ptr()));
auto* array = PyArray_GETCONTIGUOUS((PyArrayObject*)obj.ptr());
const auto& meta = dtypes::from_npy(PyArray_TYPE(array));
CHECK(meta.id() != 0) << "\nUnsupported numpy array type.";
auto* npy_dims = PyArray_DIMS(array);
auto* data = static_cast<void*>(PyArray_DATA(array));
vector<int64_t> dims(npy_dims, npy_dims + PyArray_NDIM(array));
auto* memory = tensor_->set_meta(meta)->Reshape(dims)->memory();
tensor_->set_meta(meta)->Reshape(dims);
auto* memory = tensor_->MapFrom(nullptr)->memory();
if (copy) {
auto device_type = memory ? memory->info()["device_type"] : "cpu";
if (device_type == "cuda") {
......
......@@ -5,64 +5,33 @@ namespace dragon {
template <class Context>
template <typename T>
pair<float, float> ClipOp<Context>::ComputeBoundsWithType() {
auto meta = TypeMeta::Make<T>();
if (meta.template Match<int8_t>()) {
return std::make_pair(std::max(low_, -128.f), std::min(high_, 127.f));
} else if (meta.template Match<uint8_t>()) {
return std::make_pair(std::max(low_, 0.f), std::min(high_, 255.f));
} else if (meta.template Match<int>()) {
return std::make_pair(
std::max(low_, -214748e4f), std::min(high_, 214748e4f));
} else if (meta.template Match<int64_t>()) {
return std::make_pair(
std::max(low_, -922337e13f), std::min(high_, 922337e13f));
} else if (meta.template Match<float16>()) {
return std::make_pair(std::max(low_, -65505.f), std::min(high_, 65504.f));
} else {
return std::make_pair(std::max(low_, -FLT_MAX), std::min(high_, FLT_MAX));
}
}
template <class Context>
template <typename T>
void ClipOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0);
auto bounds = ComputeBoundsWithType<T>();
auto limits = this->template GetLimits<T>();
kernels::Clip(
X.count(),
bounds.first,
bounds.second,
limits.first,
limits.second,
X.template data<T, Context>(),
Y->ReshapeLike(X)->template mutable_data<T, Context>(),
ctx());
}
template <class Context>
void ClipOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Numerical>::Call(this, Input(0));
}
template <class Context>
template <typename T>
void ClipGradientOp<Context>::DoRunWithType() {
auto &X = Input(0), &dY = Input(1), *dX = Output(0);
auto bounds = this->template ComputeBoundsWithType<T>();
auto limits = this->template GetLimits<T>();
kernels::ClipGrad(
X.count(),
bounds.first,
bounds.second,
limits.first,
limits.second,
dY.template data<T, Context>(),
X.template data<T, Context>(),
dX->ReshapeLike(X)->template mutable_data<T, Context>(),
ctx());
}
template <class Context>
void ClipGradientOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
DEPLOY_CPU_OPERATOR(Clip);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(Clip);
......@@ -82,7 +51,7 @@ OPERATOR_SCHEMA(Clip)
OPERATOR_SCHEMA(ClipGradient)
/* X, dY */
.NumInputs(2)
/* X, dX */
/* dX */
.NumOutputs(1);
REGISTER_GRADIENT(Clip, GenericGradientMaker);
......
......@@ -26,10 +26,31 @@ class ClipOp : public Operator<Context> {
high_(OP_SINGLE_ARG(float, "high", FLT_MAX)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
void RunOnDevice() override {
DispatchHelper<dtypes::Numerical>::Call(this, Input(0));
}
template <typename T>
pair<float, float> ComputeBoundsWithType();
std::pair<float, float> GetLimits() {
float type_min, type_max;
const auto meta = TypeMeta::Make<T>();
if (meta.template Match<uint8_t>()) {
type_min = float(std::numeric_limits<uint8_t>::min());
type_max = float(std::numeric_limits<uint8_t>::max());
} else if (meta.template Match<int8_t>()) {
type_min = float(std::numeric_limits<int8_t>::min());
type_max = float(std::numeric_limits<int8_t>::max());
} else if (meta.template Match<int>()) {
type_min = float(std::numeric_limits<int>::min());
type_max = float(std::numeric_limits<int>::max());
} else if (meta.template Match<float16>()) {
type_min = -65505.f, type_max = 65504.f;
} else {
type_min = std::numeric_limits<float>::min();
type_max = std::numeric_limits<float>::max();
}
return std::make_pair(std::max(low_, type_min), std::min(high_, type_max));
}
template <typename T>
void DoRunWithType();
......@@ -45,7 +66,9 @@ class ClipGradientOp final : public ClipOp<Context> {
: ClipOp<Context>(def, ws) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T>
void DoRunWithType();
......
......@@ -7,7 +7,7 @@ namespace dragon {
template <class Context>
template <typename T>
void MomentsOp<Context>::DoRunWithType() {
using OutputT = typename math::AccmulatorType<T>::type;
using OutputT = typename math::AccumulatorType<T>::type;
auto &X = Input(0), *Y1 = Output(0), *Y2 = Output(1);
// Compute reduce axes.
......
......@@ -8,7 +8,7 @@ namespace dragon {
template <class Context>
template <typename T>
void BatchNormOp<Context>::RunTraining() {
using ParamT = typename math::AccmulatorType<T>::type;
using ParamT = typename math::AccumulatorType<T>::type;
INITIALIZE_TENSOR_VIA_SPEC(Input(1), vec64_t({C_}), ParamT);
INITIALIZE_TENSOR_VIA_SPEC(Input(2), vec64_t({C_}), ParamT);
INITIALIZE_TENSOR_VIA_SPEC(Input(3), vec64_t({C_}), ParamT);
......@@ -99,7 +99,7 @@ void BatchNormOp<Context>::RunTraining() {
template <class Context>
template <typename T>
void BatchNormOp<Context>::RunInference() {
using ParamT = typename math::AccmulatorType<T>::type;
using ParamT = typename math::AccumulatorType<T>::type;
INITIALIZE_TENSOR_VIA_SPEC(Input(1), vec64_t({C_}), ParamT);
INITIALIZE_TENSOR_VIA_SPEC(Input(2), vec64_t({C_}), ParamT);
INITIALIZE_TENSOR_VIA_SPEC(Input(3), vec64_t({C_}), ParamT);
......@@ -132,7 +132,7 @@ void BatchNormOp<Context>::RunInference() {
template <class Context>
template <typename T>
void BatchNormGradientOp<Context>::RunTraining() {
using ParamT = typename math::AccmulatorType<T>::type;
using ParamT = typename math::AccumulatorType<T>::type;
auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
auto &X_mu = Input("X_mu"), &X_rsig = Input("X_rsig");
auto* X_params = Output("X_params")->Reshape({C_ * 2});
......@@ -213,7 +213,7 @@ void BatchNormGradientOp<Context>::RunTraining() {
template <class Context>
template <typename T>
void BatchNormGradientOp<Context>::RunInference() {
using ParamT = typename math::AccmulatorType<T>::type;
using ParamT = typename math::AccumulatorType<T>::type;
auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
auto* X_params = Output("X_params")->Reshape({C_});
......
......@@ -8,7 +8,7 @@ namespace dragon {
template <class Context>
template <typename T>
void GroupNormOp<Context>::DoRunWithType() {
using ParamT = typename math::AccmulatorType<T>::type;
using ParamT = typename math::AccumulatorType<T>::type;
auto &X = Input(0), *Y = Output(0);
auto &W = Input(1), &B = Input(2);
GetBaseArguments();
......@@ -55,7 +55,7 @@ void GroupNormOp<Context>::DoRunWithType() {
template <class Context>
template <typename T>
void GroupNormGradientOp<Context>::DoRunWithType() {
using ParamT = typename math::AccmulatorType<T>::type;
using ParamT = typename math::AccumulatorType<T>::type;
auto &X = Input(0), &W = Input(1), &dY = Input(2);
auto &X_mu = Input("X_mu"), &X_rsig = Input("X_rsig");
auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
......
......@@ -7,7 +7,7 @@ namespace dragon {
template <class Context>
template <typename T>
void LayerNormOp<Context>::DoRunWithType() {
using ParamT = typename math::AccmulatorType<T>::type;
using ParamT = typename math::AccumulatorType<T>::type;
auto &X = Input(0), *Y = Output(0);
auto &W = Input(1), &B = Input(2);
GET_OP_AXIS_ARG(axis, X.ndim(), -1);
......
......@@ -16,6 +16,7 @@ from __future__ import print_function as _print_function
from dragon.core.distributed.backend import is_initialized
from dragon.core.distributed.backend import is_mpi_available
from dragon.core.distributed.backend import is_nccl_available
from dragon.core.distributed.backend import finalize
from dragon.core.distributed.backend import get_backend
from dragon.core.distributed.backend import get_group
from dragon.core.distributed.backend import get_rank
......
......@@ -213,11 +213,6 @@ class OpLib(object):
for output in outputs:
output._requires_grad = False
# Ensure the named operator for the tracing graph.
if hasattr(graph_tape, '_tracing') and not op_name:
op_def.name = op_name = execute_ws.create_handle(op_def.type)
graph_tape.add_handle(op_name)
# Emit to dispatch this execution.
for feed_key, value_type in run_config['feed_dict'].items():
dest = execute_ws.create_tensor(op_name + '/' + feed_key)
......
......@@ -14,6 +14,7 @@ from __future__ import division
from __future__ import print_function
from dragon.core.distributed.backend import is_initialized
from dragon.core.distributed.backend import finalize
from dragon.core.distributed.backend import get_backend
from dragon.core.distributed.backend import get_group
from dragon.core.distributed.backend import get_rank
......
......@@ -169,6 +169,12 @@ def is_nccl_available():
return _b.ncclIsAvailable()
def finalize():
"""Finalize the distributed environment."""
global _GLOBAL_MPI_CONTEXT
_GLOBAL_MPI_CONTEXT = None
def get_backend(group):
"""Return the backend of given process group.
......
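A minimal usage sketch of the newly exported helper (not part of this commit; the wrapper function and its guard are assumptions) showing how a script would release the cached MPI context once collective work is done:

import dragon

def shutdown_distributed():
    # Drop the global MPI context held by the backend, if one was created.
    if dragon.distributed.is_initialized():
        dragon.distributed.finalize()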
......@@ -58,12 +58,16 @@ class DataReader(multiprocessing.Process):
"""
class PartBoundaries(object):
"""Record the boundary of current part."""
class BufferBound(object):
"""Record the boundary of current buffer."""
def __init__(self, start, end):
self.start, self.end = start, end
@property
def is_depleted(self):
return self.start == self.end
def __init__(self, **kwargs):
"""Create a ``DataReader``.
......@@ -91,13 +95,14 @@ class DataReader(multiprocessing.Process):
self._part_idx = kwargs.get('part_idx', 0)
self._num_parts = kwargs.get('num_parts', 1)
self._shuffle = kwargs.get('shuffle', False)
self._initial_fill = kwargs.get('initial_fill', 1024) if self._shuffle else 1
self._initial_fill = kwargs.get('initial_fill', 1024)
self._seed = kwargs.get('seed', config.config().random_seed)
self._stick_to_part = kwargs.get('stick_to_part', True)
self._first, self._cursor, self._last = 0, 0, 0
self._part_size = 0
self._num_examples = 0
self._example_buffer = []
self._parts = []
self._buffer_seq = []
self._buffer_bounds = []
self._reader_queue = None
def before_first(self):
......@@ -110,43 +115,45 @@ class DataReader(multiprocessing.Process):
self._cursor += 1
return self._dataset.get()
def reset(self, stick_to_part=False):
def reset(self):
"""Reset the environment of dataset."""
# Redirect to the adjacent part if available.
if not stick_to_part:
if not self._stick_to_part:
self._part_idx = (self._part_idx + 1) % self._num_parts
self._first = self._part_idx * self._part_size
self._last = min(self._first + self._part_size, self._num_examples)
self.before_first()
# Use the new boundaries to avoid sampling duplicates
# Use new boundary to avoid sampling duplicates
# when buffer size is greater than dataset size.
counter = self._parts[-1].end
self._parts.append(DataReader.PartBoundaries(counter, counter))
counter = self._buffer_bounds[-1].end
self._buffer_bounds.append(self.BufferBound(counter, counter))
def run(self):
"""Start the process."""
self._init_dataset()
# Persist a loop to read examples.
while True:
# Pop the depleted part if necessary.
if self._parts[0].start == self._parts[0].end:
self._parts.pop(0)
offset = 0
# Pop the depleted buffer if necessary.
if self._buffer_bounds[0].is_depleted:
self._buffer_bounds.pop(0)
pop_bound = self._buffer_bounds[0]
push_bound = self._buffer_bounds[-1]
pop_offset = 0
if self._shuffle:
# Sample a random offset if shuffle required.
offset = self._parts[0].end - self._parts[0].start
offset = int(numpy.random.uniform(high=offset))
# Choose a loaded example from the buffer.
i = self._parts[0].start % len(self._example_buffer)
j = (self._parts[0].start + offset) % len(self._example_buffer)
self._reader_queue.put(self._example_buffer[j])
self._example_buffer[j] = self._example_buffer[i]
# Load and push back a new example into the buffer.
k = self._parts[-1].end % len(self._example_buffer)
self._example_buffer[k] = self.next_example()
# Increase the part boundaries.
self._parts[-1].end += 1
self._parts[0].start += 1
# Sample a random offset.
pop_range = pop_bound.end - pop_bound.start
pop_offset = numpy.random.randint(0, pop_range)
# Pop an example from the buffer.
i = pop_bound.start % len(self._buffer_seq)
j = (pop_bound.start + pop_offset) % len(self._buffer_seq)
self._reader_queue.put(self._buffer_seq[j])
self._buffer_seq[j] = self._buffer_seq[i]
# Push an example into the buffer.
k = push_bound.end % len(self._buffer_seq)
self._buffer_seq[k] = self.next_example()
# Increase the buffer boundary.
push_bound.end += 1
pop_bound.start += 1
# Reset the cursor if necessary.
if self._cursor >= self._last:
self.reset()
......@@ -162,12 +169,12 @@ class DataReader(multiprocessing.Process):
# Determine the part specification.
self._num_examples = self._dataset.size
self._part_size = (self._num_examples + self._num_parts - 1) // self._num_parts
self._parts.append(DataReader.PartBoundaries(0, 0))
# Fill the initial buffer to support random sampling.
self.reset(stick_to_part=True)
for i in range(self._initial_fill):
self._example_buffer.append(self.next_example())
self._parts[-1].end += 1
self._buffer_bounds.append(self.BufferBound(0, 0))
self.reset()
for _ in range(self._initial_fill):
self._buffer_bounds[-1].end += 1
self._buffer_seq.append(self.next_example())
if self._cursor >= self._last:
self.reset()
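As a side note, the pop/push scheme that DataReader.run() now drives through BufferBound can be distilled into a standalone sketch (hypothetical helper names; plain lists and the standard random module assumed, not the commit's implementation):

import random

class BufferBound(object):
    """Record the boundary of current buffer."""
    def __init__(self, start, end):
        self.start, self.end = start, end

def buffer_step(buffer_seq, pop_bound, push_bound, next_example, shuffle=True):
    # Pop an example at a random offset inside the active boundary.
    offset = random.randrange(pop_bound.end - pop_bound.start) if shuffle else 0
    i = pop_bound.start % len(buffer_seq)
    j = (pop_bound.start + offset) % len(buffer_seq)
    example = buffer_seq[j]
    buffer_seq[j] = buffer_seq[i]  # keep the oldest slot reusable
    # Push a fresh example at the tail and advance both boundaries.
    k = push_bound.end % len(buffer_seq)
    buffer_seq[k] = next_example()
    push_bound.end += 1
    pop_bound.start += 1
    return example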
......@@ -117,20 +117,18 @@ def export(
if input_names is not None:
raise ValueError(
'Excepted the input names from <args>.\n'
'You should set the <input_names> to None.'
)
'You should set the <input_names> to None.')
inputs, input_names, args = \
list(args.values()), list(args.keys()), [args]
else:
inputs = args = nest.flatten(args)
# Run the model to get the outputs.
execute_ws = workspace.Workspace()
execute_ws.merge_from(workspace.get_workspace())
with execute_ws.as_default():
with tapes.Tape() as model_tape:
model_tape._exporting = True
outputs = model(*args)
graph_tape = tapes.Tape()
graph_tape._tracing = True # Enable tracing.
graph_tape._exporting = True # Enable exporting.
with graph_tape:
outputs = model(*args)
# Process the outputs
if isinstance(outputs, dict):
......@@ -159,7 +157,7 @@ def export(
graph_def.output.extend([output_names[i]])
# Add operators.
for op_def in model_tape.get_elements():
for op_def in graph_tape.get_elements():
ops_def.append(dragon_pb2.OperatorDef())
ops_def[-1].ParseFromString(op_def.SerializeAs())
graph_def.op.extend(ops_def)
......@@ -176,17 +174,16 @@ def export(
constants[k] = v
# Export.
with execute_ws.as_default():
model = graph_def_to_onnx_model(
graph_def=graph_def,
input_names=input_names,
output_names=output_names,
input_shapes=input_shapes,
constants=constants,
value_info=value_info,
opset_version=opset_version,
workspace=execute_ws,
verbose=verbose,
enable_onnx_checker=enable_onnx_checker,
)
serialization.save_bytes(serialization.serialize_proto(model), f)
model = graph_def_to_onnx_model(
graph_def=graph_def,
input_names=input_names,
output_names=output_names,
input_shapes=input_shapes,
constants=constants,
value_info=value_info,
opset_version=opset_version,
workspace=workspace.get_workspace(),
verbose=verbose,
enable_onnx_checker=enable_onnx_checker,
)
serialization.save_bytes(serialization.serialize_proto(model), f)
......@@ -14,90 +14,48 @@ from __future__ import division
from __future__ import print_function
import os
import setuptools
import setuptools.command.install
import shutil
import subprocess
import sys
import setuptools
import setuptools.command.build_py
import setuptools.command.install
try:
# Override a non-pure "wheel" for pybind distributions
# Override a non-pure "wheel" for pybind distributions.
from wheel.bdist_wheel import bdist_wheel as _bdist_wheel
class bdist_wheel(_bdist_wheel):
def finalize_options(self):
_bdist_wheel.finalize_options(self)
super(bdist_wheel, self).finalize_options()
self.root_is_pure = False
except ImportError:
bdist_wheel = None
# Read the current version info
version = git_version = None
with open('version.txt', 'r') as f:
version = f.read().strip()
try:
git_version = subprocess.check_output(
['git', 'rev-parse', 'HEAD'], cwd='../').decode('ascii').strip()
except (OSError, subprocess.CalledProcessError):
git_version = None
if os.path.exists('.git'):
try:
git_version = subprocess.check_output(
['git', 'rev-parse', 'HEAD'], cwd='../')
git_version = git_version.decode('ascii').strip()
except (OSError, subprocess.CalledProcessError):
pass
def clean():
"""Remove the work directories."""
def clean_builds():
"""Clean the builds."""
if os.path.exists('dragon/version.py'):
shutil.rmtree('dragon')
if os.path.exists('build/lib'):
shutil.rmtree('build/lib')
if os.path.exists('seeta_dragon.egg-info'):
shutil.rmtree('seeta_dragon.egg-info')
def configure():
"""Prepare the package files."""
clean()
# Create a temporary site-package directory.
shutil.copytree('python', 'dragon')
# Copy headers.
shutil.copytree('../targets/native/include', 'dragon/include')
# Copy "caffe" => "dragon.vm.caffe"
shutil.copytree('../caffe', 'dragon/vm/caffe')
# Copy "dali" => "dragon.vm.dali"
shutil.copytree('../dali', 'dragon/vm/dali')
# Copy "tensorflow" => "dragon.vm.tensorflow"
shutil.copytree('../tensorflow', 'dragon/vm/tensorflow')
# Copy "tensorlayer" => "dragon.vm.tensorlayer"
shutil.copytree('../tensorlayer', 'dragon/vm/tensorlayer')
# Copy "tensorrt/python" => "dragon.vm.tensorrt"
shutil.copytree('../tensorrt/python', 'dragon/vm/tensorrt')
# Copy "torch" => "dragon.vm.torch"
shutil.copytree('../torch', 'dragon/vm/torch')
# Copy "torchvision" => "dragon.vm.torchvision"
shutil.copytree('../torchvision', 'dragon/vm/torchvision')
# Copy the pre-built libraries.
if not os.path.exists('dragon/lib'):
os.makedirs('dragon/lib')
for src, dest in find_libraries().items():
if os.path.exists(src):
shutil.copy(src, dest)
else:
print('ERROR: Unable to find the library at <%s>.\n'
'Build it before installing to package.' % src)
shutil.rmtree('dragon')
sys.exit()
# Write the version file.
with open('dragon/version.py', 'w') as f:
f.write("from __future__ import absolute_import\n"
"from __future__ import division\n"
"from __future__ import print_function\n\n"
"version = '{}'\n"
"git_version = '{}'\n".format(version, git_version))
class install(setuptools.command.install.install):
"""Old-style command to prevent from installing egg."""
def run(self):
setuptools.command.install.install.run(self)
def find_libraries():
"""Return the pre-built libraries."""
in_prefix = '' if sys.platform == 'win32' else 'lib'
......@@ -110,7 +68,7 @@ def find_libraries():
'../targets/native/lib/{}dragon{}'.format(in_prefix, in_suffix):
'dragon/lib/{}dragon{}'.format(in_prefix, in_suffix),
'../targets/native/lib/{}dragon_python{}'.format(in_prefix, in_suffix):
'dragon/lib/libdragon_python{}'.format(out_suffix)
'dragon/lib/libdragon_python{}'.format(out_suffix),
}
if sys.platform == 'win32':
libraries['../targets/native/lib/dragon.lib'] = 'dragon/lib/dragon.lib'
......@@ -118,30 +76,74 @@ def find_libraries():
return libraries
def find_packages():
def find_packages(top):
"""Return the python sources installed to package."""
packages = []
for root, _, files in os.walk('dragon'):
for root, _, _ in os.walk(top):
if os.path.exists(os.path.join(root, '__init__.py')):
packages.append(root)
return packages
def find_package_data():
def find_package_data(top):
"""Return the external data installed to package."""
headers, libraries = [], []
for root, _, files in os.walk('dragon/include'):
root = root[len('dragon/'):]
for root, _, files in os.walk(top + '/include'):
root = root[len(top + '/'):]
for file in files:
headers.append(os.path.join(root, file))
for root, _, files in os.walk('dragon/lib'):
root = root[len('dragon/'):]
for root, _, files in os.walk(top + '/lib'):
root = root[len(top + '/'):]
for file in files:
libraries.append(os.path.join(root, file))
return headers + libraries
configure()
class BuildPyCommand(setuptools.command.build_py.build_py):
"""Enhanced 'build_py' command."""
def build_packages(self):
clean_builds()
shutil.copytree('python', 'dragon')
shutil.copytree('../caffe', 'dragon/vm/caffe')
shutil.copytree('../dali', 'dragon/vm/dali')
shutil.copytree('../tensorflow', 'dragon/vm/tensorflow')
shutil.copytree('../tensorlayer', 'dragon/vm/tensorlayer')
shutil.copytree('../tensorrt/python', 'dragon/vm/tensorrt')
shutil.copytree('../torch', 'dragon/vm/torch')
shutil.copytree('../torchvision', 'dragon/vm/torchvision')
with open('dragon/version.py', 'w') as f:
f.write("from __future__ import absolute_import\n"
"from __future__ import division\n"
"from __future__ import print_function\n\n"
"version = '{}'\n"
"git_version = '{}'\n".format(version, git_version))
self.packages = find_packages('dragon')
super(BuildPyCommand, self).build_packages()
def build_package_data(self):
shutil.copytree('../targets/native/include', 'dragon/include')
if not os.path.exists('dragon/lib'):
os.makedirs('dragon/lib')
for src, dest in find_libraries().items():
if os.path.exists(src):
shutil.copy(src, dest)
else:
print('ERROR: Unable to find the library at <%s>.\n'
'Build it before installing to package.' % src)
sys.exit()
self.package_data = {'dragon': find_package_data('dragon')}
super(BuildPyCommand, self).build_package_data()
class InstallCommand(setuptools.command.install.install):
"""Enhanced 'install' command."""
def run(self):
# Old-style install instead of egg.
super(InstallCommand, self).run()
setuptools.setup(
name='seeta-dragon',
version=version,
......@@ -150,31 +152,30 @@ setuptools.setup(
url='https://github.com/seetaresearch/dragon',
author='SeetaTech',
license='BSD 2-Clause',
packages=find_packages(),
package_data={'dragon': find_package_data()},
packages=find_packages('python'),
package_dir={'dragon': 'dragon'},
cmdclass={'bdist_wheel': bdist_wheel, 'install': install},
cmdclass={'bdist_wheel': bdist_wheel,
'build_py': BuildPyCommand,
'install': InstallCommand},
python_requires='>=3.6',
install_requires=['numpy', 'protobuf', 'kpl-dataset'],
classifiers=[
'Development Status :: 5 - Production/Stable',
'Intended Audience :: Developers',
'Intended Audience :: Education',
'Intended Audience :: Science/Research',
'License :: OSI Approved :: BSD License',
'Programming Language :: C++',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3 :: Only',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Topic :: Scientific/Engineering',
'Topic :: Scientific/Engineering :: Mathematics',
'Topic :: Scientific/Engineering :: Artificial Intelligence',
'Topic :: Software Development',
'Topic :: Software Development :: Libraries',
'Topic :: Software Development :: Libraries :: Python Modules',
],
classifiers=['Development Status :: 5 - Production/Stable',
'Intended Audience :: Developers',
'Intended Audience :: Education',
'Intended Audience :: Science/Research',
'License :: OSI Approved :: BSD License',
'Programming Language :: C++',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3 :: Only',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Topic :: Scientific/Engineering',
'Topic :: Scientific/Engineering :: Mathematics',
'Topic :: Scientific/Engineering :: Artificial Intelligence',
'Topic :: Software Development',
'Topic :: Software Development :: Libraries',
'Topic :: Software Development :: Libraries :: Python Modules'],
)
clean()
clean_builds()
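For reference, the same pattern in a minimal standalone sketch (the package name 'mypkg', the source layout and the library path are placeholders, not from this commit): stage sources and prebuilt binaries inside overridden 'build_py' hooks, so that a plain 'python setup.py build_py bdist_wheel' or 'python setup.py install' collects everything without a separate configure step.

import os
import shutil
import setuptools
import setuptools.command.build_py

class BuildPyCommand(setuptools.command.build_py.build_py):
    """Stage generated sources and binaries before building."""

    def build_packages(self):
        if not os.path.exists('mypkg'):
            shutil.copytree('src/mypkg', 'mypkg')  # stage python sources
        self.packages = ['mypkg']
        super(BuildPyCommand, self).build_packages()

    def build_package_data(self):
        if not os.path.exists('mypkg/lib'):
            os.makedirs('mypkg/lib')
        shutil.copy('prebuilt/libfoo.so', 'mypkg/lib')  # stage a prebuilt library
        self.package_data = {'mypkg': ['lib/libfoo.so']}
        super(BuildPyCommand, self).build_package_data()

setuptools.setup(
    name='mypkg',
    version='0.1',
    packages=['mypkg'],
    cmdclass={'build_py': BuildPyCommand},
)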
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef DRAGON_UTILS_MATH_SORT_H_
#define DRAGON_UTILS_MATH_SORT_H_
#include "dragon/core/context.h"
namespace dragon {
namespace math {
/*
* Sort Utilities.
*/
namespace utils {
template <typename T>
inline void ArgPartition(
const int count,
const int kth,
const bool descend,
const T* v,
vec64_t& indices) {
indices.resize(count);
std::iota(indices.begin(), indices.end(), 0);
if (descend) {
std::nth_element(
indices.begin(),
indices.begin() + kth,
indices.end(),
[&v](int64_t lhs, int64_t rhs) { return v[lhs] > v[rhs]; });
} else {
std::nth_element(
indices.begin(),
indices.begin() + kth,
indices.end(),
[&v](int64_t lhs, int64_t rhs) { return v[lhs] < v[rhs]; });
}
}
} // namespace utils
} // namespace math
} // namespace dragon
#endif // DRAGON_UTILS_MATH_SORT_H_
......@@ -54,10 +54,14 @@ void _AffineImpl(
const T* scale,
const T* bias,
T* y) {
if (num_dims == 2 && num_axes == 1 && axes[0] == 1) {
_AffineChannel(dims[0], dims[1], x, scale, bias, y);
if (num_dims == 1 && num_axes == 1 && axes[0] == 0) {
_AffineChannel(1, dims[0], x, scale, bias, y); // [NxC]
} else if (num_dims == 2 && num_axes == 1 && axes[0] == 1) {
_AffineChannel(dims[0], dims[1], x, scale, bias, y); // [N, C]
} else if (num_dims == 2 && num_axes == 1 && axes[0] == 0) {
_AffineChannel(1, dims[0], dims[1], x, scale, bias, y); // [NxC, S]
} else if (num_dims == 3 && num_axes == 1 && axes[0] == 1) {
_AffineChannel(dims[0], dims[1], dims[2], x, scale, bias, y);
_AffineChannel(dims[0], dims[1], dims[2], x, scale, bias, y); // [N, C, S]
} else {
LOG(FATAL) << "Unsupported affine dimensions.";
}
......
......@@ -65,12 +65,18 @@ void _AffineImpl(
T* y,
CUDAContext* ctx) {
const auto N = math::utils::Prod(num_dims, dims);
if (num_dims == 2 && num_axes == 1 && axes[0] == 1) {
if (num_dims == 1 && num_axes == 1 && axes[0] == 0) {
_AffineChannel<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, dims[1], x, scale, bias, y);
N, dims[0], x, scale, bias, y); // [NxC]
} else if (num_dims == 2 && num_axes == 1 && axes[0] == 1) {
_AffineChannel<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, dims[1], x, scale, bias, y); // [N, C]
} else if (num_dims == 2 && num_axes == 1 && axes[0] == 0) {
_AffineChannel<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, dims[0], dims[1], x, scale, bias, y); // [NxC, S]
} else if (num_dims == 3 && num_axes == 1 && axes[0] == 1) {
_AffineChannel<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, dims[1], dims[2], x, scale, bias, y);
N, dims[1], dims[2], x, scale, bias, y); // [N, C, S]
} else {
LOG(FATAL) << "Unsupported affine dimensions.";
}
......
......@@ -40,19 +40,19 @@ class ScalarType<float16> {
#endif
template <typename T>
class AccmulatorType {
class AccumulatorType {
public:
typedef float type;
};
template <>
class AccmulatorType<int64_t> {
class AccumulatorType<int64_t> {
public:
typedef double type;
};
template <>
class AccmulatorType<double> {
class AccumulatorType<double> {
public:
typedef double type;
};
......
......@@ -20,7 +20,6 @@
#include "dragon/utils/math/functional.h"
#include "dragon/utils/math/random.h"
#include "dragon/utils/math/reduce.h"
#include "dragon/utils/math/sort.h"
#include "dragon/utils/math/transform.h"
#include "dragon/utils/math/transpose.h"
#include "dragon/utils/math/types.h"
......
......@@ -46,6 +46,10 @@ class TestBackend(unittest.TestCase):
with group.as_default():
self.assertEqual(dragon.distributed.get_rank(group), 0)
@unittest.skipIf(not TEST_MPI, 'MPI unavailable')
def test_finalize(self):
dragon.distributed.finalize()
if __name__ == '__main__':
run_tests()
......@@ -39,13 +39,14 @@ class FunctionGuard(function_lib.FunctionGuard):
if not isinstance(input, Tensor) and input_spec is None:
inputs.append(input)
continue
shape = getattr(input, 'shape', None)
dtype = getattr(input, 'dtype', None)
device = getattr(input, 'device', None)
if input_spec is not None:
device = input_spec['device']
shape, dtype = input_spec['shape'], input_spec['dtype']
inputs.append(Tensor(*shape, dtype=dtype, device=device))
input_spec = input_spec or {}
for k in ('shape', 'dtype', 'device'):
input_spec[k] = getattr(input, k, input_spec.get(k, None))
inputs.append(Tensor(*input_spec['shape'],
dtype=input_spec['dtype'],
device=input_spec['device']))
if isinstance(input, Tensor):
inputs[-1].copy_(input)
with tapes.Tape() as function_tape:
function_tape._tracing = True
attributes['inputs'] = inputs
......
......@@ -3299,7 +3299,6 @@ class Tensor(object):
def __del__(self):
if self._deleter:
# print(self._impl.name)
self._deleter.release(self._impl.name)
def __eq__(self, other):
......