Commit 1ad360e9 by Ting PAN

Add tests of operator spec for AutoGraph

Summary:
This commit tests the correctness of the shape and data type inference
performed by the autograph module (see the sketch below the change summary).
1 parent 1bd78a3c
Showing with 1313 additions and 1031 deletions
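The kind of check this commit adds can be illustrated with a minimal sketch. It is not taken from the commit itself; the module paths, the graph-mode context manager, and the constructor/operator signatures (dragon.Tensor, dragon.reshape, dragon.graph_mode) are assumptions about Dragon's Python API and may differ from the real test files.

```python
# Hypothetical operator-spec test: build a symbolic tensor, apply an op,
# and check that shape and dtype were inferred without running the kernel.
# All Dragon API names below are assumptions, not confirmed by this commit.
import unittest

import dragon


class TestOpSpec(unittest.TestCase):
    """Verify symbolic shape/dtype inference done by the autograph module."""

    def test_reshape_spec(self):
        with dragon.graph_mode():  # assumed: symbolic graph, no eager compute
            x = dragon.Tensor(shape=[2, 3], dtype='float32')
            y = dragon.reshape(x, shape=[3, 2])
        # The spec registered for Reshape should propagate both attributes.
        self.assertEqual(y.shape, [3, 2])
        self.assertEqual(y.dtype, 'float32')


if __name__ == '__main__':
    unittest.main()
```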
......@@ -7,16 +7,16 @@ namespace kernel {
namespace {
template <typename Tx, typename Ty>
template <typename InputT, typename OutputT>
void _ChannelNormalize(
const int axis,
const int num_dims,
const int64_t* x_strides,
const int64_t* y_dims,
const Tx* x,
const InputT* x,
const float* mean,
const float* std,
Ty* y) {
OutputT* y) {
const auto count =
std::accumulate(y_dims, y_dims + num_dims, 1, std::multiplies<int64_t>());
vec64_t idx(num_dims, 0);
......@@ -27,7 +27,8 @@ void _ChannelNormalize(
xi += idx[d] * x_strides[d];
if (d == axis) wi = idx[d];
}
y[yi] = ((Ty)x[xi] - (Ty)mean[wi]) / (Ty)std[wi];
y[yi] =
convert::To<OutputT>((convert::To<float>(x[xi]) - mean[wi]) / std[wi]);
math::utils::IncreaseIndexInDims(num_dims, y_dims, idx.data());
}
}
......@@ -36,83 +37,43 @@ void _ChannelNormalize(
/* ------------------- Launcher Separator ------------------- */
template <>
void ChannelNormalize<float16, float16, CPUContext>(
const int axis,
const int num_dims,
const int64_t* x_strides,
const int64_t* y_dims,
const float16* x,
const float* mean,
const float* std,
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
#define DEFINE_KERNEL_LAUNCHER(Tx, Ty) \
#define DEFINE_KERNEL_LAUNCHER(InputT, OutputT) \
template <> \
void ChannelNormalize<Tx, Ty, CPUContext>( \
void ChannelNormalize<InputT, OutputT, CPUContext>( \
const int axis, \
const int num_dims, \
const int64_t* x_strides, \
const int64_t* y_dims, \
const Tx* x, \
const InputT* x, \
const float* mean, \
const float* std, \
Ty* y, \
OutputT* y, \
CPUContext* ctx) { \
_ChannelNormalize(axis, num_dims, x_strides, y_dims, x, mean, std, y); \
}
#define DEFINE_FP16_KERNEL_LAUNCHER(T) \
template <> \
void ChannelNormalize<float16, T, CPUContext>( \
const int axis, \
const int num_dims, \
const int64_t* x_strides, \
const int64_t* y_dims, \
const float16* x, \
const float* mean, \
const float* std, \
T* y, \
CPUContext* ctx) { \
CPU_FP16_NOT_SUPPORTED; \
} \
template <> \
void ChannelNormalize<T, float16, CPUContext>( \
const int axis, \
const int num_dims, \
const int64_t* x_strides, \
const int64_t* y_dims, \
const T* x, \
const float* mean, \
const float* std, \
float16* y, \
CPUContext* ctx) { \
CPU_FP16_NOT_SUPPORTED; \
}
DEFINE_KERNEL_LAUNCHER(int8_t, float16);
DEFINE_KERNEL_LAUNCHER(int8_t, float);
DEFINE_KERNEL_LAUNCHER(int8_t, double);
DEFINE_KERNEL_LAUNCHER(uint8_t, float16);
DEFINE_KERNEL_LAUNCHER(uint8_t, float);
DEFINE_KERNEL_LAUNCHER(uint8_t, double);
DEFINE_KERNEL_LAUNCHER(int, float16);
DEFINE_KERNEL_LAUNCHER(int, float);
DEFINE_KERNEL_LAUNCHER(int, double);
DEFINE_KERNEL_LAUNCHER(int64_t, float16);
DEFINE_KERNEL_LAUNCHER(int64_t, float);
DEFINE_KERNEL_LAUNCHER(int64_t, double);
DEFINE_KERNEL_LAUNCHER(float16, float16);
DEFINE_KERNEL_LAUNCHER(float16, float);
DEFINE_KERNEL_LAUNCHER(float16, double);
DEFINE_KERNEL_LAUNCHER(float, float16);
DEFINE_KERNEL_LAUNCHER(float, float);
DEFINE_KERNEL_LAUNCHER(float, double);
DEFINE_KERNEL_LAUNCHER(double, float16);
DEFINE_KERNEL_LAUNCHER(double, float);
DEFINE_KERNEL_LAUNCHER(double, double);
DEFINE_FP16_KERNEL_LAUNCHER(int8_t);
DEFINE_FP16_KERNEL_LAUNCHER(uint8_t);
DEFINE_FP16_KERNEL_LAUNCHER(int);
DEFINE_FP16_KERNEL_LAUNCHER(int64_t);
DEFINE_FP16_KERNEL_LAUNCHER(float);
DEFINE_FP16_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_FP16_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -10,44 +10,23 @@ namespace kernel {
namespace {
template <typename Tx, typename Ty, int D>
__global__ void _ChannelNormalize(
const int nthreads,
const int axis,
const int num_dims,
const SimpleArray<int, D> x_strides,
const SimpleArray<int, D> y_dims,
const Tx* x,
const float* mean,
const float* std,
Ty* y) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
int xi = 0, wi, tmp = yi;
for (int d = num_dims - 1; d >= 0; --d) {
int r;
FIXED_DIVISOR_DIV_MOD(y_dims.data[d], tmp, &tmp, &r);
xi += r * x_strides.data[d];
if (d == axis) wi = r;
}
#if __CUDA_ARCH__ >= 350
y[yi] = ((Ty)x[xi] - (Ty)__ldg(mean + wi)) / (Ty)__ldg(std + wi);
#define LDG(x, i) __ldg(x + i)
#else
y[yi] = ((Ty)x[xi] - (Ty)mean[wi]) / (Ty)std[wi];
#define LDG(x, i) x[i]
#endif
}
}
template <typename T, int D>
__global__ void _ChannelNormalizeHalf(
template <typename InputT, typename OutputT, int D>
__global__ void _ChannelNormalize(
const int nthreads,
const int axis,
const int num_dims,
const SimpleArray<int, D> x_strides,
const SimpleArray<int, D> y_dims,
const T* x,
const InputT* x,
const float* mean,
const float* std,
half* y) {
OutputT* y) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
int xi = 0, wi, tmp = yi;
for (int d = num_dims - 1; d >= 0; --d) {
......@@ -56,119 +35,28 @@ __global__ void _ChannelNormalizeHalf(
xi += r * x_strides.data[d];
if (d == axis) wi = r;
}
#if __CUDA_ARCH__ >= 350
y[yi] = __float2half(((float)x[xi] - __ldg(mean + wi)) / __ldg(std + wi));
#else
y[yi] = __float2half(((float)x[xi] - mean[wi]) / std[wi]);
#endif
y[yi] = convert::To<OutputT>(
(convert::To<float>(x[xi]) - LDG(mean, wi)) / LDG(std, wi));
}
}
template <typename T, int D>
__global__ void _ChannelNormalizeHalf(
const int nthreads,
const int axis,
const int num_dims,
const SimpleArray<int, D> x_strides,
const SimpleArray<int, D> y_dims,
const half* x,
const float* mean,
const float* std,
T* y) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
int xi = 0, wi, tmp = yi;
for (int d = num_dims - 1; d >= 0; --d) {
int r;
FIXED_DIVISOR_DIV_MOD(y_dims.data[d], tmp, &tmp, &r);
xi += r * x_strides.data[d];
if (d == axis) wi = r;
}
#if __CUDA_ARCH__ >= 350
y[yi] = (T)((__half2float(x[xi]) - __ldg(mean + wi)) / __ldg(std + wi));
#else
y[yi] = (T)((__half2float(x[xi]) - mean[wi]) / std[wi]);
#endif
}
}
template <int D>
__global__ void _ChannelNormalizeHalfAndHalf(
const int nthreads,
const int axis,
const int num_dims,
const SimpleArray<int, D> x_strides,
const SimpleArray<int, D> y_dims,
const half* x,
const float* mean,
const float* std,
half* y) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
int xi = 0, wi, tmp = yi;
for (int d = num_dims - 1; d >= 0; --d) {
int r;
FIXED_DIVISOR_DIV_MOD(y_dims.data[d], tmp, &tmp, &r);
xi += r * x_strides.data[d];
if (d == axis) wi = r;
}
#if __CUDA_ARCH__ >= 350
y[yi] = __float2half(
((__half2float(x[xi]) - __ldg(mean + wi)) / __ldg(std + wi)));
#else
y[yi] = __float2half(((__half2float(x[xi]) - mean[wi]) / std[wi]));
#endif
}
}
#undef LDG
} // namespace
/* ------------------- Launcher Separator ------------------- */
template <>
void ChannelNormalize<float16, float16, CUDAContext>(
const int axis,
const int num_dims,
const int64_t* x_strides,
const int64_t* y_dims,
const float16* x,
const float* mean,
const float* std,
float16* y,
CUDAContext* ctx) {
CUDA_TENSOR_DIMS_CHECK(num_dims);
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_strides, Y_dims;
const auto nthreads =
std::accumulate(y_dims, y_dims + num_dims, 1, std::multiplies<int64_t>());
for (int i = 0; i < num_dims; ++i) {
X_strides.data[i] = x_strides[i];
Y_dims.data[i] = y_dims[i];
}
_ChannelNormalizeHalfAndHalf<<<
CUDA_BLOCKS(nthreads),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(
nthreads,
axis,
num_dims,
X_strides,
Y_dims,
reinterpret_cast<const half*>(x),
mean,
std,
reinterpret_cast<half*>(y));
}
#define DEFINE_KERNEL_LAUNCHER(Tx, Ty) \
#define DEFINE_KERNEL_LAUNCHER(InputT, OutputT) \
template <> \
void ChannelNormalize<Tx, Ty, CUDAContext>( \
void ChannelNormalize<InputT, OutputT, CUDAContext>( \
const int axis, \
const int num_dims, \
const int64_t* x_strides, \
const int64_t* y_dims, \
const Tx* x, \
const InputT* x, \
const float* mean, \
const float* std, \
Ty* y, \
OutputT* y, \
CUDAContext* ctx) { \
CUDA_TENSOR_DIMS_CHECK(num_dims); \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_strides, Y_dims; \
......@@ -186,96 +74,28 @@ void ChannelNormalize<float16, float16, CUDAContext>(
nthreads, axis, num_dims, X_strides, Y_dims, x, mean, std, y); \
}
#define DEFINE_FP16_KERNEL_LAUNCHER(T) \
template <> \
void ChannelNormalize<float16, T, CUDAContext>( \
const int axis, \
const int num_dims, \
const int64_t* x_strides, \
const int64_t* y_dims, \
const float16* x, \
const float* mean, \
const float* std, \
T* y, \
CUDAContext* ctx) { \
CUDA_TENSOR_DIMS_CHECK(num_dims); \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_strides, Y_dims; \
const auto nthreads = std::accumulate( \
y_dims, y_dims + num_dims, 1, std::multiplies<int64_t>()); \
for (int i = 0; i < num_dims; ++i) { \
X_strides.data[i] = x_strides[i]; \
Y_dims.data[i] = y_dims[i]; \
} \
_ChannelNormalizeHalf<<< \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
nthreads, \
axis, \
num_dims, \
X_strides, \
Y_dims, \
reinterpret_cast<const half*>(x), \
mean, \
std, \
y); \
} \
template <> \
void ChannelNormalize<T, float16, CUDAContext>( \
const int axis, \
const int num_dims, \
const int64_t* x_strides, \
const int64_t* y_dims, \
const T* x, \
const float* mean, \
const float* std, \
float16* y, \
CUDAContext* ctx) { \
CUDA_TENSOR_DIMS_CHECK(num_dims); \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_strides, Y_dims; \
const auto nthreads = std::accumulate( \
y_dims, y_dims + num_dims, 1, std::multiplies<int64_t>()); \
for (int i = 0; i < num_dims; ++i) { \
X_strides.data[i] = x_strides[i]; \
Y_dims.data[i] = y_dims[i]; \
} \
_ChannelNormalizeHalf<<< \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
nthreads, \
axis, \
num_dims, \
X_strides, \
Y_dims, \
x, \
mean, \
std, \
reinterpret_cast<half*>(y)); \
}
DEFINE_KERNEL_LAUNCHER(int8_t, float16);
DEFINE_KERNEL_LAUNCHER(int8_t, float);
DEFINE_KERNEL_LAUNCHER(int8_t, double);
DEFINE_KERNEL_LAUNCHER(uint8_t, float16);
DEFINE_KERNEL_LAUNCHER(uint8_t, float);
DEFINE_KERNEL_LAUNCHER(uint8_t, double);
DEFINE_KERNEL_LAUNCHER(int, float16);
DEFINE_KERNEL_LAUNCHER(int, float);
DEFINE_KERNEL_LAUNCHER(int, double);
DEFINE_KERNEL_LAUNCHER(int64_t, float16);
DEFINE_KERNEL_LAUNCHER(int64_t, float);
DEFINE_KERNEL_LAUNCHER(int64_t, double);
DEFINE_KERNEL_LAUNCHER(float16, float16);
DEFINE_KERNEL_LAUNCHER(float16, float);
DEFINE_KERNEL_LAUNCHER(float16, double);
DEFINE_KERNEL_LAUNCHER(float, float16);
DEFINE_KERNEL_LAUNCHER(float, float);
DEFINE_KERNEL_LAUNCHER(float, double);
DEFINE_KERNEL_LAUNCHER(double, float16);
DEFINE_KERNEL_LAUNCHER(double, float);
DEFINE_KERNEL_LAUNCHER(double, double);
DEFINE_FP16_KERNEL_LAUNCHER(int8_t);
DEFINE_FP16_KERNEL_LAUNCHER(uint8_t);
DEFINE_FP16_KERNEL_LAUNCHER(int);
DEFINE_FP16_KERNEL_LAUNCHER(int64_t);
DEFINE_FP16_KERNEL_LAUNCHER(float);
DEFINE_FP16_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_FP16_KERNEL_LAUNCHER
} // namespace kernel
......
......@@ -7,51 +7,51 @@ namespace kernel {
namespace {
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void _NLLLoss(
const int outer_dim,
const int inner_dim,
const int axis_dim,
const int ignore_index,
const LogitType* logit,
const TargetType* target,
LogitType* loss,
LogitType* mask) {
const LogitT* logit,
const TargetT* target,
LogitT* loss,
LogitT* mask) {
std::array<int, 2> idx = {0, 0};
std::array<int, 2> dims = {outer_dim, inner_dim};
int count = dims[0] * dims[1], k;
for (int i = 0; i < count; ++i) {
const int label = (int)target[i];
if (label == ignore_index) {
loss[i] = mask[i] = LogitType(0);
loss[i] = mask[i] = LogitT(0);
} else {
k = (idx[0] * axis_dim + label) * inner_dim + idx[1];
loss[i] = -logit[k], mask[i] = LogitType(1);
loss[i] = -logit[k], mask[i] = LogitT(1);
}
math::utils::IncreaseIndexInDims(2, dims.data(), idx.data());
}
}
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void _NLLLossGrad(
const int outer_dim,
const int inner_dim,
const int axis_dim,
const int ignore_index,
const LogitType* logit,
const TargetType* target,
LogitType* dlogit,
LogitType* mask) {
const LogitT* logit,
const TargetT* target,
LogitT* dlogit,
LogitT* mask) {
std::array<int, 2> idx = {0, 0};
std::array<int, 2> dims = {outer_dim, inner_dim};
int count = dims[0] * dims[1], k;
for (int i = 0; i < count; ++i) {
const int label = (int)target[i];
if (label == ignore_index) {
mask[i] = LogitType(0);
mask[i] = LogitT(0);
} else {
k = (idx[0] * axis_dim + label) * inner_dim + idx[1];
dlogit[k] = LogitType(-1), mask[i] = LogitType(1);
dlogit[k] = LogitT(-1), mask[i] = LogitT(1);
}
math::utils::IncreaseIndexInDims(2, dims.data(), idx.data());
}
......@@ -61,17 +61,17 @@ void _NLLLossGrad(
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(name, LogitType, TargetType) \
#define DEFINE_KERNEL_LAUNCHER(name, LogitT, TargetT) \
template <> \
void name<LogitType, TargetType, CPUContext>( \
void name<LogitT, TargetT, CPUContext>( \
const int outer_dim, \
const int inner_dim, \
const int axis_dim, \
const int ignore_index, \
const LogitType* logit, \
const TargetType* target, \
LogitType* loss, \
LogitType* mask, \
const LogitT* logit, \
const TargetT* target, \
LogitT* loss, \
LogitT* mask, \
CPUContext* ctx) { \
_##name( \
outer_dim, \
......
......@@ -9,48 +9,48 @@ namespace kernel {
namespace {
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
__global__ void _NLLLoss(
const int nthreads,
const int inner_dim,
const int axis_dim,
const int ignore_index,
const LogitType* logit,
const TargetType* target,
LogitType* loss,
LogitType* mask) {
const LogitT* logit,
const TargetT* target,
LogitT* loss,
LogitT* mask) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int i = yi / inner_dim;
const int j = yi % inner_dim;
const int label = target[i * inner_dim + j];
if (label == ignore_index) {
loss[yi] = mask[yi] = LogitType(0);
loss[yi] = mask[yi] = LogitT(0);
} else {
loss[yi] = -logit[(i * axis_dim + label) * inner_dim + j];
mask[yi] = LogitType(1);
mask[yi] = LogitT(1);
}
}
}
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
__global__ void _NLLLossGrad(
const int nthreads,
const int inner_dim,
const int axis_dim,
const int ignore_index,
const LogitType* logit,
const TargetType* target,
LogitType* dlogit,
LogitType* mask) {
const LogitT* logit,
const TargetT* target,
LogitT* dlogit,
LogitT* mask) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int i = yi / inner_dim;
const int j = yi % inner_dim;
const int label = target[i * inner_dim + j];
if (label == ignore_index) {
mask[yi] = LogitType(0);
mask[yi] = LogitT(0);
} else {
dlogit[(i * axis_dim + label) * inner_dim + j] = LogitType(-1);
mask[yi] = LogitType(1);
dlogit[(i * axis_dim + label) * inner_dim + j] = LogitT(-1);
mask[yi] = LogitT(1);
}
}
}
......@@ -59,17 +59,17 @@ __global__ void _NLLLossGrad(
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(name, LogitType, TargetType) \
#define DEFINE_KERNEL_LAUNCHER(name, LogitT, TargetT) \
template <> \
void name<LogitType, TargetType, CUDAContext>( \
void name<LogitT, TargetT, CUDAContext>( \
const int outer_dim, \
const int inner_dim, \
const int axis_dim, \
const int ignore_index, \
const LogitType* logit, \
const TargetType* target, \
LogitType* loss, \
LogitType* mask, \
const LogitT* logit, \
const TargetT* target, \
LogitT* loss, \
LogitT* mask, \
CUDAContext* ctx) { \
const auto nthreads = outer_dim * inner_dim; \
_##name<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
......
......@@ -7,19 +7,19 @@ namespace kernel {
namespace {
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void _SigmoidFocalLoss(
const int outer_dim,
const int inner_dim,
const int axis_dim,
const LogitType pos_alpha,
const LogitType neg_alpha,
const LogitType gamma,
const LogitT pos_alpha,
const LogitT neg_alpha,
const LogitT gamma,
const int negative_index,
const LogitType* logit,
const TargetType* target,
LogitType* loss,
LogitType* mask) {
const LogitT* logit,
const TargetT* target,
LogitT* loss,
LogitT* mask) {
std::array<int, 3> idx = {0, 0, 0};
std::array<int, 3> dims = {outer_dim, axis_dim, inner_dim};
const int count = dims[0] * dims[1] * dims[2];
......@@ -27,23 +27,21 @@ void _SigmoidFocalLoss(
for (int i = 0; i < count; ++i) {
const int t = (int)target[idx[0] * inner_dim + idx[2]];
// "0" is reserved for target if negative index is zero
LogitType c1 = (LogitType)(t == (idx[1] + (negative_index ? 0 : 1)));
LogitType c2 =
(LogitType)((t >= 0) & (t != (idx[1] + (negative_index ? 0 : 1))));
LogitType p = LogitType(1) / (LogitType(1) + std::exp(-logit[i]));
LogitT c1 = (LogitT)(t == (idx[1] + (negative_index ? 0 : 1)));
LogitT c2 = (LogitT)((t >= 0) & (t != (idx[1] + (negative_index ? 0 : 1))));
LogitT p = LogitT(1) / (LogitT(1) + std::exp(-logit[i]));
// (1 - p)^{gamma} * log(p)
LogitType pos_term = std::pow(LogitType(1) - p, gamma) *
std::log(std::max(p, (LogitType)FLT_MIN));
LogitT pos_term =
std::pow(LogitT(1) - p, gamma) * std::log(std::max(p, (LogitT)FLT_MIN));
// p^{gamma} * log(1 - p)
LogitType neg_term = std::pow(p, gamma) *
LogitT neg_term = std::pow(p, gamma) *
(-logit[i] * (logit[i] >= 0) -
std::log(
LogitType(1) +
std::exp(logit[i] - 2 * logit[i] * (logit[i] >= 0))));
LogitT(1) + std::exp(logit[i] - 2 * logit[i] * (logit[i] >= 0))));
loss[i] = LogitType(0);
loss[i] = LogitT(0);
loss[i] += -c1 * pos_term * pos_alpha;
loss[i] += -c2 * neg_term * neg_alpha;
mask[i] = c1;
......@@ -52,19 +50,19 @@ void _SigmoidFocalLoss(
}
}
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void _SigmoidFocalLossGrad(
const int outer_dim,
const int inner_dim,
const int axis_dim,
const LogitType pos_alpha,
const LogitType neg_alpha,
const LogitType gamma,
const LogitT pos_alpha,
const LogitT neg_alpha,
const LogitT gamma,
const int negative_index,
const LogitType* logit,
const TargetType* target,
LogitType* dx,
LogitType* mask) {
const LogitT* logit,
const TargetT* target,
LogitT* dx,
LogitT* mask) {
std::array<int, 3> idx = {0, 0, 0};
std::array<int, 3> dims = {outer_dim, axis_dim, inner_dim};
const int count = dims[0] * dims[1] * dims[2];
......@@ -72,26 +70,24 @@ void _SigmoidFocalLossGrad(
for (int i = 0; i < count; ++i) {
const int t = (int)target[idx[0] * inner_dim + idx[2]];
// "0" is reserved for target if negative index is zero
LogitType c1 = (LogitType)(t == (idx[1] + (negative_index ? 0 : 1)));
LogitType c2 =
(LogitType)((t >= 0) & (t != (idx[1] + (negative_index ? 0 : 1))));
LogitType p = LogitType(1) / (LogitType(1) + std::exp(-logit[i]));
LogitT c1 = (LogitT)(t == (idx[1] + (negative_index ? 0 : 1)));
LogitT c2 = (LogitT)((t >= 0) & (t != (idx[1] + (negative_index ? 0 : 1))));
LogitT p = LogitT(1) / (LogitT(1) + std::exp(-logit[i]));
// (1 - p)^{gamma} * (1 - p - gamma * p * log(p))
LogitType pos_term = std::pow(LogitType(1) - p, gamma) *
(LogitType(1) - p -
p * gamma * std::log(std::max(p, (LogitType)FLT_MIN)));
LogitT pos_term = std::pow(LogitT(1) - p, gamma) *
(LogitT(1) - p - p * gamma * std::log(std::max(p, (LogitT)FLT_MIN)));
// p^{gamma} * (gamma * (1 - p) * log(1-p) - p)
LogitType neg_term = std::pow(p, gamma) *
LogitT neg_term = std::pow(p, gamma) *
((-logit[i] * (logit[i] >= 0) -
std::log(
LogitType(1) +
std::exp(logit[i] - LogitType(2) * logit[i] * (logit[i] >= 0)))) *
LogitT(1) +
std::exp(logit[i] - LogitT(2) * logit[i] * (logit[i] >= 0)))) *
(1 - p) * gamma -
p);
dx[i] = LogitType(0);
dx[i] = LogitT(0);
dx[i] += -c1 * pos_term * pos_alpha;
dx[i] += -c2 * neg_term * neg_alpha;
mask[i] = c1;
......@@ -104,9 +100,9 @@ void _SigmoidFocalLossGrad(
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(name, LogitType, TargetType) \
#define DEFINE_KERNEL_LAUNCHER(name, LogitT, TargetT) \
template <> \
void name<LogitType, TargetType, CPUContext>( \
void name<LogitT, TargetT, CPUContext>( \
const int outer_dim, \
const int inner_dim, \
const int axis_dim, \
......@@ -114,18 +110,18 @@ void _SigmoidFocalLossGrad(
const float neg_alpha, \
const float gamma, \
const int negative_index, \
const LogitType* logit, \
const TargetType* target, \
LogitType* loss, \
LogitType* mask, \
const LogitT* logit, \
const TargetT* target, \
LogitT* loss, \
LogitT* mask, \
CPUContext* ctx) { \
_##name( \
outer_dim, \
inner_dim, \
axis_dim, \
(LogitType)pos_alpha, \
(LogitType)neg_alpha, \
(LogitType)gamma, \
(LogitT)pos_alpha, \
(LogitT)neg_alpha, \
(LogitT)gamma, \
negative_index, \
logit, \
target, \
......
......@@ -9,19 +9,19 @@ namespace kernel {
namespace {
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
__global__ void _SigmoidFocalLoss(
const int nthreads,
const int inner_dim,
const int axis_dim,
const LogitType pos_alpha,
const LogitType neg_alpha,
const LogitType gamma,
const LogitT pos_alpha,
const LogitT neg_alpha,
const LogitT gamma,
const int negative_index,
const LogitType* logit,
const TargetType* target,
LogitType* loss,
LogitType* mask) {
const LogitT* logit,
const TargetT* target,
LogitT* loss,
LogitT* mask) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int j = yi % inner_dim;
const int k = (yi / inner_dim) % axis_dim;
......@@ -29,40 +29,39 @@ __global__ void _SigmoidFocalLoss(
const int t = target[i * inner_dim + j];
// "0" is reserved for target if negative index is zero
LogitType c1 = (LogitType)(t == (k + (negative_index ? 0 : 1)));
LogitType c2 =
(LogitType)((t >= 0) & (t != (k + (negative_index ? 0 : 1))));
LogitType p = LogitType(1) / (LogitType(1) + exp(-logit[yi]));
LogitT c1 = (LogitT)(t == (k + (negative_index ? 0 : 1)));
LogitT c2 = (LogitT)((t >= 0) & (t != (k + (negative_index ? 0 : 1))));
LogitT p = LogitT(1) / (LogitT(1) + exp(-logit[yi]));
// (1 - p)^{gamma} * log(p)
LogitType pos_term = pow(LogitType(1) - p, gamma) * log(max(p, FLT_MIN));
LogitT pos_term = pow(LogitT(1) - p, gamma) * log(max(p, FLT_MIN));
// p^{gamma} * log(1 - p)
LogitType neg_term = pow(p, gamma) *
LogitT neg_term = pow(p, gamma) *
(-logit[yi] * (logit[yi] >= 0) -
log(LogitType(1) +
exp(logit[yi] - LogitType(2) * logit[yi] * (logit[yi] >= 0))));
log(LogitT(1) +
exp(logit[yi] - LogitT(2) * logit[yi] * (logit[yi] >= 0))));
loss[yi] = LogitType(0);
loss[yi] = LogitT(0);
loss[yi] += -c1 * pos_term * pos_alpha;
loss[yi] += -c2 * neg_term * neg_alpha;
mask[yi] = c1;
}
}
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
__global__ void _SigmoidFocalLossGrad(
const int nthreads,
const int inner_dim,
const int axis_dim,
const LogitType pos_alpha,
const LogitType neg_alpha,
const LogitType gamma,
const LogitT pos_alpha,
const LogitT neg_alpha,
const LogitT gamma,
const int negative_index,
const LogitType* logit,
const TargetType* target,
LogitType* dx,
LogitType* mask) {
const LogitT* logit,
const TargetT* target,
LogitT* dx,
LogitT* mask) {
CUDA_1D_KERNEL_LOOP(xi, nthreads) {
const int j = xi % inner_dim;
const int k = (xi / inner_dim) % axis_dim;
......@@ -70,24 +69,23 @@ __global__ void _SigmoidFocalLossGrad(
const int t = target[i * inner_dim + j];
// "0" is reserved for target if neg index is zero
LogitType c1 = (LogitType)(t == (k + (negative_index ? 0 : 1)));
LogitType c2 =
(LogitType)((t >= 0) & (t != (k + (negative_index ? 0 : 1))));
LogitType p = LogitType(1) / (LogitType(1) + exp(-logit[xi]));
LogitT c1 = (LogitT)(t == (k + (negative_index ? 0 : 1)));
LogitT c2 = (LogitT)((t >= 0) & (t != (k + (negative_index ? 0 : 1))));
LogitT p = LogitT(1) / (LogitT(1) + exp(-logit[xi]));
// (1 - p)^{gamma} * (1 - p - gamma * p * log(p))
LogitType pos_term = pow(LogitType(1) - p, gamma) *
(LogitType(1) - p - p * gamma * log(max(p, FLT_MIN)));
LogitT pos_term = pow(LogitT(1) - p, gamma) *
(LogitT(1) - p - p * gamma * log(max(p, FLT_MIN)));
// p^{gamma} * (gamma * (1 - p) * log(1-p) - p)
LogitType neg_term = pow(p, gamma) *
LogitT neg_term = pow(p, gamma) *
((-logit[xi] * (logit[xi] >= 0) -
log(LogitType(1) +
exp(logit[xi] - LogitType(2) * logit[xi] * (logit[xi] >= 0)))) *
(LogitType(1) - p) * gamma -
log(LogitT(1) +
exp(logit[xi] - LogitT(2) * logit[xi] * (logit[xi] >= 0)))) *
(LogitT(1) - p) * gamma -
p);
dx[xi] = LogitType(0);
dx[xi] = LogitT(0);
dx[xi] += -c1 * pos_term * pos_alpha;
dx[xi] += -c2 * neg_term * neg_alpha;
mask[xi] = c1;
......@@ -98,9 +96,9 @@ __global__ void _SigmoidFocalLossGrad(
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(name, LogitType, TargetType) \
#define DEFINE_KERNEL_LAUNCHER(name, LogitT, TargetT) \
template <> \
void name<LogitType, TargetType, CUDAContext>( \
void name<LogitT, TargetT, CUDAContext>( \
const int outer_dim, \
const int inner_dim, \
const int axis_dim, \
......@@ -108,19 +106,19 @@ __global__ void _SigmoidFocalLossGrad(
const float neg_alpha, \
const float gamma, \
const int negative_index, \
const LogitType* logit, \
const TargetType* target, \
LogitType* loss, \
LogitType* mask, \
const LogitT* logit, \
const TargetT* target, \
LogitT* loss, \
LogitT* mask, \
CUDAContext* ctx) { \
const auto nthreads = outer_dim * axis_dim * inner_dim; \
_##name<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
nthreads, \
inner_dim, \
axis_dim, \
(LogitType)pos_alpha, \
(LogitType)neg_alpha, \
(LogitType)gamma, \
(LogitT)pos_alpha, \
(LogitT)neg_alpha, \
(LogitT)gamma, \
negative_index, \
logit, \
target, \
......
......@@ -7,58 +7,58 @@ namespace kernel {
namespace {
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void _SparseSoftmaxCrossEntropy(
const int outer_dim,
const int inner_dim,
const int axis_dim,
const int ignore_index,
const LogitType* prob,
const TargetType* target,
LogitType* loss,
LogitType* mask) {
const LogitT* prob,
const TargetT* target,
LogitT* loss,
LogitT* mask) {
std::array<int, 2> idx = {0, 0};
std::array<int, 2> dims = {outer_dim, inner_dim};
int count = dims[0] * dims[1], k;
for (int i = 0; i < count; ++i) {
const int label = (int)target[i];
if (label == ignore_index) {
loss[i] = mask[i] = LogitType(0);
loss[i] = mask[i] = LogitT(0);
} else {
k = (idx[0] * axis_dim + label) * inner_dim + idx[1];
loss[i] = -std::log(std::max(prob[k], LogitType(FLT_MIN)));
mask[i] = LogitType(1);
loss[i] = -std::log(std::max(prob[k], LogitT(FLT_MIN)));
mask[i] = LogitT(1);
}
math::utils::IncreaseIndexInDims(2, dims.data(), idx.data());
}
}
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void _SparseSoftmaxCrossEntropyGrad(
const int outer_dim,
const int inner_dim,
const int axis_dim,
const int ignore_index,
const LogitType* prob,
const TargetType* target,
LogitType* dx,
LogitType* mask) {
const LogitT* prob,
const TargetT* target,
LogitT* dx,
LogitT* mask) {
std::array<int, 2> idx = {0, 0};
std::array<int, 2> dims = {outer_dim, inner_dim};
int count = dims[0] * dims[1], k;
for (int i = 0; i < count; ++i) {
const int label = (int)target[i];
if (label == ignore_index) {
LogitType* offset_dx = dx + idx[0] * axis_dim * inner_dim + idx[1];
LogitT* offset_dx = dx + idx[0] * axis_dim * inner_dim + idx[1];
for (int j = 0; j < axis_dim; ++j) {
(*offset_dx) = LogitType(0);
(*offset_dx) = LogitT(0);
offset_dx += inner_dim;
}
mask[i] = LogitType(0);
mask[i] = LogitT(0);
} else {
k = (idx[0] * axis_dim + label) * inner_dim + idx[1];
dx[k] -= LogitType(1);
mask[i] = LogitType(1);
dx[k] -= LogitT(1);
mask[i] = LogitT(1);
}
math::utils::IncreaseIndexInDims(2, dims.data(), idx.data());
}
......@@ -68,17 +68,17 @@ void _SparseSoftmaxCrossEntropyGrad(
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(name, LogitType, TargetType) \
#define DEFINE_KERNEL_LAUNCHER(name, LogitT, TargetT) \
template <> \
void name<LogitType, TargetType, CPUContext>( \
void name<LogitT, TargetT, CPUContext>( \
const int outer_dim, \
const int inner_dim, \
const int axis_dim, \
const int ignore_index, \
const LogitType* prob, \
const TargetType* target, \
LogitType* loss, \
LogitType* mask, \
const LogitT* prob, \
const TargetT* target, \
LogitT* loss, \
LogitT* mask, \
CPUContext* ctx) { \
_##name( \
outer_dim, \
......
......@@ -9,54 +9,54 @@ namespace kernel {
namespace {
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
__global__ void _SparseSoftmaxCrossEntropy(
const int nthreads,
const int inner_dim,
const int axis_dim,
const int ignore_index,
const LogitType* prob,
const TargetType* target,
LogitType* loss,
LogitType* mask) {
const LogitT* prob,
const TargetT* target,
LogitT* loss,
LogitT* mask) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int i = yi / inner_dim;
const int j = yi % inner_dim;
const int label = target[i * inner_dim + j];
if (label == ignore_index) {
loss[yi] = mask[yi] = LogitType(0);
loss[yi] = mask[yi] = LogitT(0);
} else {
loss[yi] = -log(max(
prob[(i * axis_dim + label) * inner_dim + j], LogitType(FLT_MIN)));
mask[yi] = LogitType(1);
loss[yi] = -log(
max(prob[(i * axis_dim + label) * inner_dim + j], LogitT(FLT_MIN)));
mask[yi] = LogitT(1);
}
}
}
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
__global__ void _SparseSoftmaxCrossEntropyGrad(
const int nthreads,
const int inner_dim,
const int axis_dim,
const int ignore_index,
const LogitType* prob,
const TargetType* target,
LogitType* dx,
LogitType* mask) {
const LogitT* prob,
const TargetT* target,
LogitT* dx,
LogitT* mask) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int i = yi / inner_dim;
const int j = yi % inner_dim;
const int label = target[i * inner_dim + j];
if (label == ignore_index) {
LogitType* offset_dx = dx + i * axis_dim * inner_dim + j;
LogitT* offset_dx = dx + i * axis_dim * inner_dim + j;
for (int k = 0; k < axis_dim; ++k) {
(*offset_dx) = LogitType(0);
(*offset_dx) = LogitT(0);
offset_dx += inner_dim;
}
mask[yi] = LogitType(0);
mask[yi] = LogitT(0);
} else {
dx[(i * axis_dim + label) * inner_dim + j] -= LogitType(1);
mask[yi] = LogitType(1);
dx[(i * axis_dim + label) * inner_dim + j] -= LogitT(1);
mask[yi] = LogitT(1);
}
}
}
......@@ -65,17 +65,17 @@ __global__ void _SparseSoftmaxCrossEntropyGrad(
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(name, LogitType, TargetType) \
#define DEFINE_KERNEL_LAUNCHER(name, LogitT, TargetT) \
template <> \
void name<LogitType, TargetType, CUDAContext>( \
void name<LogitT, TargetT, CUDAContext>( \
const int outer_dim, \
const int inner_dim, \
const int axis_dim, \
const int ignore_index, \
const LogitType* prob, \
const TargetType* target, \
LogitType* loss, \
LogitType* mask, \
const LogitT* prob, \
const TargetT* target, \
LogitT* loss, \
LogitT* mask, \
CUDAContext* ctx) { \
const auto nthreads = outer_dim * inner_dim; \
_##name<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
......
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/conversions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......@@ -9,125 +10,34 @@ namespace kernel {
namespace {
template <typename T>
template <typename T, typename AccT>
__global__ void
_Clip(const int nthreads, const T low, const T high, const T* x, T* y) {
_Clip(const int nthreads, const AccT low, const AccT high, const T* x, T* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = max(low, min(x[i], high));
y[i] = convert::To<T>(max(low, min(convert::To<AccT>(x[i]), high)));
}
}
template <>
__global__ void _Clip<half>(
const int nthreads,
const half low,
const half high,
const half* x,
half* y) {
#if __CUDA_ARCH__ >= 530
CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = __hlt(__ldg(x + i), high)
? (__hgt(__ldg(x + i), low) ? __ldg(x + i) : low)
: high;
}
#else
const float kLow = __half2float(low);
const float kHigh = __half2float(high);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = __float2half(max(kLow, min(__half2float(x[i]), kHigh)));
}
#endif
}
template <typename T>
template <typename T, typename AccT>
__global__ void _ClipGrad(
const int nthreads,
const T low,
const T high,
const AccT low,
const AccT high,
const T* dy,
const T* x,
T* dx) {
const T kZero = convert::To<T>(0.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 350
dx[i] = __ldg(x + i) < low || __ldg(x + i) > high ? T(0) : dy[i];
#else
dx[i] = x[i] < low || x[i] > high ? T(0) : dy[i];
#endif
const AccT val = convert::To<AccT>(x[i]);
dx[i] = val < low || val > high ? kZero : dy[i];
}
}
template <>
__global__ void _ClipGrad<half>(
const int nthreads,
const half low,
const half high,
const half* dy,
const half* x,
half* dx) {
const half kZero = __float2half(0.f);
#if __CUDA_ARCH__ >= 530
CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx[i] =
(__hlt(__ldg(x + i), low) || __hgt(__ldg(x + i), high)) ? kZero : dy[i];
}
#elif __CUDA_ARCH__ >= 350
const float kLow = __half2float(low);
const float kHigh = __half2float(high);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx[i] = (__half2float(__ldg(x + i)) < kLow ||
__half2float(__ldg(x + i)) > kHigh)
? kZero
: dy[i];
}
#else
const float kLow = __half2float(low);
const float kHigh = __half2float(high);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx[i] = (__half2float(x[i]) < kLow || __half2float(x[i]) > kHigh) ? kZero
: dy[i];
}
#endif
}
} // namespace
/* ------------------- Launcher Separator ------------------- */
template <>
void Clip<float16, CUDAContext>(
const int count,
const float low,
const float high,
const float16* x,
float16* y,
CUDAContext* ctx) {
_Clip<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
count,
convert::To<half>(low),
convert::To<half>(high),
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
}
template <>
void ClipGrad<float16, CUDAContext>(
const int count,
const float low,
const float high,
const float16* dy,
const float16* x,
float16* dx,
CUDAContext* ctx) {
_ClipGrad<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
count,
convert::To<half>(low),
convert::To<half>(high),
reinterpret_cast<const half*>(dy),
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(dx));
} // ClipGrad
#define DEFINE_KERNEL_LAUNCHER(T) \
#define DEFINE_KERNEL_LAUNCHER(T, AccT) \
template <> \
void Clip<T, CUDAContext>( \
const int count, \
......@@ -136,11 +46,12 @@ void ClipGrad<float16, CUDAContext>(
const T* x, \
T* y, \
CUDAContext* ctx) { \
_Clip<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, convert::To<T>(low), convert::To<T>(high), x, y); \
_Clip<T, AccT> \
<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, low, high, x, y); \
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
#define DEFINE_GRAD_KERNEL_LAUNCHER(T, AccT) \
template <> \
void ClipGrad<T, CUDAContext>( \
const int count, \
......@@ -150,18 +61,21 @@ void ClipGrad<float16, CUDAContext>(
const T* x, \
T* dx, \
CUDAContext* ctx) { \
_ClipGrad<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, convert::To<T>(low), convert::To<T>(high), dy, x, dx); \
_ClipGrad<T, AccT> \
<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, low, high, dy, x, dx); \
}
DEFINE_KERNEL_LAUNCHER(int8_t);
DEFINE_KERNEL_LAUNCHER(uint8_t);
DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
DEFINE_KERNEL_LAUNCHER(int8_t, int8_t);
DEFINE_KERNEL_LAUNCHER(uint8_t, uint8_t);
DEFINE_KERNEL_LAUNCHER(int, int);
DEFINE_KERNEL_LAUNCHER(int64_t, int64_t);
DEFINE_KERNEL_LAUNCHER(float16, float);
DEFINE_KERNEL_LAUNCHER(float, float);
DEFINE_KERNEL_LAUNCHER(double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(float16, float);
DEFINE_GRAD_KERNEL_LAUNCHER(float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(double, double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -20,15 +20,15 @@ void _RowwiseMoments(
#pragma omp parallel for num_threads(OMP_THREADS(cols))
#endif
for (int i = 0; i < cols; ++i) {
T x_val;
AccT m_val = AccT(0), v_val = AccT(0), mu;
AccT x_val, m_val = AccT(0), v_val = AccT(0);
for (int j = 0; j < rows; ++j) {
x_val = x[j * cols + i];
x_val = convert::To<AccT>(x[j * cols + i]);
m_val += x_val;
v_val += x_val * x_val;
}
mean[i] = mu = m_val * scale;
var[i] = v_val * scale - mu * mu;
m_val *= scale;
mean[i] = m_val;
var[i] = v_val * scale - m_val * m_val;
}
}
......@@ -44,15 +44,15 @@ void _ColwiseMoments(
#pragma omp parallel for num_threads(OMP_THREADS(rows))
#endif
for (int i = 0; i < rows; ++i) {
T x_val;
AccT m_val = AccT(0), v_val = AccT(0), mu;
AccT x_val, m_val = AccT(0), v_val = AccT(0);
for (int j = 0; j < cols; ++j) {
x_val = x[i * cols + j];
x_val = convert::To<AccT>(x[i * cols + j]);
m_val += x_val;
v_val += x_val * x_val;
}
mean[i] = mu = m_val * scale;
var[i] = v_val * scale - mu * mu;
m_val *= scale;
mean[i] = m_val;
var[i] = v_val * scale - m_val * m_val;
}
}
......@@ -71,8 +71,7 @@ void _GenericMoments(
#pragma omp parallel for num_threads(OMP_THREADS(rows))
#endif
for (int i = 0; i < rows; ++i) {
T x_val;
AccT m_val = AccT(0), v_val = AccT(0), mu;
AccT x_val, m_val = AccT(0), v_val = AccT(0);
int xi, c, r;
for (int j = 0; j < cols; ++j) {
xi = 0;
......@@ -81,12 +80,13 @@ void _GenericMoments(
FIXED_DIVISOR_DIV_MOD(x_dims[d], c, &c, &r);
xi += r * x_strides[d];
}
x_val = x[xi];
x_val = convert::To<AccT>(x[xi]);
m_val += x_val;
v_val += x_val * x_val;
}
mean[i] = mu = m_val * scale;
var[i] = v_val * scale - mu * mu;
m_val *= scale;
mean[i] = m_val;
var[i] = v_val * scale - m_val * m_val;
}
}
......@@ -148,19 +148,6 @@ void _Moments(
/* ------------------- Launcher Separator ------------------- */
template <>
void Moments<float16, float, CPUContext>(
const int num_dims,
const int* dims,
const int num_axes,
const int* axes,
const float16* x,
float* mean,
float* var,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
#define DEFINE_KERNEL_LAUNCHER(T, AccT) \
template <> \
void Moments<T, AccT, CPUContext>( \
......@@ -178,7 +165,8 @@ void Moments<float16, float, CPUContext>(
DEFINE_KERNEL_LAUNCHER(int8_t, float);
DEFINE_KERNEL_LAUNCHER(uint8_t, float);
DEFINE_KERNEL_LAUNCHER(int, float);
DEFINE_KERNEL_LAUNCHER(int64_t, float);
DEFINE_KERNEL_LAUNCHER(int64_t, double);
DEFINE_KERNEL_LAUNCHER(float16, float);
DEFINE_KERNEL_LAUNCHER(float, float);
DEFINE_KERNEL_LAUNCHER(double, double);
#undef DEFINE_KERNEL_LAUNCHER
......
......@@ -201,7 +201,7 @@ void _Moments(
DEFINE_KERNEL_LAUNCHER(int8_t, int8_t, float);
DEFINE_KERNEL_LAUNCHER(uint8_t, uint8_t, float);
DEFINE_KERNEL_LAUNCHER(int, int, float);
DEFINE_KERNEL_LAUNCHER(int64_t, int64_t, float);
DEFINE_KERNEL_LAUNCHER(int64_t, int64_t, double);
DEFINE_KERNEL_LAUNCHER(float16, half, float);
DEFINE_KERNEL_LAUNCHER(float, float, float);
DEFINE_KERNEL_LAUNCHER(double, double, double);
......
......@@ -70,7 +70,7 @@ void _L1NormalizeGrad(
auto X = ConstEigenStridedVectorMap<T>(
x + offset, 1, reduce_dim, EigenInnerStride(inner_dim));
auto norm = std::max(X.template lpNorm<1>() / normalizer, epsilon);
auto norm2 = std::pow(norm, 2);
auto norm2 = std::pow(norm, T(2));
EigenStridedVectorMap<T>(
dx + offset, 1, reduce_dim, EigenInnerStride(inner_dim)) =
(dY / norm) -
......@@ -98,7 +98,7 @@ void _L2NormalizeGrad(
auto X = ConstEigenStridedVectorMap<T>(
x + offset, 1, reduce_dim, EigenInnerStride(inner_dim));
auto norm = std::max(std::sqrt(X.squaredNorm() / normalizer), epsilon);
auto norm3 = std::pow(norm, 3);
auto norm3 = std::pow(norm, T(3));
EigenStridedVectorMap<T>(
dx + offset, 1, reduce_dim, EigenInnerStride(inner_dim)) =
(dY / norm) - ((X / norm3) * dY.dot(X) / normalizer);
......
......@@ -93,7 +93,7 @@ __global__ void _L1NormalizeGrad(
val2 = BlockReduce<AccT>(storage).Sum(val2);
if (threadIdx.x == 0) {
norm = max(val1 / normalizer, epsilon);
norm2 = pow(norm, 2);
norm2 = pow(norm, AccT(2));
sum = val2 / normalizer;
}
__syncthreads();
......@@ -130,7 +130,7 @@ __global__ void _L2NormalizeGrad(
val2 = BlockReduce<AccT>(storage).Sum(val2);
if (threadIdx.x == 0) {
norm = max(sqrt(val1 / normalizer), epsilon);
norm3 = pow(norm, 3);
norm3 = pow(norm, AccT(3));
sum = val2 / normalizer;
}
__syncthreads();
......
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/conversions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......
......@@ -5,7 +5,7 @@
namespace dragon {
template <class Context>
template <typename Tx, typename Ty>
template <typename InputT, typename OutputT>
void ChannelNormalizeOp<Context>::DoRunWithTypeAndCast() {
auto &X = Input(0), *Y = Output(0);
CANONICALIZE_AXIS_WITH_TENSOR(X);
......@@ -35,10 +35,10 @@ void ChannelNormalizeOp<Context>::DoRunWithTypeAndCast() {
num_dims,
X_strides.data(),
Y_dims.data(),
X.template data<Tx, Context>(),
X.template data<InputT, Context>(),
X_mean_.template data<float, Context>(),
X_std_.template data<float, Context>(),
Y->Reshape(Y_dims)->template mutable_data<Ty, Context>(),
Y->Reshape(Y_dims)->template mutable_data<OutputT, Context>(),
ctx());
}
......
......@@ -6,7 +6,7 @@
namespace dragon {
template <class Context>
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void NLLLossOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0);
CANONICALIZE_AXIS_WITH_TENSOR(X);
......@@ -19,19 +19,19 @@ void NLLLossOp<Context>::DoRunWithType() {
<< "\nNumber of preds must match the number of targets.";
auto scratches = ctx()->workspace()->template data<Context>({
(size_t)num_preds * sizeof(LogitType), // loss
(size_t)num_preds * sizeof(LogitType) + sizeof(LogitType), // mask
(size_t)num_preds * sizeof(LogitT), // loss
(size_t)num_preds * sizeof(LogitT) + sizeof(LogitT), // mask
});
auto* loss = static_cast<LogitType*>(scratches[0]);
auto* mask = static_cast<LogitType*>(scratches[1]);
auto* loss = static_cast<LogitT*>(scratches[0]);
auto* mask = static_cast<LogitT*>(scratches[1]);
kernel::NLLLoss(
outer_dim,
inner_dim,
X.dim(axis),
ignore_index_,
X.template data<LogitType, Context>(),
Input(1).template data<TargetType, Context>(),
X.template data<LogitT, Context>(),
Input(1).template data<TargetT, Context>(),
loss,
mask,
ctx());
......@@ -42,7 +42,7 @@ void NLLLossOp<Context>::DoRunWithType() {
math::Copy(
num_preds,
loss,
Y->Reshape(out_shape)->template mutable_data<LogitType, Context>(),
Y->Reshape(out_shape)->template mutable_data<LogitT, Context>(),
ctx());
} else {
int64_t normalizer = 1;
......@@ -59,7 +59,7 @@ void NLLLossOp<Context>::DoRunWithType() {
normalizer,
loss,
mask,
Y->Reshape({})->template mutable_data<LogitType, Context>(),
Y->Reshape({})->template mutable_data<LogitT, Context>(),
ctx());
}
}
......@@ -91,7 +91,7 @@ void NLLLossOp<Context>::RunOnDevice() {
}
template <class Context>
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void NLLLossGradientOp<Context>::DoRunWithType() {
auto &X = Input(0), &dY = Input(-1), *dX = Output(0);
CANONICALIZE_AXIS_WITH_TENSOR(X);
......@@ -101,19 +101,19 @@ void NLLLossGradientOp<Context>::DoRunWithType() {
auto inner_dim = dX->count(axis + 1);
auto num_preds = outer_dim * inner_dim;
auto* dy = dY.template data<LogitType, Context>();
auto* dx = dX->template mutable_data<LogitType, Context>();
auto* dy = dY.template data<LogitT, Context>();
auto* dx = dX->template mutable_data<LogitT, Context>();
auto* mask =
ctx()->workspace()->template data<LogitType, Context>({num_preds + 1})[0];
math::Set(dX->count(), convert::To<LogitType>(0.f), dx, ctx());
ctx()->workspace()->template data<LogitT, Context>({num_preds + 1})[0];
math::Set(dX->count(), convert::To<LogitT>(0.f), dx, ctx());
kernel::NLLLossGrad(
outer_dim,
inner_dim,
dX->dim(axis),
ignore_index_,
X.template data<LogitType, Context>(),
Input(1).template data<TargetType, Context>(),
X.template data<LogitT, Context>(),
Input(1).template data<TargetT, Context>(),
dx,
mask,
ctx());
......
......@@ -28,7 +28,7 @@ class NLLLossOp final : public Operator<Context> {
void RunOnDevice() override;
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void DoRunWithType();
protected:
......@@ -47,7 +47,7 @@ class NLLLossGradientOp final : public Operator<Context> {
void RunOnDevice() override;
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void DoRunWithType();
protected:
......
......@@ -6,7 +6,7 @@
namespace dragon {
template <class Context>
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void SigmoidFocalLossOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0);
CANONICALIZE_AXIS_WITH_TENSOR(X);
......@@ -18,11 +18,11 @@ void SigmoidFocalLossOp<Context>::DoRunWithType() {
<< "\nNumber of preds must match the number of targets.";
auto scratches = ctx()->workspace()->template data<Context>({
X.size() * sizeof(LogitType), // loss
X.size() * sizeof(LogitType) + sizeof(LogitType), // mask
X.size() * sizeof(LogitT), // loss
X.size() * sizeof(LogitT) + sizeof(LogitT), // mask
});
auto* loss = static_cast<LogitType*>(scratches[0]);
auto* mask = static_cast<LogitType*>(scratches[1]);
auto* loss = static_cast<LogitT*>(scratches[0]);
auto* mask = static_cast<LogitT*>(scratches[1]);
kernel::SigmoidFocalLoss(
outer_dim,
......@@ -32,8 +32,8 @@ void SigmoidFocalLossOp<Context>::DoRunWithType() {
neg_alpha_,
gamma_,
negative_index_,
X.template data<LogitType, Context>(),
Input(1).template data<TargetType, Context>(),
X.template data<LogitT, Context>(),
Input(1).template data<TargetT, Context>(),
loss,
mask,
ctx());
......@@ -42,7 +42,7 @@ void SigmoidFocalLossOp<Context>::DoRunWithType() {
math::Copy(
X.count(),
loss,
Y->ReshapeLike(X)->template mutable_data<LogitType, Context>(),
Y->ReshapeLike(X)->template mutable_data<LogitT, Context>(),
ctx());
} else {
int64_t normalizer = 1;
......@@ -59,7 +59,7 @@ void SigmoidFocalLossOp<Context>::DoRunWithType() {
normalizer,
loss,
mask,
Y->Reshape({})->template mutable_data<LogitType, Context>(),
Y->Reshape({})->template mutable_data<LogitT, Context>(),
ctx());
}
}
......@@ -91,7 +91,7 @@ void SigmoidFocalLossOp<Context>::RunOnDevice() {
}
template <class Context>
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void SigmoidFocalLossGradientOp<Context>::DoRunWithType() {
auto &X = Input(0), &dY = Input(-1), *dX = Output(0);
CANONICALIZE_AXIS_WITH_TENSOR(X);
......@@ -100,10 +100,10 @@ void SigmoidFocalLossGradientOp<Context>::DoRunWithType() {
auto outer_dim = dX->count(0, axis);
auto inner_dim = dX->count(axis + 1);
auto* dy = dY.template data<LogitType, Context>();
auto* dx = dX->template mutable_data<LogitType, Context>();
auto* mask = ctx()->workspace()->template data<LogitType, Context>(
{dX->count() + 1})[0];
auto* dy = dY.template data<LogitT, Context>();
auto* dx = dX->template mutable_data<LogitT, Context>();
auto* mask =
ctx()->workspace()->template data<LogitT, Context>({dX->count() + 1})[0];
kernel::SigmoidFocalLossGrad(
outer_dim,
......@@ -113,8 +113,8 @@ void SigmoidFocalLossGradientOp<Context>::DoRunWithType() {
neg_alpha_,
gamma_,
negative_index_,
X.template data<LogitType, Context>(),
Input(1).template data<TargetType, Context>(),
X.template data<LogitT, Context>(),
Input(1).template data<TargetT, Context>(),
dx,
mask,
ctx());
......
......@@ -48,7 +48,7 @@ class SigmoidFocalLossOp final : public Operator<Context> {
void RunOnDevice() override;
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void DoRunWithType();
protected:
......@@ -88,7 +88,7 @@ class SigmoidFocalLossGradientOp final : public Operator<Context> {
void RunOnDevice() override;
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void DoRunWithType();
protected:
......
......@@ -45,7 +45,7 @@ class SparseSoftmaxCrossEntropyOp : public Operator<Context> {
void RunOnDevice() override;
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void DoRunWithType();
protected:
......@@ -81,7 +81,7 @@ class SparseSoftmaxCrossEntropyGradientOp : public Operator<Context> {
void RunOnDevice() override;
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void DoRunWithType();
protected:
......
......@@ -6,7 +6,7 @@
namespace dragon {
template <class Context>
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void SparseSoftmaxCrossEntropyOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0);
CANONICALIZE_AXIS_WITH_TENSOR(X);
......@@ -18,20 +18,20 @@ void SparseSoftmaxCrossEntropyOp<Context>::DoRunWithType() {
CHECK_EQ(num_preds, Input(1).count())
<< "\nNumber of preds must match the number of targets.";
auto* X_prob = Buffer("prob")->ReshapeLike(X);
auto* prob = X_prob->template mutable_data<LogitType, Context>();
auto* prob = X_prob->template mutable_data<LogitT, Context>();
auto scratches = ctx()->workspace()->template data<Context>({
(size_t)num_preds * sizeof(LogitType), // loss
(size_t)num_preds * sizeof(LogitType) + sizeof(LogitType), // mask
(size_t)num_preds * sizeof(LogitT), // loss
(size_t)num_preds * sizeof(LogitT) + sizeof(LogitT), // mask
});
auto* loss = static_cast<LogitType*>(scratches[0]);
auto* mask = static_cast<LogitType*>(scratches[1]);
auto* loss = static_cast<LogitT*>(scratches[0]);
auto* mask = static_cast<LogitT*>(scratches[1]);
kernel::Softmax(
outer_dim,
inner_dim,
X.dim(axis),
X.template data<LogitType, Context>(),
X.template data<LogitT, Context>(),
prob,
ctx());
......@@ -41,7 +41,7 @@ void SparseSoftmaxCrossEntropyOp<Context>::DoRunWithType() {
X.dim(axis),
ignore_index_,
prob,
Input(1).template data<TargetType, Context>(),
Input(1).template data<TargetT, Context>(),
loss,
mask,
ctx());
......@@ -52,7 +52,7 @@ void SparseSoftmaxCrossEntropyOp<Context>::DoRunWithType() {
math::Copy(
num_preds,
loss,
Y->Reshape(out_shape)->template mutable_data<LogitType, Context>(),
Y->Reshape(out_shape)->template mutable_data<LogitT, Context>(),
ctx());
} else {
int64_t normalizer = 1;
......@@ -69,7 +69,7 @@ void SparseSoftmaxCrossEntropyOp<Context>::DoRunWithType() {
normalizer,
loss,
mask,
Y->Reshape({})->template mutable_data<LogitType, Context>(),
Y->Reshape({})->template mutable_data<LogitT, Context>(),
ctx());
}
}
......@@ -101,7 +101,7 @@ void SparseSoftmaxCrossEntropyOp<Context>::RunOnDevice() {
}
template <class Context>
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void SparseSoftmaxCrossEntropyGradientOp<Context>::DoRunWithType() {
auto &dY = Input(-1), *dX = Output(0);
CANONICALIZE_AXIS_WITH_TENSOR(Input(0));
......@@ -110,11 +110,11 @@ void SparseSoftmaxCrossEntropyGradientOp<Context>::DoRunWithType() {
auto inner_dim = dX->count(axis + 1);
auto num_preds = outer_dim * inner_dim;
auto* prob = Buffer("prob")->template data<LogitType, Context>();
auto* dy = Input(-1).template data<LogitType, Context>();
auto* dx = Output(0)->template mutable_data<LogitType, Context>();
auto* prob = Buffer("prob")->template data<LogitT, Context>();
auto* dy = Input(-1).template data<LogitT, Context>();
auto* dx = Output(0)->template mutable_data<LogitT, Context>();
auto* mask =
ctx()->workspace()->template data<LogitType, Context>({num_preds + 1})[0];
ctx()->workspace()->template data<LogitT, Context>({num_preds + 1})[0];
math::Copy(dX->count(), prob, dx, ctx());
......@@ -124,7 +124,7 @@ void SparseSoftmaxCrossEntropyGradientOp<Context>::DoRunWithType() {
dX->dim(axis),
ignore_index_,
prob,
Input(1).template data<TargetType, Context>(),
Input(1).template data<TargetT, Context>(),
dx,
mask,
ctx());
......
......@@ -5,8 +5,9 @@
namespace dragon {
template <class Context>
template <typename Tx, typename Ty>
template <typename T>
void MomentsOp<Context>::DoRunWithType() {
using OutputT = typename math::utils::AccmulatorType<T>::type;
auto &X = Input(0), *Y1 = Output(0), *Y2 = Output(1);
// Determine the reduce axes
......@@ -35,13 +36,13 @@ void MomentsOp<Context>::DoRunWithType() {
if (X.count() == 1) {
math::Cast(
1,
X.template data<Tx, Context>(),
Y1->Reshape(Y_shape)->template mutable_data<Ty, Context>(),
X.template data<T, Context>(),
Y1->Reshape(Y_shape)->template mutable_data<OutputT, Context>(),
ctx());
math::Set(
1,
convert::To<Ty>(0.f),
Y2->Reshape(Y_shape)->template mutable_data<Ty, Context>(),
convert::To<OutputT>(0.f),
Y2->Reshape(Y_shape)->template mutable_data<OutputT, Context>(),
ctx());
} else {
kernel::Moments(
......@@ -49,35 +50,16 @@ void MomentsOp<Context>::DoRunWithType() {
X_dims.data(),
reduce_axes.size(),
reduce_axes.data(),
X.template data<Tx, Context>(),
Y1->Reshape(Y_shape)->template mutable_data<Ty, Context>(),
Y2->Reshape(Y_shape)->template mutable_data<Ty, Context>(),
X.template data<T, Context>(),
Y1->Reshape(Y_shape)->template mutable_data<OutputT, Context>(),
Y2->Reshape(Y_shape)->template mutable_data<OutputT, Context>(),
ctx());
}
}
template <class Context>
void MomentsOp<Context>::RunOnDevice() {
auto& X = Input(0);
if (X.template IsType<int8_t>()) {
DoRunWithType<int8_t, float>();
} else if (X.template IsType<uint8_t>()) {
DoRunWithType<uint8_t, float>();
} else if (X.template IsType<int>()) {
DoRunWithType<int, float>();
} else if (X.template IsType<int64_t>()) {
DoRunWithType<int64_t, float>();
} else if (X.template IsType<float16>()) {
DoRunWithType<float16, float>();
} else if (X.template IsType<float>()) {
DoRunWithType<float, float>();
} else if (X.template IsType<double>()) {
DoRunWithType<double, double>();
} else {
LOG(FATAL) << MessageForUnsupported(
types::to_string(X.meta()),
{"int8", "uint8", "int32", "int64", "float16", "float32", "float64"});
}
DispatchHelper<NumericalTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU_OPERATOR(Moments);
......
......@@ -28,7 +28,7 @@ class MomentsOp final : public Operator<Context> {
void RunOnDevice() override;
template <typename Tx, typename Ty>
template <typename T>
void DoRunWithType();
protected:
......
......@@ -3,7 +3,7 @@
namespace dragon {
template <class Context>
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void AccuracyOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0);
CANONICALIZE_AXIS_WITH_TENSOR(X);
......@@ -18,21 +18,21 @@ void AccuracyOp<Context>::DoRunWithType() {
int64_t acc = 0, count = 0;
int64_t cols = X.count() / outer_dim;
auto* logit = X.template data<LogitType, CPUContext>();
auto* target = Input(1).template data<TargetType, CPUContext>();
auto* logit = X.template data<LogitT, CPUContext>();
auto* target = Input(1).template data<TargetT, CPUContext>();
for (int i = 0; i < outer_dim; ++i) {
for (int j = 0; j < inner_dim; ++j) {
const int label = target[i * inner_dim + j];
if (label == ignore_index_) continue;
vector<pair<LogitType, int>> vec;
vector<pair<LogitT, int>> vec;
for (int k = 0; k < axis_dim; k++)
vec.push_back(std::make_pair(logit[i * cols + k * inner_dim + j], k));
std::partial_sort(
vec.begin(),
vec.begin() + top_k_,
vec.end(),
std::greater<pair<LogitType, int>>());
std::greater<pair<LogitT, int>>());
for (int k = 0; k < top_k_; k++) {
if (vec[k].second == label) {
acc++;
......
......@@ -28,7 +28,7 @@ class AccuracyOp final : public Operator<Context> {
void RunOnDevice() override;
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void DoRunWithType();
protected:
......
......@@ -8,11 +8,11 @@ namespace dragon {
template <class Context>
template <typename T>
void BatchNormOp<Context>::TrainingImpl() {
using ParamType = typename math::utils::AccmulatorType<T>::type;
TENSOR_FILL_WITH_TYPE(Input(1), vec64_t({C_}), ParamType);
TENSOR_FILL_WITH_TYPE(Input(2), vec64_t({C_}), ParamType);
TENSOR_FILL_WITH_TYPE(Input(3), vec64_t({C_}), ParamType);
TENSOR_FILL_WITH_TYPE(Input(4), vec64_t({C_}), ParamType);
using ParamT = typename math::utils::AccmulatorType<T>::type;
TENSOR_FILL_WITH_TYPE(Input(1), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(2), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(3), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(4), vec64_t({C_}), ParamT);
auto* X_mu = Buffer("X_mu")->Reshape({C_});
auto* X_rsig = Buffer("X_rsig")->Reshape({C_});
......@@ -20,11 +20,11 @@ void BatchNormOp<Context>::TrainingImpl() {
auto* X_bias = Buffer("X_bias")->Reshape({C_});
auto* x = Input(0).template data<T, Context>();
auto* rm = Input(3).template mutable_data<ParamType, Context>();
auto* rv = Input(4).template mutable_data<ParamType, Context>();
auto* mu = X_mu->template mutable_data<ParamType, Context>();
auto* rsig = X_rsig->template mutable_data<ParamType, Context>();
auto* scale = X_scale->template mutable_data<ParamType, Context>();
auto* rm = Input(3).template mutable_data<ParamT, Context>();
auto* rv = Input(4).template mutable_data<ParamT, Context>();
auto* mu = X_mu->template mutable_data<ParamT, Context>();
auto* rsig = X_rsig->template mutable_data<ParamT, Context>();
auto* scale = X_scale->template mutable_data<ParamT, Context>();
// Compute moments
if (sync_stats_ > 0) {
......@@ -45,7 +45,7 @@ void BatchNormOp<Context>::TrainingImpl() {
if (enable_nccl_) {
#ifdef USE_NCCL
auto coll_comm = this->nccl_comm();
auto coll_dtype = this->template nccl_dtype<ParamType>();
auto coll_dtype = this->template nccl_dtype<ParamT>();
NCCL_CHECK(ncclAllReduce(
(void*)mu,
(void*)mu,
......@@ -84,8 +84,9 @@ void BatchNormOp<Context>::TrainingImpl() {
// Compute running statistics
if (is_recomputing_ == 0) {
math::Axpby(C_, 1.f - momentum_, mu, momentum_, rm, ctx());
math::Axpby(C_, 1.f - momentum_, rsig, momentum_, rv, ctx());
auto decay_factor = momentum();
math::Axpby(C_, 1.f - decay_factor, mu, decay_factor, rm, ctx());
math::Axpby(C_, 1.f - decay_factor, rsig, decay_factor, rv, ctx());
}
// Inverse stddev from variance
......@@ -100,10 +101,10 @@ void BatchNormOp<Context>::TrainingImpl() {
x,
mu,
rsig,
Input(1).template data<ParamType, Context>(), // gamma
Input(2).template data<ParamType, Context>(), // beta
Input(1).template data<ParamT, Context>(), // gamma
Input(2).template data<ParamT, Context>(), // beta
scale,
X_bias->template mutable_data<ParamType, Context>(),
X_bias->template mutable_data<ParamT, Context>(),
Output(0)->template mutable_data<T, Context>(),
ctx());
}
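For reference, a minimal sketch of the running-statistics update performed by the two math::Axpby calls above, assuming momentum() returns the scalar decay factor:

import numpy as np

def update_running_stat(running, batch_stat, momentum):
    # Matches math::Axpby(C_, 1 - momentum, batch_stat, momentum, running):
    # running <- momentum * running + (1 - momentum) * batch_stat
    return momentum * running + (1.0 - momentum) * batch_stat

rm = update_running_stat(np.zeros(3), np.array([0.1, 0.2, 0.3]), momentum=0.9)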
......@@ -111,17 +112,17 @@ void BatchNormOp<Context>::TrainingImpl() {
template <class Context>
template <typename T>
void BatchNormOp<Context>::InferenceImpl() {
using ParamType = typename math::utils::AccmulatorType<T>::type;
TENSOR_FILL_WITH_TYPE(Input(1), vec64_t({C_}), ParamType);
TENSOR_FILL_WITH_TYPE(Input(2), vec64_t({C_}), ParamType);
TENSOR_FILL_WITH_TYPE(Input(3), vec64_t({C_}), ParamType);
TENSOR_FILL_WITH_TYPE(Input(4), vec64_t({C_}), ParamType);
using ParamT = typename math::utils::AccmulatorType<T>::type;
TENSOR_FILL_WITH_TYPE(Input(1), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(2), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(3), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(4), vec64_t({C_}), ParamT);
auto* X_rsig = Buffer("X_rsig")->Reshape({C_});
auto* X_scale = Buffer("X_scale")->Reshape({C_});
auto* X_bias = Buffer("X_bias")->Reshape({C_});
auto* rv = Input(4).template data<ParamType, Context>();
auto* rsig = X_rsig->template mutable_data<ParamType, Context>();
auto* rv = Input(4).template data<ParamT, Context>();
auto* rsig = X_rsig->template mutable_data<ParamT, Context>();
// Inverse stddev from variance
math::InvStd(C_, epsilon_, rv, rsig, ctx());
......@@ -133,12 +134,12 @@ void BatchNormOp<Context>::InferenceImpl() {
S_,
data_format(),
Input(0).template data<T, Context>(),
Input(3).template data<ParamType, Context>(),
Input(3).template data<ParamT, Context>(),
rsig,
Input(1).template data<ParamType, Context>(), // gamma
Input(2).template data<ParamType, Context>(), // beta
X_scale->template mutable_data<ParamType, Context>(),
X_bias->template mutable_data<ParamType, Context>(),
Input(1).template data<ParamT, Context>(), // gamma
Input(2).template data<ParamT, Context>(), // beta
X_scale->template mutable_data<ParamT, Context>(),
X_bias->template mutable_data<ParamT, Context>(),
Output(0)->template mutable_data<T, Context>(),
ctx());
}
......@@ -159,17 +160,17 @@ void BatchNormOp<Context>::RunOnDevice() {
template <class Context>
template <typename T>
void BatchNormGradientOp<Context>::TrainingImpl() {
using ParamType = typename math::utils::AccmulatorType<T>::type;
using ParamT = typename math::utils::AccmulatorType<T>::type;
auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
auto *X_mu = Buffer("X_mu"), *X_rsig = Buffer("X_rsig");
auto* x = Input(0).template data<T, Context>();
auto* gamma = Input(1).template data<ParamType, Context>();
auto* gamma = Input(1).template data<ParamT, Context>();
auto* dy = Input(4).template data<T, Context>();
auto* mu = X_mu->template data<ParamType, Context>();
auto* rsig = X_rsig->template data<ParamType, Context>();
auto* dgamma = dW->Reshape({C_})->template mutable_data<ParamType, Context>();
auto* dbeta = dB->Reshape({C_})->template mutable_data<ParamType, Context>();
auto* mu = X_mu->template data<ParamT, Context>();
auto* rsig = X_rsig->template data<ParamT, Context>();
auto* dgamma = dW->Reshape({C_})->template mutable_data<ParamT, Context>();
auto* dbeta = dB->Reshape({C_})->template mutable_data<ParamT, Context>();
// Gradient w.r.t. gamma and beta
kernel::BatchNormWGrad(
......@@ -181,7 +182,7 @@ void BatchNormGradientOp<Context>::TrainingImpl() {
if (enable_nccl_) {
#ifdef USE_NCCL
auto coll_comm = this->nccl_comm();
auto coll_dtype = this->template nccl_dtype<ParamType>();
auto coll_dtype = this->template nccl_dtype<ParamT>();
NCCL_CHECK(ncclAllReduce(
(void*)dgamma,
(void*)dgamma,
......@@ -231,18 +232,18 @@ void BatchNormGradientOp<Context>::TrainingImpl() {
template <class Context>
template <typename T>
void BatchNormGradientOp<Context>::InferenceImpl() {
using ParamType = typename math::utils::AccmulatorType<T>::type;
using ParamT = typename math::utils::AccmulatorType<T>::type;
auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
auto* X_scale = Buffer("X_scale")->Reshape({C_});
auto* rv = Input(3).template data<ParamType, Context>();
auto* rsig = X_scale->template mutable_data<ParamType, Context>();
auto* rv = Input(3).template data<ParamT, Context>();
auto* rsig = X_scale->template mutable_data<ParamT, Context>();
// Gradient w.r.t. gamma or beta if necessary
ParamType *dgamma = nullptr, *dbeta = nullptr;
ParamT *dgamma = nullptr, *dbeta = nullptr;
if (dW->has_name() || dB->has_name()) {
dgamma = dW->Reshape({C_})->template mutable_data<ParamType, Context>();
dbeta = dB->Reshape({C_})->template mutable_data<ParamType, Context>();
dgamma = dW->Reshape({C_})->template mutable_data<ParamT, Context>();
dbeta = dB->Reshape({C_})->template mutable_data<ParamT, Context>();
}
// Inverse stddev from variance
......@@ -255,9 +256,9 @@ void BatchNormGradientOp<Context>::InferenceImpl() {
S_,
data_format(),
Input(0).template data<T, Context>(), // x
Input(2).template data<ParamType, Context>(), // rm
Input(2).template data<ParamT, Context>(), // rm
rsig,
Input(1).template data<ParamType, Context>(), // gamma
Input(1).template data<ParamT, Context>(), // gamma
Input(4).template data<T, Context>(), // dy
dgamma,
dbeta,
......
......@@ -33,7 +33,6 @@ class BatchNormOpBase : public GenericOpBase<Context> {
public:
BatchNormOpBase(const OperatorDef& def, Workspace* ws)
: GenericOpBase<Context>(def, ws),
momentum_(OP_SINGLE_ARG(float, "momentum", 0.9f)),
epsilon_(OP_SINGLE_ARG(double, "epsilon", 1e-5)),
use_stats_(OP_SINGLE_ARG(int64_t, "use_stats", -1)),
sync_stats_(OP_SINGLE_ARG(int64_t, "comm", 0) > 0 ? 1 : 0) {}
......@@ -57,7 +56,6 @@ class BatchNormOpBase : public GenericOpBase<Context> {
}
protected:
float momentum_;
double epsilon_;
int64_t N_, C_, S_;
int64_t use_stats_, sync_stats_;
......@@ -68,7 +66,6 @@ class BatchNormOpBase : public GenericOpBase<Context> {
#define USE_BATCHNORM_FUNCTIONS \
using BatchNormOpBase<Context>::DetermineBaseArguments; \
using BatchNormOpBase<Context>::momentum_; \
using BatchNormOpBase<Context>::epsilon_; \
using BatchNormOpBase<Context>::use_stats_; \
using BatchNormOpBase<Context>::sync_stats_; \
......@@ -82,7 +79,9 @@ template <class Context>
class BatchNormOp : public BatchNormOpBase<Context> {
public:
BatchNormOp(const OperatorDef& def, Workspace* ws)
: BatchNormOpBase<Context>(def, ws) {}
: BatchNormOpBase<Context>(def, ws) {
INIT_OP_SINGLE_ARG_WITH_DESC(float, momentum, 0.9f);
}
USE_OPERATOR_FUNCTIONS;
USE_BATCHNORM_FUNCTIONS;
#ifdef USE_MPI
......@@ -105,6 +104,8 @@ class BatchNormOp : public BatchNormOpBase<Context> {
InferenceImpl<T>();
}
};
DECLARE_OP_SINGLE_ARG_WITH_DESC(float, momentum);
};
template <class Context>
......@@ -146,11 +147,9 @@ class CuDNNBatchNormOp final : public BatchNormOpBase<Context> {
CuDNNCreateTensorDesc(&bn_desc_);
CuDNNCreateTensorDesc(&input_desc_);
if (epsilon_ <= CUDNN_BN_MIN_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. \nSet it to "
<< "CUDNN_BN_MIN_EPSILON instead.";
epsilon_ = CUDNN_BN_MIN_EPSILON;
}
INIT_OP_SINGLE_ARG_WITH_DESC(float, momentum, 0.9f);
}
USE_OPERATOR_FUNCTIONS;
USE_BATCHNORM_FUNCTIONS;
......@@ -168,6 +167,7 @@ class CuDNNBatchNormOp final : public BatchNormOpBase<Context> {
protected:
cudnnTensorDescriptor_t input_desc_, bn_desc_;
cudnnBatchNormMode_t bn_mode_;
DECLARE_OP_SINGLE_ARG_WITH_DESC(float, momentum);
};
template <class Context>
......@@ -178,9 +178,6 @@ class CuDNNBatchNormGradientOp final : public BatchNormGradientOp<Context> {
CuDNNCreateTensorDesc(&bn_desc_);
CuDNNCreateTensorDesc(&input_desc_);
if (epsilon_ <= CUDNN_BN_MIN_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. \nSet it to "
<< "CUDNN_BN_MIN_EPSILON instead.";
epsilon_ = CUDNN_BN_MIN_EPSILON;
}
}
......@@ -211,8 +208,12 @@ class CuDNNBatchNormGradientOp final : public BatchNormGradientOp<Context> {
cudnnBatchNormMode_t bn_mode_;
};
DEFINE_OP_SINGLE_ARG_WITH_DESC(float, CuDNNBatchNormOp, momentum);
#endif // USE_CUDNN
DEFINE_OP_SINGLE_ARG_WITH_DESC(float, BatchNormOp, momentum);
} // namespace dragon
#endif // DRAGON_OPERATORS_NORMALIZATION_BATCH_NORM_OP_H_
......@@ -9,11 +9,11 @@ namespace dragon {
template <class Context>
template <typename T>
void CuDNNBatchNormOp<Context>::DoRunWithType() {
using ParamType = typename CuDNNType<T>::BNParamType;
TENSOR_FILL_WITH_TYPE(Input(1), vec64_t({C_}), ParamType);
TENSOR_FILL_WITH_TYPE(Input(2), vec64_t({C_}), ParamType);
TENSOR_FILL_WITH_TYPE(Input(3), vec64_t({C_}), ParamType);
TENSOR_FILL_WITH_TYPE(Input(4), vec64_t({C_}), ParamType);
using ParamT = typename CuDNNType<T>::BNParamType;
TENSOR_FILL_WITH_TYPE(Input(1), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(2), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(3), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(4), vec64_t({C_}), ParamT);
// Determine the descriptors
if (Input(0).ndim() == 2) {
......@@ -39,14 +39,14 @@ void CuDNNBatchNormOp<Context>::DoRunWithType() {
input_desc_,
Output(0)->template mutable_data<T, Context>(), // y
bn_desc_,
Input(1).template data<ParamType, Context>(), // gamma
Input(2).template data<ParamType, Context>(), // beta
is_recomputing_ > 0 ? 0.f : 1.f - this->momentum_,
Input(3).template mutable_data<ParamType, Context>(), // rm
Input(4).template mutable_data<ParamType, Context>(), // rv
Input(1).template data<ParamT, Context>(), // gamma
Input(2).template data<ParamT, Context>(), // beta
is_recomputing_ == 0 ? 1.f - momentum() : 0.f,
Input(3).template mutable_data<ParamT, Context>(), // rm
Input(4).template mutable_data<ParamT, Context>(), // rv
epsilon_,
X_mu->template mutable_data<ParamType, Context>(), // sm
X_rsig->template mutable_data<ParamType, Context>())); // sv
X_mu->template mutable_data<ParamT, Context>(), // sm
X_rsig->template mutable_data<ParamT, Context>())); // sv
} else {
CUDNN_CHECK(cudnnBatchNormalizationForwardInference(
ctx()->cudnn_handle(),
......@@ -58,10 +58,10 @@ void CuDNNBatchNormOp<Context>::DoRunWithType() {
input_desc_,
Output(0)->template mutable_data<T, Context>(), // y
bn_desc_,
Input(1).template data<ParamType, Context>(), // gamma
Input(2).template data<ParamType, Context>(), // beta
Input(3).template data<ParamType, Context>(), // rm
Input(4).template data<ParamType, Context>(), // rv
Input(1).template data<ParamT, Context>(), // gamma
Input(2).template data<ParamT, Context>(), // beta
Input(3).template data<ParamT, Context>(), // rm
Input(4).template data<ParamT, Context>(), // rv
epsilon_));
}
}
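A note on the exponentialAverageFactor passed to cuDNN above: cuDNN blends running statistics as running = (1 - factor) * running + factor * batch, so 1.f - momentum() keeps the same decay convention as the non-cuDNN path, and 0.f during recomputation leaves the running buffers untouched. A minimal sketch of that convention (illustrative only):

def cudnn_running_update(running, batch, factor):
    # factor = 1 - momentum; factor = 0 keeps `running` unchanged.
    return (1.0 - factor) * running + factor * batch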
......@@ -82,7 +82,7 @@ void CuDNNBatchNormOp<Context>::RunOnDevice() {
template <class Context>
template <typename T>
void CuDNNBatchNormGradientOp<Context>::TrainingImpl() {
using ParamType = typename CuDNNType<T>::BNParamType;
using ParamT = typename CuDNNType<T>::BNParamType;
auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
auto *X_mu = Buffer("X_mu"), *X_rsig = Buffer("X_rsig");
......@@ -111,12 +111,12 @@ void CuDNNBatchNormGradientOp<Context>::TrainingImpl() {
input_desc_,
Output(0)->template mutable_data<T, Context>(), // dx
bn_desc_,
Input(1).template data<ParamType, Context>(), // gamma
dW->Reshape({C_})->template mutable_data<ParamType, Context>(), // dw
dB->Reshape({C_})->template mutable_data<ParamType, Context>(), // db
Input(1).template data<ParamT, Context>(), // gamma
dW->Reshape({C_})->template mutable_data<ParamT, Context>(), // dw
dB->Reshape({C_})->template mutable_data<ParamT, Context>(), // db
epsilon_,
X_mu->template data<ParamType, Context>(), // mu
X_rsig->template data<ParamType, Context>())); // rsig
X_mu->template data<ParamT, Context>(), // mu
X_rsig->template data<ParamT, Context>())); // rsig
}
template <class Context>
......
......@@ -8,9 +8,9 @@ namespace dragon {
template <class Context>
template <typename T>
void GroupNormOp<Context>::DoRunWithType() {
using ParamType = typename math::utils::AccmulatorType<T>::type;
TENSOR_FILL_WITH_TYPE(Input(1), vec64_t({C_}), ParamType);
TENSOR_FILL_WITH_TYPE(Input(2), vec64_t({C_}), ParamType);
using ParamT = typename math::utils::AccmulatorType<T>::type;
TENSOR_FILL_WITH_TYPE(Input(1), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(2), vec64_t({C_}), ParamT);
auto* X_mu = Buffer("X_mu")->Reshape({N_, G_});
auto* X_rsig = Buffer("X_rsig")->Reshape({N_, G_});
......@@ -18,8 +18,8 @@ void GroupNormOp<Context>::DoRunWithType() {
auto* X_bias = Buffer("X_bias")->Reshape({N_, C_});
auto* x = Input(0).template data<T, Context>();
auto* mu = X_mu->template mutable_data<ParamType, Context>();
auto* rsig = X_rsig->template mutable_data<ParamType, Context>();
auto* mu = X_mu->template mutable_data<ParamT, Context>();
auto* rsig = X_rsig->template mutable_data<ParamT, Context>();
// Compute the moments
if (data_format() == "NCHW") {
......@@ -45,10 +45,10 @@ void GroupNormOp<Context>::DoRunWithType() {
x,
mu,
rsig,
Input(1).template data<ParamType, Context>(), // gamma
Input(2).template data<ParamType, Context>(), // beta
X_scale->template mutable_data<ParamType, Context>(),
X_bias->template mutable_data<ParamType, Context>(),
Input(1).template data<ParamT, Context>(), // gamma
Input(2).template data<ParamT, Context>(), // beta
X_scale->template mutable_data<ParamT, Context>(),
X_bias->template mutable_data<ParamT, Context>(),
Output(0)->template mutable_data<T, Context>(),
ctx());
}
......@@ -63,7 +63,7 @@ void GroupNormOp<Context>::RunOnDevice() {
template <class Context>
template <typename T>
void GroupNormGradientOp<Context>::DoRunWithType() {
using ParamType = typename math::utils::AccmulatorType<T>::type;
using ParamT = typename math::utils::AccmulatorType<T>::type;
auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
auto *X_mu = Buffer("X_mu"), *X_rsig = Buffer("X_rsig");
......@@ -78,14 +78,14 @@ void GroupNormGradientOp<Context>::DoRunWithType() {
S_,
data_format(),
Input(0).template data<T, Context>(), // x
X_mu->template data<ParamType, Context>(),
X_rsig->template data<ParamType, Context>(),
Input(1).template data<ParamType, Context>(), // gamma
X_mu->template data<ParamT, Context>(),
X_rsig->template data<ParamT, Context>(),
Input(1).template data<ParamT, Context>(), // gamma
Input(2).template data<T, Context>(), // dy
X_scale->template mutable_data<ParamType, Context>(),
X_bias->template mutable_data<ParamType, Context>(),
dW->Reshape({C_})->template mutable_data<ParamType, Context>(),
dB->Reshape({C_})->template mutable_data<ParamType, Context>(),
X_scale->template mutable_data<ParamT, Context>(),
X_bias->template mutable_data<ParamT, Context>(),
dW->Reshape({C_})->template mutable_data<ParamT, Context>(),
dB->Reshape({C_})->template mutable_data<ParamT, Context>(),
dX->template mutable_data<T, Context>(),
ctx());
}
......
......@@ -183,7 +183,7 @@ def conv_spec(args, inputs, outputs):
out_size = (in_size + pad_size - k_size) // s + 1
else:
out_size = (in_size + s - 1) // s
except IndexError:
except (IndexError, TypeError):
out_size = None
out_shape[i + spatial_axis] = out_size
except (TypeError, IndexError):
......@@ -205,6 +205,12 @@ def conv_transpose_spec(args, inputs, outputs):
else:
out_shape[channel_axis] = inputs[1].shape[1]
for i in range(num_axes):
if 'output_padding_desc' in args or \
'output_padding_descs' in args or \
'output_shape_desc' in args or \
'output_shape_descs' in args:
out_shape[i + spatial_axis] = None
continue
try:
k = args['kernel_shape'][i]
s = args['strides'][i]
......@@ -219,9 +225,9 @@ def conv_transpose_spec(args, inputs, outputs):
else:
if 'output_shape' in args and args['output_shape']:
out_size = args['output_shape'][i]
else:
out_size = None
except IndexError:
if 'output_padding' in args and args['output_padding']:
out_size += args['output_padding'][i]
except (IndexError, TypeError):
out_size = None
out_shape[i + spatial_axis] = out_size
except (TypeError, IndexError):
......@@ -296,23 +302,28 @@ def eltwise_loss_spec(args, inputs, outputs):
@register('Expand')
def expand_spec(args, inputs, outputs):
outputs[0].dtype = inputs[0].dtype
shape, out_shape = args['dims'], None
if shape is None:
return outputs
try:
in_shape, out_shape = list(inputs[0].shape[:]), list(shape[:])
if len(shape) < len(in_shape):
num_keep = len(in_shape) - len(shape)
out_shape = None
if 'dims_descs' in args:
out_shape = [None] * len(args['dims_descs'])
elif 'dims_desc' in args:
out_shape = [None] * len(inputs[0].shape)
elif 'dims' in args:
in_shape = list(inputs[0].shape[:])
dims = args['dims']
out_shape = list(dims[:])
if len(dims) < len(in_shape):
num_keep = len(in_shape) - len(dims)
out_shape = in_shape[:num_keep] + out_shape
elif len(shape) > len(in_shape):
num_expand = len(shape) - len(in_shape)
elif len(dims) > len(in_shape):
num_expand = len(dims) - len(in_shape)
in_shape = [1] * num_expand + in_shape
for i, dim in enumerate(out_shape):
if dim is not None and dim < 0:
out_shape[i] = in_shape[i]
outputs[0].shape = out_shape
except TypeError:
pass
except (KeyError, TypeError):
outputs[0].shape = None
return outputs
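A hedged mirror of the static-shape branch above, handy for checking the broadcasting rule by hand (a hypothetical helper; assumes fully known integer dims, where a negative entry keeps the input dimension):

def expand_shape(in_shape, dims):
    out = list(dims)
    if len(dims) < len(in_shape):
        # Shorter dims: keep the leading input dimensions.
        out = list(in_shape[:len(in_shape) - len(dims)]) + out
    elif len(dims) > len(in_shape):
        # Longer dims: left-pad the input shape with ones.
        in_shape = [1] * (len(dims) - len(in_shape)) + list(in_shape)
    return [in_shape[i] if d < 0 else d for i, d in enumerate(out)]

# expand_shape((2, 3, 4), (4,))     -> [2, 3, 4]
# expand_shape((3, 4), (2, -1, 4))  -> [2, 3, 4]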
......@@ -330,8 +341,7 @@ def expand_dims_spec(args, inputs, outputs):
out_shape[axis] = -1
j = 0
for i in range(out_rank):
if out_shape[i] is not None and \
out_shape[i] < 0:
if out_shape[i] is not None and out_shape[i] < 0:
out_shape[i] = 1
else:
if j >= len(inputs[0].shape):
......@@ -358,6 +368,8 @@ def fill_spec(args, inputs, outputs):
try:
if 'dims' in args:
outputs[0].shape = args['dims'][:]
elif 'dims_descs' in args:
outputs[0].shape = [None] * len(args['dims_descs'])
else:
outputs[0].shape = inputs[0].shape[:]
except (TypeError, KeyError, IndexError):
......@@ -432,18 +444,20 @@ def fully_connected_spec(args, inputs, outputs):
@register('ChannelNormalize')
def channel_normalize_spec(args, inputs, outputs):
outputs[0].dtype = args['dtype']
perm = args['perm']
if 'perm_desc' in args or 'perm_descs' in args:
return outputs
try:
out_shape = list(inputs[0].shape[:])
if 'perm' in args:
perm = args['perm']
if perm is None:
perm = list(range((len(inputs[0].shape))))
perm = list(range(len(inputs[0].shape)))
out_shape = list(inputs[0].shape[:])
for i, axis in enumerate(perm):
out_shape[i] = inputs[0].shape[axis]
except (TypeError, IndexError):
out_shape = None
else:
out_shape = [None] * len(out_shape)
outputs[0].shape = out_shape
except (TypeError, IndexError):
outputs[0].shape = None
return outputs
......@@ -497,37 +511,45 @@ def masked_select_spec(args, inputs, outputs):
def matmul_spec(args, inputs, outputs):
outputs[0].dtype = inputs[0].dtype
ta, tb = args['transA'], args['transB']
out_shape = None
try:
b_shape = list(inputs[1].shape[:])
a_shape = out_shape = list(inputs[0].shape[:])
out_shape[-2] = a_shape[-1] if ta else a_shape[-2]
out_shape[-1] = b_shape[-2] if tb else b_shape[-1]
except TypeError:
pass
except (TypeError, IndexError):
out_shape = None
outputs[0].shape = out_shape
return outputs
@register('Moments')
def moments_spec(args, inputs, outputs):
outputs[0].dtype = outputs[1].dtype = \
inputs[0].dtype if inputs[0].dtype == 'float64' else 'float32'
out_dtype = 'float32'
if inputs[0].dtype == 'float64':
out_dtype = 'float64'
elif inputs[0].dtype == 'int64':
out_dtype = 'float64'
outputs[0].dtype = outputs[1].dtype = out_dtype
axes, keep_dims = args['axes'], args['keep_dims']
try:
out_shape = list(inputs[0].shape[:])
for axis in axes:
if axis < len(out_shape):
out_shape[axis] = 1
out_shape[axis] = -1
if not keep_dims:
squeezed_shape = []
for d in out_shape:
if d != 1:
if d >= 0:
squeezed_shape.append(d)
out_shape = squeezed_shape
else:
out_shape = [1 if d < 0 else d for d in out_shape]
except TypeError:
if axes is None:
out_shape = (1,) if keep_dims else ()
else:
out_shape = None
outputs[0].shape = outputs[1].shape = out_shape if axes else ()
outputs[0].shape = outputs[1].shape = out_shape
return outputs
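A small mirror of the shape rule above (a hypothetical helper; assumes integer dims), showing why reduced axes are first marked with -1 and then either replaced by 1 or squeezed out:

def reduced_shape(in_shape, axes, keep_dims):
    if axes is None:
        return (1,) if keep_dims else ()
    out = list(in_shape)
    for axis in axes:
        if axis < len(out):
            out[axis] = -1
    if keep_dims:
        return tuple(1 if d < 0 else d for d in out)
    return tuple(d for d in out if d >= 0)

# reduced_shape((2, 3, 4), [1], keep_dims=True)   -> (2, 1, 4)
# reduced_shape((2, 3, 4), [1], keep_dims=False)  -> (2, 4)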
......@@ -535,10 +557,11 @@ def moments_spec(args, inputs, outputs):
def multinomial_spec(args, inputs, outputs):
outputs[0].dtype = 'int64'
try:
outputs[0].shape = inputs[0].shape[:]
outputs[0].shape[-1] = args['num_samples']
out_shape = list(inputs[0].shape[:])
out_shape[-1] = args['num_samples']
except TypeError:
pass
out_shape = None
outputs[0].shape = out_shape
return outputs
......@@ -584,11 +607,8 @@ def pad_spec(args, inputs, outputs):
@register('Permutation')
def permutation_spec(args, inputs, outputs):
outputs[0].dtype = args['dtype']
if len(inputs) == 1:
try:
outputs[0].shape = inputs[0].shape[:]
except TypeError:
pass
if 'limit_desc' in args:
outputs[0].shape = (None,)
else:
outputs[0].shape = (args['limit'],)
return outputs
......@@ -599,7 +619,7 @@ def pool_spec(args, inputs, outputs):
outputs[0].dtype = inputs[0].dtype
out_shape = None
try:
out_shape = inputs[0].shape[:]
out_shape = list(inputs[0].shape[:])
num_axes = len(out_shape) - 2
spatial_axis = 2 if args['data_format'] == 'NCHW' else 1
for i in range(num_axes):
......@@ -615,13 +635,13 @@ def pool_spec(args, inputs, outputs):
out_size = floor_or_ceil(out_size)
else:
out_size = math.ceil(float(in_size) / float(s))
except IndexError:
except TypeError:
out_size = None
out_shape[i + spatial_axis] = out_size
else:
out_shape[i + spatial_axis] = 1
except (TypeError, IndexError):
pass
out_shape = None
outputs[0].shape = out_shape
return outputs
......@@ -641,7 +661,7 @@ def range_spec(args, inputs, outputs):
start, limit, delta = slice_args
try:
outputs[0].shape = (int(math.ceil((limit - start) / delta)),)
except TypeError:
except (TypeError, ZeroDivisionError):
pass
return outputs
......@@ -662,22 +682,26 @@ def reduce_spec(args, inputs, outputs):
out_shape = list(inputs[0].shape[:])
for axis in axes:
if axis < len(out_shape):
out_shape[axis] = 1
out_shape[axis] = -1
if not keep_dims:
squeezed_shape = []
for d in out_shape:
if d != 1:
if d >= 0:
squeezed_shape.append(d)
out_shape = squeezed_shape
else:
out_shape = [1 if d < 0 else d for d in out_shape]
except TypeError:
out_shape = None
outputs[0].shape = out_shape
except (TypeError, IndexError):
pass
return outputs
@register('Repeat')
def repeat_spec(args, inputs, outputs):
outputs[0].dtype = inputs[0].dtype
if 'repeats_desc' in args:
return outputs
axis, repeats = args['axis'], args['repeats']
if axis is None:
try:
......@@ -702,8 +726,8 @@ def repeat_spec(args, inputs, outputs):
@register('Reshape')
def reshape_spec(args, inputs, outputs):
outputs[0].dtype = inputs[0].dtype
shape, out_shape = args['dims'], None
try:
shape = args['dims']
out_shape = []
n_elements, n_elements_known = None, None
try:
......@@ -714,7 +738,7 @@ def reshape_spec(args, inputs, outputs):
out_shape.append(inputs[0].shape[i])
else:
out_shape.append(s)
except TypeError:
except IndexError:
out_shape = None
try:
n_elements = math_util.prod(inputs[0].shape)
......@@ -727,8 +751,11 @@ def reshape_spec(args, inputs, outputs):
out_shape[i] = n_elements // n_elements_known
except TypeError:
out_shape[i] = None
except TypeError:
pass
except (KeyError, TypeError):
if 'dims_descs' in args:
out_shape = [None] * len(args['dims_descs'])
else:
out_shape = None
outputs[0].shape = out_shape
return outputs
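For reference, the conventions handled above: a dimension of 0 copies the corresponding input dimension and a single -1 is inferred from the remaining element count. A hypothetical sketch over fully known shapes:

import math

def reshape_shape(in_shape, dims):
    # 0 copies the input dimension; -1 is inferred from the element count.
    out = [in_shape[i] if d == 0 else d for i, d in enumerate(dims)]
    if -1 in out:
        known = math.prod(d for d in out if d > 0)
        out[out.index(-1)] = math.prod(in_shape) // known
    return out

# reshape_shape((2, 3, 4), (0, -1))  -> [2, 12]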
......@@ -736,13 +763,12 @@ def reshape_spec(args, inputs, outputs):
@register('Resize')
def resize_spec(args, inputs, outputs):
outputs[0].dtype = inputs[0].dtype
if 'sizes_desc' in args or \
'sizes_descs' in args or \
'scales_desc' in args or \
'scales_descs' in args:
return outputs
try:
out_shape = list(inputs[0].shape[:])
if 'sizes_desc' in args or 'sizes_descs' in args or \
'scales_desc' in args or 'scales_descs' in args:
outputs[0].shape = [None] * len(out_shape)
return outputs
num_axes = len(out_shape) - 2
axis = len(out_shape) - 2 if args['data_format'] == 'NCHW' else 1
try:
......@@ -756,12 +782,15 @@ def resize_spec(args, inputs, outputs):
else:
out_shape[j] = args['sizes'][j]
elif args['scales'] is not None:
try:
if len(args['scales']) == 1:
out_shape[j] = int(out_shape[j] * args['scales'][0])
elif len(args['scales']) == num_axes:
out_shape[j] = int(out_shape[j] * args['scales'][i])
else:
out_shape[j] = int(out_shape[j] * args['sizes'][j])
out_shape[j] = int(out_shape[j] * args['scales'][j])
except TypeError:
out_shape[j] = None
except IndexError:
return outputs
outputs[0].shape = out_shape
......@@ -801,12 +830,10 @@ def shape_spec(args, inputs, outputs):
@register('Slice')
def slice_spec(args, inputs, outputs):
outputs[0].dtype = inputs[0].dtype
if 'starts_desc' in args or \
'starts_descs' in args or \
'sizes_desc' in args or \
'sizes_descs' in args:
if 'starts_desc' in args or 'starts_descs' in args or \
'sizes_desc' in args or 'sizes_descs' in args:
return outputs
starts, sizes = args['starts'], args['sizes']
starts, sizes = list(args['starts']), list(args['sizes'])
try:
in_shape = inputs[0].shape[:]
ndim = len(in_shape)
......@@ -834,7 +861,7 @@ def slice_spec(args, inputs, outputs):
def softmax_loss_spec(args, inputs, outputs):
outputs[0].dtype = 'float32'
axis, reduction = args['axis'], args['reduction']
if reduction != 'NONE':
if reduction.upper() != 'NONE':
outputs[0].shape = ()
else:
try:
......@@ -894,8 +921,6 @@ def split_spec(args, inputs, outputs):
axis = args['axis']
size_splits = args['size_splits']
slice_points = args['slice_points']
if slice_points is not None and len(slice_points) == 0:
slice_points = None
slice_offset = 0
for i in range(len(outputs)):
try:
......@@ -905,10 +930,7 @@ def split_spec(args, inputs, outputs):
except TypeError:
return outputs
if size_splits is not None:
try:
out_shape[axis] = size_splits[i]
except IndexError:
return outputs
elif slice_points is not None:
try:
if i < len(outputs) - 1:
......@@ -917,16 +939,16 @@ def split_spec(args, inputs, outputs):
else:
slice_dim = inputs[0].shape[axis] - slice_offset
out_shape[axis] = slice_dim
except (TypeError, IndexError):
return outputs
except TypeError:
out_shape[axis] = None
else:
try:
slice_dim = (out_shape[axis] + num_outputs - 1) // num_outputs
if i == num_outputs - 1:
slice_dim = out_shape[axis] - slice_dim * (num_outputs - 1)
out_shape[axis] = slice_dim
except (TypeError, IndexError):
return outputs
except TypeError:
out_shape[axis] = None
outputs[i].shape = out_shape
return outputs
......@@ -988,34 +1010,38 @@ def stack_spec(args, inputs, outputs):
@register('Tile')
def tile_spec(args, inputs, outputs):
outputs[0].dtype = inputs[0].dtype
repeats = args['repeats']
if repeats is not None:
try:
out_shape = list(inputs[0].shape[:])
if 'repeats' in args:
repeats = args['repeats']
for i, size in enumerate(repeats):
if i < len(out_shape):
try:
out_shape[i] *= size
except TypeError:
out_shape[i] = None
else:
out_shape = [None] * len(out_shape)
outputs[0].shape = out_shape
except TypeError:
pass
except (KeyError, TypeError):
outputs[0].shape = None
return outputs
@register('Transpose')
def transpose_spec(args, inputs, outputs):
outputs[0].dtype = inputs[0].dtype
if 'perm_desc' in args or 'perm_descs' in args:
return outputs
try:
out_shape = list(inputs[0].shape[:])
if 'perm' in args:
perm = args['perm']
if perm is None:
perm = list(range(len(inputs[0].shape) - 1, -1, -1))
out_shape = list(inputs[0].shape[:])
for i, axis in enumerate(perm):
out_shape[i] = inputs[0].shape[axis]
else:
out_shape = [None] * len(out_shape)
outputs[0].shape = out_shape
except (TypeError, IndexError):
outputs[0].shape = None
......
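A minimal mirror of the permutation rule above (a hypothetical helper; assumes static dims, and a missing perm reverses the axes):

def transpose_shape(in_shape, perm=None):
    perm = perm if perm is not None else range(len(in_shape) - 1, -1, -1)
    return tuple(in_shape[axis] for axis in perm)

# transpose_shape((2, 3, 4))             -> (4, 3, 2)
# transpose_shape((2, 3, 4), (0, 2, 1))  -> (2, 4, 3)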
......@@ -58,7 +58,7 @@ def dropout(inputs, ratio=0.5, **kwargs):
if context.executing_eagerly():
return op_lib \
.instantiate() \
.apply([inputs], ratio, inplace=inplace)
.apply([inputs], args['ratio'], inplace=inplace)
else:
return op_lib.blend(**args)
......@@ -103,7 +103,7 @@ def drop_block2d(inputs, ratio=0.1, block_size=7, data_format='NCHW', **kwargs):
.instantiate(
block_size=block_size,
data_format=data_format,
).apply([inputs], ratio, inplace=inplace)
).apply([inputs], args['ratio'], inplace=inplace)
else:
return op_lib.blend(**args)
......@@ -137,7 +137,7 @@ def drop_path(inputs, ratio=0.2, **kwargs):
if context.executing_eagerly():
return op_lib \
.instantiate() \
.apply([inputs], ratio, inplace=inplace)
.apply([inputs], args['ratio'], inplace=inplace)
else:
return op_lib.blend(**args)
......
......@@ -205,9 +205,8 @@ def broadcast_to(inputs, shape, **kwargs):
op_lib = array_ops_lib.Expand
if context.executing_eagerly():
return op_lib \
.instantiate(
ndim=len(args['dims']),
).apply([inputs], args['dims'])
.instantiate(ndim=len(args['dims'])) \
.apply([inputs], args['dims'])
else:
return op_lib.blend(**args)
......@@ -1163,6 +1162,7 @@ def pad(inputs, pads, mode='constant', value=0, **kwargs):
return op_lib.blend(**args)
@ArgHelper.desc('limit', as_target=True)
def permutation(limit, dtype='int64', **kwargs):
r"""Return a tensor with value in the permuted range.
......@@ -1174,7 +1174,7 @@ def permutation(limit, dtype='int64', **kwargs):
Parameters
----------
limit: number
limit : Union[number, dragon.Tensor]
The end of interval.
dtype : str, optional, default='int64'
The optional data type.
......@@ -1192,7 +1192,7 @@ def permutation(limit, dtype='int64', **kwargs):
if context.executing_eagerly():
return op_lib \
.instantiate(dtype=dtype) \
.apply(limit, trainable=trainable)
.apply(args['limit'], trainable=trainable)
else:
return op_lib.blend(**args)
......
......@@ -49,10 +49,11 @@ def assign(inputs, starts=None, sizes=None, **kwargs):
inputs[1] = ops.scalar_to_tensor(inputs[1], inputs[0].dtype)
op_lib = control_flow_ops_lib.Assign
if context.executing_eagerly():
starts = args['starts'] if starts is not None else [0]
sizes = args['sizes'] if sizes is not None else [-1]
return op_lib \
.instantiate(
ndim=len(starts) if starts is not None else 0,
).apply(inputs, starts, sizes, inplace=inplace)
.instantiate(ndim=len(starts)) \
.apply(inputs, starts, sizes, inplace=inplace)
else:
return op_lib.blend(**args)
......
......@@ -23,6 +23,7 @@ from dragon.core.util import nest
@OpSchema.num_inputs(5)
@ArgHelper.desc('momentum', as_target=False)
def batch_norm(
inputs,
axis=-1,
......@@ -40,7 +41,8 @@ def batch_norm(
The running average of statistics is calculated as:
.. math:: x_{\text{running}} = \text{momentum} * x_{\text{running}} + (1 - \text{momentum}) * x_{\text{stat}}
.. math:: x_{\text{running}} = \text{momentum} * x_{\text{running}} +
(1 - \text{momentum}) * x_{\text{batch}}
Parameters
----------
......@@ -48,8 +50,8 @@ def batch_norm(
The tensor ``x``, ``gamma``, ``beta``, ``mean`` and ``var``.
axis : int, optional, default=-1
The channel axis.
momentum : float, optional, default=0.9
The momentum for running average.
momentum : Union[float, dragon.Tensor], optional
The value to :math:`\text{momentum}`.
epsilon : float, optional, default=1e-5
The value to :math:`\epsilon`.
use_stats : int, optional, default=-1
......@@ -62,16 +64,15 @@ def batch_norm(
"""
args = ArgHelper.parse(locals())
args['momentum'], args['epsilon'] = float(momentum), float(epsilon)
args['epsilon'] = float(epsilon)
op_lib = normalization_ops_lib.BatchNorm
if context.executing_eagerly():
return op_lib \
.instantiate(
axis=axis,
momentum=args['momentum'],
epsilon=args['epsilon'],
use_stats=use_stats,
).apply(inputs)
).apply(inputs, args['momentum'])
else:
return op_lib.blend(**args)
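A hedged usage sketch of the descriptor-based momentum enabled above (graph mode; shapes are illustrative and the dragon.nn.batch_norm entry point is assumed to map to this wrapper):

import dragon

with dragon.graph_mode():
    x = dragon.Tensor((2, 3, 4, 4))
    gamma, beta = dragon.Tensor((3,)), dragon.Tensor((3,))
    mean, var = dragon.Tensor((3,)), dragon.Tensor((3,))
    # momentum may be a float or a tensor; a tensor is routed through the
    # '${HANDLE}/momentum' descriptor at run time instead of a static argument.
    decay = dragon.Tensor(())
    y = dragon.nn.batch_norm([x, gamma, beta, mean, var], axis=1, momentum=decay)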
......@@ -304,6 +305,7 @@ def local_response_norm(
@OpSchema.num_inputs(5)
@ArgHelper.desc('momentum', as_target=False)
def sync_batch_norm(
inputs,
axis=-1,
......@@ -322,7 +324,8 @@ def sync_batch_norm(
The running average of statistics is calculated as:
.. math:: x_{\text{running}} = \text{momentum} * x_{\text{running}} + (1 - \text{momentum}) * x_{\text{stat}}
.. math:: x_{\text{running}} = \text{momentum} * x_{\text{running}} +
(1 - \text{momentum}) * x_{\text{batch}}
Parameters
----------
......@@ -330,8 +333,8 @@ def sync_batch_norm(
The tensor ``x``, ``gamma``, ``beta``, ``mean`` and ``var``.
axis : int, optional, default=-1
The channel axis.
momentum : float, optional, default=0.9
The momentum for average.
momentum : Union[float, dragon.Tensor], optional
The value to :math:`\text{momentum}`.
epsilon : float, optional, default=1e-5
The value to :math:`\epsilon`.
use_stats : int, optional, default=-1
......@@ -346,7 +349,7 @@ def sync_batch_norm(
"""
args = ArgHelper.parse(locals())
args['momentum'], args['epsilon'] = float(momentum), float(epsilon)
args['epsilon'] = float(epsilon)
if process_group is None:
process_group = distributed.get_group()
if process_group is None:
......@@ -356,11 +359,10 @@ def sync_batch_norm(
return op_lib \
.instantiate(
axis=axis,
momentum=args['momentum'],
epsilon=args['epsilon'],
use_stats=use_stats,
process_group=process_group,
).apply(inputs)
).apply(inputs, args['momentum'])
else:
args.update(process_group.arguments)
return op_lib.blend(**args)
......@@ -23,7 +23,6 @@ class BatchNorm(Operator):
def __init__(self, key, dev, **kwargs):
super(BatchNorm, self).__init__(key, dev, **kwargs)
self.axis = kwargs.get('axis', -1)
self.momentum = kwargs.get('momentum', 0.9)
self.epsilon = kwargs.get('epsilon', 1e-5)
self.use_stats = kwargs.get('use_stats', 0)
if self.use_stats not in (0, 1):
......@@ -34,14 +33,21 @@ class BatchNorm(Operator):
'op_type': 'BatchNorm',
'arguments': {
'axis': self.axis,
'momentum': self.momentum,
'epsilon': self.epsilon,
'use_stats': self.use_stats,
'momentum_desc': '${HANDLE}/momentum',
},
}
def forward(self, inputs):
return self.dispatch(inputs, [self.alloc()])
def setup(self, ws, handle, momentum):
self.feed_arg(ws, '%s/momentum' % handle, momentum, 'float32')
def forward(self, inputs, momentum):
return self.dispatch(
inputs, [self.alloc()],
callback=lambda ws, handle:
self.setup(ws, handle, momentum),
)
class GroupNorm(Operator):
......
......@@ -118,6 +118,7 @@ class ArgHelper(object):
if 'extra_inputs' not in arguments:
arguments['extra_inputs'] = []
arguments['extra_inputs'] += [arg]
if name in arguments:
arguments.pop(name)
arguments[name + '_desc'] = arg.id
return arguments
......@@ -141,5 +142,6 @@ class ArgHelper(object):
descs.append(ele.id)
else:
descs.append(Tensor.from_value(ele, dtype, 'DescConst').id)
if name in arguments:
arguments.pop(name)
arguments[name + '_descs'] = descs
......@@ -176,9 +176,12 @@ def conv2d_transpose(
raise ValueError('Unsupported padding algorithm: %s' % padding)
if data_format not in ('NCHW', 'NHWC'):
raise ValueError('Unsupported data format: %s' % data_format)
if 'SAME' in padding and output_shape is None:
raise ValueError('Expected <output_shape> for same padding.')
if output_shape is not None and 'SAME' not in padding:
args['padding'] = 'SAME'
for key in ('kernel_shape', 'strides', 'pads', 'dilations'):
if key in args and args[key] is not None:
if key == 'pads':
args[key] = _normalize_pads(args[key], 2)
else:
......
......@@ -26,7 +26,7 @@ def dropout_exporter(op_def, context):
drop_ratio = arg.f
elif arg.name == 'prob_desc':
drop_ratio = helper.fetch_argument(op_def, arg, context.ws)
helper.add_attribute(node, 'ratio', drop_ratio)
helper.add_attribute(node, 'ratio', float(drop_ratio))
return node, const_tensors
......
......@@ -26,6 +26,9 @@ def batch_norm_exporter(op_def, context):
helper.add_attribute(node, 'epsilon', arg.f)
elif arg.name == 'momentum':
helper.add_attribute(node, 'momentum', arg.f)
elif arg.name == 'momentum_desc':
momentum = helper.fetch_argument(op_def, arg, context.ws)
helper.add_attribute(node, 'momentum', float(momentum))
# Weight, bias, running mean and running variance
const_tensors = [helper.from_tensor(e, context.ws) for e in op_def.input[1:]]
return node, const_tensors
......
......@@ -123,23 +123,51 @@ CONVERSIONS_DECL float16 To<float16, half>(half val) {
}
template <>
CONVERSIONS_DECL half To<half, float>(float val) {
return __float2half(val);
CONVERSIONS_DECL half To<half, float16>(float16 val) {
return __half_raw{val.x};
}
template <>
CONVERSIONS_DECL half To<half, float16>(float16 val) {
return __half_raw{val.x};
CONVERSIONS_DECL half2 To<half2, float16>(float16 val) {
return half2(__half2_raw{val.x, val.x});
}
template <>
CONVERSIONS_DECL half2 To<half2, float>(float val) {
return __float2half2_rn(val);
CONVERSIONS_DECL half To<half, float>(float val) {
#if CUDA_VERSION_MIN(9, 2, 0)
return __float2half(val);
#else
#if defined(__CUDA_ARCH__)
#define __HALF_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
__half ret;
asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(ret)) : "f"(val));
return ret;
#undef __HALF_TO_US
#else
return To<half>(To<float16>(val));
#endif
#endif
}
template <>
CONVERSIONS_DECL half2 To<half2, float16>(float16 val) {
return half2(__half2_raw{val.x, val.x});
CONVERSIONS_DECL half2 To<half2, float>(float val) {
#if CUDA_VERSION_MIN(9, 2, 0)
return __float2half2_rn(val);
#else
#if defined(__CUDA_ARCH__)
#define __HALF2_TO_UI(var) *(reinterpret_cast<unsigned int*>(&(var)))
__half2 ret;
asm("{.reg .f16 low;\n"
" cvt.rn.f16.f32 low, %1;\n"
" mov.b32 %0, {low,low};}\n"
: "=r"(__HALF2_TO_UI(ret))
: "f"(val));
return ret;
#undef __HALF2_TO_UI
#else
return To<half2>(To<float16>(val));
#endif
#endif
}
#endif // USE_CUDA
......
......@@ -162,23 +162,17 @@ __global__ void _InvStd(const int n, const T eps, const T* x, T* y) {
}
}
template <>
__global__ void
_InvStd<half>(const int n, const half eps, const half* x, half* y) {
__global__ void _InvStd(const int n, const float eps, const half* x, half* y) {
CUDA_1D_KERNEL_LOOP(i, n) {
#if __CUDA_ARCH__ >= 530
y[i] = hrsqrt(__hadd(x[i], eps));
#endif
y[i] = __float2half(rsqrt(__half2float(x[i]) + eps));
}
}
template <>
__global__ void
_InvStd<half2>(const int n, const half2 eps, const half2* x, half2* y) {
_InvStd(const int n, const float eps, const half2* x, half2* y) {
CUDA_1D_KERNEL_LOOP(i, n) {
#if __CUDA_ARCH__ >= 530
y[i] = h2rsqrt(__hadd2(x[i], eps));
#endif
const float2 val = __half22float2(x[i]);
y[i] = __floats2half2_rn(rsqrt(val.x + eps), rsqrt(val.y + eps));
}
}
......@@ -206,19 +200,15 @@ __global__ void _Powx(const int n, const T exponent, const T* x, T* y) {
__global__ void
_Powx(const int n, const float exponent, const half* x, half* y) {
CUDA_1D_KERNEL_LOOP(i, n) {
#if __CUDA_ARCH__ >= 530
y[i] = __float2half(pow(__half2float(x[i]), exponent));
#endif
}
}
__global__ void
_Powx(const int n, const float exponent, const half2* x, half2* y) {
CUDA_1D_KERNEL_LOOP(i, n) {
#if __CUDA_ARCH__ >= 530
const float2 val = __half22float2(x[i]);
y[i] = __floats2half2_rn(pow(val.x, exponent), pow(val.y, exponent));
#endif
}
}
......@@ -269,20 +259,16 @@ __global__ void _Square(const int n, const T* x, T* y) {
template <typename T>
__global__ void _NotZero(const int nthreads, const T* x, bool* y) {
const T kZero = T(0);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = x[i] != kZero ? true : false;
y[i] = x[i] != T(0) ? true : false;
}
}
template <>
__global__ void _NotZero<half>(const int nthreads, const half* x, bool* y) {
#if __CUDA_ARCH__ >= 530
const half kZero = __float2half(0.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = __hne(x[i], kZero) ? true : false;
y[i] = __half2float(x[i]) != 0.f ? true : false;
}
#endif
}
template <typename T>
......@@ -560,15 +546,12 @@ DRAGON_API void InvStd<float16, CUDAContext>(
if ((n & 1) == 0) {
_InvStd<<<CUDA_BLOCKS(n >> 1), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
n >> 1,
convert::To<half2>(eps),
eps,
reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
} else {
_InvStd<<<CUDA_BLOCKS(n), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
n,
convert::To<half>(eps),
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
n, eps, reinterpret_cast<const half*>(x), reinterpret_cast<half*>(y));
}
}
......
......@@ -26,7 +26,7 @@ namespace math {
template <typename T>
struct MaxFunctor {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs < rhs ? rhs : lhs;
}
......@@ -39,7 +39,7 @@ struct MaxFunctor {
template <>
struct MaxFunctor<float16> {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530
......@@ -62,7 +62,7 @@ struct MaxFunctor<float16> {
#endif
};
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
template <>
struct MaxFunctor<half> {
inline __device__ half operator()(const half& lhs, const half& rhs) const {
......@@ -87,7 +87,7 @@ struct MaxFunctor<half2> {
template <typename T>
struct MinFunctor {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs < rhs ? lhs : rhs;
}
......@@ -100,7 +100,7 @@ struct MinFunctor {
template <>
struct MinFunctor<float16> {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530
......@@ -123,7 +123,7 @@ struct MinFunctor<float16> {
#endif
};
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
template <>
struct MinFunctor<half> {
inline __device__ half operator()(const half& lhs, const half& rhs) const {
......@@ -148,7 +148,7 @@ struct MinFunctor<half2> {
template <typename T>
struct PlusFunctor {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs + rhs;
}
......@@ -161,7 +161,7 @@ struct PlusFunctor {
template <>
struct PlusFunctor<float16> {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530
......@@ -183,7 +183,7 @@ struct PlusFunctor<float16> {
#endif
};
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
template <>
struct PlusFunctor<half> {
inline __device__ half operator()(const half& lhs, const half& rhs) const {
......@@ -211,7 +211,7 @@ struct PlusFunctor<half2> {
template <typename T>
struct MinusFunctor {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs - rhs;
}
......@@ -224,7 +224,7 @@ struct MinusFunctor {
template <>
struct MinusFunctor<float16> {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530
......@@ -246,7 +246,7 @@ struct MinusFunctor<float16> {
#endif
};
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
template <>
struct MinusFunctor<half> {
inline __device__ half operator()(const half& lhs, const half& rhs) const {
......@@ -274,7 +274,7 @@ struct MinusFunctor<half2> {
template <typename T>
struct MultipliesFunctor {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs * rhs;
}
......@@ -287,7 +287,7 @@ struct MultipliesFunctor {
template <>
struct MultipliesFunctor<float16> {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530
......@@ -309,7 +309,7 @@ struct MultipliesFunctor<float16> {
#endif
};
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
template <>
struct MultipliesFunctor<half> {
inline __device__ half operator()(const half& lhs, const half& rhs) const {
......@@ -337,7 +337,7 @@ struct MultipliesFunctor<half2> {
template <typename T>
struct DividesFunctor {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs / rhs;
}
......@@ -350,7 +350,7 @@ struct DividesFunctor {
template <>
struct DividesFunctor<float16> {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530
......@@ -372,7 +372,7 @@ struct DividesFunctor<float16> {
#endif
};
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
template <>
struct DividesFunctor<half> {
inline __device__ half operator()(const half& lhs, const half& rhs) const {
......@@ -396,7 +396,7 @@ struct DividesFunctor<half2> {
template <typename T>
struct PowFunctor {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return pow(lhs, rhs);
}
......@@ -409,7 +409,7 @@ struct PowFunctor {
template <>
struct PowFunctor<float16> {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const {
half ret = __float2half(
......@@ -425,7 +425,7 @@ struct PowFunctor<float16> {
#endif
};
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
template <>
struct PowFunctor<half> {
inline __device__ half operator()(const half& lhs, const half& rhs) const {
......@@ -449,7 +449,7 @@ struct PowFunctor<half2> {
template <typename T>
struct EqualFunctor {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs == rhs;
}
......@@ -462,7 +462,7 @@ struct EqualFunctor {
template <>
struct EqualFunctor<float16> {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
#if __CUDA_ARCH__ >= 530
......@@ -481,7 +481,7 @@ struct EqualFunctor<float16> {
#endif
};
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
template <>
struct EqualFunctor<half> {
inline __device__ bool operator()(const half& lhs, const half& rhs) const {
......@@ -496,7 +496,7 @@ struct EqualFunctor<half> {
template <typename T>
struct NotEqualFunctor {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs != rhs;
}
......@@ -509,7 +509,7 @@ struct NotEqualFunctor {
template <>
struct NotEqualFunctor<float16> {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
#if __CUDA_ARCH__ >= 530
......@@ -528,7 +528,7 @@ struct NotEqualFunctor<float16> {
#endif
};
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
template <>
struct NotEqualFunctor<half> {
inline __device__ bool operator()(const half& lhs, const half& rhs) const {
......@@ -543,7 +543,7 @@ struct NotEqualFunctor<half> {
template <typename T>
struct GreaterFunctor {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs > rhs;
}
......@@ -556,7 +556,7 @@ struct GreaterFunctor {
template <>
struct GreaterFunctor<float16> {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
#if __CUDA_ARCH__ >= 530
......@@ -575,7 +575,7 @@ struct GreaterFunctor<float16> {
#endif
};
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
template <>
struct GreaterFunctor<half> {
inline __device__ bool operator()(const half& lhs, const half& rhs) const {
......@@ -590,7 +590,7 @@ struct GreaterFunctor<half> {
template <typename T>
struct LessFunctor {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs < rhs;
}
......@@ -603,7 +603,7 @@ struct LessFunctor {
template <>
struct LessFunctor<float16> {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
#if __CUDA_ARCH__ >= 530
......@@ -622,7 +622,7 @@ struct LessFunctor<float16> {
#endif
};
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
template <>
struct LessFunctor<half> {
inline __device__ bool operator()(const half& lhs, const half& rhs) const {
......@@ -637,7 +637,7 @@ struct LessFunctor<half> {
template <typename T>
struct GreaterEqualFunctor {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs >= rhs;
}
......@@ -650,7 +650,7 @@ struct GreaterEqualFunctor {
template <>
struct GreaterEqualFunctor<float16> {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
#if __CUDA_ARCH__ >= 530
......@@ -669,7 +669,7 @@ struct GreaterEqualFunctor<float16> {
#endif
};
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
template <>
struct GreaterEqualFunctor<half> {
inline __device__ bool operator()(const half& lhs, const half& rhs) const {
......@@ -684,7 +684,7 @@ struct GreaterEqualFunctor<half> {
template <typename T>
struct LessEqualFunctor {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs <= rhs;
}
......@@ -697,7 +697,7 @@ struct LessEqualFunctor {
template <>
struct LessEqualFunctor<float16> {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
#if __CUDA_ARCH__ >= 530
......@@ -716,7 +716,7 @@ struct LessEqualFunctor<float16> {
#endif
};
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
template <>
struct LessEqualFunctor<half> {
inline __device__ bool operator()(const half& lhs, const half& rhs) const {
......
......@@ -239,8 +239,8 @@ void ReduceSum<float16, CUDAContext>(
num_axes, \
axes, \
Reducer<AccT>(), \
AccT(kInit), \
AccT(scale), \
convert::To<AccT>(kInit), \
convert::To<AccT>(scale), \
x, \
y, \
ctx); \
......
......@@ -301,16 +301,16 @@ void ChannelAffine(
/* array.channel_normalize */
template <typename Tx, typename Ty, class Context>
template <typename InputT, typename OutputT, class Context>
void ChannelNormalize(
const int axis,
const int num_dims,
const int64_t* x_strides,
const int64_t* y_dims,
const Tx* x,
const InputT* x,
const float* mean,
const float* std,
Ty* y,
OutputT* y,
Context* ctx);
/* array.channel_shuffle */
......@@ -648,28 +648,28 @@ void BroadcastLossGrad(
/* loss.nll_loss */
template <typename LogitType, typename TargetType, class Context>
template <typename LogitT, typename TargetT, class Context>
void NLLLoss(
const int outer_dim,
const int inner_dim,
const int axis_dim,
const int ignore_index,
const LogitType* logit,
const TargetType* target,
LogitType* loss,
LogitType* mask,
const LogitT* logit,
const TargetT* target,
LogitT* loss,
LogitT* mask,
Context* ctx);
template <typename LogitType, typename TargetType, class Context>
template <typename LogitT, typename TargetT, class Context>
void NLLLossGrad(
const int outer_dim,
const int inner_dim,
const int axis_dim,
const int ignore_index,
const LogitType* logit,
const TargetType* target,
LogitType* dlogit,
LogitType* mask,
const LogitT* logit,
const TargetT* target,
LogitT* dlogit,
LogitT* mask,
Context* ctx);
/* loss.sigmoid_ce_loss */
......@@ -694,7 +694,7 @@ void SigmoidCrossEntropyGrad(
/* loss.sigmoid_focal_loss */
template <typename LogitType, typename TargetType, class Context>
template <typename LogitT, typename TargetT, class Context>
void SigmoidFocalLoss(
const int outer_dim,
const int inner_dim,
......@@ -703,13 +703,13 @@ void SigmoidFocalLoss(
const float neg_alpha,
const float gamma,
const int negative_index,
const LogitType* logit,
const TargetType* target,
LogitType* loss,
LogitType* mask,
const LogitT* logit,
const TargetT* target,
LogitT* loss,
LogitT* mask,
Context* ctx);
template <typename LogitType, typename TargetType, class Context>
template <typename LogitT, typename TargetT, class Context>
void SigmoidFocalLossGrad(
const int outer_dim,
const int inner_dim,
......@@ -718,10 +718,10 @@ void SigmoidFocalLossGrad(
const float neg_alpha,
const float gamma,
const int negative_index,
const LogitType* logit,
const TargetType* target,
LogitType* dlogit,
LogitType* mask,
const LogitT* logit,
const TargetT* target,
LogitT* dlogit,
LogitT* mask,
Context* ctx);
/* loss.smooth_l1_loss */
......@@ -754,28 +754,28 @@ void SoftmaxCrossEntropy(
/* loss.sparse_softmax_cross_entropy */
template <typename LogitType, typename TargetType, class Context>
template <typename LogitT, typename TargetT, class Context>
void SparseSoftmaxCrossEntropy(
const int outer_dim,
const int inner_dim,
const int axis_dim,
const int ignore_index,
const LogitType* prob,
const TargetType* target,
LogitType* loss,
LogitType* mask,
const LogitT* prob,
const TargetT* target,
LogitT* loss,
LogitT* mask,
Context* ctx);
template <typename LogitType, typename TargetType, class Context>
template <typename LogitT, typename TargetT, class Context>
void SparseSoftmaxCrossEntropyGrad(
const int outer_dim,
const int inner_dim,
const int axis_dim,
const int ignore_index,
const LogitType* prob,
const TargetType* target,
LogitType* dx,
LogitType* mask,
const LogitT* prob,
const TargetT* target,
LogitT* dx,
LogitT* mask,
Context* ctx);
/* math.abs */
......
......@@ -55,7 +55,7 @@ class BatchNormalization(Layer):
axis : int, optional, default=-1
The channel axis.
momentum : float, optional, default=0.99
The momentum of moving average.
The decay factor of the running average.
epsilon : float, optional, default=1e-3
The epsilon value.
center : bool, optional, default=True
......
......@@ -41,8 +41,8 @@ def batch_normalization(
The moving average of statistics is calculated as:
.. math::
x_{moving} \leftarrow momentum * x_{moving} + (1 - momentum) * x_{stat}
.. math:: x_{\text{running}} = \text{momentum} * x_{\text{running}} +
(1 - \text{momentum}) * x_{\text{batch}}
Parameters
----------
......@@ -58,10 +58,10 @@ def batch_normalization(
The :math:`\gamma` tensor.
axis : int, optional, default=-1
The channel axis.
momentum : float, optional, default=0.9
The momentum of moving average.
momentum : Union[float, dragon.Tensor], optional
The value to :math:`\text{momentum}`.
variance_epsilon : float, optional, default=1e-5
The value of epsilon.
The value to :math:`\epsilon`.
trainable : bool, optional, default=False
The optional training flag.
name : str, optional
......
......@@ -50,7 +50,7 @@ class BatchNorm(layer.Layer):
Parameters
----------
decay : float, optional, default=0.9
The decay factor for moving average.
The decay factor of the running average.
epsilon : float, optional, default=1e-5
The epsilon.
act : callable, optional
......
......@@ -14,6 +14,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import unittest
import dragon
......@@ -115,5 +116,537 @@ class TestFunction(unittest.TestCase):
dragon.create_function(optimizer=optimizer)()
class TestOpSpec(unittest.TestCase):
"""Test the op spec."""
sym1 = dragon.Tensor(None, None)
sym2 = dragon.Tensor((1,))
sym3 = dragon.Tensor((1, None))
sym4 = dragon.Tensor((1, None, None, None))
sym5 = dragon.Tensor((1, None, None, None, None))
def test_accuracy(self):
with dragon.graph_mode():
self.assertEqual(dragon.metrics.accuracy(
[self.sym1, self.sym1]).shape, ())
def test_arg_reduce(self):
with dragon.graph_mode():
self.assertEqual(dragon.math.argmax(
self.sym1, axis=0, keep_dims=True).shape, None)
self.assertEqual(dragon.math.argmax(
self.sym1, axis=0, keep_dims=False).shape, None)
self.assertEqual(dragon.math.argmax(
self.sym1, axis=None, keep_dims=True).shape, (1,))
self.assertEqual(dragon.math.argmax(
self.sym1, axis=None, keep_dims=False).shape, ())
self.assertEqual(dragon.math.argmax(
self.sym2, axis=0, keep_dims=True).shape, (1,))
self.assertEqual(dragon.math.argmax(
self.sym2, axis=0, keep_dims=False).shape, ())
def test_binary_ops(self):
with dragon.graph_mode():
self.assertEqual(dragon.math.add(
[self.sym1, self.sym1]).shape, None)
self.assertEqual(dragon.math.add(
[self.sym2, self.sym2]).shape, (1,))
self.assertEqual(dragon.math.add(
[self.sym2, self.sym3]).shape, (1, None))
self.assertEqual(dragon.math.add(
[self.sym3, self.sym2]).shape, (1, None))
self.assertEqual(dragon.math.equal(
[self.sym1, self.sym1]).shape, None)
def test_broadcast(self):
self.assertEqual(dragon.broadcast_to(
self.sym1, shape=(1,)).shape, None)
self.assertEqual(dragon.broadcast_to(
self.sym2, shape=(1, 2)).shape, (1, 2))
self.assertEqual(dragon.broadcast_to(
self.sym3, shape=(2,)).shape, self.sym3.shape[:-1] + (2,))
self.assertEqual(dragon.broadcast_to(
self.sym3, shape=(-1, 2, 2)).shape, (1, 2, 2))
def test_cast(self):
with dragon.graph_mode():
self.assertEqual(dragon.cast(self.sym1, 'float32').shape, None)
def test_concat(self):
with dragon.graph_mode():
self.assertEqual(dragon.concat([self.sym1, self.sym1]).shape, None)
self.assertEqual(dragon.concat([self.sym1, self.sym2]).shape, (None,))
self.assertEqual(dragon.concat([self.sym2, self.sym3], axis=0).shape, (2,))
self.assertEqual(dragon.concat([self.sym2, self.sym3], axis=1).shape, None)
def test_conv(self):
w = dragon.Tensor((3, 3, 3, 3))
with dragon.graph_mode():
self.assertEqual(dragon.nn.conv2d(
[self.sym1, self.sym1]).shape, None)
self.assertEqual(dragon.nn.conv2d(
[self.sym4, w]).shape, (self.sym4.shape[0], w.shape[0], None, None))
self.assertEqual(dragon.nn.conv2d(
[w, w], kernel_shape=1, out_channels=w.shape[0]).shape, w.shape)
self.assertEqual(dragon.nn.conv2d(
[w, w], kernel_shape=1, padding='SAME').shape, w.shape)
self.assertEqual(dragon.nn.conv2d_transpose(
[self.sym4, w], out_channels=w.shape[1]).shape,
(self.sym4.shape[0], w.shape[1], None, None))
self.assertEqual(dragon.nn.conv2d_transpose(
[w, w], output_padding=(2, 2), kernel_shape=1).shape,
(w.shape[0], w.shape[1], w.shape[2] + 2, w.shape[3] + 2))
self.assertEqual(dragon.nn.conv2d_transpose(
[w, w], output_shape=(4, 4), output_padding=(2, 2), kernel_shape=1).shape,
(w.shape[0], w.shape[1], 6, 6))
def test_depth_to_space(self):
func1 = functools.partial(dragon.nn.depth_to_space, block_size=1)
func2 = functools.partial(dragon.nn.space_to_depth, block_size=1)
with dragon.graph_mode():
for func in (func1, func2):
self.assertEqual(func(self.sym1).shape, None)
self.assertEqual(func(self.sym2).shape, None)
self.assertEqual(func(self.sym4, data_format='NCHW').shape,
(self.sym4.shape[0],) + (None,) * (len(self.sym4.shape) - 1))
self.assertEqual(func(self.sym4, data_format='NHWC').shape,
(self.sym4.shape[0],) + (None,) * (len(self.sym4.shape) - 1))
self.assertEqual(func(dragon.Tensor((1, 2, 3)), data_format='NCHW').shape,
dragon.Tensor((1, 2, 3)).shape)
self.assertEqual(func(dragon.Tensor((1, 2, 3)), data_format='NHWC').shape,
dragon.Tensor((1, 2, 3)).shape)
def test_dot(self):
with dragon.graph_mode():
self.assertEqual(dragon.math.dot(
[self.sym1, self.sym1]).shape, None)
self.assertEqual(dragon.math.dot(
[self.sym2, self.sym2]).shape, ())
self.assertEqual(dragon.math.dot(
[dragon.Tensor(()), dragon.Tensor(())]).shape, ())
self.assertEqual(dragon.math.dot(
[self.sym3, self.sym3]).shape, (self.sym3.shape[0], self.sym3.shape[1]))
self.assertEqual(dragon.math.dot(
[self.sym3, self.sym2]).shape, self.sym3.shape[:-1])
def test_eltwise_loss(self):
with dragon.graph_mode():
self.assertEqual(dragon.losses.l2_loss(
[self.sym1, self.sym1]).shape, ())
self.assertEqual(dragon.losses.l2_loss(
[self.sym1, self.sym1], reduction='none').shape, None)
def test_expand_dims(self):
with dragon.graph_mode():
self.assertEqual(dragon.expand_dims(
self.sym1, axis=1).shape, None)
self.assertEqual(dragon.expand_dims(
self.sym2, axis=1).shape, (1, 1))
self.assertEqual(dragon.expand_dims(
self.sym2, axis=-1).shape, (1, 1))
self.assertEqual(dragon.expand_dims(
self.sym3, axis=0).shape, (1, 1, None))
self.assertEqual(dragon.expand_dims(
self.sym3, axis=(0, 3)).shape, (1, 1, None, 1))
self.assertEqual(dragon.expand_dims(
self.sym3, axis=(0, 3, 5)).shape, (1, 1, None, 1))
def test_init_ops(self):
init_funcs_v1 = [dragon.fill,
dragon.ones,
dragon.random.glorot_normal,
dragon.random.glorot_uniform,
dragon.random.normal,
dragon.random.uniform,
dragon.random.truncated_normal,
dragon.zeros]
for func in init_funcs_v1:
with dragon.graph_mode():
self.assertEqual(func(shape=self.sym1.shape).shape, None)
self.assertEqual(func(shape=self.sym2.shape).shape, self.sym2.shape)
def test_flatten(self):
with dragon.graph_mode():
self.assertEqual(dragon.flatten(
self.sym1, axis=1).shape, None)
self.assertEqual(dragon.flatten(
self.sym1, keep_axes=2).shape, (None, None))
self.assertEqual(dragon.flatten(
self.sym2, keep_axes=2).shape, (1, None))
self.assertEqual(dragon.flatten(
self.sym4, keep_axes=2).shape, (1, None))
self.assertEqual(dragon.flatten(
self.sym4, axis=1, num_axes=3).shape, (1, None))
self.assertEqual(dragon.flatten(
self.sym4, axis=1, num_axes=-1).shape, (1, None))
def test_fully_connected(self):
w = dragon.Tensor((3, 2))
with dragon.graph_mode():
self.assertEqual(dragon.nn.fully_connected(
[self.sym1, w]).shape, (None, 3))
self.assertEqual(dragon.nn.fully_connected(
[self.sym1, w], transpose_w=False).shape, (None, 2))
self.assertEqual(dragon.nn.fully_connected(
[self.sym1, w], axis=-1).shape, None)
self.assertEqual(dragon.nn.fully_connected(
[self.sym1, self.sym1]).shape, (None, None))
def test_index_select(self):
with dragon.graph_mode():
self.assertEqual(dragon.index_select(
self.sym1, self.sym1).shape, None)
self.assertEqual(dragon.index_select(
self.sym1, self.sym2, axis=-1).shape, None)
self.assertEqual(dragon.index_select(
self.sym3, self.sym2, axis=1).shape, (1, 1))
def test_linspace(self):
with dragon.graph_mode():
self.assertEqual(dragon.linspace(
start=1, stop=5, num=3).shape, (3,))
self.assertEqual(dragon.linspace(
start=(1, 2), stop=(3, 4), num=3, axis=1).shape, (2, 3))
self.assertEqual(dragon.linspace(
start=(1, 2), stop=(3, 4), num=3, axis=0).shape, (3, 2))
def test_mask_select(self):
with dragon.graph_mode():
self.assertEqual(dragon.masked_select(
[self.sym1, self.sym1]).shape, (None,))
def test_matmul(self):
with dragon.graph_mode():
self.assertEqual(dragon.math.matmul(
[self.sym1, self.sym1]).shape, None)
self.assertEqual(dragon.math.matmul(
[self.sym1, self.sym2]).shape, None)
self.assertEqual(dragon.math.matmul(
[self.sym1, self.sym3]).shape, None)
self.assertEqual(dragon.math.matmul(
[self.sym2, self.sym3]).shape, None)
self.assertEqual(dragon.math.matmul(
[self.sym3, self.sym3]).shape, (1, None))
self.assertEqual(dragon.math.matmul(
[self.sym4, self.sym3]).shape, (1, None, None, None))
self.assertEqual(dragon.math.matmul(
[self.sym4, self.sym4]).shape, (1, None, None, None))
def test_moments(self):
with dragon.graph_mode():
self.assertEqual(dragon.math.moments(self.sym1)[0].shape, ())
self.assertEqual(dragon.math.moments(self.sym1, axis=0)[0].shape, None)
self.assertEqual(dragon.math.moments(self.sym1, keep_dims=True)[0].shape, (1,))
self.assertEqual(dragon.math.moments(self.sym2)[0].shape, ())
self.assertEqual(dragon.math.moments(self.sym2, axis=0)[0].shape, ())
self.assertEqual(dragon.math.moments(self.sym2, axis=1)[0].shape, (1,))
self.assertEqual(dragon.math.moments(self.sym2, axis=0, keep_dims=True)[0].shape, (1,))
self.assertEqual(dragon.math.moments(dragon.Tensor(None, 'float64'))[0].dtype, 'float64')
self.assertEqual(dragon.math.moments(dragon.Tensor(None, 'int64'))[0].dtype, 'float64')
def test_multinomial(self):
with dragon.graph_mode():
self.assertEqual(dragon.random.multinomial(self.sym1).shape, None)
self.assertEqual(dragon.random.multinomial(self.sym2, num_samples=2).shape, (2,))
def test_non_zero(self):
with dragon.graph_mode():
self.assertEqual(dragon.nonzero(self.sym1).shape, None)
self.assertEqual(dragon.nonzero(self.sym2).shape, (None, 1))
def test_one_hot(self):
with dragon.graph_mode():
self.assertEqual(dragon.one_hot(self.sym1, depth=2).shape, None)
self.assertEqual(dragon.one_hot(self.sym2, depth=2).shape, (1, 2))
def test_pad(self):
with dragon.graph_mode():
self.assertEqual(dragon.pad(self.sym1, pads=[(1, 1)]).shape, None)
self.assertEqual(dragon.pad(self.sym3, pads=[(1, 1)]).shape, (3, None))
self.assertEqual(dragon.pad(self.sym3, pads=[(1, 1), (1, 1)]).shape, (3, None))
def test_permutation(self):
with dragon.graph_mode():
self.assertEqual(dragon.random.permutation(5).shape, (5,))
def test_pool(self):
func = functools.partial(dragon.nn.pool2d, kernel_shape=3, strides=1, pads=1)
with dragon.graph_mode():
self.assertEqual(func(self.sym1).shape, None)
self.assertEqual(func(self.sym3).shape, (1, None))
self.assertEqual(func(self.sym4).shape, (1, None, None, None))
self.assertEqual(func(self.sym4, global_pooling=True).shape, (1, None, 1, 1))
self.assertEqual(func(dragon.Tensor((1, 3, 4, 4))).shape, (1, 3, 4, 4))
self.assertEqual(func(dragon.Tensor((1, 3, 4, 4)), padding='SAME').shape, (1, 3, 4, 4))
def test_predicative(self):
with dragon.graph_mode():
self.assertEqual(dragon.math.is_inf(self.sym1).shape, self.sym1.shape)
self.assertEqual(dragon.math.is_inf(self.sym3).shape, self.sym3.shape)
self.assertEqual(dragon.math.is_nan(self.sym1).shape, self.sym1.shape)
self.assertEqual(dragon.math.is_nan(self.sym3).shape, self.sym3.shape)
def test_range(self):
with dragon.graph_mode():
self.assertEqual(dragon.range(3).shape, (3,))
self.assertEqual(dragon.range(3, 4).shape, (1,))
self.assertEqual(dragon.range(3, delta=0).shape, None)
def test_reduce(self):
with dragon.graph_mode():
self.assertEqual(dragon.math.sum(self.sym1).shape, ())
self.assertEqual(dragon.math.sum(self.sym1, axis=0).shape, None)
self.assertEqual(dragon.math.sum(self.sym1, keep_dims=True).shape, ())
self.assertEqual(dragon.math.sum(self.sym2, axis=0).shape, ())
self.assertEqual(dragon.math.sum(self.sym2, axis=1).shape, (1,))
self.assertEqual(dragon.math.sum(self.sym2, axis=0, keep_dims=True).shape, (1,))
def test_repeat(self):
with dragon.graph_mode():
self.assertEqual(dragon.repeat(self.sym1, axis=None, repeats=2).shape, (None,))
self.assertEqual(dragon.repeat(self.sym1, axis=0, repeats=2).shape, None)
self.assertEqual(dragon.repeat(self.sym2, axis=None, repeats=2).shape, (2,))
self.assertEqual(dragon.repeat(self.sym3, axis=0, repeats=2).shape, (2, None))
self.assertEqual(dragon.repeat(self.sym3, axis=1, repeats=2).shape, (1, None))
def test_reshape(self):
with dragon.graph_mode():
self.assertEqual(dragon.reshape(self.sym2, shape=(0, 1)).shape, (1, 1))
self.assertEqual(dragon.reshape(self.sym3, shape=(0, -1)).shape, (1, None))
self.assertEqual(dragon.reshape(self.sym3, shape=(0, 1, 0)).shape, None)
def test_resize(self):
with dragon.graph_mode():
self.assertEqual(dragon.vision.resize(
self.sym4, sizes=(1,)).shape, (1, None, 1, 1))
self.assertEqual(dragon.vision.resize(
self.sym4, sizes=(1, 1)).shape, (1, None, 1, 1))
self.assertEqual(dragon.vision.resize(
self.sym4, sizes=(1, 1, 1, 1)).shape, (1, None, 1, 1))
self.assertEqual(dragon.vision.resize(
self.sym4, scales=(1,)).shape, (1, None, None, None))
self.assertEqual(dragon.vision.resize(
self.sym4, scales=(1, 1)).shape, (1, None, None, None))
self.assertEqual(dragon.vision.resize(
self.sym4, scales=(1, 1, 1, 1)).shape, (1, None, None, None))
self.assertEqual(dragon.vision.resize(
self.sym5, sizes=(1, 1, 1, 1)).shape, None)
def test_roi_pool(self):
rois = dragon.Tensor((2, 5))
func = functools.partial(dragon.vision.roi_pool, pooled_h=7, pooled_w=7)
with dragon.graph_mode():
self.assertEqual(func([self.sym1, rois]).shape, None)
self.assertEqual(func([self.sym4, rois]).shape, (2, None, 7, 7))
self.assertEqual(func([self.sym4, self.sym1]).shape, (None, None, 7, 7))
def test_slice(self):
with dragon.graph_mode():
self.assertEqual(dragon.slice(self.sym1, (1,), (1,)).shape, None)
self.assertEqual(dragon.slice(self.sym3, (1,), (1,)).shape, (1, None))
def test_softmax_loss(self):
with dragon.graph_mode():
self.assertEqual(dragon.losses.sparse_softmax_cross_entropy(
[self.sym1, self.sym1]).shape, ())
self.assertEqual(dragon.losses.sparse_softmax_cross_entropy(
[self.sym1, self.sym1], reduction='none').shape, None)
self.assertEqual(dragon.losses.sparse_softmax_cross_entropy(
[self.sym3, self.sym1], reduction='none').shape, (self.sym3.shape[0],))
def test_sort(self):
with dragon.graph_mode():
self.assertEqual(dragon.sort(self.sym1)[0].shape, None)
self.assertEqual(dragon.sort(self.sym2)[0].shape, self.sym2.shape)
def test_split(self):
with dragon.graph_mode():
self.assertEqual(dragon.split(self.sym1, 2)[0].shape, None)
self.assertEqual(dragon.split(self.sym2, 2)[0].shape, (1,))
self.assertEqual(dragon.split(self.sym2, 2, axis=1)[0].shape, None)
self.assertEqual(dragon.split(self.sym2, (1, 1))[0].shape, (1,))
self.assertEqual(dragon.split(self.sym2, 2, slice_points=(1,))[0].shape, (1,))
self.assertEqual(dragon.split(self.sym3, 2, axis=1)[0].shape, (1, None))
self.assertEqual(dragon.split(self.sym3, 2, axis=1, slice_points=(1,))[1].shape, (1, None))
def test_squeeze(self):
with dragon.graph_mode():
self.assertEqual(dragon.squeeze(self.sym1).shape, None)
self.assertEqual(dragon.squeeze(self.sym2).shape, ())
self.assertEqual(dragon.squeeze(self.sym2, axis=-1).shape, ())
self.assertEqual(dragon.squeeze(self.sym3).shape, (None,))
def test_stack(self):
with dragon.graph_mode():
self.assertEqual(dragon.stack([self.sym1, self.sym1]).shape, None)
self.assertEqual(dragon.stack([self.sym3, self.sym2]).shape, (2, 1, None))
self.assertEqual(dragon.stack([self.sym3, self.sym3]).shape, (2, 1, None))
self.assertEqual(dragon.stack([self.sym3, self.sym3], axis=-1).shape, (1, None, 2))
def test_tile(self):
with dragon.graph_mode():
self.assertEqual(dragon.tile(
self.sym1, repeats=(1, 2)).shape, None)
self.assertEqual(dragon.tile(
self.sym3, repeats=(1, 2)).shape, (1, None))
def test_topk(self):
with dragon.graph_mode():
self.assertEqual(dragon.math.top_k(self.sym1)[0].shape, None)
self.assertEqual(dragon.math.top_k(self.sym2, k=2)[0].shape, (2,))
self.assertEqual(dragon.math.top_k(self.sym2, axis=1)[0].shape, None)
def test_unchanged(self):
with dragon.graph_mode():
self.assertEqual(dragon.math.negative(self.sym1).shape, None)
def test_unique(self):
with dragon.graph_mode():
self.assertEqual(dragon.unique(self.sym1).shape, (None,))
self.assertEqual(dragon.unique(self.sym1, return_counts=True)[1].shape, (None,))
self.assertEqual(dragon.unique(self.sym1, return_inverse=True)[1].shape, None)
self.assertEqual(dragon.unique(self.sym1,
return_inverse=True,
return_counts=True)[1].shape, None)
class TestOpSpecWithTensorDesc(unittest.TestCase):
"""Test the op spec with tensor descriptors."""
sym1 = dragon.Tensor(None)
sym2 = dragon.Tensor((1, None))
sym3 = dragon.Tensor((1, None, None, None))
shape1 = dragon.shape(sym1)
shape2 = [1, shape1, 1]
def test_broadcast_to(self):
with dragon.graph_mode():
self.assertEqual(dragon.broadcast_to(
self.sym1, shape=self.shape1).shape, None)
self.assertEqual(dragon.broadcast_to(
self.sym2, shape=self.shape1).shape, (None,) * len(self.sym2.shape))
self.assertEqual(dragon.broadcast_to(
self.sym2, shape=self.shape2).shape, (None,) * len(self.shape2))
def test_channel_normalize(self):
func = functools.partial(dragon.channel_normalize,
mean=(1., 1., 1.), std=(1., 1., 1.))
with dragon.graph_mode():
self.assertEqual(func(self.sym1).shape, None)
self.assertEqual(func(self.sym1, perm=self.shape1).shape, None)
self.assertEqual(func(self.sym2).shape, self.sym2.shape)
self.assertEqual(func(self.sym2, perm=self.shape1).shape,
(None,) * len(self.sym2.shape))
self.assertEqual(func(self.sym2, perm=self.shape2).shape,
(None,) * len(self.sym2.shape))
def test_conv_transpose(self):
w = dragon.Tensor((3, 3, 3, 3))
with dragon.graph_mode():
self.assertEqual(dragon.nn.conv2d_transpose(
[self.sym1, self.sym1]).shape, None)
self.assertEqual(dragon.nn.conv2d_transpose(
[self.sym3, self.sym1]).shape, None)
self.assertEqual(dragon.nn.conv2d_transpose(
[self.sym3, w]).shape, (self.sym3.shape[0], w.shape[0], None, None))
self.assertEqual(dragon.nn.conv2d_transpose(
[w, w], output_padding=self.shape1).shape,
(w.shape[0], w.shape[0], None, None))
self.assertEqual(dragon.nn.conv2d_transpose(
[w, w], output_padding=self.shape2).shape,
(w.shape[0], w.shape[0], None, None))
self.assertEqual(dragon.nn.conv2d_transpose(
[w, w], output_shape=self.shape1).shape,
(w.shape[0], w.shape[0], None, None))
self.assertEqual(dragon.nn.conv2d_transpose(
[w, w], output_shape=self.shape2).shape,
(w.shape[0], w.shape[0], None, None))
def test_init_ops(self):
init_funcs_v1 = [dragon.fill,
dragon.ones,
dragon.random.glorot_normal,
dragon.random.glorot_uniform,
dragon.random.normal,
dragon.random.uniform,
dragon.random.truncated_normal,
dragon.zeros]
init_funcs_v2 = [dragon.ones_like,
dragon.random.normal_like,
dragon.random.uniform_like,
dragon.zeros_like]
for func in init_funcs_v1:
with dragon.graph_mode():
self.assertEqual(func(shape=self.shape1).shape, None)
self.assertEqual(func(shape=self.shape2).shape, (None,) * len(self.shape2))
for func in init_funcs_v2:
with dragon.graph_mode():
self.assertEqual(func(self.sym1).shape, None)
self.assertEqual(func(self.sym2).shape, self.sym2.shape)
def test_permutation(self):
with dragon.graph_mode():
self.assertEqual(dragon.random.permutation(self.sym1).shape, (None,))
def test_repeat(self):
with dragon.graph_mode():
self.assertEqual(dragon.repeat(
self.sym1, repeats=self.shape1).shape, None)
self.assertEqual(dragon.repeat(
self.sym2, repeats=self.shape1).shape, None)
def test_reshape(self):
with dragon.graph_mode():
self.assertEqual(dragon.reshape(
self.sym1, shape=self.shape1).shape, None)
self.assertEqual(dragon.reshape(
self.sym2, shape=self.shape1).shape, None)
self.assertEqual(dragon.reshape(
self.sym2, shape=self.shape2).shape, (None,) * len(self.shape2))
def test_resize(self):
with dragon.graph_mode():
self.assertEqual(dragon.vision.resize(
self.sym1, sizes=self.shape1).shape, None)
self.assertEqual(dragon.vision.resize(
self.sym1, scales=self.shape1).shape, None)
self.assertEqual(dragon.vision.resize(
self.sym2, sizes=self.shape1).shape, (None,) * len(self.sym2.shape))
self.assertEqual(dragon.vision.resize(
self.sym2, scales=self.shape1).shape, (None,) * len(self.sym2.shape))
self.assertEqual(dragon.vision.resize(
self.sym2, sizes=self.shape2).shape, (None,) * len(self.sym2.shape))
self.assertEqual(dragon.vision.resize(
self.sym2, scales=self.shape2).shape, (None,) * len(self.sym2.shape))
def test_slice(self):
with dragon.graph_mode():
self.assertEqual(dragon.slice(
self.sym1, starts=self.shape1, sizes=self.shape1).shape, None)
self.assertEqual(dragon.slice(
self.sym2, starts=self.shape1, sizes=self.shape1).shape, None)
self.assertEqual(dragon.slice(
self.sym2, starts=self.shape2, sizes=self.shape2).shape, None)
def test_tile(self):
with dragon.graph_mode():
self.assertEqual(dragon.tile(
self.sym1, repeats=self.shape1).shape, None)
self.assertEqual(dragon.tile(
self.sym2, repeats=self.shape1).shape, (None,) * len(self.sym2.shape))
self.assertEqual(dragon.tile(
self.sym2, repeats=self.shape2).shape, (None,) * len(self.sym2.shape))
def test_transpose(self):
with dragon.graph_mode():
self.assertEqual(dragon.transpose(self.sym1).shape, None)
self.assertEqual(dragon.transpose(self.sym1, perm=self.shape1).shape, None)
self.assertEqual(dragon.transpose(self.sym2).shape, self.sym2.shape[::-1])
self.assertEqual(dragon.transpose(
self.sym2, perm=self.shape1).shape, (None,) * len(self.sym2.shape))
self.assertEqual(dragon.transpose(
self.sym2, perm=self.shape2).shape, (None,) * len(self.sym2.shape))
if __name__ == '__main__':
run_tests()
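The pattern behind these tests, shown as a small hedged example using only calls exercised above: under dragon.graph_mode() operators are only spec'd, so .shape reflects what the autograph shape inference derived rather than a computed value.
import dragon

with dragon.graph_mode():
    a = dragon.Tensor((1, None))      # a symbol with one unknown dimension
    b = dragon.math.add([a, a])       # no computation happens here
print(b.shape)  # (1, None): the unknown dimension stays unknown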
......@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Code generator for Runtime API."""
from __future__ import absolute_import
......
......@@ -89,7 +89,8 @@ def batch_norm(
The moving average of stats is calculated as:
.. math:: x_{moving} \leftarrow (1 - momentum) * x_{moving} + momentum * x_{stat}
.. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} +
\text{momentum} * x_{\text{batch}}
Parameters
----------
......@@ -124,9 +125,9 @@ def batch_norm(
.instantiate(
input.device,
training=training,
momentum=momentum,
epsilon=eps,
).apply(input, running_mean, running_var, weight, bias)
).apply(input, running_mean, running_var,
weight, bias, momentum)
def binary_cross_entropy_with_logits(
......@@ -1598,7 +1599,7 @@ def sync_batch_norm(
The moving average of stats is calculated as:
.. math::
x_{moving} \leftarrow (1 - momentum) * x_{moving} + momentum * x_{stat}
x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} + \text{momentum} * x_{\text{batch}}
Additionally, you can specify ``process_group`` to perform synchronization.
......
......@@ -111,24 +111,31 @@ class BatchNorm(function.Function):
def __init__(self, key, dev, **kwargs):
super(BatchNorm, self).__init__(key, dev, **kwargs)
self.momentum = kwargs.get('momentum', 0.1)
self.epsilon = kwargs.get('epsilon', 1e-5)
self.training = kwargs.get('training', False)
self.track_stats = kwargs.get('track_stats', True)
def setup(self, ws, handle, momentum):
self.feed_arg(ws, '{}/momentum'.format(handle), 1.0 - momentum, 'float32')
def attributes(self):
return {
'op_type': 'BatchNorm',
'arguments': {
'axis': 1,
'momentum': 1. - self.momentum,
'epsilon': self.epsilon,
'use_stats': int(not self.training),
'momentum_desc': '${HANDLE}/momentum',
}
}
def forward(self, input, running_mean, running_var, weight, bias):
def forward(self, input, running_mean, running_var, weight, bias, momentum):
inputs = [input, weight, bias, running_mean, running_var]
return self.dispatch(inputs, [self.alloc()])
return self.dispatch(
inputs, [self.alloc()],
callback=lambda ws, handle:
self.setup(ws, handle, momentum),
)
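A conceptual sketch (the names below are illustrative, not framework code) of why this hunk swaps the fixed 'momentum' attribute for 'momentum_desc': attributes are baked into the cached op, so a value that may change per call is instead written to a handle-scoped workspace slot right before dispatch, which is what setup() does.
class FakeWorkspace(dict):
    """Stand-in for the real workspace; for illustration only."""

def setup_momentum(ws, handle, momentum):
    # mirrors BatchNorm.setup(): the op later reads '<handle>/momentum'
    ws['{}/momentum'.format(handle)] = 1.0 - momentum

ws = FakeWorkspace()
setup_momentum(ws, 'BatchNorm_1', momentum=0.1)
print(ws)  # {'BatchNorm_1/momentum': 0.9}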
class Conv2d(_ConvNd):
......
......@@ -25,6 +25,8 @@ from dragon.vm.torch.core.tensor import Tensor
class _BatchNorm(Module):
"""BatchNorm base module."""
def __init__(
self,
num_features,
......@@ -45,20 +47,26 @@ class _BatchNorm(Module):
else:
self.register_buffer('weight', init_funcs.ones(num_features))
self.register_buffer('bias', init_funcs.zeros(num_features))
if self.track_running_stats:
self.num_batches_tracked = 0
else:
self.num_batches_tracked = None
self.register_buffer('running_mean', init_funcs.zeros(num_features))
self.register_buffer('running_var', init_funcs.ones(num_features))
self.inputs = [self.running_mean, self.running_var, self.weight, self.bias]
self.reset_parameters()
def reset_parameters(self):
if self.affine:
self.weight.data.one_()
self.bias.data.zero_()
def reset_running_stats(self):
if self.track_running_stats:
self.running_mean.zero_()
self.running_var.fill_(1)
self.num_batches_tracked = 0
def reset_parameters(self):
self.reset_running_stats()
if self.affine:
self.weight.data.one_()
self.bias.data.zero_()
def extra_repr(self):
return '{num_features}, ' \
......@@ -72,7 +80,7 @@ class _BatchNorm(Module):
return F.batch_norm(
input, *self.inputs,
training=self.training,
momentum=self.momentum,
momentum=self._get_momentum(),
eps=self.eps
)
......@@ -82,6 +90,19 @@ class _BatchNorm(Module):
return self # Float32 parameters are required.
return super(_BatchNorm, self)._apply(fn)
def _get_momentum(self):
"""Return the current momentum value."""
momentum = 0.0 if self.momentum is None else self.momentum
if self.track_running_stats:
if self.training:
if self.num_batches_tracked is not None:
self.num_batches_tracked += 1
if self.momentum is None:
momentum = 1.0 / float(self.num_batches_tracked)
else:
momentum = 0.0
return momentum
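Read from the flattened diff above (and assuming the trailing else pairs with the training check, matching PyTorch's semantics), the momentum policy can be restated as the small helper below; it is a re-statement for clarity, not the module's code.
def effective_momentum(momentum, training, track_running_stats, num_batches_tracked):
    if not track_running_stats:
        return 0.0 if momentum is None else momentum
    if not training:
        return 0.0  # eval mode: running statistics are frozen
    if momentum is None:
        return 1.0 / float(num_batches_tracked)  # cumulative moving average
    return momentum

print([effective_momentum(None, True, True, n) for n in (1, 2, 3, 4)])
# [1.0, 0.5, 0.3333333333333333, 0.25]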
class BatchNorm1d(_BatchNorm):
r"""Apply the batch normalization over 2d input.
......@@ -93,7 +114,8 @@ class BatchNorm1d(_BatchNorm):
The running average of statistics is calculated as:
.. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} + \text{momentum} * x_{\text{stat}}
.. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} +
\text{momentum} * x_{\text{batch}}
See Also
--------
......@@ -109,16 +131,16 @@ class BatchNorm1d(_BatchNorm):
affine=True,
track_running_stats=True,
):
"""Create a ``BatchNorm1d`` module.
r"""Create a ``BatchNorm1d`` module.
Parameters
----------
num_features : int
The number of channels.
eps : float, optional, default=1e-5
The epsilon value.
The value to :math:`\epsilon`.
momentum : float, optional, default=0.1
The momentum of moving average.
The value to :math:`\text{momentum}`.
affine : bool, optional, default=True
**True** to apply an affine transformation.
track_running_stats : bool, optional, default=True
......@@ -142,7 +164,8 @@ class BatchNorm2d(_BatchNorm):
The running average of statistics is calculated as:
.. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} + \text{momentum} * x_{\text{stat}}
.. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} +
\text{momentum} * x_{\text{batch}}
See Also
--------
......@@ -158,16 +181,16 @@ class BatchNorm2d(_BatchNorm):
affine=True,
track_running_stats=True,
):
"""Create a ``BatchNorm2d`` module.
r"""Create a ``BatchNorm2d`` module.
Parameters
----------
num_features : int
The number of channels.
eps : float, optional, default=1e-5
The epsilon value.
The value to :math:`\epsilon`.
momentum : float, optional, default=0.1
The momentum of moving average.
The value to :math:`\text{momentum}`.
affine : bool, optional, default=True
**True** to apply an affine transformation.
track_running_stats : bool, optional, default=True
......@@ -191,7 +214,8 @@ class BatchNorm3d(_BatchNorm):
The running average of statistics is calculated as:
.. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} + \text{momentum} * x_{\text{stat}}
.. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} +
\text{momentum} * x_{\text{batch}}
See Also
--------
......@@ -207,16 +231,16 @@ class BatchNorm3d(_BatchNorm):
affine=True,
track_running_stats=True,
):
"""Create a ``BatchNorm3d`` module.
r"""Create a ``BatchNorm3d`` module.
Parameters
----------
num_features : int
The number of channels.
eps : float, optional, default=1e-5
The epsilon value.
The value to :math:`\epsilon`.
momentum : float, optional, default=0.1
The momentum of moving average.
The value to :math:`\text{momentum}`.
affine : bool, optional, default=True
**True** to apply an affine transformation.
track_running_stats : bool, optional, default=True
......@@ -240,7 +264,8 @@ class SyncBatchNorm(_BatchNorm):
The running average of statistics is calculated as:
.. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} + \text{momentum} * x_{\text{stat}}
.. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} +
\text{momentum} * x_{\text{batch}}
Additionally, specify ``process_group`` to perform synchronization.
......@@ -261,16 +286,16 @@ class SyncBatchNorm(_BatchNorm):
track_running_stats=True,
process_group=None,
):
"""Create a ``SyncBatchNorm`` module.
r"""Create a ``SyncBatchNorm`` module.
Parameters
----------
num_features : int
The number of channels.
eps : float, optional, default=1e-5
The epsilon value.
The value to :math:`\epsilon`.
momentum : float, optional, default=0.1
The momentum of moving average.
The value to :math:`\text{momentum}`.
affine : bool, optional, default=True
**True** to apply an affine transformation.
track_running_stats : bool, optional, default=True
......@@ -292,7 +317,7 @@ class SyncBatchNorm(_BatchNorm):
return F.sync_batch_norm(
input, *self.inputs,
training=self.training,
momentum=self.momentum,
momentum=self._get_momentum(),
eps=self.eps,
process_group=self.process_group
)
......@@ -300,6 +325,6 @@ class SyncBatchNorm(_BatchNorm):
return F.batch_norm(
input, *self.inputs,
training=self.training,
momentum=self.momentum,
momentum=self._get_momentum(),
eps=self.eps
)
......@@ -61,7 +61,7 @@ class AffineChannel(Module):
fix_bias=False,
inplace=False,
):
"""Create an ``Affine`` module.
"""Create an ``AffineChannel`` module.
Parameters
----------
......@@ -141,7 +141,7 @@ class GroupNorm(Module):
eps=1e-5,
affine=True,
):
"""Create a ``GroupNorm`` module.
r"""Create a ``GroupNorm`` module.
Parameters
----------
......@@ -150,7 +150,7 @@ class GroupNorm(Module):
num_channels : int
The number of channels.
eps : float, optional, default=1e-5
The epsilon value.
The value to :math:`\epsilon`.
affine : bool, optional, default=True
**True** to apply an affine transformation.
......@@ -228,11 +228,11 @@ class LocalResponseNorm(Module):
size : int, required
The number of neighbouring channels to sum over.
alpha : float, optional, default=0.0001
The scale value :math:`\alpha`.
The value to :math:`\alpha`.
beta : float, optional, default=0.75
The exponent value :math:`\beta`.
The value to :math:`\beta`.
k : float, optional, default=1.
The bias constant :math:`k`.
The value to :math:`k`.
"""
super(LocalResponseNorm, self).__init__()
......
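For context on the alpha/beta/k parameters documented above, a hedged NumPy sketch of cross-channel response normalization (assuming the PyTorch-style form where the squared sum over `size` neighbouring channels is scaled by alpha / size; the exact window handling may differ from the framework's kernel).
import numpy as np

def local_response_norm(x, size, alpha=1e-4, beta=0.75, k=1.0):
    # x: (N, C, H, W); normalize each channel by its neighbourhood along C
    n, c, h, w = x.shape
    y = np.empty_like(x)
    for ci in range(c):
        lo, hi = max(0, ci - size // 2), min(c, ci + size // 2 + 1)
        sq_sum = (x[:, lo:hi] ** 2).sum(axis=1)
        y[:, ci] = x[:, ci] / (k + alpha / size * sq_sum) ** beta
    return y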