Commit 1ad360e9 by Ting PAN

Add tests of operator spec for AutoGraph

Summary:
This commit tests the correctness of the shape and data type
inference performed by the autograph module.
1 parent 1bd78a3c
Showing with 726 additions and 823 deletions
......@@ -7,16 +7,16 @@ namespace kernel {
namespace {
template <typename Tx, typename Ty>
template <typename InputT, typename OutputT>
void _ChannelNormalize(
const int axis,
const int num_dims,
const int64_t* x_strides,
const int64_t* y_dims,
const Tx* x,
const InputT* x,
const float* mean,
const float* std,
Ty* y) {
OutputT* y) {
const auto count =
std::accumulate(y_dims, y_dims + num_dims, 1, std::multiplies<int64_t>());
vec64_t idx(num_dims, 0);
......@@ -27,7 +27,8 @@ void _ChannelNormalize(
xi += idx[d] * x_strides[d];
if (d == axis) wi = idx[d];
}
y[yi] = ((Ty)x[xi] - (Ty)mean[wi]) / (Ty)std[wi];
y[yi] =
convert::To<OutputT>((convert::To<float>(x[xi]) - mean[wi]) / std[wi]);
math::utils::IncreaseIndexInDims(num_dims, y_dims, idx.data());
}
}
......@@ -36,83 +37,43 @@ void _ChannelNormalize(
/* ------------------- Launcher Separator ------------------- */
template <>
void ChannelNormalize<float16, float16, CPUContext>(
const int axis,
const int num_dims,
const int64_t* x_strides,
const int64_t* y_dims,
const float16* x,
const float* mean,
const float* std,
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
#define DEFINE_KERNEL_LAUNCHER(Tx, Ty) \
#define DEFINE_KERNEL_LAUNCHER(InputT, OutputT) \
template <> \
void ChannelNormalize<Tx, Ty, CPUContext>( \
void ChannelNormalize<InputT, OutputT, CPUContext>( \
const int axis, \
const int num_dims, \
const int64_t* x_strides, \
const int64_t* y_dims, \
const Tx* x, \
const InputT* x, \
const float* mean, \
const float* std, \
Ty* y, \
OutputT* y, \
CPUContext* ctx) { \
_ChannelNormalize(axis, num_dims, x_strides, y_dims, x, mean, std, y); \
}
#define DEFINE_FP16_KERNEL_LAUNCHER(T) \
template <> \
void ChannelNormalize<float16, T, CPUContext>( \
const int axis, \
const int num_dims, \
const int64_t* x_strides, \
const int64_t* y_dims, \
const float16* x, \
const float* mean, \
const float* std, \
T* y, \
CPUContext* ctx) { \
CPU_FP16_NOT_SUPPORTED; \
} \
template <> \
void ChannelNormalize<T, float16, CPUContext>( \
const int axis, \
const int num_dims, \
const int64_t* x_strides, \
const int64_t* y_dims, \
const T* x, \
const float* mean, \
const float* std, \
float16* y, \
CPUContext* ctx) { \
CPU_FP16_NOT_SUPPORTED; \
}
DEFINE_KERNEL_LAUNCHER(int8_t, float16);
DEFINE_KERNEL_LAUNCHER(int8_t, float);
DEFINE_KERNEL_LAUNCHER(int8_t, double);
DEFINE_KERNEL_LAUNCHER(uint8_t, float16);
DEFINE_KERNEL_LAUNCHER(uint8_t, float);
DEFINE_KERNEL_LAUNCHER(uint8_t, double);
DEFINE_KERNEL_LAUNCHER(int, float16);
DEFINE_KERNEL_LAUNCHER(int, float);
DEFINE_KERNEL_LAUNCHER(int, double);
DEFINE_KERNEL_LAUNCHER(int64_t, float16);
DEFINE_KERNEL_LAUNCHER(int64_t, float);
DEFINE_KERNEL_LAUNCHER(int64_t, double);
DEFINE_KERNEL_LAUNCHER(float16, float16);
DEFINE_KERNEL_LAUNCHER(float16, float);
DEFINE_KERNEL_LAUNCHER(float16, double);
DEFINE_KERNEL_LAUNCHER(float, float16);
DEFINE_KERNEL_LAUNCHER(float, float);
DEFINE_KERNEL_LAUNCHER(float, double);
DEFINE_KERNEL_LAUNCHER(double, float16);
DEFINE_KERNEL_LAUNCHER(double, float);
DEFINE_KERNEL_LAUNCHER(double, double);
DEFINE_FP16_KERNEL_LAUNCHER(int8_t);
DEFINE_FP16_KERNEL_LAUNCHER(uint8_t);
DEFINE_FP16_KERNEL_LAUNCHER(int);
DEFINE_FP16_KERNEL_LAUNCHER(int64_t);
DEFINE_FP16_KERNEL_LAUNCHER(float);
DEFINE_FP16_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_FP16_KERNEL_LAUNCHER
} // namespace kernel
......
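Retyped from (Tx, Ty) to (InputT, OutputT), the kernel now converts each input to float before normalizing and converts the result back, so mixed-precision pairs (including float16 on one side) can share the generic template. A minimal sketch of the per-element transform, assuming convert::To<> from dragon/utils/conversions.h acts as a plain cast for built-in arithmetic types:

// Hypothetical standalone helper, not part of this diff: for an element
// whose coordinate along `axis` is channel c,
//   y = OutputT((float(x) - mean[c]) / std[c]).
template <typename InputT, typename OutputT>
OutputT ChannelNormalizeOne(
    InputT x, const float* mean, const float* std, int c) {
  const float v = static_cast<float>(x);  // convert::To<float>(x) in the kernel
  return static_cast<OutputT>((v - mean[c]) / std[c]);
}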
......@@ -7,51 +7,51 @@ namespace kernel {
namespace {
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void _NLLLoss(
const int outer_dim,
const int inner_dim,
const int axis_dim,
const int ignore_index,
const LogitType* logit,
const TargetType* target,
LogitType* loss,
LogitType* mask) {
const LogitT* logit,
const TargetT* target,
LogitT* loss,
LogitT* mask) {
std::array<int, 2> idx = {0, 0};
std::array<int, 2> dims = {outer_dim, inner_dim};
int count = dims[0] * dims[1], k;
for (int i = 0; i < count; ++i) {
const int label = (int)target[i];
if (label == ignore_index) {
loss[i] = mask[i] = LogitType(0);
loss[i] = mask[i] = LogitT(0);
} else {
k = (idx[0] * axis_dim + label) * inner_dim + idx[1];
loss[i] = -logit[k], mask[i] = LogitType(1);
loss[i] = -logit[k], mask[i] = LogitT(1);
}
math::utils::IncreaseIndexInDims(2, dims.data(), idx.data());
}
}
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void _NLLLossGrad(
const int outer_dim,
const int inner_dim,
const int axis_dim,
const int ignore_index,
const LogitType* logit,
const TargetType* target,
LogitType* dlogit,
LogitType* mask) {
const LogitT* logit,
const TargetT* target,
LogitT* dlogit,
LogitT* mask) {
std::array<int, 2> idx = {0, 0};
std::array<int, 2> dims = {outer_dim, inner_dim};
int count = dims[0] * dims[1], k;
for (int i = 0; i < count; ++i) {
const int label = (int)target[i];
if (label == ignore_index) {
mask[i] = LogitType(0);
mask[i] = LogitT(0);
} else {
k = (idx[0] * axis_dim + label) * inner_dim + idx[1];
dlogit[k] = LogitType(-1), mask[i] = LogitType(1);
dlogit[k] = LogitT(-1), mask[i] = LogitT(1);
}
math::utils::IncreaseIndexInDims(2, dims.data(), idx.data());
}
......@@ -61,27 +61,27 @@ void _NLLLossGrad(
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(name, LogitType, TargetType) \
template <> \
void name<LogitType, TargetType, CPUContext>( \
const int outer_dim, \
const int inner_dim, \
const int axis_dim, \
const int ignore_index, \
const LogitType* logit, \
const TargetType* target, \
LogitType* loss, \
LogitType* mask, \
CPUContext* ctx) { \
_##name( \
outer_dim, \
inner_dim, \
axis_dim, \
ignore_index, \
logit, \
target, \
loss, \
mask); \
#define DEFINE_KERNEL_LAUNCHER(name, LogitT, TargetT) \
template <> \
void name<LogitT, TargetT, CPUContext>( \
const int outer_dim, \
const int inner_dim, \
const int axis_dim, \
const int ignore_index, \
const LogitT* logit, \
const TargetT* target, \
LogitT* loss, \
LogitT* mask, \
CPUContext* ctx) { \
_##name( \
outer_dim, \
inner_dim, \
axis_dim, \
ignore_index, \
logit, \
target, \
loss, \
mask); \
}
DEFINE_KERNEL_LAUNCHER(NLLLoss, float, float);
......
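Both NLL launchers reduce to a masked gather of negated log-likelihoods. With outer index $o$, inner index $i$, class count $A$ (axis_dim), inner size $I$ (inner_dim), and target $t_{o,i}$, the kernels compute

\ell_{o,i} =
\begin{cases}
0, & t_{o,i} = \text{ignore\_index} \\
-z_{o,\,t_{o,i},\,i}, & \text{otherwise}
\end{cases}
\qquad k = (o \cdot A + t_{o,i}) \cdot I + i,

where $k$ is the flattened offset used to read the logit tensor $z$, and the mask records which positions contributed to the loss.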
......@@ -9,48 +9,48 @@ namespace kernel {
namespace {
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
__global__ void _NLLLoss(
const int nthreads,
const int inner_dim,
const int axis_dim,
const int ignore_index,
const LogitType* logit,
const TargetType* target,
LogitType* loss,
LogitType* mask) {
const LogitT* logit,
const TargetT* target,
LogitT* loss,
LogitT* mask) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int i = yi / inner_dim;
const int j = yi % inner_dim;
const int label = target[i * inner_dim + j];
if (label == ignore_index) {
loss[yi] = mask[yi] = LogitType(0);
loss[yi] = mask[yi] = LogitT(0);
} else {
loss[yi] = -logit[(i * axis_dim + label) * inner_dim + j];
mask[yi] = LogitType(1);
mask[yi] = LogitT(1);
}
}
}
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
__global__ void _NLLLossGrad(
const int nthreads,
const int inner_dim,
const int axis_dim,
const int ignore_index,
const LogitType* logit,
const TargetType* target,
LogitType* dlogit,
LogitType* mask) {
const LogitT* logit,
const TargetT* target,
LogitT* dlogit,
LogitT* mask) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int i = yi / inner_dim;
const int j = yi % inner_dim;
const int label = target[i * inner_dim + j];
if (label == ignore_index) {
mask[yi] = LogitType(0);
mask[yi] = LogitT(0);
} else {
dlogit[(i * axis_dim + label) * inner_dim + j] = LogitType(-1);
mask[yi] = LogitType(1);
dlogit[(i * axis_dim + label) * inner_dim + j] = LogitT(-1);
mask[yi] = LogitT(1);
}
}
}
......@@ -59,17 +59,17 @@ __global__ void _NLLLossGrad(
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(name, LogitType, TargetType) \
#define DEFINE_KERNEL_LAUNCHER(name, LogitT, TargetT) \
template <> \
void name<LogitType, TargetType, CUDAContext>( \
void name<LogitT, TargetT, CUDAContext>( \
const int outer_dim, \
const int inner_dim, \
const int axis_dim, \
const int ignore_index, \
const LogitType* logit, \
const TargetType* target, \
LogitType* loss, \
LogitType* mask, \
const LogitT* logit, \
const TargetT* target, \
LogitT* loss, \
LogitT* mask, \
CUDAContext* ctx) { \
const auto nthreads = outer_dim * inner_dim; \
_##name<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
......
......@@ -7,19 +7,19 @@ namespace kernel {
namespace {
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void _SigmoidFocalLoss(
const int outer_dim,
const int inner_dim,
const int axis_dim,
const LogitType pos_alpha,
const LogitType neg_alpha,
const LogitType gamma,
const LogitT pos_alpha,
const LogitT neg_alpha,
const LogitT gamma,
const int negative_index,
const LogitType* logit,
const TargetType* target,
LogitType* loss,
LogitType* mask) {
const LogitT* logit,
const TargetT* target,
LogitT* loss,
LogitT* mask) {
std::array<int, 3> idx = {0, 0, 0};
std::array<int, 3> dims = {outer_dim, axis_dim, inner_dim};
const int count = dims[0] * dims[1] * dims[2];
......@@ -27,23 +27,21 @@ void _SigmoidFocalLoss(
for (int i = 0; i < count; ++i) {
const int t = (int)target[idx[0] * inner_dim + idx[2]];
// "0" is reserved for target if negative index is zero
LogitType c1 = (LogitType)(t == (idx[1] + (negative_index ? 0 : 1)));
LogitType c2 =
(LogitType)((t >= 0) & (t != (idx[1] + (negative_index ? 0 : 1))));
LogitType p = LogitType(1) / (LogitType(1) + std::exp(-logit[i]));
LogitT c1 = (LogitT)(t == (idx[1] + (negative_index ? 0 : 1)));
LogitT c2 = (LogitT)((t >= 0) & (t != (idx[1] + (negative_index ? 0 : 1))));
LogitT p = LogitT(1) / (LogitT(1) + std::exp(-logit[i]));
// (1 - p)^{gamma} * log(p)
LogitType pos_term = std::pow(LogitType(1) - p, gamma) *
std::log(std::max(p, (LogitType)FLT_MIN));
LogitT pos_term =
std::pow(LogitT(1) - p, gamma) * std::log(std::max(p, (LogitT)FLT_MIN));
// p^{gamma} * log(1 - p)
LogitType neg_term = std::pow(p, gamma) *
LogitT neg_term = std::pow(p, gamma) *
(-logit[i] * (logit[i] >= 0) -
std::log(
LogitType(1) +
std::exp(logit[i] - 2 * logit[i] * (logit[i] >= 0))));
LogitT(1) + std::exp(logit[i] - 2 * logit[i] * (logit[i] >= 0))));
loss[i] = LogitType(0);
loss[i] = LogitT(0);
loss[i] += -c1 * pos_term * pos_alpha;
loss[i] += -c2 * neg_term * neg_alpha;
mask[i] = c1;
......@@ -52,19 +50,19 @@ void _SigmoidFocalLoss(
}
}
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void _SigmoidFocalLossGrad(
const int outer_dim,
const int inner_dim,
const int axis_dim,
const LogitType pos_alpha,
const LogitType neg_alpha,
const LogitType gamma,
const LogitT pos_alpha,
const LogitT neg_alpha,
const LogitT gamma,
const int negative_index,
const LogitType* logit,
const TargetType* target,
LogitType* dx,
LogitType* mask) {
const LogitT* logit,
const TargetT* target,
LogitT* dx,
LogitT* mask) {
std::array<int, 3> idx = {0, 0, 0};
std::array<int, 3> dims = {outer_dim, axis_dim, inner_dim};
const int count = dims[0] * dims[1] * dims[2];
......@@ -72,26 +70,24 @@ void _SigmoidFocalLossGrad(
for (int i = 0; i < count; ++i) {
const int t = (int)target[idx[0] * inner_dim + idx[2]];
// "0" is reserved for target if negative index is zero
LogitType c1 = (LogitType)(t == (idx[1] + (negative_index ? 0 : 1)));
LogitType c2 =
(LogitType)((t >= 0) & (t != (idx[1] + (negative_index ? 0 : 1))));
LogitType p = LogitType(1) / (LogitType(1) + std::exp(-logit[i]));
LogitT c1 = (LogitT)(t == (idx[1] + (negative_index ? 0 : 1)));
LogitT c2 = (LogitT)((t >= 0) & (t != (idx[1] + (negative_index ? 0 : 1))));
LogitT p = LogitT(1) / (LogitT(1) + std::exp(-logit[i]));
// (1 - p)^{gamma} * (1 - p - gamma * p * log(p))
LogitType pos_term = std::pow(LogitType(1) - p, gamma) *
(LogitType(1) - p -
p * gamma * std::log(std::max(p, (LogitType)FLT_MIN)));
LogitT pos_term = std::pow(LogitT(1) - p, gamma) *
(LogitT(1) - p - p * gamma * std::log(std::max(p, (LogitT)FLT_MIN)));
// p^{gamma} * (gamma * (1 - p) * log(1-p) - p)
LogitType neg_term = std::pow(p, gamma) *
LogitT neg_term = std::pow(p, gamma) *
((-logit[i] * (logit[i] >= 0) -
std::log(
LogitType(1) +
std::exp(logit[i] - LogitType(2) * logit[i] * (logit[i] >= 0)))) *
LogitT(1) +
std::exp(logit[i] - LogitT(2) * logit[i] * (logit[i] >= 0)))) *
(1 - p) * gamma -
p);
dx[i] = LogitType(0);
dx[i] = LogitT(0);
dx[i] += -c1 * pos_term * pos_alpha;
dx[i] += -c2 * neg_term * neg_alpha;
mask[i] = c1;
......@@ -104,33 +100,33 @@ void _SigmoidFocalLossGrad(
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(name, LogitType, TargetType) \
template <> \
void name<LogitType, TargetType, CPUContext>( \
const int outer_dim, \
const int inner_dim, \
const int axis_dim, \
const float pos_alpha, \
const float neg_alpha, \
const float gamma, \
const int negative_index, \
const LogitType* logit, \
const TargetType* target, \
LogitType* loss, \
LogitType* mask, \
CPUContext* ctx) { \
_##name( \
outer_dim, \
inner_dim, \
axis_dim, \
(LogitType)pos_alpha, \
(LogitType)neg_alpha, \
(LogitType)gamma, \
negative_index, \
logit, \
target, \
loss, \
mask); \
#define DEFINE_KERNEL_LAUNCHER(name, LogitT, TargetT) \
template <> \
void name<LogitT, TargetT, CPUContext>( \
const int outer_dim, \
const int inner_dim, \
const int axis_dim, \
const float pos_alpha, \
const float neg_alpha, \
const float gamma, \
const int negative_index, \
const LogitT* logit, \
const TargetT* target, \
LogitT* loss, \
LogitT* mask, \
CPUContext* ctx) { \
_##name( \
outer_dim, \
inner_dim, \
axis_dim, \
(LogitT)pos_alpha, \
(LogitT)neg_alpha, \
(LogitT)gamma, \
negative_index, \
logit, \
target, \
loss, \
mask); \
}
DEFINE_KERNEL_LAUNCHER(SigmoidFocalLoss, float, float);
......
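The neg_term expression is a numerically stable form of $p^{\gamma}\log(1-p)$: since $1-\sigma(x) = \sigma(-x)$, the kernel evaluates $\log(1-p) = -\max(x,0) - \log(1+e^{-|x|})$, written as -logit * (logit >= 0) - log(1 + exp(logit - 2 * logit * (logit >= 0))); the pos_term similarly guards $\log p$ with max(p, FLT_MIN). A minimal sketch of the identity (hypothetical helper, not part of the diff):

#include <cmath>

// log(1 - sigmoid(x)), computed without overflow for large |x|.
float LogOneMinusSigmoid(float x) {
  const float kept = x * (x >= 0);  // max(x, 0)
  // x - 2 * kept equals -|x|, so the exp argument never overflows.
  return -kept - std::log(1.f + std::exp(x - 2.f * kept));
}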
......@@ -9,19 +9,19 @@ namespace kernel {
namespace {
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
__global__ void _SigmoidFocalLoss(
const int nthreads,
const int inner_dim,
const int axis_dim,
const LogitType pos_alpha,
const LogitType neg_alpha,
const LogitType gamma,
const LogitT pos_alpha,
const LogitT neg_alpha,
const LogitT gamma,
const int negative_index,
const LogitType* logit,
const TargetType* target,
LogitType* loss,
LogitType* mask) {
const LogitT* logit,
const TargetT* target,
LogitT* loss,
LogitT* mask) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int j = yi % inner_dim;
const int k = (yi / inner_dim) % axis_dim;
......@@ -29,40 +29,39 @@ __global__ void _SigmoidFocalLoss(
const int t = target[i * inner_dim + j];
// "0" is reserved for target if negative index is zero
LogitType c1 = (LogitType)(t == (k + (negative_index ? 0 : 1)));
LogitType c2 =
(LogitType)((t >= 0) & (t != (k + (negative_index ? 0 : 1))));
LogitType p = LogitType(1) / (LogitType(1) + exp(-logit[yi]));
LogitT c1 = (LogitT)(t == (k + (negative_index ? 0 : 1)));
LogitT c2 = (LogitT)((t >= 0) & (t != (k + (negative_index ? 0 : 1))));
LogitT p = LogitT(1) / (LogitT(1) + exp(-logit[yi]));
// (1 - p)^{gamma} * log(p)
LogitType pos_term = pow(LogitType(1) - p, gamma) * log(max(p, FLT_MIN));
LogitT pos_term = pow(LogitT(1) - p, gamma) * log(max(p, FLT_MIN));
// p^{gamma} * log(1 - p)
LogitType neg_term = pow(p, gamma) *
LogitT neg_term = pow(p, gamma) *
(-logit[yi] * (logit[yi] >= 0) -
log(LogitType(1) +
exp(logit[yi] - LogitType(2) * logit[yi] * (logit[yi] >= 0))));
log(LogitT(1) +
exp(logit[yi] - LogitT(2) * logit[yi] * (logit[yi] >= 0))));
loss[yi] = LogitType(0);
loss[yi] = LogitT(0);
loss[yi] += -c1 * pos_term * pos_alpha;
loss[yi] += -c2 * neg_term * neg_alpha;
mask[yi] = c1;
}
}
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
__global__ void _SigmoidFocalLossGrad(
const int nthreads,
const int inner_dim,
const int axis_dim,
const LogitType pos_alpha,
const LogitType neg_alpha,
const LogitType gamma,
const LogitT pos_alpha,
const LogitT neg_alpha,
const LogitT gamma,
const int negative_index,
const LogitType* logit,
const TargetType* target,
LogitType* dx,
LogitType* mask) {
const LogitT* logit,
const TargetT* target,
LogitT* dx,
LogitT* mask) {
CUDA_1D_KERNEL_LOOP(xi, nthreads) {
const int j = xi % inner_dim;
const int k = (xi / inner_dim) % axis_dim;
......@@ -70,24 +69,23 @@ __global__ void _SigmoidFocalLossGrad(
const int t = target[i * inner_dim + j];
// "0" is reserved for target if neg index is zero
LogitType c1 = (LogitType)(t == (k + (negative_index ? 0 : 1)));
LogitType c2 =
(LogitType)((t >= 0) & (t != (k + (negative_index ? 0 : 1))));
LogitType p = LogitType(1) / (LogitType(1) + exp(-logit[xi]));
LogitT c1 = (LogitT)(t == (k + (negative_index ? 0 : 1)));
LogitT c2 = (LogitT)((t >= 0) & (t != (k + (negative_index ? 0 : 1))));
LogitT p = LogitT(1) / (LogitT(1) + exp(-logit[xi]));
// (1 - p)^{gamma} * (1 - p - gamma * p * log(p))
LogitType pos_term = pow(LogitType(1) - p, gamma) *
(LogitType(1) - p - p * gamma * log(max(p, FLT_MIN)));
LogitT pos_term = pow(LogitT(1) - p, gamma) *
(LogitT(1) - p - p * gamma * log(max(p, FLT_MIN)));
// p^{gamma} * (gamma * (1 - p) * log(1-p) - p)
LogitType neg_term = pow(p, gamma) *
LogitT neg_term = pow(p, gamma) *
((-logit[xi] * (logit[xi] >= 0) -
log(LogitType(1) +
exp(logit[xi] - LogitType(2) * logit[xi] * (logit[xi] >= 0)))) *
(LogitType(1) - p) * gamma -
log(LogitT(1) +
exp(logit[xi] - LogitT(2) * logit[xi] * (logit[xi] >= 0)))) *
(LogitT(1) - p) * gamma -
p);
dx[xi] = LogitType(0);
dx[xi] = LogitT(0);
dx[xi] += -c1 * pos_term * pos_alpha;
dx[xi] += -c2 * neg_term * neg_alpha;
mask[xi] = c1;
......@@ -98,9 +96,9 @@ __global__ void _SigmoidFocalLossGrad(
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(name, LogitType, TargetType) \
#define DEFINE_KERNEL_LAUNCHER(name, LogitT, TargetT) \
template <> \
void name<LogitType, TargetType, CUDAContext>( \
void name<LogitT, TargetT, CUDAContext>( \
const int outer_dim, \
const int inner_dim, \
const int axis_dim, \
......@@ -108,19 +106,19 @@ __global__ void _SigmoidFocalLossGrad(
const float neg_alpha, \
const float gamma, \
const int negative_index, \
const LogitType* logit, \
const TargetType* target, \
LogitType* loss, \
LogitType* mask, \
const LogitT* logit, \
const TargetT* target, \
LogitT* loss, \
LogitT* mask, \
CUDAContext* ctx) { \
const auto nthreads = outer_dim * axis_dim * inner_dim; \
_##name<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
nthreads, \
inner_dim, \
axis_dim, \
(LogitType)pos_alpha, \
(LogitType)neg_alpha, \
(LogitType)gamma, \
(LogitT)pos_alpha, \
(LogitT)neg_alpha, \
(LogitT)gamma, \
negative_index, \
logit, \
target, \
......
......@@ -7,58 +7,58 @@ namespace kernel {
namespace {
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void _SparseSoftmaxCrossEntropy(
const int outer_dim,
const int inner_dim,
const int axis_dim,
const int ignore_index,
const LogitType* prob,
const TargetType* target,
LogitType* loss,
LogitType* mask) {
const LogitT* prob,
const TargetT* target,
LogitT* loss,
LogitT* mask) {
std::array<int, 2> idx = {0, 0};
std::array<int, 2> dims = {outer_dim, inner_dim};
int count = dims[0] * dims[1], k;
for (int i = 0; i < count; ++i) {
const int label = (int)target[i];
if (label == ignore_index) {
loss[i] = mask[i] = LogitType(0);
loss[i] = mask[i] = LogitT(0);
} else {
k = (idx[0] * axis_dim + label) * inner_dim + idx[1];
loss[i] = -std::log(std::max(prob[k], LogitType(FLT_MIN)));
mask[i] = LogitType(1);
loss[i] = -std::log(std::max(prob[k], LogitT(FLT_MIN)));
mask[i] = LogitT(1);
}
math::utils::IncreaseIndexInDims(2, dims.data(), idx.data());
}
}
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void _SparseSoftmaxCrossEntropyGrad(
const int outer_dim,
const int inner_dim,
const int axis_dim,
const int ignore_index,
const LogitType* prob,
const TargetType* target,
LogitType* dx,
LogitType* mask) {
const LogitT* prob,
const TargetT* target,
LogitT* dx,
LogitT* mask) {
std::array<int, 2> idx = {0, 0};
std::array<int, 2> dims = {outer_dim, inner_dim};
int count = dims[0] * dims[1], k;
for (int i = 0; i < count; ++i) {
const int label = (int)target[i];
if (label == ignore_index) {
LogitType* offset_dx = dx + idx[0] * axis_dim * inner_dim + idx[1];
LogitT* offset_dx = dx + idx[0] * axis_dim * inner_dim + idx[1];
for (int j = 0; j < axis_dim; ++j) {
(*offset_dx) = LogitType(0);
(*offset_dx) = LogitT(0);
offset_dx += inner_dim;
}
mask[i] = LogitType(0);
mask[i] = LogitT(0);
} else {
k = (idx[0] * axis_dim + label) * inner_dim + idx[1];
dx[k] -= LogitType(1);
mask[i] = LogitType(1);
dx[k] -= LogitT(1);
mask[i] = LogitT(1);
}
math::utils::IncreaseIndexInDims(2, dims.data(), idx.data());
}
......@@ -68,27 +68,27 @@ void _SparseSoftmaxCrossEntropyGrad(
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(name, LogitType, TargetType) \
template <> \
void name<LogitType, TargetType, CPUContext>( \
const int outer_dim, \
const int inner_dim, \
const int axis_dim, \
const int ignore_index, \
const LogitType* prob, \
const TargetType* target, \
LogitType* loss, \
LogitType* mask, \
CPUContext* ctx) { \
_##name( \
outer_dim, \
inner_dim, \
axis_dim, \
ignore_index, \
prob, \
target, \
loss, \
mask); \
#define DEFINE_KERNEL_LAUNCHER(name, LogitT, TargetT) \
template <> \
void name<LogitT, TargetT, CPUContext>( \
const int outer_dim, \
const int inner_dim, \
const int axis_dim, \
const int ignore_index, \
const LogitT* prob, \
const TargetT* target, \
LogitT* loss, \
LogitT* mask, \
CPUContext* ctx) { \
_##name( \
outer_dim, \
inner_dim, \
axis_dim, \
ignore_index, \
prob, \
target, \
loss, \
mask); \
}
DEFINE_KERNEL_LAUNCHER(SparseSoftmaxCrossEntropy, float, float);
......
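The gradient path relies on the usual softmax cross-entropy identity: the operator first copies the cached probabilities into dx (see SparseSoftmaxCrossEntropyGradientOp below), then the kernel subtracts one at the target class,

\frac{\partial \ell_{o,i}}{\partial z_{o,j,i}} = p_{o,j,i} - [\, j = t_{o,i} \,],

which is exactly dx[k] -= LogitT(1); ignored positions instead have their whole class column zeroed together with the mask.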
......@@ -9,54 +9,54 @@ namespace kernel {
namespace {
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
__global__ void _SparseSoftmaxCrossEntropy(
const int nthreads,
const int inner_dim,
const int axis_dim,
const int ignore_index,
const LogitType* prob,
const TargetType* target,
LogitType* loss,
LogitType* mask) {
const LogitT* prob,
const TargetT* target,
LogitT* loss,
LogitT* mask) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int i = yi / inner_dim;
const int j = yi % inner_dim;
const int label = target[i * inner_dim + j];
if (label == ignore_index) {
loss[yi] = mask[yi] = LogitType(0);
loss[yi] = mask[yi] = LogitT(0);
} else {
loss[yi] = -log(max(
prob[(i * axis_dim + label) * inner_dim + j], LogitType(FLT_MIN)));
mask[yi] = LogitType(1);
loss[yi] = -log(
max(prob[(i * axis_dim + label) * inner_dim + j], LogitT(FLT_MIN)));
mask[yi] = LogitT(1);
}
}
}
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
__global__ void _SparseSoftmaxCrossEntropyGrad(
const int nthreads,
const int inner_dim,
const int axis_dim,
const int ignore_index,
const LogitType* prob,
const TargetType* target,
LogitType* dx,
LogitType* mask) {
const LogitT* prob,
const TargetT* target,
LogitT* dx,
LogitT* mask) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int i = yi / inner_dim;
const int j = yi % inner_dim;
const int label = target[i * inner_dim + j];
if (label == ignore_index) {
LogitType* offset_dx = dx + i * axis_dim * inner_dim + j;
LogitT* offset_dx = dx + i * axis_dim * inner_dim + j;
for (int k = 0; k < axis_dim; ++k) {
(*offset_dx) = LogitType(0);
(*offset_dx) = LogitT(0);
offset_dx += inner_dim;
}
mask[yi] = LogitType(0);
mask[yi] = LogitT(0);
} else {
dx[(i * axis_dim + label) * inner_dim + j] -= LogitType(1);
mask[yi] = LogitType(1);
dx[(i * axis_dim + label) * inner_dim + j] -= LogitT(1);
mask[yi] = LogitT(1);
}
}
}
......@@ -65,17 +65,17 @@ __global__ void _SparseSoftmaxCrossEntropyGrad(
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(name, LogitType, TargetType) \
#define DEFINE_KERNEL_LAUNCHER(name, LogitT, TargetT) \
template <> \
void name<LogitType, TargetType, CUDAContext>( \
void name<LogitT, TargetT, CUDAContext>( \
const int outer_dim, \
const int inner_dim, \
const int axis_dim, \
const int ignore_index, \
const LogitType* prob, \
const TargetType* target, \
LogitType* loss, \
LogitType* mask, \
const LogitT* prob, \
const TargetT* target, \
LogitT* loss, \
LogitT* mask, \
CUDAContext* ctx) { \
const auto nthreads = outer_dim * inner_dim; \
_##name<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
......
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/conversions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......@@ -9,159 +10,72 @@ namespace kernel {
namespace {
template <typename T>
template <typename T, typename AccT>
__global__ void
_Clip(const int nthreads, const T low, const T high, const T* x, T* y) {
_Clip(const int nthreads, const AccT low, const AccT high, const T* x, T* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = max(low, min(x[i], high));
y[i] = convert::To<T>(max(low, min(convert::To<AccT>(x[i]), high)));
}
}
template <>
__global__ void _Clip<half>(
const int nthreads,
const half low,
const half high,
const half* x,
half* y) {
#if __CUDA_ARCH__ >= 530
CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = __hlt(__ldg(x + i), high)
? (__hgt(__ldg(x + i), low) ? __ldg(x + i) : low)
: high;
}
#else
const float kLow = __half2float(low);
const float kHigh = __half2float(high);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = __float2half(max(kLow, min(__half2float(x[i]), kHigh)));
}
#endif
}
template <typename T>
template <typename T, typename AccT>
__global__ void _ClipGrad(
const int nthreads,
const T low,
const T high,
const AccT low,
const AccT high,
const T* dy,
const T* x,
T* dx) {
const T kZero = convert::To<T>(0.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 350
dx[i] = __ldg(x + i) < low || __ldg(x + i) > high ? T(0) : dy[i];
#else
dx[i] = x[i] < low || x[i] > high ? T(0) : dy[i];
#endif
const AccT val = convert::To<AccT>(x[i]);
dx[i] = val < low || val > high ? kZero : dy[i];
}
}
template <>
__global__ void _ClipGrad<half>(
const int nthreads,
const half low,
const half high,
const half* dy,
const half* x,
half* dx) {
const half kZero = __float2half(0.f);
#if __CUDA_ARCH__ >= 530
CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx[i] =
(__hlt(__ldg(x + i), low) || __hgt(__ldg(x + i), high)) ? kZero : dy[i];
}
#elif __CUDA_ARCH__ >= 350
const float kLow = __half2float(low);
const float kHigh = __half2float(high);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx[i] = (__half2float(__ldg(x + i)) < kLow ||
__half2float(__ldg(x + i)) > kHigh)
? kZero
: dy[i];
}
#else
const float kLow = __half2float(low);
const float kHigh = __half2float(high);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx[i] = (__half2float(x[i]) < kLow || __half2float(x[i]) > kHigh) ? kZero
: dy[i];
}
#endif
}
} // namespace
/* ------------------- Launcher Separator ------------------- */
template <>
void Clip<float16, CUDAContext>(
const int count,
const float low,
const float high,
const float16* x,
float16* y,
CUDAContext* ctx) {
_Clip<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
count,
convert::To<half>(low),
convert::To<half>(high),
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
}
template <>
void ClipGrad<float16, CUDAContext>(
const int count,
const float low,
const float high,
const float16* dy,
const float16* x,
float16* dx,
CUDAContext* ctx) {
_ClipGrad<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
count,
convert::To<half>(low),
convert::To<half>(high),
reinterpret_cast<const half*>(dy),
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(dx));
} // ClipGrad
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void Clip<T, CUDAContext>( \
const int count, \
const float low, \
const float high, \
const T* x, \
T* y, \
CUDAContext* ctx) { \
_Clip<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, convert::To<T>(low), convert::To<T>(high), x, y); \
#define DEFINE_KERNEL_LAUNCHER(T, AccT) \
template <> \
void Clip<T, CUDAContext>( \
const int count, \
const float low, \
const float high, \
const T* x, \
T* y, \
CUDAContext* ctx) { \
_Clip<T, AccT> \
<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, low, high, x, y); \
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \
template <> \
void ClipGrad<T, CUDAContext>( \
const int count, \
const float low, \
const float high, \
const T* dy, \
const T* x, \
T* dx, \
CUDAContext* ctx) { \
_ClipGrad<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, convert::To<T>(low), convert::To<T>(high), dy, x, dx); \
#define DEFINE_GRAD_KERNEL_LAUNCHER(T, AccT) \
template <> \
void ClipGrad<T, CUDAContext>( \
const int count, \
const float low, \
const float high, \
const T* dy, \
const T* x, \
T* dx, \
CUDAContext* ctx) { \
_ClipGrad<T, AccT> \
<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, low, high, dy, x, dx); \
}
DEFINE_KERNEL_LAUNCHER(int8_t);
DEFINE_KERNEL_LAUNCHER(uint8_t);
DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
DEFINE_KERNEL_LAUNCHER(int8_t, int8_t);
DEFINE_KERNEL_LAUNCHER(uint8_t, uint8_t);
DEFINE_KERNEL_LAUNCHER(int, int);
DEFINE_KERNEL_LAUNCHER(int64_t, int64_t);
DEFINE_KERNEL_LAUNCHER(float16, float);
DEFINE_KERNEL_LAUNCHER(float, float);
DEFINE_KERNEL_LAUNCHER(double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(float16, float);
DEFINE_GRAD_KERNEL_LAUNCHER(float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(double, double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
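The rewrite folds the hand-written half specializations into a single (T, AccT) template: each value is widened to the accumulator type, clipped there, and narrowed back, so float16 now rides the generic kernel with AccT = float. The pattern, as a host-side sketch (hypothetical helper; convert::To<> shown as a cast for built-in types):

#include <algorithm>

template <typename T, typename AccT>
T ClipValue(T x, AccT low, AccT high) {
  const AccT v = static_cast<AccT>(x);  // convert::To<AccT>(x) in the kernel
  return static_cast<T>(std::max(low, std::min(v, high)));
}

The gradient kernel applies the same widening to decide whether x[i] fell outside [low, high], passing dy[i] through or writing a zero of type T.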
......@@ -20,15 +20,15 @@ void _RowwiseMoments(
#pragma omp parallel for num_threads(OMP_THREADS(cols))
#endif
for (int i = 0; i < cols; ++i) {
T x_val;
AccT m_val = AccT(0), v_val = AccT(0), mu;
AccT x_val, m_val = AccT(0), v_val = AccT(0);
for (int j = 0; j < rows; ++j) {
x_val = x[j * cols + i];
x_val = convert::To<AccT>(x[j * cols + i]);
m_val += x_val;
v_val += x_val * x_val;
}
mean[i] = mu = m_val * scale;
var[i] = v_val * scale - mu * mu;
m_val *= scale;
mean[i] = m_val;
var[i] = v_val * scale - m_val * m_val;
}
}
......@@ -44,15 +44,15 @@ void _ColwiseMoments(
#pragma omp parallel for num_threads(OMP_THREADS(rows))
#endif
for (int i = 0; i < rows; ++i) {
T x_val;
AccT m_val = AccT(0), v_val = AccT(0), mu;
AccT x_val, m_val = AccT(0), v_val = AccT(0);
for (int j = 0; j < cols; ++j) {
x_val = x[i * cols + j];
x_val = convert::To<AccT>(x[i * cols + j]);
m_val += x_val;
v_val += x_val * x_val;
}
mean[i] = mu = m_val * scale;
var[i] = v_val * scale - mu * mu;
m_val *= scale;
mean[i] = m_val;
var[i] = v_val * scale - m_val * m_val;
}
}
......@@ -71,8 +71,7 @@ void _GenericMoments(
#pragma omp parallel for num_threads(OMP_THREADS(rows))
#endif
for (int i = 0; i < rows; ++i) {
T x_val;
AccT m_val = AccT(0), v_val = AccT(0), mu;
AccT x_val, m_val = AccT(0), v_val = AccT(0);
int xi, c, r;
for (int j = 0; j < cols; ++j) {
xi = 0;
......@@ -81,12 +80,13 @@ void _GenericMoments(
FIXED_DIVISOR_DIV_MOD(x_dims[d], c, &c, &r);
xi += r * x_strides[d];
}
x_val = x[xi];
x_val = convert::To<AccT>(x[xi]);
m_val += x_val;
v_val += x_val * x_val;
}
mean[i] = mu = m_val * scale;
var[i] = v_val * scale - mu * mu;
m_val *= scale;
mean[i] = m_val;
var[i] = v_val * scale - m_val * m_val;
}
}
......@@ -148,19 +148,6 @@ void _Moments(
/* ------------------- Launcher Separator ------------------- */
template <>
void Moments<float16, float, CPUContext>(
const int num_dims,
const int* dims,
const int num_axes,
const int* axes,
const float16* x,
float* mean,
float* var,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
#define DEFINE_KERNEL_LAUNCHER(T, AccT) \
template <> \
void Moments<T, AccT, CPUContext>( \
......@@ -178,7 +165,8 @@ void Moments<float16, float, CPUContext>(
DEFINE_KERNEL_LAUNCHER(int8_t, float);
DEFINE_KERNEL_LAUNCHER(uint8_t, float);
DEFINE_KERNEL_LAUNCHER(int, float);
DEFINE_KERNEL_LAUNCHER(int64_t, float);
DEFINE_KERNEL_LAUNCHER(int64_t, double);
DEFINE_KERNEL_LAUNCHER(float16, float);
DEFINE_KERNEL_LAUNCHER(float, float);
DEFINE_KERNEL_LAUNCHER(double, double);
#undef DEFINE_KERNEL_LAUNCHER
......
......@@ -201,7 +201,7 @@ void _Moments(
DEFINE_KERNEL_LAUNCHER(int8_t, int8_t, float);
DEFINE_KERNEL_LAUNCHER(uint8_t, uint8_t, float);
DEFINE_KERNEL_LAUNCHER(int, int, float);
DEFINE_KERNEL_LAUNCHER(int64_t, int64_t, float);
DEFINE_KERNEL_LAUNCHER(int64_t, int64_t, double);
DEFINE_KERNEL_LAUNCHER(float16, half, float);
DEFINE_KERNEL_LAUNCHER(float, float, float);
DEFINE_KERNEL_LAUNCHER(double, double, double);
......
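All three CPU reduction paths (rowwise, colwise, generic) now widen each element to AccT before accumulating, then apply the one-pass identity

\mu = \frac{1}{n}\sum_j x_j, \qquad \sigma^2 = \frac{1}{n}\sum_j x_j^2 - \mu^2.

Because $E[x^2] - \mu^2$ is prone to catastrophic cancellation, the int64_t launchers were also moved from a float to a double accumulator on both CPU and CUDA.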
......@@ -70,7 +70,7 @@ void _L1NormalizeGrad(
auto X = ConstEigenStridedVectorMap<T>(
x + offset, 1, reduce_dim, EigenInnerStride(inner_dim));
auto norm = std::max(X.template lpNorm<1>() / normalizer, epsilon);
auto norm2 = std::pow(norm, 2);
auto norm2 = std::pow(norm, T(2));
EigenStridedVectorMap<T>(
dx + offset, 1, reduce_dim, EigenInnerStride(inner_dim)) =
(dY / norm) -
......@@ -98,7 +98,7 @@ void _L2NormalizeGrad(
auto X = ConstEigenStridedVectorMap<T>(
x + offset, 1, reduce_dim, EigenInnerStride(inner_dim));
auto norm = std::max(std::sqrt(X.squaredNorm() / normalizer), epsilon);
auto norm3 = std::pow(norm, 3);
auto norm3 = std::pow(norm, T(3));
EigenStridedVectorMap<T>(
dx + offset, 1, reduce_dim, EigenInnerStride(inner_dim)) =
(dY / norm) - ((X / norm3) * dY.dot(X) / normalizer);
......
......@@ -93,7 +93,7 @@ __global__ void _L1NormalizeGrad(
val2 = BlockReduce<AccT>(storage).Sum(val2);
if (threadIdx.x == 0) {
norm = max(val1 / normalizer, epsilon);
norm2 = pow(norm, 2);
norm2 = pow(norm, AccT(2));
sum = val2 / normalizer;
}
__syncthreads();
......@@ -130,7 +130,7 @@ __global__ void _L2NormalizeGrad(
val2 = BlockReduce<AccT>(storage).Sum(val2);
if (threadIdx.x == 0) {
norm = max(sqrt(val1 / normalizer), epsilon);
norm3 = pow(norm, 3);
norm3 = pow(norm, AccT(3));
sum = val2 / normalizer;
}
__syncthreads();
......
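The pow(norm, 2) to pow(norm, T(2)) changes only pin the exponent's type so the intended pow overload is selected for half/float/double. The quantity itself is unchanged: for L2 normalization with $\nu = \max\!\big(\sqrt{\tfrac{1}{N}\sum_j x_j^2},\ \epsilon\big)$, normalizer $N$, and upstream gradient $g$,

\frac{\partial L}{\partial x_i} = \frac{g_i}{\nu} - \frac{x_i}{\nu^3} \cdot \frac{\langle g, x \rangle}{N},

matching (dY / norm) - ((X / norm3) * dY.dot(X) / normalizer) in both the Eigen and CUDA paths.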
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/conversions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......
......@@ -5,7 +5,7 @@
namespace dragon {
template <class Context>
template <typename Tx, typename Ty>
template <typename InputT, typename OutputT>
void ChannelNormalizeOp<Context>::DoRunWithTypeAndCast() {
auto &X = Input(0), *Y = Output(0);
CANONICALIZE_AXIS_WITH_TENSOR(X);
......@@ -35,10 +35,10 @@ void ChannelNormalizeOp<Context>::DoRunWithTypeAndCast() {
num_dims,
X_strides.data(),
Y_dims.data(),
X.template data<Tx, Context>(),
X.template data<InputT, Context>(),
X_mean_.template data<float, Context>(),
X_std_.template data<float, Context>(),
Y->Reshape(Y_dims)->template mutable_data<Ty, Context>(),
Y->Reshape(Y_dims)->template mutable_data<OutputT, Context>(),
ctx());
}
......
......@@ -6,7 +6,7 @@
namespace dragon {
template <class Context>
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void NLLLossOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0);
CANONICALIZE_AXIS_WITH_TENSOR(X);
......@@ -19,19 +19,19 @@ void NLLLossOp<Context>::DoRunWithType() {
<< "\nNumber of preds must match the number of targets.";
auto scratches = ctx()->workspace()->template data<Context>({
(size_t)num_preds * sizeof(LogitType), // loss
(size_t)num_preds * sizeof(LogitType) + sizeof(LogitType), // mask
(size_t)num_preds * sizeof(LogitT), // loss
(size_t)num_preds * sizeof(LogitT) + sizeof(LogitT), // mask
});
auto* loss = static_cast<LogitType*>(scratches[0]);
auto* mask = static_cast<LogitType*>(scratches[1]);
auto* loss = static_cast<LogitT*>(scratches[0]);
auto* mask = static_cast<LogitT*>(scratches[1]);
kernel::NLLLoss(
outer_dim,
inner_dim,
X.dim(axis),
ignore_index_,
X.template data<LogitType, Context>(),
Input(1).template data<TargetType, Context>(),
X.template data<LogitT, Context>(),
Input(1).template data<TargetT, Context>(),
loss,
mask,
ctx());
......@@ -42,7 +42,7 @@ void NLLLossOp<Context>::DoRunWithType() {
math::Copy(
num_preds,
loss,
Y->Reshape(out_shape)->template mutable_data<LogitType, Context>(),
Y->Reshape(out_shape)->template mutable_data<LogitT, Context>(),
ctx());
} else {
int64_t normalizer = 1;
......@@ -59,7 +59,7 @@ void NLLLossOp<Context>::DoRunWithType() {
normalizer,
loss,
mask,
Y->Reshape({})->template mutable_data<LogitType, Context>(),
Y->Reshape({})->template mutable_data<LogitT, Context>(),
ctx());
}
}
......@@ -91,7 +91,7 @@ void NLLLossOp<Context>::RunOnDevice() {
}
template <class Context>
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void NLLLossGradientOp<Context>::DoRunWithType() {
auto &X = Input(0), &dY = Input(-1), *dX = Output(0);
CANONICALIZE_AXIS_WITH_TENSOR(X);
......@@ -101,19 +101,19 @@ void NLLLossGradientOp<Context>::DoRunWithType() {
auto inner_dim = dX->count(axis + 1);
auto num_preds = outer_dim * inner_dim;
auto* dy = dY.template data<LogitType, Context>();
auto* dx = dX->template mutable_data<LogitType, Context>();
auto* dy = dY.template data<LogitT, Context>();
auto* dx = dX->template mutable_data<LogitT, Context>();
auto* mask =
ctx()->workspace()->template data<LogitType, Context>({num_preds + 1})[0];
math::Set(dX->count(), convert::To<LogitType>(0.f), dx, ctx());
ctx()->workspace()->template data<LogitT, Context>({num_preds + 1})[0];
math::Set(dX->count(), convert::To<LogitT>(0.f), dx, ctx());
kernel::NLLLossGrad(
outer_dim,
inner_dim,
dX->dim(axis),
ignore_index_,
X.template data<LogitType, Context>(),
Input(1).template data<TargetType, Context>(),
X.template data<LogitT, Context>(),
Input(1).template data<TargetT, Context>(),
dx,
mask,
ctx());
......
......@@ -28,7 +28,7 @@ class NLLLossOp final : public Operator<Context> {
void RunOnDevice() override;
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void DoRunWithType();
protected:
......@@ -47,7 +47,7 @@ class NLLLossGradientOp final : public Operator<Context> {
void RunOnDevice() override;
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void DoRunWithType();
protected:
......
......@@ -6,7 +6,7 @@
namespace dragon {
template <class Context>
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void SigmoidFocalLossOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0);
CANONICALIZE_AXIS_WITH_TENSOR(X);
......@@ -18,11 +18,11 @@ void SigmoidFocalLossOp<Context>::DoRunWithType() {
<< "\nNumber of preds must match the number of targets.";
auto scratches = ctx()->workspace()->template data<Context>({
X.size() * sizeof(LogitType), // loss
X.size() * sizeof(LogitType) + sizeof(LogitType), // mask
X.size() * sizeof(LogitT), // loss
X.size() * sizeof(LogitT) + sizeof(LogitT), // mask
});
auto* loss = static_cast<LogitType*>(scratches[0]);
auto* mask = static_cast<LogitType*>(scratches[1]);
auto* loss = static_cast<LogitT*>(scratches[0]);
auto* mask = static_cast<LogitT*>(scratches[1]);
kernel::SigmoidFocalLoss(
outer_dim,
......@@ -32,8 +32,8 @@ void SigmoidFocalLossOp<Context>::DoRunWithType() {
neg_alpha_,
gamma_,
negative_index_,
X.template data<LogitType, Context>(),
Input(1).template data<TargetType, Context>(),
X.template data<LogitT, Context>(),
Input(1).template data<TargetT, Context>(),
loss,
mask,
ctx());
......@@ -42,7 +42,7 @@ void SigmoidFocalLossOp<Context>::DoRunWithType() {
math::Copy(
X.count(),
loss,
Y->ReshapeLike(X)->template mutable_data<LogitType, Context>(),
Y->ReshapeLike(X)->template mutable_data<LogitT, Context>(),
ctx());
} else {
int64_t normalizer = 1;
......@@ -59,7 +59,7 @@ void SigmoidFocalLossOp<Context>::DoRunWithType() {
normalizer,
loss,
mask,
Y->Reshape({})->template mutable_data<LogitType, Context>(),
Y->Reshape({})->template mutable_data<LogitT, Context>(),
ctx());
}
}
......@@ -91,7 +91,7 @@ void SigmoidFocalLossOp<Context>::RunOnDevice() {
}
template <class Context>
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void SigmoidFocalLossGradientOp<Context>::DoRunWithType() {
auto &X = Input(0), &dY = Input(-1), *dX = Output(0);
CANONICALIZE_AXIS_WITH_TENSOR(X);
......@@ -100,10 +100,10 @@ void SigmoidFocalLossGradientOp<Context>::DoRunWithType() {
auto outer_dim = dX->count(0, axis);
auto inner_dim = dX->count(axis + 1);
auto* dy = dY.template data<LogitType, Context>();
auto* dx = dX->template mutable_data<LogitType, Context>();
auto* mask = ctx()->workspace()->template data<LogitType, Context>(
{dX->count() + 1})[0];
auto* dy = dY.template data<LogitT, Context>();
auto* dx = dX->template mutable_data<LogitT, Context>();
auto* mask =
ctx()->workspace()->template data<LogitT, Context>({dX->count() + 1})[0];
kernel::SigmoidFocalLossGrad(
outer_dim,
......@@ -113,8 +113,8 @@ void SigmoidFocalLossGradientOp<Context>::DoRunWithType() {
neg_alpha_,
gamma_,
negative_index_,
X.template data<LogitType, Context>(),
Input(1).template data<TargetType, Context>(),
X.template data<LogitT, Context>(),
Input(1).template data<TargetT, Context>(),
dx,
mask,
ctx());
......
......@@ -48,7 +48,7 @@ class SigmoidFocalLossOp final : public Operator<Context> {
void RunOnDevice() override;
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void DoRunWithType();
protected:
......@@ -88,7 +88,7 @@ class SigmoidFocalLossGradientOp final : public Operator<Context> {
void RunOnDevice() override;
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void DoRunWithType();
protected:
......
......@@ -45,7 +45,7 @@ class SparseSoftmaxCrossEntropyOp : public Operator<Context> {
void RunOnDevice() override;
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void DoRunWithType();
protected:
......@@ -81,7 +81,7 @@ class SparseSoftmaxCrossEntropyGradientOp : public Operator<Context> {
void RunOnDevice() override;
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void DoRunWithType();
protected:
......
......@@ -6,7 +6,7 @@
namespace dragon {
template <class Context>
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void SparseSoftmaxCrossEntropyOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0);
CANONICALIZE_AXIS_WITH_TENSOR(X);
......@@ -18,20 +18,20 @@ void SparseSoftmaxCrossEntropyOp<Context>::DoRunWithType() {
CHECK_EQ(num_preds, Input(1).count())
<< "\nNumber of preds must match the number of targets.";
auto* X_prob = Buffer("prob")->ReshapeLike(X);
auto* prob = X_prob->template mutable_data<LogitType, Context>();
auto* prob = X_prob->template mutable_data<LogitT, Context>();
auto scratches = ctx()->workspace()->template data<Context>({
(size_t)num_preds * sizeof(LogitType), // loss
(size_t)num_preds * sizeof(LogitType) + sizeof(LogitType), // mask
(size_t)num_preds * sizeof(LogitT), // loss
(size_t)num_preds * sizeof(LogitT) + sizeof(LogitT), // mask
});
auto* loss = static_cast<LogitType*>(scratches[0]);
auto* mask = static_cast<LogitType*>(scratches[1]);
auto* loss = static_cast<LogitT*>(scratches[0]);
auto* mask = static_cast<LogitT*>(scratches[1]);
kernel::Softmax(
outer_dim,
inner_dim,
X.dim(axis),
X.template data<LogitType, Context>(),
X.template data<LogitT, Context>(),
prob,
ctx());
......@@ -41,7 +41,7 @@ void SparseSoftmaxCrossEntropyOp<Context>::DoRunWithType() {
X.dim(axis),
ignore_index_,
prob,
Input(1).template data<TargetType, Context>(),
Input(1).template data<TargetT, Context>(),
loss,
mask,
ctx());
......@@ -52,7 +52,7 @@ void SparseSoftmaxCrossEntropyOp<Context>::DoRunWithType() {
math::Copy(
num_preds,
loss,
Y->Reshape(out_shape)->template mutable_data<LogitType, Context>(),
Y->Reshape(out_shape)->template mutable_data<LogitT, Context>(),
ctx());
} else {
int64_t normalizer = 1;
......@@ -69,7 +69,7 @@ void SparseSoftmaxCrossEntropyOp<Context>::DoRunWithType() {
normalizer,
loss,
mask,
Y->Reshape({})->template mutable_data<LogitType, Context>(),
Y->Reshape({})->template mutable_data<LogitT, Context>(),
ctx());
}
}
......@@ -101,7 +101,7 @@ void SparseSoftmaxCrossEntropyOp<Context>::RunOnDevice() {
}
template <class Context>
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void SparseSoftmaxCrossEntropyGradientOp<Context>::DoRunWithType() {
auto &dY = Input(-1), *dX = Output(0);
CANONICALIZE_AXIS_WITH_TENSOR(Input(0));
......@@ -110,11 +110,11 @@ void SparseSoftmaxCrossEntropyGradientOp<Context>::DoRunWithType() {
auto inner_dim = dX->count(axis + 1);
auto num_preds = outer_dim * inner_dim;
auto* prob = Buffer("prob")->template data<LogitType, Context>();
auto* dy = Input(-1).template data<LogitType, Context>();
auto* dx = Output(0)->template mutable_data<LogitType, Context>();
auto* prob = Buffer("prob")->template data<LogitT, Context>();
auto* dy = Input(-1).template data<LogitT, Context>();
auto* dx = Output(0)->template mutable_data<LogitT, Context>();
auto* mask =
ctx()->workspace()->template data<LogitType, Context>({num_preds + 1})[0];
ctx()->workspace()->template data<LogitT, Context>({num_preds + 1})[0];
math::Copy(dX->count(), prob, dx, ctx());
......@@ -124,7 +124,7 @@ void SparseSoftmaxCrossEntropyGradientOp<Context>::DoRunWithType() {
dX->dim(axis),
ignore_index_,
prob,
Input(1).template data<TargetType, Context>(),
Input(1).template data<TargetT, Context>(),
dx,
mask,
ctx());
......
......@@ -5,8 +5,9 @@
namespace dragon {
template <class Context>
template <typename Tx, typename Ty>
template <typename T>
void MomentsOp<Context>::DoRunWithType() {
using OutputT = typename math::utils::AccmulatorType<T>::type;
auto &X = Input(0), *Y1 = Output(0), *Y2 = Output(1);
// Determine the reduce axes
......@@ -35,13 +36,13 @@ void MomentsOp<Context>::DoRunWithType() {
if (X.count() == 1) {
math::Cast(
1,
X.template data<Tx, Context>(),
Y1->Reshape(Y_shape)->template mutable_data<Ty, Context>(),
X.template data<T, Context>(),
Y1->Reshape(Y_shape)->template mutable_data<OutputT, Context>(),
ctx());
math::Set(
1,
convert::To<Ty>(0.f),
Y2->Reshape(Y_shape)->template mutable_data<Ty, Context>(),
convert::To<OutputT>(0.f),
Y2->Reshape(Y_shape)->template mutable_data<OutputT, Context>(),
ctx());
} else {
kernel::Moments(
......@@ -49,35 +50,16 @@ void MomentsOp<Context>::DoRunWithType() {
X_dims.data(),
reduce_axes.size(),
reduce_axes.data(),
X.template data<Tx, Context>(),
Y1->Reshape(Y_shape)->template mutable_data<Ty, Context>(),
Y2->Reshape(Y_shape)->template mutable_data<Ty, Context>(),
X.template data<T, Context>(),
Y1->Reshape(Y_shape)->template mutable_data<OutputT, Context>(),
Y2->Reshape(Y_shape)->template mutable_data<OutputT, Context>(),
ctx());
}
}
template <class Context>
void MomentsOp<Context>::RunOnDevice() {
auto& X = Input(0);
if (X.template IsType<int8_t>()) {
DoRunWithType<int8_t, float>();
} else if (X.template IsType<uint8_t>()) {
DoRunWithType<uint8_t, float>();
} else if (X.template IsType<int>()) {
DoRunWithType<int, float>();
} else if (X.template IsType<int64_t>()) {
DoRunWithType<int64_t, float>();
} else if (X.template IsType<float16>()) {
DoRunWithType<float16, float>();
} else if (X.template IsType<float>()) {
DoRunWithType<float, float>();
} else if (X.template IsType<double>()) {
DoRunWithType<double, double>();
} else {
LOG(FATAL) << MessageForUnsupported(
types::to_string(X.meta()),
{"int8", "uint8", "int32", "int64", "float16", "float32", "float64"});
}
DispatchHelper<NumericalTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU_OPERATOR(Moments);
......
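The manual if/else ladder over input types is replaced by DispatchHelper<NumericalTensorTypes>::Call(this, Input(0)), and the former (Tx, Ty) pair collapses to a single T with the output type derived inside DoRunWithType via AccmulatorType<T>::type. A generic sketch of the dispatch idea (not Dragon's actual DispatchHelper; float16 omitted here since it is a library type):

#include <cstdint>
#include <stdexcept>
#include <typeinfo>

template <typename... Ts>
struct TypeList {};
using NumericalTypes = TypeList<int8_t, uint8_t, int, int64_t, float, double>;

// Invokes op->DoRunWithType<T>() for the first T matching the tensor's type.
template <typename Op, typename... Ts>
void Dispatch(Op* op, const std::type_info& meta, TypeList<Ts...>) {
  const bool matched =
      ((meta == typeid(Ts) ? (op->template DoRunWithType<Ts>(), true)
                           : false) ||
       ...);
  if (!matched) throw std::runtime_error("unsupported data type");
}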
......@@ -28,7 +28,7 @@ class MomentsOp final : public Operator<Context> {
void RunOnDevice() override;
template <typename Tx, typename Ty>
template <typename T>
void DoRunWithType();
protected:
......
......@@ -3,7 +3,7 @@
namespace dragon {
template <class Context>
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void AccuracyOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0);
CANONICALIZE_AXIS_WITH_TENSOR(X);
......@@ -18,21 +18,21 @@ void AccuracyOp<Context>::DoRunWithType() {
int64_t acc = 0, count = 0;
int64_t cols = X.count() / outer_dim;
auto* logit = X.template data<LogitType, CPUContext>();
auto* target = Input(1).template data<TargetType, CPUContext>();
auto* logit = X.template data<LogitT, CPUContext>();
auto* target = Input(1).template data<TargetT, CPUContext>();
for (int i = 0; i < outer_dim; ++i) {
for (int j = 0; j < inner_dim; ++j) {
const int label = target[i * inner_dim + j];
if (label == ignore_index_) continue;
vector<pair<LogitType, int>> vec;
vector<pair<LogitT, int>> vec;
for (int k = 0; k < axis_dim; k++)
vec.push_back(std::make_pair(logit[i * cols + k * inner_dim + j], k));
std::partial_sort(
vec.begin(),
vec.begin() + top_k_,
vec.end(),
std::greater<pair<LogitType, int>>());
std::greater<pair<LogitT, int>>());
for (int k = 0; k < top_k_; k++) {
if (vec[k].second == label) {
acc++;
......
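The CPU loop implements top-k accuracy directly: for each (outer, inner) position it partially sorts the axis_dim class scores in descending order and counts a hit when the target lands among the k largest,

\text{acc} = \frac{1}{\#\{(o,i) : t_{o,i} \neq \text{ignore\_index}\}}
\sum_{(o,i)} \big[\, t_{o,i} \in \text{TopK}_k(z_{o,\cdot,i}) \,\big],

with std::partial_sort over (score, class) pairs doing the selection.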
......@@ -28,7 +28,7 @@ class AccuracyOp final : public Operator<Context> {
void RunOnDevice() override;
template <typename LogitType, typename TargetType>
template <typename LogitT, typename TargetT>
void DoRunWithType();
protected:
......
......@@ -8,11 +8,11 @@ namespace dragon {
template <class Context>
template <typename T>
void BatchNormOp<Context>::TrainingImpl() {
using ParamType = typename math::utils::AccmulatorType<T>::type;
TENSOR_FILL_WITH_TYPE(Input(1), vec64_t({C_}), ParamType);
TENSOR_FILL_WITH_TYPE(Input(2), vec64_t({C_}), ParamType);
TENSOR_FILL_WITH_TYPE(Input(3), vec64_t({C_}), ParamType);
TENSOR_FILL_WITH_TYPE(Input(4), vec64_t({C_}), ParamType);
using ParamT = typename math::utils::AccmulatorType<T>::type;
TENSOR_FILL_WITH_TYPE(Input(1), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(2), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(3), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(4), vec64_t({C_}), ParamT);
auto* X_mu = Buffer("X_mu")->Reshape({C_});
auto* X_rsig = Buffer("X_rsig")->Reshape({C_});
......@@ -20,11 +20,11 @@ void BatchNormOp<Context>::TrainingImpl() {
auto* X_bias = Buffer("X_bias")->Reshape({C_});
auto* x = Input(0).template data<T, Context>();
auto* rm = Input(3).template mutable_data<ParamType, Context>();
auto* rv = Input(4).template mutable_data<ParamType, Context>();
auto* mu = X_mu->template mutable_data<ParamType, Context>();
auto* rsig = X_rsig->template mutable_data<ParamType, Context>();
auto* scale = X_scale->template mutable_data<ParamType, Context>();
auto* rm = Input(3).template mutable_data<ParamT, Context>();
auto* rv = Input(4).template mutable_data<ParamT, Context>();
auto* mu = X_mu->template mutable_data<ParamT, Context>();
auto* rsig = X_rsig->template mutable_data<ParamT, Context>();
auto* scale = X_scale->template mutable_data<ParamT, Context>();
// Compute moments
if (sync_stats_ > 0) {
......@@ -45,7 +45,7 @@ void BatchNormOp<Context>::TrainingImpl() {
if (enable_nccl_) {
#ifdef USE_NCCL
auto coll_comm = this->nccl_comm();
auto coll_dtype = this->template nccl_dtype<ParamType>();
auto coll_dtype = this->template nccl_dtype<ParamT>();
NCCL_CHECK(ncclAllReduce(
(void*)mu,
(void*)mu,
......@@ -84,8 +84,9 @@ void BatchNormOp<Context>::TrainingImpl() {
// Compute running statistics
if (is_recomputing_ == 0) {
math::Axpby(C_, 1.f - momentum_, mu, momentum_, rm, ctx());
math::Axpby(C_, 1.f - momentum_, rsig, momentum_, rv, ctx());
auto decay_factor = momentum();
math::Axpby(C_, 1.f - decay_factor, mu, decay_factor, rm, ctx());
math::Axpby(C_, 1.f - decay_factor, rsig, decay_factor, rv, ctx());
}
// Inverse stddev from variance
......@@ -100,10 +101,10 @@ void BatchNormOp<Context>::TrainingImpl() {
x,
mu,
rsig,
Input(1).template data<ParamType, Context>(), // gamma
Input(2).template data<ParamType, Context>(), // beta
Input(1).template data<ParamT, Context>(), // gamma
Input(2).template data<ParamT, Context>(), // beta
scale,
X_bias->template mutable_data<ParamType, Context>(),
X_bias->template mutable_data<ParamT, Context>(),
Output(0)->template mutable_data<T, Context>(),
ctx());
}
......@@ -111,17 +112,17 @@ void BatchNormOp<Context>::TrainingImpl() {
template <class Context>
template <typename T>
void BatchNormOp<Context>::InferenceImpl() {
using ParamType = typename math::utils::AccmulatorType<T>::type;
TENSOR_FILL_WITH_TYPE(Input(1), vec64_t({C_}), ParamType);
TENSOR_FILL_WITH_TYPE(Input(2), vec64_t({C_}), ParamType);
TENSOR_FILL_WITH_TYPE(Input(3), vec64_t({C_}), ParamType);
TENSOR_FILL_WITH_TYPE(Input(4), vec64_t({C_}), ParamType);
using ParamT = typename math::utils::AccmulatorType<T>::type;
TENSOR_FILL_WITH_TYPE(Input(1), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(2), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(3), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(4), vec64_t({C_}), ParamT);
auto* X_rsig = Buffer("X_rsig")->Reshape({C_});
auto* X_scale = Buffer("X_scale")->Reshape({C_});
auto* X_bias = Buffer("X_bias")->Reshape({C_});
auto* rv = Input(4).template data<ParamType, Context>();
auto* rsig = X_rsig->template mutable_data<ParamType, Context>();
auto* rv = Input(4).template data<ParamT, Context>();
auto* rsig = X_rsig->template mutable_data<ParamT, Context>();
// Inverse stddev from variance
math::InvStd(C_, epsilon_, rv, rsig, ctx());
......@@ -133,12 +134,12 @@ void BatchNormOp<Context>::InferenceImpl() {
S_,
data_format(),
Input(0).template data<T, Context>(),
Input(3).template data<ParamType, Context>(),
Input(3).template data<ParamT, Context>(),
rsig,
Input(1).template data<ParamType, Context>(), // gamma
Input(2).template data<ParamType, Context>(), // beta
X_scale->template mutable_data<ParamType, Context>(),
X_bias->template mutable_data<ParamType, Context>(),
Input(1).template data<ParamT, Context>(), // gamma
Input(2).template data<ParamT, Context>(), // beta
X_scale->template mutable_data<ParamT, Context>(),
X_bias->template mutable_data<ParamT, Context>(),
Output(0)->template mutable_data<T, Context>(),
ctx());
}
......@@ -159,17 +160,17 @@ void BatchNormOp<Context>::RunOnDevice() {
template <class Context>
template <typename T>
void BatchNormGradientOp<Context>::TrainingImpl() {
using ParamType = typename math::utils::AccmulatorType<T>::type;
using ParamT = typename math::utils::AccmulatorType<T>::type;
auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
auto *X_mu = Buffer("X_mu"), *X_rsig = Buffer("X_rsig");
auto* x = Input(0).template data<T, Context>();
auto* gamma = Input(1).template data<ParamType, Context>();
auto* gamma = Input(1).template data<ParamT, Context>();
auto* dy = Input(4).template data<T, Context>();
auto* mu = X_mu->template data<ParamType, Context>();
auto* rsig = X_rsig->template data<ParamType, Context>();
auto* dgamma = dW->Reshape({C_})->template mutable_data<ParamType, Context>();
auto* dbeta = dB->Reshape({C_})->template mutable_data<ParamType, Context>();
auto* mu = X_mu->template data<ParamT, Context>();
auto* rsig = X_rsig->template data<ParamT, Context>();
auto* dgamma = dW->Reshape({C_})->template mutable_data<ParamT, Context>();
auto* dbeta = dB->Reshape({C_})->template mutable_data<ParamT, Context>();
// Gradient w.r.t. gamma and beta
kernel::BatchNormWGrad(
......@@ -181,7 +182,7 @@ void BatchNormGradientOp<Context>::TrainingImpl() {
if (enable_nccl_) {
#ifdef USE_NCCL
auto coll_comm = this->nccl_comm();
auto coll_dtype = this->template nccl_dtype<ParamType>();
auto coll_dtype = this->template nccl_dtype<ParamT>();
NCCL_CHECK(ncclAllReduce(
(void*)dgamma,
(void*)dgamma,
......@@ -231,18 +232,18 @@ void BatchNormGradientOp<Context>::TrainingImpl() {
template <class Context>
template <typename T>
void BatchNormGradientOp<Context>::InferenceImpl() {
using ParamType = typename math::utils::AccmulatorType<T>::type;
using ParamT = typename math::utils::AccmulatorType<T>::type;
auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
auto* X_scale = Buffer("X_scale")->Reshape({C_});
auto* rv = Input(3).template data<ParamType, Context>();
auto* rsig = X_scale->template mutable_data<ParamType, Context>();
auto* rv = Input(3).template data<ParamT, Context>();
auto* rsig = X_scale->template mutable_data<ParamT, Context>();
// Gradient w.r.t. gamma or beta if necessary
ParamType *dgamma = nullptr, *dbeta = nullptr;
ParamT *dgamma = nullptr, *dbeta = nullptr;
if (dW->has_name() || dB->has_name()) {
dgamma = dW->Reshape({C_})->template mutable_data<ParamType, Context>();
dbeta = dB->Reshape({C_})->template mutable_data<ParamType, Context>();
dgamma = dW->Reshape({C_})->template mutable_data<ParamT, Context>();
dbeta = dB->Reshape({C_})->template mutable_data<ParamT, Context>();
}
// Inverse stddev from variance
......@@ -255,9 +256,9 @@ void BatchNormGradientOp<Context>::InferenceImpl() {
S_,
data_format(),
Input(0).template data<T, Context>(), // x
Input(2).template data<ParamType, Context>(), // rm
Input(2).template data<ParamT, Context>(), // rm
rsig,
Input(1).template data<ParamType, Context>(), // gamma
Input(1).template data<ParamT, Context>(), // gamma
Input(4).template data<T, Context>(), // dy
dgamma,
dbeta,
......
......@@ -33,7 +33,6 @@ class BatchNormOpBase : public GenericOpBase<Context> {
public:
BatchNormOpBase(const OperatorDef& def, Workspace* ws)
: GenericOpBase<Context>(def, ws),
momentum_(OP_SINGLE_ARG(float, "momentum", 0.9f)),
epsilon_(OP_SINGLE_ARG(double, "epsilon", 1e-5)),
use_stats_(OP_SINGLE_ARG(int64_t, "use_stats", -1)),
sync_stats_(OP_SINGLE_ARG(int64_t, "comm", 0) > 0 ? 1 : 0) {}
......@@ -57,7 +56,6 @@ class BatchNormOpBase : public GenericOpBase<Context> {
}
protected:
float momentum_;
double epsilon_;
int64_t N_, C_, S_;
int64_t use_stats_, sync_stats_;
......@@ -68,7 +66,6 @@ class BatchNormOpBase : public GenericOpBase<Context> {
#define USE_BATCHNORM_FUNCTIONS \
using BatchNormOpBase<Context>::DetermineBaseArguments; \
using BatchNormOpBase<Context>::momentum_; \
using BatchNormOpBase<Context>::epsilon_; \
using BatchNormOpBase<Context>::use_stats_; \
using BatchNormOpBase<Context>::sync_stats_; \
......@@ -82,7 +79,9 @@ template <class Context>
class BatchNormOp : public BatchNormOpBase<Context> {
public:
BatchNormOp(const OperatorDef& def, Workspace* ws)
: BatchNormOpBase<Context>(def, ws) {}
: BatchNormOpBase<Context>(def, ws) {
INIT_OP_SINGLE_ARG_WITH_DESC(float, momentum, 0.9f);
}
USE_OPERATOR_FUNCTIONS;
USE_BATCHNORM_FUNCTIONS;
#ifdef USE_MPI
......@@ -105,6 +104,8 @@ class BatchNormOp : public BatchNormOpBase<Context> {
InferenceImpl<T>();
}
};
DECLARE_OP_SINGLE_ARG_WITH_DESC(float, momentum);
};
template <class Context>
......@@ -146,11 +147,9 @@ class CuDNNBatchNormOp final : public BatchNormOpBase<Context> {
CuDNNCreateTensorDesc(&bn_desc_);
CuDNNCreateTensorDesc(&input_desc_);
if (epsilon_ <= CUDNN_BN_MIN_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. \nSet it to "
<< "CUDNN_BN_MIN_EPSILON instead.";
epsilon_ = CUDNN_BN_MIN_EPSILON;
}
INIT_OP_SINGLE_ARG_WITH_DESC(float, momentum, 0.9f);
}
USE_OPERATOR_FUNCTIONS;
USE_BATCHNORM_FUNCTIONS;
......@@ -168,6 +167,7 @@ class CuDNNBatchNormOp final : public BatchNormOpBase<Context> {
protected:
cudnnTensorDescriptor_t input_desc_, bn_desc_;
cudnnBatchNormMode_t bn_mode_;
DECLARE_OP_SINGLE_ARG_WITH_DESC(float, momentum);
};
template <class Context>
......@@ -178,9 +178,6 @@ class CuDNNBatchNormGradientOp final : public BatchNormGradientOp<Context> {
CuDNNCreateTensorDesc(&bn_desc_);
CuDNNCreateTensorDesc(&input_desc_);
if (epsilon_ <= CUDNN_BN_MIN_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. \nSet it to "
<< "CUDNN_BN_MIN_EPSILON instead.";
epsilon_ = CUDNN_BN_MIN_EPSILON;
}
}
......@@ -211,8 +208,12 @@ class CuDNNBatchNormGradientOp final : public BatchNormGradientOp<Context> {
cudnnBatchNormMode_t bn_mode_;
};
DEFINE_OP_SINGLE_ARG_WITH_DESC(float, CuDNNBatchNormOp, momentum);
#endif // USE_CUDNN
DEFINE_OP_SINGLE_ARG_WITH_DESC(float, BatchNormOp, momentum);
} // namespace dragon
#endif // DRAGON_OPERATORS_NORMALIZATION_BATCH_NORM_OP_H_
......@@ -9,11 +9,11 @@ namespace dragon {
template <class Context>
template <typename T>
void CuDNNBatchNormOp<Context>::DoRunWithType() {
using ParamType = typename CuDNNType<T>::BNParamType;
TENSOR_FILL_WITH_TYPE(Input(1), vec64_t({C_}), ParamType);
TENSOR_FILL_WITH_TYPE(Input(2), vec64_t({C_}), ParamType);
TENSOR_FILL_WITH_TYPE(Input(3), vec64_t({C_}), ParamType);
TENSOR_FILL_WITH_TYPE(Input(4), vec64_t({C_}), ParamType);
using ParamT = typename CuDNNType<T>::BNParamType;
TENSOR_FILL_WITH_TYPE(Input(1), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(2), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(3), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(4), vec64_t({C_}), ParamT);
// Determine the descriptors
if (Input(0).ndim() == 2) {
......@@ -39,14 +39,14 @@ void CuDNNBatchNormOp<Context>::DoRunWithType() {
input_desc_,
Output(0)->template mutable_data<T, Context>(), // y
bn_desc_,
Input(1).template data<ParamType, Context>(), // gamma
Input(2).template data<ParamType, Context>(), // beta
is_recomputing_ > 0 ? 0.f : 1.f - this->momentum_,
Input(3).template mutable_data<ParamType, Context>(), // rm
Input(4).template mutable_data<ParamType, Context>(), // rv
Input(1).template data<ParamT, Context>(), // gamma
Input(2).template data<ParamT, Context>(), // beta
is_recomputing_ == 0 ? 1.f - momentum() : 0.f,
Input(3).template mutable_data<ParamT, Context>(), // rm
Input(4).template mutable_data<ParamT, Context>(), // rv
epsilon_,
X_mu->template mutable_data<ParamType, Context>(), // sm
X_rsig->template mutable_data<ParamType, Context>())); // sv
X_mu->template mutable_data<ParamT, Context>(), // sm
X_rsig->template mutable_data<ParamT, Context>())); // sv
} else {
CUDNN_CHECK(cudnnBatchNormalizationForwardInference(
ctx()->cudnn_handle(),
......@@ -58,10 +58,10 @@ void CuDNNBatchNormOp<Context>::DoRunWithType() {
input_desc_,
Output(0)->template mutable_data<T, Context>(), // y
bn_desc_,
Input(1).template data<ParamType, Context>(), // gamma
Input(2).template data<ParamType, Context>(), // beta
Input(3).template data<ParamType, Context>(), // rm
Input(4).template data<ParamType, Context>(), // rv
Input(1).template data<ParamT, Context>(), // gamma
Input(2).template data<ParamT, Context>(), // beta
Input(3).template data<ParamT, Context>(), // rm
Input(4).template data<ParamT, Context>(), // rv
epsilon_));
}
}
......@@ -82,7 +82,7 @@ void CuDNNBatchNormOp<Context>::RunOnDevice() {
template <class Context>
template <typename T>
void CuDNNBatchNormGradientOp<Context>::TrainingImpl() {
using ParamType = typename CuDNNType<T>::BNParamType;
using ParamT = typename CuDNNType<T>::BNParamType;
auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
auto *X_mu = Buffer("X_mu"), *X_rsig = Buffer("X_rsig");
......@@ -111,12 +111,12 @@ void CuDNNBatchNormGradientOp<Context>::TrainingImpl() {
input_desc_,
Output(0)->template mutable_data<T, Context>(), // dx
bn_desc_,
Input(1).template data<ParamType, Context>(), // gamma
dW->Reshape({C_})->template mutable_data<ParamType, Context>(), // dw
dB->Reshape({C_})->template mutable_data<ParamType, Context>(), // db
Input(1).template data<ParamT, Context>(), // gamma
dW->Reshape({C_})->template mutable_data<ParamT, Context>(), // dw
dB->Reshape({C_})->template mutable_data<ParamT, Context>(), // db
epsilon_,
X_mu->template data<ParamType, Context>(), // mu
X_rsig->template data<ParamType, Context>())); // rsig
X_mu->template data<ParamT, Context>(), // mu
X_rsig->template data<ParamT, Context>())); // rsig
}
template <class Context>
......
......@@ -8,9 +8,9 @@ namespace dragon {
template <class Context>
template <typename T>
void GroupNormOp<Context>::DoRunWithType() {
using ParamType = typename math::utils::AccmulatorType<T>::type;
TENSOR_FILL_WITH_TYPE(Input(1), vec64_t({C_}), ParamType);
TENSOR_FILL_WITH_TYPE(Input(2), vec64_t({C_}), ParamType);
using ParamT = typename math::utils::AccmulatorType<T>::type;
TENSOR_FILL_WITH_TYPE(Input(1), vec64_t({C_}), ParamT);
TENSOR_FILL_WITH_TYPE(Input(2), vec64_t({C_}), ParamT);
auto* X_mu = Buffer("X_mu")->Reshape({N_, G_});
auto* X_rsig = Buffer("X_rsig")->Reshape({N_, G_});
......@@ -18,8 +18,8 @@ void GroupNormOp<Context>::DoRunWithType() {
auto* X_bias = Buffer("X_bias")->Reshape({N_, C_});
auto* x = Input(0).template data<T, Context>();
auto* mu = X_mu->template mutable_data<ParamType, Context>();
auto* rsig = X_rsig->template mutable_data<ParamType, Context>();
auto* mu = X_mu->template mutable_data<ParamT, Context>();
auto* rsig = X_rsig->template mutable_data<ParamT, Context>();
// Compute the moments
if (data_format() == "NCHW") {
......@@ -45,10 +45,10 @@ void GroupNormOp<Context>::DoRunWithType() {
x,
mu,
rsig,
Input(1).template data<ParamType, Context>(), // gamma
Input(2).template data<ParamType, Context>(), // beta
X_scale->template mutable_data<ParamType, Context>(),
X_bias->template mutable_data<ParamType, Context>(),
Input(1).template data<ParamT, Context>(), // gamma
Input(2).template data<ParamT, Context>(), // beta
X_scale->template mutable_data<ParamT, Context>(),
X_bias->template mutable_data<ParamT, Context>(),
Output(0)->template mutable_data<T, Context>(),
ctx());
}
......@@ -63,7 +63,7 @@ void GroupNormOp<Context>::RunOnDevice() {
template <class Context>
template <typename T>
void GroupNormGradientOp<Context>::DoRunWithType() {
using ParamType = typename math::utils::AccmulatorType<T>::type;
using ParamT = typename math::utils::AccmulatorType<T>::type;
auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
auto *X_mu = Buffer("X_mu"), *X_rsig = Buffer("X_rsig");
......@@ -78,14 +78,14 @@ void GroupNormGradientOp<Context>::DoRunWithType() {
S_,
data_format(),
Input(0).template data<T, Context>(), // x
X_mu->template data<ParamType, Context>(),
X_rsig->template data<ParamType, Context>(),
Input(1).template data<ParamType, Context>(), // gamma
X_mu->template data<ParamT, Context>(),
X_rsig->template data<ParamT, Context>(),
Input(1).template data<ParamT, Context>(), // gamma
Input(2).template data<T, Context>(), // dy
X_scale->template mutable_data<ParamType, Context>(),
X_bias->template mutable_data<ParamType, Context>(),
dW->Reshape({C_})->template mutable_data<ParamType, Context>(),
dB->Reshape({C_})->template mutable_data<ParamType, Context>(),
X_scale->template mutable_data<ParamT, Context>(),
X_bias->template mutable_data<ParamT, Context>(),
dW->Reshape({C_})->template mutable_data<ParamT, Context>(),
dB->Reshape({C_})->template mutable_data<ParamT, Context>(),
dX->template mutable_data<T, Context>(),
ctx());
}
......
......@@ -58,7 +58,7 @@ def dropout(inputs, ratio=0.5, **kwargs):
if context.executing_eagerly():
return op_lib \
.instantiate() \
.apply([inputs], ratio, inplace=inplace)
.apply([inputs], args['ratio'], inplace=inplace)
else:
return op_lib.blend(**args)
......@@ -103,7 +103,7 @@ def drop_block2d(inputs, ratio=0.1, block_size=7, data_format='NCHW', **kwargs):
.instantiate(
block_size=block_size,
data_format=data_format,
).apply([inputs], ratio, inplace=inplace)
).apply([inputs], args['ratio'], inplace=inplace)
else:
return op_lib.blend(**args)
......@@ -137,7 +137,7 @@ def drop_path(inputs, ratio=0.2, **kwargs):
if context.executing_eagerly():
return op_lib \
.instantiate() \
.apply([inputs], ratio, inplace=inplace)
.apply([inputs], args['ratio'], inplace=inplace)
else:
return op_lib.blend(**args)
......
......@@ -205,9 +205,8 @@ def broadcast_to(inputs, shape, **kwargs):
op_lib = array_ops_lib.Expand
if context.executing_eagerly():
return op_lib \
.instantiate(
ndim=len(args['dims']),
).apply([inputs], args['dims'])
.instantiate(ndim=len(args['dims'])) \
.apply([inputs], args['dims'])
else:
return op_lib.blend(**args)
......@@ -1163,6 +1162,7 @@ def pad(inputs, pads, mode='constant', value=0, **kwargs):
return op_lib.blend(**args)
@ArgHelper.desc('limit', as_target=True)
def permutation(limit, dtype='int64', **kwargs):
r"""Return a tensor with value in the permuted range.
......@@ -1174,7 +1174,7 @@ def permutation(limit, dtype='int64', **kwargs):
Parameters
----------
limit: number
limit: Union[number, dragon.Tensor]
The end of the interval.
dtype : str, optional, default='int64'
The optional data type.
......@@ -1192,7 +1192,7 @@ def permutation(limit, dtype='int64', **kwargs):
if context.executing_eagerly():
return op_lib \
.instantiate(dtype=dtype) \
.apply(limit, trainable=trainable)
.apply(args['limit'], trainable=trainable)
else:
return op_lib.blend(**args)
......
......@@ -49,10 +49,11 @@ def assign(inputs, starts=None, sizes=None, **kwargs):
inputs[1] = ops.scalar_to_tensor(inputs[1], inputs[0].dtype)
op_lib = control_flow_ops_lib.Assign
if context.executing_eagerly():
starts = args['starts'] if starts is not None else [0]
sizes = args['sizes'] if sizes is not None else [-1]
return op_lib \
.instantiate(
ndim=len(starts) if starts is not None else 0,
).apply(inputs, starts, sizes, inplace=inplace)
.instantiate(ndim=len(starts)) \
.apply(inputs, starts, sizes, inplace=inplace)
else:
return op_lib.blend(**args)
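With the eager path now normalizing its defaults up front (``starts=[0]``, ``sizes=[-1]``), a size of -1 conventionally extends to the end of a dimension. A pure-Python analogy of that reading (hypothetical, not Dragon's implementation):

data = list(range(6))
starts, sizes = [0], [-1]
# A size of -1 runs to the end, so starts=[0], sizes=[-1] covers everything.
stop = len(data) if sizes[0] == -1 else starts[0] + sizes[0]
print(data[starts[0]:stop])  # [0, 1, 2, 3, 4, 5]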
......
......@@ -23,6 +23,7 @@ from dragon.core.util import nest
@OpSchema.num_inputs(5)
@ArgHelper.desc('momentum', as_target=False)
def batch_norm(
inputs,
axis=-1,
......@@ -40,7 +41,8 @@ def batch_norm(
The running average of statistics is calculated as:
.. math:: x_{\text{running}} = \text{momentum} * x_{\text{running}} + (1 - \text{momentum}) * x_{\text{stat}}
.. math:: x_{\text{running}} = \text{momentum} * x_{\text{running}} +
(1 - \text{momentum}) * x_{\text{batch}}
Parameters
----------
......@@ -48,8 +50,8 @@ def batch_norm(
The tensor ``x``, ``gamma``, ``beta``, ``mean`` and ``var``.
axis : int, optional, default=-1
The channel axis.
momentum : float, optional, default=0.9
The momentum for running average.
momentum : Union[float, dragon.Tensor], optional
The value to :math:`\text{momentum}`.
epsilon : float, optional, default=1e-5
The value to :math:`\epsilon`.
use_stats : int, optional, default=-1
......@@ -62,16 +64,15 @@ def batch_norm(
"""
args = ArgHelper.parse(locals())
args['momentum'], args['epsilon'] = float(momentum), float(epsilon)
args['epsilon'] = float(epsilon)
op_lib = normalization_ops_lib.BatchNorm
if context.executing_eagerly():
return op_lib \
.instantiate(
axis=axis,
momentum=args['momentum'],
epsilon=args['epsilon'],
use_stats=use_stats,
).apply(inputs)
).apply(inputs, args['momentum'])
else:
return op_lib.blend(**args)
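Since ``momentum`` is now routed through ``ArgHelper.desc``, a ``dragon.Tensor`` can be passed and re-fed between runs without rebuilding the op. A minimal sketch under that reading; the aliases ``dragon.nn.batch_norm``, ``dragon.constant``, ``dragon.ones`` and ``dragon.zeros`` are assumed from the public API:

import dragon

# Schedule the running-average decay as a tensor argument.
x = dragon.constant([[1., 2.], [3., 4.]], dtype='float32')
gamma, beta = dragon.ones((2,)), dragon.zeros((2,))
mean, var = dragon.zeros((2,)), dragon.ones((2,))
momentum = dragon.constant(0.9, dtype='float32')  # captured as 'momentum_desc'

y = dragon.nn.batch_norm(
    [x, gamma, beta, mean, var],
    axis=-1, momentum=momentum, epsilon=1e-5, use_stats=0)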
......@@ -304,6 +305,7 @@ def local_response_norm(
@OpSchema.num_inputs(5)
@ArgHelper.desc('momentum', as_target=False)
def sync_batch_norm(
inputs,
axis=-1,
......@@ -322,7 +324,8 @@ def sync_batch_norm(
The running average of statistics is calculated as:
.. math:: x_{\text{running}} = \text{momentum} * x_{\text{running}} + (1 - \text{momentum}) * x_{\text{stat}}
.. math:: x_{\text{running}} = \text{momentum} * x_{\text{running}} +
(1 - \text{momentum}) * x_{\text{batch}}
Parameters
----------
......@@ -330,8 +333,8 @@ def sync_batch_norm(
The tensor ``x``, ``gamma``, ``beta``, ``mean`` and ``var``.
axis : int, optional, default=-1
The channel axis.
momentum : float, optional, default=0.9
The momentum for average.
momentum : Union[float, dragon.Tensor], optional
The value to :math:`\text{momentum}`.
epsilon : float, optional, default=1e-5
The value to :math:`\epsilon`.
use_stats : int, optional, default=-1
......@@ -346,7 +349,7 @@ def sync_batch_norm(
"""
args = ArgHelper.parse(locals())
args['momentum'], args['epsilon'] = float(momentum), float(epsilon)
args['epsilon'] = float(epsilon)
if process_group is None:
process_group = distributed.get_group()
if process_group is None:
......@@ -356,11 +359,10 @@ def sync_batch_norm(
return op_lib \
.instantiate(
axis=axis,
momentum=args['momentum'],
epsilon=args['epsilon'],
use_stats=use_stats,
process_group=process_group,
).apply(inputs)
).apply(inputs, args['momentum'])
else:
args.update(process_group.arguments)
return op_lib.blend(**args)
......@@ -23,7 +23,6 @@ class BatchNorm(Operator):
def __init__(self, key, dev, **kwargs):
super(BatchNorm, self).__init__(key, dev, **kwargs)
self.axis = kwargs.get('axis', -1)
self.momentum = kwargs.get('momentum', 0.9)
self.epsilon = kwargs.get('epsilon', 1e-5)
self.use_stats = kwargs.get('use_stats', 0)
if self.use_stats not in (0, 1):
......@@ -34,14 +33,21 @@ class BatchNorm(Operator):
'op_type': 'BatchNorm',
'arguments': {
'axis': self.axis,
'momentum': self.momentum,
'epsilon': self.epsilon,
'use_stats': self.use_stats,
'momentum_desc': '${HANDLE}/momentum',
},
}
def forward(self, inputs):
return self.dispatch(inputs, [self.alloc()])
def setup(self, ws, handle, momentum):
self.feed_arg(ws, '%s/momentum' % handle, momentum, 'float32')
def forward(self, inputs, momentum):
return self.dispatch(
inputs, [self.alloc()],
callback=lambda ws, handle:
self.setup(ws, handle, momentum),
)
class GroupNorm(Operator):
......
......@@ -118,7 +118,8 @@ class ArgHelper(object):
if 'extra_inputs' not in arguments:
arguments['extra_inputs'] = []
arguments['extra_inputs'] += [arg]
arguments.pop(name)
if name in arguments:
arguments.pop(name)
arguments[name + '_desc'] = arg.id
return arguments
......@@ -141,5 +142,6 @@ class ArgHelper(object):
descs.append(ele.id)
else:
descs.append(Tensor.from_value(ele, dtype, 'DescConst').id)
arguments.pop(name)
if name in arguments:
arguments.pop(name)
arguments[name + '_descs'] = descs
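The guarded ``arguments.pop(name)`` lets a descriptor be attached even when the plain argument was already consumed elsewhere. A self-contained sketch of the mechanism (illustrative names, not Dragon's internals):

class FakeTensor(object):
    # Stand-in for a symbolic tensor with an id.
    def __init__(self, id):
        self.id = id

def describe(arguments, name):
    # Swap a tensor argument for '<name>_desc' plus an extra input.
    arg = arguments.get(name)
    if isinstance(arg, FakeTensor):
        arguments.setdefault('extra_inputs', []).append(arg)
        if name in arguments:  # the guarded pop, as in the patch above
            arguments.pop(name)
        arguments[name + '_desc'] = arg.id
    return arguments

print(describe({'axis': -1, 'momentum': FakeTensor('t1')}, 'momentum'))
# -> {'axis': -1, 'extra_inputs': [...], 'momentum_desc': 't1'}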
......@@ -176,13 +176,16 @@ def conv2d_transpose(
raise ValueError('Unsupported padding algorithm: %s' % padding)
if data_format not in ('NCHW', 'NHWC'):
raise ValueError('Unsupported data format: %s' % data_format)
if 'SAME' in padding and output_shape is None:
raise ValueError('Expected <output_shape> for same padding.')
if output_shape is not None and 'SAME' not in padding:
args['padding'] = 'SAME'
for key in ('kernel_shape', 'strides', 'pads', 'dilations'):
if key == 'pads':
args[key] = _normalize_pads(args[key], 2)
else:
args[key] = _normalize_tuple(args[key], 2)
if key in args and args[key] is not None:
if key == 'pads':
args[key] = _normalize_pads(args[key], 2)
else:
args[key] = _normalize_tuple(args[key], 2)
op_lib = vision_ops_lib.ConvTranspose2d
if context.executing_eagerly():
weight_shape = inputs[1].shape
......
......@@ -26,7 +26,7 @@ def dropout_exporter(op_def, context):
drop_ratio = arg.f
elif arg.name == 'prob_desc':
drop_ratio = helper.fetch_argument(op_def, arg, context.ws)
helper.add_attribute(node, 'ratio', drop_ratio)
helper.add_attribute(node, 'ratio', float(drop_ratio))
return node, const_tensors
......
......@@ -26,6 +26,9 @@ def batch_norm_exporter(op_def, context):
helper.add_attribute(node, 'epsilon', arg.f)
elif arg.name == 'momentum':
helper.add_attribute(node, 'momentum', arg.f)
elif arg.name == 'momentum_desc':
momentum = helper.fetch_argument(op_def, arg, context.ws)
helper.add_attribute(node, 'momentum', float(momentum))
# Weight, bias, running mean and running variance
const_tensors = [helper.from_tensor(e, context.ws) for e in op_def.input[1:]]
return node, const_tensors
......
......@@ -123,23 +123,51 @@ CONVERSIONS_DECL float16 To<float16, half>(half val) {
}
template <>
CONVERSIONS_DECL half To<half, float>(float val) {
return __float2half(val);
CONVERSIONS_DECL half To<half, float16>(float16 val) {
return __half_raw{val.x};
}
template <>
CONVERSIONS_DECL half To<half, float16>(float16 val) {
return __half_raw{val.x};
CONVERSIONS_DECL half2 To<half2, float16>(float16 val) {
return half2(__half2_raw{val.x, val.x});
}
template <>
CONVERSIONS_DECL half2 To<half2, float>(float val) {
return __float2half2_rn(val);
CONVERSIONS_DECL half To<half, float>(float val) {
#if CUDA_VERSION_MIN(9, 2, 0)
return __float2half(val);
#else
#if defined(__CUDA_ARCH__)
#define __HALF_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
__half ret;
asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(ret)) : "f"(val));
return ret;
#undef __HALF_TO_US
#else
return To<half>(To<float16>(val));
#endif
#endif
}
template <>
CONVERSIONS_DECL half2 To<half2, float16>(float16 val) {
return half2(__half2_raw{val.x, val.x});
CONVERSIONS_DECL half2 To<half2, float>(float val) {
#if CUDA_VERSION_MIN(9, 2, 0)
return __float2half2_rn(val);
#else
#if defined(__CUDA_ARCH__)
#define __HALF2_TO_UI(var) *(reinterpret_cast<unsigned int*>(&(var)))
__half2 ret;
asm("{.reg .f16 low;\n"
" cvt.rn.f16.f32 low, %1;\n"
" mov.b32 %0, {low,low};}\n"
: "=r"(__HALF2_TO_UI(ret))
: "f"(val));
return ret;
#undef __HALF2_TO_UI
#else
return To<half2>(To<float16>(val));
#endif
#endif
}
#endif // USE_CUDA
......
......@@ -162,23 +162,17 @@ __global__ void _InvStd(const int n, const T eps, const T* x, T* y) {
}
}
template <>
__global__ void
_InvStd<half>(const int n, const half eps, const half* x, half* y) {
__global__ void _InvStd(const int n, const float eps, const half* x, half* y) {
CUDA_1D_KERNEL_LOOP(i, n) {
#if __CUDA_ARCH__ >= 530
y[i] = hrsqrt(__hadd(x[i], eps));
#endif
y[i] = __float2half(rsqrt(__half2float(x[i]) + eps));
}
}
template <>
__global__ void
_InvStd<half2>(const int n, const half2 eps, const half2* x, half2* y) {
_InvStd(const int n, const float eps, const half2* x, half2* y) {
CUDA_1D_KERNEL_LOOP(i, n) {
#if __CUDA_ARCH__ >= 530
y[i] = h2rsqrt(__hadd2(x[i], eps));
#endif
const float2 val = __half22float2(x[i]);
y[i] = __floats2half2_rn(rsqrt(val.x + eps), rsqrt(val.y + eps));
}
}
......@@ -206,19 +200,15 @@ __global__ void _Powx(const int n, const T exponent, const T* x, T* y) {
__global__ void
_Powx(const int n, const float exponent, const half* x, half* y) {
CUDA_1D_KERNEL_LOOP(i, n) {
#if __CUDA_ARCH__ >= 530
y[i] = __float2half(pow(__half2float(x[i]), exponent));
#endif
}
}
__global__ void
_Powx(const int n, const float exponent, const half2* x, half2* y) {
CUDA_1D_KERNEL_LOOP(i, n) {
#if __CUDA_ARCH__ >= 530
const float2 val = __half22float2(x[i]);
y[i] = __floats2half2_rn(pow(val.x, exponent), pow(val.y, exponent));
#endif
}
}
......@@ -269,20 +259,16 @@ __global__ void _Square(const int n, const T* x, T* y) {
template <typename T>
__global__ void _NotZero(const int nthreads, const T* x, bool* y) {
const T kZero = T(0);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = x[i] != kZero ? true : false;
y[i] = x[i] != T(0) ? true : false;
}
}
template <>
__global__ void _NotZero<half>(const int nthreads, const half* x, bool* y) {
#if __CUDA_ARCH__ >= 530
const half kZero = __float2half(0.f);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
y[i] = __hne(x[i], kZero) ? true : false;
y[i] = __half2float(x[i]) != 0.f ? true : false;
}
#endif
}
template <typename T>
......@@ -560,15 +546,12 @@ DRAGON_API void InvStd<float16, CUDAContext>(
if ((n & 1) == 0) {
_InvStd<<<CUDA_BLOCKS(n >> 1), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
n >> 1,
convert::To<half2>(eps),
eps,
reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
} else {
_InvStd<<<CUDA_BLOCKS(n), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
n,
convert::To<half>(eps),
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
n, eps, reinterpret_cast<const half*>(x), reinterpret_cast<half*>(y));
}
}
......
......@@ -26,7 +26,7 @@ namespace math {
template <typename T>
struct MaxFunctor {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs < rhs ? rhs : lhs;
}
......@@ -39,7 +39,7 @@ struct MaxFunctor {
template <>
struct MaxFunctor<float16> {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530
......@@ -62,7 +62,7 @@ struct MaxFunctor<float16> {
#endif
};
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
template <>
struct MaxFunctor<half> {
inline __device__ half operator()(const half& lhs, const half& rhs) const {
......@@ -87,7 +87,7 @@ struct MaxFunctor<half2> {
template <typename T>
struct MinFunctor {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs < rhs ? lhs : rhs;
}
......@@ -100,7 +100,7 @@ struct MinFunctor {
template <>
struct MinFunctor<float16> {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530
......@@ -123,7 +123,7 @@ struct MinFunctor<float16> {
#endif
};
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
template <>
struct MinFunctor<half> {
inline __device__ half operator()(const half& lhs, const half& rhs) const {
......@@ -148,7 +148,7 @@ struct MinFunctor<half2> {
template <typename T>
struct PlusFunctor {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs + rhs;
}
......@@ -161,7 +161,7 @@ struct PlusFunctor {
template <>
struct PlusFunctor<float16> {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530
......@@ -183,7 +183,7 @@ struct PlusFunctor<float16> {
#endif
};
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
template <>
struct PlusFunctor<half> {
inline __device__ half operator()(const half& lhs, const half& rhs) const {
......@@ -211,7 +211,7 @@ struct PlusFunctor<half2> {
template <typename T>
struct MinusFunctor {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs - rhs;
}
......@@ -224,7 +224,7 @@ struct MinusFunctor {
template <>
struct MinusFunctor<float16> {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530
......@@ -246,7 +246,7 @@ struct MinusFunctor<float16> {
#endif
};
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
template <>
struct MinusFunctor<half> {
inline __device__ half operator()(const half& lhs, const half& rhs) const {
......@@ -274,7 +274,7 @@ struct MinusFunctor<half2> {
template <typename T>
struct MultipliesFunctor {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs * rhs;
}
......@@ -287,7 +287,7 @@ struct MultipliesFunctor {
template <>
struct MultipliesFunctor<float16> {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530
......@@ -309,7 +309,7 @@ struct MultipliesFunctor<float16> {
#endif
};
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
template <>
struct MultipliesFunctor<half> {
inline __device__ half operator()(const half& lhs, const half& rhs) const {
......@@ -337,7 +337,7 @@ struct MultipliesFunctor<half2> {
template <typename T>
struct DividesFunctor {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs / rhs;
}
......@@ -350,7 +350,7 @@ struct DividesFunctor {
template <>
struct DividesFunctor<float16> {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530
......@@ -372,7 +372,7 @@ struct DividesFunctor<float16> {
#endif
};
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
template <>
struct DividesFunctor<half> {
inline __device__ half operator()(const half& lhs, const half& rhs) const {
......@@ -396,7 +396,7 @@ struct DividesFunctor<half2> {
template <typename T>
struct PowFunctor {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return pow(lhs, rhs);
}
......@@ -409,7 +409,7 @@ struct PowFunctor {
template <>
struct PowFunctor<float16> {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const {
half ret = __float2half(
......@@ -425,7 +425,7 @@ struct PowFunctor<float16> {
#endif
};
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
template <>
struct PowFunctor<half> {
inline __device__ half operator()(const half& lhs, const half& rhs) const {
......@@ -449,7 +449,7 @@ struct PowFunctor<half2> {
template <typename T>
struct EqualFunctor {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs == rhs;
}
......@@ -462,7 +462,7 @@ struct EqualFunctor {
template <>
struct EqualFunctor<float16> {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
#if __CUDA_ARCH__ >= 530
......@@ -481,7 +481,7 @@ struct EqualFunctor<float16> {
#endif
};
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
template <>
struct EqualFunctor<half> {
inline __device__ bool operator()(const half& lhs, const half& rhs) const {
......@@ -496,7 +496,7 @@ struct EqualFunctor<half> {
template <typename T>
struct NotEqualFunctor {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs != rhs;
}
......@@ -509,7 +509,7 @@ struct NotEqualFunctor {
template <>
struct NotEqualFunctor<float16> {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
#if __CUDA_ARCH__ >= 530
......@@ -528,7 +528,7 @@ struct NotEqualFunctor<float16> {
#endif
};
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
template <>
struct NotEqualFunctor<half> {
inline __device__ bool operator()(const half& lhs, const half& rhs) const {
......@@ -543,7 +543,7 @@ struct NotEqualFunctor<half> {
template <typename T>
struct GreaterFunctor {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs > rhs;
}
......@@ -556,7 +556,7 @@ struct GreaterFunctor {
template <>
struct GreaterFunctor<float16> {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
#if __CUDA_ARCH__ >= 530
......@@ -575,7 +575,7 @@ struct GreaterFunctor<float16> {
#endif
};
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
template <>
struct GreaterFunctor<half> {
inline __device__ bool operator()(const half& lhs, const half& rhs) const {
......@@ -590,7 +590,7 @@ struct GreaterFunctor<half> {
template <typename T>
struct LessFunctor {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs < rhs;
}
......@@ -603,7 +603,7 @@ struct LessFunctor {
template <>
struct LessFunctor<float16> {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
#if __CUDA_ARCH__ >= 530
......@@ -622,7 +622,7 @@ struct LessFunctor<float16> {
#endif
};
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
template <>
struct LessFunctor<half> {
inline __device__ bool operator()(const half& lhs, const half& rhs) const {
......@@ -637,7 +637,7 @@ struct LessFunctor<half> {
template <typename T>
struct GreaterEqualFunctor {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs >= rhs;
}
......@@ -650,7 +650,7 @@ struct GreaterEqualFunctor {
template <>
struct GreaterEqualFunctor<float16> {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
#if __CUDA_ARCH__ >= 530
......@@ -669,7 +669,7 @@ struct GreaterEqualFunctor<float16> {
#endif
};
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
template <>
struct GreaterEqualFunctor<half> {
inline __device__ bool operator()(const half& lhs, const half& rhs) const {
......@@ -684,7 +684,7 @@ struct GreaterEqualFunctor<half> {
template <typename T>
struct LessEqualFunctor {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs <= rhs;
}
......@@ -697,7 +697,7 @@ struct LessEqualFunctor {
template <>
struct LessEqualFunctor<float16> {
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
#if __CUDA_ARCH__ >= 530
......@@ -716,7 +716,7 @@ struct LessEqualFunctor<float16> {
#endif
};
#if defined(__CUDACC__)
#if defined(__CUDA_ARCH__)
template <>
struct LessEqualFunctor<half> {
inline __device__ bool operator()(const half& lhs, const half& rhs) const {
......
......@@ -239,8 +239,8 @@ void ReduceSum<float16, CUDAContext>(
num_axes, \
axes, \
Reducer<AccT>(), \
AccT(kInit), \
AccT(scale), \
convert::To<AccT>(kInit), \
convert::To<AccT>(scale), \
x, \
y, \
ctx); \
......
......@@ -301,16 +301,16 @@ void ChannelAffine(
/* array.channel_normalize */
template <typename Tx, typename Ty, class Context>
template <typename InputT, typename OutputT, class Context>
void ChannelNormalize(
const int axis,
const int num_dims,
const int64_t* x_strides,
const int64_t* y_dims,
const Tx* x,
const InputT* x,
const float* mean,
const float* std,
Ty* y,
OutputT* y,
Context* ctx);
/* array.channel_shuffle */
......@@ -648,28 +648,28 @@ void BroadcastLossGrad(
/* loss.nll_loss */
template <typename LogitType, typename TargetType, class Context>
template <typename LogitT, typename TargetT, class Context>
void NLLLoss(
const int outer_dim,
const int inner_dim,
const int axis_dim,
const int ignore_index,
const LogitType* logit,
const TargetType* target,
LogitType* loss,
LogitType* mask,
const LogitT* logit,
const TargetT* target,
LogitT* loss,
LogitT* mask,
Context* ctx);
template <typename LogitType, typename TargetType, class Context>
template <typename LogitT, typename TargetT, class Context>
void NLLLossGrad(
const int outer_dim,
const int inner_dim,
const int axis_dim,
const int ignore_index,
const LogitType* logit,
const TargetType* target,
LogitType* dlogit,
LogitType* mask,
const LogitT* logit,
const TargetT* target,
LogitT* dlogit,
LogitT* mask,
Context* ctx);
/* loss.sigmoid_ce_loss */
......@@ -694,7 +694,7 @@ void SigmoidCrossEntropyGrad(
/* loss.sigmoid_focal_loss */
template <typename LogitType, typename TargetType, class Context>
template <typename LogitT, typename TargetT, class Context>
void SigmoidFocalLoss(
const int outer_dim,
const int inner_dim,
......@@ -703,13 +703,13 @@ void SigmoidFocalLoss(
const float neg_alpha,
const float gamma,
const int negative_index,
const LogitType* logit,
const TargetType* target,
LogitType* loss,
LogitType* mask,
const LogitT* logit,
const TargetT* target,
LogitT* loss,
LogitT* mask,
Context* ctx);
template <typename LogitType, typename TargetType, class Context>
template <typename LogitT, typename TargetT, class Context>
void SigmoidFocalLossGrad(
const int outer_dim,
const int inner_dim,
......@@ -718,10 +718,10 @@ void SigmoidFocalLossGrad(
const float neg_alpha,
const float gamma,
const int negative_index,
const LogitType* logit,
const TargetType* target,
LogitType* dlogit,
LogitType* mask,
const LogitT* logit,
const TargetT* target,
LogitT* dlogit,
LogitT* mask,
Context* ctx);
/* loss.smooth_l1_loss */
......@@ -754,28 +754,28 @@ void SoftmaxCrossEntropy(
/* loss.sparse_softmax_cross_entropy */
template <typename LogitType, typename TargetType, class Context>
template <typename LogitT, typename TargetT, class Context>
void SparseSoftmaxCrossEntropy(
const int outer_dim,
const int inner_dim,
const int axis_dim,
const int ignore_index,
const LogitType* prob,
const TargetType* target,
LogitType* loss,
LogitType* mask,
const LogitT* prob,
const TargetT* target,
LogitT* loss,
LogitT* mask,
Context* ctx);
template <typename LogitType, typename TargetType, class Context>
template <typename LogitT, typename TargetT, class Context>
void SparseSoftmaxCrossEntropyGrad(
const int outer_dim,
const int inner_dim,
const int axis_dim,
const int ignore_index,
const LogitType* prob,
const TargetType* target,
LogitType* dx,
LogitType* mask,
const LogitT* prob,
const TargetT* target,
LogitT* dx,
LogitT* mask,
Context* ctx);
/* math.abs */
......
......@@ -55,7 +55,7 @@ class BatchNormalization(Layer):
axis : int, optional, default=-1
The channel axis.
momentum : float, optional, default=0.99
The momentum of moving average.
The decay factor of the running average.
epsilon : float, optional, default=1e-3
The epsilon value.
center : bool, optional, default=True
......
......@@ -41,8 +41,8 @@ def batch_normalization(
The moving average of stats is calculated as:
.. math::
x_{moving} \leftarrow momentum * x_{moving} + (1 - momentum) * x_{stat}
.. math:: x_{\text{running}} = \text{momentum} * x_{\text{running}} +
(1 - \text{momentum}) * x_{\text{batch}}
Parameters
----------
......@@ -58,10 +58,10 @@ def batch_normalization(
The :math:`\gamma` tensor.
axis : int, optional, default=-1
The channel axis.
momentum : float, optional, default=0.9
The momentum of moving average.
momentum : Union[float, dragon.Tensor], optional
The value to :math:`\text{momentum}`.
variance_epsilon : float, optional, default=1e-5
The value of epsilon.
The value to :math:`\epsilon`.
trainable : bool, optional, default=False
The optional training flag.
name : str, optional
......
......@@ -50,7 +50,7 @@ class BatchNorm(layer.Layer):
Parameters
----------
decay : float, optional, default=0.9
The decay factor for moving average.
The decay factor of the running average.
epsilon : float, optional, default=1e-5
The epsilon.
act : callable, optional
......
......@@ -8,7 +8,6 @@
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Code generator for Runtime API."""
from __future__ import absolute_import
......
......@@ -89,7 +89,8 @@ def batch_norm(
The moving average of stats is calculated as:
.. math:: x_{moving} \leftarrow (1 - momentum) * x_{moving} + momentum * x_{stat}
.. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} +
\text{momentum} * x_{\text{batch}}
Parameters
----------
......@@ -124,9 +125,9 @@ def batch_norm(
.instantiate(
input.device,
training=training,
momentum=momentum,
epsilon=eps,
).apply(input, running_mean, running_var, weight, bias)
).apply(input, running_mean, running_var,
weight, bias, momentum)
def binary_cross_entropy_with_logits(
......@@ -1598,7 +1599,7 @@ def sync_batch_norm(
The moving average of stats is calculated as:
.. math::
x_{moving} \leftarrow (1 - momentum) * x_{moving} + momentum * x_{stat}
x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} + \text{momentum} * x_{\text{batch}}
Additionally, you can specify ``process_group`` to perform synchronization.
......
......@@ -111,24 +111,31 @@ class BatchNorm(function.Function):
def __init__(self, key, dev, **kwargs):
super(BatchNorm, self).__init__(key, dev, **kwargs)
self.momentum = kwargs.get('momentum', 0.1)
self.epsilon = kwargs.get('epsilon', 1e-5)
self.training = kwargs.get('training', False)
self.track_stats = kwargs.get('track_stats', True)
def setup(self, ws, handle, momentum):
self.feed_arg(ws, '{}/momentum'.format(handle), 1.0 - momentum, 'float32')
def attributes(self):
return {
'op_type': 'BatchNorm',
'arguments': {
'axis': 1,
'momentum': 1. - self.momentum,
'epsilon': self.epsilon,
'use_stats': int(not self.training),
'momentum_desc': '${HANDLE}/momentum',
}
}
def forward(self, input, running_mean, running_var, weight, bias):
def forward(self, input, running_mean, running_var, weight, bias, momentum):
inputs = [input, weight, bias, running_mean, running_var]
return self.dispatch(inputs, [self.alloc()])
return self.dispatch(
inputs, [self.alloc()],
callback=lambda ws, handle:
self.setup(ws, handle, momentum),
)
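Note the sign flip in ``setup``: torch-style ``momentum`` weights the incoming batch statistic, while the backend argument weights the running value, hence ``1.0 - momentum`` is fed per run. A plain-Python check of the equivalence (illustrative values):

torch_momentum = 0.1
backend_momentum = 1.0 - torch_momentum  # what setup() feeds

x_running, x_batch = 5.0, 1.0
torch_style = (1 - torch_momentum) * x_running + torch_momentum * x_batch
backend_style = backend_momentum * x_running + (1 - backend_momentum) * x_batch
assert abs(torch_style - backend_style) < 1e-12  # both are 4.6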
class Conv2d(_ConvNd):
......
......@@ -25,6 +25,8 @@ from dragon.vm.torch.core.tensor import Tensor
class _BatchNorm(Module):
"""BatchNorm base module."""
def __init__(
self,
num_features,
......@@ -45,20 +47,26 @@ class _BatchNorm(Module):
else:
self.register_buffer('weight', init_funcs.ones(num_features))
self.register_buffer('bias', init_funcs.zeros(num_features))
if self.track_running_stats:
self.num_batches_tracked = 0
else:
self.num_batches_tracked = None
self.register_buffer('running_mean', init_funcs.zeros(num_features))
self.register_buffer('running_var', init_funcs.ones(num_features))
self.inputs = [self.running_mean, self.running_var, self.weight, self.bias]
self.reset_parameters()
def reset_parameters(self):
if self.affine:
self.weight.data.one_()
self.bias.data.zero_()
def reset_running_stats(self):
if self.track_running_stats:
self.running_mean.zero_()
self.running_var.fill_(1)
self.num_batches_tracked = 0
def reset_parameters(self):
self.reset_running_stats()
if self.affine:
self.weight.data.one_()
self.bias.data.zero_()
def extra_repr(self):
return '{num_features}, ' \
......@@ -72,7 +80,7 @@ class _BatchNorm(Module):
return F.batch_norm(
input, *self.inputs,
training=self.training,
momentum=self.momentum,
momentum=self._get_momentum(),
eps=self.eps
)
......@@ -82,6 +90,19 @@ class _BatchNorm(Module):
return self # Float32 parameters are required.
return super(_BatchNorm, self)._apply(fn)
def _get_momentum(self):
"""Return the current momentum value."""
momentum = 0.0 if self.momentum is None else self.momentum
if self.track_running_stats:
if self.training:
if self.num_batches_tracked is not None:
self.num_batches_tracked += 1
if self.momentum is None:
momentum = 1.0 / float(self.num_batches_tracked)
else:
momentum = 0.0
return momentum
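With ``momentum=None`` the module falls back to a cumulative moving average: the effective momentum at step ``n`` is ``1/n``, so the running statistic becomes a plain mean of every batch statistic seen so far. A short trace (plain Python, illustrative values):

running, batches = 0.0, [2.0, 4.0, 6.0]
for n, stat in enumerate(batches, start=1):
    momentum = 1.0 / n  # what _get_momentum() yields when momentum is None
    running = (1 - momentum) * running + momentum * stat
print(running)  # 4.0 == mean(batches)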
class BatchNorm1d(_BatchNorm):
r"""Apply the batch normalization over 2d input.
......@@ -93,7 +114,8 @@ class BatchNorm1d(_BatchNorm):
The running average of statistics is calculated as:
.. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} + \text{momentum} * x_{\text{stat}}
.. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} +
\text{momentum} * x_{\text{batch}}
See Also
--------
......@@ -109,16 +131,16 @@ class BatchNorm1d(_BatchNorm):
affine=True,
track_running_stats=True,
):
"""Create a ``BatchNorm1d`` module.
r"""Create a ``BatchNorm1d`` module.
Parameters
----------
num_features : int
The number of channels.
eps : float, optional, default=1e-5
The epsilon value.
The value to :math:`\epsilon`.
momentum : float, optional, default=0.1
The momentum of moving average.
The value to :math:`\text{momentum}`.
affine : bool, optional, default=True
**True** to apply an affine transformation.
track_running_stats : bool, optional, default=True
......@@ -142,7 +164,8 @@ class BatchNorm2d(_BatchNorm):
The running average of statistics is calculated as:
.. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} + \text{momentum} * x_{\text{stat}}
.. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} +
\text{momentum} * x_{\text{batch}}
See Also
--------
......@@ -158,16 +181,16 @@ class BatchNorm2d(_BatchNorm):
affine=True,
track_running_stats=True,
):
"""Create a ``BatchNorm2d`` module.
r"""Create a ``BatchNorm2d`` module.
Parameters
----------
num_features : int
The number of channels.
eps : float, optional, default=1e-5
The epsilon value.
The value to :math:`\epsilon`.
momentum : float, optional, default=0.1
The momentum of moving average.
The value to :math:`\text{momentum}`.
affine : bool, optional, default=True
**True** to apply an affine transformation.
track_running_stats : bool, optional, default=True
......@@ -191,7 +214,8 @@ class BatchNorm3d(_BatchNorm):
The running average of statistics is calculated as:
.. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} + \text{momentum} * x_{\text{stat}}
.. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} +
\text{momentum} * x_{\text{batch}}
See Also
--------
......@@ -207,16 +231,16 @@ class BatchNorm3d(_BatchNorm):
affine=True,
track_running_stats=True,
):
"""Create a ``BatchNorm3d`` module.
r"""Create a ``BatchNorm3d`` module.
Parameters
----------
num_features : int
The number of channels.
eps : float, optional, default=1e-5
The epsilon value.
The value to :math:`\epsilon`.
momentum : float, optional, default=0.1
The momentum of moving average.
The value to :math:`\text{momentum}`.
affine : bool, optional, default=True
**True** to apply an affine transformation.
track_running_stats : bool, optional, default=True
......@@ -240,7 +264,8 @@ class SyncBatchNorm(_BatchNorm):
The running average of statistics is calculated as:
.. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} + \text{momentum} * x_{\text{stat}}
.. math:: x_{\text{running}} = (1 - \text{momentum}) * x_{\text{running}} +
\text{momentum} * x_{\text{batch}}
Additionally, specify ``process_group`` to perform synchronization.
......@@ -261,16 +286,16 @@ class SyncBatchNorm(_BatchNorm):
track_running_stats=True,
process_group=None,
):
"""Create a ``SyncBatchNorm`` module.
r"""Create a ``SyncBatchNorm`` module.
Parameters
----------
num_features : int
The number of channels.
eps : float, optional, default=1e-5
The epsilon value.
The value to :math:`\epsilon`.
momentum : float, optional, default=0.1
The momentum of moving average.
The value to :math:`\text{momentum}`.
affine : bool, optional, default=True
**True** to apply an affine transformation.
track_running_stats : bool, optional, default=True
......@@ -292,7 +317,7 @@ class SyncBatchNorm(_BatchNorm):
return F.sync_batch_norm(
input, *self.inputs,
training=self.training,
momentum=self.momentum,
momentum=self._get_momentum(),
eps=self.eps,
process_group=self.process_group
)
......@@ -300,6 +325,6 @@ class SyncBatchNorm(_BatchNorm):
return F.batch_norm(
input, *self.inputs,
training=self.training,
momentum=self.momentum,
momentum=self._get_momentum(),
eps=self.eps
)
......@@ -61,7 +61,7 @@ class AffineChannel(Module):
fix_bias=False,
inplace=False,
):
"""Create an ``Affine`` module.
"""Create an ``AffineChannel`` module.
Parameters
----------
......@@ -141,7 +141,7 @@ class GroupNorm(Module):
eps=1e-5,
affine=True,
):
"""Create a ``GroupNorm`` module.
r"""Create a ``GroupNorm`` module.
Parameters
----------
......@@ -150,7 +150,7 @@ class GroupNorm(Module):
num_channels : int
The number of channels.
eps : float, optional, default=1e-5
The epsilon value.
The value to :math:`\epsilon`.
affine : bool, optional, default=True
**True** to apply an affine transformation.
......@@ -228,11 +228,11 @@ class LocalResponseNorm(Module):
size : int, required
The number of neighbouring channels to sum over.
alpha : float, optional, default=0.0001
The scale value :math:`\alpha`.
The value to :math:`\alpha`.
beta : float, optional, default=0.75
The exponent value :math:`\beta`.
The value to :math:`\beta`.
k : float, optional, default=1.
The bias constant :math:`k`.
The value to :math:`k`.
"""
super(LocalResponseNorm, self).__init__()
......