Commit 494774d3 by Ting PAN

Optimize training update operators

Summary:
This commit fuses the weight decay and the mixed-precision conversion
into the update kernels to reduce training latency.
1 parent fb47d86f
Showing with 2315 additions and 2003 deletions
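As a rough sketch of what the fusion means for a single parameter tensor: the weight-decay term and the low-precision weight copy are now applied inside the same update pass instead of by separate operators. The function and argument names below only mirror the new kernel parameters (lr, momentum, wd, x, g, m, y, y_copy); this is an illustrative NumPy sketch, not Dragon's API.

import numpy as np

def momentum_sgd_update(x, g, m, y, lr, momentum, wd, y_copy=None):
    """Illustrative sketch of the fused MomentumSGD update (not the real API)."""
    # Weight decay is folded into the gradient inside the kernel.
    gi = g + wd * x if wd > 0 else g
    # Classic momentum accumulation, then the in-place weight update.
    m[:] = momentum * m + gi
    y -= lr * m
    # The mixed-precision copy is written in the same pass.
    if y_copy is not None:
        y_copy[:] = y.astype(y_copy.dtype)

x = np.random.randn(8).astype(np.float32)    # fp32 master weights
g = np.random.randn(8).astype(np.float32)    # gradient
m = np.zeros_like(x)                         # momentum buffer
y, y_half = x.copy(), x.astype(np.float16)   # updated weights and their fp16 copy
momentum_sgd_update(x, g, m, y, lr=0.1, momentum=0.9, wd=1e-4, y_copy=y_half)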
......@@ -418,9 +418,9 @@ class Normalize(Layer):
def __call__(self, bottom):
if len(self.blobs) == 0:
self.build(bottom)
outputs = [normalization_ops.lp_normalize(bottom, **self.norm_args)]
outputs = [normalization_ops.lp_norm(bottom, **self.norm_args)]
outputs += [blob['data'] for blob in self.blobs]
return array_ops.channel_affine(outputs, **self.scale_args)
return math_ops.affine(outputs, **self.scale_args)
class Permute(Layer):
......@@ -591,8 +591,7 @@ class Scale(Layer):
param = layer_param.scale_param
self.axis = param.axis
self.num_axes = param.num_axes
end_axis = -1 if self.num_axes < 1 else self.axis + self.num_axes - 1
self.call_args = {'axis': self.axis, 'end_axis': end_axis}
self.call_args = {'axis': list(range(self.axis, self.axis + self.num_axes))}
self.filler = caffe_pb2.FillerParameter(type='constant', value=1)
self.filler = param.filler if param.HasField('filler') else self.filler
self.bias_filler = param.bias_filler
......@@ -609,7 +608,7 @@ class Scale(Layer):
if len(self.blobs) == 0:
self.build(bottom)
inputs = [bottom] + [blob['data'] for blob in self.blobs]
return array_ops.channel_affine(inputs, **self.call_args)
return math_ops.affine(inputs, **self.call_args)
class Slice(Layer):
......
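For clarity on the Scale layer change above: the old (axis, end_axis) pair used by channel_affine is replaced by an explicit axis list handed to math_ops.affine. A small illustrative mapping, using Caffe's ScaleParameter defaults (axis=1, num_axes=1):

axis, num_axes = 1, 1

# Old convention: a contiguous [axis, end_axis] range.
end_axis = -1 if num_axes < 1 else axis + num_axes - 1      # -> 1

# New convention: the explicit axes handed to math_ops.affine.
call_args = {'axis': list(range(axis, axis + num_axes))}    # -> {'axis': [1]}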
......@@ -16,8 +16,8 @@ from __future__ import print_function
from dragon.core.framework import workspace
from dragon.core.io.kpl_record import KPLRecordDataset
from dragon.core.ops import array_ops
from dragon.core.ops import framework_ops
from dragon.core.ops import normalization_ops
from dragon.utils import vision
from dragon.vm.caffe.core.layer import Layer
......@@ -121,5 +121,5 @@ class Data(Layer):
data._shape = (self.data_args['batch_size'],
None, None, len(self.norm_args['mean']))
label._shape = (self.data_args['batch_size'], None)
data = array_ops.channel_normalize(data, **self.norm_args)
data = normalization_ops.channel_norm(data, **self.norm_args)
return data, label
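A minimal usage sketch of the renamed normalization op, assuming mean/std keyword arguments as suggested by self.norm_args above (the real norm_args may carry additional keys such as the output dtype):

from dragon.core.ops import normalization_ops

def preprocess(data, mean=(104., 117., 123.), std=(1., 1., 1.)):
    """Per-channel normalization, mirroring the Data layer call (illustrative)."""
    return normalization_ops.channel_norm(data, mean=list(mean), std=list(std))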
......@@ -9,6 +9,7 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
# ---[ Compiler flags
if (MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_ENABLE_EXTENDED_ALIGNED_STORAGE")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}
/wd4003 /wd4114
......
......@@ -36,16 +36,6 @@ dragon
`cast(...) <dragon/cast.html>`_
: Cast the data type of input.
`channel_affine(...) <dragon/channel_affine.html>`_
: Apply affine transformation to each channel of input.
`channel_normalize(...) <dragon/channel_normalize.html>`_
: Apply normalization to each channel of input.
`channel_shuffle(...) <dragon/channel_shuffle.html>`_
: Apply group shuffle to each channel of input.
`[Zhang et.al, 2017] <https://arxiv.org/abs/1707.01083>`_.
`concat(...) <dragon/concat.html>`_
: Concatenate the inputs along the given axis.
......@@ -211,9 +201,6 @@ dragon
dragon/boolean_mask
dragon/broadcast_to
dragon/cast
dragon/channel_affine
dragon/channel_normalize
dragon/channel_shuffle
dragon/concat
dragon/constant
dragon/device
......
......@@ -24,6 +24,9 @@ dragon.cuda
`memory_allocated(...) <cuda/memory_allocated.html>`_
: Return the size of memory used by tensors in current workspace.
`set_cublas_flags(...) <cuda/set_cublas_flags.html>`_
: Set the flags of cuBLAS library.
`set_cudnn_flags(...) <cuda/set_cudnn_flags.html>`_
: Set the flags of cuDNN library.
......@@ -44,6 +47,7 @@ dragon.cuda
cuda/get_device_capability
cuda/is_available
cuda/memory_allocated
cuda/set_cublas_flags
cuda/set_cudnn_flags
cuda/set_default_device
cuda/set_device
......
channel_affine
==============
set_cublas_flags
================
.. autofunction:: dragon.channel_affine
.. autofunction:: dragon.cuda.set_cublas_flags
.. raw:: html
<style>
h1:before {
content: "dragon.";
content: "dragon.cuda.";
color: #103d3e;
}
</style>
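A possible usage sketch for the new setter. The allow_tf32 keyword is an assumption inferred from the cublasSetFlags(allow_tf32) binding added later in this commit, not a documented signature:

import dragon

# Assumed keyword; enables TF32 tensor-core math for cuBLAS GEMMs.
dragon.cuda.set_cublas_flags(allow_tf32=True)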
......@@ -12,12 +12,18 @@ dragon.math
`add(...) <math/add.html>`_
: Compute the element-wise addition.
`affine(...) <math/affine.html>`_
: Apply the affine transformation to input.
`argmax(...) <math/argmax.html>`_
: Compute the index of maximum elements along the given axis.
`argmin(...) <math/argmin.html>`_
: Compute the index of minimum elements along the given axis.
`atan2(...) <math/atan2.html>`_
: Compute the element-wise arc-tangent of two arguments.
`ceil(...) <math/ceil.html>`_
: Compute the smallest integer not less than input.
......@@ -81,9 +87,6 @@ dragon.math
`logical_xor(...) <math/logical_xor.html>`_
: Compute the element-wise XOR logical operation.
`lp_normalize(...) <math/lp_normalize.html>`_
: Apply the lp normalization.
`matmul(...) <math/matmul.html>`_
: Compute the matrix multiplication.
......@@ -158,8 +161,10 @@ dragon.math
math/abs
math/add
math/affine
math/argmax
math/argmin
math/atan2
math/ceil
math/clip
math/cos
......@@ -181,7 +186,6 @@ dragon.math
math/logical_not
math/logical_or
math/logical_xor
math/lp_normalize
math/matmul
math/max
math/maximum
......
lp_normalize
============
affine
======
.. autofunction:: dragon.math.lp_normalize
.. autofunction:: dragon.math.affine
.. raw:: html
......
channel_normalize
=================
atan2
=====
.. autofunction:: dragon.channel_normalize
.. autofunction:: dragon.math.atan2
.. raw:: html
<style>
h1:before {
content: "dragon.";
content: "dragon.math.";
color: #103d3e;
}
</style>
......@@ -28,6 +28,13 @@ dragon.nn
`bias_add(...) <nn/bias_add.html>`_
: Add the bias across channels to input.
`channel_norm(...) <nn/channel_norm.html>`_
: Apply the normalization to each channel of input.
`channel_shuffle(...) <nn/channel_shuffle.html>`_
: Apply the group shuffle to each channel of input.
`[Zhang et al., 2017] <https://arxiv.org/abs/1707.01083>`_.
`conv(...) <nn/conv.html>`_
: Apply the n-dimension convolution.
......@@ -107,6 +114,9 @@ dragon.nn
`log_softmax(...) <nn/log_softmax.html>`_
: Compute the composite of logarithm and softmax.
`lp_norm(...) <nn/lp_norm.html>`_
: Apply the lp normalization.
`moments(...) <nn/moments.html>`_
: Compute the mean and variance of input along the given axis.
......@@ -157,6 +167,8 @@ dragon.nn
nn/RNN
nn/batch_norm
nn/bias_add
nn/channel_norm
nn/channel_shuffle
nn/conv
nn/conv_transpose
nn/conv1d
......@@ -180,6 +192,7 @@ dragon.nn
nn/leaky_relu
nn/local_response_norm
nn/log_softmax
nn/lp_norm
nn/moments
nn/pool
nn/pool1d
......
channel_norm
============
.. autofunction:: dragon.nn.channel_norm
.. raw:: html
<style>
h1:before {
content: "dragon.nn.";
color: #103d3e;
}
</style>
channel_shuffle
===============
.. autofunction:: dragon.channel_shuffle
.. autofunction:: dragon.nn.channel_shuffle
.. raw:: html
<style>
h1:before {
content: "dragon.";
content: "dragon.nn.";
color: #103d3e;
}
</style>
lp_norm
=======
.. autofunction:: dragon.nn.lp_norm
.. raw:: html
<style>
h1:before {
content: "dragon.nn.";
color: #103d3e;
}
</style>
......@@ -21,6 +21,9 @@ vm.tensorflow.math
`argmin(...) <math/argmin.html>`_
: Compute the index of minimum elements along the given axis.
`atan2(...) <math/atan2.html>`_
: Compute the element-wise arc-tangent of two arguments.
`ceil(...) <math/ceil.html>`_
: Compute the smallest integer not less than input.
......@@ -134,6 +137,7 @@ vm.tensorflow.math
math/add_n
math/argmax
math/argmin
math/atan2
math/ceil
math/cos
math/cumsum
......
channel_normalize
=================
atan2
=====
.. autofunction:: dragon.vm.torch.channel_normalize
.. autofunction:: dragon.vm.tensorflow.math.atan2
.. raw:: html
<style>
h1:before {
content: "torch.";
content: "tf.math.";
color: #103d3e;
}
</style>
......@@ -51,6 +51,9 @@ vm.torch
`argsort(...) <torch/argsort.html>`_
: Return the index of sorted elements along the given dimension.
`atan2(...) <torch/atan2.html>`_
: Compute the element-wise arc-tangent of two arguments.
`baddbmm(...) <torch/baddbmm.html>`_
: Add input to the result of batched matrix-matrix multiplication.
......@@ -75,12 +78,6 @@ vm.torch
`ceil(...) <torch/ceil.html>`_
: Compute the smallest integer not less than input.
`channel_affine(...) <torch/channel_affine.html>`_
: Apply affine transformation to each channel of input.
`channel_normalize(...) <torch/channel_normalize.html>`_
: Apply normalization to each channel of input.
`chunk(...) <torch/chunk.html>`_
: Split input into a specific number of chunks.
......@@ -345,6 +342,7 @@ vm.torch
torch/argmax
torch/argmin
torch/argsort
torch/atan2
torch/baddbmm
torch/bitwise_and
torch/bitwise_not
......@@ -353,8 +351,6 @@ vm.torch
torch/bmm
torch/cat
torch/ceil
torch/channel_affine
torch/channel_normalize
torch/chunk
torch/clamp
torch/cos
......
......@@ -73,6 +73,10 @@ argsort
#######
.. automethod:: dragon.vm.torch.Tensor.argsort
atan2
#####
.. automethod:: dragon.vm.torch.Tensor.atan2
backward
########
.. automethod:: dragon.vm.torch.Tensor.backward
......@@ -699,6 +703,7 @@ zero\_
.. _torch.argmax(...): argmax.html
.. _torch.argmin(...): argmin.html
.. _torch.argsort(...): argsort.html
.. _torch.atan2(...): atan2.html
.. _torch.baddbmm(...): baddbmm.html
.. _torch.bitwise_and(...): bitwise_and.html
.. _torch.bitwise_not(...): bitwise_not.html
......
channel_affine
==============
atan2
=====
.. autofunction:: dragon.vm.torch.channel_affine
.. autofunction:: dragon.vm.torch.atan2
.. raw:: html
......
......@@ -6,12 +6,16 @@ vm.torch.backends
Modules
-------
`Module cuda <backends/cuda.html>`_
: The CUDA backend module.
`Module cudnn <backends/cudnn.html>`_
: The cuDNN backend module.
.. toctree::
:hidden:
backends/cuda
backends/cudnn
.. raw:: html
......
cuda
====
Properties
----------
matmul.allow_tf32
#################
.. data:: dragon.vm.torch.backends.cuda.matmul.allow_tf32
:annotation: = False
The flag that controls whether to allow the TF32 math type for matmul.
Functions
---------
is_built
########
.. automethod:: dragon.vm.torch.backends.cuda.is_built
.. raw:: html
<style>
h1:before {
content: "torch.backends.";
color: #103d3e;
}
</style>
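A brief sketch of toggling the new backend flag, assuming it is writable at runtime in the same way as PyTorch's torch.backends.cuda.matmul.allow_tf32:

from dragon.vm import torch

print(torch.backends.cuda.is_built())         # True when compiled with CUDA
torch.backends.cuda.matmul.allow_tf32 = True  # allow TF32 math for matmul (default: False)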
......@@ -24,8 +24,8 @@ vm.torch.nn
`class AdaptiveMaxPool3d <nn/AdaptiveMaxPool3d.html>`_
: Apply the 3d adaptive max pooling.
`class AffineChannel <nn/AffineChannel.html>`_
: Apply affine transformation along the channels.
`class Affine <nn/Affine.html>`_
: Apply the affine transformation.
`class AvgPool1d <nn/AvgPool1d.html>`_
: Apply the 1d average pooling.
......@@ -312,7 +312,7 @@ vm.torch.nn
nn/AdaptiveMaxPool1d
nn/AdaptiveMaxPool2d
nn/AdaptiveMaxPool3d
nn/AffineChannel
nn/Affine
nn/AvgPool1d
nn/AvgPool2d
nn/AvgPool3d
......
AffineChannel
=============
Affine
======
.. autoclass:: dragon.vm.torch.nn.AffineChannel
.. autoclass:: dragon.vm.torch.nn.Affine
__init__
--------
.. automethod:: dragon.vm.torch.nn.AffineChannel.__init__
.. automethod:: dragon.vm.torch.nn.Affine.__init__
.. _torch.channel_affine(...): ../channel_affine.html
.. _torch.nn.functional.affine(...): functional/affine.html
.. raw:: html
......
......@@ -24,6 +24,9 @@ vm.torch.nn.functional
`adaptive_max_pool3d(...) <functional/adaptive_max_pool3d.html>`_
: Apply the 3d adaptive max pooling to input.
`affine(...) <functional/affine.html>`_
: Apply the affine transformation to input.
`avg_pool1d(...) <functional/avg_pool1d.html>`_
: Apply the 1d average pooling to input.
......@@ -40,8 +43,11 @@ vm.torch.nn.functional
`binary_cross_entropy_with_logits(...) <functional/binary_cross_entropy_with_logits.html>`_
: Compute the sigmoid cross entropy with contiguous target.
`channel_norm(...) <nn/channel_norm.html>`_
: Apply the normalization to each channel of input.
`channel_shuffle(...) <functional/channel_shuffle.html>`_
: Apply group shuffle to each channel of input.
: Apply the group shuffle to each channel of input.
`[Zhang et al., 2017] <https://arxiv.org/abs/1707.01083>`_.
`conv1d(...) <functional/conv1d.html>`_
......@@ -229,11 +235,13 @@ vm.torch.nn.functional
functional/adaptive_max_pool1d
functional/adaptive_max_pool2d
functional/adaptive_max_pool3d
functional/affine
functional/avg_pool1d
functional/avg_pool2d
functional/avg_pool3d
functional/batch_norm
functional/binary_cross_entropy_with_logits
functional/channel_norm
functional/channel_shuffle
functional/conv1d
functional/conv2d
......
affine
======
.. autofunction:: dragon.vm.torch.nn.functional.affine
.. _torch.nn.affine(...): ../Affine.html
.. raw:: html
<style>
h1:before {
content: "torch.nn.functional.";
color: #103d3e;
}
</style>
channel_norm
============
.. autofunction:: dragon.vm.torch.nn.functional.channel_norm
.. raw:: html
<style>
h1:before {
content: "torch.nn.functional.";
color: #103d3e;
}
</style>
......@@ -56,15 +56,16 @@ class CUDAObjects {
auto& handle = handles[stream_id];
CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST));
CUBLAS_CHECK(cublasSetStream(handle, stream(device_id, stream_id)));
}
auto& handle = handles[stream_id];
#if CUDA_VERSION >= 11000
if (cudnn_allow_tf32_) {
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH));
} else {
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
}
#endif
if (cublas_allow_tf32_) {
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH));
} else {
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
}
return handles[stream_id];
#endif
return handle;
}
/*! \brief Return the specified cudnn handle */
......@@ -150,6 +151,9 @@ class CUDAObjects {
Map<string, ncclComm_t> nccl_comms_[CUDA_MAX_DEVICES];
#endif
/*! \brief The flag that allows cuBLAS TF32 math type or not */
bool cublas_allow_tf32_ = false;
/*! \brief The flag that uses cuDNN or not */
bool cudnn_enabled_ = true;
......
......@@ -20,32 +20,32 @@ namespace dragon {
/*!
* \brief Registry to create class instances.
*/
template <class KeyType, class ObjectType, class... Args>
template <class KeyT, class ClassT, class... Args>
class Registry {
public:
typedef std::function<ObjectType*(Args...)> Creator;
typedef std::function<ClassT*(Args...)> Creator;
/*! \brief Create an instance of specified class */
ObjectType* Create(const KeyType& key, Args... args) {
ClassT* Create(const KeyT& key, Args... args) {
CHECK(registry_.count(key)) << "\nKey(" << key << ") has not registered.";
return registry_[key](args...);
}
/*! \brief Return whether the specified class is registered */
bool Has(const KeyType& key) {
bool Has(const KeyT& key) {
return (registry_.count(key)) != 0;
}
/*! \brief Register a class with the creator */
void Register(const KeyType& key, Creator creator) {
void Register(const KeyT& key, Creator creator) {
CHECK(!registry_.count(key))
<< "\nKey(" << key << ") has already registered.";
registry_[key] = creator;
}
/*! \brief Return the key of registered classes */
vector<KeyType> keys() {
vector<KeyType> ret;
vector<KeyT> keys() {
vector<KeyT> ret;
for (const auto& it : registry_) {
ret.push_back(it.first);
}
......@@ -54,50 +54,49 @@ class Registry {
private:
/*! \brief The registry map */
Map<KeyType, Creator> registry_;
Map<KeyT, Creator> registry_;
};
/*!
* \brief Register creator into the registry.
*/
template <class KeyType, class ObjectType, class... Args>
template <class KeyT, class ClassT, class... Args>
class Registerer {
public:
/*! \brief Constructor with key and creator */
Registerer(
const KeyType& key,
Registry<KeyType, ObjectType, Args...>* registry,
typename Registry<KeyType, ObjectType, Args...>::Creator creator,
const KeyT& key,
Registry<KeyT, ClassT, Args...>* registry,
typename Registry<KeyT, ClassT, Args...>::Creator creator,
const string& help_msg = "") {
registry->Register(key, creator);
}
/*! \brief Return the default creator */
template <class DerivedType>
static ObjectType* DefaultCreator(Args... args) {
return new DerivedType(args...);
template <class DerivedT>
static ClassT* DefaultCreator(Args... args) {
return new DerivedT(args...);
}
};
// Used in *.h files
#define DECLARE_TYPED_REGISTRY(RegistryName, KeyType, ObjectType, ...) \
DRAGON_API Registry<KeyType, ObjectType, ##__VA_ARGS__>* RegistryName(); \
typedef Registerer<KeyType, ObjectType, ##__VA_ARGS__> \
Registerer##RegistryName;
// Used in *.cc files
#define DEFINE_TYPED_REGISTRY(RegistryName, KeyType, ObjectType, ...) \
Registry<KeyType, ObjectType, ##__VA_ARGS__>* RegistryName() { \
static Registry<KeyType, ObjectType, ##__VA_ARGS__>* registry = \
new Registry<KeyType, ObjectType, ##__VA_ARGS__>(); \
return registry; \
// Used in *.h files.
#define DECLARE_TYPED_REGISTRY(RegistryName, KeyT, ClassT, ...) \
DRAGON_API Registry<KeyT, ClassT, ##__VA_ARGS__>* RegistryName(); \
typedef Registerer<KeyT, ClassT, ##__VA_ARGS__> Registerer##RegistryName;
// Used in *.cc files.
#define DEFINE_TYPED_REGISTRY(RegistryName, KeyT, ClassT, ...) \
Registry<KeyT, ClassT, ##__VA_ARGS__>* RegistryName() { \
static Registry<KeyT, ClassT, ##__VA_ARGS__>* registry = \
new Registry<KeyT, ClassT, ##__VA_ARGS__>(); \
return registry; \
}
#define DECLARE_REGISTRY(RegistryName, ObjectType, ...) \
DECLARE_TYPED_REGISTRY(RegistryName, string, ObjectType, ##__VA_ARGS__)
#define DECLARE_REGISTRY(RegistryName, ClassT, ...) \
DECLARE_TYPED_REGISTRY(RegistryName, string, ClassT, ##__VA_ARGS__)
#define DEFINE_REGISTRY(RegistryName, ObjectType, ...) \
DEFINE_TYPED_REGISTRY(RegistryName, string, ObjectType, ##__VA_ARGS__)
#define DEFINE_REGISTRY(RegistryName, ClassT, ...) \
DEFINE_TYPED_REGISTRY(RegistryName, string, ClassT, ##__VA_ARGS__)
#define REGISTER_TYPED_CLASS(RegistryName, key, ...) \
static Registerer##RegistryName ANONYMOUS_VARIABLE(g_##RegistryName)( \
......
#include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
namespace kernels {
namespace {
template <typename T>
void _ChannelAffine(
const int N,
const int S,
const int C,
const T* x,
const T* scale,
const T* bias,
T* y) {
if (S == 1) {
if (bias != nullptr) {
EigenArrayMap<T>(y, C, N) = (ConstEigenArrayMap<T>(x, C, N).colwise() *
ConstEigenVectorArrayMap<T>(scale, C))
.colwise() +
ConstEigenVectorArrayMap<T>(bias, C);
} else {
EigenArrayMap<T>(y, C, N) = ConstEigenArrayMap<T>(x, C, N).colwise() *
ConstEigenVectorArrayMap<T>(scale, C);
}
return;
}
for (int i = 0; i < N; ++i) {
for (int j = 0; j < C; ++j) {
if (bias != nullptr) {
EigenVectorArrayMap<T>(y, S) =
ConstEigenVectorArrayMap<T>(x, S) * scale[j] + bias[j];
} else {
EigenVectorArrayMap<T>(y, S) =
ConstEigenVectorArrayMap<T>(x, S) * scale[j];
}
x += S;
y += S;
}
}
}
} // namespace
/* ------------------- Launcher Separator ------------------- */
template <>
void ChannelAffine<float16, CPUContext>(
const int N,
const int S,
const int C,
const float16* x,
const float16* w,
const float16* b,
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void ChannelAffine<T, CPUContext>( \
const int N, \
const int S, \
const int C, \
const T* x, \
const T* scale, \
const T* bias, \
T* y, \
CPUContext* ctx) { \
_ChannelAffine(N, S, C, x, scale, bias, y); \
}
DEFINE_KERNEL_LAUNCHER(uint8_t);
DEFINE_KERNEL_LAUNCHER(int8_t);
DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels
} // namespace dragon
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
namespace kernels {
namespace {
template <typename T, typename AccT>
__global__ void _ChannelAffine(
const int NxCxS,
const int S,
const int C,
const T* x,
const T* scale,
T* y) {
CUDA_1D_KERNEL_LOOP(i, NxCxS) {
y[i] = convert::To<T>(
convert::To<AccT>(x[i]) *
convert::To<AccT>(__ldg(scale + (i / S) % C)));
}
}
template <typename T, typename AccT>
__global__ void _ChannelAffine(
const int NxCxS,
const int S,
const int C,
const T* x,
const T* scale,
const T* bias,
T* y) {
CUDA_1D_KERNEL_LOOP(i, NxCxS) {
const int j = (i / S) % C;
y[i] = convert::To<T>(
fma(convert::To<AccT>(x[i]),
convert::To<AccT>(__ldg(scale + j)),
convert::To<AccT>(__ldg(bias + j))));
}
}
} // namespace
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void ChannelAffine<T, CUDAContext>( \
const int N, \
const int S, \
const int C, \
const T* x, \
const T* scale, \
const T* bias, \
T* y, \
CUDAContext* ctx) { \
const auto NxCxS = N * C * S; \
if (bias != nullptr) { \
_ChannelAffine<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \
<<<CUDA_BLOCKS(NxCxS), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxCxS, \
S, \
C, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<const math::ScalarType<T>::type*>(scale), \
reinterpret_cast<const math::ScalarType<T>::type*>(bias), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
} else { \
_ChannelAffine<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \
<<<CUDA_BLOCKS(NxCxS), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxCxS, \
S, \
C, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<const math::ScalarType<T>::type*>(scale), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
} \
}
DEFINE_KERNEL_LAUNCHER(uint8_t);
DEFINE_KERNEL_LAUNCHER(int8_t);
DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels
} // namespace dragon
#endif // USE_CUDA
#include "dragon/utils/op_kernels.h"
namespace dragon {
namespace kernels {
namespace {
template <typename T>
void _ChannelShuffle(
const int N,
const int S,
const int G,
const int K,
const T* x,
T* y) {
for (int i = 0; i < N; ++i) {
for (int gi = 0; gi < G; ++gi) {
for (int ki = 0; ki < K; ++ki) {
std::memcpy(
y + ((i * K + ki) * G + gi) * S,
x + ((i * G + gi) * K + ki) * S,
S * sizeof(T));
}
}
}
}
} // namespace
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void ChannelShuffle<T, CPUContext>( \
const int N, \
const int S, \
const int C, \
const int G, \
const T* x, \
T* y, \
CPUContext* ctx) { \
_ChannelShuffle(N, S, G, C / G, x, y); \
}
DEFINE_KERNEL_LAUNCHER(bool);
DEFINE_KERNEL_LAUNCHER(uint8_t);
DEFINE_KERNEL_LAUNCHER(int8_t);
DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels
} // namespace dragon
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
namespace kernels {
namespace {
template <typename T>
__global__ void _ChannelShuffle(
const int NxCxS,
const int S,
const int G,
const int K,
const T* x,
T* y) {
CUDA_1D_KERNEL_LOOP(index, NxCxS) {
const int j = index % S;
const int gi = index / S % G;
const int ki = index / S / G % K;
const int i = index / S / G / K;
y[index] = x[((i * G + gi) * K + ki) * S + j];
}
}
} // namespace
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void ChannelShuffle<T, CUDAContext>( \
const int N, \
const int S, \
const int C, \
const int G, \
const T* x, \
T* y, \
CUDAContext* ctx) { \
const auto NxCxS = N * C * S; \
_ChannelShuffle<<< \
CUDA_BLOCKS(NxCxS), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>(NxCxS, S, G, C / G, x, y); \
}
DEFINE_KERNEL_LAUNCHER(bool);
DEFINE_KERNEL_LAUNCHER(uint8_t);
DEFINE_KERNEL_LAUNCHER(int8_t);
DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels
} // namespace dragon
#endif // USE_CUDA
......@@ -52,14 +52,23 @@ __global__ void _ComputeCounts(
CUDAContext* ctx) { \
math::Copy(dim, x, y, ctx); \
auto policy = thrust::cuda::par.on(ctx->cuda_stream()); \
auto* data = reinterpret_cast<math::ScalarType<T>::type*>(y); \
thrust::device_vector<int> order1(dim), order2(dim); \
thrust::sequence(policy, order1.begin(), order1.end()); \
thrust::sequence(policy, order2.begin(), order2.end()); \
thrust::sort_by_key( \
policy, y, y + dim, order1.begin(), math::LessFunctor<T>()); \
policy, \
data, \
data + dim, \
order1.begin(), \
math::LessFunctor<math::ScalarType<T>::type>()); \
auto last = thrust::unique_by_key( \
policy, y, y + dim, order2.begin(), math::EqualFunctor<T>()); \
int n = num[0] = last.first - y; \
policy, \
data, \
data + dim, \
order2.begin(), \
math::EqualFunctor<math::ScalarType<T>::type>()); \
int n = num[0] = last.first - data; \
if (inverse_index) { \
_RemapInverse<<<CUDA_BLOCKS(n), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
dim, n, order1.data(), order2.data(), inverse_index); \
......
......@@ -8,7 +8,7 @@ namespace kernels {
namespace {
template <typename InputT, typename OutputT>
void _ChannelNormalize(
void _ChannelNorm(
const int axis,
const int num_dims,
const int64_t* x_strides,
......@@ -19,15 +19,14 @@ void _ChannelNormalize(
OutputT* y) {
const auto N = math::utils::Prod(num_dims, y_dims);
vec64_t idx(num_dims, 0);
int64_t xi, wi;
for (int yi = 0; yi < N; ++yi) {
xi = 0;
int64_t xi = 0, wi;
for (int d = num_dims - 1; d >= 0; --d) {
xi += idx[d] * x_strides[d];
if (d == axis) wi = idx[d];
}
y[yi] =
convert::To<OutputT>((convert::To<float>(x[xi]) - mean[wi]) / std[wi]);
const float val = convert::To<float>(x[xi]);
y[yi] = convert::To<OutputT>((val - mean[wi]) / std[wi]);
math::utils::IncreaseIndexInDims(num_dims, y_dims, idx.data());
}
}
......@@ -36,19 +35,19 @@ void _ChannelNormalize(
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(InputT, OutputT) \
template <> \
void ChannelNormalize<InputT, OutputT, CPUContext>( \
const int axis, \
const int num_dims, \
const int64_t* x_strides, \
const int64_t* y_dims, \
const InputT* x, \
const float* mean, \
const float* std, \
OutputT* y, \
CPUContext* ctx) { \
_ChannelNormalize(axis, num_dims, x_strides, y_dims, x, mean, std, y); \
#define DEFINE_KERNEL_LAUNCHER(InputT, OutputT) \
template <> \
void ChannelNorm<InputT, OutputT, CPUContext>( \
const int axis, \
const int num_dims, \
const int64_t* x_strides, \
const int64_t* y_dims, \
const InputT* x, \
const float* mean, \
const float* std, \
OutputT* y, \
CPUContext* ctx) { \
_ChannelNorm(axis, num_dims, x_strides, y_dims, x, mean, std, y); \
}
DEFINE_KERNEL_LAUNCHER(uint8_t, float16);
......
......@@ -11,7 +11,7 @@ namespace kernels {
namespace {
template <typename InputT, typename OutputT, int D>
__global__ void _ChannelNormalize(
__global__ void _ChannelNorm(
const int N,
const int axis,
const int num_dims,
......@@ -38,31 +38,27 @@ __global__ void _ChannelNormalize(
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(InputT, OutputT) \
template <> \
void ChannelNormalize<InputT, OutputT, CUDAContext>( \
const int axis, \
const int num_dims, \
const int64_t* x_strides, \
const int64_t* y_dims, \
const InputT* x, \
const float* mean, \
const float* std, \
OutputT* y, \
CUDAContext* ctx) { \
CUDA_TENSOR_DIMS_CHECK(num_dims); \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_strides, Y_dims; \
const auto N = math::utils::Prod(num_dims, y_dims); \
for (int i = 0; i < num_dims; ++i) { \
X_strides.data[i] = x_strides[i]; \
Y_dims.data[i] = y_dims[i]; \
} \
_ChannelNormalize<<< \
CUDA_BLOCKS(N), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
N, axis, num_dims, X_strides, Y_dims, x, mean, std, y); \
#define DEFINE_KERNEL_LAUNCHER(InputT, OutputT) \
template <> \
void ChannelNorm<InputT, OutputT, CUDAContext>( \
const int axis, \
const int num_dims, \
const int64_t* x_strides, \
const int64_t* y_dims, \
const InputT* x, \
const float* mean, \
const float* std, \
OutputT* y, \
CUDAContext* ctx) { \
CUDA_TENSOR_DIMS_CHECK(num_dims); \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_strides, Y_dims; \
const auto N = math::utils::Prod(num_dims, y_dims); \
for (int i = 0; i < num_dims; ++i) { \
X_strides.data[i] = x_strides[i]; \
Y_dims.data[i] = y_dims[i]; \
} \
_ChannelNorm<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, axis, num_dims, X_strides, Y_dims, x, mean, std, y); \
}
DEFINE_KERNEL_LAUNCHER(uint8_t, float16);
......
......@@ -8,7 +8,7 @@ namespace kernels {
namespace {
template <typename T>
void _L1Normalize(
void _L1Norm(
const int N,
const int S,
const int C,
......@@ -28,7 +28,7 @@ void _L1Normalize(
}
template <typename T>
void _L2Normalize(
void _L2Norm(
const int N,
const int S,
const int C,
......@@ -48,7 +48,7 @@ void _L2Normalize(
}
template <typename T>
void _L1NormalizeGrad(
void _L1NormGrad(
const int N,
const int S,
const int C,
......@@ -73,7 +73,7 @@ void _L1NormalizeGrad(
}
template <typename T>
void _L2NormalizeGrad(
void _L2NormGrad(
const int N,
const int S,
const int C,
......@@ -101,7 +101,7 @@ void _L2NormalizeGrad(
/* ------------------- Launcher Separator ------------------- */
template <>
void L1Normalize<float16, CPUContext>(
void L1Norm<float16, CPUContext>(
const int N,
const int S,
const int C,
......@@ -114,7 +114,7 @@ void L1Normalize<float16, CPUContext>(
}
template <>
void L2Normalize<float16, CPUContext>(
void L2Norm<float16, CPUContext>(
const int N,
const int S,
const int C,
......@@ -127,7 +127,7 @@ void L2Normalize<float16, CPUContext>(
}
template <>
void L1NormalizeGrad<float16, CPUContext>(
void L1NormGrad<float16, CPUContext>(
const int N,
const int S,
const int C,
......@@ -138,10 +138,10 @@ void L1NormalizeGrad<float16, CPUContext>(
float16* dx,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
} // L1NormalizeGrad
} // L1NormGrad
template <>
void L2NormalizeGrad<float16, CPUContext>(
void L2NormGrad<float16, CPUContext>(
const int N,
const int S,
const int C,
......@@ -152,7 +152,7 @@ void L2NormalizeGrad<float16, CPUContext>(
float16* dx,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
} // L2NormalizeGrad
} // L2NormGrad
#define DEFINE_KERNEL_LAUNCHER(name, T) \
template <> \
......@@ -183,14 +183,14 @@ void L2NormalizeGrad<float16, CPUContext>(
_##name<T>(N, S, C, normalizer, eps, dy, x, dx); \
}
DEFINE_KERNEL_LAUNCHER(L1Normalize, float);
DEFINE_KERNEL_LAUNCHER(L1Normalize, double);
DEFINE_KERNEL_LAUNCHER(L2Normalize, float);
DEFINE_KERNEL_LAUNCHER(L2Normalize, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, double);
DEFINE_KERNEL_LAUNCHER(L1Norm, float);
DEFINE_KERNEL_LAUNCHER(L1Norm, double);
DEFINE_KERNEL_LAUNCHER(L2Norm, float);
DEFINE_KERNEL_LAUNCHER(L2Norm, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormGrad, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormGrad, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormGrad, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormGrad, double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
......@@ -12,7 +12,7 @@ namespace kernels {
namespace {
template <typename T, typename AccT>
__global__ void _L1Normalize(
__global__ void _L1Norm(
const int NxS,
const int S,
const int C,
......@@ -41,7 +41,7 @@ __global__ void _L1Normalize(
}
template <typename T, typename AccT>
__global__ void _L2Normalize(
__global__ void _L2Norm(
const int NxS,
const int S,
const int C,
......@@ -70,7 +70,7 @@ __global__ void _L2Normalize(
}
template <typename T, typename AccT>
__global__ void _L1NormalizeGrad(
__global__ void _L1NormGrad(
const int NxS,
const int S,
const int C,
......@@ -107,7 +107,7 @@ __global__ void _L1NormalizeGrad(
}
template <typename T, typename AccT>
__global__ void _L2NormalizeGrad(
__global__ void _L2NormGrad(
const int NxS,
const int S,
const int C,
......@@ -195,18 +195,18 @@ __global__ void _L2NormalizeGrad(
reinterpret_cast<math::ScalarType<T>::type*>(dx)); \
}
DEFINE_KERNEL_LAUNCHER(L1Normalize, float16, float);
DEFINE_KERNEL_LAUNCHER(L1Normalize, float, float);
DEFINE_KERNEL_LAUNCHER(L1Normalize, double, double);
DEFINE_KERNEL_LAUNCHER(L2Normalize, float16, float);
DEFINE_KERNEL_LAUNCHER(L2Normalize, float, float);
DEFINE_KERNEL_LAUNCHER(L2Normalize, double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, float16, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormalizeGrad, double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, float16, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormalizeGrad, double, double);
DEFINE_KERNEL_LAUNCHER(L1Norm, float16, float);
DEFINE_KERNEL_LAUNCHER(L1Norm, float, float);
DEFINE_KERNEL_LAUNCHER(L1Norm, double, double);
DEFINE_KERNEL_LAUNCHER(L2Norm, float16, float);
DEFINE_KERNEL_LAUNCHER(L2Norm, float, float);
DEFINE_KERNEL_LAUNCHER(L2Norm, double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormGrad, float16, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormGrad, float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L1NormGrad, double, double);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormGrad, float16, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormGrad, float, float);
DEFINE_GRAD_KERNEL_LAUNCHER(L2NormGrad, double, double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
......
#include "dragon/utils/conversions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
namespace kernels {
template <>
void Adam<float, CPUContext>(
namespace {
template <typename T, typename CopyT>
void _Adam(
const int N,
const float lr,
const float beta1,
const float beta2,
const float eps,
float* g,
float* m,
float* v,
CPUContext* ctx) {
const T lr,
const T beta1,
const T beta2,
const T eps,
const T wd,
const T* x,
const T* g,
T* m,
T* v,
T* y,
CopyT* y_copy) {
for (int i = 0; i < N; ++i) {
float gi = g[i];
float mi = m[i] = m[i] * beta1 + gi * (1 - beta1);
float vi = v[i] = v[i] * beta2 + gi * gi * (1 - beta2);
g[i] = lr * mi / (std::sqrt(vi) + eps);
const T gi = wd > T(0) ? std::fma(wd, x[i], g[i]) : g[i];
const T mi = m[i] = std::fma(beta1, m[i], (T(1) - beta1) * gi);
const T vi = v[i] = std::fma(beta2, v[i], (T(1) - beta2) * gi * gi);
y[i] -= lr * mi / (std::sqrt(vi) + eps);
if (y_copy != nullptr) {
y_copy[i] = convert::To<CopyT>(y[i]);
}
}
}
template <>
void AdamW<float, CPUContext>(
template <typename T, typename CopyT>
void _AdamW(
const int N,
const float lr,
const float beta1,
const float beta2,
const float eps,
const float wd,
const float* x,
float* g,
float* m,
float* v,
CPUContext* ctx) {
const T lr,
const T beta1,
const T beta2,
const T eps,
const T wd,
const T* x,
const T* g,
T* m,
T* v,
T* y,
CopyT* y_copy) {
for (int i = 0; i < N; ++i) {
float gi = g[i];
float mi = m[i] = m[i] * beta1 + gi * (1 - beta1);
float vi = v[i] = v[i] * beta2 + gi * gi * (1 - beta2);
g[i] = lr * mi / (std::sqrt(vi) + eps) + wd * x[i];
const T gi = g[i];
const T mi = m[i] = std::fma(beta1, m[i], (T(1) - beta1) * gi);
const T vi = v[i] = std::fma(beta2, v[i], (T(1) - beta2) * gi * gi);
y[i] -= wd > T(0) ? std::fma(wd, x[i], lr * mi / (std::sqrt(vi) + eps))
: lr * mi / (std::sqrt(vi) + eps);
if (y_copy != nullptr) {
y_copy[i] = convert::To<CopyT>(y[i]);
}
}
}
} // namespace
#define DEFINE_KERNEL_LAUNCHER(name, T, CopyT) \
template <> \
void name<T, CopyT, CPUContext>( \
const int N, \
const float lr, \
const float beta1, \
const float beta2, \
const float eps, \
const float wd, \
const T* x, \
const T* g, \
T* m, \
T* v, \
T* y, \
CopyT* y_copy, \
CPUContext* ctx) { \
_##name( \
N, \
convert::To<T>(lr), \
convert::To<T>(beta1), \
convert::To<T>(beta2), \
convert::To<T>(eps), \
convert::To<T>(wd), \
x, \
g, \
m, \
v, \
y, \
y_copy); \
}
DEFINE_KERNEL_LAUNCHER(Adam, float, float16);
DEFINE_KERNEL_LAUNCHER(Adam, float, float);
DEFINE_KERNEL_LAUNCHER(Adam, double, double);
DEFINE_KERNEL_LAUNCHER(AdamW, float, float16);
DEFINE_KERNEL_LAUNCHER(AdamW, float, float);
DEFINE_KERNEL_LAUNCHER(AdamW, double, double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels
} // namespace dragon
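In equation form, the two CPU kernels above differ only in where the weight decay \lambda (wd) enters; x is the decay reference and y the updated weights (typically the same tensor), and any bias correction is assumed to be folded into lr upstream. Adam couples the decay into the gradient, AdamW decouples it into the step:

Adam:
  \tilde{g}_t = g_t + \lambda x_t, \quad
  m_t = \beta_1 m_{t-1} + (1-\beta_1)\tilde{g}_t, \quad
  v_t = \beta_2 v_{t-1} + (1-\beta_2)\tilde{g}_t^2, \quad
  y_{t+1} = y_t - \mathrm{lr}\,\frac{m_t}{\sqrt{v_t}+\epsilon}

AdamW:
  m_t = \beta_1 m_{t-1} + (1-\beta_1) g_t, \quad
  v_t = \beta_2 v_{t-1} + (1-\beta_2) g_t^2, \quad
  y_{t+1} = y_t - \Big(\mathrm{lr}\,\frac{m_t}{\sqrt{v_t}+\epsilon} + \lambda x_t\Big)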
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......@@ -9,25 +10,32 @@ namespace kernels {
namespace {
template <typename T>
template <typename T, typename CopyT>
__global__ void _Adam(
const int N,
const T lr,
const T beta1,
const T beta2,
const T eps,
T* g,
const T wd,
const T* x,
const T* g,
T* m,
T* v) {
T* v,
T* y,
CopyT* y_copy) {
CUDA_1D_KERNEL_LOOP(i, N) {
T gi = g[i];
T mi = m[i] = m[i] * beta1 + gi * (1 - beta1);
T vi = v[i] = v[i] * beta2 + gi * gi * (1 - beta2);
g[i] = lr * mi / (sqrt(vi) + eps);
const T gi = wd > T(0) ? fma(wd, x[i], g[i]) : g[i];
const T mi = m[i] = fma(beta1, m[i], (T(1) - beta1) * gi);
const T vi = v[i] = fma(beta2, v[i], (T(1) - beta2) * gi * gi);
y[i] -= lr * mi / (sqrt(vi) + eps);
if (y_copy != nullptr) {
y_copy[i] = convert::To<CopyT>(y[i]);
}
}
}
template <typename T>
template <typename T, typename CopyT>
__global__ void _AdamW(
const int N,
const T lr,
......@@ -36,14 +44,20 @@ __global__ void _AdamW(
const T eps,
const T wd,
const T* x,
T* g,
const T* g,
T* m,
T* v) {
T* v,
T* y,
CopyT* y_copy) {
CUDA_1D_KERNEL_LOOP(i, N) {
T gi = g[i];
T mi = m[i] = m[i] * beta1 + gi * (1 - beta1);
T vi = v[i] = v[i] * beta2 + gi * gi * (1 - beta2);
g[i] = lr * mi / (sqrt(vi) + eps) + wd * x[i];
const T gi = g[i];
const T mi = m[i] = fma(beta1, m[i], (T(1) - beta1) * gi);
const T vi = v[i] = fma(beta2, v[i], (T(1) - beta2) * gi * gi);
y[i] -= wd > T(0) ? fma(wd, x[i], lr * mi / (sqrt(vi) + eps))
: lr * mi / (sqrt(vi) + eps);
if (y_copy != nullptr) {
y_copy[i] = convert::To<CopyT>(y[i]);
}
}
}
......@@ -51,37 +65,44 @@ __global__ void _AdamW(
/* ------------------- Launcher Separator ------------------- */
template <>
void Adam<float, CUDAContext>(
const int N,
const float lr,
const float beta1,
const float beta2,
const float eps,
float* g,
float* m,
float* v,
CUDAContext* ctx) {
_Adam<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, lr, beta1, beta2, eps, g, m, v);
}
#define DEFINE_KERNEL_LAUNCHER(name, T, CopyT) \
template <> \
void name<T, CopyT, CUDAContext>( \
const int N, \
const float lr, \
const float beta1, \
const float beta2, \
const float eps, \
const float wd, \
const T* x, \
const T* g, \
T* m, \
T* v, \
T* y, \
CopyT* y_copy, \
CUDAContext* ctx) { \
_##name<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
convert::To<T>(lr), \
convert::To<T>(beta1), \
convert::To<T>(beta2), \
convert::To<T>(eps), \
convert::To<T>(wd), \
x, \
g, \
m, \
v, \
y, \
reinterpret_cast<math::ScalarType<CopyT>::type*>(y_copy)); \
}
template <>
void AdamW<float, CUDAContext>(
const int N,
const float lr,
const float beta1,
const float beta2,
const float eps,
const float wd,
const float* x,
float* g,
float* m,
float* v,
CUDAContext* ctx) {
_AdamW<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, lr, beta1, beta2, eps, wd, x, g, m, v);
}
DEFINE_KERNEL_LAUNCHER(Adam, float, float16);
DEFINE_KERNEL_LAUNCHER(Adam, float, float);
DEFINE_KERNEL_LAUNCHER(Adam, double, double);
DEFINE_KERNEL_LAUNCHER(AdamW, float, float16);
DEFINE_KERNEL_LAUNCHER(AdamW, float, float);
DEFINE_KERNEL_LAUNCHER(AdamW, double, double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels
......
#include "dragon/utils/conversions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
namespace kernels {
template <>
void RMSprop<float, CPUContext>(
namespace {
template <typename T, typename CopyT>
void _RMSprop(
const int N,
const float lr,
const float momentum,
const float decay,
const float eps,
float* g,
float* m,
float* v,
CPUContext* ctx) {
const T lr,
const T momentum,
const T alpha,
const T eps,
const T wd,
const T* x,
const T* g,
T* m,
T* v,
T* y,
CopyT* y_copy) {
for (int i = 0; i < N; ++i) {
float gi = g[i];
float vi = v[i] = decay * v[i] + (1 - decay) * gi * gi;
float mi = m[i] = std::fma(momentum, m[i], gi / (std::sqrt(vi) + eps));
g[i] = lr * mi;
const T gi = wd > T(0) ? std::fma(wd, x[i], g[i]) : g[i];
const T vi = v[i] = std::fma(alpha, v[i], (T(1) - alpha) * gi * gi);
const T mi = m[i] = std::fma(momentum, m[i], gi / (std::sqrt(vi) + eps));
y[i] -= lr * mi;
if (y_copy != nullptr) {
y_copy[i] = convert::To<CopyT>(y[i]);
}
}
}
} // namespace
#define DEFINE_KERNEL_LAUNCHER(name, T, CopyT) \
template <> \
void name<T, CopyT, CPUContext>( \
const int N, \
const float lr, \
const float momentum, \
const float alpha, \
const float eps, \
const float wd, \
const T* x, \
const T* g, \
T* m, \
T* v, \
T* y, \
CopyT* y_copy, \
CPUContext* ctx) { \
_##name( \
N, \
convert::To<T>(lr), \
convert::To<T>(momentum), \
convert::To<T>(alpha), \
convert::To<T>(eps), \
convert::To<T>(wd), \
x, \
g, \
m, \
v, \
y, \
y_copy); \
}
DEFINE_KERNEL_LAUNCHER(RMSprop, float, float16);
DEFINE_KERNEL_LAUNCHER(RMSprop, float, float);
DEFINE_KERNEL_LAUNCHER(RMSprop, double, double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels
} // namespace dragon
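The fused RMSprop kernel above, in equation form (\alpha is the renamed alpha argument, formerly decay; \mu the momentum):

  \tilde{g}_t = g_t + \lambda x_t \;(\text{if } \lambda > 0), \quad
  v_t = \alpha v_{t-1} + (1-\alpha)\tilde{g}_t^2, \quad
  m_t = \mu m_{t-1} + \frac{\tilde{g}_t}{\sqrt{v_t}+\epsilon}, \quad
  y_{t+1} = y_t - \mathrm{lr}\, m_t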
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......@@ -9,21 +10,28 @@ namespace kernels {
namespace {
template <typename T>
template <typename T, typename CopyT>
__global__ void _RMSprop(
const int N,
const T lr,
const T momentum,
const T decay,
const T alpha,
const T eps,
T* g,
const T wd,
const T* x,
const T* g,
T* m,
T* v) {
T* v,
T* y,
CopyT* y_copy) {
CUDA_1D_KERNEL_LOOP(i, N) {
T gi = g[i];
T vi = v[i] = decay * v[i] + (1 - decay) * gi * gi;
T mi = m[i] = fma(momentum, m[i], gi / (sqrt(vi) + eps));
g[i] = lr * mi;
const T gi = wd > T(0) ? fma(wd, x[i], g[i]) : g[i];
const T vi = v[i] = fma(alpha, v[i], (T(1) - alpha) * gi * gi);
const T mi = m[i] = fma(momentum, m[i], gi / (sqrt(vi) + eps));
y[i] -= lr * mi;
if (y_copy != nullptr) {
y_copy[i] = convert::To<CopyT>(y[i]);
}
}
}
......@@ -31,20 +39,41 @@ __global__ void _RMSprop(
/* ------------------- Launcher Separator ------------------- */
template <>
void RMSprop<float, CUDAContext>(
const int N,
const float lr,
const float momentum,
const float decay,
const float eps,
float* g,
float* m,
float* v,
CUDAContext* ctx) {
_RMSprop<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, lr, momentum, decay, eps, g, m, v);
}
#define DEFINE_KERNEL_LAUNCHER(name, T, CopyT) \
template <> \
void name<T, CopyT, CUDAContext>( \
const int N, \
const float lr, \
const float momentum, \
const float alpha, \
const float eps, \
const float wd, \
const T* x, \
const T* g, \
T* m, \
T* v, \
T* y, \
CopyT* y_copy, \
CUDAContext* ctx) { \
_##name<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
convert::To<T>(lr), \
convert::To<T>(momentum), \
convert::To<T>(alpha), \
convert::To<T>(eps), \
convert::To<T>(wd), \
x, \
g, \
m, \
v, \
y, \
reinterpret_cast<math::ScalarType<CopyT>::type*>(y_copy)); \
}
DEFINE_KERNEL_LAUNCHER(RMSprop, float, float16);
DEFINE_KERNEL_LAUNCHER(RMSprop, float, float);
DEFINE_KERNEL_LAUNCHER(RMSprop, double, double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels
......
#include "dragon/utils/conversions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
namespace kernels {
template <>
void MomentumSGD<float, CPUContext>(
namespace {
template <typename T, typename CopyT>
void _MomentumSGD(
const int N,
const float lr,
const float momentum,
float* g,
float* m,
CPUContext* ctx) {
const T lr,
const T momentum,
const T wd,
const T* x,
const T* g,
T* m,
T* y,
CopyT* y_copy) {
for (int i = 0; i < N; ++i) {
float mi = m[i] = std::fma(momentum, m[i], g[i]);
g[i] = lr * mi;
const T gi = wd > T(0) ? std::fma(wd, x[i], g[i]) : g[i];
const T mi = m[i] = std::fma(momentum, m[i], gi);
y[i] -= lr * mi;
if (y_copy != nullptr) {
y_copy[i] = convert::To<CopyT>(y[i]);
}
}
}
template <>
void NesterovSGD<float, CPUContext>(
template <typename T, typename CopyT>
void _NesterovSGD(
const int N,
const float lr,
const float momentum,
float* g,
float* m,
CPUContext* ctx) {
const T lr,
const T momentum,
const T wd,
const T* x,
const T* g,
T* m,
T* y,
CopyT* y_copy) {
for (int i = 0; i < N; ++i) {
float gi = g[i];
float mi = m[i] = std::fma(momentum, m[i], gi);
g[i] = lr * std::fma(momentum, mi, gi);
const T gi = wd > T(0) ? std::fma(wd, x[i], g[i]) : g[i];
const T mi = m[i] = std::fma(momentum, m[i], gi);
y[i] -= lr * std::fma(momentum, mi, gi);
if (y_copy != nullptr) {
y_copy[i] = convert::To<CopyT>(y[i]);
}
}
}
} // namespace
#define DEFINE_KERNEL_LAUNCHER(name, T, CopyT) \
template <> \
void name<T, CopyT, CPUContext>( \
const int N, \
const float lr, \
const float momentum, \
const float wd, \
const T* x, \
const T* g, \
T* m, \
T* y, \
CopyT* y_copy, \
CPUContext* ctx) { \
_##name( \
N, \
convert::To<T>(lr), \
convert::To<T>(momentum), \
convert::To<T>(wd), \
x, \
g, \
m, \
y, \
y_copy); \
}
DEFINE_KERNEL_LAUNCHER(MomentumSGD, float, float16);
DEFINE_KERNEL_LAUNCHER(MomentumSGD, float, float);
DEFINE_KERNEL_LAUNCHER(MomentumSGD, double, double);
DEFINE_KERNEL_LAUNCHER(NesterovSGD, float, float16);
DEFINE_KERNEL_LAUNCHER(NesterovSGD, float, float);
DEFINE_KERNEL_LAUNCHER(NesterovSGD, double, double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels
} // namespace dragon
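For reference, the two fused SGD kernels above implement (with \tilde{g}_t = g_t + \lambda x_t when \lambda > 0 and \mu the momentum):

  MomentumSGD: \; m_t = \mu m_{t-1} + \tilde{g}_t, \quad y_{t+1} = y_t - \mathrm{lr}\, m_t
  NesterovSGD: \; m_t = \mu m_{t-1} + \tilde{g}_t, \quad y_{t+1} = y_t - \mathrm{lr}\,(\mu m_t + \tilde{g}_t)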
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......@@ -9,22 +10,45 @@ namespace kernels {
namespace {
template <typename T>
__global__ void
_MomentumSGD(const int N, const T lr, const T momentum, T* g, T* m) {
template <typename T, typename CopyT>
__global__ void _MomentumSGD(
const int N,
const T lr,
const T momentum,
const T wd,
const T* x,
const T* g,
T* m,
T* y,
CopyT* y_copy) {
CUDA_1D_KERNEL_LOOP(i, N) {
T mi = m[i] = fma(momentum, m[i], g[i]);
g[i] = lr * mi;
const T gi = wd > T(0) ? fma(wd, x[i], g[i]) : g[i];
const T mi = m[i] = fma(momentum, m[i], gi);
y[i] -= lr * mi;
if (y_copy != nullptr) {
y_copy[i] = convert::To<CopyT>(y[i]);
}
}
}
template <typename T>
__global__ void
_NesterovSGD(const int N, const T lr, const T momentum, T* g, T* m) {
template <typename T, typename CopyT>
__global__ void _NesterovSGD(
const int N,
const T lr,
const T momentum,
const T wd,
const T* x,
const T* g,
T* m,
T* y,
CopyT* y_copy) {
CUDA_1D_KERNEL_LOOP(i, N) {
T gi = g[i];
T mi = m[i] = fma(momentum, m[i], gi);
g[i] = lr * fma(momentum, mi, gi);
const T gi = wd > T(0) ? fma(wd, x[i], g[i]) : g[i];
const T mi = m[i] = fma(momentum, m[i], gi);
y[i] -= lr * fma(momentum, mi, gi);
if (y_copy != nullptr) {
y_copy[i] = convert::To<CopyT>(y[i]);
}
}
}
......@@ -32,29 +56,38 @@ _NesterovSGD(const int N, const T lr, const T momentum, T* g, T* m) {
/* ------------------- Launcher Separator ------------------- */
template <>
void MomentumSGD<float, CUDAContext>(
const int N,
const float lr,
const float momentum,
float* g,
float* m,
CUDAContext* ctx) {
_MomentumSGD<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, lr, momentum, g, m);
}
#define DEFINE_KERNEL_LAUNCHER(name, T, CopyT) \
template <> \
void name<T, CopyT, CUDAContext>( \
const int N, \
const float lr, \
const float momentum, \
const float wd, \
const T* x, \
const T* g, \
T* m, \
T* y, \
CopyT* y_copy, \
CUDAContext* ctx) { \
_##name<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
convert::To<T>(lr), \
convert::To<T>(momentum), \
convert::To<T>(wd), \
x, \
g, \
m, \
y, \
reinterpret_cast<math::ScalarType<CopyT>::type*>(y_copy)); \
}
template <>
void NesterovSGD<float, CUDAContext>(
const int N,
const float lr,
const float momentum,
float* g,
float* m,
CUDAContext* ctx) {
_NesterovSGD<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, lr, momentum, g, m);
}
DEFINE_KERNEL_LAUNCHER(MomentumSGD, float, float16);
DEFINE_KERNEL_LAUNCHER(MomentumSGD, float, float);
DEFINE_KERNEL_LAUNCHER(MomentumSGD, double, double);
DEFINE_KERNEL_LAUNCHER(NesterovSGD, float, float16);
DEFINE_KERNEL_LAUNCHER(NesterovSGD, float, float);
DEFINE_KERNEL_LAUNCHER(NesterovSGD, double, double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels
......
......@@ -91,16 +91,24 @@ void RegisterModule_cuda(py::module& m) {
#endif
});
/*! \brief Set the flags of cuBLAS library */
m.def("cublasSetFlags", [](int allow_tf32) {
#ifdef USE_CUDA
auto& ctx = CUDAContext::objects();
if (allow_tf32 >= 0) ctx.cublas_allow_tf32_ = allow_tf32;
#endif
});
/*! \brief Set the flags of cuDNN library */
m.def(
"cudnnSetFlags",
[](bool enabled, bool benchmark, bool deterministic, bool allow_tf32) {
[](int enabled, int benchmark, int deterministic, int allow_tf32) {
#ifdef USE_CUDA
auto& cuda_objects = CUDAContext::objects();
cuda_objects.cudnn_enabled_ = enabled;
cuda_objects.cudnn_deterministic_ = deterministic;
cuda_objects.cudnn_benchmark_ = benchmark;
cuda_objects.cudnn_allow_tf32_ = allow_tf32;
auto& ctx = CUDAContext::objects();
if (enabled >= 0) ctx.cudnn_enabled_ = enabled;
if (benchmark >= 0) ctx.cudnn_benchmark_ = benchmark;
if (deterministic >= 0) ctx.cudnn_deterministic_ = deterministic;
if (allow_tf32 >= 0) ctx.cudnn_allow_tf32_ = allow_tf32;
#endif
});
......
......@@ -132,8 +132,8 @@ PYBIND11_MODULE(libdragon_python, m) {
PRINT(INFO) << GetVerboseDef(def.DebugString(), "graph");
}
}
// Return the graph name may be different from the def
// We will make a unique dummy name on creating the graph
// Return the graph name, which may differ from the def.
// We will make a unique dummy name when creating the graph.
return graph->name();
})
......@@ -175,8 +175,8 @@ PYBIND11_MODULE(libdragon_python, m) {
GraphDef init_graph, pred_graph;
onnx::ONNXBackend onnx_backend;
onnx_backend.Prepare(model_path, &init_graph, &pred_graph);
// Serializing to Python is intractable
// We should apply the initializer immediately
// Serializing to Python is intractable.
// We should apply the initializer immediately.
self->RunGraph(self->CreateGraph(init_graph)->name());
return py::bytes(pred_graph.SerializeAsString());
});
......
......@@ -24,14 +24,14 @@ PythonPluginOp<Context>::PythonPluginOp(const OperatorDef& def, Workspace* ws)
Py_Initialize();
auto* module = PyImport_ImportModule(module_name_.c_str());
CHECK(module) << "\nFailed to import module: " << module;
auto* module_dict = PyModule_GetDict(module);
auto* op_class = PyDict_GetItemString(module_dict, class_name_.c_str());
CHECK(op_class) << "\nFailed to import class: " << class_name_
<< " from module: " << module_name_;
self_ = PyObject_CallObject(op_class, NULL);
// Project inputs and outputs.
// Set inputs and outputs.
inputs_ = PyList_New(InputSize());
outputs_ = PyList_New(OutputSize());
for (int i = 0; i < InputSize(); i++) {
......@@ -41,16 +41,15 @@ PythonPluginOp<Context>::PythonPluginOp(const OperatorDef& def, Workspace* ws)
PyList_SetItem(outputs_, i, PyBytes_FromStdString(Output(i)->name()));
}
// Set: self.kwargs_str
// Attr: "kwargs_str"
PyObject_SetAttr(
self_,
PyBytes_FromRawString("kwargs_str"),
PyBytes_FromStdString(kwargs_str_));
// Method: self.setup(inputs, outputs)
if (PyObject_HasAttr(self_, PyBytes_FromRawString("setup"))) {
CHECK(PyObject_CallMethod(self_, "setup", "OO", inputs_, outputs_))
<< CallMethodHelper("setup");
<< CallMethodHelper("setup"); // Method: setup(inputs, outputs)
}
}
......@@ -67,27 +66,24 @@ string PythonPluginOp<Context>::CallMethodHelper(const string& method_name) {
template <class Context>
void PythonPluginOp<Context>::RunOnDevice() {
// GIL may have been released
// GIL may have been released.
pybind11::gil_scoped_acquire g;
// Atrribute: self.phase
// Attr: phase
PyObject_SetAttr(
self_, PyBytes_FromRawString("phase"), PyBytes_FromStdString(phase()));
// Method: self.reshape(input, outputs)
if (PyObject_HasAttr(self_, PyBytes_FromRawString("reshape"))) {
CHECK(PyObject_CallMethod(self_, "reshape", "OO", inputs_, outputs_))
<< CallMethodHelper("reshape");
<< CallMethodHelper("reshape"); // Method: reshape(input, outputs)
}
// Method: self.run(input, outputs)
// Method: self.forward(input, outputs)
if (PyObject_HasAttr(self_, PyBytes_FromRawString("forward"))) {
CHECK(PyObject_CallMethod(self_, "forward", "OO", inputs_, outputs_))
<< CallMethodHelper("forward");
<< CallMethodHelper("forward"); // Method: run(input, outputs)
} else if (PyObject_HasAttr(self_, PyBytes_FromRawString("run"))) {
CHECK(PyObject_CallMethod(self_, "run", "OO", inputs_, outputs_))
<< CallMethodHelper("run");
<< CallMethodHelper("run"); // Method: forward(input, outputs)
}
}
......
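From the operator code above, a plugin class is expected to optionally expose setup/reshape and a forward (or legacy run) method taking the input/output tensor name lists, and it receives kwargs_str and phase as attributes. A minimal skeleton; the class name and empty bodies are purely illustrative:

class MyPlugin(object):
    """Hypothetical plugin consumed by PythonPluginOp."""

    def setup(self, inputs, outputs):
        # Called once after construction; self.kwargs_str is already set.
        pass

    def reshape(self, inputs, outputs):
        # Optional: infer output shapes before each run; self.phase is set.
        pass

    def forward(self, inputs, outputs):
        # Preferred entry point; the op falls back to run() if absent.
        pass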
......@@ -13,7 +13,6 @@ void ONNXBackend::Prepare(
ModelProto onnx_model;
CHECK(ReadProtoFromBinaryFile(onnx_model_path.c_str(), &onnx_model))
<< "\nFailed to parse the onnx model.";
int opset_version = -1;
for (const auto& imp : onnx_model.opset_import()) {
if ((!imp.has_domain()) || imp.domain().empty()) {
......@@ -31,7 +30,6 @@ void ONNXBackend::Prepare(
std::cout << "Unrecognized operator set " << opset_version << std::endl;
}
}
if (opset_version < 0) {
if (onnx_model.ir_version() >= 0x00000003) {
LOG(FATAL) << "Model with IR version >= 3 "
......@@ -40,7 +38,6 @@ void ONNXBackend::Prepare(
opset_version = 1;
}
}
ONNXToDragon(onnx_model, opset_version, true, init_graph, pred_graph);
}
......@@ -52,22 +49,23 @@ void ONNXBackend::ONNXToDragon(
GraphDef* pred_graph) {
ModelProto init_model = ModelProto();
ModelProto pred_model = onnx_model;
pred_graph->set_name(onnx_model.graph().name());
init_graph->set_name(onnx_model.graph().name() + "/init");
ValueInfoMap graph_value_infos{};
InitializerMap graph_initializer{};
for (const auto& vi : onnx_model.graph().input())
graph_value_infos[vi.name()].CopyFrom(vi);
for (const auto& vi : onnx_model.graph().output())
graph_value_infos[vi.name()].CopyFrom(vi);
for (const auto& vi : onnx_model.graph().value_info())
graph_value_infos[vi.name()].CopyFrom(vi);
// Collect graph inputs.
for (const auto& v : onnx_model.graph().input()) {
graph_value_infos[v.name()].CopyFrom(v);
}
// Collect graph outputs.
for (const auto& v : onnx_model.graph().output()) {
graph_value_infos[v.name()].CopyFrom(v);
}
// Collect graph values.
for (const auto& v : onnx_model.graph().value_info()) {
graph_value_infos[v.name()].CopyFrom(v);
}
// Collect graph initializers.
for (const auto& tensor : onnx_model.graph().initializer()) {
if (include_initializers) {
auto* op_def = init_graph->add_op();
......@@ -76,16 +74,18 @@ void ONNXBackend::ONNXToDragon(
}
graph_initializer[tensor.name()] = &tensor;
}
// Convert to graph defs.
auto converter = [&](const ModelProto& model, GraphDef* graph) mutable {
for (const auto& node : model.graph().node()) {
ValueInfoMap value_infos{};
InitializerMap initializer{};
for (const auto& name : node.input()) {
if (graph_value_infos.count(name))
if (graph_value_infos.count(name)) {
value_infos[name].CopyFrom(graph_value_infos[name]);
if (graph_initializer.count(name))
}
if (graph_initializer.count(name)) {
initializer[name] = graph_initializer[name];
}
}
auto onnx_node = ONNXNode(node);
auto returns = ONNXNodeToOps(
......@@ -98,23 +98,18 @@ void ONNXBackend::ONNXToDragon(
}
}
};
converter(pred_model, pred_graph);
// Set(Initializer) + Set(Placehoders) = Set(Inputs)
// Add external inputs.
Set<string> initializer;
for (const auto& e : onnx_model.graph().initializer()) {
initializer.insert(e.name());
for (const auto& v : onnx_model.graph().initializer()) {
initializer.insert(v.name());
}
// Add External Inputs
for (const auto& e : onnx_model.graph().input()) {
if (initializer.count(e.name()) == 0) {
pred_graph->add_input(e.name());
}
}
// Add External Outputs
// Add external outputs.
for (const auto& e : onnx_model.graph().output()) {
pred_graph->add_output(e.name());
}
......
#include "dragon/operators/array/channel_shuffle_op.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
template <class Context>
template <typename T>
void ChannelShuffleOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0);
GET_OP_AXIS_ARG(axis, X.ndim(), -1);
CHECK_EQ(X.dim(axis) % group_, 0)
<< "\nThe " << X.dim(axis) << " channels "
<< "can not be split into " << group_ << " groups.";
kernels::ChannelShuffle(
X.count(0, axis),
X.count(axis + 1),
X.dim(axis),
group_,
X.template data<T, Context>(),
Y->ReshapeLike(X)->template mutable_data<T, Context>(),
ctx());
}
template <class Context>
void ChannelShuffleOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Generic>::Call(this, Input(0));
}
template <class Context>
template <typename T>
void ChannelShuffleGradientOp<Context>::DoRunWithType() {
auto &dY = Input(0), *dX = Output(0);
GET_OP_AXIS_ARG(axis, dY.ndim(), -1);
kernels::ChannelShuffle(
dY.count(0, axis),
dY.count(axis + 1),
dY.dim(axis),
dY.dim(axis) / group_,
dY.template data<T, Context>(),
dX->ReshapeLike(dY)->template mutable_data<T, Context>(),
ctx());
}
template <class Context>
void ChannelShuffleGradientOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
DEPLOY_CPU_OPERATOR(ChannelShuffle);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(ChannelShuffle);
#endif
DEPLOY_CPU_OPERATOR(ChannelShuffleGradient);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(ChannelShuffleGradient);
#endif
OPERATOR_SCHEMA(ChannelShuffle)
/* X */
.NumInputs(1)
/* Y */
.NumOutputs(1);
OPERATOR_SCHEMA(ChannelShuffleGradient)
/* dY */
.NumInputs(1)
/* dX */
.NumOutputs(1);
REGISTER_GRADIENT(ChannelShuffle, SimpleGradientMaker);
} // namespace dragon
#include "dragon/operators/array/shuffle_op.h"
#include "dragon/utils/math_functions.h"
namespace dragon {
template <class Context>
template <typename T>
void ChannelShuffleOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0);
GET_OP_AXIS_ARG(axis, X.ndim(), -1);
CHECK_EQ(X.dim(axis) % group_, 0)
<< "\nThe " << X.dim(axis) << " channels "
<< "can not be split into " << group_ << " groups.";
auto G = group_, K = X.dim(axis) / group_;
if (def().type() == "ChannelShuffleGradient") std::swap(G, K);
math::Transpose(
4,
vec64_t({X.count(0, axis), G, K, X.count(axis + 1)}).data(),
vec64_t({0, 2, 1, 3}).data(),
X.template data<T, Context>(),
Y->ReshapeLike(X)->template mutable_data<T, Context>(),
ctx());
}
DEPLOY_CPU_OPERATOR(ChannelShuffle);
REGISTER_CPU_OPERATOR(ChannelShuffleGradient, ChannelShuffleOp<CPUContext>);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(ChannelShuffle);
REGISTER_CUDA_OPERATOR(ChannelShuffleGradient, ChannelShuffleOp<CUDAContext>);
#endif
OPERATOR_SCHEMA(ChannelShuffle).NumInputs(1).NumOutputs(1);
OPERATOR_SCHEMA(ChannelShuffleGradient).NumInputs(1).NumOutputs(1);
REGISTER_GRADIENT(ChannelShuffle, SimpleGradientMaker);
} // namespace dragon
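
The rewritten operator expresses the shuffle (and, by swapping G and K, its gradient) as one 4-D transpose instead of a dedicated kernel. A minimal NumPy sketch of the same trick, assuming a dense [outer, channels, inner] layout:

```python
import numpy as np

def channel_shuffle(x, axis, group):
    """Shuffle channels via a [outer, G, K, inner] transpose (sketch)."""
    outer = int(np.prod(x.shape[:axis], dtype=np.int64))
    channels = x.shape[axis]
    inner = int(np.prod(x.shape[axis + 1:], dtype=np.int64))
    assert channels % group == 0, 'channels must be divisible by group'
    g, k = group, channels // group
    return x.reshape(outer, g, k, inner).transpose(0, 2, 1, 3).reshape(x.shape)

x = np.arange(1, 5)
print(channel_shuffle(x, axis=0, group=2))  # [1 3 2 4]
# The gradient swaps G and K; here G == K, so the same call inverts it.
print(channel_shuffle(channel_shuffle(x, 0, 2), 0, 2))  # [1 2 3 4]
```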
......@@ -10,8 +10,8 @@
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_ARRAY_CHANNEL_SHUFFLE_OP_H_
#define DRAGON_OPERATORS_ARRAY_CHANNEL_SHUFFLE_OP_H_
#ifndef DRAGON_OPERATORS_ARRAY_SHUFFLE_OP_H_
#define DRAGON_OPERATORS_ARRAY_SHUFFLE_OP_H_
#include "dragon/core/operator.h"
......@@ -25,7 +25,9 @@ class ChannelShuffleOp final : public Operator<Context> {
group_(OP_SINGLE_ARG(int64_t, "group", 1)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
void RunOnDevice() override {
DispatchHelper<dtypes::Generic>::Call(this, Input(0));
}
template <typename T>
void DoRunWithType();
......@@ -34,22 +36,6 @@ class ChannelShuffleOp final : public Operator<Context> {
int64_t group_;
};
template <class Context>
class ChannelShuffleGradientOp final : public Operator<Context> {
public:
ChannelShuffleGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
group_(OP_SINGLE_ARG(int64_t, "group", 1)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T>
void DoRunWithType();
protected:
int64_t group_;
};
} // namespace dragon
#endif // DRAGON_OPERATORS_ARRAY_CHANNEL_SHUFFLE_OP_H_
#endif // DRAGON_OPERATORS_ARRAY_SHUFFLE_OP_H_
#include "dragon/operators/array/channel_affine_op.h"
#include "dragon/operators/math/affine_op.h"
#include "dragon/core/workspace.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
template <class Context>
template <typename T>
void ChannelAffineOp<Context>::DoRunWithType() {
void AffineOp<Context>::DoRunWithType() {
auto &X = Input(0), &W = Input(1), *Y = Output(0, {0});
GET_OP_AXIS_ARG(axis, X.ndim(), -1);
GET_OP_AXIS_ARG(end_axis, X.ndim(), axis);
vec64_t affine_dims(
{X.dims().begin() + axis, X.dims().begin() + end_axis + 1});
// Compute affine dimensions.
vec64_t affine_dims;
for (auto axis : axes_) {
axis = axis < 0 ? axis + X.ndim() : axis;
CHECK(axis >= 0 && axis < X.ndim())
<< "\nExcepted the axis in [-" << X.ndim() << ", " << X.ndim()
<< "), got " << axis << ".";
affine_dims.push_back(X.dim(axis));
}
CHECK(W.dims() == affine_dims)
<< "\nExcepted the weight shape is " << Tensor::DimString(affine_dims)
<< ", got " << W.DimString() << ".";
......@@ -23,10 +27,11 @@ void ChannelAffineOp<Context>::DoRunWithType() {
<< ", got " << Input(2).DimString() << ".";
}
kernels::ChannelAffine(
X.count(0, axis),
X.count(end_axis + 1),
X.count(axis, end_axis + 1),
math::Affine(
X.ndim(),
X.dims().data(),
axes_.size(),
axes_.data(),
X.template data<T, Context>(),
W.template data<T, Context>(),
InputSize() <= 2 ? nullptr : Input(2).template data<T, Context>(),
......@@ -35,28 +40,30 @@ void ChannelAffineOp<Context>::DoRunWithType() {
}
template <class Context>
void ChannelAffineOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Numerical>::Call(this, Input(0));
}
template <class Context>
template <typename T>
void ChannelAffineGradientOp<Context>::DoRunWithType() {
void AffineGradientOp<Context>::DoRunWithType() {
auto &X = Input(0), &W = Input(1), &dY = Input(2);
auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
GET_OP_AXIS_ARG(axis, X.ndim(), -1);
GET_OP_AXIS_ARG(end_axis, X.ndim(), axis);
vec64_t affine_dims = {X.count(0, axis),
X.count(axis, end_axis + 1),
X.count(end_axis + 1)},
affine_axes = {0, 2};
// Compute reduce axes.
vec64_t reduce_axes;
for (int i = 0; i < X.ndim(); ++i) {
bool keep = true;
for (auto axis : axes_) {
axis = axis < 0 ? axis + X.ndim() : axis;
if (axis == i) keep = false;
}
if (keep) reduce_axes.push_back(i);
}
// Scratch to save the intermediates.
T* data = nullptr;
if (dW->has_name() && X.count() != W.count()) {
data = ctx()->workspace()->template data<T, Context>(X.count());
}
// dW = dY * X
if (dW->has_name()) {
Output(1)->ReshapeLike(Input(1));
auto* x = Input(0).template data<T, Context>();
auto* dw = Output(1)->template mutable_data<T, Context>();
if (X.count() == W.count()) {
math::Mul(
X.count(),
......@@ -65,20 +72,19 @@ void ChannelAffineGradientOp<Context>::DoRunWithType() {
dW->ReshapeLike(W)->template mutable_data<T, Context>(),
ctx());
} else {
T* scratch = ctx()->workspace()->template data<T, Context>(X.count());
math::Mul(
X.count(),
dY.template data<T, Context>(),
X.template data<T, Context>(),
scratch,
data,
ctx());
math::ReduceSum(
3,
affine_dims.data(),
2,
affine_axes.data(),
X.ndim(),
X.dims().data(),
reduce_axes.size(),
reduce_axes.data(),
1.f,
scratch,
data,
dW->ReshapeLike(W)->template mutable_data<T, Context>(),
ctx());
}
......@@ -90,10 +96,10 @@ void ChannelAffineGradientOp<Context>::DoRunWithType() {
dB->ReshapeLike(W)->CopyFrom(dY, ctx());
} else {
math::ReduceSum(
3,
affine_dims.data(),
2,
affine_axes.data(),
X.ndim(),
X.dims().data(),
reduce_axes.size(),
reduce_axes.data(),
1.f,
dY.template data<T, Context>(),
dB->ReshapeLike(W)->template mutable_data<T, Context>(),
......@@ -103,11 +109,11 @@ void ChannelAffineGradientOp<Context>::DoRunWithType() {
// dX = dY * W
if (dX->has_name()) {
Output(0)->ReshapeLike(Input(-1));
kernels::ChannelAffine(
X.count(0, axis),
X.count(end_axis + 1),
X.count(axis, end_axis + 1),
math::Affine(
X.ndim(),
X.dims().data(),
axes_.size(),
axes_.data(),
dY.template data<T, Context>(),
W.template data<T, Context>(),
(const T*)nullptr,
......@@ -116,22 +122,17 @@ void ChannelAffineGradientOp<Context>::DoRunWithType() {
}
}
template <class Context>
void ChannelAffineGradientOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
DEPLOY_CPU_OPERATOR(ChannelAffine);
DEPLOY_CPU_OPERATOR(Affine);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(ChannelAffine);
DEPLOY_CUDA_OPERATOR(Affine);
#endif
DEPLOY_CPU_OPERATOR(ChannelAffineGradient);
DEPLOY_CPU_OPERATOR(AffineGradient);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(ChannelAffineGradient);
DEPLOY_CUDA_OPERATOR(AffineGradient);
#endif
OPERATOR_SCHEMA(ChannelAffine)
OPERATOR_SCHEMA(Affine)
/* X, W, B */
.NumInputs(2, 3)
/* Y */
......@@ -139,7 +140,7 @@ OPERATOR_SCHEMA(ChannelAffine)
/* X => Y */
.AllowInplace({{0, 0}});
OPERATOR_SCHEMA(ChannelAffineGradient)
OPERATOR_SCHEMA(AffineGradient)
/* X, W, dY */
.NumInputs(3)
/* dX, dW, dB */
......@@ -163,6 +164,6 @@ class GradientMaker final : public GradientMakerBase {
} // namespace
REGISTER_GRADIENT(ChannelAffine, GradientMaker);
REGISTER_GRADIENT(Affine, GradientMaker);
} // namespace dragon
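
For the gradient, dW (and dB) is obtained by reducing dY * X (and dY) over every axis not listed in ``axes``, which is what the ``reduce_axes`` loop above collects. A NumPy sketch of the forward pass and the dW reduction, assuming W is shaped like the selected axes of X:

```python
import numpy as np

def affine_forward(x, w, b, axes):
    """y = x * w + b with w, b broadcast along `axes` (sketch)."""
    axes = [a % x.ndim for a in axes]
    shape = [x.shape[a] if a in axes else 1 for a in range(x.ndim)]
    return x * w.reshape(shape) + b.reshape(shape)

def affine_grad_w(x, dy, axes):
    """dW = reduce_sum(dy * x) over the non-affine axes (sketch)."""
    axes = [a % x.ndim for a in axes]
    reduce_axes = tuple(a for a in range(x.ndim) if a not in axes)
    return (dy * x).sum(axis=reduce_axes)

x = np.random.rand(2, 3, 4).astype('float32')
w, b = np.full(3, 2., 'float32'), np.zeros(3, 'float32')
print(affine_forward(x, w, b, axes=[1]).shape)            # (2, 3, 4)
print(affine_grad_w(x, np.ones_like(x), axes=[1]).shape)  # (3,)
```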
......@@ -10,37 +10,49 @@
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_ARRAY_CHANNEL_AFFINE_OP_H_
#define DRAGON_OPERATORS_ARRAY_CHANNEL_AFFINE_OP_H_
#ifndef DRAGON_OPERATORS_MATH_AFFINE_OP_H_
#define DRAGON_OPERATORS_MATH_AFFINE_OP_H_
#include "dragon/core/operator.h"
namespace dragon {
template <class Context>
class ChannelAffineOp final : public Operator<Context> {
class AffineOp final : public Operator<Context> {
public:
SIMPLE_CTOR_DTOR(ChannelAffineOp);
AffineOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws), axes_(OP_REPEATED_ARG(int64_t, "axes")) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T>
void DoRunWithType();
protected:
vec64_t axes_;
};
template <class Context>
class ChannelAffineGradientOp final : public Operator<Context> {
class AffineGradientOp final : public Operator<Context> {
public:
SIMPLE_CTOR_DTOR(ChannelAffineGradientOp);
AffineGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws), axes_(OP_REPEATED_ARG(int64_t, "axes")) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T>
void DoRunWithType();
protected:
vec64_t axes_;
};
} // namespace dragon
#endif // DRAGON_OPERATORS_ARRAY_CHANNEL_AFFINE_OP_H_
#endif // DRAGON_OPERATORS_MATH_AFFINE_OP_H_
......@@ -26,6 +26,7 @@ DISPATCH_WITH_TENSOR_TYPES(IsInf, dtypes::Floating, Input(0));
DISPATCH_WITH_TENSOR_TYPES(IsNaN, dtypes::Floating, Input(0));
DISPATCH_WITH_TENSOR_TYPES(IsFinite, dtypes::Floating, Input(0));
DISPATCH_WITH_TENSOR_TYPES(Pow, dtypes::Floating, Input(0));
DISPATCH_WITH_TENSOR_TYPES(Atan2, dtypes::Floating, Input(0));
DISPATCH_WITH_TENSOR_TYPES(Minimum, dtypes::Numerical, Input(0));
DISPATCH_WITH_TENSOR_TYPES(Maximum, dtypes::Numerical, Input(0));
DISPATCH_WITH_TENSOR_TYPES(BitwiseNot, dtypes::Bitwise, Input(0));
......@@ -120,6 +121,7 @@ DEFINE_INPLACE_UNARY_OP_IMPL(BitwiseNot, T);
}
DEFINE_SIMPLE_BINARY_OP_IMPL(Pow, T);
DEFINE_SIMPLE_BINARY_OP_IMPL(Atan2, T);
DEFINE_SIMPLE_BINARY_OP_IMPL(Minimum, T);
DEFINE_SIMPLE_BINARY_OP_IMPL(Maximum, T);
DEFINE_SIMPLE_BINARY_OP_IMPL(BitwiseAnd, T);
......@@ -152,6 +154,7 @@ DEPLOY_CPU_OPERATOR(IsInf);
DEPLOY_CPU_OPERATOR(IsNaN);
DEPLOY_CPU_OPERATOR(IsFinite);
DEPLOY_CPU_OPERATOR(Pow);
DEPLOY_CPU_OPERATOR(Atan2);
DEPLOY_CPU_OPERATOR(Minimum);
DEPLOY_CPU_OPERATOR(Maximum);
DEPLOY_CPU_OPERATOR(BitwiseNot);
......@@ -186,6 +189,7 @@ DEPLOY_CUDA_OPERATOR(IsInf);
DEPLOY_CUDA_OPERATOR(IsNaN);
DEPLOY_CUDA_OPERATOR(IsFinite);
DEPLOY_CUDA_OPERATOR(Pow);
DEPLOY_CUDA_OPERATOR(Atan2);
DEPLOY_CUDA_OPERATOR(Minimum);
DEPLOY_CUDA_OPERATOR(Maximum);
DEPLOY_CUDA_OPERATOR(BitwiseNot);
......@@ -222,6 +226,7 @@ OPERATOR_SCHEMA(IsNaN).NumInputs(1).NumOutputs(1);
OPERATOR_SCHEMA(IsFinite).NumInputs(1).NumOutputs(1);
OPERATOR_SCHEMA(Not).NumInputs(1).NumOutputs(1);
OPERATOR_SCHEMA(Pow).NumInputs(2).NumOutputs(1);
OPERATOR_SCHEMA(Atan2).NumInputs(2).NumOutputs(1);
OPERATOR_SCHEMA(Minimum).NumInputs(2).NumOutputs(1);
OPERATOR_SCHEMA(Maximum).NumInputs(2).NumOutputs(1);
OPERATOR_SCHEMA(BitwiseAnd)
......@@ -250,6 +255,7 @@ NO_GRADIENT(Round);
NO_GRADIENT(IsInf);
NO_GRADIENT(IsNaN);
NO_GRADIENT(IsFinite);
NO_GRADIENT(Atan2);
NO_GRADIENT(BitwiseNot);
NO_GRADIENT(BitwiseAnd);
NO_GRADIENT(BitwiseOr);
......
......@@ -70,7 +70,7 @@ inline vec32_t CheckOutputAliases(
return available_aliases;
}
// Unary ElementwiseOp
// Unary ElementwiseOp.
DECLARE_ELEMENTWISE_OP(Abs);
DECLARE_ELEMENTWISE_OP(Ceil);
DECLARE_ELEMENTWISE_OP(Cos);
......@@ -101,12 +101,13 @@ DECLARE_ELEMENTWISE_OP(SignGradient);
DECLARE_ELEMENTWISE_OP(SinGradient);
DECLARE_ELEMENTWISE_OP(SqrtGradient);
DECLARE_ELEMENTWISE_OP(SquareGradient);
// Binary ElementwiseOp
// Binary ElementwiseOp.
DECLARE_ELEMENTWISE_OP(Add);
DECLARE_ELEMENTWISE_OP(Sub);
DECLARE_ELEMENTWISE_OP(Mul);
DECLARE_ELEMENTWISE_OP(Div);
DECLARE_ELEMENTWISE_OP(Pow);
DECLARE_ELEMENTWISE_OP(Atan2);
DECLARE_ELEMENTWISE_OP(Minimum);
DECLARE_ELEMENTWISE_OP(Maximum);
DECLARE_ELEMENTWISE_OP(BitwiseAnd);
......@@ -128,7 +129,7 @@ DECLARE_ELEMENTWISE_OP(DivGradient);
DECLARE_ELEMENTWISE_OP(PowGradient);
DECLARE_ELEMENTWISE_OP(MinimumGradient);
DECLARE_ELEMENTWISE_OP(MaximumGradient);
// Trinary ElementwiseOp
// Trinary ElementwiseOp.
DECLARE_ELEMENTWISE_OP(Where);
DECLARE_ELEMENTWISE_OP(WhereGradient);
#undef DECLARE_ELEMENTWISE_OP
......
......@@ -199,11 +199,6 @@ void MatMulOp<Context>::DoRunWithType() {
}
template <class Context>
void MatMulOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <class Context>
template <typename T>
void MatMulGradientOp<Context>::DoRunWithType() {
auto &A = Input(0), &B = Input(1), &dY = Input(2);
......@@ -590,11 +585,6 @@ void MatMulGradientOp<Context>::DoRunWithType() {
}
}
template <class Context>
void MatMulGradientOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
DEPLOY_CPU_OPERATOR(MatMul);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(MatMul);
......
......@@ -23,7 +23,9 @@ class MatMulOp final : public Operator<Context> {
SIMPLE_CTOR_DTOR(MatMulOp);
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T>
void DoRunWithType();
......@@ -35,7 +37,9 @@ class MatMulGradientOp final : public Operator<Context> {
SIMPLE_CTOR_DTOR(MatMulGradientOp);
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T>
void DoRunWithType();
......
#include "dragon/operators/array/channel_normalize_op.h"
#include "dragon/operators/normalization/channel_norm_op.h"
#include "dragon/core/workspace.h"
#include "dragon/utils/op_kernels.h"
......@@ -6,7 +6,7 @@ namespace dragon {
template <class Context>
template <typename InputT, typename OutputT>
void ChannelNormalizeOp<Context>::DoRunWithTypeAndCast() {
void ChannelNormOp<Context>::DoRunWithTypeAndCast() {
auto &X = Input(0), *Y = Output(0);
GET_OP_AXIS_ARG(axis, X.ndim(), -1);
......@@ -30,7 +30,7 @@ void ChannelNormalizeOp<Context>::DoRunWithTypeAndCast() {
<< "\nProviding " << X_mean_.count() << " values to normalize Dimension("
<< Y_dims[axis] << ").";
kernels::ChannelNormalize(
kernels::ChannelNorm(
axis,
num_dims,
X_strides.data(),
......@@ -44,7 +44,7 @@ void ChannelNormalizeOp<Context>::DoRunWithTypeAndCast() {
template <class Context>
template <typename T>
void ChannelNormalizeOp<Context>::DoRunWithType() {
void ChannelNormOp<Context>::DoRunWithType() {
if (data_type() == "float16") {
DoRunWithTypeAndCast<T, float16>();
} else if (data_type() == "float32") {
......@@ -58,21 +58,21 @@ void ChannelNormalizeOp<Context>::DoRunWithType() {
}
template <class Context>
void ChannelNormalizeOp<Context>::RunOnDevice() {
void ChannelNormOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Numerical>::Call(this, Input(0));
}
DEPLOY_CPU_OPERATOR(ChannelNormalize);
DEPLOY_CPU_OPERATOR(ChannelNorm);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(ChannelNormalize);
DEPLOY_CUDA_OPERATOR(ChannelNorm);
#endif
OPERATOR_SCHEMA(ChannelNormalize)
OPERATOR_SCHEMA(ChannelNorm)
/* X */
.NumInputs(1)
/* Y */
.NumOutputs(1);
NO_GRADIENT(ChannelNormalize);
NO_GRADIENT(ChannelNorm);
} // namespace dragon
......@@ -10,17 +10,17 @@
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_ARRAY_CHANNEL_NORMALIZE_OP_H_
#define DRAGON_OPERATORS_ARRAY_CHANNEL_NORMALIZE_OP_H_
#ifndef DRAGON_OPERATORS_NORMALIZATION_CHANNEL_NORM_OP_H_
#define DRAGON_OPERATORS_NORMALIZATION_CHANNEL_NORM_OP_H_
#include "dragon/core/operator.h"
namespace dragon {
template <class Context>
class ChannelNormalizeOp final : public Operator<Context> {
class ChannelNormOp final : public Operator<Context> {
public:
ChannelNormalizeOp(const OperatorDef& def, Workspace* ws)
ChannelNormOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws) {
INITIALIZE_OP_REPEATED_ARG(int64_t, perm);
auto mean = OP_REPEATED_ARG(float, "mean");
......@@ -50,8 +50,8 @@ class ChannelNormalizeOp final : public Operator<Context> {
DECLARE_OP_REPEATED_ARG(int64_t, perm);
};
DEFINE_OP_REPEATED_ARG(int64_t, ChannelNormalizeOp, perm);
DEFINE_OP_REPEATED_ARG(int64_t, ChannelNormOp, perm);
} // namespace dragon
#endif // DRAGON_OPERATORS_ARRAY_CHANNEL_NORMALIZE_OP_H_
#endif // DRAGON_OPERATORS_NORMALIZATION_CHANNEL_NORM_OP_H_
#include "dragon/operators/normalization/lp_normalize_op.h"
#include "dragon/core/workspace.h"
#include "dragon/utils/math_functions.h"
#include "dragon/operators/normalization/lp_norm_op.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
template <class Context>
template <typename T>
void LpNormalizeOp<Context>::DoRunWithType() {
void LpNormOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0);
GET_OP_AXIS_ARG(axis, X.ndim(), -1);
GET_OP_AXIS_ARG(end_axis, X.ndim(), axis);
auto reduce_dim = X.count(axis, end_axis + 1);
// Normalize input with a scaled Lp-norm
if (p_ == 1) {
kernels::L1Normalize(
kernels::L1Norm(
X.count(0, axis),
X.count(end_axis + 1),
reduce_dim,
......@@ -25,7 +22,7 @@ void LpNormalizeOp<Context>::DoRunWithType() {
Y->ReshapeLike(X)->template mutable_data<T, Context>(),
ctx());
} else if (p_ == 2) {
kernels::L2Normalize(
kernels::L2Norm(
X.count(0, axis),
X.count(end_axis + 1),
reduce_dim,
......@@ -40,20 +37,15 @@ void LpNormalizeOp<Context>::DoRunWithType() {
}
template <class Context>
void LpNormalizeOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <class Context>
template <typename T>
void LpNormalizeGradientOp<Context>::DoRunWithType() {
void LpNormGradientOp<Context>::DoRunWithType() {
auto &X = Input(0), &dY = Input(1), *dX = Output(0);
GET_OP_AXIS_ARG(axis, X.ndim(), -1);
GET_OP_AXIS_ARG(end_axis, X.ndim(), axis);
auto reduce_dim = X.count(axis, end_axis + 1);
if (p_ == 1) {
kernels::L1NormalizeGrad(
kernels::L1NormGrad(
X.count(0, axis),
X.count(end_axis + 1),
reduce_dim,
......@@ -64,7 +56,7 @@ void LpNormalizeGradientOp<Context>::DoRunWithType() {
dX->ReshapeLike(X)->template mutable_data<T, Context>(),
ctx());
} else if (p_ == 2) {
kernels::L2NormalizeGrad(
kernels::L2NormGrad(
X.count(0, axis),
X.count(end_axis + 1),
reduce_dim,
......@@ -79,33 +71,28 @@ void LpNormalizeGradientOp<Context>::DoRunWithType() {
}
}
template <class Context>
void LpNormalizeGradientOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
DEPLOY_CPU_OPERATOR(LpNormalize);
DEPLOY_CPU_OPERATOR(LpNorm);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(LpNormalize);
DEPLOY_CUDA_OPERATOR(LpNorm);
#endif
DEPLOY_CPU_OPERATOR(LpNormalizeGradient);
DEPLOY_CPU_OPERATOR(LpNormGradient);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(LpNormalizeGradient);
DEPLOY_CUDA_OPERATOR(LpNormGradient);
#endif
OPERATOR_SCHEMA(LpNormalize)
OPERATOR_SCHEMA(LpNorm)
/* X */
.NumInputs(1)
/* Y */
.NumOutputs(1);
OPERATOR_SCHEMA(LpNormalizeGradient)
OPERATOR_SCHEMA(LpNormGradient)
/* X, dY */
.NumInputs(2)
/* dX */
.NumOutputs(1);
REGISTER_GRADIENT(LpNormalize, GenericGradientMaker);
REGISTER_GRADIENT(LpNorm, GenericGradientMaker);
} // namespace dragon
......@@ -10,24 +10,26 @@
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_NORMALIZATION_LP_NORMALIZE_OP_H_
#define DRAGON_OPERATORS_NORMALIZATION_LP_NORMALIZE_OP_H_
#ifndef DRAGON_OPERATORS_NORMALIZATION_LP_NORM_OP_H_
#define DRAGON_OPERATORS_NORMALIZATION_LP_NORM_OP_H_
#include "dragon/core/operator.h"
namespace dragon {
template <class Context>
class LpNormalizeOp final : public Operator<Context> {
class LpNormOp final : public Operator<Context> {
public:
LpNormalizeOp(const OperatorDef& def, Workspace* ws)
LpNormOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
p_(OP_SINGLE_ARG(int64_t, "p", 2)),
epsilon_(OP_SINGLE_ARG(double, "epsilon", 1e-12)),
reduction_(OP_SINGLE_ARG(string, "reduction", "SUM")) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T>
void DoRunWithType();
......@@ -39,16 +41,18 @@ class LpNormalizeOp final : public Operator<Context> {
};
template <class Context>
class LpNormalizeGradientOp final : public Operator<Context> {
class LpNormGradientOp final : public Operator<Context> {
public:
LpNormalizeGradientOp(const OperatorDef& def, Workspace* ws)
LpNormGradientOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
p_(OP_SINGLE_ARG(int64_t, "p", 2)),
epsilon_(OP_SINGLE_ARG(double, "epsilon", 1e-12)),
reduction_(OP_SINGLE_ARG(string, "reduction", "SUM")) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T>
void DoRunWithType();
......@@ -61,4 +65,4 @@ class LpNormalizeGradientOp final : public Operator<Context> {
} // namespace dragon
#endif // DRAGON_OPERATORS_NORMALIZATION_LP_NORMALIZE_OP_H_
#endif // DRAGON_OPERATORS_NORMALIZATION_LP_NORM_OP_H_
......@@ -5,46 +5,41 @@
namespace dragon {
template <class Context>
void AdamOp<Context>::ComputeUpdate(Tensor* dX, Tensor* /* X */) {
template <typename T, typename CopyT>
void AdamOp<Context>::DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y) {
kernels::Adam(
dX->count(),
lr_ * correction_,
beta1_,
beta2_,
eps_,
dX->template mutable_data<float, Context>(),
Slot("m")->ReshapeLike(*dX)->template mutable_data<float, Context>(),
Slot("v")->ReshapeLike(*dX)->template mutable_data<float, Context>(),
this->weight_decay_,
X->template data<T, Context>(),
dX->template mutable_data<T, Context>(),
GetState("m")->ReshapeLike(*dX)->template mutable_data<T, Context>(),
GetState("v")->ReshapeLike(*dX)->template mutable_data<T, Context>(),
X->template mutable_data<T, Context>(),
Y ? Y->template mutable_data<CopyT, Context>() : (CopyT*)nullptr,
ctx());
}
template <class Context>
void AdamWOp<Context>::ComputeUpdate(Tensor* dX, Tensor* X) {
if (lambda_ > 0.f) {
kernels::AdamW(
dX->count(),
lr_ * correction_,
beta1_,
beta2_,
eps_,
this->lr_ * lambda_,
X->template data<float, Context>(),
dX->template mutable_data<float, Context>(),
Slot("m")->ReshapeLike(*dX)->template mutable_data<float, Context>(),
Slot("v")->ReshapeLike(*dX)->template mutable_data<float, Context>(),
ctx());
} else {
kernels::Adam(
dX->count(),
lr_ * correction_,
beta1_,
beta2_,
eps_,
dX->template mutable_data<float, Context>(),
Slot("m")->ReshapeLike(*dX)->template mutable_data<float, Context>(),
Slot("v")->ReshapeLike(*dX)->template mutable_data<float, Context>(),
ctx());
}
template <typename T, typename CopyT>
void AdamWOp<Context>::DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y) {
kernels::AdamW(
dX->count(),
lr_ * correction_,
beta1_,
beta2_,
eps_,
lr_ * this->weight_decay_,
X->template data<T, Context>(),
dX->template mutable_data<T, Context>(),
GetState("m")->ReshapeLike(*dX)->template mutable_data<T, Context>(),
GetState("v")->ReshapeLike(*dX)->template mutable_data<T, Context>(),
X->template mutable_data<T, Context>(),
Y ? Y->template mutable_data<CopyT, Context>() : (CopyT*)nullptr,
ctx());
}
DEPLOY_CPU_OPERATOR(Adam);
......
......@@ -5,16 +5,21 @@
namespace dragon {
template <class Context>
void RMSpropOp<Context>::ComputeUpdate(Tensor* dX, Tensor* /* X */) {
template <typename T, typename CopyT>
void RMSpropOp<Context>::DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y) {
kernels::RMSprop(
dX->count(),
lr_,
momentum_,
decay_,
alpha_,
eps_,
dX->template mutable_data<float, Context>(),
Slot("m")->ReshapeLike(*dX)->template mutable_data<float, Context>(),
Slot("v")->ReshapeLike(*dX)->template mutable_data<float, Context>(),
this->weight_decay_,
X->template data<T, Context>(),
dX->template mutable_data<T, Context>(),
GetState("m")->ReshapeLike(*dX)->template mutable_data<T, Context>(),
GetState("v")->ReshapeLike(*dX)->template mutable_data<T, Context>(),
X->template mutable_data<T, Context>(),
Y ? Y->template mutable_data<CopyT, Context>() : (CopyT*)nullptr,
ctx());
}
......
......@@ -6,33 +6,44 @@
namespace dragon {
template <class Context>
void MomentumSGDOp<Context>::ComputeUpdate(Tensor* dX, Tensor* /* X */) {
template <typename T, typename CopyT>
void MomentumSGDOp<Context>::DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y) {
kernels::MomentumSGD(
dX->count(),
lr_,
momentum_,
dX->template mutable_data<float, Context>(),
Slot("m")->ReshapeLike(*dX)->template mutable_data<float, Context>(),
this->weight_decay_,
X->template data<T, Context>(),
dX->template mutable_data<T, Context>(),
GetState("m")->ReshapeLike(*dX)->template mutable_data<T, Context>(),
X->template mutable_data<T, Context>(),
Y ? Y->template mutable_data<CopyT, Context>() : (CopyT*)nullptr,
ctx());
}
template <class Context>
void NesterovSGDOp<Context>::ComputeUpdate(Tensor* dX, Tensor* /* X */) {
template <typename T, typename CopyT>
void NesterovSGDOp<Context>::DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y) {
kernels::NesterovSGD(
dX->count(),
lr_,
momentum_,
dX->template mutable_data<float, Context>(),
Slot("m")->ReshapeLike(*dX)->template mutable_data<float, Context>(),
this->weight_decay_,
X->template data<T, Context>(),
dX->template mutable_data<T, Context>(),
GetState("m")->ReshapeLike(*dX)->template mutable_data<T, Context>(),
X->template mutable_data<T, Context>(),
Y ? Y->template mutable_data<CopyT, Context>() : (CopyT*)nullptr,
ctx());
}
template <class Context>
void LARSOp<Context>::ComputeUpdate(Tensor* dX, Tensor* X) {
template <typename T, typename CopyT>
void LARSOp<Context>::DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y) {
float trust_ratio = 0.f;
if (trust_coef_ > 0.f) {
auto* x = X->template data<float, Context>();
auto* dx = dX->template mutable_data<float, Context>();
auto* x = X->template data<T, Context>();
auto* dx = dX->template mutable_data<T, Context>();
float x_norm = std::sqrt(math::Dot(X->count(), x, x, ctx()));
float dx_norm = std::sqrt(math::Dot(dX->count(), dx, dx, ctx()));
if (x_norm > 0.f && dx_norm > 0.f) {
......@@ -43,16 +54,20 @@ void LARSOp<Context>::ComputeUpdate(Tensor* dX, Tensor* X) {
math::Scale(
dX->count(),
trust_ratio,
dX->template data<float, Context>(),
dX->template mutable_data<float, Context>(),
dX->template data<T, Context>(),
dX->template mutable_data<T, Context>(),
ctx());
}
kernels::MomentumSGD(
dX->count(),
lr_,
momentum_,
dX->template mutable_data<float, Context>(),
Slot("m")->ReshapeLike(*dX)->template mutable_data<float, Context>(),
this->weight_decay_,
X->template data<T, Context>(),
dX->template mutable_data<T, Context>(),
GetState("m")->ReshapeLike(*dX)->template mutable_data<T, Context>(),
X->template mutable_data<T, Context>(),
Y ? Y->template mutable_data<CopyT, Context>() : (CopyT*)nullptr,
ctx());
}
......
......@@ -37,17 +37,14 @@ class UpdateOpBase : public Operator<Context> {
void RunOnDevice() override;
template <typename T>
void TransformGrad(Tensor* dX, Tensor* X);
void TransformGrad(Tensor* dX);
virtual void ComputeUpdate(Tensor* dX, Tensor* X) = 0;
template <typename T>
void ApplyUpdate(Tensor* dX, Tensor* X);
virtual void ApplyUpdate(Tensor* dX, Tensor* X, Tensor* Y) = 0;
template <typename T>
T GetHyper(const string& key);
Tensor* Slot(const string& key);
Tensor* GetState(const string& key);
protected:
int weight_index_;
......@@ -55,9 +52,26 @@ class UpdateOpBase : public Operator<Context> {
float clip_norm_, clip_value_;
};
#define USE_UPDATE_FUNCTIONS \
using UpdateOpBase<Context>::GetHyper; \
using UpdateOpBase<Context>::Slot
#define USE_UPDATE_FUNCTIONS \
using UpdateOpBase<Context>::GetHyper; \
using UpdateOpBase<Context>::GetState; \
void ApplyUpdate(Tensor* dX, Tensor* X, Tensor* Y) override { \
if (dX->template IsType<float>()) { \
if (Y == nullptr) { \
DoRunWithType<float, float>(dX, X, Y); \
} else if (Y->template IsType<float16>()) { \
DoRunWithType<float, float16>(dX, X, Y); \
} else { \
LOG(FATAL) << MessageForUnsupported( \
dtypes::to_string(Y->meta()), {"float16", "float32"}); \
} \
} else if (dX->template IsType<double>()) { \
DoRunWithType<double, double>(dX, X, Y); \
} else { \
LOG(FATAL) << MessageForUnsupported( \
dtypes::to_string(dX->meta()), {"float32", "float64"}); \
} \
}
template <class Context>
class MomentumSGDOp final : public UpdateOpBase<Context> {
......@@ -73,7 +87,8 @@ class MomentumSGDOp final : public UpdateOpBase<Context> {
UpdateOpBase<Context>::GetArguments();
}
void ComputeUpdate(Tensor* dX, Tensor* /* X */) override;
template <typename T, typename CopyT>
void DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y);
protected:
float lr_, momentum_;
......@@ -93,7 +108,8 @@ class NesterovSGDOp final : public UpdateOpBase<Context> {
UpdateOpBase<Context>::GetArguments();
}
void ComputeUpdate(Tensor* dX, Tensor* /* X */) override;
template <typename T, typename CopyT>
void DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y);
protected:
float lr_, momentum_;
......@@ -110,15 +126,16 @@ class RMSpropOp final : public UpdateOpBase<Context> {
void GetArguments() override {
lr_ = this->template GetHyper<float>("lr");
momentum_ = this->template GetHyper<float>("momentum");
decay_ = this->template GetHyper<float>("decay");
alpha_ = this->template GetHyper<float>("alpha");
eps_ = this->template GetHyper<float>("eps");
UpdateOpBase<Context>::GetArguments();
}
void ComputeUpdate(Tensor* dX, Tensor* /* X */) override;
template <typename T, typename CopyT>
void DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y);
protected:
float lr_, momentum_, decay_, eps_;
float lr_, momentum_, alpha_, eps_;
};
template <class Context>
......@@ -139,7 +156,8 @@ class AdamOp : public UpdateOpBase<Context> {
UpdateOpBase<Context>::GetArguments();
}
void ComputeUpdate(Tensor* dX, Tensor* /* X */) override;
template <typename T, typename CopyT>
void DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y);
protected:
int64_t t_;
......@@ -163,16 +181,15 @@ class AdamWOp final : public UpdateOpBase<Context> {
t_++;
correction_ = sqrt(1.f - pow(beta2_, t_)) / (1.f - pow(beta1_, t_));
UpdateOpBase<Context>::GetArguments();
lambda_ = this->weight_decay_;
this->weight_decay_ = 0.f;
}
void ComputeUpdate(Tensor* dX, Tensor* X) override;
template <typename T, typename CopyT>
void DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y);
protected:
int64_t t_;
float lr_, beta1_, beta2_;
float eps_, correction_, lambda_;
float eps_, correction_;
};
template <class Context>
......@@ -190,14 +207,13 @@ class LARSOp final : public UpdateOpBase<Context> {
UpdateOpBase<Context>::GetArguments();
}
void ComputeUpdate(Tensor* dX, Tensor* X) override;
template <typename T, typename CopyT>
void DoRunWithType(Tensor* dX, Tensor* X, Tensor* Y);
protected:
float lr_, momentum_, trust_coef_;
};
#undef USE_UPDATE_FUNCTIONS
} // namespace dragon
#endif // DRAGON_OPERATORS_TRAINING_UPDATE_OP_H_
......@@ -13,67 +13,40 @@ T UpdateOpBase<Context>::GetHyper(const string& key) {
}
template <class Context>
Tensor* UpdateOpBase<Context>::Slot(const string& key) {
Tensor* UpdateOpBase<Context>::GetState(const string& key) {
const string& weight_name = Output(weight_index_)->name();
return workspace()->CreateTensor(name() + "/" + weight_name + "/" + key);
}
template <class Context>
template <typename T>
void UpdateOpBase<Context>::TransformGrad(Tensor* dX, Tensor* X) {
// Scale.
void UpdateOpBase<Context>::TransformGrad(Tensor* dX) {
if (grad_scale_ != 1.f) {
auto* dx = dX->template mutable_data<T, Context>();
math::Scale(dX->count(), grad_scale_, dx, dx, ctx());
}
// Clip.
if (clip_norm_ > 0.f) {
auto* dx = dX->template mutable_data<T, Context>();
float grad_norm = std::sqrt(math::Dot(dX->count(), dx, dx, ctx()));
if (grad_norm > clip_norm_) {
math::Scale(dX->count(), clip_norm_ / grad_norm, dx, dx, ctx());
float norm = std::sqrt(math::Dot(dX->count(), dx, dx, ctx()));
if (norm > clip_norm_) {
math::Scale(dX->count(), clip_norm_ / norm, dx, dx, ctx());
}
} else if (clip_value_ > 0.f) {
auto* dx = dX->template mutable_data<T, Context>();
kernels::Clip(dX->count(), -clip_value_, clip_value_, dx, dx, ctx());
}
// Penalty.
if (weight_decay_ > 0.f) {
math::Axpy(
X->count(),
weight_decay_,
X->template data<T, Context>(),
dX->template mutable_data<T, Context>(),
ctx());
}
}
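
With the penalty branch removed, ``TransformGrad`` only scales and clips the gradient; the decay is applied inside the fused update kernels through ``weight_decay_``. A NumPy sketch of the remaining transform, assuming the same scale-then-clip ordering:

```python
import numpy as np

def transform_grad(dx, grad_scale=1., clip_norm=0., clip_value=0.):
    """Scale, then clip by global L2 norm or by value (sketch)."""
    if grad_scale != 1.:
        dx = dx * grad_scale
    if clip_norm > 0.:
        norm = float(np.sqrt(np.dot(dx.ravel(), dx.ravel())))
        if norm > clip_norm:
            dx = dx * (clip_norm / norm)
    elif clip_value > 0.:
        dx = np.clip(dx, -clip_value, clip_value)
    return dx

g = np.array([3., 4.])
print(transform_grad(g, clip_norm=1.))    # [0.6 0.8]
print(transform_grad(g, clip_value=3.5))  # [3.  3.5]
```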
template <class Context>
template <typename T>
void UpdateOpBase<Context>::ApplyUpdate(Tensor* dX, Tensor* X) {
math::Sub(
X->count(),
X->template data<T, Context>(),
dX->template data<T, Context>(),
X->template mutable_data<T, Context>(),
ctx());
}
template <class Context>
void UpdateOpBase<Context>::RunOnDevice() {
GetArguments();
for (int i = 0; i < InputSize(); ++i) {
weight_index_ = i;
auto &dX = Input(i), *X = Output(i);
if (dX.count() == 0 || X->count() == 0) return;
for (weight_index_ = 0; weight_index_ < InputSize(); ++weight_index_) {
auto &dX = Input(weight_index_), *X = Output(weight_index_);
if (dX.count() == 0 || X->count() == 0) continue;
CHECK(dX.dims() == X->dims())
<< "\nWeight and grad should have the same dimensions."
<< "\nGot" << X->DimString() << " and " << dX.DimString();
if (dX.template IsType<float>()) {
TransformGrad<float>(&dX, X);
ComputeUpdate(&dX, X);
ApplyUpdate<float>(&dX, X);
} else if (dX.template IsType<float16>()) {
if (dX.template IsType<float16>()) {
auto* X_master = workspace()->CreateTensor(X->name() + "_master");
auto* X_grad = ctx()->workspace()->CreateTensor("BufferShared");
if (X_master->count() != X->count()) {
......@@ -88,17 +61,17 @@ void UpdateOpBase<Context>::RunOnDevice() {
dX.template data<float16, Context>(),
X_grad->ReshapeLike(dX)->template mutable_data<float, Context>(),
ctx());
TransformGrad<float>(X_grad, X_master);
ComputeUpdate(X_grad, X_master);
ApplyUpdate<float>(X_grad, X_master);
math::Cast(
X->count(),
X_master->template data<float, Context>(),
X->template mutable_data<float16, Context>(),
ctx());
TransformGrad<float>(X_grad);
ApplyUpdate(X_grad, X_master, X);
} else if (dX.template IsType<float>()) {
TransformGrad<float>(&dX);
ApplyUpdate(&dX, X, nullptr);
} else if (dX.template IsType<double>()) {
TransformGrad<double>(&dX);
ApplyUpdate(&dX, X, nullptr);
} else {
LOG(FATAL) << MessageForUnsupported(
dtypes::to_string(dX.meta()), {"float16", "float32"});
dtypes::to_string(dX.meta()), {"float16", "float32", "float64"});
}
}
}
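
For float16 weights, the loop keeps a float32 master copy, casts the gradient up, runs the fused update on the master, and writes the converted float16 weight through the extra ``Y`` output. A NumPy sketch of that flow, using a plain SGD step as a stand-in for the fused kernel:

```python
import numpy as np

def fp16_update_step(w_fp16, g_fp16, w_master, lr=0.1):
    """Update the float32 master copy and mirror it back to float16 (sketch)."""
    g = g_fp16.astype(np.float32)               # cast gradient to float32
    w_master -= lr * g                          # update runs in float32
    w_fp16[...] = w_master.astype(np.float16)   # converted copy (the Y output)
    return w_fp16, w_master

w16 = np.array([1., 2.], dtype=np.float16)
master = w16.astype(np.float32)
g16 = np.array([.5, .5], dtype=np.float16)
w16, master = fp16_update_step(w16, g16, master)
print(master)  # [0.95 1.95]
print(w16)     # float16 mirror of the master weights
```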
......
......@@ -58,9 +58,6 @@ from dragon.core.ops import tensor_ops as _
from dragon.core.ops.array_ops import assign
from dragon.core.ops.array_ops import boolean_mask
from dragon.core.ops.array_ops import broadcast_to
from dragon.core.ops.array_ops import channel_affine
from dragon.core.ops.array_ops import channel_normalize
from dragon.core.ops.array_ops import channel_shuffle
from dragon.core.ops.array_ops import concat
from dragon.core.ops.array_ops import expand_dims
from dragon.core.ops.array_ops import flatten
......
......@@ -21,6 +21,7 @@ from dragon.core.device.cuda import current_device
from dragon.core.device.cuda import get_device_capability
from dragon.core.device.cuda import is_available
from dragon.core.device.cuda import memory_allocated
from dragon.core.device.cuda import set_cublas_flags
from dragon.core.device.cuda import set_cudnn_flags
from dragon.core.device.cuda import set_default_device
from dragon.core.device.cuda import set_device
......
......@@ -17,8 +17,10 @@ from dragon.core.ops.activation_ops import sigmoid
from dragon.core.ops.activation_ops import tanh
from dragon.core.ops.math_ops import abs
from dragon.core.ops.math_ops import add
from dragon.core.ops.math_ops import affine
from dragon.core.ops.math_ops import argmax
from dragon.core.ops.math_ops import argmin
from dragon.core.ops.math_ops import atan2
from dragon.core.ops.math_ops import ceil
from dragon.core.ops.math_ops import clip
from dragon.core.ops.math_ops import cos
......@@ -60,7 +62,6 @@ from dragon.core.ops.math_ops import sqrt
from dragon.core.ops.math_ops import square
from dragon.core.ops.math_ops import sub
from dragon.core.ops.math_ops import sum
from dragon.core.ops.normalization_ops import lp_normalize
from dragon.core.ops.sort_ops import top_k
__all__ = [_s for _s in dir() if not _s.startswith('_')]
......@@ -34,12 +34,15 @@ from dragon.core.ops.activation_ops import relu6
from dragon.core.ops.activation_ops import selu
from dragon.core.ops.activation_ops import silu
from dragon.core.ops.activation_ops import softmax
from dragon.core.ops.array_ops import channel_shuffle
from dragon.core.ops.math_ops import moments
from dragon.core.ops.normalization_ops import batch_norm
from dragon.core.ops.normalization_ops import channel_norm
from dragon.core.ops.normalization_ops import group_norm
from dragon.core.ops.normalization_ops import instance_norm
from dragon.core.ops.normalization_ops import layer_norm
from dragon.core.ops.normalization_ops import local_response_norm
from dragon.core.ops.normalization_ops import lp_norm
from dragon.core.ops.normalization_ops import sync_batch_norm
from dragon.core.ops.vision_ops import bias_add
from dragon.core.ops.vision_ops import conv
......
......@@ -78,16 +78,13 @@ def cast_args(**kwargs):
return {'dtype': kwargs.get('dtype', 'float32')}
@register('ChannelAffine')
def channel_affine_args(**kwargs):
return {
'axis': kwargs.get('axis', -1),
'end_axis': kwargs.get('end_axis', kwargs.get('axis', -1)),
}
@register('Affine')
def affine_args(**kwargs):
return {'axes': kwargs.get('axes', None)}
@register('ChannelNormalize')
def channel_normalize_args(**kwargs):
@register('ChannelNorm')
def channel_norm_args(**kwargs):
return {
'axis': kwargs.get('axis', -1),
'mean': kwargs.get('mean', None),
......@@ -323,8 +320,8 @@ def loss_args(**kwargs):
return {'reduction': kwargs.get('reduction', 'MEAN')}
@register('LpNormalize')
def lp_normalize_args(**kwargs):
@register('LpNorm')
def lp_norm_args(**kwargs):
return {
'p': kwargs.get('p', 2),
'axis': kwargs.get('axis', -1),
......
......@@ -81,6 +81,7 @@ def binary_shape_spec(inputs, outputs):
@register([
'Add',
'Atan2',
'BitwiseAnd',
'BitwiseOr',
'BitwiseXor',
......@@ -403,7 +404,7 @@ def gemm_spec(args, inputs, outputs):
return outputs
@register('ChannelNormalize')
@register('ChannelNorm')
def channel_normalize_spec(args, inputs, outputs):
outputs[0]._dtype = args['dtype']
try:
......
......@@ -62,11 +62,23 @@ def current_device():
return backend.cudaGetDevice()
def set_cublas_flags(allow_tf32=None):
"""Set the flags of cuBLAS library.
Parameters
----------
    allow_tf32 : bool, optional
Allow TF32 tensor core operation or not.
"""
backend.cublasSetFlags(-1 if allow_tf32 is None else allow_tf32)
def set_cudnn_flags(
enabled=True,
benchmark=False,
deterministic=False,
allow_tf32=False,
enabled=None,
benchmark=None,
deterministic=None,
allow_tf32=None,
):
"""Set the flags of cuDNN library.
......@@ -82,7 +94,11 @@ def set_cudnn_flags(
Allow TF32 tensor core operation or not.
"""
backend.cudnnSetFlags(enabled, benchmark, deterministic, allow_tf32)
backend.cudnnSetFlags(
-1 if enabled is None else enabled,
-1 if benchmark is None else benchmark,
-1 if deterministic is None else deterministic,
-1 if allow_tf32 is None else allow_tf32)
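
Passing ``None`` now forwards ``-1`` to the backend, which is read here as "leave that flag unchanged" (an assumption about the binding's convention). Typical usage through the public aliases exported above:

```python
import dragon

# Allow TF32 tensor cores for cuBLAS GEMMs.
dragon.cuda.set_cublas_flags(allow_tf32=True)

# Toggle only the benchmark mode; the other cuDNN flags keep their values.
dragon.cuda.set_cudnn_flags(benchmark=True)
```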
def get_device_capability(device_index=None):
......
......@@ -122,107 +122,16 @@ def broadcast_to(inputs, shape, **kwargs):
return OpLib.add('Expand', **args)
@OpSchema.num_inputs(2, 3)
def channel_affine(inputs, axis=-1, end_axis=None, **kwargs):
r"""Apply affine transformation to each channel of input.
Parameters
----------
inputs : Sequence[dragon.Tensor]
The input, weight and optional bias tensor.
axis : int, optional, default=-1
The first channel axis.
end_axis : int, optional
The last channel axis.
Returns
-------
dragon.Tensor
The output tensor.
"""
outputs = kwargs.pop('outputs', [None])
if context.executing_eagerly():
return OpLib.execute(
'ChannelAffine', inputs, outputs=outputs,
axis=axis, end_axis=end_axis)
return OpLib.add('ChannelAffine', inputs,
axis=axis, end_axis=end_axis, **kwargs)
@OpSchema.num_inputs(1)
@OpSchema.convert_arg('perm')
def channel_normalize(
inputs,
mean,
std,
axis=-1,
dtype='float32',
perm=None,
**kwargs
):
"""Apply normalization to each channel of input.
:attr:`axis` can be negative:
```python
m = s = (1., 1., 1.)
x = dragon.constant([1, 2, 3])
print(dragon.channel_normalize(x, m, s, axis=0)) # [0., 1., 2.]
print(dragon.channel_normalize(x, m, s, axis=-1)) # Equivalent
```
If :attr:`perm` provided, :attr:`axis` is selected from the output layout:
```python
m, s = (1., 2., 3.), (1., 1., 1.)
x = dragon.constant([[1, 2, 3]])
# Provided 3 values to normalize the last axis
# with length 1, only the first value will be taken
print(dragon.channel_normalize(x, m, s, perm=(1, 0))) # [[0.], [1.], [2.]]
```
Parameters
----------
inputs : dragon.Tensor
The input tensor.
mean : Sequence[float], required
The mean to subtract.
std : Sequence[float], required
The standard deviation to divide.
axis : int, optional, default=-1
The channel axis.
dtype : str, optional, default='float32'
The output data type.
perm : Sequence[Union[int, dragon.Tensor]], optional
The output permutation.
Returns
-------
dragon.Tensor
The output tensor.
"""
args = OpSchema.parse_args(locals())
if context.executing_eagerly():
return OpLib.execute(
'ChannelNormalize', inputs,
axis=axis, mean=mean, std=std, dtype=dtype,
ndim=len(args['perm']) if perm is not None else 0,
perm=args['perm'])
return OpLib.add('ChannelNormalize', **args)
@OpSchema.num_inputs(1)
def channel_shuffle(inputs, axis=-1, group=1, **kwargs):
"""Apply group shuffle to each channel of input.
"""Apply the group shuffle to each channel of input.
`[Zhang et.al, 2017] <https://arxiv.org/abs/1707.01083>`_.
Examples:
```python
x = dragon.constant([1, 2, 3, 4])
print(dragon.channel_shuffle(x, group=2)) # [1, 3, 2, 4]
print(dragon.nn.channel_shuffle(x, group=2)) # [1, 3, 2, 4]
```
Parameters
......
......@@ -82,6 +82,30 @@ def add(inputs, **kwargs):
return OpLib.add('Add', inputs, **kwargs)
@OpSchema.num_inputs(2, 3)
def affine(inputs, axis=-1, **kwargs):
"""Apply affine transformation to input.
Parameters
----------
inputs : Sequence[dragon.Tensor]
The input, scale and bias tensor.
axis : Union[int, Sequence[int]], optional, default=-1
The axis to apply.
Returns
-------
dragon.Tensor
The output tensor.
"""
axes = nest.flatten(axis)
outputs = kwargs.pop('outputs', [None])
if context.executing_eagerly():
return OpLib.execute('Affine', inputs, outputs=outputs, axes=axes)
return OpLib.add('Affine', inputs, axes=axes, **kwargs)
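
A short eager-mode sketch of the new API; the constant helpers below are only for illustration, and the key point is that ``axis`` may now be a sequence, with the scale and bias shaped like those axes:

```python
import numpy as np
import dragon

x = dragon.constant(np.ones((2, 3, 4), 'float32'))
w = dragon.constant(np.full((3, 4), 2., 'float32'))
b = dragon.constant(np.full((3, 4), 1., 'float32'))

# y = x * w + b, with w and b broadcast along axes (1, 2).
y = dragon.math.affine([x, w, b], axis=(1, 2))
print(y.shape)  # (2, 3, 4)
```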
@OpSchema.num_inputs(1)
def argmax(inputs, axis=0, keepdims=False, **kwargs):
"""Compute the index of maximum elements along the given axis.
......@@ -149,6 +173,37 @@ def argmin(inputs, axis=0, keepdims=False, **kwargs):
@OpSchema.num_inputs(2)
def atan2(inputs, **kwargs):
r"""Compute the element-wise arc-tangent of two arguments.
.. math:: \text{out} = \text{arctan}(\frac{\text{input1}}{\text{input2}})
Examples:
```python
y = dragon.constant(1)
x = dragon.constant(2)
print(dragon.math.atan2([y, x])) # 0.46364761
```
Parameters
----------
inputs : Sequence[dragon.Tensor]
The input1 and input2 tensor.
Returns
-------
dragon.Tensor
The output tensor.
"""
inputs = constant_ops.remove_scalars(inputs)
if context.executing_eagerly():
return OpLib.execute('Atan2', inputs)
return OpLib.add('Atan2', inputs, **kwargs)
@OpSchema.num_inputs(2)
def bitwise_and(inputs, **kwargs):
r"""Compute the element-wise AND bitwise operation.
......
......@@ -72,6 +72,69 @@ def batch_norm(
return OpLib.add('BatchNorm', **args)
@OpSchema.num_inputs(1)
@OpSchema.convert_arg('perm')
def channel_norm(
inputs,
mean,
std,
axis=-1,
dtype='float32',
perm=None,
**kwargs
):
"""Apply the normalization to each channel of input.
:attr:`axis` can be negative:
```python
m = s = (1., 1., 1.)
x = dragon.constant([1, 2, 3])
print(dragon.nn.channel_norm(x, m, s, axis=0)) # [0., 1., 2.]
print(dragon.nn.channel_norm(x, m, s, axis=-1)) # Equivalent
```
If :attr:`perm` provided, :attr:`axis` is selected from the output layout:
```python
m, s = (1., 2., 3.), (1., 1., 1.)
x = dragon.constant([[1, 2, 3]])
# Provided 3 values to normalize the last axis
# with length 1, only the first value will be taken
print(dragon.nn.channel_norm(x, m, s, perm=(1, 0))) # [[0.], [1.], [2.]]
```
Parameters
----------
inputs : dragon.Tensor
The input tensor.
mean : Sequence[float], required
The mean to subtract.
std : Sequence[float], required
The standard deviation to divide.
axis : int, optional, default=-1
The channel axis.
dtype : str, optional, default='float32'
The output data type.
perm : Sequence[Union[int, dragon.Tensor]], optional
The output permutation.
Returns
-------
dragon.Tensor
The output tensor.
"""
args = OpSchema.parse_args(locals())
if context.executing_eagerly():
return OpLib.execute(
'ChannelNorm', inputs,
axis=axis, mean=mean, std=std, dtype=dtype,
ndim=len(args['perm']) if perm is not None else 0,
perm=args['perm'])
return OpLib.add('ChannelNorm', **args)
@OpSchema.num_inputs(3)
def group_norm(inputs, axis=-1, group=0, epsilon=1e-5, **kwargs):
r"""Apply the group normalization.
......@@ -180,7 +243,7 @@ def layer_norm(inputs, axis=-1, epsilon=1e-5, **kwargs):
@OpSchema.num_inputs(1)
def lp_normalize(
def lp_norm(
inputs,
axis=-1,
end_axis=None,
......@@ -200,15 +263,15 @@ def lp_normalize(
```python
x = dragon.constant([[1, 2, 3], [4, 5, 6]], 'float32')
# A negative axis is the last-k axis
print(dragon.math.lp_normalize(x, 1))
print(dragon.math.lp_normalize(x, -1)) # Equivalent
print(dragon.nn.lp_norm(x, 1))
print(dragon.nn.lp_norm(x, -1)) # Equivalent
```
More than one axis could be specified to reduce:
```python
# Along the continuous axes: [axis, end_axis]
print(dragon.math.lp_normalize(x, axis=0, end_axis=1))
print(dragon.nn.lp_norm(x, axis=0, end_axis=1))
```
Parameters
......@@ -236,9 +299,9 @@ def lp_normalize(
reduction = reduction.upper()
if context.executing_eagerly():
return OpLib.execute(
'LpNormalize', inputs, p=p, axis=axis, end_axis=end_axis,
'LpNorm', inputs, p=p, axis=axis, end_axis=end_axis,
epsilon=epsilon, reduction=reduction)
return OpLib.add('LpNormalize', inputs, p=p, axis=axis, end_axis=end_axis,
return OpLib.add('LpNorm', inputs, p=p, axis=axis, end_axis=end_axis,
epsilon=epsilon, reduction=reduction, **kwargs)
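
For reference, a NumPy sketch of what the L2 branch computes with the default ``reduction='SUM'``; the clipped-norm form is an assumption based on the kernel names above:

```python
import numpy as np

def l2_norm_reference(x, axis=-1, epsilon=1e-12):
    """y = x / max(||x||_2, epsilon) along `axis` (sketch)."""
    norm = np.sqrt(np.sum(np.square(x), axis=axis, keepdims=True))
    return x / np.maximum(norm, epsilon)

x = np.array([[1., 2., 3.], [4., 5., 6.]], 'float32')
print(l2_norm_reference(x, axis=1))  # rows scaled to unit L2 norm
```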
......
......@@ -24,9 +24,11 @@ class Adam(optimizer.Optimizer):
The **Adam** update is defined as:
.. math::
\text{Adam}(g) = \frac{\text{lr} * m_{t}}{\sqrt{v_{t}} + \epsilon} \\
\text{Adam}(g) = \text{lr} * (\frac{\text{correction}* m_{t}}
{\sqrt{v_{t}} + \epsilon}) \\
\quad \\ \text{where}\quad
\begin{cases}
\text{correction} = \sqrt{1 - \beta_{2}^{t}} / (1 - \beta_{1}^{t}) \\
m_{t} = \beta_{1} * m_{t-1} + (1 - \beta_{1}) * g \\
v_{t} = \beta_{2} * v_{t-1} + (1 - \beta_{2}) * g^{2}
\end{cases}
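
A scalar NumPy sketch of the step as now documented, with the bias correction folded into the step size; the fused weight decay is omitted for brevity (AdamW additionally adds ``lr * weight_decay * p``, the :math:`\lambda p` term below):

```python
import numpy as np

def adam_step(p, g, m, v, t, lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-8):
    """One bias-corrected Adam step (sketch)."""
    m = beta1 * m + (1. - beta1) * g
    v = beta2 * v + (1. - beta2) * g * g
    correction = np.sqrt(1. - beta2 ** t) / (1. - beta1 ** t)
    return p - lr * correction * m / (np.sqrt(v) + eps), m, v

p, m, v = 1., 0., 0.
for t in range(1, 4):
    p, m, v = adam_step(p, 0.5, m, v, t)
print(p)  # parameter after three steps on a constant gradient
```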
......@@ -62,12 +64,13 @@ class AdamW(Adam):
The **AdamW** update is defined as:
.. math::
\text{AdamW}(g, p) = \text{lr} * (\frac{m_{t}}{\sqrt{v_{t}} + \epsilon}
+ \lambda p) \\
\text{AdamW}(g, p) = \text{lr} * (\frac{\text{correction} * m_{t}}
{\sqrt{v_{t}} + \epsilon} + \lambda p) \\
\quad \\ \text{where}\quad
\begin{cases}
\text{correction} = \sqrt{1 - \beta_{2}^{t}} / (1 - \beta_{1}^{t}) \\
m_{t} = \beta_{1} * m_{t-1} + (1 - \beta_{1}) * g \\
v_{t} = \beta_{2} * v_{t-1} + (1 - \beta_{2}) * g^{2}
v_{t} = \beta_{2} * v_{t-1} + (1 - \beta_{2}) * g^{2} \\
\end{cases}
"""
......
......@@ -27,13 +27,13 @@ class RMSprop(optimizer.Optimizer):
\text{RMSprop}(g) = \text{lr} * m_{t} \\
\quad \\ \text{where} \quad
\begin{cases}
v_{t} = \text{decay} * v_{t-1} + (1 - \text{decay}) * g^{2} \\
v_{t} = \alpha * v_{t-1} + (1 - \alpha) * g^{2} \\
m_{t} = \text{momentum} * m_{t-1} + \frac{g}{\sqrt{v_{t}} + \epsilon}
\end{cases}
"""
def __init__(self, lr=0.01, momentum=0, decay=0.9, eps=1e-8, **kwargs):
def __init__(self, lr=0.01, momentum=0, alpha=0.9, eps=1e-8, **kwargs):
r"""Create a ``RMSProp`` optimizer.
Parameters
......@@ -42,8 +42,8 @@ class RMSprop(optimizer.Optimizer):
The initial value to :math:`\text{lr}`.
momentum : float, optional, default=0
The initial value to :math:`\text{momentum}`.
decay : float, optional, default=0.9
The initial value to :math:`\text{decay}`.
alpha : float, optional, default=0.9
The initial value to :math:`\alpha`.
eps : float, optional, default=1e-8
The initial value to :math:`\epsilon`.
......@@ -51,5 +51,5 @@ class RMSprop(optimizer.Optimizer):
super(RMSprop, self).__init__(**kwargs)
self._set_hyper('lr', lr)
self._set_hyper('momentum', momentum)
self._set_hyper('decay', decay)
self._set_hyper('alpha', alpha)
self._set_hyper('eps', eps)
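
The ``decay`` hyper-parameter is renamed to ``alpha`` to match the documented update. A scalar sketch of that update for reference:

```python
def rmsprop_step(p, g, m, v, lr=0.01, momentum=0., alpha=0.9, eps=1e-8):
    """One RMSprop step as documented above (sketch)."""
    v = alpha * v + (1. - alpha) * g * g
    m = momentum * m + g / (v ** 0.5 + eps)
    return p - lr * m, m, v

p, m, v = rmsprop_step(1., 0.5, 0., 0.)
print(p, m, v)
```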
......@@ -51,41 +51,6 @@ def cast_exporter(op_def, context):
return node, const_tensors
@export_util.register('ChannelAffine')
def channel_affine_exporter(op_def, context):
node, const_tensors = export_util.translate(**locals())
node.op_type = 'ATen' # Currently not supported in ai.onnx
helper.add_attribute(node, 'op_type', 'ChannelAffine')
for arg in op_def.arg:
if arg.name == 'axis':
helper.add_attribute(node, 'axis', arg.i)
elif arg.name == 'end_axis':
helper.add_attribute(node, 'end_axis', arg.i)
return node, const_tensors
@export_util.register('ChannelNormalize')
def channel_normalize_exporter(op_def, context):
node, const_tensors = export_util.translate(**locals())
node.op_type = 'ATen' # Currently not supported in ai.onnx
helper.add_attribute(node, 'op_type', 'ChannelNormalize')
for arg in op_def.arg:
if arg.name == 'mean':
helper.add_attribute(node, 'mean', arg.floats)
elif arg.name == 'std':
helper.add_attribute(node, 'std', arg.floats)
elif arg.name == 'axis':
helper.add_attribute(node, 'axis', arg.i)
elif arg.name == 'dtype':
helper.add_attribute(node, 'dtype', arg.s)
elif arg.name == 'perm':
helper.add_attribute(node, 'perm', arg.ints)
elif arg.name == 'perm_desc':
values = helper.fetch_argument(op_def, arg, context.ws)
helper.add_attribute(node, 'perm', values)
return node, const_tensors
@export_util.register('Concat')
def concat_exporter(op_def, context):
node, const_tensors = export_util.translate(**locals())
......
......@@ -31,6 +31,17 @@ def add_exporter(op_def, context):
return node, const_tensors
@export_util.register('Affine')
def affine_exporter(op_def, context):
node, const_tensors = export_util.translate(**locals())
node.op_type = 'ATen' # Currently not supported in ai.onnx
helper.add_attribute(node, 'op_type', 'Affine')
for arg in op_def.arg:
if arg.name == 'axes':
helper.add_attribute(node, 'axes', arg.ints)
return node, const_tensors
@export_util.register('Div')
def div_exporter(op_def, context):
node, const_tensors = export_util.translate(**locals())
......
......@@ -32,6 +32,28 @@ def batch_norm_exporter(op_def, context):
return node, const_tensors
@export_util.register('ChannelNorm')
def channel_norm_exporter(op_def, context):
node, const_tensors = export_util.translate(**locals())
node.op_type = 'ATen' # Currently not supported in ai.onnx
helper.add_attribute(node, 'op_type', 'ChannelNorm')
for arg in op_def.arg:
if arg.name == 'mean':
helper.add_attribute(node, 'mean', arg.floats)
elif arg.name == 'std':
helper.add_attribute(node, 'std', arg.floats)
elif arg.name == 'axis':
helper.add_attribute(node, 'axis', arg.i)
elif arg.name == 'dtype':
helper.add_attribute(node, 'dtype', arg.s)
elif arg.name == 'perm':
helper.add_attribute(node, 'perm', arg.ints)
elif arg.name == 'perm_desc':
values = helper.fetch_argument(op_def, arg, context.ws)
helper.add_attribute(node, 'perm', values)
return node, const_tensors
@export_util.register('GroupNorm')
def group_norm_exporter(op_def, context):
node, const_tensors = export_util.translate(**locals())
......@@ -49,8 +71,8 @@ def group_norm_exporter(op_def, context):
return node, const_tensors
@export_util.register('LpNormalize')
def lp_normalize_exporter(op_def, context):
@export_util.register('LpNorm')
def lp_norm_exporter(op_def, context):
node, const_tensors = export_util.translate(**locals())
node.op_type = 'LpNormalization'
axis, end_axis = None, None
......
......@@ -33,9 +33,6 @@ constexpr int CUDA_WARP_SIZE = 32;
/*! \brief The number of cuda threads in a block */
constexpr int CUDA_THREADS = 256;
/*! \brief The maximum number of blocks to use in a default kernel call */
constexpr int CUDA_MAX_BLOCKS = 4096;
/*! \brief The maximum number of devices in a single machine */
constexpr int CUDA_MAX_DEVICES = 16;
......@@ -82,12 +79,15 @@ constexpr int CUDA_TENSOR_MAX_DIMS = 8;
for (size_t j = threadIdx.x; j < m; j += blockDim.x)
inline int CUDA_BLOCKS(const int N) {
return std::max(
std::min((N + CUDA_THREADS - 1) / CUDA_THREADS, CUDA_MAX_BLOCKS), 1);
}
inline int CUDA_2D_BLOCKS(const int N) {
return std::max(std::min(N, CUDA_MAX_BLOCKS), 1);
int device, sm_count, threads_per_sm;
CUDA_CHECK(cudaGetDevice(&device));
CUDA_CHECK(cudaDeviceGetAttribute(
&sm_count, cudaDevAttrMultiProcessorCount, device));
CUDA_CHECK(cudaDeviceGetAttribute(
&threads_per_sm, cudaDevAttrMaxThreadsPerMultiProcessor, device));
const auto num_blocks = (N + CUDA_THREADS - 1) / CUDA_THREADS;
const auto max_blocks = sm_count * threads_per_sm / CUDA_THREADS * 32;
return std::max(1, std::min(num_blocks, max_blocks));
}
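
``CUDA_BLOCKS`` now caps the grid with a per-device occupancy estimate instead of the removed ``CUDA_MAX_BLOCKS`` constant. A Python sketch of the arithmetic, with the device attributes passed in explicitly (the example values are illustrative):

```python
def cuda_blocks(n, threads=256, sm_count=80, threads_per_sm=2048):
    """Grid size capped by an occupancy-derived maximum (sketch)."""
    num_blocks = (n + threads - 1) // threads
    max_blocks = sm_count * threads_per_sm // threads * 32
    return max(1, min(num_blocks, max_blocks))

print(cuda_blocks(1 << 20))  # 4096
print(cuda_blocks(1 << 30))  # capped at 20480 for these attributes
```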
#if CUDA_VERSION_MAX(9, 0)
......
......@@ -84,6 +84,7 @@ DECLARE_ROWWISE_COLWISE_BINARY_FUNC(Sub, T);
DECLARE_ROWWISE_COLWISE_BINARY_FUNC(Mul, T);
DECLARE_ROWWISE_COLWISE_BINARY_FUNC(Div, T);
DECLARE_ROWWISE_COLWISE_BINARY_FUNC(Pow, T);
DECLARE_ROWWISE_COLWISE_BINARY_FUNC(Atan2, T);
DECLARE_ROWWISE_COLWISE_BINARY_FUNC(Minimum, T);
DECLARE_ROWWISE_COLWISE_BINARY_FUNC(Maximum, T);
DECLARE_ROWWISE_COLWISE_BINARY_FUNC(Equal, bool);
......@@ -434,6 +435,7 @@ DEFINE_ROWWISE_COLWISE_BIANRY_FUNC(And, bool, std::logical_and);
DEFINE_ROWWISE_COLWISE_BIANRY_FUNC(Or, bool, std::logical_or);
DEFINE_ROWWISE_COLWISE_BIANRY_FUNC(Xor, bool, math::XorFunctor);
DEFINE_ROWWISE_COLWISE_BIANRY_FUNC(Pow, T, math::PowFunctor);
DEFINE_ROWWISE_COLWISE_BIANRY_FUNC(Atan2, T, math::Atan2Functor);
DEFINE_ROWWISE_COLWISE_BIANRY_FUNC(Minimum, T, math::MinFunctor);
DEFINE_ROWWISE_COLWISE_BIANRY_FUNC(Maximum, T, math::MaxFunctor);
#undef DEFINE_ROWWISE_COLWISE_BIANRY_FUNC
......@@ -469,6 +471,7 @@ DEFINE_BROADCAST_BINARY_FUNC(And, bool, std::logical_and);
DEFINE_BROADCAST_BINARY_FUNC(Or, bool, std::logical_or);
DEFINE_BROADCAST_BINARY_FUNC(Xor, bool, math::XorFunctor);
DEFINE_BROADCAST_BINARY_FUNC(Pow, T, math::PowFunctor);
DEFINE_BROADCAST_BINARY_FUNC(Atan2, T, math::Atan2Functor);
DEFINE_BROADCAST_BINARY_FUNC(Minimum, T, math::MinFunctor);
DEFINE_BROADCAST_BINARY_FUNC(Maximum, T, math::MaxFunctor);
#undef DEFINE_BROADCAST_BINARY_FUNC
......@@ -612,6 +615,9 @@ DEFINE_BINARY_FUNC(Div, float, float);
DEFINE_BINARY_FUNC(Div, double, double);
DEFINE_BINARY_FUNC(Pow, float, float);
DEFINE_BINARY_FUNC(Pow, double, double);
DEFINE_BINARY_FUNC(Atan2, float16, float16);
DEFINE_BINARY_FUNC(Atan2, float, float);
DEFINE_BINARY_FUNC(Atan2, double, double);
DEFINE_BINARY_FUNC(Minimum, uint8_t, uint8_t);
DEFINE_BINARY_FUNC(Minimum, int8_t, int8_t);
DEFINE_BINARY_FUNC(Minimum, int, int);
......
......@@ -388,6 +388,9 @@ DEFINE_BINARY_FUNC(Div, double, double, math::DividesFunctor);
DEFINE_BINARY_FUNC(Pow, float16, float16, math::PowFunctor);
DEFINE_BINARY_FUNC(Pow, float, float, math::PowFunctor);
DEFINE_BINARY_FUNC(Pow, double, double, math::PowFunctor);
DEFINE_BINARY_FUNC(Atan2, float16, float16, math::Atan2Functor);
DEFINE_BINARY_FUNC(Atan2, float, float, math::Atan2Functor);
DEFINE_BINARY_FUNC(Atan2, double, double, math::Atan2Functor);
DEFINE_BINARY_FUNC(Minimum, uint8_t, uint8_t, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, int8_t, int8_t, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, int, int, math::MinFunctor);
......
......@@ -122,6 +122,17 @@ DRAGON_API void Pow(
Context* ctx);
template <typename T, class Context>
DRAGON_API void Atan2(
const int A_ndim,
const int64_t* A_dims,
const int B_ndim,
const int64_t* B_dims,
const T* a,
const T* b,
T* y,
Context* ctx);
template <typename T, class Context>
DRAGON_API void Minimum(
const int A_ndim,
const int64_t* A_dims,
......
......@@ -550,6 +550,9 @@ DEFINE_BINARY_FUNC(Maximum, double, double, max);
_SimpleBinaryFunc(N, Functor<InputT>(), a, b, y); \
}
DEFINE_BINARY_FUNC(Atan2, float16, float16, math::Atan2Functor);
DEFINE_BINARY_FUNC(Atan2, float, float, math::Atan2Functor);
DEFINE_BINARY_FUNC(Atan2, double, double, math::Atan2Functor);
DEFINE_BINARY_FUNC(BitwiseAnd, bool, bool, std::bit_and);
DEFINE_BINARY_FUNC(BitwiseAnd, uint8_t, uint8_t, std::bit_and);
DEFINE_BINARY_FUNC(BitwiseAnd, int8_t, int8_t, std::bit_and);
......
......@@ -342,7 +342,10 @@ _Where(const int N, const T* a, const T* b, const bool* c, T* y) {
DRAGON_API void name<InputT, CUDAContext>( \
const int N, const InputT* x, OutputT* y, CUDAContext* ctx) { \
_SimpleUnaryFunc<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, Functor<InputT>(), x, y); \
N, \
Functor<math::ScalarType<InputT>::type>(), \
reinterpret_cast<const math::ScalarType<InputT>::type*>(x), \
reinterpret_cast<math::ScalarType<OutputT>::type*>(y)); \
}
DEFINE_UNARY_FUNC(BitwiseNot, bool, bool, math::BitNotFunctor);
......@@ -706,6 +709,87 @@ DEFINE_APPLY_MASK_FUNC(float, float);
DEFINE_APPLY_MASK_FUNC(double, double);
#undef DEFINE_APPLY_MASK_FUNC
#define DEFINE_BINARY_FUNC(name, T, Functor) \
template <> \
DRAGON_API void name<T, CUDAContext>( \
const int N, const T* a, const T* b, T* y, CUDAContext* ctx) { \
using ScalarT = typename math::ScalarType<T>::type; \
using ScalarT2 = typename math::ScalarType<T>::type2; \
if ((N & 1) == 0 && sizeof(ScalarT) != sizeof(ScalarT2)) { \
_SimpleBinaryFunc<<< \
CUDA_BLOCKS(N >> 1), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
N >> 1, \
Functor<ScalarT2>(), \
reinterpret_cast<const ScalarT2*>(a), \
reinterpret_cast<const ScalarT2*>(b), \
reinterpret_cast<ScalarT2*>(y)); \
} else { \
_SimpleBinaryFunc<<< \
CUDA_BLOCKS(N), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>( \
N, \
Functor<ScalarT>(), \
reinterpret_cast<const ScalarT*>(a), \
reinterpret_cast<const ScalarT*>(b), \
reinterpret_cast<ScalarT*>(y)); \
} \
}
DEFINE_BINARY_FUNC(Add, uint8_t, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, int8_t, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, int, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, int64_t, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, float16, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, float, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, double, math::PlusFunctor);
DEFINE_BINARY_FUNC(Sub, uint8_t, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, int8_t, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, int, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, int64_t, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, float16, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, float, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, double, math::MinusFunctor);
DEFINE_BINARY_FUNC(Mul, uint8_t, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, int8_t, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, int, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, int64_t, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, float16, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, float, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, double, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Div, uint8_t, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, int8_t, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, int, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, int64_t, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, float16, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, float, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, double, math::DividesFunctor);
DEFINE_BINARY_FUNC(Pow, float16, math::PowFunctor);
DEFINE_BINARY_FUNC(Pow, float, math::PowFunctor);
DEFINE_BINARY_FUNC(Pow, double, math::PowFunctor);
DEFINE_BINARY_FUNC(Atan2, float16, math::Atan2Functor);
DEFINE_BINARY_FUNC(Atan2, float, math::Atan2Functor);
DEFINE_BINARY_FUNC(Atan2, double, math::Atan2Functor);
DEFINE_BINARY_FUNC(Minimum, uint8_t, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, int8_t, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, int, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, int64_t, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, float16, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, float, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, double, math::MinFunctor);
DEFINE_BINARY_FUNC(Maximum, uint8_t, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, int8_t, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, int, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, int64_t, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, float16, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, float, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, double, math::MaxFunctor);
#undef DEFINE_BINARY_FUNC
#define DEFINE_BINARY_FUNC(name, InputT, OutputT, Functor) \
template <> \
DRAGON_API void name<InputT, CUDAContext>( \
......@@ -726,51 +810,6 @@ DEFINE_APPLY_MASK_FUNC(double, double);
reinterpret_cast<math::ScalarType<OutputT>::type*>(y)); \
}
DEFINE_BINARY_FUNC(Add, uint8_t, uint8_t, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, int8_t, int8_t, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, int, int, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, int64_t, int64_t, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, float16, float16, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, float, float, math::PlusFunctor);
DEFINE_BINARY_FUNC(Add, double, double, math::PlusFunctor);
DEFINE_BINARY_FUNC(Sub, uint8_t, uint8_t, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, int8_t, int8_t, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, int, int, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, int64_t, int64_t, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, float16, float16, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, float, float, math::MinusFunctor);
DEFINE_BINARY_FUNC(Sub, double, double, math::MinusFunctor);
DEFINE_BINARY_FUNC(Mul, uint8_t, uint8_t, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, int8_t, int8_t, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, int, int, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, int64_t, int64_t, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, float16, float16, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, float, float, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Mul, double, double, math::MultipliesFunctor);
DEFINE_BINARY_FUNC(Div, uint8_t, uint8_t, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, int8_t, int8_t, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, int, int, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, int64_t, int64_t, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, float16, float16, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, float, float, math::DividesFunctor);
DEFINE_BINARY_FUNC(Div, double, double, math::DividesFunctor);
DEFINE_BINARY_FUNC(Pow, float16, float16, math::PowFunctor);
DEFINE_BINARY_FUNC(Pow, float, float, math::PowFunctor);
DEFINE_BINARY_FUNC(Pow, double, double, math::PowFunctor);
DEFINE_BINARY_FUNC(Minimum, uint8_t, uint8_t, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, int8_t, int8_t, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, int, int, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, int64_t, int64_t, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, float16, float16, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, float, float, math::MinFunctor);
DEFINE_BINARY_FUNC(Minimum, double, double, math::MinFunctor);
DEFINE_BINARY_FUNC(Maximum, uint8_t, uint8_t, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, int8_t, int8_t, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, int, int, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, int64_t, int64_t, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, float16, float16, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, float, float, math::MaxFunctor);
DEFINE_BINARY_FUNC(Maximum, double, double, math::MaxFunctor);
DEFINE_BINARY_FUNC(BitwiseAnd, bool, bool, math::BitAndFunctor);
DEFINE_BINARY_FUNC(BitwiseAnd, uint8_t, uint8_t, math::BitAndFunctor);
DEFINE_BINARY_FUNC(BitwiseAnd, int8_t, int8_t, math::BitAndFunctor);
......
......@@ -126,6 +126,9 @@ template <typename T, class Context>
DRAGON_API void Pow(const int N, const T* a, const T* b, T* y, Context* ctx);
template <typename T, class Context>
DRAGON_API void Atan2(const int N, const T* a, const T* b, T* y, Context* ctx);
template <typename T, class Context>
DRAGON_API void
Minimum(const int N, const T* a, const T* b, T* y, Context* ctx);
......
......@@ -16,24 +16,25 @@
#include "dragon/core/types.h"
#include "dragon/utils/conversions.h"
#if defined(__CUDA_ARCH__)
#define HOSTDEVICE_DECL inline __host__ __device__
#else
#define HOSTDEVICE_DECL inline
#endif
namespace dragon {
namespace math {
/*
* Arithmetic Functors */
* Arithmetic Functors
*/
template <typename T>
struct IdentityFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& x) const {
HOSTDEVICE_DECL T operator()(const T& x) const {
return x;
}
#else
inline T operator()(const T& x) const {
return x;
}
#endif
};
template <typename T>
......@@ -76,15 +77,9 @@ struct AbsFunctor<half2> {
template <typename T>
struct SqrFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& x) const {
HOSTDEVICE_DECL T operator()(const T& x) const {
return x * x;
}
#else
inline T operator()(const T& x) const {
return x * x;
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -115,40 +110,16 @@ struct SqrFunctor<half2> {
template <typename T>
struct MaxFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs < rhs ? rhs : lhs;
}
#else
inline T operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL T operator()(const T& lhs, const T& rhs) const {
return lhs < rhs ? rhs : lhs;
}
#endif
};
template <>
struct MaxFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530
return __hlt(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs))
? rhs
: lhs;
#else
return __half2float(*reinterpret_cast<const half*>(&lhs)) <
__half2float(*reinterpret_cast<const half*>(&rhs))
? rhs
: lhs;
#endif
}
#else
inline float16 operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float>(lhs) < convert::To<float>(rhs) ? rhs : lhs;
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -176,40 +147,16 @@ struct MaxFunctor<half2> {
template <typename T>
struct MinFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL T operator()(const T& lhs, const T& rhs) const {
return lhs < rhs ? lhs : rhs;
}
#else
inline T operator()(const T& lhs, const T& rhs) const {
return lhs < rhs ? lhs : rhs;
}
#endif
};
template <>
struct MinFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530
return __hlt(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs))
? lhs
: rhs;
#else
return __half2float(*reinterpret_cast<const half*>(&lhs)) <
__half2float(*reinterpret_cast<const half*>(&rhs))
? lhs
: rhs;
#endif
}
#else
inline float16 operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float>(lhs) < convert::To<float>(rhs) ? lhs : rhs;
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -237,39 +184,17 @@ struct MinFunctor<half2> {
template <typename T>
struct PlusFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs + rhs;
}
#else
inline T operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL T operator()(const T& lhs, const T& rhs) const {
return lhs + rhs;
}
#endif
};
template <>
struct PlusFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530
half ret = __hadd(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs));
#else
half ret = __float2half(
__half2float(*reinterpret_cast<const half*>(&lhs)) +
__half2float(*reinterpret_cast<const half*>(&rhs)));
#endif
return *reinterpret_cast<float16*>(&ret);
}
#else
inline float16 operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float16>(
convert::To<float>(lhs) + convert::To<float>(rhs));
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -300,39 +225,17 @@ struct PlusFunctor<half2> {
template <typename T>
struct MinusFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs - rhs;
}
#else
inline T operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL T operator()(const T& lhs, const T& rhs) const {
return lhs - rhs;
}
#endif
};
template <>
struct MinusFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530
half ret = __hsub(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs));
#else
half ret = __float2half(
__half2float(*reinterpret_cast<const half*>(&lhs)) -
__half2float(*reinterpret_cast<const half*>(&rhs)));
#endif
return *reinterpret_cast<float16*>(&ret);
}
#else
inline float16 operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float16>(
convert::To<float>(lhs) - convert::To<float>(rhs));
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -363,39 +266,17 @@ struct MinusFunctor<half2> {
template <typename T>
struct MultipliesFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs * rhs;
}
#else
inline T operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL T operator()(const T& lhs, const T& rhs) const {
return lhs * rhs;
}
#endif
};
template <>
struct MultipliesFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530
half ret = __hmul(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs));
#else
half ret = __float2half(
__half2float(*reinterpret_cast<const half*>(&lhs)) *
__half2float(*reinterpret_cast<const half*>(&rhs)));
#endif
return *reinterpret_cast<float16*>(&ret);
}
#else
inline float16 operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float16>(
convert::To<float>(lhs) * convert::To<float>(rhs));
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -426,39 +307,17 @@ struct MultipliesFunctor<half2> {
template <typename T>
struct DividesFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL T operator()(const T& lhs, const T& rhs) const {
return lhs / rhs;
}
#else
inline T operator()(const T& lhs, const T& rhs) const {
return lhs / rhs;
}
#endif
};
template <>
struct DividesFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const {
#if __CUDA_ARCH__ >= 530
half ret = __hdiv(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs));
#else
half ret = __float2half(
__half2float(*reinterpret_cast<const half*>(&lhs)) /
__half2float(*reinterpret_cast<const half*>(&rhs)));
#endif
return *reinterpret_cast<float16*>(&ret);
}
#else
inline float16 operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float16>(
convert::To<float>(lhs) / convert::To<float>(rhs));
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -498,20 +357,10 @@ struct PowFunctor {
template <>
struct PowFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ float16
operator()(const float16& lhs, const float16& rhs) const {
half ret = __float2half(
pow(__half2float(*reinterpret_cast<const half*>(&lhs)),
__half2float(*reinterpret_cast<const half*>(&rhs))));
return *reinterpret_cast<float16*>(&ret);
}
#else
inline float16 operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float16>(
std::pow(convert::To<float>(lhs), convert::To<float>(rhs)));
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -532,34 +381,104 @@ struct PowFunctor<half2> {
};
#endif
/*
* Logical Functors
*/
template <typename T>
struct NotFunctor {
struct Atan2Functor {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& x) const {
return !x;
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return atan2(lhs, rhs);
}
#else
inline bool operator()(const T& x) const {
return !x;
inline T operator()(const T& lhs, const T& rhs) const {
return std::atan2(lhs, rhs);
}
#endif
};
template <>
struct NotFunctor<float16> {
struct Atan2Functor<float16> {
inline float16 operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float16>(
std::atan2(convert::To<float>(lhs), convert::To<float>(rhs)));
}
};
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& x) const {
return !__half2float(*reinterpret_cast<const half*>(&x));
template <>
struct Atan2Functor<half> {
inline __device__ half operator()(const half& lhs, const half& rhs) const {
return __float2half(atan2f(__half2float(lhs), __half2float(rhs)));
}
};
template <>
struct Atan2Functor<half2> {
inline __device__ half2 operator()(const half2& lhs, const half2& rhs) const {
const float2 v1 = __half22float2(lhs);
const float2 v2 = __half22float2(rhs);
return __floats2half2_rn(atan2f(v1.x, v2.x), atan2f(v1.y, v2.y));
}
};
#endif
template <typename T>
struct FMAFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& x, const T& y, const T& z) const {
return fma(x, y, z);
}
#else
inline T operator()(const T& x, const T& y, const T& z) const {
return std::fma(x, y, z);
}
#endif
};
#if defined(__CUDA_ARCH__)
template <>
struct FMAFunctor<half> {
inline __device__ half
operator()(const half& x, const half& y, const half& z) const {
#if __CUDA_ARCH__ >= 530
return __hfma(x, y, z);
#else
return __float2half(
fmaf(__half2float(x), __half2float(y), __half2float(z)));
#endif
}
};
template <>
struct FMAFunctor<half2> {
inline __device__ half2
operator()(const half2& x, const half2& y, const half2& z) const {
#if __CUDA_ARCH__ >= 530
return __hfma2(x, y, z);
#else
const float2 v1 = __half22float2(x);
const float2 v2 = __half22float2(y);
const float2 v3 = __half22float2(z);
return __floats2half2_rn(fmaf(v1.x, v2.x, v3.x), fmaf(v1.y, v2.y, v3.y));
#endif
}
};
#endif
/*
* Logical Functors
*/
template <typename T>
struct NotFunctor {
HOSTDEVICE_DECL bool operator()(const T& x) const {
return !x;
}
};
template <>
struct NotFunctor<float16> {
inline bool operator()(const float16& x) const {
return !convert::To<float>(x);
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -573,30 +492,16 @@ struct NotFunctor<half> {
template <typename T>
struct AndFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs && rhs;
}
#else
inline bool operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL bool operator()(const T& lhs, const T& rhs) const {
return lhs && rhs;
}
#endif
};
template <>
struct AndFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
return __half2float(*reinterpret_cast<const half*>(&lhs)) &&
__half2float(*reinterpret_cast<const half*>(&rhs));
}
#else
inline bool operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float>(lhs) && convert::To<float>(rhs);
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -610,30 +515,16 @@ struct AndFunctor<half> {
template <typename T>
struct OrFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs || rhs;
}
#else
inline bool operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL bool operator()(const T& lhs, const T& rhs) const {
return lhs || rhs;
}
#endif
};
template <>
struct OrFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
return __half2float(*reinterpret_cast<const half*>(&lhs)) ||
__half2float(*reinterpret_cast<const half*>(&rhs));
}
#else
inline bool operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float>(lhs) || convert::To<float>(rhs);
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -647,32 +538,17 @@ struct OrFunctor<half> {
template <typename T>
struct XorFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return convert::To<bool>(lhs) ^ convert::To<bool>(rhs);
}
#else
inline bool operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL bool operator()(const T& lhs, const T& rhs) const {
return convert::To<bool>(lhs) ^ convert::To<bool>(rhs);
}
#endif
};
template <>
struct XorFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
return convert::To<bool>(
__half2float(*reinterpret_cast<const half*>(&lhs))) ^
convert::To<bool>(__half2float(*reinterpret_cast<const half*>(&rhs)));
}
#else
inline bool operator()(const float16& lhs, const float16& rhs) const {
return convert::To<bool>(convert::To<float>(lhs)) ^
convert::To<bool>(convert::To<float>(rhs));
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -691,54 +567,30 @@ struct XorFunctor<half> {
template <typename T>
struct BitNotFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& x) const {
HOSTDEVICE_DECL T operator()(const T& x) const {
return ~x;
}
#else
inline T operator()(const T& x) const {
return ~x;
}
#endif
};
template <typename T>
struct BitAndFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL T operator()(const T& lhs, const T& rhs) const {
return lhs & rhs;
}
#else
inline T operator()(const T& lhs, const T& rhs) const {
return lhs & rhs;
}
#endif
};
template <typename T>
struct BitOrFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs | rhs;
}
#else
inline T operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL T operator()(const T& lhs, const T& rhs) const {
return lhs | rhs;
}
#endif
};
template <typename T>
struct BitXorFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ T operator()(const T& lhs, const T& rhs) const {
return lhs ^ rhs;
}
#else
inline T operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL T operator()(const T& lhs, const T& rhs) const {
return lhs ^ rhs;
}
#endif
};
/*
......@@ -747,36 +599,16 @@ struct BitXorFunctor {
template <typename T>
struct EqualFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs == rhs;
}
#else
inline bool operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL bool operator()(const T& lhs, const T& rhs) const {
return lhs == rhs;
}
#endif
};
template <>
struct EqualFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
#if __CUDA_ARCH__ >= 530
return __heq(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs));
#else
return __half2float(*reinterpret_cast<const half*>(&lhs)) ==
__half2float(*reinterpret_cast<const half*>(&rhs));
#endif
}
#else
inline bool operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float>(lhs) == convert::To<float>(rhs);
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -794,36 +626,16 @@ struct EqualFunctor<half> {
template <typename T>
struct NotEqualFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL bool operator()(const T& lhs, const T& rhs) const {
return lhs != rhs;
}
#else
inline bool operator()(const T& lhs, const T& rhs) const {
return lhs != rhs;
}
#endif
};
template <>
struct NotEqualFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
#if __CUDA_ARCH__ >= 530
return __hne(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs));
#else
return __half2float(*reinterpret_cast<const half*>(&lhs)) !=
__half2float(*reinterpret_cast<const half*>(&rhs));
#endif
}
#else
inline bool operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float>(lhs) != convert::To<float>(rhs);
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -841,36 +653,16 @@ struct NotEqualFunctor<half> {
template <typename T>
struct GreaterFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs > rhs;
}
#else
inline bool operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL bool operator()(const T& lhs, const T& rhs) const {
return lhs > rhs;
}
#endif
};
template <>
struct GreaterFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
#if __CUDA_ARCH__ >= 530
return __hgt(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs));
#else
return __half2float(*reinterpret_cast<const half*>(&lhs)) >
__half2float(*reinterpret_cast<const half*>(&rhs));
#endif
}
#else
inline bool operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float>(lhs) > convert::To<float>(rhs);
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -888,36 +680,16 @@ struct GreaterFunctor<half> {
template <typename T>
struct LessFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL bool operator()(const T& lhs, const T& rhs) const {
return lhs < rhs;
}
#else
inline bool operator()(const T& lhs, const T& rhs) const {
return lhs < rhs;
}
#endif
};
template <>
struct LessFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
#if __CUDA_ARCH__ >= 530
return __hlt(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs));
#else
return __half2float(*reinterpret_cast<const half*>(&lhs)) <
__half2float(*reinterpret_cast<const half*>(&rhs));
#endif
}
#else
inline bool operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float>(lhs) < convert::To<float>(rhs);
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -935,36 +707,16 @@ struct LessFunctor<half> {
template <typename T>
struct GreaterEqualFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL bool operator()(const T& lhs, const T& rhs) const {
return lhs >= rhs;
}
#else
inline bool operator()(const T& lhs, const T& rhs) const {
return lhs >= rhs;
}
#endif
};
template <>
struct GreaterEqualFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
#if __CUDA_ARCH__ >= 530
return __hge(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs));
#else
return __half2float(*reinterpret_cast<const half*>(&lhs)) >=
__half2float(*reinterpret_cast<const half*>(&rhs));
#endif
}
#else
inline bool operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float>(lhs) >= convert::To<float>(rhs);
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -982,36 +734,16 @@ struct GreaterEqualFunctor<half> {
template <typename T>
struct LessEqualFunctor {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const T& lhs, const T& rhs) const {
return lhs <= rhs;
}
#else
inline bool operator()(const T& lhs, const T& rhs) const {
HOSTDEVICE_DECL bool operator()(const T& lhs, const T& rhs) const {
return lhs <= rhs;
}
#endif
};
template <>
struct LessEqualFunctor<float16> {
#if defined(__CUDA_ARCH__)
inline __device__ bool operator()(const float16& lhs, const float16& rhs)
const {
#if __CUDA_ARCH__ >= 530
return __hle(
*reinterpret_cast<const half*>(&lhs),
*reinterpret_cast<const half*>(&rhs));
#else
return __half2float(*reinterpret_cast<const half*>(&lhs)) <
__half2float(*reinterpret_cast<const half*>(&rhs));
#endif
}
#else
inline bool operator()(const float16& lhs, const float16& rhs) const {
return convert::To<float>(lhs) <= convert::To<float>(rhs);
}
#endif
};
#if defined(__CUDA_ARCH__)
......@@ -1031,4 +763,6 @@ struct LessEqualFunctor<half> {
} // namespace dragon
#undef HOSTDEVICE_DECL
#endif // DRAGON_UTILS_MATH_FUNCTIONAL_H_
......@@ -108,13 +108,11 @@ void _GenericReduce(
} \
if (math::utils::IsRowwiseReduce( \
num_dims, dims, out_dims.data(), &rows, &cols)) { \
_RowwiseReduce##name(rows, cols, scale, x, y); \
return; \
return _RowwiseReduce##name(rows, cols, scale, x, y); \
} \
if (math::utils::IsColwiseReduce( \
num_dims, dims, out_dims.data(), &rows, &cols)) { \
_ColwiseReduce##name(rows, cols, scale, x, y); \
return; \
return _ColwiseReduce##name(rows, cols, scale, x, y); \
} \
vec64_t transpose_axes(num_dims); \
vec64_t transpose_strides(num_dims); \
......
#include "dragon/utils/math/transform.h"
#include "dragon/utils/device/common_eigen.h"
#include "dragon/utils/math/reduce.h"
namespace dragon {
namespace math {
namespace {
template <typename T>
void _AffineChannel(
const int N,
const int C,
const T* x,
const T* scale,
const T* bias,
T* y) {
EigenArrayMap<T> Y(y, C, N);
ConstEigenArrayMap<T> X(x, C, N);
Y = X.colwise() * ConstEigenVectorArrayMap<T>(scale, C);
if (bias != nullptr) {
Y.colwise() += ConstEigenVectorArrayMap<T>(bias, C);
}
}
template <typename T>
void _AffineChannel(
const int N,
const int C,
const int S,
const T* x,
const T* scale,
const T* bias,
T* y) {
const auto CxS = C * S;
for (int i = 0; i < N; ++i) {
EigenArrayMap<T> Y(y + i * CxS, S, C);
ConstEigenArrayMap<T> X(x + i * CxS, S, C);
Y = X.rowwise() * ConstEigenVectorArrayMap<T>(scale, C).transpose();
if (bias != nullptr) {
Y.rowwise() += ConstEigenVectorArrayMap<T>(bias, C).transpose();
}
}
}
template <typename T>
void _AffineImpl(
const int num_dims,
const int64_t* dims,
const int num_axes,
const int64_t* axes,
const T* x,
const T* scale,
const T* bias,
T* y) {
if (num_dims == 2 && num_axes == 1 && axes[0] == 1) {
_AffineChannel(dims[0], dims[1], x, scale, bias, y);
} else if (num_dims == 3 && num_axes == 1 && axes[0] == 1) {
_AffineChannel(dims[0], dims[1], dims[2], x, scale, bias, y);
} else {
LOG(FATAL) << "Unsupported affine dimensions.";
}
}
} // namespace
template <>
void Affine<float16, CPUContext>(
const int num_dims,
const int64_t* dims,
const int num_axes,
const int64_t* axes,
const float16* x,
const float16* scale,
const float16* bias,
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
#define DEFINE_AFFINE_FUNC(T) \
template <> \
void Affine<T, CPUContext>( \
const int num_dims, \
const int64_t* dims, \
const int num_axes, \
const int64_t* axes, \
const T* x, \
const T* scale, \
const T* bias, \
T* y, \
CPUContext* ctx) { \
vec64_t new_dims, new_axes; \
math::utils::CollapseReduceAxes( \
num_dims, dims, num_axes, axes, new_dims, new_axes); \
_AffineImpl( \
new_dims.size(), \
new_dims.data(), \
new_axes.size(), \
new_axes.data(), \
x, \
scale, \
bias, \
y); \
}
DEFINE_AFFINE_FUNC(float);
DEFINE_AFFINE_FUNC(double);
#undef DEFINE_AFFINE_FUNC
} // namespace math
} // namespace dragon
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/math/functional.h"
#include "dragon/utils/math/reduce.h"
#include "dragon/utils/math/transform.h"
#include "dragon/utils/math/types.h"
#include "dragon/utils/math/utils.h"
namespace dragon {
namespace math {
namespace {
template <typename T>
__global__ void _AffineChannel(
const int NxC,
const int C,
const T* x,
const T* scale,
const T* bias,
T* y) {
auto op3 = math::FMAFunctor<T>();
auto op2 = math::MultipliesFunctor<T>();
CUDA_1D_KERNEL_LOOP(i, NxC) {
if (bias != nullptr) {
y[i] = op3(x[i], __ldg(scale + i % C), __ldg(bias + i % C));
} else {
y[i] = op2(x[i], __ldg(scale + i % C));
}
}
}
template <typename T>
__global__ void _AffineChannel(
const int NxCxS,
const int C,
const int S,
const T* x,
const T* scale,
const T* bias,
T* y) {
auto op3 = math::FMAFunctor<T>();
auto op2 = math::MultipliesFunctor<T>();
CUDA_1D_KERNEL_LOOP(i, NxCxS) {
const int j = (i / S) % C;
if (bias != nullptr) {
y[i] = op3(x[i], __ldg(scale + j), __ldg(bias + j));
} else {
y[i] = op2(x[i], __ldg(scale + j));
}
}
}
template <typename T>
void _AffineImpl(
const int num_dims,
const int64_t* dims,
const int num_axes,
const int64_t* axes,
const T* x,
const T* scale,
const T* bias,
T* y,
CUDAContext* ctx) {
const auto N = math::utils::Prod(num_dims, dims);
if (num_dims == 2 && num_axes == 1 && axes[0] == 1) {
_AffineChannel<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, dims[1], x, scale, bias, y);
} else if (num_dims == 3 && num_axes == 1 && axes[0] == 1) {
_AffineChannel<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, dims[1], dims[2], x, scale, bias, y);
} else {
LOG(FATAL) << "Unsupported affine dimensions.";
}
}
} // namespace
#define DEFINE_AFFINE_FUNC(T) \
template <> \
void Affine<T, CUDAContext>( \
const int num_dims, \
const int64_t* dims, \
const int num_axes, \
const int64_t* axes, \
const T* x, \
const T* scale, \
const T* bias, \
T* y, \
CUDAContext* ctx) { \
vec64_t new_dims, new_axes; \
math::utils::CollapseReduceAxes( \
num_dims, dims, num_axes, axes, new_dims, new_axes); \
_AffineImpl( \
new_dims.size(), \
new_dims.data(), \
new_axes.size(), \
new_axes.data(), \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<const math::ScalarType<T>::type*>(scale), \
reinterpret_cast<const math::ScalarType<T>::type*>(bias), \
reinterpret_cast<math::ScalarType<T>::type*>(y), \
ctx); \
}
DEFINE_AFFINE_FUNC(float);
DEFINE_AFFINE_FUNC(float16);
DEFINE_AFFINE_FUNC(double);
#undef DEFINE_AFFINE_FUNC
} // namespace math
} // namespace dragon
#endif // USE_CUDA
/*!
* Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
*
* Licensed under the BSD 2-Clause License.
* You should have received a copy of the BSD 2-Clause License
* along with the software. If not, See,
*
* <https://opensource.org/licenses/BSD-2-Clause>
*
* ------------------------------------------------------------
*/
#ifndef DRAGON_UTILS_MATH_TRANSFORM_H_
#define DRAGON_UTILS_MATH_TRANSFORM_H_
#include "dragon/core/context.h"
namespace dragon {
namespace math {
template <typename T, class Context>
DRAGON_API void Affine(
const int num_dims,
const int64_t* dims,
const int num_axes,
const int64_t* axes,
const T* x,
const T* scale,
const T* bias,
T* y,
Context* ctx);
} // namespace math
} // namespace dragon
#endif // DRAGON_UTILS_MATH_TRANSFORM_H_
......@@ -141,8 +141,7 @@ void _TransposeImpl(
CUDAContext* ctx) {
auto aligned_size = sizeof(T);
if (axes.back() == D - 1) {
const auto N = math::utils::Prod(D, dims.data());
aligned_size = utils::GetAlignedSize<T, 16>(N, x, y);
aligned_size = utils::GetAlignedSize<T, 16>(dims[D - 1], x, y);
}
SimpleArray<int, D> X_dims, X_strides, Y_dims;
for (int i = 0; i < D; ++i) {
......
......@@ -27,6 +27,7 @@ template <typename T>
class ScalarType {
public:
typedef T type;
typedef T type2;
};
#if defined(__CUDACC__)
......@@ -34,6 +35,7 @@ template <>
class ScalarType<float16> {
public:
typedef half type;
typedef half2 type2;
};
#endif
......
......@@ -16,9 +16,9 @@
#include "dragon/utils/conversions.h"
#if defined(__CUDACC__)
#define MATH_UTILS_DECL inline __host__ __device__
#define HOSTDEVICE_DECL inline __host__ __device__
#else
#define MATH_UTILS_DECL inline
#define HOSTDEVICE_DECL inline
#endif
#define FIXED_DIVISOR_DIV_MOD(d, n, q, r) \
......@@ -41,28 +41,28 @@ namespace utils {
template <
typename T,
typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
MATH_UTILS_DECL T IsInf(const T x) {
HOSTDEVICE_DECL T IsInf(const T x) {
return false;
}
template <
typename T,
typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
MATH_UTILS_DECL T IsNaN(const T x) {
HOSTDEVICE_DECL T IsNaN(const T x) {
return false;
}
template <
typename T,
typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
MATH_UTILS_DECL T IsFinite(const T x) {
HOSTDEVICE_DECL T IsFinite(const T x) {
return true;
}
template <
typename T,
typename std::enable_if<std::is_floating_point<T>::value, int>::type = 0>
MATH_UTILS_DECL bool IsInf(T x) {
HOSTDEVICE_DECL bool IsInf(T x) {
#if defined(__CUDACC__)
return isinf(x);
#else
......@@ -73,7 +73,7 @@ MATH_UTILS_DECL bool IsInf(T x) {
template <
typename T,
typename std::enable_if<std::is_floating_point<T>::value, int>::type = 0>
MATH_UTILS_DECL bool IsNaN(T x) {
HOSTDEVICE_DECL bool IsNaN(T x) {
#if defined(__CUDACC__)
return isnan(x);
#else
......@@ -84,7 +84,7 @@ MATH_UTILS_DECL bool IsNaN(T x) {
template <
typename T,
typename std::enable_if<std::is_floating_point<T>::value, int>::type = 0>
MATH_UTILS_DECL bool IsFinite(T x) {
HOSTDEVICE_DECL bool IsFinite(T x) {
#if defined(__CUDACC__)
return isfinite(x);
#else
......@@ -106,27 +106,27 @@ inline bool IsFinite(float16 x) {
}
template <typename T>
MATH_UTILS_DECL bool IsAGeZeroAndALtB(const T a, const T b) {
HOSTDEVICE_DECL bool IsAGeZeroAndALtB(const T a, const T b) {
return static_cast<unsigned int>(a) < static_cast<unsigned int>(b);
}
template <typename T>
MATH_UTILS_DECL T Sign(const T x) {
HOSTDEVICE_DECL T Sign(const T x) {
return x > T(0) ? T(1) : (x < T(0) ? T(-1) : T(0));
}
template <typename T>
MATH_UTILS_DECL T Identity(const T x) {
HOSTDEVICE_DECL T Identity(const T x) {
return x;
}
template <typename T>
MATH_UTILS_DECL T Square(const T x) {
HOSTDEVICE_DECL T Square(const T x) {
return x * x;
}
template <typename T>
MATH_UTILS_DECL T Cube(const T x) {
HOSTDEVICE_DECL T Cube(const T x) {
return x * x * x;
}
......@@ -247,4 +247,6 @@ void IncreaseIndexInDims(const int num_dims, const DimT* dims, IndexT* index) {
} // namespace dragon
#undef HOSTDEVICE_DECL
#endif // DRAGON_UTILS_MATH_UTILS_H_
......@@ -21,6 +21,7 @@
#include "dragon/utils/math/random.h"
#include "dragon/utils/math/reduce.h"
#include "dragon/utils/math/sort.h"
#include "dragon/utils/math/transform.h"
#include "dragon/utils/math/transpose.h"
#include "dragon/utils/math/types.h"
#include "dragon/utils/math/utils.h"
......
......@@ -284,39 +284,6 @@ void BooleanMaskGrad(
Context* ctx);
template <typename T, class Context>
void ChannelAffine(
const int N,
const int S,
const int C,
const T* x,
const T* scale,
const T* bias,
T* y,
Context* ctx);
template <typename InputT, typename OutputT, class Context>
void ChannelNormalize(
const int axis,
const int num_dims,
const int64_t* x_strides,
const int64_t* y_dims,
const InputT* x,
const float* mean,
const float* std,
OutputT* y,
Context* ctx);
template <typename T, class Context>
void ChannelShuffle(
const int N,
const int S,
const int C,
const int G,
const T* x,
T* y,
Context* ctx);
template <typename T, class Context>
void ConstPad(
const int num_dims,
const int64_t* x_dims,
......@@ -813,6 +780,18 @@ void TopK(
* NormalizationOp Kernels
*/
template <typename InputT, typename OutputT, class Context>
void ChannelNorm(
const int axis,
const int num_dims,
const int64_t* x_strides,
const int64_t* y_dims,
const InputT* x,
const float* mean,
const float* std,
OutputT* y,
Context* ctx);
template <typename T, typename AccT, class Context>
void BatchNormExpectation(
const int N,
......@@ -923,7 +902,7 @@ void GroupNormGrad(
Context* ctx);
template <typename T, class Context>
void L1Normalize(
void L1Norm(
const int N,
const int S,
const int C,
......@@ -934,7 +913,7 @@ void L1Normalize(
Context* ctx);
template <typename T, class Context>
void L1NormalizeGrad(
void L1NormGrad(
const int N,
const int S,
const int C,
......@@ -946,7 +925,7 @@ void L1NormalizeGrad(
Context* ctx);
template <typename T, class Context>
void L2Normalize(
void L2Norm(
const int N,
const int S,
const int C,
......@@ -957,7 +936,7 @@ void L2Normalize(
Context* ctx);
template <typename T, class Context>
void L2NormalizeGrad(
void L2NormGrad(
const int N,
const int S,
const int C,
......@@ -1012,19 +991,23 @@ void LSTMCellGrad(
* TrainingOp Kernels
*/
template <typename T, class Context>
template <typename T, typename CopyT, class Context>
void Adam(
const int N,
const float lr,
const float beta1,
const float beta2,
const float eps,
T* g,
const float wd,
const T* x,
const T* g,
T* m,
T* v,
T* y,
CopyT* y_copy,
Context* ctx);
template <typename T, class Context>
template <typename T, typename CopyT, class Context>
void AdamW(
const int N,
const float lr,
......@@ -1033,39 +1016,53 @@ void AdamW(
const float eps,
const float wd,
const T* x,
T* g,
const T* g,
T* m,
T* v,
T* y,
CopyT* y_copy,
Context* ctx);
template <typename T, class Context>
template <typename T, typename CopyT, class Context>
void MomentumSGD(
const int N,
const float lr,
const float momentum,
T* g,
const float wd,
const T* x,
const T* g,
T* m,
T* y,
CopyT* y_copy,
Context* ctx);
template <typename T, class Context>
template <typename T, typename CopyT, class Context>
void NesterovSGD(
const int N,
const float lr,
const float momentum,
T* g,
const float wd,
const T* x,
const T* g,
T* m,
T* y,
CopyT* y_copy,
Context* ctx);
template <typename T, class Context>
template <typename T, typename CopyT, class Context>
void RMSprop(
const int N,
const float lr,
const float momentum,
const float decay,
const float alpha,
const float eps,
T* g,
const float wd,
const T* x,
const T* g,
T* m,
T* v,
T* y,
CopyT* y_copy,
Context* ctx);
/*
......
......@@ -18,6 +18,7 @@ from dragon.vm.tensorflow.core.ops.math_ops import add
from dragon.vm.tensorflow.core.ops.math_ops import add_n
from dragon.vm.tensorflow.core.ops.math_ops import argmax
from dragon.vm.tensorflow.core.ops.math_ops import argmin
from dragon.vm.tensorflow.core.ops.math_ops import atan2
from dragon.vm.tensorflow.core.ops.math_ops import cast
from dragon.vm.tensorflow.core.ops.math_ops import ceil
from dragon.vm.tensorflow.core.ops.math_ops import cos
......
......@@ -27,9 +27,11 @@ class Adam(optimizer.Optimizer):
The **Adam** update is defined as:
.. math::
\text{Adam}(g) = \text{lr} * \frac{m_{t}}{\sqrt{v_{t}} + \epsilon} \\
\text{Adam}(g) = \text{lr} * (\frac{\text{correction}* m_{t}}
{\sqrt{v_{t}} + \epsilon}) \\
\quad \\ \text{where}\quad
\begin{cases}
\text{correction} = \sqrt{1 - \beta_{2}^{t}} / (1 - \beta_{1}^{t}) \\
m_{t} = \beta_{1} * m_{t-1} + (1 - \beta_{1}) * g \\
v_{t} = \beta_{2} * v_{t-1} + (1 - \beta_{2}) * g^{2}
\end{cases}
......
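For reference, a minimal NumPy sketch of the bias-corrected update written out by the formulas above (the placement of the fused `wd` term is an assumption drawn from the commit summary, not from the kernel source; the function and argument names are illustrative only):
```python
import numpy as np

def adam_step(x, g, m, v, t, lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-8, wd=0.0):
    # Illustrative sketch only; the L2-style weight decay term is an assumption.
    g = g + wd * x                           # fused weight decay (assumed L2 form)
    m = beta1 * m + (1.0 - beta1) * g        # first moment m_t
    v = beta2 * v + (1.0 - beta2) * g ** 2   # second moment v_t
    correction = np.sqrt(1.0 - beta2 ** t) / (1.0 - beta1 ** t)
    x = x - lr * correction * m / (np.sqrt(v) + eps)
    return x, m, v
```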
......@@ -61,8 +61,8 @@ class RMSprop(optimizer.Optimizer):
super(RMSprop, self).__init__(name, **kwargs)
self._set_hyper('lr', learning_rate)
self._set_hyper('momentum', momentum)
self._set_hyper('decay', rho)
self._set_hyper('alpha', rho)
self._set_hyper('eps', epsilon)
self._hyper_aliases['learning_rate'] = 'lr'
self._hyper_aliases['rho'] = 'decay'
self._hyper_aliases['rho'] = 'alpha'
self._hyper_aliases['eps'] = 'epsilon'
......@@ -184,6 +184,35 @@ def argmin(input, axis=None, name=None):
return math_ops.argmin(input, axis=axis, name=name)
def atan2(y, x, name=None):
r"""Compute the element-wise arc-tangent of two arguments.
.. math:: \text{out} = \text{arctan}(\frac{\text{input1}}{\text{input2}})
```python
y = tf.constant(1.)
x = tf.constant(2.)
print(tf.math.atan2(y, x)) # 0.46364761
```
Parameters
----------
y : dragon.Tensor
The input1 tensor.
x : dragon.Tensor
The input2 tensor.
name : str, optional
The operation name.
Returns
-------
dragon.Tensor
The output tensor.
"""
return math_ops.atan2([y, x], name=name)
def cast(x, dtype, name=None):
"""Cast the data type of input.
......
......@@ -129,13 +129,8 @@ def l2_normalize(x, axis=None, epsilon=1e-12, name=None):
The output tensor.
"""
return normalization_ops.lp_normalize(
x,
p=2,
axis=axis,
epsilon=epsilon,
name=name,
)
return normalization_ops.lp_norm(
x, p=2, axis=axis, epsilon=epsilon, name=name)
def moments(x, axes=None, keepdims=False, name=None):
......
......@@ -501,8 +501,8 @@ class TestOpSpecWithTensorDesc(unittest.TestCase):
self.assertEqual(dragon.broadcast_to(
self.sym2, shape=self.shape1).shape, (None,) * len(self.sym2.shape))
def test_channel_normalize(self):
func = functools.partial(dragon.channel_normalize,
def test_channel_norm(self):
func = functools.partial(dragon.nn.channel_norm,
mean=(1., 1., 1.), std=(1., 1., 1.))
with dragon.graph_mode():
self.assertEqual(func(self.sym1).shape, None)
......
......@@ -31,6 +31,9 @@ class TestCUDA(unittest.TestCase):
stream.synchronize()
dragon.cuda.synchronize()
def test_cublas(self):
dragon.cuda.set_cublas_flags()
def test_cudnn(self):
dragon.cuda.set_cudnn_flags()
......
......@@ -572,51 +572,6 @@ class TestArrayOps(OpTestCase):
with dragon.device('cuda'):
self.test_cast()
def test_channel_affine(self):
for execution in ('EAGER_MODE', 'GRAPH_MODE'):
with execution_context().mode(execution):
data1 = arange((2, 3, 4, 5))
data2, data3 = arange((3, 4)), arange((3, 4))
data4 = arange(data1.shape)
grad1 = data4 * np.expand_dims(data2, -1)
grad2 = np.sum(data4 * data1, (0, 3))
grad3 = np.sum(data4, (0, 3))
x, w, b = new_tensor(data1), new_tensor(data2), new_tensor(data3)
with dragon.GradientTape() as tape:
tape.watch([x, w, b])
y = dragon.channel_affine([x, w, b], axis=1, end_axis=2)
dy = new_tensor(data4)
dx, dw, db = tape.gradient(y, [x, w, b], output_gradients=[dy])
self.assertEqual(
[y, dx, dw, db],
[data1 * np.expand_dims(data2, -1) +
np.expand_dims(data3, -1),
grad1, grad2, grad3])
@unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
def test_channel_affine_cuda(self):
with dragon.device('cuda'):
self.test_channel_affine()
def test_channel_normalize(self):
entries = [((2, 3, 4), [(1., 2., 3.), (3., 2., 1.), 1], {'perm': (0, 1, 2)}),
((2, 3, 4), [(1., 2., 3.), (3., 2., 1.), 2], {'perm': (0, 2, 1)})]
for execution in ('EAGER_MODE', 'GRAPH_MODE'):
with execution_context().mode(execution):
for shape, args, kwargs in entries:
perm = kwargs['perm']
data = np.ones(shape, dtype='uint8').transpose(perm)
mean = np.array(args[0]).reshape((1, 3, 1)).transpose(perm)
std = np.array(args[1]).reshape((1, 3, 1)).transpose(perm)
x = dragon.ones(shape, dtype='uint8')
y = dragon.channel_normalize(x, *args, **kwargs)
self.assertEqual(y, (data - mean) / std)
@unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
def test_channel_normalize_cuda(self):
with dragon.device('cuda'):
self.test_channel_normalize()
def test_channel_shuffle(self):
entries = [(0, 2), (1, 4)]
for execution in ('EAGER_MODE', 'GRAPH_MODE'):
......@@ -630,7 +585,7 @@ class TestArrayOps(OpTestCase):
x, dy = new_tensor(data), new_tensor(data)
with dragon.GradientTape() as tape:
tape.watch(x)
y = dragon.channel_shuffle(x, axis, group)
y = dragon.nn.channel_shuffle(x, axis, group)
dx = tape.gradient(y, [x], output_gradients=[dy])[0]
self.assertEqual(
[y, dx], [data.reshape(shape1).transpose(perm).reshape(data.shape),
......@@ -1676,6 +1631,32 @@ class TestMathOps(OpTestCase):
with dragon.device('cuda'):
self.test_add()
def test_affine(self):
for execution in ('EAGER_MODE', 'GRAPH_MODE'):
with execution_context().mode(execution):
data1 = arange((2, 3, 4, 5))
data2, data3 = arange((3, 4)), arange((3, 4))
data4 = arange(data1.shape)
grad1 = data4 * np.expand_dims(data2, -1)
grad2 = np.sum(data4 * data1, (0, 3))
grad3 = np.sum(data4, (0, 3))
x, w, b = new_tensor(data1), new_tensor(data2), new_tensor(data3)
with dragon.GradientTape() as tape:
tape.watch([x, w, b])
y = dragon.math.affine([x, w, b], axis=(1, 2))
dy = new_tensor(data4)
dx, dw, db = tape.gradient(y, [x, w, b], output_gradients=[dy])
self.assertEqual(
[y, dx, dw, db],
[data1 * np.expand_dims(data2, -1) +
np.expand_dims(data3, -1),
grad1, grad2, grad3])
@unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
def test_affine_cuda(self):
with dragon.device('cuda'):
self.test_affine()
def test_argmax(self):
entries = [(0, True), (0, False), (1, True), (1, False)]
for execution in ('EAGER_MODE', 'GRAPH_MODE'):
......@@ -1712,6 +1693,20 @@ class TestMathOps(OpTestCase):
with dragon.device('cuda'):
self.test_argmin()
def test_atan2(self):
for execution in ('EAGER_MODE', 'GRAPH_MODE'):
with execution_context().mode(execution):
for a_shape, b_shape in self.binary_test_shapes:
data1, data2 = arange(a_shape), arange(b_shape, 1)
a, b = new_tensor(data1), new_tensor(data2)
y = dragon.math.atan2([a, b])
self.assertEqual(y, np.arctan2(data1, data2))
@unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
def test_atan2_cuda(self):
with dragon.device('cuda'):
self.test_atan2()
def test_bitwise_and(self):
for execution in ('EAGER_MODE', 'GRAPH_MODE'):
with execution_context().mode(execution):
......@@ -2738,6 +2733,25 @@ class TestNormalizationOps(OpTestCase):
with dragon.device('cuda'), self.cudnn_ws.as_default():
self.test_batch_norm()
def test_channel_norm(self):
entries = [((2, 3, 4), [(1., 2., 3.), (3., 2., 1.), 1], {'perm': (0, 1, 2)}),
((2, 3, 4), [(1., 2., 3.), (3., 2., 1.), 2], {'perm': (0, 2, 1)})]
for execution in ('EAGER_MODE', 'GRAPH_MODE'):
with execution_context().mode(execution):
for shape, args, kwargs in entries:
perm = kwargs['perm']
data = np.ones(shape, dtype='uint8').transpose(perm)
mean = np.array(args[0]).reshape((1, 3, 1)).transpose(perm)
std = np.array(args[1]).reshape((1, 3, 1)).transpose(perm)
x = dragon.ones(shape, dtype='uint8')
y = dragon.nn.channel_norm(x, *args, **kwargs)
self.assertEqual(y, (data - mean) / std)
@unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
def test_channel_norm_cuda(self):
with dragon.device('cuda'):
self.test_channel_norm()
def test_group_norm(self):
eps = 1e-5
entries = [((1, 4), (4,), -1, 2, (2,)),
......@@ -2904,7 +2918,7 @@ class TestNormalizationOps(OpTestCase):
with dragon.device('cuda'), self.cudnn_ws.as_default():
self.test_local_response_norm(test_cudnn=True, prec=1e-2)
def test_lp_normalize(self):
def test_lp_norm(self):
entries = [(0, 1, 1e-12, 'sum'),
(0, 1, 1e-12, 'mean'),
(0, 2, 1e-12, 'sum'),
......@@ -2921,7 +2935,7 @@ class TestNormalizationOps(OpTestCase):
x, dy = new_tensor(data1), new_tensor(data2)
with dragon.GradientTape() as tape:
tape.watch(x)
y = dragon.math.lp_normalize(
y = dragon.nn.lp_norm(
x, axis, p=p, epsilon=eps, reduction=reduction)
dx = tape.gradient(y, [x], output_gradients=[dy])[0]
norm = np.abs(data1) if p == 1 else np.square(data1)
......@@ -2930,9 +2944,9 @@ class TestNormalizationOps(OpTestCase):
self.assertEqual([y, dx], [data1 / max(norm, eps), grad])
@unittest.skipIf(not TEST_CUDA, 'CUDA unavailable')
def test_lp_normalize_cuda(self):
def test_lp_norm_cuda(self):
with dragon.device('cuda'):
self.test_lp_normalize()
self.test_lp_norm()
class TestRNNOps(OpTestCase):
......@@ -3028,7 +3042,7 @@ class TestTrainingOps(OpTestCase):
def test_rmsprop_update(self):
with execution_context().mode('EAGER_MODE'):
momentum, lr = self.rmsprop.momentum, self.rmsprop.lr
decay, eps = self.rmsprop.decay, self.rmsprop.eps
alpha, eps = self.rmsprop.alpha, self.rmsprop.eps
data1 = uniform((2, 3))
data2, data3 = np.zeros((2, 3), 'float32'), np.zeros((2, 3), 'float32')
param = new_tensor(data1)
......@@ -3036,7 +3050,7 @@ class TestTrainingOps(OpTestCase):
data4 = uniform((2, 3))
grad = new_tensor(data4)
self.rmsprop.apply_gradients([[grad, param]])
data3 = decay * data3 + (1 - decay) * np.square(data4)
data3 = alpha * data3 + (1 - alpha) * np.square(data4)
data2 = momentum * data2 + (data4 / (np.sqrt(data3) + eps))
data1 -= lr * data2
self.assertEqual(param, data1)
......
......@@ -20,6 +20,17 @@ from dragon.core.testing.unittest.common_utils import run_tests
from dragon.vm import torch
class TestCUDA(unittest.TestCase):
"""Test the CUDA backend."""
def test_library(self):
_ = torch.backends.cuda.is_built()
def test_set_flags(self):
torch.backends.cuda.matmul.allow_tf32 = False
self.assertEqual(torch.backends.cuda.matmul.allow_tf32, False)
class TestCuDNN(unittest.TestCase):
"""Test the CuDNN backend."""
......
......@@ -169,7 +169,7 @@ class TestModule(unittest.TestCase):
class TestModules(OpTestCase):
"""Test the nn module class."""
def test_affine_channel(self):
def test_affine(self):
data1 = arange((2, 3, 4, 5))
data2, data3 = arange((1, 3, 1, 1)), arange((1, 3, 1, 1))
w, b = new_tensor(data2.flatten()), new_tensor(data3.flatten())
......@@ -181,21 +181,19 @@ class TestModules(OpTestCase):
for bias, fix_weight, fix_bias in entries:
x = new_tensor(data1)
try:
m = torch.nn.AffineChannel(
m = torch.nn.Affine(
num_features=3,
bias=bias,
fix_weight=fix_weight,
fix_bias=fix_bias,
inplace=True,
)
inplace=True)
except ValueError:
m = torch.nn.AffineChannel(
m = torch.nn.Affine(
num_features=3,
bias=bias,
fix_weight=fix_weight,
fix_bias=fix_bias,
inplace=False,
)
inplace=False)
m.weight.copy_(w)
result = data1 * data2
if bias:
......@@ -262,6 +260,18 @@ class TestModules(OpTestCase):
y, _ = m(x), repr(m)
self.assertEqual(y, result)
def test_channel_norm(self):
entries = [((2, 3, 4), [(1., 2., 3.), (3., 2., 1.), 1], {'dims': (0, 1, 2)}),
((2, 3, 4), [(1., 2., 3.), (3., 2., 1.), 2], {'dims': (0, 2, 1)})]
for shape, args, kwargs in entries:
perm = kwargs['dims']
data = np.ones(shape, dtype='uint8').transpose(perm)
mean = np.array(args[0]).reshape((1, 3, 1)).transpose(perm)
std = np.array(args[1]).reshape((1, 3, 1)).transpose(perm)
x = torch.ones(shape, dtype='uint8')
y = torch.nn.functional.channel_norm(x, *args, **kwargs)
self.assertEqual(y, (data - mean) / std)
def test_channel_shuffle(self):
entries = [(1, 4)]
for axis, group in entries:
......
......@@ -127,6 +127,12 @@ class TestTensorOps(OpTestCase):
result = np.expand_dims(result, axis)
self.assertEqual(x.argmin(axis, keepdims), result)
def test_atan2(self):
for a_shape, b_shape in self.binary_test_shapes:
data1, data2 = arange(a_shape), arange(b_shape, 1)
a, b = new_tensor(data1, False), new_tensor(data2, False)
self.assertEqual(a.atan2(b), np.arctan2(data1, data2))
def test_baddbmm(self):
entries = [((2, 2, 3), (2, 3, 4), (2, 2, 4))]
for a_shape, b_shape, c_shape in entries:
......@@ -944,18 +950,6 @@ class TestTorchOps(OpTestCase):
y = torch.cat([x, x], dim=axis)
self.assertEqual(y, np.concatenate([data, data], axis=axis))
def test_channel_normalize(self):
entries = [((2, 3, 4), [(1., 2., 3.), (3., 2., 1.), 1], {'dims': (0, 1, 2)}),
((2, 3, 4), [(1., 2., 3.), (3., 2., 1.), 2], {'dims': (0, 2, 1)})]
for shape, args, kwargs in entries:
perm = kwargs['dims']
data = np.ones(shape, dtype='uint8').transpose(perm)
mean = np.array(args[0]).reshape((1, 3, 1)).transpose(perm)
std = np.array(args[1]).reshape((1, 3, 1)).transpose(perm)
x = torch.ones(shape, dtype='uint8')
y = torch.channel_normalize(x, *args, **kwargs)
self.assertEqual(y, (data - mean) / std)
def test_linspace(self):
entries = [([[0., 5.], [10., 40.], 5], {'dim': 0, 'dtype': 'float32'}),
([[0., 5.], [10., 40.], 5], {'dim': 1, 'dtype': 'float32'}),
......
......@@ -49,8 +49,6 @@ from dragon.vm.torch.core.tensor import Tensor
from dragon.vm.torch.core.ops import tensor_ops as _
from dragon.vm.torch.core.ops.array_ops import broadcast_to
from dragon.vm.torch.core.ops.array_ops import cat
from dragon.vm.torch.core.ops.array_ops import channel_affine
from dragon.vm.torch.core.ops.array_ops import channel_normalize
from dragon.vm.torch.core.ops.array_ops import chunk
from dragon.vm.torch.core.ops.array_ops import flatten
from dragon.vm.torch.core.ops.array_ops import flip
......@@ -71,7 +69,6 @@ from dragon.vm.torch.core.ops.array_ops import scatter_add
from dragon.vm.torch.core.ops.array_ops import split
from dragon.vm.torch.core.ops.array_ops import squeeze
from dragon.vm.torch.core.ops.array_ops import stack
from dragon.vm.torch.core.ops.math_ops import sum
from dragon.vm.torch.core.ops.array_ops import tile
from dragon.vm.torch.core.ops.array_ops import transpose
from dragon.vm.torch.core.ops.array_ops import tril
......@@ -97,6 +94,7 @@ from dragon.vm.torch.core.ops.math_ops import add
from dragon.vm.torch.core.ops.math_ops import addmm
from dragon.vm.torch.core.ops.math_ops import argmax
from dragon.vm.torch.core.ops.math_ops import argmin
from dragon.vm.torch.core.ops.math_ops import atan2
from dragon.vm.torch.core.ops.math_ops import baddbmm
from dragon.vm.torch.core.ops.math_ops import bitwise_and
from dragon.vm.torch.core.ops.math_ops import bitwise_not
......@@ -144,6 +142,7 @@ from dragon.vm.torch.core.ops.math_ops import sin
from dragon.vm.torch.core.ops.math_ops import sqrt
from dragon.vm.torch.core.ops.math_ops import square
from dragon.vm.torch.core.ops.math_ops import sub
from dragon.vm.torch.core.ops.math_ops import sum
from dragon.vm.torch.core.ops.random_ops import normal
from dragon.vm.torch.core.ops.random_ops import rand
from dragon.vm.torch.core.ops.random_ops import randn
......
......@@ -15,6 +15,7 @@ from __future__ import division as _division
from __future__ import print_function as _print_function
# Modules
from dragon.vm.torch.core.backends import cuda
from dragon.vm.torch.core.backends import cudnn
__all__ = [_s for _s in dir() if not _s.startswith('_')]
......@@ -56,6 +56,7 @@ from dragon.vm.torch.core.nn.modules.dropout import Dropout
from dragon.vm.torch.core.nn.modules.dropout import DropPath
from dragon.vm.torch.core.nn.modules.flatten import Flatten
from dragon.vm.torch.core.nn.modules.fold import Unfold
from dragon.vm.torch.core.nn.modules.linear import Affine
from dragon.vm.torch.core.nn.modules.linear import Identity
from dragon.vm.torch.core.nn.modules.linear import Linear
from dragon.vm.torch.core.nn.modules.loss import CTCLoss
......@@ -68,7 +69,6 @@ from dragon.vm.torch.core.nn.modules.loss import NLLLoss
from dragon.vm.torch.core.nn.modules.loss import SigmoidFocalLoss
from dragon.vm.torch.core.nn.modules.loss import SmoothL1Loss
from dragon.vm.torch.core.nn.modules.module import Module
from dragon.vm.torch.core.nn.modules.normalization import AffineChannel
from dragon.vm.torch.core.nn.modules.normalization import GroupNorm
from dragon.vm.torch.core.nn.modules.normalization import LayerNorm
from dragon.vm.torch.core.nn.modules.normalization import LocalResponseNorm
......
......@@ -20,11 +20,13 @@ from dragon.vm.torch.core.nn.functional import adaptive_avg_pool3d
from dragon.vm.torch.core.nn.functional import adaptive_max_pool1d
from dragon.vm.torch.core.nn.functional import adaptive_max_pool2d
from dragon.vm.torch.core.nn.functional import adaptive_max_pool3d
from dragon.vm.torch.core.nn.functional import affine
from dragon.vm.torch.core.nn.functional import avg_pool1d
from dragon.vm.torch.core.nn.functional import avg_pool2d
from dragon.vm.torch.core.nn.functional import avg_pool3d
from dragon.vm.torch.core.nn.functional import batch_norm
from dragon.vm.torch.core.nn.functional import binary_cross_entropy_with_logits
from dragon.vm.torch.core.nn.functional import channel_norm
from dragon.vm.torch.core.nn.functional import channel_shuffle
from dragon.vm.torch.core.nn.functional import conv1d
from dragon.vm.torch.core.nn.functional import conv2d
......
......@@ -173,6 +173,7 @@ class Function(object):
outputs_id.append(outputs[i].id)
else:
if isinstance(spec, Tensor):
spec._device = device.copy()
outputs.append(spec)
outputs_id.append(spec.id)
else:
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""CUDA backend."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.core.device import cuda
from dragon.core.framework import sysconfig
class CuBLASModule(object):
"""CuBLAS module class."""
def __init__(self):
self._allow_tf32 = False
@property
def allow_tf32(self):
"""The flag that allows cuBLAS TF32 math type or not."""
return self._allow_tf32
@allow_tf32.setter
def allow_tf32(self, value):
self._allow_tf32 = value
cuda.set_cublas_flags(allow_tf32=value)
def is_built():
"""Return a bool reporting if built with CUDA support.
Returns
-------
bool
``True`` if built, otherwise ``False``.
"""
version = sysconfig.get_build_info().get('cuda_version', None)
return version is not None
# Module instances.
matmul = CuBLASModule()
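A minimal usage sketch of the new flag, assuming the public `torch.backends.cuda` shim re-exports this module as the backends `__init__` above suggests:

```python
from dragon.vm.torch.backends import cuda  # path assumed from the __init__ above

print(cuda.is_built())         # True on a CUDA build
cuda.matmul.allow_tf32 = True  # forwards to set_cublas_flags(allow_tf32=True)
print(cuda.matmul.allow_tf32)  # True
```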
......@@ -37,7 +37,7 @@ class CuDNNModule(object):
@allow_tf32.setter
def allow_tf32(self, value):
self._allow_tf32 = value
self._set_flags()
cuda.set_cudnn_flags(allow_tf32=value)
@property
def benchmark(self):
......@@ -47,7 +47,7 @@ class CuDNNModule(object):
@benchmark.setter
def benchmark(self, value):
self._benchmark = value
self._set_flags()
cuda.set_cudnn_flags(benchmark=value)
@property
def deterministic(self):
......@@ -57,7 +57,7 @@ class CuDNNModule(object):
@deterministic.setter
def deterministic(self, value):
self._deterministic = value
self._set_flags()
cuda.set_cudnn_flags(deterministic=value)
@property
def enabled(self):
......@@ -67,7 +67,7 @@ class CuDNNModule(object):
@enabled.setter
def enabled(self, value):
self._enabled = value
self._set_flags()
cuda.set_cudnn_flags(enabled=value)
@staticmethod
def is_available():
......@@ -97,15 +97,6 @@ class CuDNNModule(object):
version = major * 1000 + minor * 100 + patch
return version
def _set_flags(self):
"""Set all flags with current value."""
cuda.set_cudnn_flags(
enabled=self._enabled,
benchmark=self._benchmark,
deterministic=self._deterministic,
allow_tf32=self._allow_tf32,
)
# Module instances.
sys.modules[__name__] = CuDNNModule()
......
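A small sketch of the behavior above: each setter now forwards only its own value to `cuda.set_cudnn_flags` instead of re-sending all four flags. The `torch.backends.cudnn` import path is assumed from the backends `__init__` shown earlier:

```python
from dragon.vm.torch.backends import cudnn  # path assumed from the __init__ above

cudnn.benchmark = True        # -> set_cudnn_flags(benchmark=True)
cudnn.deterministic = False   # -> set_cudnn_flags(deterministic=False)
cudnn.allow_tf32 = True       # -> set_cudnn_flags(allow_tf32=True)
```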
......@@ -170,8 +170,42 @@ def adaptive_max_pool3d(input, output_size):
return _pool('MAX', utils._triple, input, **args)
def affine(input, weight, bias=None, dim=-1, out=None):
r"""Apply affine transformation to input.
.. math:: \text{out} = \text{input} \times \text{weight} + \text{bias}
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
weight : dragon.vm.torch.Tensor
The weight tensor.
bias : dragon.vm.torch.Tensor, optional
The bias tensor.
dim : Union[int, Sequence[int]], optional
The dimension(s) to transform.
out : dragon.vm.torch.Tensor, optional
The output tensor.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.nn.Affine(...)`_
"""
return Function.apply(
'Affine', input.device,
[input, weight] + ([bias] if bias else []), outputs=[out],
axes=nest.flatten(dim))
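A hypothetical usage sketch of the functional `affine` above, scaling and shifting the channel dimension of an NCHW tensor; the `dragon.vm.torch.nn.functional` import path and the tensor helpers are assumed to be available as in the rest of this package:

```python
from dragon.vm import torch
from dragon.vm.torch.nn import functional as F

x = torch.ones(2, 5, 4, 4)    # (N, C, H, W)
w = torch.ones(5) * 2.0       # per-channel scale
b = torch.zeros(5)            # per-channel shift
y = F.affine(x, w, b, dim=1)  # every element becomes 1 * 2 + 0 = 2
```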
def avg_pool1d(input, kernel_size, stride=1, padding=0, ceil_mode=False):
r"""Apply the 1d average pooling to input.
"""Apply the 1d average pooling to input.
Parameters
----------
......@@ -200,7 +234,7 @@ def avg_pool1d(input, kernel_size, stride=1, padding=0, ceil_mode=False):
def avg_pool2d(input, kernel_size, stride=1, padding=0, ceil_mode=False):
r"""Apply the 2d average pooling to input.
"""Apply the 2d average pooling to input.
Parameters
----------
......@@ -229,7 +263,7 @@ def avg_pool2d(input, kernel_size, stride=1, padding=0, ceil_mode=False):
def avg_pool3d(input, kernel_size, stride=1, padding=0, ceil_mode=False):
r"""Apply the 3d average pooling to input.
"""Apply the 3d average pooling to input.
Parameters
----------
......@@ -267,7 +301,7 @@ def batch_norm(
momentum=0.1,
eps=1e-5,
):
r"""Apply the batch normalization to input.
"""Apply the batch normalization to input.
`[Ioffe & Szegedy, 2015] <https://arxiv.org/abs/1502.03167>`_.
Parameters
......@@ -315,7 +349,7 @@ def binary_cross_entropy_with_logits(
reduction='mean',
pos_weight=None,
):
r"""Compute the sigmoid cross entropy with contiguous target.
"""Compute the sigmoid cross entropy with contiguous target.
Parameters
----------
......@@ -353,6 +387,55 @@ def binary_cross_entropy_with_logits(
[input, target], reduction=reduction.upper())
def channel_norm(input, mean, std, dim=-1, dtype='float32', dims=None):
"""Apply the normalization to each channel of input.
:attr:`dim` can be negative:
```python
m = s = (1., 1., 1.)
x = torch.tensor([1, 2, 3])
print(nn.functional.channel_norm(x, m, s, dim=0)) # [0., 1., 2.]
print(nn.functional.channel_norm(x, m, s, dim=-1)) # Equivalent
```
If :attr:`dims` is provided, :attr:`dim` is selected from the output layout:
```python
m, s = (1., 2., 3.), (1., 1., 1.)
x = torch.tensor([[1, 2, 3]])
# 3 values are provided, but the normalized dimension of the
# output layout has length 1, so only the first value is taken
print(nn.functional.channel_norm(x, m, s, dims=(1, 0))) # [[0.], [1.], [2.]]
```
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
mean : Sequence[float], required
The mean to subtract.
std : Sequence[float], required
The standard deviation to divide.
dim : int, optional, default=-1
The channel dimension.
dtype : str, optional, default='float32'
The output data type.
dims : Sequence[int], optional
The order of output dimensions.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
return Function.apply(
'ChannelNorm', input.device, [input],
axis=dim, mean=mean, std=std, dtype=dtype,
ndim=len(dims) if dims is not None else 0, perm=dims)
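An illustrative preprocessing call for the `channel_norm` above: subtract a per-channel mean, divide by a per-channel std, and permute HWC to CHW in one op. The mean/std numbers are placeholders, not values taken from this repository:

```python
from dragon.vm import torch
from dragon.vm.torch.nn import functional as F

img = torch.ones(224, 224, 3)  # HWC input
out = F.channel_norm(
    img,
    mean=(103.53, 116.28, 123.675),
    std=(57.375, 57.12, 58.395),
    dim=0,                     # channel dimension in the *output* layout
    dims=(2, 0, 1),            # output layout: CHW -> shape (3, 224, 224)
)
```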
def channel_shuffle(input, groups):
"""Apply group shuffle to each channel of input.
`[Zhang et.al, 2017] <https://arxiv.org/abs/1707.01083>`_.
......@@ -387,7 +470,7 @@ def conv1d(
dilation=1,
groups=1,
):
r"""Apply the 1d convolution to input.
"""Apply the 1d convolution to input.
Parameters
----------
......@@ -428,7 +511,7 @@ def conv2d(
dilation=1,
groups=1,
):
r"""Apply the 2d convolution to input.
"""Apply the 2d convolution to input.
Parameters
----------
......@@ -469,7 +552,7 @@ def conv3d(
dilation=1,
groups=1,
):
r"""Apply the 3d convolution to input.
"""Apply the 3d convolution to input.
Parameters
----------
......@@ -511,7 +594,7 @@ def conv_transpose1d(
groups=1,
dilation=1,
):
r"""Apply the 1d deconvolution to input.
"""Apply the 1d deconvolution to input.
Parameters
----------
......@@ -555,7 +638,7 @@ def conv_transpose2d(
groups=1,
dilation=1,
):
r"""Apply the 2d deconvolution to input.
"""Apply the 2d deconvolution to input.
Parameters
----------
......@@ -599,7 +682,7 @@ def conv_transpose3d(
groups=1,
dilation=1,
):
r"""Apply the 3d deconvolution to input.
"""Apply the 3d deconvolution to input.
Parameters
----------
......@@ -747,7 +830,7 @@ def depthwise_conv2d(
padding=0,
dilation=1,
):
r"""Apply the 2d depthwise convolution to input.
"""Apply the 2d depthwise convolution to input.
Parameters
----------
......@@ -778,7 +861,7 @@ def depthwise_conv2d(
def dropout(input, p=0.5, training=True, inplace=False):
r"""Set the elements of the input to zero randomly.
"""Set the elements of the input to zero randomly.
`[Srivastava et.al, 2014] <http://jmlr.org/papers/v15/srivastava14a.html>`_.
Parameters
......@@ -810,7 +893,7 @@ def dropout(input, p=0.5, training=True, inplace=False):
def drop_block2d(input, p=0.5, block_size=1, training=True, inplace=False):
r"""Set the blocks over input to zero randomly.
"""Set the blocks over input to zero randomly.
Parameters
----------
......@@ -994,6 +1077,15 @@ def group_norm(input, num_groups, weight, bias, eps=1e-5):
def hardsigmoid(input, inplace=False):
r"""Apply the hard sigmoid function to input.
The **HardSigmoid** function is defined as:
.. math::
\text{Hardsigmoid}(x) = \begin{cases}
0 & \text{if~} x \le -3, \\
1 & \text{if~} x \ge +3, \\
x / 6 + 1 / 2 & \text{otherwise}
\end{cases}
Parameters
----------
input : dragon.vm.torch.Tensor
......@@ -1020,6 +1112,15 @@ def hardswish(input):
r"""Apply the hard swish function to input.
`[Howard et.al, 2019] <https://arxiv.org/abs/1905.02244>`_.
The **HardSwish** function is defined as:
.. math::
\text{HardSwish}(x) = \begin{cases}
0 & \text{if~} x \le -3, \\
x & \text{if~} x \ge +3, \\
x \cdot (x + 3) / 6 & \text{otherwise}
\end{cases}
Parameters
----------
input : dragon.vm.torch.Tensor
......@@ -1161,7 +1262,7 @@ def kl_div(
def l1_loss(input, target, size_average=None, reduce=None, reduction='mean'):
r"""Compute the element-wise absolute value difference.
"""Compute the element-wise absolute value difference.
Parameters
----------
......@@ -1196,7 +1297,7 @@ def l1_loss(input, target, size_average=None, reduce=None, reduction='mean'):
def layer_norm(input, normalized_shape, weight, bias, eps=1e-5):
r"""Apply the layer normalization to input.
"""Apply the layer normalization to input.
`[Ba et.al, 2016] <https://arxiv.org/abs/1607.06450>`_
Parameters
......@@ -1387,7 +1488,7 @@ def lstm_cell(input, cx):
def max_pool1d(input, kernel_size, stride=1, padding=0, ceil_mode=False):
r"""Apply the 1d max pooling to input.
"""Apply the 1d max pooling to input.
Parameters
----------
......@@ -1416,7 +1517,7 @@ def max_pool1d(input, kernel_size, stride=1, padding=0, ceil_mode=False):
def max_pool2d(input, kernel_size, stride=1, padding=0, ceil_mode=False):
r"""Apply the 2d max pooling to input.
"""Apply the 2d max pooling to input.
Parameters
----------
......@@ -1445,7 +1546,7 @@ def max_pool2d(input, kernel_size, stride=1, padding=0, ceil_mode=False):
def max_pool3d(input, kernel_size, stride=1, padding=0, ceil_mode=False):
r"""Apply the 3d max pooling to input.
"""Apply the 3d max pooling to input.
Parameters
----------
......@@ -1474,11 +1575,7 @@ def max_pool3d(input, kernel_size, stride=1, padding=0, ceil_mode=False):
def mse_loss(input, target, size_average=None, reduce=None, reduction='mean'):
r"""Compute the element-wise squared error.
The ``MSELoss`` function is defined as:
.. math:: \text{MSELoss}(x, y) = (x - y)^{2}
"""Compute the element-wise squared error.
Parameters
----------
......@@ -1726,7 +1823,7 @@ def normalize(input, p=2, dim=1, end_dim=None, eps=1e-12, out=None):
"""
return Function.apply(
'LpNormalize', input.device, [input], outputs=[out],
'LpNorm', input.device, [input], outputs=[out],
p=p, axis=dim, end_axis=end_dim, epsilon=eps, reduction='SUM')
......
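Only the dispatched operator name changes here (`LpNormalize` becomes `LpNorm`); the Python-level behavior of `normalize` is unchanged. A small sanity-check sketch, assuming the public `torch.nn.functional.normalize` shim:

```python
from dragon.vm import torch
from dragon.vm.torch.nn import functional as F

x = torch.tensor([[3., 4.]])
y = F.normalize(x, p=2, dim=1)  # L2-normalize each row: [[0.6, 0.8]]
```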
......@@ -19,9 +19,97 @@ import math
from dragon.vm.torch.core.nn import functional as F
from dragon.vm.torch.core.nn.modules.module import Module
from dragon.vm.torch.core.nn.parameter import Parameter
from dragon.vm.torch.core.ops import constant_ops
from dragon.vm.torch.core.tensor import Tensor
class Affine(Module):
"""Apply affine transformation.
Affine is often used as a post-processing step for normalization.
Examples:
```python
m = torch.nn.Affine(5)
# Apply a 2d transformation
x2d = torch.ones(3, 5)
y2d = m(x2d)
# Apply a 3d transformation
x3d = torch.ones(3, 5, 4)
y3d = m(x3d)
# Apply a 4d transformation
x4d = torch.ones(3, 5, 2, 2)
y4d = m(x4d)
```
See Also
--------
`torch.nn.functional.affine(...)`_
"""
def __init__(
self,
num_features,
bias=True,
fix_weight=False,
fix_bias=False,
inplace=False,
):
"""Create an ``AffineChannel`` module.
Parameters
----------
num_features : int
The number of channels.
bias : bool, optional, default=True
``True`` to attach a bias.
fix_weight : bool, optional, default=False
``True`` to freeze the ``weight``.
fix_bias : bool, optional, default=False
``True`` to freeze the ``bias``.
inplace : bool, optional, default=False
Whether to do the operation in-place.
"""
super(Affine, self).__init__()
self.num_features = num_features
self.inplace = inplace
if not fix_weight:
self.weight = Parameter(constant_ops.ones(num_features))
if inplace:
raise ValueError('In-place operation requires fixed weight.')
else:
self.register_buffer('weight', constant_ops.ones(num_features))
if bias:
if not fix_bias:
self.bias = Parameter(constant_ops.zeros(num_features))
else:
self.register_buffer('bias', constant_ops.zeros(num_features))
else:
self.bias = None
def extra_repr(self):
s = '{num_features}, ' \
'inplace={inplace}'.format(**self.__dict__)
if self.bias is None:
s += ', bias=False'
return s
def forward(self, input):
return F.affine(
input,
self.weight,
self.bias,
dim=1,
out=input if self.inplace else None,
)
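A hypothetical sketch of the constraint encoded in the constructor above: the in-place path is only permitted when the weight is registered as a fixed buffer:

```python
from dragon.vm import torch
from dragon.vm.torch import nn

m = nn.Affine(5, fix_weight=True, fix_bias=True, inplace=True)
x = torch.ones(3, 5)
y = m(x)  # intended to write into x (out=input)

# nn.Affine(5, inplace=True)  # would raise: in-place requires a fixed weight
```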
class Identity(Module):
r"""Apply the identity transformation.
......
......@@ -20,98 +20,10 @@ from dragon.core.util import nest
from dragon.vm.torch.core.nn import functional as F
from dragon.vm.torch.core.nn.modules.module import Module
from dragon.vm.torch.core.nn.parameter import Parameter
from dragon.vm.torch.core.ops import array_ops
from dragon.vm.torch.core.ops import constant_ops
from dragon.vm.torch.core.tensor import Tensor
class AffineChannel(Module):
"""Apply affine transformation to channels.
Affine is often taken as a post-processing of normalization.
Examples:
```python
m = torch.nn.AffineChannel(5)
# Apply a 2d transformation
x2d = torch.ones(3, 5)
y2d = m(x2d)
# Apply a 3d transformation
x3d = torch.ones(3, 5, 4)
y3d = m(x3d)
# Apply a 4d transformation
x4d = torch.ones(3, 5, 2, 2)
y4d = m(x4d)
```
See Also
--------
`torch.channel_affine(...)`_
"""
def __init__(
self,
num_features,
bias=True,
fix_weight=False,
fix_bias=False,
inplace=False,
):
"""Create an ``AffineChannel`` module.
Parameters
----------
num_features : int
The number of channels.
bias : bool, optional, default=True
``True`` to attach a bias.
fix_weight : bool, optional, default=False
``True`` to frozen the ``weight``.
fix_bias : bool, optional, default=False
``True`` to frozen the ``bias``.
inplace : bool, optional, default=False
Whether to do the operation in-place.
"""
super(AffineChannel, self).__init__()
self.num_features = num_features
self.inplace = inplace
if not fix_weight:
self.weight = Parameter(constant_ops.ones(num_features))
if inplace:
raise ValueError('In-place operation requires fixed weight.')
else:
self.register_buffer('weight', constant_ops.ones(num_features))
if bias:
if not fix_bias:
self.bias = Parameter(constant_ops.zeros(num_features))
else:
self.register_buffer('bias', constant_ops.zeros(num_features))
else:
self.bias = None
def extra_repr(self):
s = '{num_features}, ' \
'inplace={inplace}'.format(**self.__dict__)
if self.bias is None:
s += ', bias=False'
return s
def forward(self, input):
return array_ops.channel_affine(
input,
self.weight,
self.bias,
dim=1,
out=input if self.inplace else None,
)
class GroupNorm(Module):
r"""Apply the group normalization.
`[Wu & He, 2018] <https://arxiv.org/abs/1803.08494>`_.
......
......@@ -42,85 +42,6 @@ def cat(tensors, dim=0, out=None):
'Concat', tensors[0].device, tensors, outputs=[out], axis=dim)
def channel_affine(input, weight, bias=None, dim=-1, end_dim=None, out=None):
"""Apply affine transformation to each channel of input.
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
weight : dragon.vm.torch.Tensor
The weight tensor.
bias : dragon.vm.torch.Tensor, optional
The bias tensor.
dim : int, optional, default=-1
The first channel dimension.
end_dim : int, optional
The last channel dimension.
out : dragon.vm.torch.Tensor, optional
The output tensor.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
return Function.apply(
'ChannelAffine', input.device,
[input, weight] + ([bias] if bias else []), outputs=[out],
axis=dim, end_axis=end_dim)
def channel_normalize(input, mean, std, dim=-1, dtype='float32', dims=None):
"""Apply normalization to each channel of input.
:attr:`dim` can be negative:
```python
m = s = (1., 1., 1.)
x = torch.tensor([1, 2, 3])
print(torch.channel_normalize(x, m, s, dim=0)) # [0., 1., 2.]
print(torch.channel_normalize(x, m, s, dim=-1)) # Equivalent
```
If :attr:`dims` provided, :attr:`dim` is selected from the output layout:
```python
m, s = (1., 2., 3.), (1., 1., 1.)
x = torch.tensor([[1, 2, 3]])
# Provided 3 values to normalize the last dimension
# with length 1, only the first value will be taken
print(torch.channel_normalize(x, m, s, dims=(1, 0))) # [[0.], [1.], [2.]]
```
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
mean : Sequence[float], required
The mean to subtract.
std : Sequence[float], required
The standard deviation to divide.
dim : int, optional, default=-1
The channel dimension.
dtype : str, optional, default='float32'
The output data type.
dims : Sequence[int], optional
The order of output dimensions.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
return Function.apply(
'ChannelNormalize', input.device, [input],
axis=dim, mean=mean, std=std, dtype=dtype,
ndim=len(dims) if dims is not None else 0, perm=dims)
def chunk(tensor, chunks, dim=0, copy=True):
"""Split input into a specific number of chunks.
......
......@@ -168,6 +168,37 @@ def argmin(input, dim, keepdim=False, out=None):
axis=dim, keepdims=keepdim)
def atan2(input, other, out=None):
r"""Compute the element-wise arc-tangent of two arguments.
.. math:: \text{out} = \text{arctan}(\frac{\text{input}}{\text{other}})
Examples:
```python
y = torch.tensor(1.)
x = torch.tensor(2.)
print(torch.atan2(y, x)) # 0.46364761
```
Parameters
----------
input : dragon.vm.torch.Tensor
The input tensor.
other : Union[dragon.vm.torch.Tensor, number]
The value to divide ``input`` by.
out : dragon.vm.torch.Tensor, optional
The output tensor.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
"""
return _binary_func(input, other, 'Atan2', out)
def baddbmm(input, batch1, batch2, beta=1, alpha=1, out=None):
r"""Add input to the result of batched matrix-matrix multiplication.
......
......@@ -186,6 +186,29 @@ def argsort(self, dim=-1, descending=False):
return sort_ops.argsort(self, dim, descending)
def atan2(self, other):
r"""Compute the element-wise arc-tangent of two arguments.
.. math:: \text{out} = \text{arctan}(\frac{\text{self}}{\text{other}})
Parameters
----------
other : Union[dragon.vm.torch.Tensor, number]
The value to divide ``self`` by.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.atan2(...)`_
"""
return math_ops.atan2(self, other)
def baddbmm(self, batch1, batch2, beta=1, alpha=1):
r"""Add the result of batched matrix-matrix multiplication.
......@@ -3051,6 +3074,7 @@ Tensor.addmm = addmm
Tensor.argmax = argmax
Tensor.argmin = argmin
Tensor.argsort = argsort
Tensor.atan2 = atan2
Tensor.backward = backward
Tensor.baddbmm = baddbmm
Tensor.baddbmm_ = baddbmm_
......
......@@ -28,9 +28,11 @@ class Adam(Optimizer):
The **Adam** update is defined as:
.. math::
\text{Adam}(g) = \text{lr} * \frac{m_{t}}{\sqrt{v_{t}} + \epsilon} \\
\text{Adam}(g) = \text{lr} * (\frac{\text{correction} * m_{t}}
{\sqrt{v_{t}} + \epsilon}) \\
\quad \\ \text{where}\quad
\begin{cases}
\text{correction} = \sqrt{1 - \beta_{2}^{t}} / (1 - \beta_{1}^{t}) \\
m_{t} = \beta_{1} * m_{t-1} + (1 - \beta_{1}) * g \\
v_{t} = \beta_{2} * v_{t-1} + (1 - \beta_{2}) * g^{2}
\end{cases}
......@@ -88,12 +90,13 @@ class AdamW(Adam):
The **AdamW** update is defined as:
.. math::
\text{AdamW}(g, p) = \text{lr} * (\frac{m_{t}}{\sqrt{v_{t}} + \epsilon}
+ \lambda p) \\
\text{AdamW}(g, p) = \text{lr} * (\frac{\text{correction} * m_{t}}
{\sqrt{v_{t}} + \epsilon} + \lambda p) \\
\quad \\ \text{where}\quad
\begin{cases}
\text{correction} = \sqrt{1 - \beta_{2}^{t}} / (1 - \beta_{1}^{t}) \\
m_{t} = \beta_{1} * m_{t-1} + (1 - \beta_{1}) * g \\
v_{t} = \beta_{2} * v_{t-1} + (1 - \beta_{2}) * g^{2}
v_{t} = \beta_{2} * v_{t-1} + (1 - \beta_{2}) * g^{2} \\
\end{cases}
"""
......
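A pure-Python sketch of the bias-corrected update written above; it mirrors the math, not the fused kernel introduced by this commit. `weight_decay` is the decoupled `lambda * p` term used by AdamW and is zero for plain Adam:

```python
import math

def adam_step(p, g, m, v, t, lr=1e-3, beta1=0.9, beta2=0.999,
              eps=1e-8, weight_decay=0.0):
    """One scalar Adam/AdamW step following the formulas above."""
    m = beta1 * m + (1.0 - beta1) * g
    v = beta2 * v + (1.0 - beta2) * g * g
    correction = math.sqrt(1.0 - beta2 ** t) / (1.0 - beta1 ** t)
    p -= lr * (correction * m / (math.sqrt(v) + eps) + weight_decay * p)
    return p, m, v
```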
......@@ -78,5 +78,4 @@ class RMSprop(Optimizer):
defaults = dict(lr=lr, momentum=momentum, alpha=alpha, eps=eps,
centered=centered, weight_decay=weight_decay)
super(RMSprop, self).__init__(params, defaults, **kwargs)
self._hyper['alpha'][0] = 'decay'
self._hyper.pop('centered') # Unsupported.
......@@ -372,6 +372,27 @@ class Tensor(object):
"""
def atan2(self, other):
r"""Compute the element-wise arc-tangent of two arguments.
.. math:: \text{out} = \text{arctan}(\frac{\text{self}}{\text{other}})
Parameters
----------
other : Union[dragon.vm.torch.Tensor, number]
The value to divide ``self`` by.
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.atan2(...)`_
"""
def backward(self, gradient=None, retain_graph=False):
"""Compute the derivatives of this tensor w.r.t. graph leaves.
......