Commit ac051717 by Ting PAN

Fix cuBLAS FP32 downcast issue on Ampere devices

Summary:
This commit removes the default cuBLAS tensor-core math mode when CUDA >= 11.0,
so FP32 math is not implicitly downcast to TF32 on Ampere devices; TF32 can still be enabled through the existing allow-TF32 switch.
1 parent b7e2298f
Showing with 1554 additions and 914 deletions
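Why the implicit downcast matters: with the tensor-core math mode enabled, cuBLAS on Ampere may execute FP32 GEMMs with TF32 inputs, which keep the FP32 exponent range but truncate the mantissa from 23 bits to 10. A rough comparison, assuming the standard unit-roundoff definition $u = 2^{-p}$ for a $p$-bit significand (implicit bit included):

$$u_{\text{FP32}} = 2^{-24} \approx 6.0\times10^{-8}, \qquad u_{\text{TF32}} = 2^{-11} \approx 4.9\times10^{-4}.$$

The change to `CUDAObjects` below therefore keeps `CUBLAS_DEFAULT_MATH` unless TF32 is explicitly allowed.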
...@@ -19,6 +19,7 @@ from dragon.vm.dali.core.ops.builtin_ops import ExternalSource ...@@ -19,6 +19,7 @@ from dragon.vm.dali.core.ops.builtin_ops import ExternalSource
from dragon.vm.dali.core.ops.decoder_ops import ImageDecoder from dragon.vm.dali.core.ops.decoder_ops import ImageDecoder
from dragon.vm.dali.core.ops.decoder_ops import ImageDecoderRandomCrop from dragon.vm.dali.core.ops.decoder_ops import ImageDecoderRandomCrop
from dragon.vm.dali.core.ops.generic_ops import Cast from dragon.vm.dali.core.ops.generic_ops import Cast
from dragon.vm.dali.core.ops.generic_ops import Erase
from dragon.vm.dali.core.ops.generic_ops import Pad from dragon.vm.dali.core.ops.generic_ops import Pad
from dragon.vm.dali.core.ops.generic_ops import Reshape from dragon.vm.dali.core.ops.generic_ops import Reshape
from dragon.vm.dali.core.ops.generic_ops import Slice from dragon.vm.dali.core.ops.generic_ops import Slice
...@@ -31,6 +32,8 @@ from dragon.vm.dali.core.ops.image_ops import Paste ...@@ -31,6 +32,8 @@ from dragon.vm.dali.core.ops.image_ops import Paste
from dragon.vm.dali.core.ops.image_ops import RandomBBoxCrop from dragon.vm.dali.core.ops.image_ops import RandomBBoxCrop
from dragon.vm.dali.core.ops.image_ops import RandomResizedCrop from dragon.vm.dali.core.ops.image_ops import RandomResizedCrop
from dragon.vm.dali.core.ops.image_ops import Resize from dragon.vm.dali.core.ops.image_ops import Resize
from dragon.vm.dali.core.ops.image_ops import Rotate
from dragon.vm.dali.core.ops.image_ops import WarpAffine
from dragon.vm.dali.core.ops.random_ops import CoinFlip from dragon.vm.dali.core.ops.random_ops import CoinFlip
from dragon.vm.dali.core.ops.random_ops import Uniform from dragon.vm.dali.core.ops.random_ops import Uniform
from dragon.vm.dali.core.ops.reader_ops import KPLRecordReader from dragon.vm.dali.core.ops.reader_ops import KPLRecordReader
......
...@@ -48,7 +48,7 @@ class Iterator(object): ...@@ -48,7 +48,7 @@ class Iterator(object):
with self._api_scope(): with self._api_scope():
self._pipe.build() self._pipe.build()
# Enforce the correct device of current process # Enforce the correct device of current process
# to initialize cuda handles instead of device:0 # to initialize cuda handles instead of device 0.
cuda.set_device(self._pipe.device_id) cuda.set_device(self._pipe.device_id)
self._pipe.schedule_run() self._pipe.schedule_run()
self._copies = None self._copies = None
...@@ -91,12 +91,9 @@ class Iterator(object): ...@@ -91,12 +91,9 @@ class Iterator(object):
shape=tensor.shape(), shape=tensor.shape(),
dtype=str(types.np_dtype(tensor.dtype())), dtype=str(types.np_dtype(tensor.dtype())),
device=self.new_device( device=self.new_device(
device_type='cuda' if isinstance( device_type=('cuda' if isinstance(tensor, TensorGPU)
tensor, TensorGPU) else 'cpu', else 'cpu'),
device_index=self._pipe.device_id, device_index=self._pipe.device_id)))
)
)
)
# Transfer the data: DALI => Storage # Transfer the data: DALI => Storage
for i, tensor in enumerate(tensors): for i, tensor in enumerate(tensors):
self._transfer_tensor(tensor, self._copies[i]) self._transfer_tensor(tensor, self._copies[i])
...@@ -160,7 +157,7 @@ class Iterator(object): ...@@ -160,7 +157,7 @@ class Iterator(object):
def __next__(self): def __next__(self):
"""Return the next batch of data.""" """Return the next batch of data."""
# Return and reset the first batch if necessary # Return and reset the first batch if necessary.
if self._first_batch is not None: if self._first_batch is not None:
outputs = self._first_batch outputs = self._first_batch
self._first_batch = None self._first_batch = None
......
...@@ -34,6 +34,8 @@ try: ...@@ -34,6 +34,8 @@ try:
num_threads=1, num_threads=1,
seed=3, seed=3,
prefetch_queue_depth=2, prefetch_queue_depth=2,
py_num_workers=1,
**kwargs
): ):
"""Create a ``Pipeline``. """Create a ``Pipeline``.
...@@ -47,6 +49,8 @@ try: ...@@ -47,6 +49,8 @@ try:
The seed for random generator. The seed for random generator.
prefetch_queue_depth : int, optional, default=2 prefetch_queue_depth : int, optional, default=2
The number of prefetch queues. The number of prefetch queues.
py_num_workers : int, optional, default=1
The number of workers to process the external source.
""" """
device_id = context.get_device()['device_index'] device_id = context.get_device()['device_index']
...@@ -56,6 +60,7 @@ try: ...@@ -56,6 +60,7 @@ try:
device_id=device_id, device_id=device_id,
seed=seed, seed=seed,
prefetch_queue_depth=prefetch_queue_depth, prefetch_queue_depth=prefetch_queue_depth,
**kwargs
) )
@property @property
...@@ -68,7 +73,7 @@ try: ...@@ -68,7 +73,7 @@ try:
The batch size. The batch size.
""" """
return self._batch_size return self._max_batch_size
@property @property
def device_id(self): def device_id(self):
...@@ -83,6 +88,18 @@ try: ...@@ -83,6 +88,18 @@ try:
return self._device_id return self._device_id
@property @property
def max_batch_size(self):
"""Return the maximum batch size of pipeline.
Returns
-------
int
The maximum batch size.
"""
return self._max_batch_size
@property
def num_threads(self): def num_threads(self):
"""Return the number of threads to execute pipeline. """Return the number of threads to execute pipeline.
...@@ -94,26 +111,24 @@ try: ...@@ -94,26 +111,24 @@ try:
""" """
return self._num_threads return self._num_threads
def build(self): def build(self, define_graph=None):
"""Build the pipeline.""" """Build the pipeline.
super(Pipeline, self).build()
def define_graph(self):
"""Define the symbolic operations for pipeline."""
super(Pipeline, self).define_graph()
def feed_input(self, ref, data):
"""Bind an array to the edge reference.
Parameters Parameters
---------- ----------
ref : _EdgeReference define_graph : callable, optional
The reference of a edge. The defined function to use instead.
data : numpy.ndarray
The array data.
""" """
super(Pipeline, self).feed_input(ref, data) super(Pipeline, self).build(define_graph)
def define_graph(self):
"""Define the symbolic operations for pipeline."""
super(Pipeline, self).define_graph()
def feed_input(self, *args, **kwargs):
"""Bind an array to the edge reference."""
super(Pipeline, self).feed_input(*args, **kwargs)
except ImportError: except ImportError:
...@@ -134,6 +149,8 @@ except ImportError: ...@@ -134,6 +149,8 @@ except ImportError:
num_threads=1, num_threads=1,
seed=3, seed=3,
prefetch_queue_depth=2, prefetch_queue_depth=2,
py_num_workers=1,
**kwargs
): ):
"""Create a ``Pipeline`` """Create a ``Pipeline``
...@@ -147,9 +164,11 @@ except ImportError: ...@@ -147,9 +164,11 @@ except ImportError:
The seed for random generator. The seed for random generator.
prefetch_queue_depth : int, optional, default=2 prefetch_queue_depth : int, optional, default=2
The number of prefetch queues. The number of prefetch queues.
py_num_workers : int, optional, default=1
The number of workers to process the external source.
""" """
self._batch_size = batch_size self._max_batch_size = batch_size
self._num_threads = num_threads self._num_threads = num_threads
self._seed = seed self._seed = seed
self._prefetch_queue_depth = prefetch_queue_depth self._prefetch_queue_depth = prefetch_queue_depth
...@@ -164,7 +183,7 @@ except ImportError: ...@@ -164,7 +183,7 @@ except ImportError:
The batch size. The batch size.
""" """
return self._batch_size return self._max_batch_size
@property @property
def device_id(self): def device_id(self):
...@@ -179,6 +198,18 @@ except ImportError: ...@@ -179,6 +198,18 @@ except ImportError:
return 0 return 0
@property @property
def max_batch_size(self):
"""Return the maximum batch size of pipeline.
Returns
-------
int
The maximum batch size.
"""
return self._max_batch_size
@property
def num_threads(self): def num_threads(self):
"""Return the number of threads to execute pipeline. """Return the number of threads to execute pipeline.
...@@ -190,23 +221,21 @@ except ImportError: ...@@ -190,23 +221,21 @@ except ImportError:
""" """
return self._num_threads return self._num_threads
def build(self): def build(self, define_graph=None):
"""Build the pipeline.""" """Build the pipeline.
Parameters
----------
define_graph : callable, optional
The defined function to use instead.
"""
pass pass
def define_graph(self): def define_graph(self):
"""Define the symbolic operations for pipeline.""" """Define the symbolic operations for pipeline."""
pass pass
def feed_input(self, ref, data): def feed_input(self, *args, **kwargs):
"""Bind an array to the edge reference. """Bind an array to the edge reference."""
Parameters
----------
ref : _EdgeReference
The reference of a edge.
data : numpy.ndarray
The array data.
"""
pass pass
...@@ -60,6 +60,60 @@ class Cast(object): ...@@ -60,6 +60,60 @@ class Cast(object):
) )
class Erase(object):
"""Erase regions from the input.
Examples:
```python
erase = dali.ops.Erase(
# The axes to erase
axes=[0, 1],
# The value fill
fill_value=0.,
)
y = erase(inputs['x'], anchor=(0, 0), shape=(100, 100))
```
"""
def __new__(
cls,
axes=(0, 1),
fill_value=0,
normalized_anchor=True,
normalized_shape=True,
**kwargs
):
"""Create an ``Erase`` operator.
Parameters
----------
axes : Sequence[int], optional
The axes to erase along.
fill_value : Union[number, Sequence[float]], optional
The value to fill the erased regions.
normalized_anchor : bool, optional, default=True
Whether the provided anchor is normalized.
normalized_shape : bool, optional, default=True
Whether the provided shape is normalized.
Returns
-------
nvidia.dali.ops.Erase
The operator.
"""
return ops.Erase(
axes=axes,
fill_value=fill_value,
normalized_anchor=normalized_anchor,
normalized_shape=normalized_shape,
device=context.get_device_type(),
**kwargs
)
class Pad(object): class Pad(object):
"""Pad input to have the same dimensions. """Pad input to have the same dimensions.
...@@ -77,14 +131,14 @@ class Pad(object): ...@@ -77,14 +131,14 @@ class Pad(object):
""" """
def __new__(cls, axes=(0, 1), fill_value=0., align=None, **kwargs): def __new__(cls, axes=(0, 1), fill_value=0, align=None, **kwargs):
"""Create a ``Pad`` operator. """Create a ``Pad`` operator.
Parameters Parameters
---------- ----------
axes : Sequence[int], optional axes : Sequence[int], optional
The padding axes. The padding axes.
fill_value : number, optional, default=0. fill_value : number, optional, default=0
The constant padding value. The constant padding value.
align : Union[int, Sequence[int]], optional align : Union[int, Sequence[int]], optional
The size to align the padding shape. The size to align the padding shape.
......
...@@ -487,3 +487,88 @@ class Resize(object): ...@@ -487,3 +487,88 @@ class Resize(object):
device=context.get_device_type(), device=context.get_device_type(),
**kwargs **kwargs
) )
class Rotate(object):
"""Rotate the image.
Examples:
```python
rotate = dali.ops.Rotate()
y = rotate(inputs['x'], angle=30)
```
"""
def __new__(
cls,
fill_value=0,
interp_type='linear',
keep_size=True,
**kwargs
):
"""Create a ``Rotate`` operator.
Parameters
----------
fill_value : number, optional
The value to fill the empty regions.
interp_type : str, optional, default='linear'
The interpolation method.
keep_size : bool, optional, default=True
Whether to keep the original image size.
Returns
-------
nvidia.dali.ops.Rotate
The operator.
"""
if isinstance(interp_type, six.string_types):
interp_type = getattr(types, 'INTERP_' + interp_type.upper())
return ops.Rotate(
fill_value=fill_value,
interp_type=interp_type,
keep_size=keep_size,
device=context.get_device_type(),
**kwargs
)
class WarpAffine(object):
"""Apply an affine transformation to the image.
Examples:
```python
warp_affine = dali.ops.WarpAffine()
y = warp_affine(inputs['x'], matrix=[1, 0, 0, 0, 1, 0])
```
"""
def __new__(cls, fill_value=0, interp_type='linear', **kwargs):
"""Create a ``WarpAffine`` operator.
Parameters
----------
fill_value : number, optional
The value to fill the empty regions.
interp_type : str, optional, default='linear'
The interpolation method.
Returns
-------
nvidia.dali.ops.WarpAffine
The operator.
"""
if isinstance(interp_type, six.string_types):
interp_type = getattr(types, 'INTERP_' + interp_type.upper())
return ops.WarpAffine(
fill_value=fill_value,
interp_type=interp_type,
device=context.get_device_type(),
**kwargs
)
...@@ -10,14 +10,14 @@ __init__ ...@@ -10,14 +10,14 @@ __init__
Properties Properties
---------- ----------
batch_size
##########
.. autoattribute:: dragon.vm.dali.Pipeline.batch_size
device_id device_id
######### #########
.. autoattribute:: dragon.vm.dali.Pipeline.device_id .. autoattribute:: dragon.vm.dali.Pipeline.device_id
max_batch_size
##############
.. autoattribute:: dragon.vm.dali.Pipeline.max_batch_size
num_threads num_threads
########### ###########
.. autoattribute:: dragon.vm.dali.Pipeline.num_threads .. autoattribute:: dragon.vm.dali.Pipeline.num_threads
......
...@@ -30,6 +30,9 @@ vm.dali.ops ...@@ -30,6 +30,9 @@ vm.dali.ops
`class CropMirrorNormalize <ops/CropMirrorNormalize.html>`_ `class CropMirrorNormalize <ops/CropMirrorNormalize.html>`_
: Crop and normalize image with the horizontal flip. : Crop and normalize image with the horizontal flip.
`class Erase <ops/Erase.html>`_
: Erase regions from the input.
`class ExternalSource <ops/ExternalSource.html>`_ `class ExternalSource <ops/ExternalSource.html>`_
: Create a placeholder providing data from feeding. : Create a placeholder providing data from feeding.
...@@ -60,6 +63,9 @@ vm.dali.ops ...@@ -60,6 +63,9 @@ vm.dali.ops
`class Resize <ops/Resize.html>`_ `class Resize <ops/Resize.html>`_
: Resize the image. : Resize the image.
`class Rotate <ops/Rotate.html>`_
: Rotate the image.
`class Slice <ops/Slice.html>`_ `class Slice <ops/Slice.html>`_
: Select an interval of elements from input. : Select an interval of elements from input.
...@@ -72,6 +78,9 @@ vm.dali.ops ...@@ -72,6 +78,9 @@ vm.dali.ops
`class Uniform <ops/Uniform.html>`_ `class Uniform <ops/Uniform.html>`_
: Sample values from a uniform distribution. : Sample values from a uniform distribution.
`class WarpAffine <ops/WarpAffine.html>`_
: Apply an affine transformation to the image.
.. toctree:: .. toctree::
:hidden: :hidden:
...@@ -83,6 +92,7 @@ vm.dali.ops ...@@ -83,6 +92,7 @@ vm.dali.ops
ops/CoinFlip ops/CoinFlip
ops/Contrast ops/Contrast
ops/CropMirrorNormalize ops/CropMirrorNormalize
ops/Erase
ops/ExternalSource ops/ExternalSource
ops/Hsv ops/Hsv
ops/ImageDecoder ops/ImageDecoder
...@@ -93,10 +103,12 @@ vm.dali.ops ...@@ -93,10 +103,12 @@ vm.dali.ops
ops/RandomResizedCrop ops/RandomResizedCrop
ops/Reshape ops/Reshape
ops/Resize ops/Resize
ops/Rotate
ops/Slice ops/Slice
ops/KPLRecordReader ops/KPLRecordReader
ops/TFRecordReader ops/TFRecordReader
ops/Uniform ops/Uniform
ops/WarpAffine
.. raw:: html .. raw:: html
......
Erase
=====
.. autoclass:: dragon.vm.dali.ops.Erase
__new__
--------
.. automethod:: dragon.vm.dali.ops.Erase.__new__
.. raw:: html
<style>
h1:before {
content: "dali.ops.";
color: #103d3e;
}
</style>
Rotate
======
.. autoclass:: dragon.vm.dali.ops.Rotate
__new__
--------
.. automethod:: dragon.vm.dali.ops.Rotate.__new__
.. raw:: html
<style>
h1:before {
content: "dali.ops.";
color: #103d3e;
}
</style>
WarpAffine
==========
.. autoclass:: dragon.vm.dali.ops.WarpAffine
__new__
--------
.. automethod:: dragon.vm.dali.ops.WarpAffine.__new__
.. raw:: html
<style>
h1:before {
content: "dali.ops.";
color: #103d3e;
}
</style>
...@@ -61,10 +61,14 @@ glorot_normal ...@@ -61,10 +61,14 @@ glorot_normal
############# #############
.. automethod:: dragon.Tensor.glorot_normal .. automethod:: dragon.Tensor.glorot_normal
glorot_uniform glorot_uniform
############## ##############
.. automethod:: dragon.Tensor.glorot_uniform .. automethod:: dragon.Tensor.glorot_uniform
item
####
.. automethod:: dragon.Tensor.item
normal normal
###### ######
.. automethod:: dragon.Tensor.normal .. automethod:: dragon.Tensor.normal
...@@ -77,6 +81,10 @@ reshape ...@@ -77,6 +81,10 @@ reshape
####### #######
.. automethod:: dragon.Tensor.reshape .. automethod:: dragon.Tensor.reshape
tolist
######
.. automethod:: dragon.Tensor.tolist
truncated_normal truncated_normal
################ ################
.. automethod:: dragon.Tensor.truncated_normal .. automethod:: dragon.Tensor.truncated_normal
......
...@@ -305,8 +305,12 @@ is_floating_point ...@@ -305,8 +305,12 @@ is_floating_point
################# #################
.. automethod:: dragon.vm.torch.Tensor.is_floating_point .. automethod:: dragon.vm.torch.Tensor.is_floating_point
item
####
.. automethod:: dragon.vm.torch.Tensor.item
le le
### ##
.. automethod:: dragon.vm.torch.Tensor.le .. automethod:: dragon.vm.torch.Tensor.le
log log
...@@ -577,6 +581,10 @@ to ...@@ -577,6 +581,10 @@ to
## ##
.. automethod:: dragon.vm.torch.Tensor.to .. automethod:: dragon.vm.torch.Tensor.to
tolist
######
.. automethod:: dragon.vm.torch.Tensor.tolist
topk topk
#### ####
.. automethod:: dragon.vm.torch.Tensor.topk .. automethod:: dragon.vm.torch.Tensor.topk
......
...@@ -56,7 +56,13 @@ class CUDAObjects { ...@@ -56,7 +56,13 @@ class CUDAObjects {
auto& handle = handles[stream_id]; auto& handle = handles[stream_id];
CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST)); CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST));
CUBLAS_CHECK(cublasSetStream(handle, stream(device_id, stream_id))); CUBLAS_CHECK(cublasSetStream(handle, stream(device_id, stream_id)));
#if CUDA_VERSION >= 9000 #if CUDA_VERSION >= 11000
if (cudnn_allow_tf32_) {
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH));
} else {
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
}
#elif CUDA_VERSION >= 9000
if (TENSOR_CORE_AVAILABLE()) { if (TENSOR_CORE_AVAILABLE()) {
CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH)); CUBLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
} }
......
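A minimal standalone sketch of the math-mode policy introduced in this hunk; the `allow_tf32` parameter stands in for the framework's `cudnn_allow_tf32_` member, and the `TENSOR_CORE_AVAILABLE()` check of the pre-CUDA-11 branch is omitted.

```cpp
#include <cuda.h>       // CUDA_VERSION
#include <cublas_v2.h>  // cublasSetMathMode, cublasMath_t

// Sketch of the policy above: on CUDA 11+ keep FP32 GEMMs in full FP32 by
// default and enable TF32 tensor cores only when explicitly allowed; on older
// toolkits the previous tensor-core behavior is kept.
cublasStatus_t SetCublasMathMode(cublasHandle_t handle, bool allow_tf32) {
#if CUDA_VERSION >= 11000
  return cublasSetMathMode(
      handle, allow_tf32 ? CUBLAS_TF32_TENSOR_OP_MATH : CUBLAS_DEFAULT_MATH);
#elif CUDA_VERSION >= 9000
  (void)allow_tf32;
  return cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);
#else
  (void)allow_tf32;
  return CUBLAS_STATUS_SUCCESS;
#endif
}
```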
...@@ -74,64 +74,56 @@ __global__ void _MaskBlock2dNHWC( ...@@ -74,64 +74,56 @@ __global__ void _MaskBlock2dNHWC(
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \ #define DEFINE_KERNEL_LAUNCHER(T) \
template <> \ template <> \
void DropBlock2d<T, CUDAContext>( \ void DropBlock2d<T, CUDAContext>( \
const int N, \ const int N, \
const int C, \ const int C, \
const int H, \ const int H, \
const int W, \ const int W, \
const int block_size, \ const int block_size, \
const float ratio, \ const float ratio, \
const float scale, \ const float scale, \
const string& data_format, \ const string& data_format, \
const T* x, \ const T* x, \
T* y, \ T* y, \
uint8_t* mask, \ uint8_t* mask, \
uint32_t* r, \ uint32_t* r, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
const auto seed_h = H - block_size + 1; \ const auto seed_h = H - block_size + 1; \
const auto seed_w = W - block_size + 1; \ const auto seed_w = W - block_size + 1; \
const auto num_seeds = N * seed_h * seed_w; \ const auto num_seeds = N * seed_h * seed_w; \
const auto NxCxHxW = N * C * H * W; \ const auto NxCxHxW = N * C * H * W; \
math::Set(NxCxHxW, uint8_t(1), mask, ctx); \ math::Set(NxCxHxW, uint8_t(1), mask, ctx); \
math::Random(num_seeds, r, ctx); \ math::Random(num_seeds, r, ctx); \
if (data_format == "NCHW") { \ if (data_format == "NCHW") { \
_MaskBlock2dNCHW<<< \ _MaskBlock2dNCHW<<<num_seeds, CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
CUDA_2D_BLOCKS(num_seeds), \ C, \
CUDA_THREADS, \ H, \
0, \ W, \
ctx->cuda_stream()>>>( \ seed_h, \
C, \ seed_w, \
H, \ num_seeds, \
W, \ block_size, \
seed_h, \ uint32_t(UINT_MAX * ratio), \
seed_w, \ r, \
num_seeds, \ mask); \
block_size, \ } else if (data_format == "NHWC") { \
uint32_t(UINT_MAX * ratio), \ _MaskBlock2dNHWC<<<num_seeds, CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
r, \ C, \
mask); \ H, \
} else if (data_format == "NHWC") { \ W, \
_MaskBlock2dNHWC<<< \ seed_h, \
CUDA_2D_BLOCKS(num_seeds), \ seed_w, \
CUDA_THREADS, \ num_seeds, \
0, \ block_size, \
ctx->cuda_stream()>>>( \ uint32_t(UINT_MAX * ratio), \
C, \ r, \
H, \ mask); \
W, \ } else { \
seed_h, \ LOG(FATAL) << "Unknown DataFormat: " << data_format; \
seed_w, \ } \
num_seeds, \ math::ApplyMask(NxCxHxW, scale, mask, x, y, ctx); \
block_size, \
uint32_t(UINT_MAX * ratio), \
r, \
mask); \
} else { \
LOG(FATAL) << "Unknown DataFormat: " << data_format; \
} \
math::ApplyMask(NxCxHxW, scale, mask, x, y, ctx); \
} }
DEFINE_KERNEL_LAUNCHER(float16); DEFINE_KERNEL_LAUNCHER(float16);
......
...@@ -211,7 +211,7 @@ __global__ void _PReluWGrad( ...@@ -211,7 +211,7 @@ __global__ void _PReluWGrad(
_PReluWGrad, \ _PReluWGrad, \
math::ScalarType<T>::type, \ math::ScalarType<T>::type, \
math::AccmulatorType<T>::type, \ math::AccmulatorType<T>::type, \
CUDA_2D_BLOCKS(C), \ C, \
CUDA_THREADS, \ CUDA_THREADS, \
NxS, \ NxS, \
S, \ S, \
......
...@@ -30,14 +30,26 @@ void _Softmax(const int N, const int S, const int C, const T* x, T* y) { ...@@ -30,14 +30,26 @@ void _Softmax(const int N, const int S, const int C, const T* x, T* y) {
} }
} }
template <> template <typename T>
void _Softmax<float16>( void _LogSoftmax(const int N, const int S, const int C, const T* x, T* y) {
const int N, if (S == 1) {
const int S, ConstEigenArrayMap<T> X(x, C, N);
const int C, EigenArrayMap<T> Y(y, C, N);
const float16* x, Y = X.rowwise() - X.colwise().maxCoeff();
float16* y) { Y = Y.rowwise() - Y.exp().colwise().sum().log();
CPU_FP16_NOT_SUPPORTED; return;
}
for (int i = 0; i < N; ++i) {
const auto offset = i * C * S;
for (int j = 0; j < S; ++j) {
ConstEigenStridedVectorArrayMap<T> X_vec(
x + offset + j, 1, C, EigenInnerStride(S));
EigenStridedVectorArrayMap<T> Y_vec(
y + offset + j, 1, C, EigenInnerStride(S));
Y_vec = X_vec - X_vec.maxCoeff();
Y_vec -= std::log(Y_vec.exp().sum());
}
}
} }
template <typename T> template <typename T>
...@@ -69,52 +81,104 @@ void _SoftmaxGrad( ...@@ -69,52 +81,104 @@ void _SoftmaxGrad(
} }
} }
template <> template <typename T>
void _SoftmaxGrad<float16>( void _LogSoftmaxGrad(
const int N, const int N,
const int S, const int S,
const int C, const int C,
const float16* dy, const T* dy,
const float16* y, const T* y,
float16* dx) { T* dx) {
CPU_FP16_NOT_SUPPORTED; if (S == 1) {
} // SoftmaxGrad ConstEigenArrayMap<T> dY(dy, C, N);
ConstEigenArrayMap<T> Y(y, C, N);
EigenArrayMap<T> dX(dx, C, N);
dX = dY - Y.exp().rowwise() * dY.colwise().sum();
return;
}
for (int i = 0; i < N; ++i) {
const auto offset = i * C * S;
for (int j = 0; j < S; ++j) {
ConstEigenStridedVectorArrayMap<T> dY_vec(
dy + offset + j, 1, C, EigenInnerStride(S));
ConstEigenStridedVectorArrayMap<T> Y_vec(
y + offset + j, 1, C, EigenInnerStride(S));
EigenStridedVectorArrayMap<T> dX_vec(
dx + offset + j, 1, C, EigenInnerStride(S));
dX_vec = dY_vec - Y_vec.exp() * dY_vec.sum();
}
}
}
} // namespace } // namespace
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \ #define DEFINE_KERNEL_LAUNCHER(name, T) \
template <> \ template <> \
void Softmax<T, CPUContext>( \ void name<T, CPUContext>( \
const int N, \ const int N, \
const int S, \ const int S, \
const int C, \ const int C, \
const T* x, \ const T* x, \
T* y, \ T* y, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_Softmax(N, S, C, x, y); \ _##name(N, S, C, x, y); \
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(name, T) \
template <> \
void name<T, CPUContext>( \
const int N, \
const int S, \
const int C, \
const T* dy, \
const T* y, \
T* dx, \
CPUContext* ctx) { \
_##name(N, S, C, dy, y, dx); \
}
DEFINE_KERNEL_LAUNCHER(Softmax, float);
DEFINE_KERNEL_LAUNCHER(Softmax, double);
DEFINE_KERNEL_LAUNCHER(LogSoftmax, float);
DEFINE_KERNEL_LAUNCHER(LogSoftmax, double);
DEFINE_GRAD_KERNEL_LAUNCHER(SoftmaxGrad, float);
DEFINE_GRAD_KERNEL_LAUNCHER(SoftmaxGrad, double);
DEFINE_GRAD_KERNEL_LAUNCHER(LogSoftmaxGrad, float);
DEFINE_GRAD_KERNEL_LAUNCHER(LogSoftmaxGrad, double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
#define DEFINE_KERNEL_LAUNCHER(name, T) \
template <> \
void name<T, CPUContext>( \
const int N, \
const int S, \
const int C, \
const T* x, \
T* y, \
CPUContext* ctx) { \
CPU_FP16_NOT_SUPPORTED; \
} }
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(name, T) \
template <> \ template <> \
void SoftmaxGrad<T, CPUContext>( \ void name<T, CPUContext>( \
const int N, \ const int N, \
const int S, \ const int S, \
const int C, \ const int C, \
const T* dy, \ const T* dy, \
const T* y, \ const T* y, \
T* dx, \ T* dx, \
CPUContext* ctx) { \ CPUContext* ctx) { \
_SoftmaxGrad(N, S, C, dy, y, dx); \ CPU_FP16_NOT_SUPPORTED; \
} }
DEFINE_KERNEL_LAUNCHER(float16); DEFINE_KERNEL_LAUNCHER(Softmax, float16);
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(LogSoftmax, float16);
DEFINE_KERNEL_LAUNCHER(double); DEFINE_GRAD_KERNEL_LAUNCHER(SoftmaxGrad, float16);
DEFINE_GRAD_KERNEL_LAUNCHER(float16); DEFINE_GRAD_KERNEL_LAUNCHER(LogSoftmaxGrad, float16);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
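The new `_LogSoftmax` / `_LogSoftmaxGrad` kernels above compute, along the reduced axis of length $C$ and with $m = \max_k x_k$ for numerical stability,

$$y_j = x_j - m - \log\sum_{k} e^{\,x_k - m}, \qquad \frac{\partial L}{\partial x_j} = dy_j - e^{\,y_j}\sum_{k} dy_k .$$

The float16 specializations remain unsupported on CPU and fall through to `CPU_FP16_NOT_SUPPORTED`.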
...@@ -11,12 +11,12 @@ namespace kernels { ...@@ -11,12 +11,12 @@ namespace kernels {
namespace { namespace {
#define LDG2(x, i) convert::To<AccT>(__ldg(x + i)) #define LDG(x, i) convert::To<AccT>(__ldg(x + i))
template <typename T, typename AccT> template <typename T, typename AccT>
__global__ void __global__ void
_Softmax(const int NxS, const int S, const int C, const T* x, T* y) { _Softmax(const int NxS, const int S, const int C, const T* x, T* y) {
__shared__ AccT block_val; __shared__ AccT block_max, block_sum;
__shared__ typename BlockReduce<AccT>::TempStorage storage; __shared__ typename BlockReduce<AccT>::TempStorage storage;
CUDA_2D_KERNEL_LOOP1(i, NxS) { CUDA_2D_KERNEL_LOOP1(i, NxS) {
const int offset = (i / S) * C * S + (i % S); const int offset = (i / S) * C * S + (i % S);
...@@ -25,28 +25,58 @@ _Softmax(const int NxS, const int S, const int C, const T* x, T* y) { ...@@ -25,28 +25,58 @@ _Softmax(const int NxS, const int S, const int C, const T* x, T* y) {
AccT val = convert::To<AccT>(__ldg(offset_x)); AccT val = convert::To<AccT>(__ldg(offset_x));
CUDA_2D_KERNEL_LOOP2(j, C) { CUDA_2D_KERNEL_LOOP2(j, C) {
val = max(val, LDG2(offset_x, j * S)); val = max(val, LDG(offset_x, j * S));
} }
val = BlockReduce<AccT>(storage).Reduce(val, cub::Max()); val = BlockReduce<AccT>(storage).Reduce(val, cub::Max());
if (threadIdx.x == 0) block_val = val; if (threadIdx.x == 0) block_max = val;
__syncthreads();
val = AccT(0);
CUDA_2D_KERNEL_LOOP2(j, C) {
val += exp(LDG(offset_x, j * S) - block_max);
}
val = BlockReduce<AccT>(storage).Sum(val);
if (threadIdx.x == 0) block_sum = val;
__syncthreads(); __syncthreads();
CUDA_2D_KERNEL_LOOP2(j, C) { CUDA_2D_KERNEL_LOOP2(j, C) {
const int k = j * S; const int k = j * S;
offset_y[k] = convert::To<T>(exp(LDG2(offset_x, k) - block_val)); val = exp(LDG(offset_x, k) - block_max);
offset_y[k] = convert::To<T>(val / block_sum);
} }
}
}
template <typename T, typename AccT>
__global__ void
_LogSoftmax(const int NxS, const int S, const int C, const T* x, T* y) {
__shared__ AccT block_max, block_sum;
__shared__ typename BlockReduce<AccT>::TempStorage storage;
CUDA_2D_KERNEL_LOOP1(i, NxS) {
const int offset = (i / S) * C * S + (i % S);
auto* offset_x = x + offset;
auto* offset_y = y + offset;
AccT val = convert::To<AccT>(__ldg(offset_x));
CUDA_2D_KERNEL_LOOP2(j, C) {
val = max(val, LDG(offset_x, j * S));
}
val = BlockReduce<AccT>(storage).Reduce(val, cub::Max());
if (threadIdx.x == 0) block_max = val;
__syncthreads();
val = AccT(0); val = AccT(0);
CUDA_2D_KERNEL_LOOP2(j, C) { CUDA_2D_KERNEL_LOOP2(j, C) {
val += convert::To<AccT>(offset_y[j * S]); val += exp(LDG(offset_x, j * S) - block_max);
} }
val = BlockReduce<AccT>(storage).Sum(val); val = BlockReduce<AccT>(storage).Sum(val);
if (threadIdx.x == 0) block_val = val; if (threadIdx.x == 0) block_sum = val;
__syncthreads(); __syncthreads();
CUDA_2D_KERNEL_LOOP2(j, C) { CUDA_2D_KERNEL_LOOP2(j, C) {
const int k = j * S; const int k = j * S;
offset_y[k] = convert::To<T>(convert::To<AccT>(offset_y[k]) / block_val); val = LDG(offset_x, k) - block_max;
offset_y[k] = convert::To<T>(val - log(block_sum));
} }
} }
} }
...@@ -70,7 +100,39 @@ __global__ void _SoftmaxGrad( ...@@ -70,7 +100,39 @@ __global__ void _SoftmaxGrad(
AccT val = AccT(0); AccT val = AccT(0);
CUDA_2D_KERNEL_LOOP2(j, C) { CUDA_2D_KERNEL_LOOP2(j, C) {
const int k = j * S; const int k = j * S;
val += LDG2(offset_dy, k) * LDG2(offset_y, k); val += LDG(offset_dy, k) * LDG(offset_y, k);
}
val = BlockReduce<AccT>(storage).Sum(val);
if (threadIdx.x == 0) block_val = val;
__syncthreads();
CUDA_2D_KERNEL_LOOP2(j, C) {
const int k = j * S;
val = LDG(offset_dy, k) - block_val;
offset_dx[k] = convert::To<T>(val * LDG(offset_y, k));
}
}
}
template <typename T, typename AccT>
__global__ void _LogSoftmaxGrad(
const int NxS,
const int S,
const int C,
const T* dy,
const T* y,
T* dx) {
__shared__ AccT block_val;
__shared__ typename BlockReduce<AccT>::TempStorage storage;
CUDA_2D_KERNEL_LOOP1(i, NxS) {
const int offset = (i / S) * C * S + (i % S);
auto* offset_dy = dy + offset;
auto* offset_y = y + offset;
auto* offset_dx = dx + offset;
AccT val = AccT(0);
CUDA_2D_KERNEL_LOOP2(j, C) {
val += LDG(offset_dy, j * S);
} }
val = BlockReduce<AccT>(storage).Sum(val); val = BlockReduce<AccT>(storage).Sum(val);
if (threadIdx.x == 0) block_val = val; if (threadIdx.x == 0) block_val = val;
...@@ -78,64 +140,70 @@ __global__ void _SoftmaxGrad( ...@@ -78,64 +140,70 @@ __global__ void _SoftmaxGrad(
CUDA_2D_KERNEL_LOOP2(j, C) { CUDA_2D_KERNEL_LOOP2(j, C) {
const int k = j * S; const int k = j * S;
offset_dx[k] = val = exp(convert::To<AccT>(offset_y[k])) * block_val;
convert::To<T>((LDG2(offset_dy, k) - block_val) * LDG2(offset_y, k)); offset_dx[k] = convert::To<T>(LDG(offset_dy, k) - val);
} }
} }
} }
#undef LDG2 #undef LDG
} // namespace } // namespace
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T) \ #define DEFINE_KERNEL_LAUNCHER(name, T) \
template <> \ template <> \
void Softmax<T, CUDAContext>( \ void name<T, CUDAContext>( \
const int N, \ const int N, \
const int S, \ const int S, \
const int C, \ const int C, \
const T* x, \ const T* x, \
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
const auto NxS = N * S; \ const auto NxS = N * S; \
_Softmax<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \ _##name<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \
<<<CUDA_2D_BLOCKS(NxS), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ <<<NxS, CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxS, \ NxS, \
S, \ S, \
C, \ C, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \ reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \ reinterpret_cast<math::ScalarType<T>::type*>(y)); \
} }
#define DEFINE_GRAD_KERNEL_LAUNCHER(T) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(name, T) \
template <> \ template <> \
void SoftmaxGrad<T, CUDAContext>( \ void name<T, CUDAContext>( \
const int N, \ const int N, \
const int S, \ const int S, \
const int C, \ const int C, \
const T* dy, \ const T* dy, \
const T* y, \ const T* y, \
T* dx, \ T* dx, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
const auto NxS = N * S; \ const auto NxS = N * S; \
_SoftmaxGrad<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \ _##name<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \
<<<CUDA_2D_BLOCKS(NxS), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ <<<NxS, CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxS, \ NxS, \
S, \ S, \
C, \ C, \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \ reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<const math::ScalarType<T>::type*>(y), \ reinterpret_cast<const math::ScalarType<T>::type*>(y), \
reinterpret_cast<math::ScalarType<T>::type*>(dx)); \ reinterpret_cast<math::ScalarType<T>::type*>(dx)); \
} }
DEFINE_KERNEL_LAUNCHER(float16); DEFINE_KERNEL_LAUNCHER(Softmax, float16);
DEFINE_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(Softmax, float);
DEFINE_KERNEL_LAUNCHER(double); DEFINE_KERNEL_LAUNCHER(Softmax, double);
DEFINE_GRAD_KERNEL_LAUNCHER(float16); DEFINE_KERNEL_LAUNCHER(LogSoftmax, float16);
DEFINE_GRAD_KERNEL_LAUNCHER(float); DEFINE_KERNEL_LAUNCHER(LogSoftmax, float);
DEFINE_GRAD_KERNEL_LAUNCHER(double); DEFINE_KERNEL_LAUNCHER(LogSoftmax, double);
DEFINE_GRAD_KERNEL_LAUNCHER(SoftmaxGrad, float16);
DEFINE_GRAD_KERNEL_LAUNCHER(SoftmaxGrad, float);
DEFINE_GRAD_KERNEL_LAUNCHER(SoftmaxGrad, double);
DEFINE_GRAD_KERNEL_LAUNCHER(LogSoftmaxGrad, float16);
DEFINE_GRAD_KERNEL_LAUNCHER(LogSoftmaxGrad, float);
DEFINE_GRAD_KERNEL_LAUNCHER(LogSoftmaxGrad, double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER #undef DEFINE_GRAD_KERNEL_LAUNCHER
......
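Across these CUDA files the launchers now schedule exactly one thread block per reduced row (`<<<NxS, CUDA_THREADS, ...>>>`) instead of `CUDA_2D_BLOCKS(NxS)` blocks. Below is a minimal standalone sketch of that pattern, assuming only CUB; the kernel, names, and sizes are illustrative and not the framework's.

```cuda
#include <cstdio>
#include <cub/block/block_reduce.cuh>

constexpr int kThreads = 256;  // stand-in for CUDA_THREADS

// Illustrative only: one thread block reduces one row of a [rows, cols]
// matrix to its maximum, mirroring the block-per-row launches above.
__global__ void RowMax(const int rows, const int cols, const float* x, float* y) {
  using Reduce = cub::BlockReduce<float, kThreads>;
  __shared__ typename Reduce::TempStorage storage;
  for (int i = blockIdx.x; i < rows; i += gridDim.x) {
    float val = x[i * cols];
    for (int j = threadIdx.x; j < cols; j += blockDim.x) {
      val = fmaxf(val, x[i * cols + j]);
    }
    val = Reduce(storage).Reduce(val, cub::Max());
    if (threadIdx.x == 0) y[i] = val;  // reduced value lives in thread 0
    __syncthreads();  // required before TempStorage is reused
  }
}

int main() {
  const int rows = 1024, cols = 4096;
  float *x, *y;
  cudaMallocManaged(&x, sizeof(float) * rows * cols);
  cudaMallocManaged(&y, sizeof(float) * rows);
  for (int i = 0; i < rows * cols; ++i) x[i] = float(i % 97);
  RowMax<<<rows, kThreads>>>(rows, cols, x, y);  // one block per row
  cudaDeviceSynchronize();
  printf("row 0 max: %f\n", y[0]);
  cudaFree(x);
  cudaFree(y);
  return 0;
}
```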
...@@ -53,25 +53,25 @@ __global__ void _ArgReduce( ...@@ -53,25 +53,25 @@ __global__ void _ArgReduce(
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(name, T, CompareFunctor, kInit) \ #define DEFINE_KERNEL_LAUNCHER(name, T, CompareFunctor, kInit) \
template <> \ template <> \
void name<T, CUDAContext>( \ void name<T, CUDAContext>( \
const int N, \ const int N, \
const int S, \ const int S, \
const int C, \ const int C, \
const T* x, \ const T* x, \
int64_t* y, \ int64_t* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
using ScalarT = math::ScalarType<T>::type; \ using ScalarT = math::ScalarType<T>::type; \
const auto NxS = N * S; \ const auto NxS = N * S; \
_ArgReduce<<<CUDA_2D_BLOCKS(NxS), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ _ArgReduce<<<NxS, CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxS, \ NxS, \
S, \ S, \
C, \ C, \
ArgFunctor<ScalarT, CompareFunctor<ScalarT>>(), \ ArgFunctor<ScalarT, CompareFunctor<ScalarT>>(), \
kInit, \ kInit, \
reinterpret_cast<const ScalarT*>(x), \ reinterpret_cast<const ScalarT*>(x), \
y); \ y); \
} }
DEFINE_KERNEL_LAUNCHER( DEFINE_KERNEL_LAUNCHER(
......
...@@ -134,17 +134,17 @@ __global__ void _GetTopK( ...@@ -134,17 +134,17 @@ __global__ void _GetTopK(
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
#define DISPATCH_BLOCKSORT_KERNEL(T, kItemsPerThread) \ #define DISPATCH_BLOCKSORT_KERNEL(T, kItemsPerThread) \
_BlockSort<T, kItemsPerThread> \ _BlockSort<T, kItemsPerThread> \
<<<CUDA_2D_BLOCKS(NxS), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ <<<NxS, CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxS, \ NxS, \
S, \ S, \
C, \ C, \
K, \ K, \
largest > 0, \ largest > 0, \
init, \ init, \
reinterpret_cast<const T*>(x), \ reinterpret_cast<const T*>(x), \
reinterpret_cast<T*>(value), \ reinterpret_cast<T*>(value), \
index) index)
#define DEFINE_KERNEL_LAUNCHER(T, kLowest, kMax) \ #define DEFINE_KERNEL_LAUNCHER(T, kLowest, kMax) \
......
...@@ -17,9 +17,8 @@ void _Transpose( ...@@ -17,9 +17,8 @@ void _Transpose(
const auto N = const auto N =
std::accumulate(y_dims, y_dims + num_dims, 1, std::multiplies<int64_t>()); std::accumulate(y_dims, y_dims + num_dims, 1, std::multiplies<int64_t>());
vec64_t index(num_dims, 0); vec64_t index(num_dims, 0);
int64_t xi;
for (int yi = 0; yi < N; ++yi) { for (int yi = 0; yi < N; ++yi) {
xi = 0; int64_t xi = 0;
for (int d = num_dims - 1; d >= 0; --d) { for (int d = num_dims - 1; d >= 0; --d) {
xi += index[d] * x_strides[d]; xi += index[d] * x_strides[d];
} }
...@@ -28,27 +27,6 @@ void _Transpose( ...@@ -28,27 +27,6 @@ void _Transpose(
} }
} }
template <typename T>
void _TransposeGrad(
const int num_dims,
const int64_t* x_strides,
const int64_t* y_dims,
const T* dy,
T* dx) {
const auto N =
std::accumulate(y_dims, y_dims + num_dims, 1, std::multiplies<int64_t>());
vec64_t index(num_dims, 0);
int64_t xi;
for (int yi = 0; yi < N; ++yi) {
xi = 0;
for (int d = num_dims - 1; d >= 0; --d) {
xi += index[d] * x_strides[d];
}
dx[xi] = dy[yi];
math::utils::IncreaseIndexInDims(num_dims, y_dims, index.data());
}
}
} // namespace } // namespace
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
...@@ -73,9 +51,6 @@ DEFINE_KERNEL_LAUNCHER(Transpose, int64_t); ...@@ -73,9 +51,6 @@ DEFINE_KERNEL_LAUNCHER(Transpose, int64_t);
DEFINE_KERNEL_LAUNCHER(Transpose, float16); DEFINE_KERNEL_LAUNCHER(Transpose, float16);
DEFINE_KERNEL_LAUNCHER(Transpose, float); DEFINE_KERNEL_LAUNCHER(Transpose, float);
DEFINE_KERNEL_LAUNCHER(Transpose, double); DEFINE_KERNEL_LAUNCHER(Transpose, double);
DEFINE_KERNEL_LAUNCHER(TransposeGrad, float16);
DEFINE_KERNEL_LAUNCHER(TransposeGrad, float);
DEFINE_KERNEL_LAUNCHER(TransposeGrad, double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels } // namespace kernels
......
...@@ -13,14 +13,14 @@ namespace { ...@@ -13,14 +13,14 @@ namespace {
template <typename T, int D> template <typename T, int D>
__global__ void _Transpose( __global__ void _Transpose(
const int N, const int N,
const int num_dims,
const SimpleArray<int, D> X_strides, const SimpleArray<int, D> X_strides,
const SimpleArray<int, D> Y_dims, const SimpleArray<int, D> Y_dims,
const T* x, const T* x,
T* y) { T* y) {
CUDA_1D_KERNEL_LOOP(yi, N) { CUDA_1D_KERNEL_LOOP(yi, N) {
int xi = 0, tmp = yi; int xi = 0, tmp = yi;
for (int d = num_dims - 1; d >= 0; --d) { #pragma unroll
for (int d = D - 1; d >= 0; --d) {
int r; int r;
FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r); FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r);
xi += r * X_strides.data[d]; xi += r * X_strides.data[d];
...@@ -30,60 +30,76 @@ __global__ void _Transpose( ...@@ -30,60 +30,76 @@ __global__ void _Transpose(
} }
template <typename T, int D> template <typename T, int D>
__global__ void _TransposeGrad( void _TransposeImpl(
const int N, const int N,
const int num_dims, const int64_t* x_strides,
const SimpleArray<int, D> X_strides, const int64_t* y_dims,
const SimpleArray<int, D> Y_dims, const T* x,
const T* dy, T* y,
T* dx) { CUDAContext* ctx) {
CUDA_1D_KERNEL_LOOP(yi, N) { SimpleArray<int, D> X_strides, Y_dims;
int xi = 0, tmp = yi; for (int i = 0; i < D; ++i) {
for (int d = num_dims - 1; d >= 0; --d) { X_strides.data[i] = x_strides[i];
int r; Y_dims.data[i] = y_dims[i];
FIXED_DIVISOR_DIV_MOD(Y_dims.data[d], tmp, &tmp, &r);
xi += r * X_strides.data[d];
}
dx[xi] = dy[yi];
} }
_Transpose<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
N, X_strides, Y_dims, x, y);
} }
} // namespace } // namespace
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(name, T) \ #define DEFINE_KERNEL_LAUNCHER(T) \
template <> \ template <> \
void name<T, CUDAContext>( \ void Transpose<T, CUDAContext>( \
const int num_dims, \ const int num_dims, \
const int64_t* x_strides, \ const int64_t* x_strides, \
const int64_t* y_dims, \ const int64_t* y_dims, \
const T* x, \ const T* x, \
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
CUDA_TENSOR_DIMS_CHECK(num_dims); \ CUDA_TENSOR_DIMS_CHECK(num_dims); \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> X_strides, Y_dims; \ const auto N = std::accumulate( \
const auto N = std::accumulate( \ y_dims, y_dims + num_dims, 1, std::multiplies<int64_t>()); \
y_dims, y_dims + num_dims, 1, std::multiplies<int64_t>()); \ switch (num_dims) { \
for (int i = 0; i < num_dims; ++i) { \ case 1: \
X_strides.data[i] = x_strides[i]; \ _TransposeImpl<T, 1>(N, x_strides, y_dims, x, y, ctx); \
Y_dims.data[i] = y_dims[i]; \ break; \
} \ case 2: \
_##name<<<CUDA_BLOCKS(N), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ _TransposeImpl<T, 2>(N, x_strides, y_dims, x, y, ctx); \
N, num_dims, X_strides, Y_dims, x, y); \ break; \
case 3: \
_TransposeImpl<T, 3>(N, x_strides, y_dims, x, y, ctx); \
break; \
case 4: \
_TransposeImpl<T, 4>(N, x_strides, y_dims, x, y, ctx); \
break; \
case 5: \
_TransposeImpl<T, 5>(N, x_strides, y_dims, x, y, ctx); \
break; \
case 6: \
_TransposeImpl<T, 6>(N, x_strides, y_dims, x, y, ctx); \
break; \
case 7: \
_TransposeImpl<T, 7>(N, x_strides, y_dims, x, y, ctx); \
break; \
case 8: \
_TransposeImpl<T, 8>(N, x_strides, y_dims, x, y, ctx); \
break; \
default: \
break; \
} \
} }
DEFINE_KERNEL_LAUNCHER(Transpose, bool); DEFINE_KERNEL_LAUNCHER(bool);
DEFINE_KERNEL_LAUNCHER(Transpose, uint8_t); DEFINE_KERNEL_LAUNCHER(uint8_t);
DEFINE_KERNEL_LAUNCHER(Transpose, int8_t); DEFINE_KERNEL_LAUNCHER(int8_t);
DEFINE_KERNEL_LAUNCHER(Transpose, int); DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(Transpose, int64_t); DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(Transpose, float16); DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(Transpose, float); DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(Transpose, double); DEFINE_KERNEL_LAUNCHER(double);
DEFINE_KERNEL_LAUNCHER(TransposeGrad, float16);
DEFINE_KERNEL_LAUNCHER(TransposeGrad, float);
DEFINE_KERNEL_LAUNCHER(TransposeGrad, double);
#undef DEFINE_KERNEL_LAUNCHER #undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels } // namespace kernels
......
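Both transpose paths implement the same gather: each output linear index $i$ is decomposed into coordinates $(r_0,\dots,r_{D-1})$ under the output dims, and the source offset is accumulated from the permuted input strides,

$$x_{\text{idx}}(i) = \sum_{d=0}^{D-1} r_d(i)\, s_d, \qquad y[i] = x[x_{\text{idx}}(i)].$$

Templating `_TransposeImpl` on the rank `D` lets `#pragma unroll` fully unroll the per-dimension loop; `CUDA_TENSOR_DIMS_CHECK` guards the supported rank before the switch.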
#include "dragon/utils/device/common_openmp.h"
#include "dragon/utils/math_functions.h" #include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
...@@ -16,18 +15,14 @@ void _RowwiseMoments( ...@@ -16,18 +15,14 @@ void _RowwiseMoments(
AccT* mean, AccT* mean,
AccT* var) { AccT* var) {
const AccT scale = AccT(1) / AccT(rows); const AccT scale = AccT(1) / AccT(rows);
#ifdef USE_OPENMP
#pragma omp parallel for num_threads(OMP_THREADS(cols))
#endif
for (int i = 0; i < cols; ++i) { for (int i = 0; i < cols; ++i) {
AccT x_val, m_val = AccT(0), v_val = AccT(0); AccT m_val = AccT(0), v_val = AccT(0);
for (int j = 0; j < rows; ++j) { for (int j = 0; j < rows; ++j) {
x_val = convert::To<AccT>(x[j * cols + i]); const AccT val = convert::To<AccT>(x[j * cols + i]);
m_val += x_val; m_val += val;
v_val += x_val * x_val; v_val += val * val;
} }
m_val *= scale; mean[i] = m_val = m_val * scale;
mean[i] = m_val;
var[i] = v_val * scale - m_val * m_val; var[i] = v_val * scale - m_val * m_val;
} }
} }
...@@ -40,18 +35,15 @@ void _ColwiseMoments( ...@@ -40,18 +35,15 @@ void _ColwiseMoments(
AccT* mean, AccT* mean,
AccT* var) { AccT* var) {
const AccT scale = AccT(1) / AccT(cols); const AccT scale = AccT(1) / AccT(cols);
#ifdef USE_OPENMP
#pragma omp parallel for num_threads(OMP_THREADS(rows))
#endif
for (int i = 0; i < rows; ++i) { for (int i = 0; i < rows; ++i) {
AccT x_val, m_val = AccT(0), v_val = AccT(0); const int offset = i * cols;
AccT m_val = AccT(0), v_val = AccT(0);
for (int j = 0; j < cols; ++j) { for (int j = 0; j < cols; ++j) {
x_val = convert::To<AccT>(x[i * cols + j]); const AccT val = convert::To<AccT>(x[offset + j]);
m_val += x_val; m_val += val;
v_val += x_val * x_val; v_val += val * val;
} }
m_val *= scale; mean[i] = m_val = m_val * scale;
mean[i] = m_val;
var[i] = v_val * scale - m_val * m_val; var[i] = v_val * scale - m_val * m_val;
} }
} }
...@@ -67,25 +59,20 @@ void _GenericMoments( ...@@ -67,25 +59,20 @@ void _GenericMoments(
AccT* mean, AccT* mean,
AccT* var) { AccT* var) {
const AccT scale = AccT(1) / AccT(cols); const AccT scale = AccT(1) / AccT(cols);
#ifdef USE_OPENMP
#pragma omp parallel for num_threads(OMP_THREADS(rows))
#endif
for (int i = 0; i < rows; ++i) { for (int i = 0; i < rows; ++i) {
AccT x_val, m_val = AccT(0), v_val = AccT(0); const int offset = i * cols;
int xi, c, r; AccT m_val = AccT(0), v_val = AccT(0);
for (int j = 0; j < cols; ++j) { for (int j = 0; j < cols; ++j) {
xi = 0; int xi = 0, c = offset + j, r;
c = i * cols + j;
for (int d = num_dims - 1; d >= 0; --d) { for (int d = num_dims - 1; d >= 0; --d) {
FIXED_DIVISOR_DIV_MOD(x_dims[d], c, &c, &r); FIXED_DIVISOR_DIV_MOD(x_dims[d], c, &c, &r);
xi += r * x_strides[d]; xi += r * x_strides[d];
} }
x_val = convert::To<AccT>(x[xi]); const AccT val = convert::To<AccT>(x[xi]);
m_val += x_val; m_val += val;
v_val += x_val * x_val; v_val += val * val;
} }
m_val *= scale; mean[i] = m_val = m_val * scale;
mean[i] = m_val;
var[i] = v_val * scale - m_val * m_val; var[i] = v_val * scale - m_val * m_val;
} }
} }
......
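Each of the rewritten CPU reducers (and the CUDA versions in the next file) makes a single pass that accumulates the sum and the sum of squares, then derives the statistics as

$$\mu = \frac{1}{n}\sum_i x_i, \qquad \sigma^2 = \frac{1}{n}\sum_i x_i^2 - \mu^2 .$$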
...@@ -11,9 +11,6 @@ namespace kernels { ...@@ -11,9 +11,6 @@ namespace kernels {
namespace { namespace {
#define LDG(x, i) __ldg(x + i)
#define LDG2(x, i) convert::To<AccT>(__ldg(x + i))
template <typename T, typename AccT> template <typename T, typename AccT>
__global__ void _RowwiseMoments( __global__ void _RowwiseMoments(
const int rows, const int rows,
...@@ -27,16 +24,15 @@ __global__ void _RowwiseMoments( ...@@ -27,16 +24,15 @@ __global__ void _RowwiseMoments(
CUDA_2D_KERNEL_LOOP1(i, cols) { CUDA_2D_KERNEL_LOOP1(i, cols) {
AccT m_val = AccT(0), v_val = AccT(0); AccT m_val = AccT(0), v_val = AccT(0);
CUDA_2D_KERNEL_LOOP2(j, rows) { CUDA_2D_KERNEL_LOOP2(j, rows) {
const int xi = j * cols + i; const AccT val = convert::To<AccT>(x[j * cols + i]);
m_val += LDG2(x, xi); m_val += val;
v_val += math::utils::Square(LDG2(x, xi)); v_val += val * val;
} }
m_val = BlockReduce<AccT>(m_storage).Sum(m_val); m_val = BlockReduce<AccT>(m_storage).Sum(m_val);
v_val = BlockReduce<AccT>(v_storage).Sum(v_val); v_val = BlockReduce<AccT>(v_storage).Sum(v_val);
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
const AccT mu = m_val * scale; mean[i] = m_val = m_val * scale;
mean[i] = mu; var[i] = v_val * scale - m_val * m_val;
var[i] = v_val * scale - mu * mu;
} }
} }
} }
...@@ -54,16 +50,15 @@ __global__ void _ColwiseMoments( ...@@ -54,16 +50,15 @@ __global__ void _ColwiseMoments(
CUDA_2D_KERNEL_LOOP1(i, rows) { CUDA_2D_KERNEL_LOOP1(i, rows) {
AccT m_val = AccT(0), v_val = AccT(0); AccT m_val = AccT(0), v_val = AccT(0);
CUDA_2D_KERNEL_LOOP2(j, cols) { CUDA_2D_KERNEL_LOOP2(j, cols) {
const int xi = i * cols + j; const AccT val = convert::To<AccT>(x[i * cols + j]);
m_val += LDG2(x, xi); m_val += val;
v_val += math::utils::Square(LDG2(x, xi)); v_val += val * val;
} }
m_val = BlockReduce<AccT>(m_storage).Sum(m_val); m_val = BlockReduce<AccT>(m_storage).Sum(m_val);
v_val = BlockReduce<AccT>(v_storage).Sum(v_val); v_val = BlockReduce<AccT>(v_storage).Sum(v_val);
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
const AccT mu = m_val * scale; mean[i] = m_val = m_val * scale;
mean[i] = mu; var[i] = v_val * scale - m_val * m_val;
var[i] = v_val * scale - mu * mu;
} }
} }
} }
...@@ -90,15 +85,15 @@ __global__ void _GenericMoments( ...@@ -90,15 +85,15 @@ __global__ void _GenericMoments(
FIXED_DIVISOR_DIV_MOD(X_dims.data[d], c, &c, &r); FIXED_DIVISOR_DIV_MOD(X_dims.data[d], c, &c, &r);
xi += r * X_strides.data[d]; xi += r * X_strides.data[d];
} }
m_val += LDG2(x, xi); const AccT val = convert::To<AccT>(x[xi]);
v_val += math::utils::Square(LDG2(x, xi)); m_val += val;
v_val += val * val;
} }
m_val = BlockReduce<AccT>(m_storage).Sum(m_val); m_val = BlockReduce<AccT>(m_storage).Sum(m_val);
v_val = BlockReduce<AccT>(v_storage).Sum(v_val); v_val = BlockReduce<AccT>(v_storage).Sum(v_val);
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
const AccT mu = m_val * scale; mean[i] = m_val = m_val * scale;
mean[i] = mu; var[i] = v_val * scale - m_val * m_val;
var[i] = v_val * scale - mu * mu;
} }
} }
} }
...@@ -120,20 +115,14 @@ void _Moments( ...@@ -120,20 +115,14 @@ void _Moments(
} }
if (math::utils::IsRowwiseReduce( if (math::utils::IsRowwiseReduce(
num_dims, dims, out_dims.data(), &rows, &cols)) { num_dims, dims, out_dims.data(), &rows, &cols)) {
_RowwiseMoments<<< _RowwiseMoments<<<cols, CUDA_THREADS, 0, ctx->cuda_stream()>>>(
CUDA_2D_BLOCKS(cols), rows, cols, x, mean, var);
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(rows, cols, x, mean, var);
return; return;
} }
if (math::utils::IsColwiseReduce( if (math::utils::IsColwiseReduce(
num_dims, dims, out_dims.data(), &rows, &cols)) { num_dims, dims, out_dims.data(), &rows, &cols)) {
_ColwiseMoments<<< _ColwiseMoments<<<rows, CUDA_THREADS, 0, ctx->cuda_stream()>>>(
CUDA_2D_BLOCKS(rows), rows, cols, x, mean, var);
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(rows, cols, x, mean, var);
return; return;
} }
CUDA_TENSOR_DIMS_CHECK(num_dims); CUDA_TENSOR_DIMS_CHECK(num_dims);
...@@ -155,17 +144,10 @@ void _Moments( ...@@ -155,17 +144,10 @@ void _Moments(
for (int i = 0; i < num_dims; ++i) { for (int i = 0; i < num_dims; ++i) {
transpose_dims.data[i] = dims[transpose_axes.data[i]]; transpose_dims.data[i] = dims[transpose_axes.data[i]];
} }
_GenericMoments<<< _GenericMoments<<<rows, CUDA_THREADS, 0, ctx->cuda_stream()>>>(
CUDA_2D_BLOCKS(rows),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(
rows, cols, num_dims, transpose_dims, transpose_strides, x, mean, var); rows, cols, num_dims, transpose_dims, transpose_strides, x, mean, var);
} }
#undef LDG
#undef LDG2
} // namespace } // namespace
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
......
...@@ -218,7 +218,7 @@ __global__ void _BatchNormInferenceGrad( ...@@ -218,7 +218,7 @@ __global__ void _BatchNormInferenceGrad(
_BatchNormExpectation, \ _BatchNormExpectation, \
math::ScalarType<T>::type, \ math::ScalarType<T>::type, \
AccT, \ AccT, \
CUDA_2D_BLOCKS(C), \ C, \
CUDA_THREADS, \ CUDA_THREADS, \
N, \ N, \
C, \ C, \
...@@ -245,7 +245,7 @@ __global__ void _BatchNormInferenceGrad( ...@@ -245,7 +245,7 @@ __global__ void _BatchNormInferenceGrad(
_BatchNormWGrad, \ _BatchNormWGrad, \
math::ScalarType<T>::type, \ math::ScalarType<T>::type, \
AccT, \ AccT, \
CUDA_2D_BLOCKS(C), \ C, \
CUDA_THREADS, \ CUDA_THREADS, \
N, \ N, \
C, \ C, \
...@@ -314,7 +314,7 @@ __global__ void _BatchNormInferenceGrad( ...@@ -314,7 +314,7 @@ __global__ void _BatchNormInferenceGrad(
_BatchNormWGrad, \ _BatchNormWGrad, \
math::ScalarType<T>::type, \ math::ScalarType<T>::type, \
AccT, \ AccT, \
CUDA_2D_BLOCKS(C), \ C, \
CUDA_THREADS, \ CUDA_THREADS, \
N, \ N, \
C, \ C, \
......
...@@ -230,7 +230,7 @@ __global__ void _GroupNormGrad( ...@@ -230,7 +230,7 @@ __global__ void _GroupNormGrad(
_GroupNormWGrad, \ _GroupNormWGrad, \
math::ScalarType<T>::type, \ math::ScalarType<T>::type, \
AccT, \ AccT, \
CUDA_2D_BLOCKS(G* D), \ G* D, \
CUDA_THREADS, \ CUDA_THREADS, \
N, \ N, \
G, \ G, \
...@@ -246,7 +246,7 @@ __global__ void _GroupNormGrad( ...@@ -246,7 +246,7 @@ __global__ void _GroupNormGrad(
_GroupNormInternalGrad, \ _GroupNormInternalGrad, \
math::ScalarType<T>::type, \ math::ScalarType<T>::type, \
AccT, \ AccT, \
CUDA_2D_BLOCKS(N* G), \ N* G, \
CUDA_THREADS, \ CUDA_THREADS, \
N, \ N, \
G, \ G, \
......
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
namespace kernels {
namespace {
template <typename T, typename AccT>
void _LayerNorm(
const int N,
const int C,
const AccT epsilon,
const T* x,
const AccT* gamma,
const AccT* beta,
AccT* mu,
AccT* rsig,
T* y) {
const AccT scale = AccT(1) / AccT(C);
for (int i = 0; i < N; ++i) {
const int offset = i * C;
AccT m_val = AccT(0), v_val = AccT(0);
for (int j = 0; j < C; ++j) {
const AccT val = convert::To<AccT>(x[offset + j]);
m_val += val;
v_val += val * val;
}
mu[i] = m_val = m_val * scale;
v_val = std::sqrt(v_val * scale - m_val * m_val + epsilon);
rsig[i] = v_val = AccT(1) / v_val;
for (int j = 0; j < C; ++j) {
AccT val = convert::To<AccT>(x[offset + j]);
val = (val - m_val) * v_val;
y[offset + j] = convert::To<T>(val * gamma[j] + beta[j]);
}
}
}
} // namespace
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T, AccT) \
template <> \
void LayerNorm<T, AccT, CPUContext>( \
const int N, \
const int C, \
const float epsilon, \
const T* x, \
const AccT* gamma, \
const AccT* beta, \
AccT* mu, \
AccT* rsig, \
T* y, \
CPUContext* ctx) { \
_LayerNorm(N, C, AccT(epsilon), x, gamma, beta, mu, rsig, y); \
}
DEFINE_KERNEL_LAUNCHER(float16, float);
DEFINE_KERNEL_LAUNCHER(float, float);
DEFINE_KERNEL_LAUNCHER(double, double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels
} // namespace dragon
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/device/common_cub.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
namespace kernels {
namespace {
template <typename T, typename AccT>
__global__ void _LayerNorm(
const int N,
const int C,
const AccT epsilon,
const T* x,
const AccT* gamma,
const AccT* beta,
AccT* mu,
AccT* rsig,
T* y) {
__shared__ AccT block_mu, block_rsig;
__shared__ typename BlockReduce<AccT>::TempStorage m_storage;
__shared__ typename BlockReduce<AccT>::TempStorage v_storage;
const AccT scale = AccT(1) / AccT(C);
CUDA_2D_KERNEL_LOOP1(i, N) {
AccT m_val = AccT(0), v_val = AccT(0);
CUDA_2D_KERNEL_LOOP2(j, C) {
const AccT val = convert::To<AccT>(__ldg(x + i * C + j));
m_val += val;
v_val += val * val;
}
m_val = BlockReduce<AccT>(m_storage).Sum(m_val);
v_val = BlockReduce<AccT>(v_storage).Sum(v_val);
if (threadIdx.x == 0) {
mu[i] = block_mu = m_val = m_val * scale;
rsig[i] = block_rsig = rsqrt(v_val * scale - m_val * m_val + epsilon);
}
__syncthreads();
CUDA_2D_KERNEL_LOOP2(j, C) {
const int index = i * C + j;
m_val = convert::To<AccT>(__ldg(x + index));
m_val = (m_val - block_mu) * block_rsig;
y[index] = convert::To<T>(fma(m_val, __ldg(gamma + j), __ldg(beta + j)));
}
}
}
} // namespace
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(T, AccT) \
template <> \
void LayerNorm<T, AccT, CUDAContext>( \
const int N, \
const int C, \
const float epsilon, \
const T* x, \
const AccT* gamma, \
const AccT* beta, \
AccT* mu, \
AccT* rsig, \
T* y, \
CUDAContext* ctx) { \
_LayerNorm<<<N, CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
N, \
C, \
AccT(epsilon), \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
gamma, \
beta, \
mu, \
rsig, \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
}
DEFINE_KERNEL_LAUNCHER(float16, float);
DEFINE_KERNEL_LAUNCHER(float, float);
DEFINE_KERNEL_LAUNCHER(double, double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernels
} // namespace dragon
#endif // USE_CUDA
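Both new LayerNorm kernels (the CPU and CUDA versions above) compute, per row $i$ of length $C$,

$$\mu_i = \frac{1}{C}\sum_j x_{ij}, \qquad \mathrm{rsig}_i = \Big(\tfrac{1}{C}\sum_j x_{ij}^2 - \mu_i^2 + \epsilon\Big)^{-1/2}, \qquad y_{ij} = \gamma_j\,(x_{ij}-\mu_i)\,\mathrm{rsig}_i + \beta_j,$$

and store $\mu_i$ and $\mathrm{rsig}_i$ for the backward pass.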
...@@ -147,52 +147,52 @@ __global__ void _L2NormalizeGrad( ...@@ -147,52 +147,52 @@ __global__ void _L2NormalizeGrad(
/* ------------------- Launcher Separator ------------------- */ /* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(name, T, AccT) \ #define DEFINE_KERNEL_LAUNCHER(name, T, AccT) \
template <> \ template <> \
void name<T, CUDAContext>( \ void name<T, CUDAContext>( \
const int N, \ const int N, \
const int S, \ const int S, \
const int C, \ const int C, \
const float normalizer, \ const float normalizer, \
const float epsilon, \ const float epsilon, \
const T* x, \ const T* x, \
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
const auto NxS = N * S; \ const auto NxS = N * S; \
_##name<math::ScalarType<T>::type, AccT> \ _##name<math::ScalarType<T>::type, AccT> \
<<<CUDA_2D_BLOCKS(NxS), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ <<<NxS, CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxS, \ NxS, \
S, \ S, \
C, \ C, \
AccT(normalizer), \ AccT(normalizer), \
AccT(epsilon), \ AccT(epsilon), \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \ reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \ reinterpret_cast<math::ScalarType<T>::type*>(y)); \
} }
#define DEFINE_GRAD_KERNEL_LAUNCHER(name, T, AccT) \ #define DEFINE_GRAD_KERNEL_LAUNCHER(name, T, AccT) \
template <> \ template <> \
void name<T, CUDAContext>( \ void name<T, CUDAContext>( \
const int N, \ const int N, \
const int S, \ const int S, \
const int C, \ const int C, \
const float normalizer, \ const float normalizer, \
const float epsilon, \ const float epsilon, \
const T* dy, \ const T* dy, \
const T* x, \ const T* x, \
T* dx, \ T* dx, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
const auto NxS = N * S; \ const auto NxS = N * S; \
_##name<math::ScalarType<T>::type, AccT> \ _##name<math::ScalarType<T>::type, AccT> \
<<<CUDA_2D_BLOCKS(NxS), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \ <<<NxS, CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxS, \ NxS, \
S, \ S, \
C, \ C, \
AccT(normalizer), \ AccT(normalizer), \
AccT(epsilon), \ AccT(epsilon), \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \ reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \ reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(dx)); \ reinterpret_cast<math::ScalarType<T>::type*>(dx)); \
} }
DEFINE_KERNEL_LAUNCHER(L1Normalize, float16, float); DEFINE_KERNEL_LAUNCHER(L1Normalize, float16, float);
......
...@@ -415,7 +415,7 @@ DEFINE_KERNEL_LAUNCHER(Col2Im2d, true, double); ...@@ -415,7 +415,7 @@ DEFINE_KERNEL_LAUNCHER(Col2Im2d, true, double);
_Im2ColNd, \ _Im2ColNd, \
kTransposed, \ kTransposed, \
math::ScalarType<T>::type, \ math::ScalarType<T>::type, \
CUDA_2D_BLOCKS(outer_dim), \ outer_dim, \
CUDA_THREADS, \ CUDA_THREADS, \
channels, \ channels, \
kernel_dim, \ kernel_dim, \
......
...@@ -56,18 +56,18 @@ void _RoiAlign( ...@@ -56,18 +56,18 @@ void _RoiAlign(
const T* x, const T* x,
const float* rois, const float* rois,
T* y) { T* y) {
auto x_inner_dim = H * W; const auto HxW = H * W;
auto y_inner_dim = out_h * out_w; const auto HoxWo = out_h * out_w;
auto x_cols = C * x_inner_dim; const auto CxHxW = C * HxW;
auto y_cols = C * y_inner_dim; const auto CxHoxWo = C * HoxWo;
for (int n = 0; n < num_rois; ++n) { for (int n = 0; n < num_rois; ++n) {
auto* roi = rois + n * 5; auto* roi = rois + n * 5;
int batch_ind = (int)roi[0]; int batch_ind = (int)roi[0];
auto* offset_y = y + n * y_cols; auto* offset_y = y + n * CxHoxWo;
if (batch_ind < 0) { if (batch_ind < 0) {
memset(offset_y, 0, sizeof(T) * y_cols); memset(offset_y, 0, sizeof(T) * CxHoxWo);
continue; continue;
} }
...@@ -78,19 +78,21 @@ void _RoiAlign( ...@@ -78,19 +78,21 @@ void _RoiAlign(
const float roi_w = std::max(roi_wend - roi_wstart, 1.f); const float roi_w = std::max(roi_wend - roi_wstart, 1.f);
const float roi_h = std::max(roi_hend - roi_hstart, 1.f); const float roi_h = std::max(roi_hend - roi_hstart, 1.f);
const float bin_h = roi_h / (float)out_h; const float bin_h = roi_h / float(out_h);
const float bin_w = roi_w / (float)out_w; const float bin_w = roi_w / float(out_w);
const int grid_h = const int grid_h = sampling_ratio > 0
sampling_ratio > 0 ? sampling_ratio : (int)std::ceil(roi_h / out_h); ? sampling_ratio
const int grid_w = : int(std::ceil(roi_h / float(out_h)));
sampling_ratio > 0 ? sampling_ratio : (int)std::ceil(roi_w / out_w); const int grid_w = sampling_ratio > 0
? sampling_ratio
: int(std::ceil(roi_w / float(out_w)));
const T num_grids = T(grid_h * grid_w); const T num_grids = T(grid_h * grid_w);
int yi; int yi;
T val; T val;
float hstart, wstart, h, w; float hstart, wstart, h, w;
const T* offset_x = x + batch_ind * x_cols; const T* offset_x = x + batch_ind * CxHxW;
for (int c = 0; c < C; ++c) { for (int c = 0; c < C; ++c) {
yi = 0; yi = 0;
...@@ -109,8 +111,8 @@ void _RoiAlign( ...@@ -109,8 +111,8 @@ void _RoiAlign(
offset_y[yi++] = val / num_grids; offset_y[yi++] = val / num_grids;
} }
} // End h_out && w_out } // End h_out && w_out
offset_x += x_inner_dim; offset_x += HxW;
offset_y += y_inner_dim; offset_y += HoxWo;
} // End c } // End c
} // End n } // End n
} }
......
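Editor's note: a hedged sketch of how the bin size and sampling grid above are chosen for one RoI (clamped extent, bin = roi extent / output size, adaptive grid when sampling_ratio <= 0); illustration only, not the operator's API.

import math

def roi_align_grid(roi_h, roi_w, out_h, out_w, sampling_ratio):
    roi_h, roi_w = max(roi_h, 1.0), max(roi_w, 1.0)    # clamp degenerate RoIs
    bin_h, bin_w = roi_h / out_h, roi_w / out_w        # size of one output cell
    grid_h = sampling_ratio if sampling_ratio > 0 else int(math.ceil(roi_h / out_h))
    grid_w = sampling_ratio if sampling_ratio > 0 else int(math.ceil(roi_w / out_w))
    return bin_h, bin_w, grid_h * grid_w               # samples averaged per output cell

print(roi_align_grid(13.0, 27.0, 7, 7, sampling_ratio=0))  # adaptive grid per bin
print(roi_align_grid(13.0, 27.0, 7, 7, sampling_ratio=2))  # fixed 2x2 grid per bin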
...@@ -123,16 +123,16 @@ __global__ void _RoiAlign( ...@@ -123,16 +123,16 @@ __global__ void _RoiAlign(
const float roi_w = max(roi_wend - roi_wstart, 1.f); const float roi_w = max(roi_wend - roi_wstart, 1.f);
const float roi_h = max(roi_hend - roi_hstart, 1.f); const float roi_h = max(roi_hend - roi_hstart, 1.f);
const float bin_h = roi_h / (float)out_h; const float bin_h = roi_h / float(out_h);
const float bin_w = roi_w / (float)out_w; const float bin_w = roi_w / float(out_w);
const float hstart = roi_hstart + h_out * bin_h; const float hstart = roi_hstart + h_out * bin_h;
const float wstart = roi_wstart + w_out * bin_w; const float wstart = roi_wstart + w_out * bin_w;
const int grid_h = const int grid_h =
sampling_ratio > 0 ? sampling_ratio : ceil(roi_h / out_h); sampling_ratio > 0 ? sampling_ratio : int(ceil(roi_h / float(out_h)));
const int grid_w = const int grid_w =
sampling_ratio > 0 ? sampling_ratio : ceil(roi_w / out_w); sampling_ratio > 0 ? sampling_ratio : int(ceil(roi_w / float(out_w)));
const T* offset_x = x + (batch_ind * C + c) * H * W; const T* offset_x = x + (batch_ind * C + c) * H * W;
AccT val = AccT(0); AccT val = AccT(0);
...@@ -178,16 +178,16 @@ __global__ void _RoiAlignGrad( ...@@ -178,16 +178,16 @@ __global__ void _RoiAlignGrad(
const float roi_w = max(roi_wend - roi_wstart, 1.f); const float roi_w = max(roi_wend - roi_wstart, 1.f);
const float roi_h = max(roi_hend - roi_hstart, 1.f); const float roi_h = max(roi_hend - roi_hstart, 1.f);
const float bin_h = roi_h / (float)out_h; const float bin_h = roi_h / float(out_h);
const float bin_w = roi_w / (float)out_w; const float bin_w = roi_w / float(out_w);
const float hstart = roi_hstart + h_out * bin_h; const float hstart = roi_hstart + h_out * bin_h;
const float wstart = roi_wstart + w_out * bin_w; const float wstart = roi_wstart + w_out * bin_w;
const int grid_h = const int grid_h =
sampling_ratio > 0 ? sampling_ratio : ceil(roi_h / out_h); sampling_ratio > 0 ? sampling_ratio : int(ceil(roi_h / float(out_h)));
const int grid_w = const int grid_w =
sampling_ratio > 0 ? sampling_ratio : ceil(roi_w / out_w); sampling_ratio > 0 ? sampling_ratio : int(ceil(roi_w / float(out_w)));
const float dyi = convert::To<float>(dy[yi]) / float(grid_h * grid_w); const float dyi = convert::To<float>(dy[yi]) / float(grid_h * grid_w);
float* offset_dx = dx + (batch_ind * C + c) * H * W; float* offset_dx = dx + (batch_ind * C + c) * H * W;
......
...@@ -19,21 +19,21 @@ void _RoiPool( ...@@ -19,21 +19,21 @@ void _RoiPool(
const float* rois, const float* rois,
int* mask, int* mask,
T* y) { T* y) {
auto x_inner_dim = H * W; const auto HxW = H * W;
auto y_inner_dim = out_h * out_w; const auto HoxWo = out_h * out_w;
auto x_cols = C * x_inner_dim; const auto CxHxW = C * HxW;
auto y_cols = C * y_inner_dim; const auto CxHoxWo = C * HoxWo;
for (int n = 0; n < num_rois; ++n) { for (int n = 0; n < num_rois; ++n) {
auto* roi = rois + n * 5; auto* roi = rois + n * 5;
auto* offset_y = y + n * y_cols; auto* offset_y = y + n * CxHoxWo;
auto* offset_mask = mask + n * y_cols; auto* offset_mask = mask + n * CxHoxWo;
const int batch_ind = (int)roi[0]; const int batch_ind = (int)roi[0];
if (batch_ind < 0) { if (batch_ind < 0) {
memset(offset_y, 0, sizeof(T) * y_cols); memset(offset_y, 0, sizeof(T) * CxHoxWo);
memset(offset_mask, -1, sizeof(int) * y_cols); memset(offset_mask, -1, sizeof(int) * CxHoxWo);
continue; continue;
} }
...@@ -44,14 +44,14 @@ void _RoiPool( ...@@ -44,14 +44,14 @@ void _RoiPool(
const int roi_w = std::max(roi_wend - roi_wstart + 1, 1); const int roi_w = std::max(roi_wend - roi_wstart + 1, 1);
const int roi_h = std::max(roi_hend - roi_hstart + 1, 1); const int roi_h = std::max(roi_hend - roi_hstart + 1, 1);
const float bin_h = (float)roi_h / (float)out_h; const float bin_h = float(roi_h) / float(out_h);
const float bin_w = (float)roi_w / (float)out_w; const float bin_w = float(roi_w) / float(out_w);
T val; T val;
bool empty; bool empty;
int xi, yi, mask_val; int xi, yi, mask_val;
int hstart, wstart, hend, wend; int hstart, wstart, hend, wend;
const T* offset_x = x + batch_ind * x_cols; const T* offset_x = x + batch_ind * CxHxW;
for (int c = 0; c < C; ++c) { for (int c = 0; c < C; ++c) {
yi = 0; yi = 0;
...@@ -82,9 +82,9 @@ void _RoiPool( ...@@ -82,9 +82,9 @@ void _RoiPool(
offset_mask[yi++] = mask_val; offset_mask[yi++] = mask_val;
} }
} // End h_out && w_out } // End h_out && w_out
offset_x += x_inner_dim; offset_x += HxW;
offset_y += y_inner_dim; offset_y += HoxWo;
offset_mask += y_inner_dim; offset_mask += HoxWo;
} // End c } // End c
} // End n } // End n
} }
......
#include "dragon/operators/activation/log_softmax_op.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
template <class Context>
template <typename T>
void LogSoftmaxOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0, {0});
GET_OP_AXIS_ARG(axis, X.ndim(), -1);
kernels::LogSoftmax(
X.count(0, axis),
X.count(axis + 1),
X.dim(axis),
X.template data<T, Context>(),
Y->ReshapeLike(X)->template mutable_data<T, Context>(),
ctx());
}
template <class Context>
template <typename T>
void LogSoftmaxGradientOp<Context>::DoRunWithType() {
auto &Y = Input(0), &dY = Input(1), *dX = Output(0);
GET_OP_AXIS_ARG(axis, Y.ndim(), -1);
kernels::LogSoftmaxGrad(
Y.count(0, axis),
Y.count(axis + 1),
Y.dim(axis),
dY.template data<T, Context>(),
Y.template data<T, Context>(),
dX->ReshapeLike(Y)->template mutable_data<T, Context>(),
ctx());
}
DEPLOY_CPU_OPERATOR(LogSoftmax);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(LogSoftmax);
#endif
DEPLOY_CPU_OPERATOR(LogSoftmaxGradient);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(LogSoftmaxGradient);
#endif
OPERATOR_SCHEMA(LogSoftmax)
/* X */
.NumInputs(1)
/* Y */
.NumOutputs(1)
/* X => Y */
.AllowInplace({{0, 0}});
OPERATOR_SCHEMA(LogSoftmaxGradient)
/* Y, dY */
.NumInputs(2)
/* dX */
.NumOutputs(1)
/* dY => dX */
.AllowInplace({{1, 0}});
REGISTER_GRADIENT(LogSoftmax, InplaceGradientMaker);
} // namespace dragon
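Editor's note: a hedged NumPy reference (not part of this commit) for the forward identity behind the new LogSoftmax operator, written in the numerically stable shift-by-max form.

import numpy as np

def log_softmax_reference(x, axis=-1):
    shifted = x - x.max(axis=axis, keepdims=True)       # guard exp() against overflow
    return shifted - np.log(np.exp(shifted).sum(axis=axis, keepdims=True))

x = np.array([[1.0, 2.0, 3.0], [10.0, 10.0, 10.0]], 'float32')
y = log_softmax_reference(x)
print(np.exp(y).sum(axis=-1))  # each row of exp(y) sums to 1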
...@@ -10,37 +10,41 @@ ...@@ -10,37 +10,41 @@
* ------------------------------------------------------------ * ------------------------------------------------------------
*/ */
#ifndef DRAGON_OPERATORS_VISION_DEPTH_TO_SPACE_OP_H_ #ifndef DRAGON_OPERATORS_ACTIVATION_LOG_SOFTMAX_OP_H_
#define DRAGON_OPERATORS_VISION_DEPTH_TO_SPACE_OP_H_ #define DRAGON_OPERATORS_ACTIVATION_LOG_SOFTMAX_OP_H_
#include "dragon/operators/array/transpose_op.h" #include "dragon/core/operator.h"
namespace dragon { namespace dragon {
template <class Context> template <class Context>
class DepthToSpaceOp final : public Operator<Context> { class LogSoftmaxOp : public Operator<Context> {
public: public:
DepthToSpaceOp(const OperatorDef& def, Workspace* ws) SIMPLE_CTOR_DTOR(LogSoftmaxOp);
: Operator<Context>(def, ws),
block_size_(OP_SINGLE_ARG(int, "block_size", 2)) {}
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override; void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T> template <typename T>
void DoRunWithType(); void DoRunWithType();
protected:
int64_t block_size_;
}; };
template <class Context> template <class Context>
class DepthToSpaceGradientOp final : public TransposeGradientOp<Context> { class LogSoftmaxGradientOp : public Operator<Context> {
public: public:
DepthToSpaceGradientOp(const OperatorDef& def, Workspace* ws) SIMPLE_CTOR_DTOR(LogSoftmaxGradientOp);
: TransposeGradientOp<Context>(def, ws) {} USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T>
void DoRunWithType();
}; };
} // namespace dragon } // namespace dragon
#endif // DRAGON_OPERATORS_VISION_DEPTH_TO_SPACE_OP_H_ #endif // DRAGON_OPERATORS_ACTIVATION_LOG_SOFTMAX_OP_H_
...@@ -18,11 +18,6 @@ void SoftmaxOp<Context>::DoRunWithType() { ...@@ -18,11 +18,6 @@ void SoftmaxOp<Context>::DoRunWithType() {
} }
template <class Context> template <class Context>
void SoftmaxOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <class Context>
template <typename T> template <typename T>
void SoftmaxGradientOp<Context>::DoRunWithType() { void SoftmaxGradientOp<Context>::DoRunWithType() {
auto &Y = Input(0), &dY = Input(1), *dX = Output(0); auto &Y = Input(0), &dY = Input(1), *dX = Output(0);
...@@ -37,11 +32,6 @@ void SoftmaxGradientOp<Context>::DoRunWithType() { ...@@ -37,11 +32,6 @@ void SoftmaxGradientOp<Context>::DoRunWithType() {
ctx()); ctx());
} }
template <class Context>
void SoftmaxGradientOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
DEPLOY_CPU_OPERATOR(Softmax); DEPLOY_CPU_OPERATOR(Softmax);
#ifdef USE_CUDA #ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(Softmax); DEPLOY_CUDA_OPERATOR(Softmax);
......
...@@ -23,7 +23,9 @@ class SoftmaxOp : public Operator<Context> { ...@@ -23,7 +23,9 @@ class SoftmaxOp : public Operator<Context> {
SIMPLE_CTOR_DTOR(SoftmaxOp); SIMPLE_CTOR_DTOR(SoftmaxOp);
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override; void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T> template <typename T>
void DoRunWithType(); void DoRunWithType();
...@@ -35,10 +37,12 @@ class SoftmaxGradientOp : public Operator<Context> { ...@@ -35,10 +37,12 @@ class SoftmaxGradientOp : public Operator<Context> {
SIMPLE_CTOR_DTOR(SoftmaxGradientOp); SIMPLE_CTOR_DTOR(SoftmaxGradientOp);
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T> template <typename T>
void DoRunWithType(); void DoRunWithType();
void RunOnDevice() override;
}; };
#ifdef USE_CUDNN #ifdef USE_CUDNN
......
...@@ -14,69 +14,60 @@ void RepeatOp<Context>::DoRunWithType() { ...@@ -14,69 +14,60 @@ void RepeatOp<Context>::DoRunWithType() {
// Determine the repeat scheme // Determine the repeat scheme
  // 1) Repeat to a flattened vector if axis is not specified // 1) Repeat to a flattened vector if axis is not specified
// 2) Repeat along the specified axis // 2) Repeat along the specified axis
int64_t outer_dim, axis_dim, inner_dim; int64_t N, C, S;
int64_t reps = repeats();
if (axis == INT_MAX) { if (axis == INT_MAX) {
outer_dim = inner_dim = 1; N = S = 1;
axis_dim = X.count(); C = X.count();
Y->Reshape({axis_dim * repeats()}); Y->Reshape({C * reps});
} else { } else {
axis_dim = X.dim(axis); C = X.dim(axis);
outer_dim = X.count(0, axis); N = X.count(0, axis);
inner_dim = X.count(axis + 1); S = X.count(axis + 1);
auto Y_dims = X.dims(); auto Y_dims = X.dims();
Y_dims[axis] *= repeats(); Y_dims[axis] *= reps;
Y->Reshape(Y_dims); Y->Reshape(Y_dims);
} }
  // Dispatch the repeat kernel // Dispatch the repeat kernel
kernels::Repeat( kernels::Repeat(
outer_dim, N,
inner_dim, S,
axis_dim, C,
repeats(), reps,
X.template data<T, Context>(), X.template data<T, Context>(),
Y->template mutable_data<T, Context>(), Y->template mutable_data<T, Context>(),
ctx()); ctx());
} }
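Editor's note: a small NumPy illustration of the two repeat schemes described in the comment above (flattened when no axis is given, otherwise along that axis); not tied to the kernel's signature.

import numpy as np

x = np.arange(6).reshape(2, 3)
print(np.repeat(x, 2))          # no axis: a flattened vector of length count * repeats
print(np.repeat(x, 2, axis=1))  # with axis: that dimension is scaled by the repeat factor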
template <class Context> template <class Context>
void RepeatOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Generic>::Call(this, Input(0));
}
template <class Context>
template <typename T> template <typename T>
void RepeatGradientOp<Context>::DoRunWithType() { void RepeatGradientOp<Context>::DoRunWithType() {
auto &X = INPUT_SPEC(0), &dY = Input(0), *dX = Output(0); auto &X = INPUT_SPEC(0), &dY = Input(0), *dX = Output(0);
GET_OP_AXIS_ARG(axis, X.ndim(), INT_MAX); GET_OP_AXIS_ARG(axis, X.ndim(), INT_MAX);
// Determine the repeat scheme // Determine the repeat scheme
int64_t outer_dim, axis_dim, inner_dim; int64_t N, C, S;
if (axis == INT_MAX) { if (axis == INT_MAX) {
outer_dim = inner_dim = 1; N = S = 1;
axis_dim = X.count(); C = X.count();
} else { } else {
outer_dim = X.count(0, axis); N = X.count(0, axis);
axis_dim = X.dim(axis); C = X.dim(axis);
inner_dim = X.count(axis + 1); S = X.count(axis + 1);
} }
// Reduce the gradient along the axis // Reduce the gradient along the axis
kernels::RepeatGrad( kernels::RepeatGrad(
outer_dim, N,
inner_dim, S,
axis_dim, C,
repeats(), repeats(),
dY.template data<T, Context>(), dY.template data<T, Context>(),
dX->ReshapeLike(X)->template mutable_data<T, Context>(), dX->ReshapeLike(X)->template mutable_data<T, Context>(),
ctx()); ctx());
} }
template <class Context>
void RepeatGradientOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
DEPLOY_CPU_OPERATOR(Repeat); DEPLOY_CPU_OPERATOR(Repeat);
#ifdef USE_CUDA #ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(Repeat); DEPLOY_CUDA_OPERATOR(Repeat);
......
...@@ -20,12 +20,15 @@ namespace dragon { ...@@ -20,12 +20,15 @@ namespace dragon {
template <class Context> template <class Context>
class RepeatOp final : public Operator<Context> { class RepeatOp final : public Operator<Context> {
public: public:
RepeatOp(const OperatorDef& def, Workspace* ws) : Operator<Context>(def, ws) { explicit RepeatOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws) {
INITIALIZE_OP_SINGLE_ARG(int64_t, repeats, 1); INITIALIZE_OP_SINGLE_ARG(int64_t, repeats, 1);
} }
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override; void RunOnDevice() override {
DispatchHelper<dtypes::Generic>::Call(this, Input(0));
}
template <typename T> template <typename T>
void DoRunWithType(); void DoRunWithType();
...@@ -43,7 +46,9 @@ class RepeatGradientOp final : public Operator<Context> { ...@@ -43,7 +46,9 @@ class RepeatGradientOp final : public Operator<Context> {
} }
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override; void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T> template <typename T>
void DoRunWithType(); void DoRunWithType();
......
...@@ -17,16 +17,22 @@ void TransposeOp<Context>::DoRunWithType() { ...@@ -17,16 +17,22 @@ void TransposeOp<Context>::DoRunWithType() {
<< "\nProviding " << num_axes << " dimensions to permute, " << "\nProviding " << num_axes << " dimensions to permute, "
<< "while Tensor(" << X.name() << ")'s dims are " << X.DimString(); << "while Tensor(" << X.name() << ")'s dims are " << X.DimString();
vec64_t new_axes(num_dims);
for (int i = 0; i < num_dims; ++i) { for (int i = 0; i < num_dims; ++i) {
auto axis = num_axes > 0 ? perm(i) : num_dims - i - 1; new_axes[i] = num_axes > 0 ? perm(i) : num_dims - i - 1;
X_strides[i] = X.stride(axis);
Y_dims[i] = X.dim(axis);
} }
// Store for the gradient calculation if (def().type() == "TransposeGradient") {
SET_INPUT_SPEC(0); auto old_axes(new_axes);
Buffer("X_strides")->template CopyFrom<int64_t>(X_strides); for (int i = 0; i < num_dims; ++i) {
Buffer("Y_dims")->template CopyFrom<int64_t>(Y_dims); new_axes[old_axes[i]] = i;
}
}
for (int i = 0; i < num_dims; ++i) {
X_strides[i] = X.stride(new_axes[i]);
Y_dims[i] = X.dim(new_axes[i]);
}
kernels::Transpose( kernels::Transpose(
num_dims, num_dims,
...@@ -37,43 +43,11 @@ void TransposeOp<Context>::DoRunWithType() { ...@@ -37,43 +43,11 @@ void TransposeOp<Context>::DoRunWithType() {
ctx()); ctx());
} }
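Editor's note: a hedged sketch of why inverting the permutation (new_axes[old_axes[i]] = i) lets the forward transpose kernel also serve the gradient path; NumPy used for illustration only.

import numpy as np

perm = [2, 0, 1]
inv = [0] * len(perm)
for i, p in enumerate(perm):
    inv[p] = i                                  # inverse permutation: inv[perm[i]] = i

x = np.random.randn(2, 3, 4)
y = x.transpose(perm)                           # forward Transpose
print(np.array_equal(y.transpose(inv), x))      # True: the inverse perm restores x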
template <class Context>
void TransposeOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Generic>::Call(this, Input(0));
}
template <class Context>
template <typename T>
void TransposeGradientOp<Context>::DoRunWithType() {
auto &dY = Input(0), *dX = Output(0);
dX->ReshapeLike(INPUT_SPEC(0));
vec64_t X_strides, Y_dims;
Buffer("X_strides")->template CopyTo<int64_t>(X_strides);
Buffer("Y_dims")->template CopyTo<int64_t>(Y_dims);
kernels::TransposeGrad(
X_strides.size(),
X_strides.data(),
Y_dims.data(),
dY.template data<T, Context>(),
dX->template mutable_data<T, Context>(),
ctx());
}
template <class Context>
void TransposeGradientOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
DEPLOY_CPU_OPERATOR(Transpose); DEPLOY_CPU_OPERATOR(Transpose);
REGISTER_CPU_OPERATOR(TransposeGradient, TransposeOp<CPUContext>);
#ifdef USE_CUDA #ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(Transpose); DEPLOY_CUDA_OPERATOR(Transpose);
#endif REGISTER_CUDA_OPERATOR(TransposeGradient, TransposeOp<CUDAContext>);
DEPLOY_CPU_OPERATOR(TransposeGradient);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(TransposeGradient);
#endif #endif
OPERATOR_SCHEMA(Transpose) OPERATOR_SCHEMA(Transpose)
......
...@@ -26,7 +26,9 @@ class TransposeOp final : public Operator<Context> { ...@@ -26,7 +26,9 @@ class TransposeOp final : public Operator<Context> {
} }
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override; void RunOnDevice() override {
DispatchHelper<dtypes::Generic>::Call(this, Input(0));
}
template <typename T> template <typename T>
void DoRunWithType(); void DoRunWithType();
...@@ -35,18 +37,6 @@ class TransposeOp final : public Operator<Context> { ...@@ -35,18 +37,6 @@ class TransposeOp final : public Operator<Context> {
DECLARE_OP_REPEATED_ARG(int64_t, perm); DECLARE_OP_REPEATED_ARG(int64_t, perm);
}; };
template <class Context>
class TransposeGradientOp : public Operator<Context> {
public:
SIMPLE_CTOR_DTOR(TransposeGradientOp);
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
template <typename T>
void DoRunWithType();
};
DEFINE_OP_REPEATED_ARG(int64_t, TransposeOp, perm); DEFINE_OP_REPEATED_ARG(int64_t, TransposeOp, perm);
} // namespace dragon } // namespace dragon
......
...@@ -55,6 +55,7 @@ DISPATCH_WITH_TENSOR_TYPES(GreaterEqual, dtypes::Generic, Input(0)); ...@@ -55,6 +55,7 @@ DISPATCH_WITH_TENSOR_TYPES(GreaterEqual, dtypes::Generic, Input(0));
ctx()); \ ctx()); \
} }
DEFINE_SIMPLE_UNARY_OP_IMPL(Log, T);
DEFINE_SIMPLE_UNARY_OP_IMPL(Sin, T); DEFINE_SIMPLE_UNARY_OP_IMPL(Sin, T);
DEFINE_SIMPLE_UNARY_OP_IMPL(Cos, T); DEFINE_SIMPLE_UNARY_OP_IMPL(Cos, T);
DEFINE_SIMPLE_UNARY_OP_IMPL(Square, T); DEFINE_SIMPLE_UNARY_OP_IMPL(Square, T);
...@@ -83,7 +84,6 @@ DEFINE_INPLACE_UNARY_OP_IMPL(Sign, T); ...@@ -83,7 +84,6 @@ DEFINE_INPLACE_UNARY_OP_IMPL(Sign, T);
DEFINE_INPLACE_UNARY_OP_IMPL(Sqrt, T); DEFINE_INPLACE_UNARY_OP_IMPL(Sqrt, T);
DEFINE_INPLACE_UNARY_OP_IMPL(Rsqrt, T); DEFINE_INPLACE_UNARY_OP_IMPL(Rsqrt, T);
DEFINE_INPLACE_UNARY_OP_IMPL(Exp, T); DEFINE_INPLACE_UNARY_OP_IMPL(Exp, T);
DEFINE_INPLACE_UNARY_OP_IMPL(Log, T);
DEFINE_INPLACE_UNARY_OP_IMPL(BitwiseNot, T); DEFINE_INPLACE_UNARY_OP_IMPL(BitwiseNot, T);
#undef DEFINE_INPLACE_UNARY_OP_IMPL #undef DEFINE_INPLACE_UNARY_OP_IMPL
......
...@@ -3,36 +3,35 @@ ...@@ -3,36 +3,35 @@
namespace dragon { namespace dragon {
template <class Context> template <class Context>
template <typename LogitT, typename TargetT> template <typename InputT, typename TargetT>
void AccuracyOp<Context>::DoRunWithType() { void AccuracyOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0); auto &X = Input(0), &Y = Input(1), *R = Output(0);
GET_OP_AXIS_ARG(axis, X.ndim(), -1); GET_OP_AXIS_ARG(axis, X.ndim(), -1);
auto outer_dim = X.count(0, axis); const auto C = X.dim(axis);
auto axis_dim = X.dim(axis); const auto N = X.count(0, axis);
auto inner_dim = X.count(axis + 1); const auto S = X.count(axis + 1);
const auto NxS = N * S;
const auto CxS = C * S;
  CHECK_EQ(Y.count(), NxS) << "\nNumel of X and Y must match.";
CHECK_EQ(outer_dim * inner_dim, Input(1).count()) auto* input = X.template data<InputT, CPUContext>();
<< "\nNumber of preds must match the number of targets."; auto* target = Y.template data<TargetT, CPUContext>();
int64_t acc = 0, count = 0; int64_t acc = 0, count = 0;
int64_t cols = X.count() / outer_dim; for (int i = 0; i < N; ++i) {
for (int j = 0; j < S; ++j) {
auto* logit = X.template data<LogitT, CPUContext>(); const int label = target[i * S + j];
auto* target = Input(1).template data<TargetT, CPUContext>();
for (int i = 0; i < outer_dim; ++i) {
for (int j = 0; j < inner_dim; ++j) {
const int label = target[i * inner_dim + j];
if (label == ignore_index_) continue; if (label == ignore_index_) continue;
vector<pair<LogitT, int>> vec; vector<pair<InputT, int>> vec;
for (int k = 0; k < axis_dim; k++) for (int k = 0; k < C; ++k) {
vec.push_back(std::make_pair(logit[i * cols + k * inner_dim + j], k)); vec.push_back(std::make_pair(input[i * CxS + k * S + j], k));
}
std::partial_sort( std::partial_sort(
vec.begin(), vec.begin(),
vec.begin() + top_k_, vec.begin() + top_k_,
vec.end(), vec.end(),
std::greater<pair<LogitT, int>>()); std::greater<pair<InputT, int>>());
for (int k = 0; k < top_k_; k++) { for (int k = 0; k < top_k_; k++) {
if (vec[k].second == label) { if (vec[k].second == label) {
acc++; acc++;
...@@ -40,10 +39,10 @@ void AccuracyOp<Context>::DoRunWithType() { ...@@ -40,10 +39,10 @@ void AccuracyOp<Context>::DoRunWithType() {
} }
} }
count++; count++;
} // End inner_dim }
} // End outer_dim }
Y->Reshape({})->template mutable_data<float, CPUContext>()[0] = R->Reshape({})->template mutable_data<float, CPUContext>()[0] =
(float)acc / (float)count; (float)acc / (float)count;
} }
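Editor's note: a hedged NumPy sketch of the top-k accuracy computed above, assuming logits of shape (N, C) with no spatial axis and the same ignore_index convention; illustration only.

import numpy as np

def topk_accuracy(logits, targets, top_k=1, ignore_index=-1):
    keep = targets != ignore_index
    logits, targets = logits[keep], targets[keep]
    topk = np.argsort(-logits, axis=1)[:, :top_k]        # indices of the k largest logits
    correct = (topk == targets[:, None]).any(axis=1)     # hit if the label is among them
    return correct.sum() / float(len(targets))

logits = np.array([[0.1, 0.7, 0.2], [0.5, 0.3, 0.2]], 'float32')
print(topk_accuracy(logits, np.array([2, 0]), top_k=1))  # 0.5
print(topk_accuracy(logits, np.array([2, 0]), top_k=2))  # 1.0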
...@@ -79,9 +78,9 @@ DEPLOY_CUDA_OPERATOR(Accuracy); ...@@ -79,9 +78,9 @@ DEPLOY_CUDA_OPERATOR(Accuracy);
#endif #endif
OPERATOR_SCHEMA(Accuracy) OPERATOR_SCHEMA(Accuracy)
/* X, T */ /* X, Y */
.NumInputs(2) .NumInputs(2)
/* Y */ /* R */
.NumOutputs(1); .NumOutputs(1);
NO_GRADIENT(Accuracy); NO_GRADIENT(Accuracy);
......
...@@ -284,13 +284,13 @@ DEPLOY_CUDA_OPERATOR(BatchNormGradient); ...@@ -284,13 +284,13 @@ DEPLOY_CUDA_OPERATOR(BatchNormGradient);
#endif #endif
OPERATOR_SCHEMA(BatchNorm) OPERATOR_SCHEMA(BatchNorm)
/* X, W, B, RunningMean, RunningVar */ /* X, W, B, RM, RV */
.NumInputs(5) .NumInputs(5)
/* Y */ /* Y */
.NumOutputs(1); .NumOutputs(1);
OPERATOR_SCHEMA(BatchNormGradient) OPERATOR_SCHEMA(BatchNormGradient)
/* X, W, RunningMean, RunningVar, dY */ /* X, W, RM, RV, dY */
.NumInputs(5) .NumInputs(5)
/* dX, dW, dB */ /* dX, dW, dB */
.NumOutputs(3); .NumOutputs(3);
......
...@@ -9,12 +9,16 @@ template <class Context> ...@@ -9,12 +9,16 @@ template <class Context>
template <typename T> template <typename T>
void GroupNormOp<Context>::DoRunWithType() { void GroupNormOp<Context>::DoRunWithType() {
using ParamT = typename math::AccmulatorType<T>::type; using ParamT = typename math::AccmulatorType<T>::type;
INITIALIZE_TENSOR_VIA_SPEC(Input(1), vec64_t({C_}), ParamT); auto &X = Input(0), *Y = Output(0);
INITIALIZE_TENSOR_VIA_SPEC(Input(2), vec64_t({C_}), ParamT); auto &W = Input(1), &B = Input(2);
GetBaseArguments();
INITIALIZE_TENSOR_VIA_SPEC(W, vec64_t({C_}), ParamT);
INITIALIZE_TENSOR_VIA_SPEC(B, vec64_t({C_}), ParamT);
auto* X_mu = Buffer("X_mu")->Reshape({N_, G_}); auto* X_mu = Buffer("X_mu")->Reshape({N_, G_});
auto* X_rsig = Buffer("X_rsig")->Reshape({N_, G_}); auto* X_rsig = Buffer("X_rsig")->Reshape({N_, G_});
auto* x = Input(0).template data<T, Context>(); auto* x = X.template data<T, Context>();
auto* mu = X_mu->template mutable_data<ParamT, Context>(); auto* mu = X_mu->template mutable_data<ParamT, Context>();
auto* rsig = X_rsig->template mutable_data<ParamT, Context>(); auto* rsig = X_rsig->template mutable_data<ParamT, Context>();
...@@ -29,10 +33,10 @@ void GroupNormOp<Context>::DoRunWithType() { ...@@ -29,10 +33,10 @@ void GroupNormOp<Context>::DoRunWithType() {
kernels::Moments(4, dims.data(), 2, axes.data(), x, mu, rsig, ctx()); kernels::Moments(4, dims.data(), 2, axes.data(), x, mu, rsig, ctx());
} }
// Inverse stddev from variance // Inverse stddev from variance.
math::InvStd(N_ * G_, epsilon_, rsig, rsig, ctx()); math::InvStd(N_ * G_, epsilon_, rsig, rsig, ctx());
// Fuse parameters to compute affine transformation // Fuse parameters to compute affine transformation.
auto* scratch = auto* scratch =
ctx()->workspace()->template data<ParamT, Context>({2 * N_ * C_})[0]; ctx()->workspace()->template data<ParamT, Context>({2 * N_ * C_})[0];
kernels::GroupNorm( kernels::GroupNorm(
...@@ -44,29 +48,24 @@ void GroupNormOp<Context>::DoRunWithType() { ...@@ -44,29 +48,24 @@ void GroupNormOp<Context>::DoRunWithType() {
x, x,
mu, mu,
rsig, rsig,
Input(1).template data<ParamT, Context>(), // gamma W.template data<ParamT, Context>(),
Input(2).template data<ParamT, Context>(), // beta B.template data<ParamT, Context>(),
scratch, scratch,
scratch + N_ * C_, scratch + N_ * C_,
Output(0)->template mutable_data<T, Context>(), Y->ReshapeLike(X)->template mutable_data<T, Context>(),
ctx()); ctx());
} }
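Editor's note: a hedged NumPy sketch of the "fuse parameters" step above: the per-group mean and inverse stddev are folded into a per-(sample, channel) scale and bias so the normalization reduces to one fused multiply-add. The shapes and NCHW-style layout are assumptions, not the kernel's exact memory layout.

import numpy as np

N, G, D, S = 2, 4, 8, 16                      # batch, groups, channels per group, spatial
C = G * D
x = np.random.randn(N, C, S)
gamma, beta = np.random.randn(C), np.random.randn(C)

xg = x.reshape(N, G, D * S)
mu = xg.mean(axis=2)                                        # (N, G)
rsig = 1.0 / np.sqrt(xg.var(axis=2) + 1e-5)                 # (N, G)

scale = gamma.reshape(1, G, D) * rsig[:, :, None]           # (N, G, D): gamma * rsig
bias = beta.reshape(1, G, D) - mu[:, :, None] * scale       # (N, G, D): beta - mu * scale
y = x * scale.reshape(N, C, 1) + bias.reshape(N, C, 1)      # single fused multiply-add

ref = (xg - mu[:, :, None]) * rsig[:, :, None]
ref = ref.reshape(N, C, S) * gamma.reshape(1, C, 1) + beta.reshape(1, C, 1)
print(np.allclose(y, ref))  # True: fused form equals the plain normalization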
template <class Context> template <class Context>
void GroupNormOp<Context>::RunOnDevice() {
GetBaseArguments();
Output(0)->ReshapeLike(Input(0));
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <class Context>
template <typename T> template <typename T>
void GroupNormGradientOp<Context>::DoRunWithType() { void GroupNormGradientOp<Context>::DoRunWithType() {
using ParamT = typename math::AccmulatorType<T>::type; using ParamT = typename math::AccmulatorType<T>::type;
auto *dX = Output(0), *dW = Output(1), *dB = Output(2); auto &X = Input(0), &W = Input(1), &dY = Input(2);
auto *X_mu = Buffer("X_mu"), *X_rsig = Buffer("X_rsig"); auto *X_mu = Buffer("X_mu"), *X_rsig = Buffer("X_rsig");
auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
GetBaseArguments();
// Gradient w.r.t. gamma, beta and input // Gradient w.r.t. gamma, beta and input.
auto* scratch = auto* scratch =
ctx()->workspace()->template data<ParamT, Context>({2 * N_ * G_})[0]; ctx()->workspace()->template data<ParamT, Context>({2 * N_ * G_})[0];
kernels::GroupNormGrad( kernels::GroupNormGrad(
...@@ -75,26 +74,19 @@ void GroupNormGradientOp<Context>::DoRunWithType() { ...@@ -75,26 +74,19 @@ void GroupNormGradientOp<Context>::DoRunWithType() {
D_, D_,
S_, S_,
data_format(), data_format(),
Input(0).template data<T, Context>(), // x X.template data<T, Context>(),
X_mu->template data<ParamT, Context>(), X_mu->template data<ParamT, Context>(),
X_rsig->template data<ParamT, Context>(), X_rsig->template data<ParamT, Context>(),
Input(1).template data<ParamT, Context>(), // gamma W.template data<ParamT, Context>(),
Input(2).template data<T, Context>(), // dy dY.template data<T, Context>(),
scratch, scratch,
scratch + N_ * G_, scratch + N_ * G_,
dW->Reshape({C_})->template mutable_data<ParamT, Context>(), dW->Reshape({C_})->template mutable_data<ParamT, Context>(),
dB->Reshape({C_})->template mutable_data<ParamT, Context>(), dB->Reshape({C_})->template mutable_data<ParamT, Context>(),
dX->template mutable_data<T, Context>(), dX->ReshapeLike(X)->template mutable_data<T, Context>(),
ctx()); ctx());
} }
template <class Context>
void GroupNormGradientOp<Context>::RunOnDevice() {
GetBaseArguments();
Output(0)->ReshapeLike(Input(0));
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
DEPLOY_CPU_OPERATOR(GroupNorm); DEPLOY_CPU_OPERATOR(GroupNorm);
#ifdef USE_CUDA #ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(GroupNorm); DEPLOY_CUDA_OPERATOR(GroupNorm);
......
...@@ -64,7 +64,9 @@ class GroupNormOp : public GroupNormOpBase<Context> { ...@@ -64,7 +64,9 @@ class GroupNormOp : public GroupNormOpBase<Context> {
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
USE_GROUPNORM_FUNCTIONS; USE_GROUPNORM_FUNCTIONS;
void RunOnDevice() override; void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T> template <typename T>
void DoRunWithType(); void DoRunWithType();
...@@ -78,7 +80,9 @@ class GroupNormGradientOp : public GroupNormOpBase<Context> { ...@@ -78,7 +80,9 @@ class GroupNormGradientOp : public GroupNormOpBase<Context> {
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
USE_GROUPNORM_FUNCTIONS; USE_GROUPNORM_FUNCTIONS;
void RunOnDevice() override; void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T> template <typename T>
void DoRunWithType(); void DoRunWithType();
......
#include "dragon/operators/normalization/layer_norm_op.h" #include "dragon/operators/normalization/layer_norm_op.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
template <class Context>
template <typename T>
void LayerNormOp<Context>::DoRunWithType() {
using ParamT = typename math::AccmulatorType<T>::type;
auto &X = Input(0), *Y = Output(0);
auto &W = Input(1), &B = Input(2);
GET_OP_AXIS_ARG(axis, X.ndim(), -1);
const auto N = X.count(0, axis);
const auto C = X.count(axis);
INITIALIZE_TENSOR_VIA_SPEC(W, vec64_t({C}), ParamT);
INITIALIZE_TENSOR_VIA_SPEC(B, vec64_t({C}), ParamT);
auto* X_mu = Buffer("X_mu")->Reshape({N});
auto* X_rsig = Buffer("X_rsig")->Reshape({N});
kernels::LayerNorm(
N,
C,
epsilon_,
X.template data<T, Context>(),
W.template data<ParamT, Context>(),
B.template data<ParamT, Context>(),
X_mu->template mutable_data<ParamT, Context>(),
X_rsig->template mutable_data<ParamT, Context>(),
Y->ReshapeLike(X)->template mutable_data<T, Context>(),
ctx());
}
DEPLOY_CPU_OPERATOR(LayerNorm); DEPLOY_CPU_OPERATOR(LayerNorm);
#ifdef USE_CUDA #ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(LayerNorm); DEPLOY_CUDA_OPERATOR(LayerNorm);
......
...@@ -18,22 +18,22 @@ ...@@ -18,22 +18,22 @@
namespace dragon { namespace dragon {
template <class Context> template <class Context>
class LayerNormOp final : public GroupNormOp<Context> { class LayerNormOp final : public Operator<Context> {
public: public:
LayerNormOp(const OperatorDef& def, Workspace* ws) LayerNormOp(const OperatorDef& def, Workspace* ws)
: GroupNormOp<Context>(def, ws) {} : Operator<Context>(def, ws),
epsilon_(OP_SINGLE_ARG(double, "epsilon", 1e-5)) {}
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
void GetBaseArguments() override { void RunOnDevice() override {
auto& X = Input(0); DispatchHelper<dtypes::Floating>::Call(this, Input(0));
GET_OP_AXIS_ARG(axis, X.ndim(), -1);
// Set dimensions
this->N_ = X.count(0, axis);
this->C_ = this->D_ = X.count(axis);
this->G_ = this->S_ = 1;
// Set data format
this->data_format_ = "NHWC";
} }
template <typename T>
void DoRunWithType();
protected:
double epsilon_;
}; };
template <class Context> template <class Context>
......
...@@ -62,9 +62,6 @@ template <class Context> ...@@ -62,9 +62,6 @@ template <class Context>
template <typename T> template <typename T>
void ConvOpBase<Context>::Col2Im(const T* col, T* im) { void ConvOpBase<Context>::Col2Im(const T* col, T* im) {
if (num_axes_ == 1 || num_axes_ == 2) { if (num_axes_ == 1 || num_axes_ == 2) {
// std::cout << conv_in_channels_ << std::endl;
// std::cout << in_shape_[0] << " " << in_shape_[1] << std::endl;
// std::cout << out_shape_[0] << " " << out_shape_[1] << std::endl;
kernels::Col2Im2d( kernels::Col2Im2d(
conv_in_channels_, conv_in_channels_,
in_shape_[0], in_shape_[0],
......
#include "dragon/operators/vision/depth_to_space_op.h"
#include "dragon/core/workspace.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
template <class Context>
template <typename T>
void DepthToSpaceOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0);
SET_INPUT_SPEC(0);
int start_axis, end_axis;
int num_dims = X.ndim(), num_axes = X.ndim() - 2;
  CHECK_GT(num_dims, 2) << "\nExpected the spatial input"
<< " with number of dimensions >= 3.";
// Compute the reshape and transpose arguments
vec64_t perm(size_t(num_axes * 2 + 2), 0);
vec64_t in_dims, out_shape = X.dims();
if (data_format() == "NCHW") {
start_axis = 2, end_axis = num_dims;
out_shape[1] /= std::pow(block_size_, num_axes);
in_dims = out_shape;
perm[1] = num_axes + 1;
for (int i = 0; i < num_axes; i++) {
perm[i * 2 + 2] = num_axes + i + 2;
perm[i * 2 + 3] = i + 1;
in_dims.insert(in_dims.begin() + 1, block_size_);
out_shape[start_axis + i] *= block_size_;
}
} else if (data_format() == "NHWC") {
start_axis = 1, end_axis = num_dims - 1;
out_shape[end_axis] /= std::pow(block_size_, num_axes);
in_dims = out_shape;
for (int i = 0; i < num_axes; i++) {
perm[i * 2 + 1] = i + 1;
perm[i * 2 + 2] = num_axes + i + 1;
in_dims.insert(in_dims.begin() + num_axes + 1, block_size_);
out_shape[start_axis + i] *= block_size_;
}
perm.back() = perm.size() - 1;
} else {
LOG(FATAL) << "Unknown DataFormat: " << data_format();
}
// Now, handle it as the generic transpose operation
Tensor X_reshape(in_dims);
vec64_t x_strides(in_dims.size()), y_dims(in_dims.size());
CHECK_EQ(X_reshape.count(), X.count())
<< "\nCould not rearrange " << X.DimString() << " to "
<< X_reshape.DimString() << " with block size " << block_size_ << ".";
for (int i = 0; i < in_dims.size(); i++) {
x_strides[i] = X_reshape.stride(perm[i]);
y_dims[i] = X_reshape.dim(perm[i]);
}
// Store for the gradient calculation
Buffer("X_strides")->template CopyFrom<int64_t>(x_strides);
Buffer("Y_dims")->template CopyFrom<int64_t>(y_dims);
kernels::Transpose(
x_strides.size(),
x_strides.data(),
y_dims.data(),
X.template data<T, Context>(),
Y->Reshape(out_shape)->template mutable_data<T, Context>(),
ctx());
}
template <class Context>
void DepthToSpaceOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Generic>::Call(this, Input(0));
}
DEPLOY_CPU_OPERATOR(DepthToSpace);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(DepthToSpace);
#endif
DEPLOY_CPU_OPERATOR(DepthToSpaceGradient);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(DepthToSpaceGradient);
#endif
OPERATOR_SCHEMA(DepthToSpace)
/* X */
.NumInputs(1)
/* Y */
.NumOutputs(1);
OPERATOR_SCHEMA(DepthToSpaceGradient)
/* dY */
.NumInputs(1)
/* dX */
.NumOutputs(1);
REGISTER_GRADIENT(DepthToSpace, SimpleGradientMaker);
} // namespace dragon
#include "dragon/operators/vision/space_to_depth_op.h" #include "dragon/operators/vision/space_to_depth_op.h"
#include "dragon/core/workspace.h"
#include "dragon/utils/op_kernels.h" #include "dragon/utils/op_kernels.h"
namespace dragon { namespace dragon {
...@@ -8,7 +7,6 @@ template <class Context> ...@@ -8,7 +7,6 @@ template <class Context>
template <typename T> template <typename T>
void SpaceToDepthOp<Context>::DoRunWithType() { void SpaceToDepthOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0); auto &X = Input(0), *Y = Output(0);
SET_INPUT_SPEC(0);
int start_axis, end_axis, perm_count = 0; int start_axis, end_axis, perm_count = 0;
int num_dims = X.ndim(), num_axes = X.ndim() - 2; int num_dims = X.ndim(), num_axes = X.ndim() - 2;
...@@ -16,9 +14,9 @@ void SpaceToDepthOp<Context>::DoRunWithType() { ...@@ -16,9 +14,9 @@ void SpaceToDepthOp<Context>::DoRunWithType() {
  CHECK_GT(num_dims, 2) << "\nExpected the spatial input" CHECK_GT(num_dims, 2) << "\nExpected the spatial input"
<< " with number of dimensions >= 3."; << " with number of dimensions >= 3.";
// Compute the reshape and transpose arguments // Compute the reshape and transpose arguments.
vec64_t perm(size_t(num_axes * 2 + 2)); vec64_t perm(size_t(num_axes * 2 + 2));
vec64_t in_dims, in_shape = Input(0).dims(); vec64_t in_dims, in_shape = X.dims();
vec64_t out_shape = in_shape; vec64_t out_shape = in_shape;
if (data_format() == "NCHW") { if (data_format() == "NCHW") {
...@@ -55,59 +53,111 @@ void SpaceToDepthOp<Context>::DoRunWithType() { ...@@ -55,59 +53,111 @@ void SpaceToDepthOp<Context>::DoRunWithType() {
} }
} }
// Now, handle it as the generic transpose operation // Now, handle it as the generic transpose operation.
Tensor X_reshape(in_dims); Tensor X_reshape(in_dims);
vec64_t x_strides(in_dims.size()), y_dims(in_dims.size());
CHECK_EQ(X_reshape.count(), X.count()) CHECK_EQ(X_reshape.count(), X.count())
<< "\nCould not rearrange " << X.DimString() << " to " << "\nCould not rearrange " << X.DimString() << " to "
<< X_reshape.DimString() << " with block size " << block_size_ << "."; << X_reshape.DimString() << " with block size " << block_size_ << ".";
for (int i = 0; i < in_dims.size(); i++) { vec64_t X_strides(in_dims.size());
x_strides[i] = X_reshape.stride(perm[i]); vec64_t Y_dims(in_dims.size());
y_dims[i] = X_reshape.dim(perm[i]); for (int i = 0; i < X_reshape.ndim(); i++) {
X_strides[i] = X_reshape.stride(perm[i]);
Y_dims[i] = X_reshape.dim(perm[i]);
} }
// Store for the gradient calculation
Buffer("X_strides")->template CopyFrom<int64_t>(x_strides);
Buffer("Y_dims")->template CopyFrom<int64_t>(y_dims);
kernels::Transpose( kernels::Transpose(
x_strides.size(), X_strides.size(),
x_strides.data(), X_strides.data(),
y_dims.data(), Y_dims.data(),
X.template data<T, Context>(), X.template data<T, Context>(),
Y->Reshape(out_shape)->template mutable_data<T, Context>(), Y->Reshape(out_shape)->template mutable_data<T, Context>(),
ctx()); ctx());
} }
template <class Context> template <class Context>
void SpaceToDepthOp<Context>::RunOnDevice() { template <typename T>
DispatchHelper<dtypes::Generic>::Call(this, Input(0)); void DepthToSpaceOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0);
int start_axis, end_axis;
int num_dims = X.ndim(), num_axes = X.ndim() - 2;
  CHECK_GT(num_dims, 2) << "\nExpected the spatial input"
<< " with number of dimensions >= 3.";
// Compute the reshape and transpose arguments.
vec64_t perm(size_t(num_axes * 2 + 2), 0);
vec64_t in_dims, out_shape = X.dims();
if (data_format() == "NCHW") {
start_axis = 2, end_axis = num_dims;
out_shape[1] /= std::pow(block_size_, num_axes);
in_dims = out_shape;
perm[1] = num_axes + 1;
for (int i = 0; i < num_axes; i++) {
perm[i * 2 + 2] = num_axes + i + 2;
perm[i * 2 + 3] = i + 1;
in_dims.insert(in_dims.begin() + 1, block_size_);
out_shape[start_axis + i] *= block_size_;
}
} else if (data_format() == "NHWC") {
start_axis = 1, end_axis = num_dims - 1;
out_shape[end_axis] /= std::pow(block_size_, num_axes);
in_dims = out_shape;
for (int i = 0; i < num_axes; i++) {
perm[i * 2 + 1] = i + 1;
perm[i * 2 + 2] = num_axes + i + 1;
in_dims.insert(in_dims.begin() + num_axes + 1, block_size_);
out_shape[start_axis + i] *= block_size_;
}
perm.back() = perm.size() - 1;
} else {
LOG(FATAL) << "Unknown DataFormat: " << data_format();
}
// Now, handle it as the generic transpose operation.
Tensor X_reshape(in_dims);
CHECK_EQ(X_reshape.count(), X.count())
<< "\nCould not rearrange " << X.DimString() << " to "
<< X_reshape.DimString() << " with block size " << block_size_ << ".";
vec64_t X_strides(in_dims.size());
vec64_t Y_dims(in_dims.size());
for (int i = 0; i < in_dims.size(); i++) {
X_strides[i] = X_reshape.stride(perm[i]);
Y_dims[i] = X_reshape.dim(perm[i]);
}
kernels::Transpose(
X_strides.size(),
X_strides.data(),
Y_dims.data(),
X.template data<T, Context>(),
Y->Reshape(out_shape)->template mutable_data<T, Context>(),
ctx());
} }
DEPLOY_CPU_OPERATOR(SpaceToDepth); DEPLOY_CPU_OPERATOR(SpaceToDepth);
REGISTER_CPU_OPERATOR(SpaceToDepthGradient, DepthToSpaceOp<CPUContext>);
#ifdef USE_CUDA #ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(SpaceToDepth); DEPLOY_CUDA_OPERATOR(SpaceToDepth);
REGISTER_CUDA_OPERATOR(SpaceToDepthGradient, DepthToSpaceOp<CUDAContext>);
#endif #endif
DEPLOY_CPU_OPERATOR(SpaceToDepthGradient); DEPLOY_CPU_OPERATOR(DepthToSpace);
REGISTER_CPU_OPERATOR(DepthToSpaceGradient, SpaceToDepthOp<CPUContext>);
#ifdef USE_CUDA #ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(SpaceToDepthGradient); DEPLOY_CUDA_OPERATOR(DepthToSpace);
REGISTER_CUDA_OPERATOR(DepthToSpaceGradient, SpaceToDepthOp<CUDAContext>);
#endif #endif
OPERATOR_SCHEMA(SpaceToDepth) OPERATOR_SCHEMA(SpaceToDepth).NumInputs(1).NumOutputs(1);
/* X */ OPERATOR_SCHEMA(SpaceToDepthGradient).NumInputs(1).NumOutputs(1);
.NumInputs(1) OPERATOR_SCHEMA(DepthToSpace).NumInputs(1).NumOutputs(1);
/* Y */ OPERATOR_SCHEMA(DepthToSpaceGradient).NumInputs(1).NumOutputs(1);
.NumOutputs(1);
OPERATOR_SCHEMA(SpaceToDepthGradient)
/* dY */
.NumInputs(1)
/* dX */
.NumOutputs(1);
REGISTER_GRADIENT(SpaceToDepth, SimpleGradientMaker); REGISTER_GRADIENT(SpaceToDepth, SimpleGradientMaker);
REGISTER_GRADIENT(DepthToSpace, SimpleGradientMaker);
} // namespace dragon } // namespace dragon
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
#ifndef DRAGON_OPERATORS_VISION_SPACE_TO_DEPTH_OP_H_ #ifndef DRAGON_OPERATORS_VISION_SPACE_TO_DEPTH_OP_H_
#define DRAGON_OPERATORS_VISION_SPACE_TO_DEPTH_OP_H_ #define DRAGON_OPERATORS_VISION_SPACE_TO_DEPTH_OP_H_
#include "dragon/operators/array/transpose_op.h" #include "dragon/core/operator.h"
namespace dragon { namespace dragon {
...@@ -25,21 +25,34 @@ class SpaceToDepthOp final : public Operator<Context> { ...@@ -25,21 +25,34 @@ class SpaceToDepthOp final : public Operator<Context> {
block_size_(OP_SINGLE_ARG(int, "block_size", 2)) {} block_size_(OP_SINGLE_ARG(int, "block_size", 2)) {}
USE_OPERATOR_FUNCTIONS; USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override; void RunOnDevice() override {
DispatchHelper<dtypes::Generic>::Call(this, Input(0));
}
template <typename T> template <typename T>
void DoRunWithType(); void DoRunWithType();
protected: protected:
int64_t block_size_; int64_t block_size_;
Tensor X_, *X_strides_, *Y_dims_;
}; };
template <class Context> template <class Context>
class SpaceToDepthGradientOp final : public TransposeGradientOp<Context> { class DepthToSpaceOp final : public Operator<Context> {
public: public:
SpaceToDepthGradientOp(const OperatorDef& def, Workspace* ws) DepthToSpaceOp(const OperatorDef& def, Workspace* ws)
: TransposeGradientOp<Context>(def, ws) {} : Operator<Context>(def, ws),
block_size_(OP_SINGLE_ARG(int, "block_size", 2)) {}
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override {
DispatchHelper<dtypes::Generic>::Call(this, Input(0));
}
template <typename T>
void DoRunWithType();
protected:
int64_t block_size_;
}; };
} // namespace dragon } // namespace dragon
......
...@@ -547,7 +547,7 @@ def smooth_l1_loss_args(**kwargs): ...@@ -547,7 +547,7 @@ def smooth_l1_loss_args(**kwargs):
} }
@register('Softmax') @register(['Softmax', 'LogSoftmax'])
def softmax_args(**kwargs): def softmax_args(**kwargs):
return {'axis': kwargs.get('axis', -1)} return {'axis': kwargs.get('axis', -1)}
......
...@@ -283,6 +283,17 @@ class Tensor(types.TensorBase): ...@@ -283,6 +283,17 @@ class Tensor(types.TensorBase):
""" """
def item(self):
"""Return the value as a python number.
Returns
-------
number
The value.
"""
return float(self) if 'float' in self.dtype else int(self)
def normal(self, mean=0, std=1): def normal(self, mean=0, std=1):
r"""Fill self from a normal distribution. r"""Fill self from a normal distribution.
...@@ -343,6 +354,17 @@ class Tensor(types.TensorBase): ...@@ -343,6 +354,17 @@ class Tensor(types.TensorBase):
""" """
def tolist(self):
"""Return the value as a python list.
Returns
-------
list
The value.
"""
return self.numpy().tolist()
def truncated_normal(self, mean=0, std=1): def truncated_normal(self, mean=0, std=1):
r"""Fill self from a truncated normal distribution. r"""Fill self from a truncated normal distribution.
...@@ -452,7 +474,7 @@ class Tensor(types.TensorBase): ...@@ -452,7 +474,7 @@ class Tensor(types.TensorBase):
""" """
def __float__(self): def __float__(self):
"""Return a float python scalar. """Return the value as a python number.
Returns Returns
------- -------
...@@ -591,7 +613,7 @@ class Tensor(types.TensorBase): ...@@ -591,7 +613,7 @@ class Tensor(types.TensorBase):
""" """
def __int__(self): def __int__(self):
"""Return an integer python scalar. """Return the value as a python number.
Returns Returns
------- -------
......
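Editor's note: a hedged usage sketch for the new item() and tolist() helpers, mirroring the unit test added further below; illustration only.

import dragon

print(dragon.constant([2]).item())       # 2, as a python number
print(dragon.constant([2, 3]).tolist())  # [2, 3], as a python list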
...@@ -129,7 +129,7 @@ class DataReader(multiprocessing.Process): ...@@ -129,7 +129,7 @@ class DataReader(multiprocessing.Process):
self._init_dataset() self._init_dataset()
# Persist a loop to read examples. # Persist a loop to read examples.
while True: while True:
# Pop the depleted part if necessary # Pop the depleted part if necessary.
if self._parts[0].start == self._parts[0].end: if self._parts[0].start == self._parts[0].end:
self._parts.pop(0) self._parts.pop(0)
offset = 0 offset = 0
...@@ -145,10 +145,10 @@ class DataReader(multiprocessing.Process): ...@@ -145,10 +145,10 @@ class DataReader(multiprocessing.Process):
# Load and push back a new example into the buffer. # Load and push back a new example into the buffer.
k = self._parts[-1].end % len(self._example_buffer) k = self._parts[-1].end % len(self._example_buffer)
self._example_buffer[k] = self.next_example() self._example_buffer[k] = self.next_example()
# Increase the part boundaries # Increase the part boundaries.
self._parts[-1].end += 1 self._parts[-1].end += 1
self._parts[0].start += 1 self._parts[0].start += 1
# Reset the cursor if necessary # Reset the cursor if necessary.
if self._cursor >= self._last: if self._cursor >= self._last:
self.reset() self.reset()
......
...@@ -17,8 +17,6 @@ from __future__ import print_function ...@@ -17,8 +17,6 @@ from __future__ import print_function
from dragon.core.autograph import context from dragon.core.autograph import context
from dragon.core.autograph.op_impl import OpLib from dragon.core.autograph.op_impl import OpLib
from dragon.core.autograph.op_impl import OpSchema from dragon.core.autograph.op_impl import OpSchema
from dragon.core.ops import math_ops
from dragon.core.ops import array_ops
@OpSchema.num_inputs(1) @OpSchema.num_inputs(1)
...@@ -353,7 +351,7 @@ def leaky_relu(inputs, alpha=0.2, inplace=False, **kwargs): ...@@ -353,7 +351,7 @@ def leaky_relu(inputs, alpha=0.2, inplace=False, **kwargs):
@OpSchema.num_inputs(1) @OpSchema.num_inputs(1)
def log_softmax(inputs, axis=-1, **kwargs): def log_softmax(inputs, axis=-1, inplace=False, **kwargs):
r"""Compute the composite of logarithm and softmax. r"""Compute the composite of logarithm and softmax.
The **LogSoftmax** function is defined as: The **LogSoftmax** function is defined as:
...@@ -374,6 +372,8 @@ def log_softmax(inputs, axis=-1, **kwargs): ...@@ -374,6 +372,8 @@ def log_softmax(inputs, axis=-1, **kwargs):
The input tensor. The input tensor.
axis : int, optional, default=-1 axis : int, optional, default=-1
The axis to reduce. The axis to reduce.
inplace : bool, optional, default=False
Call in-place or return a new tensor.
Returns Returns
------- -------
...@@ -381,11 +381,11 @@ def log_softmax(inputs, axis=-1, **kwargs): ...@@ -381,11 +381,11 @@ def log_softmax(inputs, axis=-1, **kwargs):
The output tensor. The output tensor.
""" """
return math_ops.sub( if context.executing_eagerly():
[inputs, math_ops.log( return OpLib.execute(
array_ops.sum(math_ops.exp(inputs, **kwargs), 'LogSoftmax', inputs,
axis=[axis], keepdims=True, **kwargs), outputs=inputs if inplace else [None], axis=axis)
**kwargs)], **kwargs) return OpLib.add('LogSoftmax', inputs, axis=axis, **kwargs)
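Editor's note: a hedged usage sketch of the reworked log_softmax; the dragon.nn module path and the constant()/numpy() helpers are assumed from elsewhere in this changeset, illustration only.

import numpy as np
import dragon

x = dragon.constant(np.array([[1.0, 2.0, 3.0]], 'float32'))
y = dragon.nn.log_softmax(x, axis=-1)   # dispatches the dedicated LogSoftmax op
print(np.exp(y.numpy()).sum(axis=-1))   # rows of exp(y) sum to 1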
@OpSchema.num_inputs(2) @OpSchema.num_inputs(2)
......
...@@ -69,7 +69,7 @@ def selu_exporter(op_def, context): ...@@ -69,7 +69,7 @@ def selu_exporter(op_def, context):
return node, const_tensors return node, const_tensors
@export_util.register('Softmax') @export_util.register(['Softmax', 'LogSoftmax'])
def softmax_exporter(op_def, context): def softmax_exporter(op_def, context):
node, const_tensors = export_util.translate(**locals()) node, const_tensors = export_util.translate(**locals())
ndim = len(context.blob_shapes[op_def.input[0]]) ndim = len(context.blob_shapes[op_def.input[0]])
...@@ -82,7 +82,7 @@ def softmax_exporter(op_def, context): ...@@ -82,7 +82,7 @@ def softmax_exporter(op_def, context):
return node, const_tensors return node, const_tensors
@export_util.register('Softmax-13') @export_util.register(['Softmax-13', 'LogSoftmax-13'])
def softmax_exporter_v13(op_def, context): def softmax_exporter_v13(op_def, context):
node, const_tensors = export_util.translate(**locals()) node, const_tensors = export_util.translate(**locals())
ndim = len(context.blob_shapes[op_def.input[0]]) ndim = len(context.blob_shapes[op_def.input[0]])
......
...@@ -228,12 +228,11 @@ DEFINE_BROADCAST_1ST_FUNC(Div, double, /); ...@@ -228,12 +228,11 @@ DEFINE_BROADCAST_1ST_FUNC(Div, double, /);
const int rows, const int cols, const T* a, const T* b, T* y) { \ const int rows, const int cols, const T* a, const T* b, T* y) { \
if (a == y) { \ if (a == y) { \
EigenArrayMap<T>(y, cols, rows).rowwise() Expr## = \ EigenArrayMap<T>(y, cols, rows).rowwise() Expr## = \
ConstEigenVectorArrayMap<T>(b, rows).transpose(); \ ConstEigenVectorArrayMap2<T>(b, rows); \
} else { \ } else { \
EigenArrayMap<T>(y, cols, rows) = \ EigenArrayMap<T>(y, cols, rows) = \
ConstEigenArrayMap<T>(a, cols, rows) \ ConstEigenArrayMap<T>(a, cols, rows) \
.rowwise() Expr ConstEigenVectorArrayMap<T>(b, rows) \ .rowwise() Expr ConstEigenVectorArrayMap2<T>(b, rows); \
.transpose(); \
} \ } \
} }
......
...@@ -89,76 +89,66 @@ __global__ void _GenericReduce( ...@@ -89,76 +89,66 @@ __global__ void _GenericReduce(
} }
} }
#define DEFINE_REDUCE_DISPATCHER(name) \ #define DEFINE_REDUCE_DISPATCHER(name) \
template <typename T, typename AccT, typename Reducer> \ template <typename T, typename AccT, typename Reducer> \
void _Reduce##name( \ void _Reduce##name( \
const int num_dims, \ const int num_dims, \
const int* dims, \ const int* dims, \
const int num_axes, \ const int num_axes, \
const int* axes, \ const int* axes, \
const Reducer reducer, \ const Reducer reducer, \
const AccT init, \ const AccT init, \
const AccT scale, \ const AccT scale, \
const T* x, \ const T* x, \
T* y, \ T* y, \
CUDAContext* ctx) { \ CUDAContext* ctx) { \
int rows, cols; \ int rows, cols; \
vec32_t out_dims(dims, dims + num_dims); \ vec32_t out_dims(dims, dims + num_dims); \
for (int i = 0; i < num_axes; ++i) { \ for (int i = 0; i < num_axes; ++i) { \
out_dims[axes[i]] = 1; \ out_dims[axes[i]] = 1; \
} \ } \
if (math::utils::IsRowwiseReduce( \ if (math::utils::IsRowwiseReduce( \
num_dims, dims, out_dims.data(), &rows, &cols)) { \ num_dims, dims, out_dims.data(), &rows, &cols)) { \
_RowwiseReduce<<< \ _RowwiseReduce<<<cols, CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
CUDA_2D_BLOCKS(cols), \ rows, cols, reducer, init, scale, x, y); \
CUDA_THREADS, \ return; \
0, \ } \
ctx->cuda_stream()>>>(rows, cols, reducer, init, scale, x, y); \ if (math::utils::IsColwiseReduce( \
return; \ num_dims, dims, out_dims.data(), &rows, &cols)) { \
} \ _ColwiseReduce<<<rows, CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
if (math::utils::IsColwiseReduce( \ rows, cols, reducer, init, scale, x, y); \
num_dims, dims, out_dims.data(), &rows, &cols)) { \ return; \
_ColwiseReduce<<< \ } \
CUDA_2D_BLOCKS(rows), \ CUDA_TENSOR_DIMS_CHECK(num_dims); \
CUDA_THREADS, \ SimpleArray<int, CUDA_TENSOR_MAX_DIMS> transpose_axes; \
0, \ SimpleArray<int, CUDA_TENSOR_MAX_DIMS> transpose_strides; \
ctx->cuda_stream()>>>(rows, cols, reducer, init, scale, x, y); \ SimpleArray<int, CUDA_TENSOR_MAX_DIMS> transpose_dims; \
return; \ math::utils::TransposeAxesForReduce( \
} \ num_dims, num_axes, axes, transpose_axes.data); \
CUDA_TENSOR_DIMS_CHECK(num_dims); \ math::utils::ComputeTransposeStrides( \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> transpose_axes; \ num_dims, dims, transpose_axes.data, transpose_strides.data); \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> transpose_strides; \ rows = cols = 1; \
SimpleArray<int, CUDA_TENSOR_MAX_DIMS> transpose_dims; \ const int pivot = num_dims - num_axes; \
math::utils::TransposeAxesForReduce( \ for (int i = 0; i < pivot; ++i) { \
num_dims, num_axes, axes, transpose_axes.data); \ rows *= dims[transpose_axes.data[i]]; \
math::utils::ComputeTransposeStrides( \ } \
num_dims, dims, transpose_axes.data, transpose_strides.data); \ for (int i = pivot; i < num_dims; ++i) { \
rows = cols = 1; \ cols *= dims[transpose_axes.data[i]]; \
const int pivot = num_dims - num_axes; \ } \
for (int i = 0; i < pivot; ++i) { \ for (int i = 0; i < num_dims; ++i) { \
rows *= dims[transpose_axes.data[i]]; \ transpose_dims.data[i] = dims[transpose_axes.data[i]]; \
} \ } \
for (int i = pivot; i < num_dims; ++i) { \ _GenericReduce<<<rows, CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
cols *= dims[transpose_axes.data[i]]; \ rows, \
} \ cols, \
for (int i = 0; i < num_dims; ++i) { \ num_dims, \
transpose_dims.data[i] = dims[transpose_axes.data[i]]; \ transpose_dims, \
} \ transpose_strides, \
_GenericReduce<<< \ reducer, \
CUDA_2D_BLOCKS(rows), \ init, \
CUDA_THREADS, \ scale, \
0, \ x, \
ctx->cuda_stream()>>>( \ y); \
rows, \
cols, \
num_dims, \
transpose_dims, \
transpose_strides, \
reducer, \
init, \
scale, \
x, \
y); \
} }
DEFINE_REDUCE_DISPATCHER(Max); DEFINE_REDUCE_DISPATCHER(Max);
......
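Editor's note: a hedged NumPy sketch of the shape analysis the dispatcher above performs: when the reduced axes form a leading or trailing contiguous block, the tensor can be viewed as a (rows, cols) matrix and reduced along one matrix dimension; illustration only.

import numpy as np

x = np.random.randn(2, 3, 4, 5).astype('float32')

# Reduce the leading axes (0, 1): one pass over a (6, 20) view, reducing its rows.
lead = x.reshape(6, 20).sum(axis=0).reshape(4, 5)
print(np.allclose(lead, x.sum(axis=(0, 1))))  # True

# Reduce the trailing axes (2, 3): same view, reducing its columns instead.
trail = x.reshape(6, 20).sum(axis=1).reshape(2, 3)
print(np.allclose(trail, x.sum(axis=(2, 3))))  # True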
...@@ -229,6 +229,25 @@ void SoftmaxGrad( ...@@ -229,6 +229,25 @@ void SoftmaxGrad(
Context* ctx); Context* ctx);
template <typename T, class Context> template <typename T, class Context>
void LogSoftmax(
const int N,
const int S,
const int C,
const T* x,
T* y,
Context* ctx);
template <typename T, class Context>
void LogSoftmaxGrad(
const int N,
const int S,
const int C,
const T* dy,
const T* y,
T* dx,
Context* ctx);
template <typename T, class Context>
void Tanh(const int N, const T* x, T* y, Context* ctx); void Tanh(const int N, const T* x, T* y, Context* ctx);
template <typename T, class Context> template <typename T, class Context>
...@@ -586,15 +605,6 @@ void Transpose( ...@@ -586,15 +605,6 @@ void Transpose(
Context* ctx); Context* ctx);
template <typename T, class Context> template <typename T, class Context>
void TransposeGrad(
const int num_dims,
const int64_t* x_strides,
const int64_t* y_dims,
const T* dy,
T* dx,
Context* ctx);
template <typename T, class Context>
void TopK( void TopK(
const int N, const int N,
const int S, const int S,
...@@ -978,6 +988,19 @@ void L2NormalizeGrad( ...@@ -978,6 +988,19 @@ void L2NormalizeGrad(
T* dx, T* dx,
Context* ctx); Context* ctx);
template <typename T, typename AccT, class Context>
void LayerNorm(
const int N,
const int C,
const float epsilon,
const T* x,
const AccT* gamma,
const AccT* beta,
AccT* mu,
AccT* rsig,
T* y,
Context* ctx);
/* /*
* RecurrentOp Kernels * RecurrentOp Kernels
*/ */
......
...@@ -10,8 +10,8 @@ ...@@ -10,8 +10,8 @@
* ------------------------------------------------------------ * ------------------------------------------------------------
*/ */
#ifndef DRAGON_UTILS_STRING_H_ #ifndef DRAGON_UTILS_STRING_UTILS_H_
#define DRAGON_UTILS_STRING_H_ #define DRAGON_UTILS_STRING_UTILS_H_
#include <algorithm> #include <algorithm>
#include <cstdlib> #include <cstdlib>
...@@ -100,4 +100,4 @@ inline std::string replace_all( ...@@ -100,4 +100,4 @@ inline std::string replace_all(
} // namespace dragon } // namespace dragon
#endif // DRAGON_UTILS_STRING_H_ #endif // DRAGON_UTILS_STRING_UTILS_H_
...@@ -115,6 +115,8 @@ class TestTensor(unittest.TestCase): ...@@ -115,6 +115,8 @@ class TestTensor(unittest.TestCase):
self.assertEqual(a.__repr__(), b.__repr__()) self.assertEqual(a.__repr__(), b.__repr__())
self.assertNotEqual(a.__repr__(), dragon.Tensor((), symbolic=True).__repr__()) self.assertNotEqual(a.__repr__(), dragon.Tensor((), symbolic=True).__repr__())
self.assertEqual(float(int(a)), float(b)) self.assertEqual(float(int(a)), float(b))
self.assertEqual(dragon.constant([2]).item(), 2)
self.assertEqual(dragon.constant([2, 3]).tolist(), [2, 3])
try: try:
_ = dragon.Tensor(None) _ = dragon.Tensor(None)
except ValueError: except ValueError:
......
...@@ -76,7 +76,7 @@ class OpTestCase(unittest.TestCase): ...@@ -76,7 +76,7 @@ class OpTestCase(unittest.TestCase):
second = inputs[num_first:len(inputs)] if num_second > 1 else inputs[num_first] second = inputs[num_first:len(inputs)] if num_second > 1 else inputs[num_first]
if isinstance(first, np.ndarray) and isinstance(second, np.ndarray): if isinstance(first, np.ndarray) and isinstance(second, np.ndarray):
super(OpTestCase, self).assertEqual(first.shape, second.shape) super(OpTestCase, self).assertEqual(first.shape, second.shape)
if first.dtype == np.bool and second.dtype == np.bool: if first.dtype == bool and second.dtype == bool:
diff = first ^ second diff = first ^ second
num_unique = len(np.unique(diff)) num_unique = len(np.unique(diff))
self.assertLessEqual(num_unique, 1, msg) self.assertLessEqual(num_unique, 1, msg)
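The OpTestCase change above (repeated in the other test packages below) swaps the deprecated `np.bool` alias for the builtin `bool`; the alias was deprecated in NumPy 1.20 and removed in 1.24, while comparing a dtype against the builtin remains valid:

```python
import numpy as np

a = np.array([True, False])
assert a.dtype == bool  # equivalent to the old np.bool comparison
```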
......
...@@ -61,7 +61,7 @@ class OpTestCase(unittest.TestCase): ...@@ -61,7 +61,7 @@ class OpTestCase(unittest.TestCase):
second = inputs[num_first:len(inputs)] if num_second > 1 else inputs[num_first] second = inputs[num_first:len(inputs)] if num_second > 1 else inputs[num_first]
if isinstance(first, np.ndarray) and isinstance(second, np.ndarray): if isinstance(first, np.ndarray) and isinstance(second, np.ndarray):
super(OpTestCase, self).assertEqual(first.shape, second.shape) super(OpTestCase, self).assertEqual(first.shape, second.shape)
if first.dtype == np.bool and second.dtype == np.bool: if first.dtype == bool and second.dtype == bool:
diff = first ^ second diff = first ^ second
num_unique = len(np.unique(diff)) num_unique = len(np.unique(diff))
self.assertLessEqual(num_unique, 1, msg) self.assertLessEqual(num_unique, 1, msg)
......
...@@ -59,7 +59,7 @@ class OpTestCase(unittest.TestCase): ...@@ -59,7 +59,7 @@ class OpTestCase(unittest.TestCase):
second = inputs[num_first:len(inputs)] if num_second > 1 else inputs[num_first] second = inputs[num_first:len(inputs)] if num_second > 1 else inputs[num_first]
if isinstance(first, np.ndarray) and isinstance(second, np.ndarray): if isinstance(first, np.ndarray) and isinstance(second, np.ndarray):
super(OpTestCase, self).assertEqual(first.shape, second.shape) super(OpTestCase, self).assertEqual(first.shape, second.shape)
if first.dtype == np.bool and second.dtype == np.bool: if first.dtype == bool and second.dtype == bool:
diff = first ^ second diff = first ^ second
num_unique = len(np.unique(diff)) num_unique = len(np.unique(diff))
self.assertLessEqual(num_unique, 1, msg) self.assertLessEqual(num_unique, 1, msg)
...@@ -243,6 +243,8 @@ class TestTensorOps(OpTestCase): ...@@ -243,6 +243,8 @@ class TestTensorOps(OpTestCase):
data = np.array([0., 1., 2.], 'float32') data = np.array([0., 1., 2.], 'float32')
x = new_tensor(data) x = new_tensor(data)
self.assertEqual(x.exp(), np.exp(data)) self.assertEqual(x.exp(), np.exp(data))
x.exp_()
self.assertEqual(x, np.exp(data))
def test_expand(self): def test_expand(self):
entries = [(2, 2, 3, 1), entries = [(2, 2, 3, 1),
...@@ -403,6 +405,8 @@ class TestTensorOps(OpTestCase): ...@@ -403,6 +405,8 @@ class TestTensorOps(OpTestCase):
data = np.array([1., 2., 3.], 'float32') data = np.array([1., 2., 3.], 'float32')
x = new_tensor(data) x = new_tensor(data)
self.assertEqual(x.log(), np.log(data)) self.assertEqual(x.log(), np.log(data))
x.log_()
self.assertEqual(x, np.log(data))
def test_logical_and(self): def test_logical_and(self):
for a_shape, b_shape in self.binary_test_shapes: for a_shape, b_shape in self.binary_test_shapes:
......
...@@ -52,6 +52,8 @@ class TestTensor(unittest.TestCase): ...@@ -52,6 +52,8 @@ class TestTensor(unittest.TestCase):
self.assertEqual(int(a.detach()), 0) self.assertEqual(int(a.detach()), 0)
self.assertEqual(torch.Tensor([0]).dim(), 1) self.assertEqual(torch.Tensor([0]).dim(), 1)
self.assertEqual(float(torch.Tensor(1).one_()), 1.) self.assertEqual(float(torch.Tensor(1).one_()), 1.)
self.assertEqual(torch.tensor(2.333).item(), 2.333)
self.assertEqual(torch.tensor([2, 3]).tolist(), [2, 3])
self.assertEqual(torch.empty(2, 3).ndimension(), 2) self.assertEqual(torch.empty(2, 3).ndimension(), 2)
self.assertEqual(torch.empty(3).new_empty(2, 3).ndimension(), 2) self.assertEqual(torch.empty(3).new_empty(2, 3).ndimension(), 2)
self.assertEqual(repr(torch.tensor(1)), '1') self.assertEqual(repr(torch.tensor(1)), '1')
......
...@@ -1303,7 +1303,7 @@ def local_response_norm(input, size, alpha=1e-4, beta=0.75, k=1.): ...@@ -1303,7 +1303,7 @@ def local_response_norm(input, size, alpha=1e-4, beta=0.75, k=1.):
size=size, alpha=float(alpha), beta=float(beta), bias=float(k)) size=size, alpha=float(alpha), beta=float(beta), bias=float(k))
def log_softmax(input, dim): def log_softmax(input, dim, inplace=False):
r"""Apply the composite of logarithm and softmax to input. r"""Apply the composite of logarithm and softmax to input.
The **LogSoftmax** function is defined as: The **LogSoftmax** function is defined as:
...@@ -1316,6 +1316,8 @@ def log_softmax(input, dim): ...@@ -1316,6 +1316,8 @@ def log_softmax(input, dim):
The input. The input.
dim : int dim : int
The dimension to reduce. The dimension to reduce.
inplace : bool, optional, default=False
Whether to do the operation in-place.
Returns Returns
------- -------
...@@ -1327,7 +1329,9 @@ def log_softmax(input, dim): ...@@ -1327,7 +1329,9 @@ def log_softmax(input, dim):
`torch.nn.LogSoftmax(...)`_ `torch.nn.LogSoftmax(...)`_
""" """
return input - input.logsumexp(dim, keepdim=True) return FunctionLib.apply(
'LogSoftmax', input.device, [input],
outputs=[input if inplace else None], axis=dim)
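A quick usage sketch of the reworked functional, assuming the usual `dragon.vm.torch` import path; with `inplace=True` the storage of `input` is reused for the output instead of the old `input - input.logsumexp(...)` composite.

```python
from dragon.vm import torch
from dragon.vm.torch.nn import functional as F

x = torch.tensor([[1., 2., 3.], [4., 5., 6.]])
y = F.log_softmax(x, dim=1)                # out-of-place, allocates a new tensor
_ = F.log_softmax(x, dim=1, inplace=True)  # reuses the buffer of ``x``
```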
def lstm_cell(input, cx): def lstm_cell(input, cx):
......
...@@ -142,18 +142,17 @@ class GumbelSoftmax(Module): ...@@ -142,18 +142,17 @@ class GumbelSoftmax(Module):
self.tau = tau self.tau = tau
self.dim = dim self.dim = dim
self.inplace = inplace self.inplace = inplace
if dim is None:
raise ValueError('Excepted a valid dim, got None.')
def forward(self, logits=None, probs=None): def extra_repr(self):
if probs is not None: inplace_str = ', inplace' if self.inplace else ''
input = probs.log() return 'dim={}{}'.format(self.dim, inplace_str)
else:
input = logits - logits.logsumexp(dim=self.dim, keepdim=True) def forward(self, input):
u_dist = init_ops.rand(input.shape, dtype=input.dtype, device=input.device) u_dist = init_ops.rand(input.shape, dtype=input.dtype,
gumbels = -((-(u_dist.log())).log()) device=input.device)
scores = (input + gumbels) / self.tau gumbel = -((-(u_dist.log())).log())
return F.softmax(scores, self.dim, self.inplace) gumbel = (input + gumbel) / self.tau
return F.softmax(gumbel, self.dim, self.inplace)
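The reworked `forward` implements the standard Gumbel-softmax reparameterization and now takes a single `input` (expected to be logits or log-probabilities) instead of the old logits/probs pair. A NumPy sketch of the same sampling step:

```python
import numpy as np

def gumbel_softmax(logits, tau=1.0, axis=-1):
    # g = -log(-log(u)), u ~ Uniform(0, 1): one Gumbel(0, 1) sample per logit.
    u = np.clip(np.random.uniform(size=logits.shape), 1e-20, 1.0)
    gumbel = -np.log(-np.log(u))
    scores = (logits + gumbel) / tau
    scores = scores - scores.max(axis=axis, keepdims=True)
    e = np.exp(scores)
    return e / e.sum(axis=axis, keepdims=True)
```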
class Hardsigmoid(Module): class Hardsigmoid(Module):
...@@ -307,23 +306,27 @@ class LogSoftmax(Module): ...@@ -307,23 +306,27 @@ class LogSoftmax(Module):
""" """
def __init__(self, dim): def __init__(self, dim, inplace=False):
"""Create a ``LogSoftmax`` module. """Create a ``LogSoftmax`` module.
Parameters Parameters
---------- ----------
dim : int dim : int
The dimension to reduce. The dimension to reduce.
inplace : bool, optional, default=False
Whether to do the operation in-place.
""" """
super(LogSoftmax, self).__init__() super(LogSoftmax, self).__init__()
self.dim = dim self.dim = dim
self.inplace = inplace
def extra_repr(self): def extra_repr(self):
return 'dim={dim}'.format(dim=self.dim) inplace_str = ', inplace' if self.inplace else ''
return 'dim={}{}'.format(self.dim, inplace_str)
def forward(self, input): def forward(self, input):
return F.log_softmax(input, self.dim) return F.log_softmax(input, self.dim, self.inplace)
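Usage of the module now mirrors the functional, assuming `LogSoftmax` is exported from `dragon.vm.torch.nn`:

```python
from dragon.vm import torch
from dragon.vm.torch import nn

m = nn.LogSoftmax(dim=1, inplace=True)  # extra_repr: 'dim=1, inplace'
y = m(torch.tensor([[1., 2., 3.]]))
```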
class MultiheadAttention(Module): class MultiheadAttention(Module):
......
...@@ -788,6 +788,24 @@ def exp(self): ...@@ -788,6 +788,24 @@ def exp(self):
return math_ops.exp(self) return math_ops.exp(self)
def exp_(self):
r"""Set to the exponential of elements.
.. math:: \text{self} = \exp(\text{self})
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.exp(...)`_
"""
return math_ops.exp(self, self)
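A short usage sketch of the new in-place variant, assuming the `dragon.vm.torch` front end:

```python
from dragon.vm import torch

x = torch.tensor([0., 1., 2.])
y = x.exp()  # out-of-place, leaves x untouched
x.exp_()     # in-place: x now holds exp([0, 1, 2])
```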
def expand(self, *sizes): def expand(self, *sizes):
"""Return a tensor with elements broadcast. """Return a tensor with elements broadcast.
...@@ -1234,6 +1252,24 @@ def log(self): ...@@ -1234,6 +1252,24 @@ def log(self):
return math_ops.log(self) return math_ops.log(self)
def log_(self):
r"""Set to the natural logarithm of elements.
.. math:: \text{self} = \log(\text{self})
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.log(...)`_
"""
return math_ops.log(self, self)
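And the matching in-place logarithm, again assuming the `dragon.vm.torch` front end:

```python
from dragon.vm import torch

x = torch.tensor([1., 2., 3.])
x.log_()  # in-place natural logarithm
x.exp_()  # round-trips back to (approximately) the original values
```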
def logical_and(self, other): def logical_and(self, other):
r"""Compute the element-wise AND logical operation. r"""Compute the element-wise AND logical operation.
...@@ -2916,6 +2952,7 @@ Tensor.double = double ...@@ -2916,6 +2952,7 @@ Tensor.double = double
Tensor.double_ = double_ Tensor.double_ = double_
Tensor.eq = eq Tensor.eq = eq
Tensor.exp = exp Tensor.exp = exp
Tensor.exp_ = exp_
Tensor.expand = expand Tensor.expand = expand
Tensor.fill_ = fill_ Tensor.fill_ = fill_
Tensor.flatten = flatten Tensor.flatten = flatten
...@@ -2941,6 +2978,7 @@ Tensor.le = le ...@@ -2941,6 +2978,7 @@ Tensor.le = le
Tensor.long = long Tensor.long = long
Tensor.long_ = long_ Tensor.long_ = long_
Tensor.log = log Tensor.log = log
Tensor.log_ = log_
Tensor.logical_and = logical_and Tensor.logical_and = logical_and
Tensor.logical_not = logical_not Tensor.logical_not = logical_not
Tensor.logical_or = logical_or Tensor.logical_or = logical_or
......
...@@ -972,6 +972,22 @@ class Tensor(object): ...@@ -972,6 +972,22 @@ class Tensor(object):
""" """
def exp_(self):
r"""Set to the exponential of elements.
.. math:: \text{self} = \exp(\text{self})
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.exp(...)`_
"""
def expand(self, *sizes): def expand(self, *sizes):
"""Return a tensor with elements broadcast. """Return a tensor with elements broadcast.
...@@ -1326,6 +1342,17 @@ class Tensor(object): ...@@ -1326,6 +1342,17 @@ class Tensor(object):
""" """
return 'float' in self.dtype return 'float' in self.dtype
def item(self):
"""Return the value as a python number.
Returns
-------
number
The value.
"""
return float(self) if self.is_floating_point() else int(self)
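As the body shows, `item()` dispatches on the dtype: floating-point tensors come back as `float`, everything else as `int`. A hedged example:

```python
from dragon.vm import torch

torch.tensor(2.5).item()  # 2.5 (float, since the tensor is floating point)
torch.tensor([3]).item()  # 3 (int)
```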
def le(self, other): def le(self, other):
r"""Compute the element-wise less-equal comparison. r"""Compute the element-wise less-equal comparison.
...@@ -1363,6 +1390,22 @@ class Tensor(object): ...@@ -1363,6 +1390,22 @@ class Tensor(object):
""" """
def log_(self):
r"""Set to the natural logarithm of elements.
.. math:: \text{self} = \log(\text{self})
Returns
-------
dragon.vm.torch.Tensor
The output tensor.
See Also
--------
`torch.log(...)`_
"""
def logical_and(self, other): def logical_and(self, other):
r"""Compute the element-wise AND logical operation. r"""Compute the element-wise AND logical operation.
...@@ -2676,6 +2719,17 @@ class Tensor(object): ...@@ -2676,6 +2719,17 @@ class Tensor(object):
return self.type(dtype) return self.type(dtype)
return self return self
def tolist(self):
"""Return the value as a python list.
Returns
-------
list
The value.
"""
return self.numpy().tolist()
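`tolist()` simply defers to NumPy, so nested lists mirror the tensor shape and 0-d tensors come back as plain numbers:

```python
from dragon.vm import torch

torch.tensor([[1, 2], [3, 4]]).tolist()  # [[1, 2], [3, 4]]
torch.tensor(5.).tolist()                # 5.0
```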
def topk(self, k, dim=-1, largest=True, sorted=True): def topk(self, k, dim=-1, largest=True, sorted=True):
"""Return the top-K largest or smallest elements. """Return the top-K largest or smallest elements.
...@@ -3089,7 +3143,7 @@ class Tensor(object): ...@@ -3089,7 +3143,7 @@ class Tensor(object):
return self.eq(other) return self.eq(other)
def __float__(self): def __float__(self):
"""Return a float python scalar. """Return the value as a python number.
Returns Returns
------- -------
...@@ -3194,7 +3248,7 @@ class Tensor(object): ...@@ -3194,7 +3248,7 @@ class Tensor(object):
return self.mul_(other) return self.mul_(other)
def __int__(self): def __int__(self):
"""Return an integer python scalar. """Return the value as a python number.
Returns Returns
------- -------
...@@ -3202,7 +3256,7 @@ class Tensor(object): ...@@ -3202,7 +3256,7 @@ class Tensor(object):
The integer value. The integer value.
""" """
return int(self.__float__()) return int(self.numpy())
def __invert__(self): def __invert__(self):
"""Compute the element-wise NOT bitwise operation. """Compute the element-wise NOT bitwise operation.
......