Commit 654febe3 by Ting PAN

Implement softmax kernels via warp reduce

Summary:
This commit adds extra CUDA softmax kernels that use warp reduce.
Warp reduce gives better performance when the softmax dimension is <= 256,
a regime typical of recent vision transformers.
1 parent 3a97dc22
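For orientation, both the new warp-reduce kernels and the existing block-reduce kernels compute the same numerically stable softmax: reduce the per-row maximum, exponentiate the shifted values, then divide by the row sum (or subtract its log for log-softmax). A minimal NumPy sketch of that reference computation, illustrative only and not part of the commit:

```python
import numpy as np

def softmax_reference(x, axis=1):
    """Numerically stable softmax, matching what the CUDA kernels compute."""
    x_max = x.max(axis=axis, keepdims=True)      # warp/block max reduce
    e = np.exp(x - x_max)                        # shifted exponentials
    return e / e.sum(axis=axis, keepdims=True)   # warp/block sum reduce

def log_softmax_reference(x, axis=1):
    """Numerically stable log-softmax."""
    x_max = x.max(axis=axis, keepdims=True)
    shifted = x - x_max
    return shifted - np.log(np.exp(shifted).sum(axis=axis, keepdims=True))

# Example: N=2 rows, C=8 classes, S=1 inner size, as in the (NxS, S, C) kernels.
x = np.random.randn(2, 8, 1).astype(np.float32)
y = softmax_reference(x, axis=1)
assert np.allclose(y.sum(axis=1), 1.0, atol=1e-6)
```

The warp-reduce path below performs exactly these two reductions (max, then sum) with `__shfl_xor_sync` inside a 32-thread warp, instead of the shared-memory block reductions used for larger dimensions.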
......@@ -34,6 +34,7 @@ from dragon.vm.dali.core.ops.image_ops import RandomResizedCrop
from dragon.vm.dali.core.ops.image_ops import Resize
from dragon.vm.dali.core.ops.image_ops import Rotate
from dragon.vm.dali.core.ops.image_ops import WarpAffine
from dragon.vm.dali.core.ops.math_ops import Normalize
from dragon.vm.dali.core.ops.random_ops import CoinFlip
from dragon.vm.dali.core.ops.random_ops import Uniform
from dragon.vm.dali.core.ops.reader_ops import KPLRecordReader
......
......@@ -118,13 +118,13 @@ class CropMirrorNormalize(object):
# (H, W) for 2d input
# (D, H, W) for 3d input
crop=(224, 224),
# Historical values to normalize input
mean=(102., 115., 122.),
std=(1., 1., 1.),
# Historical BGR values to normalize input
mean=(103.53, 116.28, 123.675),
std=(57.375, 57.12, 58.395),
# Or ``float16`` for fp16 training
dtype='float32',
# Or ``NHWC``
output_layout='NCHW'
# Or ``HWC``
output_layout='CHW',
)
y = cmn(inputs['x'], mirror=flip_rng())
```
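The new defaults replace the legacy per-channel values with the standard ImageNet statistics, scaled to the 0-255 pixel range and listed in BGR order. A quick check (an illustrative snippet, not part of the commit):

```python
# Commonly published ImageNet RGB statistics in [0, 1].
rgb_mean = (0.485, 0.456, 0.406)
rgb_std = (0.229, 0.224, 0.225)

# Scale to [0, 255] and reverse the channel order to BGR.
bgr_mean = tuple(round(v * 255, 3) for v in reversed(rgb_mean))
bgr_std = tuple(round(v * 255, 3) for v in reversed(rgb_std))

print(bgr_mean)  # (103.53, 116.28, 123.675)
print(bgr_std)   # (57.375, 57.12, 58.395)
```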
......@@ -138,7 +138,7 @@ class CropMirrorNormalize(object):
mean=0.,
std=1.,
dtype='float32',
output_layout='NCHW',
output_layout='CHW',
**kwargs
):
"""Create a ``CropMirrorNormalize`` operator.
......@@ -153,10 +153,10 @@ class CropMirrorNormalize(object):
The values to subtract.
std : Union[float, Sequence[float]], optional
The values to divide after subtraction.
dtype : {'float16', 'float32'}, optional
The data type of output.
output_layout : {'NCHW', 'NHWC'}, optional
The data format of output.
dtype : str, optional, default='float32'
The output data type.
output_layout : str, optional
The output data layout.
Returns
-------
......@@ -167,7 +167,7 @@ class CropMirrorNormalize(object):
if isinstance(dtype, six.string_types):
dtype = getattr(types, dtype.upper())
if isinstance(output_layout, six.string_types):
output_layout = getattr(types, output_layout.upper())
output_layout = output_layout.upper()
return ops.CropMirrorNormalize(
crop=crop,
mirror=mirror,
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
"""Math ops."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
try:
from nvidia.dali import ops
except ImportError:
from dragon.core.util import deprecation
ops = deprecation.not_installed('nvidia.dali')
from dragon.core.util import six
from dragon.vm.dali.core.framework import context
from dragon.vm.dali.core.framework import types
class Normalize(object):
"""Normalize input.
Examples:
```python
norm = dali.ops.Normalize(
# Batch-normalization style statistics for HWC input
axes=(0, 1), batch=True, epsilon=1e-5,
)
y = norm(inputs['x'])
```
"""
def __new__(
cls,
axes=(0, 1),
mean=None,
stddev=None,
scale=1.0,
shift=0.0,
batch=False,
epsilon=0,
dtype='float32',
**kwargs
):
"""Create a ``Normalize`` operator.
Parameters
----------
axes : Sequence[int], optional
The axes to normalize.
mean : float, optional
The value to subtract.
stddev : float, optional
The value to divide after subtraction.
scale : float, optional, default=1.0
The scale factor after normalization.
shift : float, optional, default=0.0
The shift factor after normalization.
batch : bool, optional, default=False
Whether to compute mean and stddev across the batch.
epsilon : float, optional, default=0
The value added to the computed variance.
dtype : str, optional, default='float32'
The output data type.
Returns
-------
nvidia.dali.ops.Normalize
The operator.
"""
if isinstance(dtype, six.string_types):
dtype = getattr(types, dtype.upper())
return ops.Normalize(
axes=axes,
mean=mean,
stddev=stddev,
scale=scale,
shift=shift,
batch=batch,
epsilon=epsilon,
dtype=dtype,
device=context.get_device_type(),
**kwargs
)
......@@ -17,7 +17,7 @@ Public Functions
CopyFrom
########
.. doxygenfunction:: dragon::Tensor::CopyFrom(const Tensor &other, Context *ctx)
.. doxygenfunction:: dragon::Tensor::CopyFrom(Tensor &other, Context *ctx)
CopyFrom
########
......@@ -39,6 +39,10 @@ IsType
######
.. doxygenfunction:: dragon::Tensor::IsType
MapFrom
#######
.. doxygenfunction:: dragon::Tensor::MapFrom
Reset
#####
.. doxygenfunction:: dragon::Tensor::Reset
......@@ -51,10 +55,6 @@ ReshapeLike
###########
.. doxygenfunction:: dragon::Tensor::ReshapeLike
Share
#####
.. doxygenfunction:: dragon::Tensor::Share
axis
####
.. doxygenfunction:: dragon::Tensor::axis
......@@ -135,10 +135,6 @@ raw_mutable_data
################
.. doxygenfunction:: dragon::Tensor::raw_mutable_data()
raw_mutable_data
################
.. doxygenfunction:: dragon::Tensor::raw_mutable_data(const TypeMeta &meta)
size
####
.. doxygenfunction:: dragon::Tensor::size
......
......@@ -45,6 +45,9 @@ vm.dali.ops
`class ImageDecoderRandomCrop <ops/ImageDecoderRandomCrop.html>`_
: Decode image and return a random crop.
`class Normalize <ops/Normalize.html>`_
: Normalize input.
`class Pad <ops/Pad.html>`_
: Pad input to have the same dimensions.
......@@ -97,6 +100,7 @@ vm.dali.ops
ops/Hsv
ops/ImageDecoder
ops/ImageDecoderRandomCrop
ops/Normalize
ops/Pad
ops/Paste
ops/RandomBBoxCrop
......
Normalize
=========
.. autoclass:: dragon.vm.dali.ops.Normalize
__new__
--------
.. automethod:: dragon.vm.dali.ops.Normalize.__new__
.. raw:: html
<style>
h1:before {
content: "dali.ops.";
color: #103d3e;
}
</style>
......@@ -143,6 +143,9 @@ class CUDAObjects {
#ifdef USE_CUDNN
/*! \brief The cached cuDNN handles of each device */
vector<cudnnHandle_t> cudnn_handles_[CUDA_MAX_DEVICES];
/*! \brief The disabled cuDNN operators */
Set<string> cudnn_disabled_ops_;
#endif
#ifdef USE_NCCL
......
......@@ -75,11 +75,6 @@ class DRAGON_API UnifiedMemory {
return size_t(0);
}
/*! \brief Return the number of batch chunks */
size_t num_chunks() const {
return num_chunks_;
}
/*! \brief Return the storage order */
StorageOrder order() const {
return order_;
......@@ -105,11 +100,6 @@ class DRAGON_API UnifiedMemory {
/*! \brief Return the mutable cuda data */
void* mutable_cuda_data(size_t size = 0);
/*! \brief Set the number of data chunks */
void set_num_chunks(size_t num_chunks) {
num_chunks_ = num_chunks;
}
/*! \brief Set the storage order */
void set_order(StorageOrder order) {
order_ = order;
......@@ -125,8 +115,8 @@ class DRAGON_API UnifiedMemory {
/*! \brief The data state */
State state_ = UNINITIALIZED;
/*! \brief The size and number of chunks */
size_t size_ = 0, num_chunks_ = 1;
/*! \brief The data size */
size_t size_ = 0;
/*! \brief The type meta */
TypeMeta meta_;
......@@ -140,12 +130,12 @@ class DRAGON_API UnifiedMemory {
/*! \brief The cpu data pointer */
void* cpu_ptr_ = nullptr;
/*! \brief The ownership of cpu data pointer */
bool own_cpu_ptr_ = true;
/*! \brief The cuda data pointer */
void* cuda_ptr_ = nullptr;
/*! \brief The ownership of cpu data pointer */
bool own_cpu_ptr_ = true;
/*! \brief The ownership of cuda data pointer */
bool own_cuda_ptr_ = true;
......
......@@ -60,15 +60,13 @@ Tensor* OperatorBase::Output(int i, const vec32_t& inputs) {
auto* Y = Output(i);
if (i < output_aliases_.size()) {
for (auto j : inputs) {
const auto& X = Input(j);
auto& X = Input(j);
if (output_aliases_[i].count(X.name())) {
Output(i)->ReshapeLike(X)->Share(X.memory());
return Y;
return Y->ReshapeLike(X)->MapFrom(&X);
}
}
}
Y->Share(nullptr);
return Y;
return Y->MapFrom(nullptr);
}
Tensor* OperatorBase::Buffer(const string& name) {
......@@ -85,7 +83,8 @@ OperatorBase* OperatorBase::New(const OperatorDef& def, Workspace* ws) {
case PROTO_CUDA:
#ifdef USE_CUDNN
if (CUDNNOperatorRegistry()->Has(op_type) &&
CUDAContext::objects().cudnn_enabled_) {
CUDAContext::objects().cudnn_enabled_ &&
!CUDAContext::objects().cudnn_disabled_ops_.count(op_type)) {
return CUDNNOperatorRegistry()->Create(op_type, def, ws);
}
#endif
......
......@@ -79,7 +79,7 @@ class DRAGON_API Tensor {
}
}
/*! \brief Change the tensor dimensions */
/*! \brief Change the dimensions */
Tensor* Reshape(const vec64_t& dims) {
dims_ = dims;
strides_.resize(dims.size());
......@@ -91,31 +91,23 @@ class DRAGON_API Tensor {
CHECK_GE(d, 0);
if (d > 0) new_size *= d;
}
if (capacity_ < new_size * meta_.itemsize()) {
if (own_memory_ptr_) {
memory_.reset();
} else {
mapped_memory_ = nullptr;
own_memory_ptr_ = true;
}
capacity_ = 0;
}
size_ = new_size;
return this;
}
/*! \brief Change the tensor dimensions as the other */
/*! \brief Change the dimensions as the other */
Tensor* ReshapeLike(const Tensor& other) {
return Reshape(other.dims_);
}
/*! \brief Copy memory from a tensor with context */
/*! \brief Copy memory from a tensor */
template <class Context>
Tensor* CopyFrom(const Tensor& other, Context* ctx) {
Tensor* CopyFrom(Tensor& other, Context* ctx) {
if ((void*)&other == (void*)this) return this;
CHECK_EQ(size_, other.size_);
meta_ = other.meta_;
auto* src = other.template raw_data<Context>();
auto* dst = raw_mutable_data<Context>(other.meta_);
auto* dst = raw_mutable_data<Context>();
if (dst == src) return this;
ctx->template MemcpyAsync<Context, Context>(nbytes(), dst, src);
return this;
......@@ -144,20 +136,22 @@ class DRAGON_API Tensor {
}
}
/*! \brief Share an external memory */
void Share(UnifiedMemory* memory) {
if (memory != nullptr) {
CHECK_LE(size_, memory->size())
<< "\nShare an external memory with smaller capacity.";
memory_.reset();
capacity_ = memory->size();
/*! \brief Map memory from a tensor */
Tensor* MapFrom(Tensor* other) {
if (other == nullptr) {
mapped_memory_ = nullptr;
capacity_ = (memory_ != nullptr ? memory_->size() : 0);
} else {
if (memory_) {
capacity_ = memory_->size();
auto* new_memory = other->memory();
if (new_memory != nullptr) {
CHECK_LE(size_, new_memory->size())
<< "\nMap from a memory with smaller capacity.";
memory_.reset();
mapped_memory_ = new_memory;
capacity_ = new_memory->size();
}
}
mapped_memory_ = memory;
own_memory_ptr_ = (memory == nullptr);
return this;
}
/*! \brief Reset tensor to release all resources */
......@@ -167,7 +161,6 @@ class DRAGON_API Tensor {
memory_.reset();
meta_ = TypeMeta();
size_ = capacity_ = 0;
own_memory_ptr_ = true;
mapped_memory_ = nullptr;
if (ExternalDeleter != nullptr) {
ExternalDeleter();
......@@ -216,17 +209,17 @@ class DRAGON_API Tensor {
return version_;
}
/*! \brief Return the total number of elements */
/*! \brief Return the number of elements */
size_t size() const {
return size_;
}
/*! \brief Return the memory capacity */
/*! \brief Return the byte length of memory */
size_t capacity() const {
return capacity_;
}
/*! \brief Return the total number of data bytes */
/*! \brief Return the byte length of all elements */
size_t nbytes() const {
return size_ * meta_.itemsize();
}
......@@ -260,22 +253,22 @@ class DRAGON_API Tensor {
return strides_[axis(i)];
}
/*! \brief Return the tensor dimensions */
/*! \brief Return the dimensions */
const vec64_t& dims() const {
return dims_;
}
/*! \brief Return the tensor strides */
/*! \brief Return the strides */
const vec64_t& strides() const {
return strides_;
}
/*! \brief Return the total number of elements */
/*! \brief Return the number of elements counting along all axes */
int64_t count() const {
return (int64_t)size_;
}
/*! \brief Return the number of elements counting along the given axes */
/*! \brief Return the number of elements counting along given axes */
int64_t count(int64_t start, int64_t end) const {
int64_t num = 1;
for (int64_t i = start; i < end; i++) {
......@@ -284,12 +277,12 @@ class DRAGON_API Tensor {
return num;
}
/*! \brief Return the number of elements counting from the given axis */
/*! \brief Return the number of elements counting from given axis */
int64_t count(int64_t start) const {
return count(start, ndim());
}
/*! \brief Return whether the total number of elements is zero */
/*! \brief Return whether the number of elements is zero */
bool empty() const {
return size_ == 0;
}
......@@ -299,22 +292,30 @@ class DRAGON_API Tensor {
return memory_ != nullptr || mapped_memory_ != nullptr;
}
/*! \brief Return the memory pointer */
UnifiedMemory* memory(bool required = false, bool owned = false) const {
auto* ptr = own_memory_ptr_ || owned ? memory_.get() : mapped_memory_;
/*! \brief Return the memory */
UnifiedMemory* memory(bool required = false, bool owned = false) {
if (capacity_ < size_ * meta_.itemsize()) {
if (mapped_memory_ != nullptr) {
mapped_memory_ = nullptr;
} else {
memory_.reset();
}
capacity_ = 0;
}
auto* ptr = (owned || !mapped_memory_ ? memory_.get() : mapped_memory_);
if (required) CHECK(ptr) << "\nAccess the empty memory.";
return ptr;
}
/*! \brief Return the memory state */
UnifiedMemory::State memory_state() const {
UnifiedMemory::State memory_state() {
return memory(true)->state();
}
/*! \brief Try to return the raw const data pointer */
/*! \brief Return the raw data pointer */
template <class Context>
const void* const_data_ptr() const {
auto context_type = TypeMeta::Id<Context>();
const void* raw_data() {
const auto context_type = TypeMeta::Id<Context>();
if (context_type == TypeMeta::Id<CPUContext>()) {
return memory(true)->cpu_data(nbytes());
} else if (context_type == TypeMeta::Id<CUDAContext>()) {
......@@ -325,14 +326,14 @@ class DRAGON_API Tensor {
}
}
/*! \brief Try to return the raw mutable data pointer */
/*! \brief Get the raw mutable data pointer */
template <class Context>
void mutable_data_ptr(void** data_ptr) {
void raw_mutable_data(void** data_ptr) {
auto* memory_ptr = memory();
if (!memory_ptr) {
*data_ptr = nullptr;
} else {
auto context_type = TypeMeta::Id<Context>();
const auto context_type = TypeMeta::Id<Context>();
if (context_type == TypeMeta::Id<CPUContext>()) {
*data_ptr = memory_ptr->mutable_cpu_data(nbytes());
} else if (context_type == TypeMeta::Id<CUDAContext>()) {
......@@ -343,67 +344,38 @@ class DRAGON_API Tensor {
}
}
/*!
* \brief Return the raw mutable data pointer.
*
* If memory is not set, create to manage it with the given meta.
*/
template <class Context>
void* raw_mutable_data(const TypeMeta& meta) {
void* data_ptr;
mutable_data_ptr<Context>(&data_ptr);
// Return the data pointer directly
if (meta_ == meta && data_ptr) return data_ptr;
// Create a new memory created with size and meta
CHECK_GT(size_, 0) << "\nInvalid tensor size.";
meta_ = meta;
capacity_ = size_ * meta.itemsize();
memory_.reset(new UnifiedMemory(meta_, capacity_));
mutable_data_ptr<Context>(&data_ptr);
if (meta_.ctor()) meta_.ctor()(data_ptr, size_);
return data_ptr;
}
/*! \brief Return the raw mutable data pointer */
template <class Context>
void* raw_mutable_data() {
CHECK_NE(meta_.id(), 0) << "\nTensor(" << name_ << "): unknown type, "
<< "or does not have a type.";
return raw_mutable_data<Context>(meta_);
}
/*! \brief Return the raw const data pointer */
template <class Context>
const void* raw_data() const {
return const_data_ptr<Context>();
}
/*! \brief Return the typed mutable data pointer */
template <typename T, class Context>
T* mutable_data() {
void* data_ptr;
mutable_data_ptr<Context>(&data_ptr);
if (data_ptr) {
auto meta = TypeMeta::Make<T>();
if (meta_ == meta) {
return static_cast<T*>(data_ptr);
} else if (capacity_ >= size_ * meta.itemsize()) {
meta_ = meta;
return static_cast<T*>(data_ptr);
}
}
return static_cast<T*>(raw_mutable_data<Context>(TypeMeta::Make<T>()));
raw_mutable_data<Context>(&data_ptr);
if (data_ptr) return data_ptr;
CHECK_GT(size_, 0) << "\nInvalid tensor size.";
capacity_ = size_ * meta_.itemsize();
memory_.reset(new UnifiedMemory(meta_, capacity_));
raw_mutable_data<Context>(&data_ptr);
if (meta_.ctor()) meta_.ctor()(data_ptr, size_);
return data_ptr;
}
/*! \brief Return the typed const data pointer */
/*! \brief Return the typed data pointer */
template <typename T, class Context>
const T* data() const {
const T* data() {
CHECK(meta_.Match<T>()) << "\nThe type of Tensor(" << name() << ") is "
<< dtypes::to_string(meta_) << ", while requesting "
<< dtypes::to_string(TypeMeta::Make<T>()) << ".";
return static_cast<const T*>(raw_data<Context>());
}
/*! \brief Return the typed mutable data pointer */
template <typename T, class Context>
T* mutable_data() {
meta_ = TypeMeta::Make<T>();
return static_cast<T*>(raw_mutable_data<Context>());
}
/*! \brief Set the tensor version */
void set_version(int version) {
version_ = version;
......@@ -415,12 +387,14 @@ class DRAGON_API Tensor {
return this;
}
/*! \brief Set to manage the memory */
void set_memory(UnifiedMemory* memory_ptr) {
if (memory_ptr != memory_.get()) {
memory_.reset(memory_ptr);
/*! \brief Set the managed memory */
void set_memory(UnifiedMemory* memory) {
if (memory != nullptr) {
if (memory != memory_.get()) {
memory_.reset(memory);
}
capacity_ = memory->size();
}
capacity_ = memory_ptr->size();
}
private:
......@@ -430,14 +404,20 @@ class DRAGON_API Tensor {
/*! \brief The type meta */
TypeMeta meta_;
/*! \brief The size and capacity */
size_t size_ = 0, capacity_ = 0;
/*! \brief The number of elements */
size_t size_ = 0;
/*! \brief The byte length of memory */
size_t capacity_ = 0;
/*! \brief The tensor version */
int version_ = -1;
/*! \brief The dimensions and strides */
vec64_t dims_, strides_;
/*! \brief The dimensions */
vec64_t dims_;
/*! \brief The strides */
vec64_t strides_;
/*! \brief The managed memory */
unique_ptr<UnifiedMemory> memory_;
......@@ -445,9 +425,6 @@ class DRAGON_API Tensor {
/*! \brief The mapped memory */
UnifiedMemory* mapped_memory_ = nullptr;
/*! \brief The ownership of memory pointer */
bool own_memory_ptr_ = true;
DISABLE_COPY_AND_ASSIGN(Tensor);
};
......
......@@ -11,33 +11,101 @@ namespace kernels {
namespace {
#define WARP_ITEMS 16
#define BLOCK_THREADS 1024
#define LDG(x, i) convert::To<AccT>(__ldg(x + i))
template <typename T, typename AccT>
__global__ void
_Softmax(const int NxS, const int S, const int C, const T* x, T* y) {
__shared__ AccT block_max, block_sum;
__shared__ typename BlockReduce<AccT>::TempStorage storage;
__global__ void _SoftmaxViaWarpReduce(
const int NxS,
const int S,
const int C,
const T* x,
T* y) {
const int warp_idx = blockIdx.x * blockDim.y + threadIdx.y;
const int num_warps = gridDim.x * blockDim.y;
for (int i = warp_idx; i < NxS; i += num_warps) {
const int offset = (i / S) * C * S + (i % S);
auto* offset_x = x + offset;
auto* offset_y = y + offset;
AccT val = AccT(-FLT_MAX);
for (int j = threadIdx.x; j < C; j += CUDA_WARP_SIZE) {
val = max(val, LDG(offset_x, j * S));
}
const AccT warp_max = WarpAllReduce<AccT, cub::Max>(val);
val = AccT(0);
for (int j = threadIdx.x; j < C; j += CUDA_WARP_SIZE) {
val += exp(LDG(offset_x, j * S) - warp_max);
}
const AccT warp_sum = WarpAllReduce<AccT, cub::Sum>(val);
for (int j = threadIdx.x; j < C; j += CUDA_WARP_SIZE) {
const int k = j * S;
val = exp(LDG(offset_x, k) - warp_max);
offset_y[k] = convert::To<T>(val / warp_sum);
}
}
}
template <typename T, typename AccT>
__global__ void _LogSoftmaxViaWarpReduce(
const int NxS,
const int S,
const int C,
const T* x,
T* y) {
const int warp_idx = blockIdx.x * blockDim.y + threadIdx.y;
const int num_warps = gridDim.x * blockDim.y;
for (int i = warp_idx; i < NxS; i += num_warps) {
const int offset = (i / S) * C * S + (i % S);
auto* offset_x = x + offset;
auto* offset_y = y + offset;
AccT val = AccT(-FLT_MAX);
for (int j = threadIdx.x; j < C; j += CUDA_WARP_SIZE) {
val = max(val, LDG(offset_x, j * S));
}
const AccT warp_max = WarpAllReduce<AccT, cub::Max>(val);
val = AccT(0);
for (int j = threadIdx.x; j < C; j += CUDA_WARP_SIZE) {
val += exp(LDG(offset_x, j * S) - warp_max);
}
const AccT warp_sum = WarpAllReduce<AccT, cub::Sum>(val);
for (int j = threadIdx.x; j < C; j += CUDA_WARP_SIZE) {
const int k = j * S;
val = LDG(offset_x, k) - warp_max;
offset_y[k] = convert::To<T>(val - log(warp_sum));
}
}
}
template <typename T, typename AccT>
__global__ void _SoftmaxViaBlockReduce(
const int NxS,
const int S,
const int C,
const T* x,
T* y) {
CUDA_2D_KERNEL_LOOP1(i, NxS) {
const int offset = (i / S) * C * S + (i % S);
auto* offset_x = x + offset;
auto* offset_y = y + offset;
AccT val = convert::To<AccT>(__ldg(offset_x));
AccT val = AccT(-FLT_MAX);
CUDA_2D_KERNEL_LOOP2(j, C) {
val = max(val, LDG(offset_x, j * S));
}
val = BlockReduce<AccT>(storage).Reduce(val, cub::Max());
if (threadIdx.x == 0) block_max = val;
__syncthreads();
const AccT block_max = BlockAllReduce<AccT, cub::Max, BLOCK_THREADS>(val);
val = AccT(0);
CUDA_2D_KERNEL_LOOP2(j, C) {
val += exp(LDG(offset_x, j * S) - block_max);
}
val = BlockReduce<AccT>(storage).Sum(val);
if (threadIdx.x == 0) block_sum = val;
__syncthreads();
const AccT block_sum = BlockAllReduce<AccT, cub::Sum, BLOCK_THREADS>(val);
CUDA_2D_KERNEL_LOOP2(j, C) {
const int k = j * S;
......@@ -48,30 +116,28 @@ _Softmax(const int NxS, const int S, const int C, const T* x, T* y) {
}
template <typename T, typename AccT>
__global__ void
_LogSoftmax(const int NxS, const int S, const int C, const T* x, T* y) {
__shared__ AccT block_max, block_sum;
__shared__ typename BlockReduce<AccT>::TempStorage storage;
__global__ void _LogSoftmaxViaBlockReduce(
const int NxS,
const int S,
const int C,
const T* x,
T* y) {
CUDA_2D_KERNEL_LOOP1(i, NxS) {
const int offset = (i / S) * C * S + (i % S);
auto* offset_x = x + offset;
auto* offset_y = y + offset;
AccT val = convert::To<AccT>(__ldg(offset_x));
AccT val = AccT(-FLT_MAX);
CUDA_2D_KERNEL_LOOP2(j, C) {
val = max(val, LDG(offset_x, j * S));
}
val = BlockReduce<AccT>(storage).Reduce(val, cub::Max());
if (threadIdx.x == 0) block_max = val;
__syncthreads();
const AccT block_max = BlockAllReduce<AccT, cub::Max, BLOCK_THREADS>(val);
val = AccT(0);
CUDA_2D_KERNEL_LOOP2(j, C) {
val += exp(LDG(offset_x, j * S) - block_max);
}
val = BlockReduce<AccT>(storage).Sum(val);
if (threadIdx.x == 0) block_sum = val;
__syncthreads();
const AccT block_sum = BlockAllReduce<AccT, cub::Sum, BLOCK_THREADS>(val);
CUDA_2D_KERNEL_LOOP2(j, C) {
const int k = j * S;
......@@ -82,15 +148,74 @@ _LogSoftmax(const int NxS, const int S, const int C, const T* x, T* y) {
}
template <typename T, typename AccT>
__global__ void _SoftmaxGrad(
__global__ void _SoftmaxGradViaWarpReduce(
const int NxS,
const int S,
const int C,
const T* dy,
const T* y,
T* dx) {
const int warp_idx = blockIdx.x * blockDim.y + threadIdx.y;
const int num_warps = gridDim.x * blockDim.y;
for (int i = warp_idx; i < NxS; i += num_warps) {
const int offset = (i / S) * C * S + (i % S);
auto* offset_dy = dy + offset;
auto* offset_y = y + offset;
auto* offset_dx = dx + offset;
AccT val = AccT(0);
for (int j = threadIdx.x; j < C; j += CUDA_WARP_SIZE) {
const int k = j * S;
val += LDG(offset_dy, k) * LDG(offset_y, k);
}
const AccT warp_sum = WarpAllReduce<AccT, cub::Sum>(val);
for (int j = threadIdx.x; j < C; j += CUDA_WARP_SIZE) {
const int k = j * S;
val = LDG(offset_dy, k) - warp_sum;
offset_dx[k] = convert::To<T>(val * LDG(offset_y, k));
}
}
}
template <typename T, typename AccT>
__global__ void _LogSoftmaxGradViaWarpReduce(
const int NxS,
const int S,
const int C,
const T* dy,
const T* y,
T* dx) {
const int warp_idx = blockIdx.x * blockDim.y + threadIdx.y;
const int num_warps = gridDim.x * blockDim.y;
for (int i = warp_idx; i < NxS; i += num_warps) {
const int offset = (i / S) * C * S + (i % S);
auto* offset_dy = dy + offset;
auto* offset_y = y + offset;
auto* offset_dx = dx + offset;
AccT val = AccT(0);
for (int j = threadIdx.x; j < C; j += CUDA_WARP_SIZE) {
val += LDG(offset_dy, j * S);
}
const AccT warp_sum = WarpAllReduce<AccT, cub::Sum>(val);
for (int j = threadIdx.x; j < C; j += CUDA_WARP_SIZE) {
const int k = j * S;
val = exp(convert::To<AccT>(offset_y[k])) * warp_sum;
offset_dx[k] = convert::To<T>(LDG(offset_dy, k) - val);
}
}
}
template <typename T, typename AccT>
__global__ void _SoftmaxGradViaBlockReduce(
const int NxS,
const int S,
const int C,
const T* dy,
const T* y,
T* dx) {
__shared__ AccT block_val;
__shared__ typename BlockReduce<AccT>::TempStorage storage;
CUDA_2D_KERNEL_LOOP1(i, NxS) {
const int offset = (i / S) * C * S + (i % S);
auto* offset_dy = dy + offset;
......@@ -102,28 +227,24 @@ __global__ void _SoftmaxGrad(
const int k = j * S;
val += LDG(offset_dy, k) * LDG(offset_y, k);
}
val = BlockReduce<AccT>(storage).Sum(val);
if (threadIdx.x == 0) block_val = val;
__syncthreads();
const AccT block_sum = BlockAllReduce<AccT, cub::Sum, BLOCK_THREADS>(val);
CUDA_2D_KERNEL_LOOP2(j, C) {
const int k = j * S;
val = LDG(offset_dy, k) - block_val;
val = LDG(offset_dy, k) - block_sum;
offset_dx[k] = convert::To<T>(val * LDG(offset_y, k));
}
}
}
template <typename T, typename AccT>
__global__ void _LogSoftmaxGrad(
__global__ void _LogSoftmaxGradViaBlockReduce(
const int NxS,
const int S,
const int C,
const T* dy,
const T* y,
T* dx) {
__shared__ AccT block_val;
__shared__ typename BlockReduce<AccT>::TempStorage storage;
CUDA_2D_KERNEL_LOOP1(i, NxS) {
const int offset = (i / S) * C * S + (i % S);
auto* offset_dy = dy + offset;
......@@ -134,13 +255,11 @@ __global__ void _LogSoftmaxGrad(
CUDA_2D_KERNEL_LOOP2(j, C) {
val += LDG(offset_dy, j * S);
}
val = BlockReduce<AccT>(storage).Sum(val);
if (threadIdx.x == 0) block_val = val;
__syncthreads();
const AccT block_sum = BlockAllReduce<AccT, cub::Sum, BLOCK_THREADS>(val);
CUDA_2D_KERNEL_LOOP2(j, C) {
const int k = j * S;
val = exp(convert::To<AccT>(offset_y[k])) * block_val;
val = exp(convert::To<AccT>(offset_y[k])) * block_sum;
offset_dx[k] = convert::To<T>(LDG(offset_dy, k) - val);
}
}
......@@ -152,23 +271,41 @@ __global__ void _LogSoftmaxGrad(
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(name, T) \
template <> \
void name<T, CUDAContext>( \
const int N, \
const int S, \
const int C, \
const T* x, \
T* y, \
CUDAContext* ctx) { \
const auto NxS = N * S; \
_##name<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \
<<<NxS, CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
NxS, \
S, \
C, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
#define DEFINE_KERNEL_LAUNCHER(name, T) \
template <> \
void name<T, CUDAContext>( \
const int N, \
const int S, \
const int C, \
const T* x, \
T* y, \
CUDAContext* ctx) { \
const auto NxS = N * S; \
if (C <= 1024) { \
const auto nblocks = math::utils::DivUp<int>(NxS, WARP_ITEMS); \
_##name##ViaWarpReduce< \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type> \
<<<nblocks, \
dim3(CUDA_WARP_SIZE, WARP_ITEMS), \
0, \
ctx->cuda_stream()>>>( \
NxS, \
S, \
C, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
return; \
} \
_##name##ViaBlockReduce< \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type> \
<<<NxS, BLOCK_THREADS, 0, ctx->cuda_stream()>>>( \
NxS, \
S, \
C, \
reinterpret_cast<const math::ScalarType<T>::type*>(x), \
reinterpret_cast<math::ScalarType<T>::type*>(y)); \
}
#define DEFINE_GRAD_KERNEL_LAUNCHER(name, T) \
......@@ -182,8 +319,27 @@ __global__ void _LogSoftmaxGrad(
T* dx, \
CUDAContext* ctx) { \
const auto NxS = N * S; \
_##name<math::ScalarType<T>::type, math::AccmulatorType<T>::type> \
<<<NxS, CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
if (C <= 1024) { \
const auto nblocks = math::utils::DivUp<int>(NxS, WARP_ITEMS); \
_##name##ViaWarpReduce< \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type> \
<<<nblocks, \
dim3(CUDA_WARP_SIZE, WARP_ITEMS), \
0, \
ctx->cuda_stream()>>>( \
NxS, \
S, \
C, \
reinterpret_cast<const math::ScalarType<T>::type*>(dy), \
reinterpret_cast<const math::ScalarType<T>::type*>(y), \
reinterpret_cast<math::ScalarType<T>::type*>(dx)); \
return; \
} \
_##name##ViaBlockReduce< \
math::ScalarType<T>::type, \
math::AccmulatorType<T>::type> \
<<<NxS, BLOCK_THREADS, 0, ctx->cuda_stream()>>>( \
NxS, \
S, \
C, \
......@@ -206,6 +362,8 @@ DEFINE_GRAD_KERNEL_LAUNCHER(LogSoftmaxGrad, float);
DEFINE_GRAD_KERNEL_LAUNCHER(LogSoftmaxGrad, double);
#undef DEFINE_KERNEL_LAUNCHER
#undef DEFINE_GRAD_KERNEL_LAUNCHER
#undef WARP_ITEMS
#undef BLOCK_THREADS
} // namespace kernels
......
......@@ -91,16 +91,24 @@ void RegisterModule_cuda(py::module& m) {
#endif
});
/*! \brief Activate the CuDNN engine */
/*! \brief Enable using the cuDNN library */
m.def(
"cudaEnableDNN",
[](bool enabled, bool deterministic, bool benchmark, bool allow_tf32) {
[](bool enabled,
bool deterministic,
bool benchmark,
bool allow_tf32,
const vector<string>& disabled_ops) {
#ifdef USE_CUDA
auto& cuda_objects = CUDAContext::objects();
cuda_objects.cudnn_enabled_ = enabled;
cuda_objects.cudnn_deterministic_ = deterministic;
cuda_objects.cudnn_benchmark_ = benchmark;
cuda_objects.cudnn_allow_tf32_ = allow_tf32;
cuda_objects.cudnn_disabled_ops_.clear();
for (const auto& op_type : disabled_ops) {
cuda_objects.cudnn_disabled_ops_.insert(op_type);
}
#endif
});
......
......@@ -29,15 +29,22 @@ void RegisterModule_tensor(py::module& m) {
/*! \brief Return the number of dimensions */
.def_property_readonly("ndim", &Tensor::ndim)
/*! \brief Return all the dimensions */
/*! \brief Return the dimensions */
.def_property_readonly("dims", &Tensor::dims)
/*! \brief Return the total number of elements */
/*! \brief Return the number of elements */
.def_property_readonly("size", &Tensor::size)
/*! \brief Return the total number of bytes */
/*! \brief Return the byte length of one element */
.def_property_readonly(
"itemsize", [](Tensor* self) { return self->meta().itemsize(); })
/*! \brief Return the byte length of all elements */
.def_property_readonly("nbytes", &Tensor::nbytes)
/*! \brief Return the byte length of allocated memory */
.def_property_readonly("capacity", &Tensor::capacity)
/*! \brief Return the data type */
.def_property_readonly(
"dtype",
......
......@@ -60,7 +60,9 @@ class CuDNNSoftmaxOp final : public SoftmaxOp<Context> {
CuDNNDestroyTensorDesc(&input_desc_);
}
void RunOnDevice() override;
void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T>
void DoRunWithType();
......@@ -82,7 +84,9 @@ class CuDNNSoftmaxGradientOp final : public SoftmaxGradientOp<Context> {
CuDNNDestroyTensorDesc(&input_desc_);
}
void RunOnDevice() override;
void RunOnDevice() override {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <typename T>
void DoRunWithType();
......
#ifdef USE_CUDNN
#include "dragon/operators/activation/softmax_op.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......@@ -9,8 +10,19 @@ template <typename T>
void CuDNNSoftmaxOp<Context>::DoRunWithType() {
auto &X = Input(0), *Y = Output(0, {0});
GET_OP_AXIS_ARG(axis, X.ndim(), -1);
auto S = X.count(axis + 1);
CuDNNSetTensorDesc<T>(&input_desc_, {X.count(0, axis), X.dim(axis), S, 1});
const auto C = X.dim(axis);
const auto N = X.count(0, axis), S = X.count(axis + 1);
if (C < 384 && S == 1) {
kernels::Softmax(
N,
S,
C,
X.template data<T, Context>(),
Y->ReshapeLike(X)->template mutable_data<T, Context>(),
ctx());
return;
}
CuDNNSetTensorDesc<T>(&input_desc_, {N, C, S, 1});
CUDNN_CHECK(cudnnSoftmaxForward(
ctx()->cudnn_handle(),
CUDNN_SOFTMAX_ACCURATE,
......@@ -24,17 +36,24 @@ void CuDNNSoftmaxOp<Context>::DoRunWithType() {
}
template <class Context>
void CuDNNSoftmaxOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
template <class Context>
template <typename T>
void CuDNNSoftmaxGradientOp<Context>::DoRunWithType() {
auto &Y = Input(0), &dY = Input(1), *dX = Output(0);
GET_OP_AXIS_ARG(axis, Y.ndim(), -1);
auto S = Y.count(axis + 1);
CuDNNSetTensorDesc<T>(&input_desc_, {Y.count(0, axis), Y.dim(axis), S, 1});
const auto C = Y.dim(axis);
const auto N = Y.count(0, axis), S = Y.count(axis + 1);
if (C < 256 && S == 1) {
kernels::SoftmaxGrad(
N,
S,
C,
dY.template data<T, Context>(),
Y.template data<T, Context>(),
dX->ReshapeLike(Y)->template mutable_data<T, Context>(),
ctx());
return;
}
CuDNNSetTensorDesc<T>(&input_desc_, {N, C, S, 1});
CUDNN_CHECK(cudnnSoftmaxBackward(
ctx()->cudnn_handle(),
CUDNN_SOFTMAX_ACCURATE,
......@@ -49,11 +68,6 @@ void CuDNNSoftmaxGradientOp<Context>::DoRunWithType() {
dX->ReshapeLike(Y)->template mutable_data<T, Context>()));
}
template <class Context>
void CuDNNSoftmaxGradientOp<Context>::RunOnDevice() {
DispatchHelper<dtypes::Floating>::Call(this, Input(0));
}
DEPLOY_CUDNN_OPERATOR(Softmax);
DEPLOY_CUDNN_OPERATOR(SoftmaxGradient);
......
......@@ -29,7 +29,7 @@ void ConcatOp<Context>::DoRunWithType() {
int64_t output_offset = 0;
for (int i = 0; i < InputSize(); ++i) {
const auto& Xi = Input(i);
auto& Xi = Input(i);
math::CopyMatrix(
Xi.count(0, axis),
Xi.count(axis),
......
......@@ -28,7 +28,7 @@ void StackOp<Context>::DoRunWithType() {
int64_t output_offset = 0;
for (int i = 0; i < num_stacks; i++) {
const auto& Xi = Input(i);
auto& Xi = Input(i);
math::CopyMatrix(
Xi.count(0, axis),
Xi.count(axis),
......
......@@ -67,6 +67,7 @@ def enable_cudnn(
deterministic=False,
benchmark=False,
allow_tf32=False,
disabled_ops=None,
):
"""Enable backend to use the cuDNN library.
......@@ -80,9 +81,12 @@ def enable_cudnn(
Select fastest algorithms via benchmark or heuristics.
allow_tf32 : bool, optional, default=False
Allow TF32 tensor core operation or not.
disabled_ops : Sequence[str], optional
The operator types for which cuDNN is disabled.
"""
return backend.cudaEnableDNN(enabled, deterministic, benchmark, allow_tf32)
return backend.cudaEnableDNN(enabled, deterministic, benchmark,
allow_tf32, disabled_ops or [])
def get_device_capability(device_index=None):
......
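With the new `disabled_ops` argument, individual operator types can be kept off the cuDNN path while cuDNN stays enabled globally, for example to exercise the warp-reduce softmax kernels added in this commit. A hedged usage sketch; the `dragon.cuda.enable_cudnn` entry point and the listed operator type names are assumed from the surrounding code:

```python
import dragon

# Keep cuDNN enabled overall, but skip it for the listed operator types,
# so the custom CUDA softmax kernels are used instead.
dragon.cuda.enable_cudnn(
    enabled=True,
    benchmark=True,
    disabled_ops=['Softmax', 'SoftmaxGradient'],  # hypothetical selection
)
```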
......@@ -143,6 +143,11 @@ CONVERSIONS_DECL half To<half, half>(half val) {
}
template <>
CONVERSIONS_DECL float To<float, half>(half val) {
return __half2float(val);
}
template <>
CONVERSIONS_DECL half To<half, float>(float val) {
#if CUDA_VERSION_MIN(9, 2)
return __float2half(val);
......
......@@ -20,12 +20,35 @@
#include <cub/device/device_reduce.cuh>
#include <cub/device/device_select.cuh>
#include <cub/iterator/counting_input_iterator.cuh>
#include <cub/warp/warp_reduce.cuh>
namespace dragon {
template <typename T>
using WarpReduce = cub::BlockReduce<T, CUDA_WARP_SIZE>;
template <typename T>
using BlockReduce = cub::BlockReduce<T, CUDA_THREADS>;
template <typename T, typename Reducer>
__inline__ __device__ T WarpAllReduce(T val) {
for (int mask = CUDA_WARP_SIZE / 2; mask > 0; mask /= 2) {
val = Reducer()(val, __shfl_xor_sync(0xffffffff, val, mask));
}
return val;
}
template <typename T, typename Reducer, int kThreadsPerBlock>
__inline__ __device__ T BlockAllReduce(T val) {
typedef cub::BlockReduce<T, kThreadsPerBlock> BlockReduce;
__shared__ T block_val;
__shared__ typename BlockReduce::TempStorage storage;
val = BlockReduce(storage).Reduce(val, Reducer());
if (threadIdx.x == 0) block_val = val;
__syncthreads();
return block_val;
}
} // namespace dragon
#endif // USE_CUDA
......
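The butterfly pattern in `WarpAllReduce` can be understood by simulating the XOR-shuffle exchange across a 32-lane warp: after log2(32) = 5 rounds, every lane holds the reduction of all lanes, which is why no shared memory or broadcast step is needed. A small NumPy simulation of that exchange, illustrative only:

```python
import numpy as np

WARP_SIZE = 32

def warp_all_reduce(vals, op=np.maximum):
    """Simulate the __shfl_xor_sync butterfly: each round, lane i combines
    its value with the value of lane (i ^ mask); after 5 rounds every lane
    holds the full reduction."""
    vals = np.asarray(vals, dtype=np.float64).copy()
    mask = WARP_SIZE // 2
    while mask > 0:
        partner = np.arange(WARP_SIZE) ^ mask   # lane exchanged with via XOR
        vals = op(vals, vals[partner])
        mask //= 2
    return vals

lane_vals = np.random.randn(WARP_SIZE)
reduced = warp_all_reduce(lane_vals, op=np.maximum)
assert np.allclose(reduced, lane_vals.max())
```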
......@@ -27,10 +27,13 @@ namespace dragon {
#ifdef USE_CUDA
/*! \brief The number of cuda threads to use */
/*! \brief The number of cuda threads in a warp */
constexpr int CUDA_WARP_SIZE = 32;
/*! \brief The number of cuda threads in a block */
constexpr int CUDA_THREADS = 256;
/*! \brief The maximum number of blocks to use in the default kernel call */
/*! \brief The maximum number of blocks to use in a default kernel call */
constexpr int CUDA_MAX_BLOCKS = 4096;
/*! \brief The maximum number of devices in a single machine */
......
......@@ -29,11 +29,9 @@ void _Transpose(
} // namespace
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_TRANSPOSE_FUNC(T) \
template <> \
void Transpose<T, CPUContext>( \
DRAGON_API void Transpose<T, CPUContext>( \
const int num_dims, \
const int64_t* dims, \
const int64_t* axes, \
......
......@@ -82,11 +82,9 @@ void _TransposeImpl(
} // namespace
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_TRANSPOSE_FUNC(T) \
template <> \
void Transpose<T, CUDAContext>( \
DRAGON_API void Transpose<T, CUDAContext>( \
const int num_dims, \
const int64_t* dims, \
const int64_t* axes, \
......