Commit fdf26ef2 by Ting PAN

Use local workspace for Context

Summary:
This commit uses a local (per-thread or per-stream) workspace for Context,
which provides a more elegant way to dispatch kernels that require scratch memory.
In addition, the TF32 math type is exposed as a cuDNN option for Ampere devices.
1 parent 1dd8aeef
Showing with 1038 additions and 923 deletions
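
Before the file-by-file diff, here is a minimal sketch of the dispatch pattern this commit moves to: scratch buffers are requested from the context-local workspace (`ctx()->workspace()`) instead of the operator's parent workspace. The operator name `MyOp` and the segment sizes are illustrative only; the `data<...>()` calls mirror the ones that appear in the hunks below.

// Sketch only: requesting scratch from the thread/stream-local workspace.
template <class Context>
template <typename T>
void MyOp<Context>::DoRunWithType() {
  auto& X = Input(0);
  // One typed scratch segment, sized by element count.
  T* scratch = ctx()->workspace()->template data<T, Context>({X.count()})[0];
  // Or several untyped segments, sized in bytes.
  auto scratches = ctx()->workspace()->template data<Context>({
      X.count() * sizeof(uint32_t), // e.g. random seeds
      X.count() * sizeof(int), // e.g. an int32 mask
  });
  // ... launch kernels that consume the scratch on this context ...
}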
......@@ -9,7 +9,7 @@ dragon/core
`class CPUContext <core/CPUContext.html>`_
: The cpu device context.
`class CUDAContext <core/CPUContext.html>`_
`class CUDAContext <core/CUDAContext.html>`_
: The cuda device context.
`class Graph <core/Graph.html>`_
......
......@@ -69,6 +69,10 @@ stream
######
.. doxygenfunction:: dragon::CPUContext::stream
workspace
#########
.. doxygenfunction:: dragon::CPUContext::workspace
.. raw:: html
<style>
......
......@@ -97,6 +97,14 @@ stream
######
.. doxygenfunction:: dragon::CUDAContext::stream
workspace
#########
.. doxygenfunction:: dragon::CUDAContext::workspace()
workspace
#########
.. doxygenfunction:: dragon::CUDAContext::workspace(int device, int stream)
.. raw:: html
<style>
......
......@@ -43,9 +43,9 @@ phase
#####
.. doxygenfunction:: dragon::Graph::phase
ws
##
.. doxygenfunction:: dragon::Graph::ws
workspace
#########
.. doxygenfunction:: dragon::Graph::workspace
.. raw:: html
......
......@@ -95,9 +95,9 @@ phase
#####
.. doxygenfunction:: dragon::Operator::phase
ws
##
.. doxygenfunction:: dragon::Operator::ws
workspace
#########
.. doxygenfunction:: dragon::Operator::workspace
.. raw:: html
......
......@@ -30,6 +30,9 @@ dragon
`cast(...) <dragon/cast.html>`_
: Cast the data type of input.
`channel_affine(...) <dragon/channel_affine.html>`_
: Apply affine transformation along the channels.
`channel_normalize(...) <dragon/channel_normalize.html>`_
: Normalize channels with mean and standard deviation.
......@@ -171,6 +174,7 @@ dragon
dragon/assign
dragon/broadcast_to
dragon/cast
dragon/channel_affine
dragon/channel_normalize
dragon/channel_shuffle
dragon/concat
......
affine
======
channel_affine
==============
.. autofunction:: dragon.math.affine
.. autofunction:: dragon.channel_affine
.. raw:: html
<style>
h1:before {
content: "dragon.math.";
content: "dragon.";
color: #103d3e;
}
</style>
......@@ -12,9 +12,6 @@ dragon.math
`add(...) <math/add.html>`_
: Compute the element-wise addition.
`affine(...) <math/affine.html>`_
: Compute the affine transformation along the given axes.
`argmax(...) <math/argmax.html>`_
: Compute the index of maximum elements along the given axis.
......@@ -149,7 +146,6 @@ dragon.math
math/abs
math/add
math/affine
math/argmax
math/argmin
math/axpby
......
......@@ -60,6 +60,9 @@ vm.torch
`ceil(...) <torch/ceil.html>`_
: Compute the smallest integer not less than input.
`channel_affine(...) <torch/channel_affine.html>`_
: Apply affine transformation along the channels.
`channel_normalize(...) <torch/channel_normalize.html>`_
: Normalize channels with mean and standard deviation.
......@@ -263,6 +266,7 @@ vm.torch
torch/bitwise_xor
torch/cat
torch/ceil
torch/channel_affine
torch/channel_normalize
torch/channel_shuffle
torch/chunk
......
affine
======
channel_affine
==============
.. autofunction:: dragon.vm.torch.nn.functional.affine
.. _torch.nn.Affine(...): ../Affine.html
.. autofunction:: dragon.vm.torch.channel_affine
.. raw:: html
<style>
h1:before {
content: "torch.nn.functional.";
content: "torch.";
color: #103d3e;
}
</style>
......@@ -6,8 +6,8 @@ vm.torch.nn
Classes
-------
`class Affine <nn/Affine.html>`_
: Apply the affine transformation over input.
`class AffineChannel <nn/AffineChannel.html>`_
: Apply affine transformation along the channels.
`class AvgPool2d <nn/AvgPool2d.html>`_
: Apply the 2d average pooling.
......@@ -197,7 +197,7 @@ vm.torch.nn
.. toctree::
:hidden:
nn/Affine
nn/AffineChannel
nn/AvgPool2d
nn/BatchNorm1d
nn/BatchNorm2d
......
Affine
======
AffineChannel
=============
.. autoclass:: dragon.vm.torch.nn.Affine
.. autoclass:: dragon.vm.torch.nn.AffineChannel
__init__
--------
.. automethod:: dragon.vm.torch.nn.Affine.__init__
.. automethod:: dragon.vm.torch.nn.AffineChannel.__init__
.. _torch.nn.functional.affine(...): functional/affine.html
.. _torch.channel_affine(...): ../channel_affine.html
.. raw:: html
......
......@@ -6,9 +6,6 @@ vm.torch.nn.functional
Functions
---------
`affine(...) <functional/affine.html>`_
: Apply the affine transformation to input.
`avg_pool2d(...) <functional/avg_pool2d.html>`_
: Apply the 2d average pooling to input.
......@@ -132,7 +129,6 @@ vm.torch.nn.functional
.. toctree::
:hidden:
functional/affine
functional/avg_pool2d
functional/batch_norm
functional/binary_cross_entropy_with_logits
......
#include "context_cuda.h"
#include "dragon/core/context_cuda.h"
#include "dragon/core/workspace.h"
namespace dragon {
Workspace* CPUContext::workspace() {
static thread_local Workspace workspace("");
return &workspace;
}
#ifdef USE_CUDA
CUDAObjects::~CUDAObjects() {
for (int i = 0; i < CUDA_MAX_DEVICES; i++) {
#ifdef USE_NCCL
for (auto& comm_iter : nccl_comms_[i]) {
if (comm_iter.second) {
NCCL_CHECK(ncclCommDestroy(comm_iter.second));
}
}
#endif
#ifdef USE_CUDNN
for (auto& handle : cudnn_handles_[i]) {
/*!
 * Temporarily disable destroying the handle
 * to avoid a segmentation fault in cuDNN v8.
*
* if (handle) CUDNN_CHECK(cudnnDestroy(handle));
*/
}
#endif
for (auto& handle : cublas_handles_[i]) {
if (handle) CUBLAS_CHECK(cublasDestroy(handle));
}
for (int j = 0; j < cuda_streams_[i].size(); j++) {
auto& stream = cuda_streams_[i][j];
/*!
 * Do not check the result of destroying the stream;
 * error code 29 (driver shutting down) is inevitable.
*/
if (stream) cudaStreamDestroy(stream);
}
for (auto& workspace : cuda_workspaces_[i]) {
if (workspace) delete workspace;
}
}
}
Workspace* CUDAObjects::workspace(int device_id, int stream_id) {
auto& workspaces = cuda_workspaces_[device_id];
if (workspaces.size() <= (unsigned)stream_id) {
workspaces.resize(stream_id + 1, nullptr);
}
if (!workspaces[stream_id]) {
workspaces[stream_id] = new Workspace("");
}
return workspaces[stream_id];
}
std::mutex& CUDAContext::mutex() {
static std::mutex m;
return m;
......
......@@ -17,6 +17,8 @@
namespace dragon {
class Workspace;
/*!
* \brief The cpu device context.
*/
......@@ -94,6 +96,9 @@ class DRAGON_API CPUContext {
/*! \brief Wait for the dispatched computation to complete */
void FinishDeviceComputation() {}
/*! \brief Return the current workspace */
Workspace* workspace();
/*! \brief Return the device index */
int device() const {
return 0;
......
......@@ -22,12 +22,15 @@ namespace dragon {
#ifdef USE_CUDA
class Workspace;
class CUDAObjects {
public:
/*! \brief Default Constructor */
CUDAObjects() {
for (int i = 0; i < CUDA_MAX_DEVICES; i++) {
cuda_streams_[i] = vector<cudaStream_t>();
cuda_workspaces_[i] = vector<Workspace*>();
cublas_handles_[i] = vector<cublasHandle_t>();
#ifdef USE_CUDNN
cudnn_handles_[i] = vector<cudnnHandle_t>();
......@@ -39,38 +42,7 @@ class CUDAObjects {
}
/*! \brief Destructor */
~CUDAObjects() {
for (int i = 0; i < CUDA_MAX_DEVICES; i++) {
#ifdef USE_NCCL
for (auto& comm_iter : nccl_comms_[i]) {
if (comm_iter.second) {
NCCL_CHECK(ncclCommDestroy(comm_iter.second));
}
}
#endif
#ifdef USE_CUDNN
for (auto& handle : cudnn_handles_[i]) {
/*!
* Temporarily disable the handle destroying,
* to avoid the segmentation fault in CUDNN v8.
*
* if (handle) CUDNN_CHECK(cudnnDestroy(handle));
*/
}
#endif
for (auto& handle : cublas_handles_[i]) {
if (handle) CUBLAS_CHECK(cublasDestroy(handle));
}
for (int j = 0; j < cuda_streams_[i].size(); j++) {
auto& stream = cuda_streams_[i][j];
/*!
* Do not check the stream destroying,
* error code 29 (driver shutting down) is inevitable.
*/
if (stream) cudaStreamDestroy(stream);
}
}
}
~CUDAObjects();
/*! \brief Return the specified cublas handle */
cublasHandle_t cublas_handle(int device_id, int stream_id) {
......@@ -142,8 +114,9 @@ class CUDAObjects {
/*! \brief Return the specified cuda stream */
cudaStream_t stream(int device_id, int stream_id) {
auto& streams = cuda_streams_[device_id];
if (streams.size() <= (unsigned)stream_id)
if (streams.size() <= (unsigned)stream_id) {
streams.resize(stream_id + 1, nullptr);
}
if (!streams[stream_id]) {
CUDADeviceGuard guard(device_id);
unsigned int flags =
......@@ -153,19 +126,37 @@ class CUDAObjects {
return streams[stream_id];
}
/*! \brief Return the workspace for the specified cuda stream */
Workspace* workspace(int device_id, int stream_id);
/*! \brief The cached CUDA streams of each device */
vector<cudaStream_t> cuda_streams_[CUDA_MAX_DEVICES];
/*! \brief The cached CUDA workspaces of each device */
vector<Workspace*> cuda_workspaces_[CUDA_MAX_DEVICES];
/*! \brief The cached cuBLAS handles of each device */
vector<cublasHandle_t> cublas_handles_[CUDA_MAX_DEVICES];
#ifdef USE_CUDNN
/*! \brief The cached cuDNN handles of each device */
vector<cudnnHandle_t> cudnn_handles_[CUDA_MAX_DEVICES];
#endif
#ifdef USE_NCCL
/*! \brief The cached NCCL comms of each device */
Map<string, ncclComm_t> nccl_comms_[CUDA_MAX_DEVICES];
#endif
/*! \brief The flag that allows cuDNN or not */
bool cudnn_enabled_ = true;
/*! \brief The flag that allows cuDNN benchmark or not */
bool cudnn_benchmark_ = false;
/*! \brief The flag that allows cuDNN TF32 math type or not */
bool cudnn_allow_tf32_ = false;
private:
DISABLE_COPY_AND_ASSIGN(CUDAObjects);
};
......@@ -190,11 +181,19 @@ class DRAGON_API CUDAContext {
CHECK_EQ(option.device_type(), PROTO_CUDA);
}
/*! \brief Allocate a block of memory */
/*! \brief Allocate a block of device memory */
static void* New(size_t size) {
void* data;
cudaMalloc(&data, size);
CHECK(data) << "\nAllocate cuda memory with " << size << " bytes failed.";
CHECK(data) << "\nAllocate device memory with " << size << " bytes failed.";
return data;
}
/*! \brief Allocate a block of host memory */
static void* NewHost(size_t size) {
void* data;
cudaMallocHost(&data, size);
CHECK(data) << "\nAllocate host memory with " << size << " bytes failed.";
return data;
}
......@@ -237,11 +236,16 @@ class DRAGON_API CUDAContext {
CHECK_EQ(err, cudaSuccess) << "\nCUDA Error: " << cudaGetErrorString(err);
}
/*! \brief Deallocate a memory block */
/*! \brief Deallocate a device memory block */
static void Delete(void* ptr) {
cudaFree(ptr);
}
/*! \brief Deallocate a host memory block */
static void DeleteHost(void* ptr) {
cudaFreeHost(ptr);
}
/*! \brief Switch to the device in current thread */
void SwitchToDevice() {
SwitchToDevice(0);
......@@ -265,9 +269,19 @@ class DRAGON_API CUDAContext {
SynchronizeStream(cuda_stream());
}
/*! \brief Return the cuda stream */
/*! \brief Return the current workspace */
Workspace* workspace() {
return objects().workspace(device_id_, stream_id_);
}
/*! \brief Return the specified workspace */
Workspace* workspace(int device, int stream) {
return objects().workspace(device, stream);
}
/*! \brief Return the current cuda stream */
cudaStream_t cuda_stream() {
return cuda_stream(device_id_, stream_id_);
return objects().stream(device_id_, stream_id_);
}
/*! \brief Return the specified cuda stream */
......@@ -359,12 +373,18 @@ class DRAGON_API CUDAContext {
CUDA_NOT_COMPILED;
}
/*! \brief Allocate a block of memory */
/*! \brief Allocate a block of device memory */
static void* New(size_t nbytes) {
CUDA_NOT_COMPILED;
return nullptr;
}
/*! \brief Allocate a block of host memory */
static void* NewHost(size_t nbytes) {
CUDA_NOT_COMPILED;
return nullptr;
}
/*! \brief Set a memory block to the given value */
static void Memset(size_t nbytes, void* ptr, int value = 0) {
CUDA_NOT_COMPILED;
......@@ -387,11 +407,16 @@ class DRAGON_API CUDAContext {
CUDA_NOT_COMPILED;
}
/*! \brief Deallocate a memory block */
/*! \brief Deallocate a device memory block */
static void Delete(void* ptr) {
CUDA_NOT_COMPILED;
}
/*! \brief Deallocate a host memory block */
static void DeleteHost(void* ptr) {
CUDA_NOT_COMPILED;
}
/*! \brief Copy the memory asynchronously */
template <class DestContext, class SrcContext>
void MemcpyAsync(size_t nbytes, void* dest, const void* src) {
......
......@@ -69,7 +69,7 @@ class DRAGON_API GraphBase {
}
/*! \brief Return the parent workspace */
Workspace* ws() const {
Workspace* workspace() const {
return ws_;
}
......
......@@ -147,7 +147,7 @@ class DRAGON_API UnifiedMemory {
/*! \brief Set to use an external block of cpu data */
void set_cpu_data(void* cpu_ptr, size_t size);
/*! \brief Set to use an extenral block of cuda data */
/*! \brief Set to use an external block of cuda data */
void set_cuda_data(void* cuda_ptr, size_t size, int device);
private:
......
......@@ -71,7 +71,7 @@ Tensor* OperatorBase::Output(int i, const vec32_t& inputs) {
}
Tensor* OperatorBase::Buffer(const string& name) {
return ws()->CreateTensor("/share/buffer/" + handle_ + "/" + name);
return workspace()->CreateTensor("/share/buffer/" + handle_ + "/" + name);
}
string OperatorBase::MessageForUnsupported(
......@@ -94,10 +94,10 @@ OperatorBase* OperatorBase::UpdateFrom(const OperatorDef& def) {
inputs_.resize(def.input_size());
outputs_.resize(def.output_size());
for (int i = 0; i < inputs_.size(); i++) {
inputs_[i] = ws()->GetTensor(def.input(i));
inputs_[i] = workspace()->GetTensor(def.input(i));
}
for (int i = 0; i < outputs_.size(); i++) {
outputs_[i] = ws()->CreateTensor(def.output(i));
outputs_[i] = workspace()->CreateTensor(def.output(i));
}
return this;
}
......@@ -113,7 +113,7 @@ void Operator<Context>::Prepare() {
LOG(DEBUG) << "Excepted version of Tensor(" + Input(i).name() + ") "
<< "is " << version << ", got " << Input(i).version()
<< ". Recompute.";
Tensor* flag = ws()->GetTensor("/share/flag/recomputing");
Tensor* flag = workspace()->GetTensor("/share/flag/recomputing");
flag->mutable_data<bool, CPUContext>()[0] = true;
vector<OperatorBase*>& chain = subgraph()[name];
for (auto* op : chain) {
......
......@@ -139,7 +139,7 @@ class DRAGON_API OperatorBase {
}
/*! \brief Return the parent workspace */
Workspace* ws() const {
Workspace* workspace() const {
return ws_;
}
......@@ -219,7 +219,7 @@ class DRAGON_API Operator : public OperatorBase {
ctx()->SwitchToDevice(stream);
SwitchToDevice();
RunOnDevice();
if (do_sync_ || stream > 0) {
if (do_sync_) {
ctx()->FinishDeviceComputation();
}
Release();
......@@ -262,7 +262,7 @@ OperatorBase* NewOperator(const OperatorDef&, Workspace*);
using OperatorBase::data_format; \
using OperatorBase::handle; \
using OperatorBase::def; \
using OperatorBase::ws
using OperatorBase::workspace
#define USE_OPERATOR_FUNCTIONS \
USE_OPERATOR_BASE_FUNCTIONS; \
......@@ -274,7 +274,7 @@ OperatorBase* NewOperator(const OperatorDef&, Workspace*);
->set_meta(Input(i).meta()))
#define RESTORE_INPUT_SPEC(i) \
*(ws()->GetTensor( \
*(workspace()->GetTensor( \
"/share/buffer/" + handle() + "/X_spec:" + std::to_string(i)))
/* Dispatchers */
......@@ -341,7 +341,7 @@ DEFINE_TENSOR_TYPES_DISPATCHER(DoRunWithType);
#define TENSOR_FILL_WITH_TYPE(tensor, shape, type) \
if (tensor.count() == 0) { \
auto* filler_info = ws()->GetFillerInfo(tensor.name()); \
auto* filler_info = workspace()->GetFillerInfo(tensor.name()); \
CHECK(filler_info) << "\nTensor(" << tensor.name() << ") is empty.\n" \
<< "May be specify a filler for it?"; \
tensor.Reshape(shape); \
......@@ -362,7 +362,7 @@ DEFINE_TENSOR_TYPES_DISPATCHER(DoRunWithType);
#define TENSOR_FILL(tensor, shape) \
if (tensor.count() == 0) { \
auto* filler_info = ws()->GetFillerInfo(tensor.name()); \
auto* filler_info = workspace()->GetFillerInfo(tensor.name()); \
CHECK(filler_info) << "\nTensor(" << tensor.name() << ") is empty.\n" \
<< "May be specify a filler for it?"; \
tensor.Reshape(shape); \
......@@ -413,7 +413,7 @@ DEFINE_TENSOR_TYPES_DISPATCHER(DoRunWithType);
template <class Context> \
type classname<Context>::arg() { \
if (arg##_desc_.empty()) return arg##_; \
auto* arg##_tensor = ws()->GetTensor( \
auto* arg##_tensor = workspace()->GetTensor( \
str::replace_first(arg##_desc_, "${HANDLE}", handle())); \
CHECK_EQ(arg##_tensor->count(), 1) \
<< "\nThe argument <" << #arg << "> should be a scalar."; \
......@@ -423,35 +423,35 @@ DEFINE_TENSOR_TYPES_DISPATCHER(DoRunWithType);
return arg##_tensor->template data<type, CPUContext>()[0]; \
}
#define DEFINE_OP_REPEATED_ARG_WITH_DESC(type, classname, arg) \
template <class Context> \
type classname<Context>::arg(int i, int* num) { \
const type* data; \
string desc; \
if (!arg##_desc_.empty()) { \
desc = arg##_desc_; \
} else if (!arg##_descs_.empty()) { \
desc = arg##_descs_[i]; \
} \
if (!desc.empty()) { \
auto* arg##_tensor = \
ws()->GetTensor(str::replace_first(desc, "${HANDLE}", handle())); \
CHECK(arg##_tensor->template IsType<type>()) \
<< "\nThe type of argument <" << #arg << "> should be " \
<< types::to_string<type>() << "."; \
data = arg##_tensor->template data<type, CPUContext>(); \
if (num != nullptr) { \
*num = arg##_desc_.empty() ? (int)arg##_descs_.size() \
: (int)arg##_tensor->size(); \
} \
} else { \
data = arg##_.data(); \
if (num != nullptr) { \
*num = (int)arg##_.size(); \
} \
} \
if (num != nullptr && (*num) == 0) return type(0); \
return arg##_descs_.empty() ? data[i] : data[0]; \
#define DEFINE_OP_REPEATED_ARG_WITH_DESC(type, classname, arg) \
template <class Context> \
type classname<Context>::arg(int i, int* num) { \
const type* data; \
string desc; \
if (!arg##_desc_.empty()) { \
desc = arg##_desc_; \
} else if (!arg##_descs_.empty()) { \
desc = arg##_descs_[i]; \
} \
if (!desc.empty()) { \
auto* arg##_tensor = workspace()->GetTensor( \
str::replace_first(desc, "${HANDLE}", handle())); \
CHECK(arg##_tensor->template IsType<type>()) \
<< "\nThe type of argument <" << #arg << "> should be " \
<< types::to_string<type>() << "."; \
data = arg##_tensor->template data<type, CPUContext>(); \
if (num != nullptr) { \
*num = arg##_desc_.empty() ? (int)arg##_descs_.size() \
: (int)arg##_tensor->size(); \
} \
} else { \
data = arg##_.data(); \
if (num != nullptr) { \
*num = (int)arg##_.size(); \
} \
} \
if (num != nullptr && (*num) == 0) return type(0); \
return arg##_descs_.empty() ? data[i] : data[0]; \
}
#define CANONICALIZE_AXIS_WITH_TENSOR_AND_OFFSET(tensor, offset) \
......
......@@ -89,9 +89,9 @@ class DRAGON_API Workspace {
template <class Context>
vector<void*> data(const vector<size_t>& segments) {
vector<void*> group(segments.size());
auto total_bytes = std::accumulate(segments.begin(), segments.end(), 0);
group[0] = CreateTensor("/share/data")
->Reshape({(int64_t)total_bytes})
->Reshape({(int64_t)std::accumulate(
segments.begin(), segments.end(), size_t(0))})
->template mutable_data<uint8_t, Context>();
for (int i = 1; i < segments.size(); ++i) {
group[i] = (uint8_t*)group[i - 1] + segments[i - 1];
......
......@@ -8,7 +8,7 @@ namespace kernel {
namespace {
template <typename T>
void _Affine(
void _ChannelAffine(
const int outer_dim,
const int axis_dim,
const T* x,
......@@ -29,7 +29,7 @@ void _Affine(
}
template <typename T>
void _Affine(
void _ChannelAffine(
const int outer_dim,
const int axis_dim,
const int inner_dim,
......@@ -57,7 +57,7 @@ void _Affine(
/* ------------------- Launcher Separator ------------------- */
template <>
void Affine<float16, CPUContext>(
void ChannelAffine<float16, CPUContext>(
const int outer_dim,
const int axis_dim,
const int inner_dim,
......@@ -69,22 +69,22 @@ void Affine<float16, CPUContext>(
CPU_FP16_NOT_SUPPORTED;
}
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void Affine<T, CPUContext>( \
const int outer_dim, \
const int axis_dim, \
const int inner_dim, \
const T* x, \
const T* w, \
const T* b, \
T* y, \
CPUContext* ctx) { \
if (inner_dim == 1) { \
_Affine(outer_dim, axis_dim, x, w, b, y); \
} else { \
_Affine(outer_dim, axis_dim, inner_dim, x, w, b, y); \
} \
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void ChannelAffine<T, CPUContext>( \
const int outer_dim, \
const int axis_dim, \
const int inner_dim, \
const T* x, \
const T* w, \
const T* b, \
T* y, \
CPUContext* ctx) { \
if (inner_dim == 1) { \
_ChannelAffine(outer_dim, axis_dim, x, w, b, y); \
} else { \
_ChannelAffine(outer_dim, axis_dim, inner_dim, x, w, b, y); \
} \
}
DEFINE_KERNEL_LAUNCHER(int8_t);
......@@ -93,7 +93,6 @@ DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
......
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
namespace kernel {
namespace {
template <typename T>
__global__ void _ChannelAffine(
const int nthreads,
const int axis_dim,
const int inner_dim,
const T* x,
const T* w,
T* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 350
y[i] = x[i] * __ldg(w + (i / inner_dim) % axis_dim);
#else
y[i] = x[i] * w[(i / inner_dim) % axis_dim];
#endif
}
}
template <>
__global__ void _ChannelAffine<half>(
const int nthreads,
const int axis_dim,
const int inner_dim,
const half* x,
const half* w,
half* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530
y[i] = __hmul(x[i], __ldg(w + (i / inner_dim) % axis_dim));
#elif __CUDA_ARCH__ >= 350
y[i] = __float2half(
__half2float(x[i]) *
__half2float(__ldg(w + (i / inner_dim) % axis_dim)));
#else
y[i] = __float2half(
__half2float(x[i]) * __half2float(w[(i / inner_dim) % axis_dim]));
#endif
}
}
template <typename T>
__global__ void _ChannelAffine(
const int nthreads,
const int axis_dim,
const int inner_dim,
const T* x,
const T* w,
const T* b,
T* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
const int wi = (i / inner_dim) % axis_dim;
#if __CUDA_ARCH__ >= 350
y[i] = x[i] * __ldg(w + wi) + __ldg(b + wi);
#else
y[i] = x[i] * w[wi] + b[wi];
#endif
}
}
template <>
__global__ void _ChannelAffine<half>(
const int nthreads,
const int axis_dim,
const int inner_dim,
const half* x,
const half* w,
const half* b,
half* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
const int wi = (i / inner_dim) % axis_dim;
#if __CUDA_ARCH__ >= 530
y[i] = __hfma(x[i], __ldg(w + wi), __ldg(b + wi));
#elif __CUDA_ARCH__ >= 350
y[i] = __float2half(fmaf(
__half2float(x[i]),
__half2float(__ldg(w + wi)),
__half2float(__ldg(b + wi))));
#else
y[i] = __float2half(
fmaf(__half2float(x[i]), __half2float(w[wi]), __half2float(b[wi])));
#endif
}
}
template <>
__global__ void _ChannelAffine<float>(
const int nthreads,
const int axis_dim,
const int inner_dim,
const float* x,
const float* w,
const float* b,
float* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
const int wi = (i / inner_dim) % axis_dim;
#if __CUDA_ARCH__ >= 350
y[i] = fmaf(x[i], __ldg(w + wi), __ldg(b + wi));
#else
y[i] = fmaf(x[i], w[wi], b[wi]);
#endif
}
}
template <>
__global__ void _ChannelAffine<double>(
const int nthreads,
const int axis_dim,
const int inner_dim,
const double* x,
const double* w,
const double* b,
double* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
const int wi = (i / inner_dim) % axis_dim;
#if __CUDA_ARCH__ >= 350
y[i] = fma(x[i], __ldg(w + wi), __ldg(b + wi));
#else
y[i] = fma(x[i], w[wi], b[wi]);
#endif
}
}
} // namespace
/* ------------------- Launcher Separator ------------------- */
template <>
void ChannelAffine<float16, CUDAContext>(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const float16* x,
const float16* w,
const float16* b,
float16* y,
CUDAContext* ctx) {
const int nthreads = outer_dim * axis_dim * inner_dim;
if (b != nullptr) {
_ChannelAffine<<<
CUDA_BLOCKS(nthreads),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(
nthreads,
axis_dim,
inner_dim,
reinterpret_cast<const half*>(x),
reinterpret_cast<const half*>(w),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
} else {
_ChannelAffine<<<
CUDA_BLOCKS(nthreads),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(
nthreads,
axis_dim,
inner_dim,
reinterpret_cast<const half*>(x),
reinterpret_cast<const half*>(w),
reinterpret_cast<half*>(y));
}
}
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void ChannelAffine<T, CUDAContext>( \
const int outer_dim, \
const int axis_dim, \
const int inner_dim, \
const T* x, \
const T* w, \
const T* b, \
T* y, \
CUDAContext* ctx) { \
const int nthreads = outer_dim * axis_dim * inner_dim; \
if (b != nullptr) { \
_ChannelAffine<<< \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>(nthreads, axis_dim, inner_dim, x, w, b, y); \
} else { \
_ChannelAffine<<< \
CUDA_BLOCKS(nthreads), \
CUDA_THREADS, \
0, \
ctx->cuda_stream()>>>(nthreads, axis_dim, inner_dim, x, w, y); \
} \
}
DEFINE_KERNEL_LAUNCHER(int8_t);
DEFINE_KERNEL_LAUNCHER(uint8_t);
DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
} // namespace dragon
#endif // USE_CUDA
......@@ -12,18 +12,12 @@ void _Flagged(
const int count,
const uint8_t* mask,
IndexType* index,
int* num_selected,
void* scratch,
size_t& scratch_size) {
if (scratch_size <= 0) {
scratch_size = size_t(1);
} else {
IndexType* offset_index = index;
for (int i = 0; i < count; ++i) {
if (mask[i]) *(offset_index++) = i;
}
num_selected[0] = std::distance(index, offset_index);
int* num_selected) {
IndexType* offset_index = index;
for (int i = 0; i < count; ++i) {
if (mask[i]) *(offset_index++) = i;
}
num_selected[0] = std::distance(index, offset_index);
}
template <typename IndexType, typename CoordType>
......@@ -45,17 +39,15 @@ void _UnravelIndex(
} // namespace
#define DEFINE_KERNEL_LAUNCHER(IndexType) \
template <> \
void Flagged<IndexType, CPUContext>( \
const int count, \
const uint8_t* mask, \
IndexType* index, \
int* num_selected, \
void* scratch, \
size_t& scratch_size, \
CPUContext* ctx) { \
_Flagged(count, mask, index, num_selected, scratch, scratch_size); \
#define DEFINE_KERNEL_LAUNCHER(IndexType) \
template <> \
void Flagged<IndexType, CPUContext>( \
const int count, \
const uint8_t* mask, \
IndexType* index, \
int* num_selected, \
CPUContext* ctx) { \
_Flagged(count, mask, index, num_selected); \
}
DEFINE_KERNEL_LAUNCHER(int);
......
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/core/workspace.h"
#include "dragon/utils/device/common_cub.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
......@@ -31,48 +32,44 @@ __global__ void _UnravelIndex(
/* ------------------- Launcher Separator ------------------- */
#define DEFINE_KERNEL_LAUNCHER(IndexType) \
template <> \
void Flagged<IndexType, CUDAContext>( \
const int count, \
const uint8_t* mask, \
IndexType* index, \
int* num_selected, \
void* scratch, \
size_t& scratch_size, \
CUDAContext* ctx) { \
cub::CountingInputIterator<int> itr(0); \
if (scratch_size <= 0) { \
cub::DeviceSelect::Flagged( \
scratch, \
scratch_size, \
itr, \
mask, \
index, \
static_cast<int64_t*>(nullptr), \
count, \
ctx->cuda_stream()); \
} else { \
auto* num_selected_dev = index + count; \
cub::DeviceSelect::Flagged( \
scratch, \
scratch_size, \
itr, \
mask, \
index, \
num_selected_dev, \
count, \
ctx->cuda_stream()); \
IndexType num_selected_host; \
CUDA_CHECK(cudaMemcpyAsync( \
&num_selected_host, \
num_selected_dev, \
sizeof(IndexType), \
cudaMemcpyDefault, \
ctx->cuda_stream())); \
ctx->FinishDeviceComputation(); \
num_selected[0] = num_selected_host; \
} \
#define DEFINE_KERNEL_LAUNCHER(IndexType) \
template <> \
void Flagged<IndexType, CUDAContext>( \
const int count, \
const uint8_t* mask, \
IndexType* index, \
int* num_selected, \
CUDAContext* ctx) { \
IndexType num_selected_host; \
auto* num_selected_dev = index + count; \
size_t ws_nbytes = 0; \
cub::CountingInputIterator<int> itr(0); \
cub::DeviceSelect::Flagged( \
nullptr, \
ws_nbytes, \
itr, \
mask, \
index, \
static_cast<int64_t*>(nullptr), \
count, \
ctx->cuda_stream()); \
cub::DeviceSelect::Flagged( \
ctx->workspace()->template data<CUDAContext>({ws_nbytes})[0], \
ws_nbytes, \
itr, \
mask, \
index, \
num_selected_dev, \
count, \
ctx->cuda_stream()); \
CUDA_CHECK(cudaMemcpyAsync( \
&num_selected_host, \
num_selected_dev, \
sizeof(IndexType), \
cudaMemcpyDefault, \
ctx->cuda_stream())); \
ctx->FinishDeviceComputation(); \
num_selected[0] = num_selected_host; \
}
DEFINE_KERNEL_LAUNCHER(int);
......
......@@ -23,17 +23,42 @@ void _BroadcastLossGrad(
}
}
} // namespace
template <>
void ReduceLoss<float16, CPUContext>(
const int count,
const int num_masks,
const float normalizer,
const float16* x,
const float16* mask,
float16* y,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
template <>
void ReduceLossGrad<float16, CPUContext>(
const int count,
const int num_masks,
const float normalizer,
const float16* dy,
const float16* mask,
float16* dx,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
}
template <>
void _BroadcastLossGrad<float16>(
void BroadcastLossGrad<float16, CPUContext>(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const float16* dy,
float16* dx) {
float16* dx,
CPUContext* ctx) {
CPU_FP16_NOT_SUPPORTED;
} // BroadcastLossGrad
} // namespace
}
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
......@@ -42,11 +67,11 @@ void _BroadcastLossGrad<float16>(
const int num_masks, \
const float normalizer, \
const T* x, \
const int* mask, \
const T* mask, \
T* y, \
CPUContext* ctx) { \
float inv_scale = std::max( \
1e-5F, \
1.f, \
num_masks > 0 && normalizer < 0.f \
? (float)math::Sum(num_masks, 1.f, mask, ctx) \
: normalizer); \
......@@ -60,11 +85,11 @@ void _BroadcastLossGrad<float16>(
const int num_masks, \
const float normalizer, \
const T* dy, \
const int* mask, \
const T* mask, \
T* dx, \
CPUContext* ctx) { \
float inv_scale = std::max( \
1e-5F, \
0.5f, \
num_masks > 0 && normalizer < 0.f \
? (float)math::Sum(num_masks, 1.f, mask, ctx) \
: normalizer); \
......@@ -81,11 +106,9 @@ void _BroadcastLossGrad<float16>(
_BroadcastLossGrad(outer_dim, axis_dim, inner_dim, dy, dx); \
}
DEFINE_KERNEL_LAUNCHER(float16);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
DEFINE_GRAD_KERNEL_LAUNCHER(float16);
DEFINE_GRAD_KERNEL_LAUNCHER(float);
DEFINE_GRAD_KERNEL_LAUNCHER(double);
......
......@@ -16,17 +16,17 @@ void _NLLLoss(
const LogitType* log_prob,
const TargetType* target,
LogitType* loss,
int* mask) {
LogitType* mask) {
std::array<int, 2> idx = {0, 0};
std::array<int, 2> dims = {outer_dim, inner_dim};
int count = dims[0] * dims[1], k;
for (int i = 0; i < count; ++i) {
const int label = (int)target[i];
if (label == ignore_index) {
loss[i] = mask[i] = 0;
loss[i] = mask[i] = LogitType(0);
} else {
k = (idx[0] * axis_dim + label) * inner_dim + idx[1];
loss[i] = -log_prob[k], mask[i] = 1;
loss[i] = -log_prob[k], mask[i] = LogitType(1);
}
utils::math::IncreaseIndexInDims(2, dims.data(), idx.data());
}
......@@ -41,17 +41,17 @@ void _NLLLossGrad(
const LogitType* log_prob,
const TargetType* target,
LogitType* dx,
int* mask) {
LogitType* mask) {
std::array<int, 2> idx = {0, 0};
std::array<int, 2> dims = {outer_dim, inner_dim};
int count = dims[0] * dims[1], k;
for (int i = 0; i < count; ++i) {
const int label = (int)target[i];
if (label == ignore_index) {
mask[i] = 0;
mask[i] = LogitType(0);
} else {
k = (idx[0] * axis_dim + label) * inner_dim + idx[1];
dx[k] = LogitType(-1), mask[i] = 1;
dx[k] = LogitType(-1), mask[i] = LogitType(1);
}
utils::math::IncreaseIndexInDims(2, dims.data(), idx.data());
}
......@@ -71,7 +71,7 @@ void _NLLLossGrad(
const LogitType* log_prob, \
const TargetType* target, \
LogitType* loss, \
int* mask, \
LogitType* mask, \
CPUContext* ctx) { \
_##name( \
outer_dim, \
......
......@@ -18,16 +18,16 @@ __global__ void _NLLLoss(
const LogitType* log_prob,
const TargetType* target,
LogitType* loss,
int* mask) {
LogitType* mask) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int i = yi / inner_dim;
const int j = yi % inner_dim;
const int label = target[i * inner_dim + j];
if (label == ignore_index) {
loss[yi] = mask[yi] = 0;
loss[yi] = mask[yi] = LogitType(0);
} else {
loss[yi] = -log_prob[(i * axis_dim + label) * inner_dim + j];
mask[yi] = 1;
mask[yi] = LogitType(1);
}
}
}
......@@ -41,16 +41,16 @@ __global__ void _NLLLossGrad(
const LogitType* log_prob,
const TargetType* target,
LogitType* dx,
int* mask) {
LogitType* mask) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int i = yi / inner_dim;
const int j = yi % inner_dim;
const int label = target[i * inner_dim + j];
if (label == ignore_index) {
mask[yi] = 0;
mask[yi] = LogitType(0);
} else {
dx[(i * axis_dim + label) * inner_dim + j] = LogitType(-1);
mask[yi] = 1;
mask[yi] = LogitType(1);
}
}
}
......@@ -69,7 +69,7 @@ __global__ void _NLLLossGrad(
const LogitType* log_prob, \
const TargetType* target, \
LogitType* loss, \
int* mask, \
LogitType* mask, \
CUDAContext* ctx) { \
auto nthreads = outer_dim * inner_dim; \
_##name<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
......
......@@ -13,19 +13,19 @@ void _SigmoidCrossEntropy(
const T* logit,
const T* target,
T* loss,
int* mask) {
T* mask) {
#ifdef USE_OPENMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
if (target[i] < 0) {
loss[i] = mask[i] = 0;
loss[i] = mask[i] = T(0);
} else {
loss[i] =
std::log(
T(1) + std::exp(logit[i] - T(2) * logit[i] * (logit[i] >= 0))) +
logit[i] * ((logit[i] >= 0) - target[i]);
mask[i] = 1;
mask[i] = T(1);
}
}
}
......@@ -36,16 +36,16 @@ void _SigmoidCrossEntropyGrad(
const T* logit,
const T* target,
T* dx,
int* mask) {
T* mask) {
#ifdef USE_OPENMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
if (target[i] < 0) {
dx[i] = mask[i] = 0;
dx[i] = mask[i] = T(0);
} else {
dx[i] = T(1) / (T(1) + std::exp(-logit[i])) - target[i];
mask[i] = 1;
mask[i] = T(1);
}
}
}
......@@ -61,7 +61,7 @@ void _SigmoidCrossEntropyGrad(
const T* logit, \
const T* target, \
T* loss, \
int* mask, \
T* mask, \
CPUContext* ctx) { \
_##name(count, logit, target, loss, mask); \
}
......
......@@ -15,14 +15,14 @@ __global__ void _SigmoidCrossEntropy(
const T* logit,
const T* target,
T* loss,
int* mask) {
T* mask) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
if (target[i] < 0) {
loss[i] = mask[i] = 0;
loss[i] = mask[i] = T(0);
} else {
loss[i] = log(T(1) + exp(logit[i] - T(2) * logit[i] * (logit[i] >= 0))) +
logit[i] * ((logit[i] >= 0) - target[i]);
mask[i] = 1;
mask[i] = T(1);
}
}
}
......@@ -33,13 +33,13 @@ __global__ void _SigmoidCrossEntropyGrad(
const T* logit,
const T* target,
T* dx,
int* mask) {
T* mask) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
if (target[i] < 0) {
dx[i] = mask[i] = 0;
dx[i] = mask[i] = T(0);
} else {
dx[i] = T(1) / (T(1) + exp(-logit[i])) - target[i];
mask[i] = 1;
mask[i] = T(1);
}
}
}
......@@ -55,7 +55,7 @@ __global__ void _SigmoidCrossEntropyGrad(
const T* logit, \
const T* target, \
T* loss, \
int* mask, \
T* mask, \
CUDAContext* ctx) { \
_##name<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
count, logit, target, loss, mask); \
......
......@@ -19,7 +19,7 @@ void _SigmoidFocalLoss(
const LogitType* logit,
const TargetType* target,
LogitType* loss,
int* mask) {
LogitType* mask) {
std::array<int, 3> idx = {0, 0, 0};
std::array<int, 3> dims = {outer_dim, axis_dim, inner_dim};
const int count = dims[0] * dims[1] * dims[2];
......@@ -64,7 +64,7 @@ void _SigmoidFocalLossGrad(
const LogitType* logit,
const TargetType* target,
LogitType* dx,
int* mask) {
LogitType* mask) {
std::array<int, 3> idx = {0, 0, 0};
std::array<int, 3> dims = {outer_dim, axis_dim, inner_dim};
const int count = dims[0] * dims[1] * dims[2];
......@@ -117,7 +117,7 @@ void _SigmoidFocalLossGrad(
const LogitType* logit, \
const TargetType* target, \
LogitType* loss, \
int* mask, \
LogitType* mask, \
CPUContext* ctx) { \
_##name( \
outer_dim, \
......
......@@ -21,7 +21,7 @@ __global__ void _SigmoidFocalLoss(
const LogitType* logit,
const TargetType* target,
LogitType* loss,
int* mask) {
LogitType* mask) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int j = yi % inner_dim;
const int k = (yi / inner_dim) % axis_dim;
......@@ -62,7 +62,7 @@ __global__ void _SigmoidFocalLossGrad(
const LogitType* logit,
const TargetType* target,
LogitType* dx,
int* mask) {
LogitType* mask) {
CUDA_1D_KERNEL_LOOP(xi, nthreads) {
const int j = xi % inner_dim;
const int k = (xi / inner_dim) % axis_dim;
......@@ -111,7 +111,7 @@ __global__ void _SigmoidFocalLossGrad(
const LogitType* logit, \
const TargetType* target, \
LogitType* loss, \
int* mask, \
LogitType* mask, \
CUDAContext* ctx) { \
const int nthreads = outer_dim * axis_dim * inner_dim; \
_##name<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
......
......@@ -16,18 +16,18 @@ void _SparseSoftmaxCrossEntropy(
const LogitType* prob,
const TargetType* target,
LogitType* loss,
int* mask) {
LogitType* mask) {
std::array<int, 2> idx = {0, 0};
std::array<int, 2> dims = {outer_dim, inner_dim};
int count = dims[0] * dims[1], k;
for (int i = 0; i < count; ++i) {
const int label = (int)target[i];
if (label == ignore_index) {
loss[i] = mask[i] = 0;
loss[i] = mask[i] = LogitType(0);
} else {
k = (idx[0] * axis_dim + label) * inner_dim + idx[1];
loss[i] = -std::log(std::max(prob[k], LogitType(FLT_MIN)));
mask[i] = 1;
mask[i] = LogitType(1);
}
utils::math::IncreaseIndexInDims(2, dims.data(), idx.data());
}
......@@ -42,7 +42,7 @@ void _SparseSoftmaxCrossEntropyGrad(
const LogitType* prob,
const TargetType* target,
LogitType* dx,
int* mask) {
LogitType* mask) {
std::array<int, 2> idx = {0, 0};
std::array<int, 2> dims = {outer_dim, inner_dim};
int count = dims[0] * dims[1], k;
......@@ -54,11 +54,11 @@ void _SparseSoftmaxCrossEntropyGrad(
(*offset_dx) = LogitType(0);
offset_dx += inner_dim;
}
mask[i] = 0;
mask[i] = LogitType(0);
} else {
k = (idx[0] * axis_dim + label) * inner_dim + idx[1];
dx[k] -= LogitType(1);
mask[i] = 1;
mask[i] = LogitType(1);
}
utils::math::IncreaseIndexInDims(2, dims.data(), idx.data());
}
......@@ -78,7 +78,7 @@ void _SparseSoftmaxCrossEntropyGrad(
const LogitType* prob, \
const TargetType* target, \
LogitType* loss, \
int* mask, \
LogitType* mask, \
CPUContext* ctx) { \
_##name( \
outer_dim, \
......
......@@ -18,17 +18,17 @@ __global__ void _SparseSoftmaxCrossEntropy(
const LogitType* prob,
const TargetType* target,
LogitType* loss,
int* mask) {
LogitType* mask) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int i = yi / inner_dim;
const int j = yi % inner_dim;
const int label = target[i * inner_dim + j];
if (label == ignore_index) {
loss[yi] = mask[yi] = 0;
loss[yi] = mask[yi] = LogitType(0);
} else {
loss[yi] = -log(max(
prob[(i * axis_dim + label) * inner_dim + j], LogitType(FLT_MIN)));
mask[yi] = 1;
mask[yi] = LogitType(1);
}
}
}
......@@ -42,7 +42,7 @@ __global__ void _SparseSoftmaxCrossEntropyGrad(
const LogitType* prob,
const TargetType* target,
LogitType* dx,
int* mask) {
LogitType* mask) {
CUDA_1D_KERNEL_LOOP(yi, nthreads) {
const int i = yi / inner_dim;
const int j = yi % inner_dim;
......@@ -53,10 +53,10 @@ __global__ void _SparseSoftmaxCrossEntropyGrad(
(*offset_dx) = LogitType(0);
offset_dx += inner_dim;
}
mask[yi] = 0;
mask[yi] = LogitType(0);
} else {
dx[(i * axis_dim + label) * inner_dim + j] -= LogitType(1);
mask[yi] = 1;
mask[yi] = LogitType(1);
}
}
}
......@@ -75,7 +75,7 @@ __global__ void _SparseSoftmaxCrossEntropyGrad(
const LogitType* prob, \
const TargetType* target, \
LogitType* loss, \
int* mask, \
LogitType* mask, \
CUDAContext* ctx) { \
const int nthreads = outer_dim * inner_dim; \
_##name<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
......
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
namespace kernel {
namespace {
template <typename T>
__global__ void _Affine(
const int nthreads,
const int axis_dim,
const int inner_dim,
const T* x,
const T* w,
T* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 350
y[i] = __ldg(w + (i / inner_dim) % axis_dim) * x[i];
#else
y[i] = w[(i / inner_dim) % axis_dim] * x[i];
#endif
}
}
template <>
__global__ void _Affine<half>(
const int nthreads,
const int axis_dim,
const int inner_dim,
const half* x,
const half* w,
half* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530
y[i] = __hmul(x[i], __ldg(w + (i / inner_dim) % axis_dim));
#endif
}
}
template <typename T>
__global__ void _Affine(
const int nthreads,
const int axis_dim,
const int inner_dim,
const T* x,
const T* w,
const T* b,
T* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
const int wi = (i / inner_dim) % axis_dim;
#if __CUDA_ARCH__ >= 350
y[i] = __ldg(w + wi) * x[i] + __ldg(b + wi);
#else
y[i] = w[wi] * x[i] + b[wi];
#endif
}
}
template <>
__global__ void _Affine<half>(
const int nthreads,
const int axis_dim,
const int inner_dim,
const half* x,
const half* w,
const half* b,
half* y) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
#if __CUDA_ARCH__ >= 530
const int wi = (i / inner_dim) % axis_dim;
y[i] = __hadd(__hmul(x[i], __ldg(w + wi)), __ldg(b + wi));
#endif
}
}
} // namespace
/* ------------------- Launcher Separator ------------------- */
template <>
void Affine<float16, CUDAContext>(
const int outer_dim,
const int axis_dim,
const int inner_dim,
const float16* x,
const float16* w,
const float16* b,
float16* y,
CUDAContext* ctx) {
const int nthreads = outer_dim * axis_dim * inner_dim;
if (b != nullptr) {
_Affine<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
nthreads,
axis_dim,
inner_dim,
reinterpret_cast<const half*>(x),
reinterpret_cast<const half*>(w),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
} else {
_Affine<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
nthreads,
axis_dim,
inner_dim,
reinterpret_cast<const half*>(x),
reinterpret_cast<const half*>(w),
reinterpret_cast<half*>(y));
}
}
#define DEFINE_KERNEL_LAUNCHER(T) \
template <> \
void Affine<T, CUDAContext>( \
const int outer_dim, \
const int axis_dim, \
const int inner_dim, \
const T* x, \
const T* w, \
const T* b, \
T* y, \
CUDAContext* ctx) { \
const int nthreads = outer_dim * axis_dim * inner_dim; \
if (b != nullptr) { \
_Affine<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
nthreads, axis_dim, inner_dim, x, w, b, y); \
} else { \
_Affine<<<CUDA_BLOCKS(nthreads), CUDA_THREADS, 0, ctx->cuda_stream()>>>( \
nthreads, axis_dim, inner_dim, x, w, y); \
} \
}
DEFINE_KERNEL_LAUNCHER(int8_t);
DEFINE_KERNEL_LAUNCHER(uint8_t);
DEFINE_KERNEL_LAUNCHER(int);
DEFINE_KERNEL_LAUNCHER(int64_t);
DEFINE_KERNEL_LAUNCHER(float);
DEFINE_KERNEL_LAUNCHER(double);
#undef DEFINE_KERNEL_LAUNCHER
} // namespace kernel
} // namespace dragon
#endif // USE_CUDA
......@@ -9,7 +9,11 @@ namespace dragon {
namespace kernel {
#if __CUDA_ARCH__ >= 350
#define L(x, i) __ldg(x + i)
#else
#define L(x, i) x[i]
#endif
namespace {
......@@ -30,13 +34,8 @@ __global__ void _BatchNormExpectation(
CUDA_2D_KERNEL_LOOP2(j, outer_dim) {
const int xi = kOrder == StorageOrder::NCHW ? (j / S * C + i) * S + j % S
: j * C + i;
#if __CUDA_ARCH__ >= 350
ex_val += __ldg(x + xi);
ex2_val += __ldg(x + xi) * __ldg(x + xi);
#else
ex_val += x[xi];
ex2_val += x[xi] * x[xi];
#endif
ex_val += L(x, xi);
ex2_val += utils::math::Square(L(x, xi));
}
ex_val = BlockReduce<Tp>(ex_storage).Reduce(ex_val, cub::Sum());
ex2_val = BlockReduce<Tp>(ex2_storage).Reduce(ex2_val, cub::Sum());
......@@ -67,13 +66,8 @@ __global__ void _BatchNormInternalGrad(
CUDA_2D_KERNEL_LOOP2(j, outer_dim) {
const int xi = kOrder == StorageOrder::NCHW ? (j / S * C + i) * S + j % S
: j * C + i;
#if __CUDA_ARCH__ >= 350
dg_val += L(dy, xi) * (L(x, xi) - L(mu, i)) * L(rsig, i);
db_val += L(dy, xi);
#else
dg_val += dy[xi] * (x[xi] - mu[i]) * rsig[i];
db_val += dy[xi];
#endif
}
dg_val = BlockReduce<Tp>(dg_storage).Reduce(dg_val, cub::Sum());
db_val = BlockReduce<Tp>(db_storage).Reduce(db_val, cub::Sum());
......@@ -101,15 +95,9 @@ __global__ void _BatchNormTrainingGrad(
const Tp denom = Tp(1) / Tp(N * S);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
const int pi = kOrder == StorageOrder::NCHW ? (i / S) % C : i % C;
#if __CUDA_ARCH__ >= 350
const Tp x_norm = (L(x, i) - L(mu, pi)) * L(rsig, pi);
dx[i] = L(gamma, pi) * L(rsig, pi) *
(L(dy, i) - (x_norm * L(dgamma, pi) + L(dbeta, pi)) * denom);
#else
const Tp x_norm = (x[i] - mu[pi]) * rsig[pi];
dx[i] = gamma[pi] * rsig[pi] *
(dy[i] - (x_norm * dgamma[pi] + dbeta[pi]) * denom);
#endif
(L(dy, i) - fma(x_norm, L(dgamma, pi), L(dbeta, pi)) * denom);
}
}
......@@ -132,13 +120,8 @@ __global__ void _BatchNormWGrad(
CUDA_2D_KERNEL_LOOP2(j, outer_dim) {
const int xi = kOrder == StorageOrder::NCHW ? (j / S * C + i) * S + j % S
: j * C + i;
#if __CUDA_ARCH__ >= 350
dg_val += L(dy, xi) * (L(x, xi) - L(mu, i)) * L(rsig, i);
db_val += L(dy, xi);
#else
dg_val += dy[xi] * (x[xi] - mu[i]) * rsig[i];
db_val += dy[xi];
#endif
}
dg_val = BlockReduce<Tp>(dg_storage).Reduce(dg_val, cub::Sum());
db_val = BlockReduce<Tp>(db_storage).Reduce(db_val, cub::Sum());
......@@ -160,11 +143,7 @@ __global__ void _BatchNormInferenceGrad(
Tx* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
const int pi = kOrder == StorageOrder::NCHW ? (i / S) % C : i % C;
#if __CUDA_ARCH__ >= 350
dx[i] = L(gamma, pi) * L(dy, i) * L(rsig, pi);
#else
dx[i] = gamma[pi] * dy[i] * rsig[pi];
#endif
}
}
......
......@@ -9,8 +9,13 @@ namespace dragon {
namespace kernel {
#if __CUDA_ARCH__ >= 350
#define L(x, i) __ldg(x + i)
#define LF(x, i) __half2float(__ldg(x + i))
#else
#define L(x, i) x[i]
#define LF(x, i) __half2float(x[i])
#endif
namespace {
......@@ -28,25 +33,14 @@ __global__ void _GroupNormFusedParams(
const int outer_dim = N * G;
CUDA_2D_KERNEL_LOOP1(i, outer_dim) {
const int g = i % G;
#if __CUDA_ARCH__ >= 350
const T mu_val = L(mu, i);
const T rsig_val = L(rsig, i);
#else
const T mu_val = mu[i];
const T rsig_val = rsig[i];
#endif
CUDA_2D_KERNEL_LOOP2(j, D) {
const int wi = i * D + j;
const int gi = g * D + j;
#if __CUDA_ARCH__ >= 350
const T w = L(gamma, gi) * rsig_val;
scale[wi] = w;
bias[wi] = L(beta, gi) - w * mu_val;
#else
const T w = gamma[gi] * rsig_val;
scale[wi] = w;
bias[wi] = beta[gi] - w * mu_val;
#endif
bias[wi] = fma(-w, mu_val, L(beta, gi));
}
}
}
......@@ -62,20 +56,11 @@ __global__ void _GroupNormForwardNCHW(
Tx* y) {
const int outer_dim = N * C;
CUDA_2D_KERNEL_LOOP1(i, outer_dim) {
#if __CUDA_ARCH__ >= 350
const Tp w = L(scale, i);
const Tp b = L(bias, i);
#else
const Tp w = scale[i];
const Tp b = bias[i];
#endif
CUDA_2D_KERNEL_LOOP2(j, S) {
const int xi = i * S + j;
#if __CUDA_ARCH__ >= 350
y[xi] = L(x, xi) * w + b;
#else
y[xi] = x[xi] * w + b;
#endif
y[xi] = fma(L(x, xi), w, b);
}
}
}
......@@ -89,17 +74,15 @@ __global__ void _GroupNormForwardNCHW<half, float>(
const float* scale,
const float* bias,
half* y) {
#if __CUDA_ARCH__ >= 530
const int outer_dim = N * C;
CUDA_2D_KERNEL_LOOP1(i, outer_dim) {
const float w = L(scale, i);
const float b = L(bias, i);
CUDA_2D_KERNEL_LOOP2(j, S) {
const int xi = i * S + j;
y[xi] = __float2half(LF(x, xi) * w + b);
y[xi] = __float2half(fmaf(LF(x, xi), w, b));
}
}
#endif
}
template <typename Tx, typename Tp>
......@@ -117,11 +100,7 @@ __global__ void _GroupNormForwardNHWC(
CUDA_2D_KERNEL_LOOP2(j, C) {
const int xi = i * C + j;
const int wi = n * C + j;
#if __CUDA_ARCH__ >= 350
y[xi] = L(x, xi) * L(scale, wi) + L(bias, wi);
#else
y[xi] = x[xi] * scale[wi] + bias[wi];
#endif
y[xi] = fma(L(x, xi), L(scale, wi), L(bias, wi));
}
}
}
......@@ -135,17 +114,15 @@ __global__ void _GroupNormForwardNHWC<half, float>(
const float* scale,
const float* bias,
half* y) {
#if __CUDA_ARCH__ >= 530
const int outer_dim = N * S;
CUDA_2D_KERNEL_LOOP1(i, outer_dim) {
const int n = i / S;
CUDA_2D_KERNEL_LOOP2(j, C) {
const int xi = i * C + j;
const int wi = n * C + j;
y[xi] = __float2half(LF(x, xi) * L(scale, wi) + L(bias, wi));
y[xi] = __float2half(fmaf(LF(x, xi), L(scale, wi), L(bias, wi)));
}
}
#endif
}
template <typename Tx, typename Tp, StorageOrder kOrder>
......@@ -172,13 +149,8 @@ __global__ void _GroupNormWGrad(
? (n * outer_dim + i) * S + j % S
: j * outer_dim + i;
const int mi = n * G + i / D;
#if __CUDA_ARCH__ >= 350
dg_val += L(dy, xi) * (L(x, xi) - L(mu, mi)) * L(rsig, mi);
db_val += L(dy, xi);
#else
dg_val += dy[xi] * (x[xi] - mu[mi]) * rsig[mi];
db_val += dy[xi];
#endif
}
dg_val = BlockReduce<Tp>(dg_storage).Reduce(dg_val, cub::Sum());
db_val = BlockReduce<Tp>(db_storage).Reduce(db_val, cub::Sum());
......@@ -201,7 +173,6 @@ __global__ void _GroupNormWGradHalf(
const half* dy,
float* dgamma,
float* dbeta) {
#if __CUDA_ARCH__ >= 530
const int outer_dim = G * D;
const int inner_dim = N * S;
__shared__ typename BlockReduce<float>::TempStorage dg_storage;
......@@ -224,7 +195,6 @@ __global__ void _GroupNormWGradHalf(
dbeta[i] = db_val;
}
}
#endif
}
template <typename Tx, typename Tp, StorageOrder kOrder>
......@@ -249,13 +219,8 @@ __global__ void _GroupNormInternalGrad(
const int xi = kOrder == StorageOrder::NCHW
? i * inner_dim + j
: (i / G * S + j % S) * G * D + gi;
#if __CUDA_ARCH__ >= 350
ds_val += L(gamma, gi) * L(dy, xi) * L(x, xi);
db_val += L(gamma, gi) * L(dy, xi);
#else
ds_val += gamma[gi] * dy[xi] * x[xi];
db_val += gamma[gi] * dy[xi];
#endif
}
ds_val = BlockReduce<Tp>(ds_storage).Reduce(ds_val, cub::Sum());
db_val = BlockReduce<Tp>(db_storage).Reduce(db_val, cub::Sum());
......@@ -277,7 +242,6 @@ __global__ void _GroupNormInternalGradHalf(
const half* dy,
float* ds,
float* db) {
#if __CUDA_ARCH__ >= 530
const int outer_dim = N * G;
const int inner_dim = D * S;
__shared__ typename BlockReduce<float>::TempStorage ds_storage;
......@@ -299,7 +263,6 @@ __global__ void _GroupNormInternalGradHalf(
db[i] = db_val;
}
}
#endif
}
template <typename Tx, typename Tp, StorageOrder kOrder>
......@@ -322,17 +285,10 @@ __global__ void _GroupNormGrad(
const int mi = kOrder == StorageOrder::NCHW ? i / (D * S)
: i / (C * S) * G + (i / D % G);
const int gi = kOrder == StorageOrder::NCHW ? (i / S) % C : i % C;
#if __CUDA_ARCH__ >= 350
const Tp u = (L(db, mi) * L(mu, mi) - L(ds, mi)) * (L(x, i) - L(mu, mi)) *
const Tp u = fma(L(db, mi), L(mu, mi), -L(ds, mi)) * (L(x, i) - L(mu, mi)) *
utils::math::Cube(L(rsig, mi));
const Tp v = L(db, mi) * L(rsig, mi);
dx[i] = L(gamma, gi) * L(dy, i) * L(rsig, mi) + (u - v) * denom;
#else
const Tp u = (db[mi] * mu[mi] - ds[mi]) * (x[i] - mu[mi]) *
utils::math::Cube(rsig[mi]);
const Tp v = db[mi] * rsig[mi];
dx[i] = gamma[gi] * dy[i] * rsig[mi] + (u - v) * denom;
#endif
}
}
......@@ -350,20 +306,18 @@ __global__ void _GroupNormGradHalf(
const float* db,
const half* dy,
half* dx) {
#if __CUDA_ARCH__ >= 530
const int C = G * D;
const float denom = 1.f / float(D * S);
CUDA_1D_KERNEL_LOOP(i, nthreads) {
const int mi = kOrder == StorageOrder::NCHW ? i / (D * S)
: i / (C * S) * G + (i / D % G);
const int gi = kOrder == StorageOrder::NCHW ? (i / S) % C : i % C;
const float u = (L(db, mi) * L(mu, mi) - L(ds, mi)) *
const float u = fmaf(L(db, mi), L(mu, mi), -L(ds, mi)) *
(LF(x, i) - L(mu, mi)) * utils::math::Cube(L(rsig, mi));
const float v = L(db, mi) * L(rsig, mi);
dx[i] =
__float2half(L(gamma, gi) * LF(dy, i) * L(rsig, mi) + (u - v) * denom);
}
#endif
}
} // namespace
......
#include "dragon/utils/cast.h"
#include "dragon/utils/omp_utils.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
namespace kernel {
template <>
void MixedPrecL2Penalty<float16, CPUContext>(
const int count,
const float alpha,
const float16* x,
float* dx,
CPUContext* ctx) {
#ifdef USE_OPENMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
dx[i] += (cast::to<float>(x[i]) * alpha);
}
}
template <>
void MixedPrecUpdate<float16, CPUContext>(
const int count,
const float* dx,
float16* x,
CPUContext* ctx) {
#ifdef USE_OPENMP
#pragma omp parallel for num_threads(OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) {
x[i] = cast::to<float16>(cast::to<float>(x[i]) - dx[i]);
}
}
} // namespace kernel
} // namespace dragon
#ifdef USE_CUDA
#include "dragon/core/context_cuda.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
namespace kernel {
namespace {
__global__ void _MixedPrecL2Penalty(
const int nthreads,
const float alpha,
const half* x,
float* dx) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
dx[i] += __half2float(x[i]) * alpha;
}
}
__global__ void _MixedPrecUpdate(const int nthreads, const float* dx, half* x) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
x[i] = __float2half(__half2float(x[i]) - dx[i]);
}
}
} // namespace
/* ------------------- Launcher Separator ------------------- */
template <>
void MixedPrecL2Penalty<float16, CUDAContext>(
const int count,
const float alpha,
const float16* x,
float* dx,
CUDAContext* ctx) {
_MixedPrecL2Penalty<<<
CUDA_BLOCKS(count),
CUDA_THREADS,
0,
ctx->cuda_stream()>>>(count, alpha, reinterpret_cast<const half*>(x), dx);
}
template <>
void MixedPrecUpdate<float16, CUDAContext>(
const int count,
const float* dx,
float16* x,
CUDAContext* ctx) {
_MixedPrecUpdate<<<CUDA_BLOCKS(count), CUDA_THREADS, 0, ctx->cuda_stream()>>>(
count, dx, reinterpret_cast<half*>(x));
}
} // namespace kernel
} // namespace dragon
#endif // USE_CUDA
......@@ -11,11 +11,19 @@ namespace {
template <typename T>
__global__ void
_NesterovUpdate(const int nthreads, const T lr, const T momentum, T* g, T* m) {
_NesterovUpdate(const int nthreads, const T lr, const T momentum, T* g, T* m);
template <>
__global__ void _NesterovUpdate<float>(
const int nthreads,
const float lr,
const float momentum,
float* g,
float* m) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
T mi = m[i];
T mi_new = m[i] = momentum * mi + lr * g[i];
g[i] = (1 + momentum) * mi_new - momentum * mi;
float mi = m[i];
float mi_new = m[i] = momentum * mi + lr * g[i];
g[i] = fmaf(momentum, mi_new - mi, mi_new);
}
}
......
......@@ -94,11 +94,12 @@ void RegisterModule(py::module& m) {
});
/*! \brief Activate the CuDNN engine */
m.def("cudaEnableDNN", [](bool enabled, bool benchmark) {
m.def("cudaEnableDNN", [](bool enabled, bool benchmark, bool allow_tf32) {
#ifdef USE_CUDA
auto& cuda_objects = CUDAContext::objects();
cuda_objects.cudnn_enabled_ = enabled;
cuda_objects.cudnn_benchmark_ = benchmark;
cuda_objects.cudnn_allow_tf32_ = allow_tf32;
#endif
});
......
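
The binding above only records the new `allow_tf32` flag; its consumer is not part of the hunks in this commit. As a hedged sketch (assuming cuDNN v8, with `SetConvMathType` as a hypothetical helper), the flag could be applied to a convolution descriptor like this:

// Sketch only (assumes cuDNN v8): CUDNN_FMA_MATH disables TF32,
// while CUDNN_DEFAULT_MATH permits TF32 kernels on Ampere devices.
void SetConvMathType(cudnnConvolutionDescriptor_t conv_desc) {
  auto& cuda_objects = CUDAContext::objects();
  const auto math_type =
      cuda_objects.cudnn_allow_tf32_ ? CUDNN_DEFAULT_MATH : CUDNN_FMA_MATH;
  CUDNN_CHECK(cudnnSetConvolutionMathType(conv_desc, math_type));
}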
......@@ -40,7 +40,7 @@ void DropBlock2dOp<Context>::DoRunWithType() {
auto* scale = Buffer("scale")
->Reshape({})
->template mutable_data<float, CPUContext>();
auto scratches = ws()->template data<Context>({
auto scratches = ctx()->workspace()->template data<Context>({
X.dim(0) * seed_h * seed_w * sizeof(uint32_t), // seed points
X.count() * sizeof(int), // int32 mask for seed growing
});
......@@ -61,7 +61,7 @@ void DropBlock2dOp<Context>::DoRunWithType() {
(int*)scratches[1],
ctx());
// Convert to uint8 mask
kernel::Cast(X.count(), (int*)scratches[1], mask, ctx());
math::Cast(X.count(), (int*)scratches[1], mask, ctx());
// Count the number of zeros to compute scale factor
float normalizer = math::Sum(X.count(), 1.f, (int*)scratches[1], ctx());
scale[0] = (float)X.count() / std::max(normalizer, 1.f);
......
......@@ -20,7 +20,7 @@ void DropoutOp<Context>::DoRunWithType() {
X.template data<T, Context>(),
Buffer("mask")->template mutable_data<uint8_t, Context>(),
Y->ReshapeLike(X)->template mutable_data<T, Context>(),
ws()->template data<uint32_t, Context>({X.count()})[0],
ctx()->workspace()->template data<uint32_t, Context>({X.count()})[0],
ctx());
} else {
LOG(FATAL) << "Unknown Phase: " << phase();
......
......@@ -22,7 +22,7 @@ void CuDNNDropoutOp<Context>::DoRunWithType() {
CUDNN_CHECK(
cudnnDropoutGetStatesSize(ctx()->cudnn_handle(), &states_size));
std::lock_guard<std::mutex> lk(CUDAContext::mutex());
auto* X_states = ws()->CreateTensor(
auto* X_states = workspace()->CreateTensor(
"/share/cudnn/dropout:" + str::to(rng_seed_) + "/states");
if (X_states->count() > 0) {
CUDNN_CHECK(cudnnRestoreDropoutDescriptor(
......@@ -80,7 +80,7 @@ void CuDNNDropoutGradientOp<Context>::DoRunWithType() {
CUDNN_CHECK(
cudnnDropoutGetStatesSize(ctx()->cudnn_handle(), &states_size));
std::lock_guard<std::mutex> lk(CUDAContext::mutex());
auto* X_states = ws()->CreateTensor(
auto* X_states = workspace()->CreateTensor(
"/share/cudnn/dropout:" + str::to(rng_seed_) + "/states");
if (X_states->count() > 0) {
CUDNN_CHECK(cudnnRestoreDropoutDescriptor(
......
#include "dragon/operators/array/cast_op.h"
#include "dragon/core/workspace.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
#define ELIGIBLE_TENSOR_TYPES \
{ "bool", "int8", "uint8", "int32", "int64", "float16", "float32", "float64" }
#define DISPATCH_TYPE_TO(InputType, OutputType) \
if (dtype() == types::to_string<OutputType>()) { \
if (InputSize() != 0) { \
Output(0)->ReshapeLike(Input(0)); \
auto* x = Input(0).template data<InputType, Context>(); \
auto* y = Output(0)->template mutable_data<OutputType, Context>(); \
kernel::Cast(Input(0).count(), x, y, ctx()); \
} else { \
auto n = Output(0)->count(); \
auto* x = Output(0)->template data<InputType, Context>(); \
auto* scratch = ws()->template data<OutputType, Context>({n})[0]; \
kernel::Cast(n, x, scratch, ctx()); \
ctx()->FinishDeviceComputation(); \
auto* y = Output(0)->template mutable_data<OutputType, Context>(); \
math::Copy(n, scratch, y, ctx()); \
} \
return; \
#define DISPATCH_TYPE_TO(InputType, OutputType) \
if (dtype() == types::to_string<OutputType>()) { \
if (InputSize() != 0) { \
Output(0)->ReshapeLike(Input(0)); \
auto* x = Input(0).template data<InputType, Context>(); \
auto* y = Output(0)->template mutable_data<OutputType, Context>(); \
math::Cast(Input(0).count(), x, y, ctx()); \
} else { \
auto n = Output(0)->count(); \
auto* x = Output(0)->template data<InputType, Context>(); \
auto* scratch = \
ctx()->workspace()->template data<OutputType, Context>({n})[0]; \
math::Cast(n, x, scratch, ctx()); \
ctx()->FinishDeviceComputation(); \
auto* y = Output(0)->template mutable_data<OutputType, Context>(); \
math::Copy(n, scratch, y, ctx()); \
} \
return; \
}
#define DISPATCH_TYPE_TO_ALL(InputType) \
......
#include "dragon/operators/math/affine_op.h"
#include "dragon/operators/array/channel_affine_op.h"
#include "dragon/core/workspace.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
......@@ -19,7 +19,7 @@ namespace dragon {
template <class Context>
template <typename T>
void AffineOp<Context>::DoRunWithType() {
void ChannelAffineOp<Context>::DoRunWithType() {
auto &X = Input(0), &W = Input(1), *Y = Output(0, {0});
CANONICALIZE_AXES_WITH_TENSOR(X);
......@@ -37,7 +37,7 @@ void AffineOp<Context>::DoRunWithType() {
<< ", got " << Input(2).DimString() << ".";
}
kernel::Affine(
kernel::ChannelAffine(
X.count(0, axis),
X.count(axis, axis + num_axes),
X.count(axis + num_axes),
......@@ -49,21 +49,22 @@ void AffineOp<Context>::DoRunWithType() {
}
template <class Context>
void AffineOp<Context>::RunOnDevice() {
void ChannelAffineOp<Context>::RunOnDevice() {
DispatchHelper<NumericalTensorTypes>::Call(this, Input(0));
}
template <class Context>
template <typename T>
void AffineGradientOp<Context>::DoRunWithType() {
void ChannelAffineGradientOp<Context>::DoRunWithType() {
auto &X = Input(0), &W = Input(1), &dY = Input(2);
auto *dX = Output(0), *dW = Output(1), *dB = Output(2);
CANONICALIZE_AXES_WITH_TENSOR(X);
// Reduce parameters for weight and bias
vec32_t dims = {(int)X.count(0, axis),
(int)X.count(axis, axis + num_axes),
(int)X.count(axis + num_axes)};
vec32_t dims = {
(int)X.count(0, axis),
(int)X.count(axis, axis + num_axes),
(int)X.count(axis + num_axes)};
vec32_t axes = {0, 2};
// dW = dY * X
......@@ -79,7 +80,8 @@ void AffineGradientOp<Context>::DoRunWithType() {
dW->ReshapeLike(W)->template mutable_data<T, Context>(),
ctx());
} else {
T* scratch = ws()->template data<T, Context>({X.count()})[0];
T* scratch =
ctx()->workspace()->template data<T, Context>({X.count()})[0];
math::Mul(
X.count(),
dY.template data<T, Context>(),
......@@ -118,7 +120,7 @@ void AffineGradientOp<Context>::DoRunWithType() {
// dX = dY * W
if (dX->has_name()) {
Output(0)->ReshapeLike(Input(-1));
kernel::Affine(
kernel::ChannelAffine(
X.count(0, axis),
X.count(axis, axis + num_axes),
X.count(axis + num_axes),
......@@ -131,21 +133,21 @@ void AffineGradientOp<Context>::DoRunWithType() {
}
template <class Context>
void AffineGradientOp<Context>::RunOnDevice() {
void ChannelAffineGradientOp<Context>::RunOnDevice() {
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
DEPLOY_CPU_OPERATOR(Affine);
DEPLOY_CPU_OPERATOR(ChannelAffine);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(Affine);
DEPLOY_CUDA_OPERATOR(ChannelAffine);
#endif
DEPLOY_CPU_OPERATOR(AffineGradient);
DEPLOY_CPU_OPERATOR(ChannelAffineGradient);
#ifdef USE_CUDA
DEPLOY_CUDA_OPERATOR(AffineGradient);
DEPLOY_CUDA_OPERATOR(ChannelAffineGradient);
#endif
OPERATOR_SCHEMA(Affine)
OPERATOR_SCHEMA(ChannelAffine)
/* X, W, B */
.NumInputs(2, 3)
/* Y */
......@@ -153,7 +155,7 @@ OPERATOR_SCHEMA(Affine)
/* X => Y */
.AllowInplace({{0, 0}});
OPERATOR_SCHEMA(AffineGradient)
OPERATOR_SCHEMA(ChannelAffineGradient)
/* X, W, dY */
.NumInputs(3)
/* dX, dW, dB */
......@@ -177,7 +179,7 @@ class GradientMaker final : public GradientMakerBase {
} // namespace
REGISTER_GRADIENT(Affine, GradientMaker);
REGISTER_GRADIENT(ChannelAffine, GradientMaker);
#undef CANONICALIZE_AXES_WITH_TENSOR
......
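The gradient comments above compress the whole backward rule: with the input viewed as (outer, channels, inner), "dW = dY * X" is reduced over axes {0, 2}, the bias gradient (not shown in this hunk) is the same reduction of dY alone, and "dX = dY * W" is just another ChannelAffine with dY in place of X and no bias. A NumPy sketch of those reductions, with arbitrary illustrative shapes:

```python
import numpy as np

# View the input as (outer, channels, inner), matching the kernel's dims:
# outer = X.count(0, axis), channels = X.count(axis, axis + num_axes),
# inner = X.count(axis + num_axes).
outer, channels, inner = 2, 3, 20
x = np.random.randn(outer, channels, inner).astype("float32")
w = np.random.randn(channels).astype("float32")
dy = np.random.randn(outer, channels, inner).astype("float32")

dw = (dy * x).sum(axis=(0, 2))  # "dW = dY * X", reduced over axes {0, 2}
db = dy.sum(axis=(0, 2))        # bias gradient: reduce dY the same way
dx = w[None, :, None] * dy      # "dX = dY * W": ChannelAffine without bias
```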
......@@ -10,17 +10,17 @@
* ------------------------------------------------------------
*/
#ifndef DRAGON_OPERATORS_MATH_AFFINE_OP_H_
#define DRAGON_OPERATORS_MATH_AFFINE_OP_H_
#ifndef DRAGON_OPERATORS_ARRAY_CHANNEL_AFFINE_OP_H_
#define DRAGON_OPERATORS_ARRAY_CHANNEL_AFFINE_OP_H_
#include "dragon/core/operator.h"
namespace dragon {
template <class Context>
class AffineOp final : public Operator<Context> {
class ChannelAffineOp final : public Operator<Context> {
public:
SIMPLE_CTOR_DTOR(AffineOp);
SIMPLE_CTOR_DTOR(ChannelAffineOp);
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......@@ -30,9 +30,9 @@ class AffineOp final : public Operator<Context> {
};
template <class Context>
class AffineGradientOp final : public Operator<Context> {
class ChannelAffineGradientOp final : public Operator<Context> {
public:
SIMPLE_CTOR_DTOR(AffineGradientOp);
SIMPLE_CTOR_DTOR(ChannelAffineGradientOp);
USE_OPERATOR_FUNCTIONS;
void RunOnDevice() override;
......@@ -43,4 +43,4 @@ class AffineGradientOp final : public Operator<Context> {
} // namespace dragon
#endif // DRAGON_OPERATORS_MATH_AFFINE_OP_H_
#endif // DRAGON_OPERATORS_ARRAY_CHANNEL_AFFINE_OP_H_
......@@ -18,17 +18,6 @@ void MaskedSelectOp<Context>::DoRunWithType() {
STORE_INPUT_SPEC(0);
auto* X_index = Buffer("X_index")->Reshape({X.count() + 1});
// Determine the scratch requirement
size_t scratch_size = 0;
kernel::Flagged(
X.count(),
(const uint8_t*)X_mask.template raw_data<Context>(),
X_index->template mutable_data<int, Context>(),
nullptr,
nullptr,
scratch_size,
ctx());
// Select the index of values matching the criteria
// The first ``num_selected`` indices are valid
int num_selected;
......@@ -37,8 +26,6 @@ void MaskedSelectOp<Context>::DoRunWithType() {
(const uint8_t*)X_mask.template raw_data<Context>(),
X_index->template mutable_data<int, Context>(),
&num_selected,
ws()->template data<Context>({scratch_size})[0],
scratch_size,
ctx());
// Select the values according to the flat indices
......
......@@ -19,17 +19,6 @@ void NonZeroOp<Context>::DoRunWithType() {
(bool*)X_mask->template mutable_data<uint8_t, Context>(),
ctx());
// Determine the scratch requirement
size_t scratch_size = 0;
kernel::Flagged(
X.count(),
X_mask->template mutable_data<uint8_t, Context>(),
X_index->template mutable_data<int, Context>(),
nullptr,
nullptr,
scratch_size,
ctx());
// Select the index of values matching the criteria
// The first ``num_selected`` indices are valid
int num_selected;
......@@ -38,8 +27,6 @@ void NonZeroOp<Context>::DoRunWithType() {
X_mask->template mutable_data<uint8_t, Context>(),
X_index->template mutable_data<int, Context>(),
&num_selected,
ws()->template data<Context>({scratch_size})[0],
scratch_size,
ctx());
// Convert the flat indices into n-dimension coordinates
......
......@@ -11,7 +11,7 @@ void PermutationOp<Context>::DoRunWithType() {
kernel::Permutation(
Y->count(),
Y->template mutable_data<T, Context>(),
ws()->template data<uint32_t, Context>({Y->count()})[0],
ctx()->workspace()->template data<uint32_t, Context>({Y->count()})[0],
ctx());
}
......
......@@ -39,6 +39,7 @@ void ReduceMaxOp<Context>::DoRunWithType() {
X_dims.data(),
reduce_axes.size(),
reduce_axes.data(),
1.f,
X.template data<T, Context>(),
Y->Reshape(Y_shape)->template mutable_data<T, Context>(),
ctx());
......
......@@ -55,7 +55,7 @@ void ReduceMeanOp<Context>::DoRunWithType() {
template <class Context>
void ReduceMeanOp<Context>::RunOnDevice() {
STORE_INPUT_SPEC(0);
DispatchHelper<NumericalTensorTypes>::Call(this, Input(0));
DispatchHelper<FloatingTensorTypes>::Call(this, Input(0));
}
template <class Context>
......
......@@ -39,6 +39,7 @@ void ReduceMinOp<Context>::DoRunWithType() {
X_dims.data(),
reduce_axes.size(),
reduce_axes.data(),
1.f,
X.template data<T, Context>(),
Y->Reshape(Y_shape)->template mutable_data<T, Context>(),
ctx());
......
......@@ -42,12 +42,12 @@ void TileGradientOp<Context>::DoRunWithType() {
const T* dy;
T* dx;
if (src_ == &nav_) {
dy = ws()->template data<T, Context>({src_->count()})[0];
dy = ctx()->workspace()->template data<T, Context>({src_->count()})[0];
} else {
dy = src_->template data<T, Context>();
}
if (dest_ == &nav_) {
dx = ws()->template data<T, Context>({dest_->count()})[0];
dx = ctx()->workspace()->template data<T, Context>({dest_->count()})[0];
} else {
dx = dest_->template mutable_data<T, Context>();
}
......
......@@ -66,7 +66,7 @@ void WhereGradientOp<Context>::DoRunWithType() {
}
if (scratch_size > 0) {
scratch = ws()->template data<T, Context>({scratch_size})[0];
scratch = ctx()->workspace()->template data<T, Context>({scratch_size})[0];
zeros = scratch + (scratch_size - 1);
math::Set(1, cast::to<T>(0.f), zeros, ctx());
}
......
......@@ -49,8 +49,8 @@ void AssignOp<Context>::DoRunWithType() {
<< Tensor::DimString(X_dims);
utils::math::ComputeBinaryBroadcastDims(X.dims(), X_dims, dims1, dims2);
if (dims1 != dims2) {
auto* scratch =
ws()->template data<T, Context>({X_broadcast.count()})[0];
auto* scratch = ctx()->workspace()->template data<T, Context>(
{X_broadcast.count()})[0];
math::Set(
X.ndim(),
X.dims().data(),
......
......@@ -27,7 +27,7 @@ void CollectiveOp<Context>::AllReduceMPI() {
auto from = (comm_rank_ - 1 + comm_size_) % comm_size_;
auto* data = src_tensor_->template mutable_data<T, Context>();
auto* scratch = ws()->template data<T, Context>({sizes[0]})[0];
auto* scratch = ctx()->workspace()->template data<T, Context>({sizes[0]})[0];
// Scatter-Reduce
MPI_Request recv_req;
......@@ -129,25 +129,10 @@ void CollectiveOp<Context>::RunOnDevice() {
// Otherwise, data corruption will happen through GPUDirect(UVA)
// when executing collectives asynchronously.
ctx()->FinishDeviceComputation();
#ifdef USE_NCCL
#if NCCL_VERSION_MIN(2, 2, 0)
if (enable_nccl_ && InputSize() <= 2048) {
this->nccl_comm(); // Ensure the comm created
NCCL_CHECK(ncclGroupStart());
}
#endif
#endif
for (int i = 0; i < InputSize(); i++) {
src_tensor_ = &Input(i);
DispatchHelper<NumericalTensorTypes>::Call(this, *src_tensor_);
}
#ifdef USE_NCCL
#if NCCL_VERSION_MIN(2, 2, 0)
if (enable_nccl_ && InputSize() <= 2048) {
NCCL_CHECK(ncclGroupEnd());
}
#endif
#endif
src_tensor_ = nullptr;
for (int i = 0; i < InputSize(); i++) {
dest_tensor_ = &Input(i);
......
......@@ -52,7 +52,8 @@ void CuDNNCTCLossOp<Context>::DoRunWithType() {
ctc_desc_,
&workspace_size_));
auto* scratch = (uint8_t*)ws()->template data<Context>({workspace_size_})[0];
auto* scratch = (uint8_t*)ctx()->workspace()->template data<Context>(
{workspace_size_})[0];
auto* g = Buffer("grad")
->ReshapeLike(Input(0))
......
......@@ -18,7 +18,7 @@ void L1LossOp<Context>::DoRunWithType() {
}
// Allocate a temporary error buffer
auto* x_error = ws()->template data<T, Context>({X.count()})[0];
auto* x_error = ctx()->workspace()->template data<T, Context>({X.count()})[0];
// Compute the error of inputs
if (InputSize() > 1) {
......@@ -55,7 +55,7 @@ void L1LossOp<Context>::DoRunWithType() {
0,
normalizer,
x_error,
nullptr,
(T*)nullptr,
Y->Reshape({})->template mutable_data<T, Context>(),
ctx());
}
......@@ -99,7 +99,8 @@ void L1LossGradientOp<Context>::DoRunWithType() {
} else if (reduction_ == "MEAN") {
normalizer *= dX->count();
}
kernel::ReduceLossGrad(dX->count(), 0, normalizer, dy, nullptr, dx, ctx());
kernel::ReduceLossGrad(
dX->count(), 0, normalizer, dy, (T*)nullptr, dx, ctx());
}
// Gradient w.r.t. the second input
......
......@@ -18,7 +18,7 @@ void L2LossOp<Context>::DoRunWithType() {
}
// Allocate a temporary error buffer
auto* x_error = ws()->template data<T, Context>({X.count()})[0];
auto* x_error = ctx()->workspace()->template data<T, Context>({X.count()})[0];
// Compute the error of inputs
if (InputSize() > 1) {
......@@ -55,7 +55,7 @@ void L2LossOp<Context>::DoRunWithType() {
0,
normalizer,
x_error,
nullptr,
(T*)nullptr,
Y->Reshape({})->template mutable_data<T, Context>(),
ctx());
}
......@@ -98,7 +98,7 @@ void L2LossGradientOp<Context>::DoRunWithType() {
normalizer *= dX->count();
}
kernel::ReduceLossGrad(
dX->count(), 0, float(normalizer) * 0.5f, dy, nullptr, dx, ctx());
dX->count(), 0, float(normalizer) * 0.5f, dy, (T*)nullptr, dx, ctx());
}
// Gradient w.r.t. the second input
......
......@@ -18,12 +18,12 @@ void NLLLossOp<Context>::DoRunWithType() {
CHECK_EQ(num_preds, Input(1).count())
<< "\nNumber of preds must match the number of targets.";
auto scratches = ws()->template data<Context>({
num_preds * sizeof(LogitType), // loss
num_preds * sizeof(int), // mask
auto scratches = ctx()->workspace()->template data<Context>({
(size_t)num_preds * sizeof(LogitType), // loss
(size_t)num_preds * sizeof(LogitType) + sizeof(LogitType), // mask
});
auto* loss = static_cast<LogitType*>(scratches[0]);
auto* mask = static_cast<int*>(scratches[1]);
auto* mask = static_cast<LogitType*>(scratches[1]);
kernel::NLLLoss(
outer_dim,
......@@ -101,9 +101,10 @@ void NLLLossGradientOp<Context>::DoRunWithType() {
auto inner_dim = dX->count(axis + 1);
auto num_preds = outer_dim * inner_dim;
auto* mask = ws()->template data<int, Context>({num_preds})[0];
auto* dy = dY.template data<LogitType, Context>();
auto* dx = dX->template mutable_data<LogitType, Context>();
auto* mask =
ctx()->workspace()->template data<LogitType, Context>({num_preds + 1})[0];
math::Set(dX->count(), cast::to<LogitType>(0.f), dx, ctx());
kernel::NLLLossGrad(
......
......@@ -13,12 +13,12 @@ void SigmoidCrossEntropyOp<Context>::DoRunWithType() {
CHECK_EQ(X.count(), Input(1).count())
<< "\nNumber of preds must match the number of targets.";
auto scratches = ws()->template data<Context>({
X.count() * sizeof(T), // loss
X.count() * sizeof(int), // mask
auto scratches = ctx()->workspace()->template data<Context>({
X.size() * sizeof(T), // loss
X.size() * sizeof(T) + sizeof(T), // mask
});
auto* loss = static_cast<T*>(scratches[0]);
auto* mask = static_cast<int*>(scratches[1]);
auto* mask = static_cast<T*>(scratches[1]);
kernel::SigmoidCrossEntropy(
X.count(),
......@@ -64,9 +64,10 @@ template <typename T>
void SigmoidCrossEntropyGradientOp<Context>::DoRunWithType() {
auto &X = Input(0), &dY = Input(-1), *dX = Output(0);
auto* mask = ws()->template data<int, Context>({dX->count()})[0];
auto* dy = dY.template data<T, Context>();
auto* dx = dX->template mutable_data<T, Context>();
auto* mask =
ctx()->workspace()->template data<T, Context>({dX->count() + 1})[0];
kernel::SigmoidCrossEntropyGrad(
dX->count(),
......
......@@ -17,12 +17,12 @@ void SigmoidFocalLossOp<Context>::DoRunWithType() {
CHECK_EQ(outer_dim * inner_dim, Input(1).count())
<< "\nNumber of preds must match the number of targets.";
auto scratches = ws()->template data<Context>({
X.count() * sizeof(LogitType), // loss
X.count() * sizeof(int), // mask
auto scratches = ctx()->workspace()->template data<Context>({
X.size() * sizeof(LogitType), // loss
X.size() * sizeof(LogitType) + sizeof(LogitType), // mask
});
auto* loss = static_cast<LogitType*>(scratches[0]);
auto* mask = static_cast<int*>(scratches[1]);
auto* mask = static_cast<LogitType*>(scratches[1]);
kernel::SigmoidFocalLoss(
outer_dim,
......@@ -100,9 +100,10 @@ void SigmoidFocalLossGradientOp<Context>::DoRunWithType() {
auto outer_dim = dX->count(0, axis);
auto inner_dim = dX->count(axis + 1);
auto* mask = ws()->template data<int, Context>({dX->count()})[0];
auto* dy = dY.template data<LogitType, Context>();
auto* dx = dX->template mutable_data<LogitType, Context>();
auto* mask = ctx()->workspace()->template data<LogitType, Context>(
{dX->count() + 1})[0];
kernel::SigmoidFocalLossGrad(
outer_dim,
......
......@@ -18,7 +18,7 @@ void SmoothL1LossOp<Context>::DoRunWithType() {
}
// Allocate a temporary error buffer
auto* x_error = ws()->template data<T, Context>({X.count()})[0];
auto* x_error = ctx()->workspace()->template data<T, Context>({X.count()})[0];
// Compute the error of inputs
if (InputSize() > 1) {
......@@ -55,7 +55,7 @@ void SmoothL1LossOp<Context>::DoRunWithType() {
0,
normalizer,
x_error,
nullptr,
(T*)nullptr,
Y->Reshape({})->template mutable_data<T, Context>(),
ctx());
}
......@@ -99,7 +99,8 @@ void SmoothL1LossGradientOp<Context>::DoRunWithType() {
} else if (reduction_ == "MEAN") {
normalizer *= dX->count();
}
kernel::ReduceLossGrad(dX->count(), 0, normalizer, dy, nullptr, dx, ctx());
kernel::ReduceLossGrad(
dX->count(), 0, normalizer, dy, (T*)nullptr, dx, ctx());
}
// Gradient w.r.t. the second input
......
......@@ -19,7 +19,7 @@ void SoftmaxCrossEntropyOp<Context>::DoRunWithType() {
<< "\nNumber of preds must match the number of targets.";
Buffer("prob")->ReshapeLike(X);
auto* loss = ws()->template data<T, Context>({X.count()})[0];
auto* loss = ctx()->workspace()->template data<T, Context>({X.count()})[0];
auto* prob = Buffer("prob")->template mutable_data<T, Context>();
kernel::Softmax(
......@@ -59,7 +59,7 @@ void SoftmaxCrossEntropyOp<Context>::DoRunWithType() {
0,
normalizer,
loss,
nullptr,
(T*)nullptr,
Y->Reshape({})->template mutable_data<T, Context>(),
ctx());
}
......@@ -98,7 +98,8 @@ void SoftmaxCrossEntropyGradientOp<Context>::DoRunWithType() {
} else if (reduction_ == "MEAN") {
normalizer = num_preds;
}
kernel::ReduceLossGrad(dX->count(), 0, normalizer, dy, nullptr, dx, ctx());
kernel::ReduceLossGrad(
dX->count(), 0, normalizer, dy, (T*)nullptr, dx, ctx());
}
}
......
......@@ -20,12 +20,12 @@ void SparseSoftmaxCrossEntropyOp<Context>::DoRunWithType() {
auto* X_prob = Buffer("prob")->ReshapeLike(X);
auto* prob = X_prob->template mutable_data<LogitType, Context>();
auto scratches = ws()->template data<Context>({
num_preds * sizeof(LogitType), // loss
num_preds * sizeof(int), // mask
auto scratches = ctx()->workspace()->template data<Context>({
(size_t)num_preds * sizeof(LogitType), // loss
(size_t)num_preds * sizeof(LogitType) + sizeof(LogitType), // mask
});
auto* loss = static_cast<LogitType*>(scratches[0]);
auto* mask = static_cast<int*>(scratches[1]);
auto* mask = static_cast<LogitType*>(scratches[1]);
kernel::Softmax(
outer_dim,
......@@ -111,9 +111,10 @@ void SparseSoftmaxCrossEntropyGradientOp<Context>::DoRunWithType() {
auto num_preds = outer_dim * inner_dim;
auto* prob = Buffer("prob")->template data<LogitType, Context>();
auto* mask = ws()->template data<int, Context>({num_preds})[0];
auto* dy = Input(-1).template data<LogitType, Context>();
auto* dx = Output(0)->template mutable_data<LogitType, Context>();
auto* mask =
ctx()->workspace()->template data<LogitType, Context>({num_preds + 1})[0];
math::Copy(dX->count(), prob, dx, ctx());
......
......@@ -83,7 +83,7 @@ void DivGradientOp<Context>::DoRunWithType() {
ctx());
}
} else {
scratch = ws()->template data<T, Context>({dY.count()})[0];
scratch = ctx()->workspace()->template data<T, Context>({dY.count()})[0];
if (B_broadcast_axes.empty()) {
math::Div(
B_ref.count(),
......@@ -136,7 +136,8 @@ void DivGradientOp<Context>::DoRunWithType() {
}
} else {
if (scratch == nullptr) {
scratch = ws()->template data<T, Context>({dY.count()})[0];
scratch =
ctx()->workspace()->template data<T, Context>({dY.count()})[0];
}
if (A_broadcast_axes.empty()) {
math::Mul(
......
......@@ -21,7 +21,7 @@ void MaximumGradientOp<Context>::DoRunWithType() {
T* scratch = nullptr;
if (dA->has_name()) {
auto scratches = ws()->template data<Context>(
auto scratches = ctx()->workspace()->template data<Context>(
{dY.size() * sizeof(T), dY.size() * sizeof(bool)});
mask = (bool*)scratches[1], scratch = (T*)scratches[0];
if (A_broadcast_axes.empty()) {
......@@ -43,7 +43,7 @@ void MaximumGradientOp<Context>::DoRunWithType() {
mask,
ctx());
}
kernel::Cast(dY.count(), mask, scratch, ctx());
math::Cast(dY.count(), mask, scratch, ctx());
math::Mul(
dY.count(),
dY.template data<T, Context>(),
......@@ -60,7 +60,7 @@ void MaximumGradientOp<Context>::DoRunWithType() {
B.template data<T, Context>(),
mask,
ctx());
kernel::Cast(dY.count(), mask, scratch, ctx());
math::Cast(dY.count(), mask, scratch, ctx());
math::Mul(
dY.count(), dY.template data<T, Context>(), scratch, scratch, ctx());
math::ReduceSum(
......@@ -77,7 +77,7 @@ void MaximumGradientOp<Context>::DoRunWithType() {
if (dB->has_name()) {
if (mask == nullptr) {
auto scratches = ws()->template data<Context>(
auto scratches = ctx()->workspace()->template data<Context>(
{dY.size() * sizeof(T), dY.size() * sizeof(bool)});
mask = (bool*)scratches[1], scratch = (T*)scratches[0];
}
......@@ -100,7 +100,7 @@ void MaximumGradientOp<Context>::DoRunWithType() {
mask,
ctx());
}
kernel::Cast(dY.count(), mask, scratch, ctx());
math::Cast(dY.count(), mask, scratch, ctx());
math::Mul(
dY.count(),
dY.template data<T, Context>(),
......@@ -117,7 +117,7 @@ void MaximumGradientOp<Context>::DoRunWithType() {
B.template data<T, Context>(),
mask,
ctx());
kernel::Cast(dY.count(), mask, scratch, ctx());
math::Cast(dY.count(), mask, scratch, ctx());
math::Mul(
dY.count(), dY.template data<T, Context>(), scratch, scratch, ctx());
math::ReduceSum(
......
......@@ -21,7 +21,7 @@ void MinimumGradientOp<Context>::DoRunWithType() {
T* scratch = nullptr;
if (dA->has_name()) {
auto scratches = ws()->template data<Context>(
auto scratches = ctx()->workspace()->template data<Context>(
{dY.size() * sizeof(T), dY.size() * sizeof(bool)});
mask = (bool*)scratches[1], scratch = (T*)scratches[0];
if (A_broadcast_axes.empty()) {
......@@ -43,7 +43,7 @@ void MinimumGradientOp<Context>::DoRunWithType() {
mask,
ctx());
}
kernel::Cast(dY.count(), mask, scratch, ctx());
math::Cast(dY.count(), mask, scratch, ctx());
math::Mul(
dY.count(),
dY.template data<T, Context>(),
......@@ -60,7 +60,7 @@ void MinimumGradientOp<Context>::DoRunWithType() {
B.template data<T, Context>(),
mask,
ctx());
kernel::Cast(dY.count(), mask, scratch, ctx());
math::Cast(dY.count(), mask, scratch, ctx());
math::Mul(
dY.count(), dY.template data<T, Context>(), scratch, scratch, ctx());
math::ReduceSum(
......@@ -77,7 +77,7 @@ void MinimumGradientOp<Context>::DoRunWithType() {
if (dB->has_name()) {
if (mask == nullptr) {
auto scratches = ws()->template data<Context>(
auto scratches = ctx()->workspace()->template data<Context>(
{dY.size() * sizeof(T), dY.size() * sizeof(bool)});
mask = (bool*)scratches[1], scratch = (T*)scratches[0];
}
......@@ -100,7 +100,7 @@ void MinimumGradientOp<Context>::DoRunWithType() {
mask,
ctx());
}
kernel::Cast(dY.count(), mask, scratch, ctx());
math::Cast(dY.count(), mask, scratch, ctx());
math::Mul(
dY.count(),
dY.template data<T, Context>(),
......@@ -117,7 +117,7 @@ void MinimumGradientOp<Context>::DoRunWithType() {
B.template data<T, Context>(),
mask,
ctx());
kernel::Cast(dY.count(), mask, scratch, ctx());
math::Cast(dY.count(), mask, scratch, ctx());
math::Mul(
dY.count(), dY.template data<T, Context>(), scratch, scratch, ctx());
math::ReduceSum(
......
......@@ -33,7 +33,7 @@ void MomentsOp<Context>::DoRunWithType() {
}
if (X.count() == 1) {
kernel::Cast(
math::Cast(
1,
X.template data<Tx, Context>(),
Y1->Reshape(Y_shape)->template mutable_data<Ty, Context>(),
......
......@@ -83,7 +83,7 @@ void MulGradientOp<Context>::DoRunWithType() {
ctx());
}
} else {
scratch = ws()->template data<T, Context>({dY.count()})[0];
scratch = ctx()->workspace()->template data<T, Context>({dY.count()})[0];
if (B_broadcast_axes.empty()) {
math::Mul(
B_ref.count(),
......@@ -136,7 +136,8 @@ void MulGradientOp<Context>::DoRunWithType() {
}
} else {
if (scratch == nullptr) {
scratch = ws()->template data<T, Context>({dY.count()})[0];
scratch =
ctx()->workspace()->template data<T, Context>({dY.count()})[0];
}
if (A_broadcast_axes.empty()) {
math::Mul(
......
......@@ -33,7 +33,8 @@ void PowGradientOp<Context>::DoRunWithType() {
dB->template mutable_data<T, Context>(),
ctx());
} else {
scratch = ws()->template data<T, Context>({dY.count()})[0];
scratch =
ctx()->workspace()->template data<T, Context>({dY.count()})[0];
math::Log(A.count(), A.template data<T, Context>(), scratch, ctx());
math::Mul(
A.ndim(),
......@@ -53,13 +54,14 @@ void PowGradientOp<Context>::DoRunWithType() {
ctx());
} else {
if (A_broadcast_axes.empty()) {
scratch = ws()->template data<T, Context>({dY.count()})[0];
scratch =
ctx()->workspace()->template data<T, Context>({dY.count()})[0];
math::Log(A.count(), A.template data<T, Context>(), scratch, ctx());
math::Mul(
Y.count(), scratch, Y.template data<T, Context>(), scratch, ctx());
} else {
auto scratches =
ws()->template data<T, Context>({dY.count(), A.count()});
auto scratches = ctx()->workspace()->template data<T, Context>(
{dY.count(), A.count()});
scratch = scratches[0];
math::Log(
A.count(), A.template data<T, Context>(), scratches[1], ctx());
......@@ -127,7 +129,8 @@ void PowGradientOp<Context>::DoRunWithType() {
ctx());
} else {
if (scratch == nullptr) {
scratch = ws()->template data<T, Context>({dY.count()})[0];
scratch =
ctx()->workspace()->template data<T, Context>({dY.count()})[0];
}
math::Div(
Y.ndim(),
......
......@@ -56,9 +56,9 @@ void BatchNormOp<Context>::TrainingImpl() {
// Compute affine transformation
if (data_format() == "NCHW") {
kernel::Affine(N_, C_, S_, x, scale, bias, y, ctx());
kernel::ChannelAffine(N_, C_, S_, x, scale, bias, y, ctx());
} else if (data_format() == "NHWC") {
kernel::Affine(N_ * S_, C_, 1, x, scale, bias, y, ctx());
kernel::ChannelAffine(N_ * S_, C_, 1, x, scale, bias, y, ctx());
}
}
......@@ -91,9 +91,9 @@ void BatchNormOp<Context>::InferenceImpl() {
// Compute affine transformation
if (data_format() == "NCHW") {
kernel::Affine(N_, C_, S_, x, scale, bias, y, ctx());
kernel::ChannelAffine(N_, C_, S_, x, scale, bias, y, ctx());
} else if (data_format() == "NHWC") {
kernel::Affine(N_ * S_, C_, 1, x, scale, bias, y, ctx());
kernel::ChannelAffine(N_ * S_, C_, 1, x, scale, bias, y, ctx());
}
}
......@@ -102,7 +102,7 @@ void BatchNormOp<Context>::RunOnDevice() {
DetermineBaseArguments();
// Get the recomputing flag
auto* flag = ws()->GetTensor("/share/flag/recomputing");
auto* flag = workspace()->GetTensor("/share/flag/recomputing");
is_recomputing_ = flag->template data<bool, CPUContext>()[0] ? 1 : 0;
// Dispatch the training or inference impl
......
......@@ -73,7 +73,7 @@ void CuDNNBatchNormOp<Context>::RunOnDevice() {
DetermineBaseArguments();
// Get the recomputing flag
auto* flag = ws()->GetTensor("/share/flag/recomputing");
auto* flag = workspace()->GetTensor("/share/flag/recomputing");
is_recomputing_ = flag->template data<bool, CPUContext>()[0] ? 1 : 0;
// Dispatch the training or inference impl
......
......@@ -88,9 +88,9 @@ void SyncBatchNormOp<Context>::TrainingImpl() {
// Compute affine transformation
if (data_format() == "NCHW") {
kernel::Affine(N_, C_, S_, x, scale, bias, y, ctx());
kernel::ChannelAffine(N_, C_, S_, x, scale, bias, y, ctx());
} else if (data_format() == "NHWC") {
kernel::Affine(N_ * S_, C_, 1, x, scale, bias, y, ctx());
kernel::ChannelAffine(N_ * S_, C_, 1, x, scale, bias, y, ctx());
}
}
......@@ -99,7 +99,7 @@ void SyncBatchNormOp<Context>::RunOnDevice() {
DetermineBaseArguments();
// Get the recomputing flag
auto* flag = ws()->GetTensor("/share/flag/recomputing");
auto* flag = workspace()->GetTensor("/share/flag/recomputing");
is_recomputing_ = flag->template data<bool, CPUContext>()[0] ? 1 : 0;
// Dispatch the training or inference impl
......
......@@ -11,6 +11,7 @@ template <typename T>
void CuDNNRecurrentOpBase<Context>::ResetDesc() {
input_dims_ = Input(0).dims();
seq_length_ = Input(0).dim(0);
auto input_type = TypeMeta::Id<T>();
auto batch_size = Input(0).dim(1);
auto x_dim = Input(0).dim(2);
auto ndirections = bidirectional_ ? 2 : 1;
......@@ -24,7 +25,7 @@ void CuDNNRecurrentOpBase<Context>::ResetDesc() {
CUDNN_CHECK(
cudnnDropoutGetStatesSize(ctx()->cudnn_handle(), &states_size_));
std::lock_guard<std::mutex> lk(CUDAContext::mutex());
auto* states_tensor = ws()->CreateTensor(
auto* states_tensor = workspace()->CreateTensor(
"/share/cudnn/dropout:" + str::to(rng_seed_) + "/states");
if (states_tensor->count() > 0) {
auto* states = states_tensor->template mutable_data<uint8_t, Context>();
......@@ -53,6 +54,13 @@ void CuDNNRecurrentOpBase<Context>::ResetDesc() {
}
// Setup RNN
if (input_type == TypeMeta::Id<float16>()) {
compute_type_ = CUDNN_DATA_FLOAT;
} else if (input_type == TypeMeta::Id<float>()) {
compute_type_ = CUDNN_DATA_FLOAT;
} else if (input_type == TypeMeta::Id<double>()) {
compute_type_ = CUDNN_DATA_DOUBLE;
}
#if CUDNN_VERSION_MIN(7, 0, 0)
CUDNN_CHECK(cudnnSetRNNDescriptor_v6(
ctx()->cudnn_handle(),
......@@ -64,7 +72,7 @@ void CuDNNRecurrentOpBase<Context>::ResetDesc() {
rnn_direction_,
rnn_mode_,
CUDNN_RNN_ALGO_STANDARD,
CuDNNType<T>::type));
compute_type_));
#else
CUDNN_CHECK(cudnnSetRNNDescriptor(
rnn_desc_,
......@@ -74,7 +82,25 @@ void CuDNNRecurrentOpBase<Context>::ResetDesc() {
rnn_input_mode_,
rnn_direction_,
rnn_mode_,
CuDNNType<T>::type));
compute_type_));
#endif
// Setup TensorCore
#if CUDNN_VERSION_MIN(7, 0, 0)
if (enable_tensor_core_ > 0) {
cudnnMathType_t math_type;
if (input_type == TypeMeta::Id<float16>()) {
math_type = CUDNN_TENSOR_OP_MATH;
} else {
math_type = CUDNN_DEFAULT_MATH;
#if CUDNN_VERSION_MIN(8, 0, 0)
if (!CUDAContext::objects().cudnn_allow_tf32_) {
math_type = CUDNN_FMA_MATH;
}
#endif
}
CUDNN_CHECK(cudnnSetRNNMatrixMathType(rnn_desc_, math_type));
}
#endif
// Setup X and Y
......@@ -151,7 +177,8 @@ void CuDNNRecurrentOp<Context>::DoRunWithType() {
return Output(i)->template mutable_data<T, Context>();
};
auto* scratch = ws()->template data<Context>({workspace_size_})[0];
auto* scratch =
ctx()->workspace()->template data<Context>({workspace_size_})[0];
if (phase() == "TRAIN") {
CUDNN_CHECK(cudnnGetRNNTrainingReserveSize(
......@@ -235,7 +262,8 @@ void CuDNNRecurrentGradientOp<Context>::DoRunWithType() {
return Output(i)->template mutable_data<T, Context>();
};
auto* scratch = ws()->template data<Context>({workspace_size_})[0];
auto* scratch =
ctx()->workspace()->template data<Context>({workspace_size_})[0];
// Check the ReserveSpace
CUDNN_CHECK(cudnnGetRNNTrainingReserveSize(
......
......@@ -57,7 +57,8 @@ class CuDNNRecurrentOpBase : public Operator<Context> {
hidden_size_(OP_SINGLE_ARG(int64_t, "hidden_size", 0)),
bidirectional_(OP_SINGLE_ARG(int64_t, "bidirectional", 0)),
dropout_ratio_(OP_SINGLE_ARG(float, "dropout_ratio", 1.f)),
rng_seed_(def.device_option().random_seed()) {
rng_seed_(def.device_option().random_seed()),
enable_tensor_core_(TENSOR_CORE_AVAILABLE() ? 1 : 0) {
// Determine the rnn direction
rnn_direction_ =
bidirectional_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL;
......@@ -111,11 +112,13 @@ class CuDNNRecurrentOpBase : public Operator<Context> {
public:
float dropout_ratio_;
unsigned long long rng_seed_;
int64_t enable_tensor_core_;
int64_t bidirectional_, states_initialized_;
int64_t seq_length_, hidden_size_, num_layers_;
vec64_t input_dims_, output_dims_, hidden_dims_;
size_t workspace_size_, reserve_size_, states_size_;
cudnnDataType_t compute_type_;
cudnnRNNMode_t rnn_mode_;
cudnnRNNDescriptor_t rnn_desc_;
cudnnDirectionMode_t rnn_direction_;
......
......@@ -12,8 +12,9 @@ Tensor* UpdateOpBase<Context>::Slot(const string& name) {
template <class Context>
float UpdateOpBase<Context>::Parameter(const string& name) const {
auto* P = ws()->GetTensor("/share/hyper/" + handle() + "/" + name);
return P->template mutable_data<float, CPUContext>()[0];
return workspace()
->GetTensor("/share/hyper/" + handle() + "/" + name)
->template mutable_data<float, CPUContext>()[0];
}
template <class Context>
......@@ -36,42 +37,25 @@ void UpdateOpBase<Context>::AdjustGradient(Tensor* dX, Tensor* X) {
}
// Penalty
auto weight_decay = Parameter("weight_decay");
if (weight_decay > 0.f) {
if (X->template IsType<float16>()) {
kernel::MixedPrecL2Penalty(
X->count(),
weight_decay * decay_mult_,
X->template data<float16, Context>(),
dX->template mutable_data<float, Context>(),
ctx());
} else {
math::Axpy(
X->count(),
weight_decay * decay_mult_,
X->template data<T, Context>(),
dX->template mutable_data<T, Context>(),
ctx());
}
if (weight_decay > 0.f && decay_mult_ > 0.f) {
math::Axpy(
X->count(),
weight_decay * decay_mult_,
X->template data<T, Context>(),
dX->template mutable_data<T, Context>(),
ctx());
}
}
template <class Context>
template <typename T>
void UpdateOpBase<Context>::ApplyUpdate(Tensor* dX, Tensor* X) {
if (X->template IsType<float16>()) {
kernel::MixedPrecUpdate(
X->count(),
dX->template data<float, Context>(),
X->template mutable_data<float16, Context>(),
ctx());
} else {
math::Sub(
X->count(),
X->template data<T, Context>(),
dX->template data<T, Context>(),
X->template mutable_data<T, Context>(),
ctx());
}
math::Sub(
X->count(),
X->template data<T, Context>(),
dX->template data<T, Context>(),
X->template mutable_data<T, Context>(),
ctx());
}
template <class Context>
......@@ -90,15 +74,28 @@ void UpdateOpBase<Context>::RunOnDevice() {
ComputeUpdate(&dX);
ApplyUpdate<float>(&dX, X);
} else if (dX.template IsType<float16>()) {
auto* dX_cast = ws()->CreateTensor(dX.name() + "[float32]");
kernel::Cast(
auto* X_master = workspace()->CreateTensor(X->name() + "[float32]");
auto* dX_copy = ctx()->workspace()->CreateTensor("/share/data");
if (X_master->count() != X->count()) {
math::Cast(
X->count(),
X->template data<float16, Context>(),
X_master->ReshapeLike(*X)->template mutable_data<float, Context>(),
ctx());
}
math::Cast(
dX.count(),
dX.template data<float16, Context>(),
dX_cast->ReshapeLike(dX)->template mutable_data<float, Context>(),
dX_copy->ReshapeLike(dX)->template mutable_data<float, Context>(),
ctx());
AdjustGradient<float>(dX_copy, X_master);
ComputeUpdate(dX_copy);
ApplyUpdate<float>(dX_copy, X_master);
math::Cast(
X->count(),
X_master->template data<float, Context>(),
X->template mutable_data<float16, Context>(),
ctx());
AdjustGradient<float>(dX_cast, X);
ComputeUpdate(dX_cast);
ApplyUpdate<float>(dX_cast, X);
} else {
LOG(FATAL) << MessageForUnsupported(
types::to_string(dX.meta()), {"float16", "float32"});
......
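The float16 branch above replaces the old MixedPrecL2Penalty/MixedPrecUpdate kernels with a float32 master copy of each half-precision parameter ("&lt;name&gt;[float32]"): the fp16 gradient is cast up, the regular float32 AdjustGradient/ComputeUpdate/ApplyUpdate path runs on the master copy, and the result is cast back to fp16. A schematic NumPy sketch of that flow, with plain SGD standing in for the optimizer-specific ComputeUpdate (all names here are illustrative):

```python
import numpy as np

def fp16_step(param_fp16, grad_fp16, master_fp32, lr=0.01, weight_decay=1e-4):
    """One update of a float16 parameter through its float32 master copy."""
    grad = grad_fp16.astype(np.float32)        # cast the fp16 gradient up
    grad += weight_decay * master_fp32         # AdjustGradient: L2 penalty in fp32
    master_fp32 -= lr * grad                   # ComputeUpdate + ApplyUpdate in fp32
    param_fp16[...] = master_fp32.astype(np.float16)  # write back the fp16 view
    return param_fp16, master_fp32

# The master copy is created once from the fp16 weights and then reused.
w16 = np.random.randn(4).astype(np.float16)
w32 = w16.astype(np.float32)
g16 = np.random.randn(4).astype(np.float16)
w16, w32 = fp16_step(w16, g16, w32)
```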
......@@ -41,8 +41,19 @@ void CuDNNConv2dOp<Context>::SetConvDesc() {
#endif
#if CUDNN_VERSION_MIN(7, 0, 0)
CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc_, group_));
if (enable_tensor_core_) {
CUDNN_CHECK(cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH));
if (enable_tensor_core_ > 0) {
cudnnMathType_t math_type;
if (input_type == TypeMeta::Id<float16>()) {
math_type = CUDNN_TENSOR_OP_MATH;
} else {
math_type = CUDNN_DEFAULT_MATH;
#if CUDNN_VERSION_MIN(8, 0, 0)
if (!CUDAContext::objects().cudnn_allow_tf32_) {
math_type = CUDNN_FMA_MATH;
}
#endif
}
CUDNN_CHECK(cudnnSetConvolutionMathType(conv_desc_, math_type));
}
#endif
}
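The same math-type selection is repeated for the cuDNN convolution, transposed-convolution and RNN descriptors: float16 keeps CUDNN_TENSOR_OP_MATH, other types fall back to the default math, and on cuDNN 8+ the new `cudnn_allow_tf32_` flag decides between the TF32-capable default and CUDNN_FMA_MATH. A small Python restatement of that decision table, applicable only when tensor cores are available (the function name is descriptive, not part of the codebase):

```python
def cudnn_math_type(dtype, allow_tf32, cudnn_version):
    """Mirror the math-type branches used when enable_tensor_core_ > 0."""
    if dtype == "float16":
        return "CUDNN_TENSOR_OP_MATH"   # fp16 always targets tensor cores
    if cudnn_version >= (8, 0, 0) and not allow_tf32:
        return "CUDNN_FMA_MATH"         # forbid TF32 for float32/float64
    return "CUDNN_DEFAULT_MATH"         # TF32 allowed on cuDNN 8+, plain math before

assert cudnn_math_type("float32", False, (8, 0, 0)) == "CUDNN_FMA_MATH"
assert cudnn_math_type("float32", True, (8, 0, 0)) == "CUDNN_DEFAULT_MATH"
assert cudnn_math_type("float16", False, (7, 6, 5)) == "CUDNN_TENSOR_OP_MATH"
```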
......@@ -148,8 +159,8 @@ void CuDNNConv2dOp<Context>::DoRunWithType() {
// Find the appropriate algorithm if necessary
if (exhaustive_search_) {
scratch =
ws()->template data<Context>({CUDNN_CONV_WORKSPACE_LIMIT_BYTES})[0];
scratch = ctx()->workspace()->template data<Context>(
{CUDNN_CONV_WORKSPACE_LIMIT_BYTES})[0];
auto algo = algo_cache_.get(X.dims(), W.dims(), compute_type_, [&]() {
int num_valid_algos;
constexpr int num_algos = CUDNN_CONV_NUM_FWD_ALGOS;
......@@ -188,7 +199,7 @@ void CuDNNConv2dOp<Context>::DoRunWithType() {
// Alloc the memory for workspace data
if (cudnn_ws_nbytes_ > 0) {
scratch = ws()->template data<Context>({cudnn_ws_nbytes_})[0];
scratch = ctx()->workspace()->template data<Context>({cudnn_ws_nbytes_})[0];
}
for (int g = 0; g < cudnn_group_; g++) {
......@@ -279,8 +290,19 @@ void CuDNNConv2dGradientOp<Context>::SetConvDesc() {
#endif
#if CUDNN_VERSION_MIN(7, 0, 0)
CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc_, group_));
if (enable_tensor_core_) {
CUDNN_CHECK(cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH));
if (enable_tensor_core_ > 0) {
cudnnMathType_t math_type;
if (input_type == TypeMeta::Id<float16>()) {
math_type = CUDNN_TENSOR_OP_MATH;
} else {
math_type = CUDNN_DEFAULT_MATH;
#if CUDNN_VERSION_MIN(8, 0, 0)
if (!CUDAContext::objects().cudnn_allow_tf32_) {
math_type = CUDNN_FMA_MATH;
}
#endif
}
CUDNN_CHECK(cudnnSetConvolutionMathType(conv_desc_, math_type));
}
#endif
}
......@@ -418,8 +440,8 @@ void CuDNNConv2dGradientOp<Context>::DoRunWithType() {
// Find the appropriate algorithm if necessary
if (dW->has_name() && exhaustive_search_filter_) {
scratch =
ws()->template data<Context>({CUDNN_CONV_WORKSPACE_LIMIT_BYTES})[0];
scratch = ctx()->workspace()->template data<Context>(
{CUDNN_CONV_WORKSPACE_LIMIT_BYTES})[0];
x = X.template data<T, Context>();
dw = dW->template mutable_data<T, Context>();
auto algo =
......@@ -448,8 +470,8 @@ void CuDNNConv2dGradientOp<Context>::DoRunWithType() {
}
if (dX->has_name() && exhaustive_search_data_) {
scratch =
ws()->template data<Context>({CUDNN_CONV_WORKSPACE_LIMIT_BYTES})[0];
scratch = ctx()->workspace()->template data<Context>(
{CUDNN_CONV_WORKSPACE_LIMIT_BYTES})[0];
w = W.template data<T, Context>();
dx = dX->template mutable_data<T, Context>();
auto algo = data_algo_cache_.get(X.dims(), W.dims(), compute_type_, [&]() {
......@@ -500,7 +522,7 @@ void CuDNNConv2dGradientOp<Context>::DoRunWithType() {
// Alloc the memory for workspace data
if (cudnn_ws_nbytes_ > 0) {
scratch = ws()->template data<Context>({cudnn_ws_nbytes_})[0];
scratch = ctx()->workspace()->template data<Context>({cudnn_ws_nbytes_})[0];
}
if (Output(2)->has_name()) {
......
......@@ -41,8 +41,19 @@ void CuDNNConvTranspose2dOp<Context>::SetConvDesc() {
#endif
#if CUDNN_VERSION_MIN(7, 0, 0)
CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc_, group_));
if (enable_tensor_core_) {
CUDNN_CHECK(cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH));
if (enable_tensor_core_ > 0) {
cudnnMathType_t math_type;
if (input_type == TypeMeta::Id<float16>()) {
math_type = CUDNN_TENSOR_OP_MATH;
} else {
math_type = CUDNN_DEFAULT_MATH;
#if CUDNN_VERSION_MIN(8, 0, 0)
if (!CUDAContext::objects().cudnn_allow_tf32_) {
math_type = CUDNN_FMA_MATH;
}
#endif
}
CUDNN_CHECK(cudnnSetConvolutionMathType(conv_desc_, math_type));
}
#endif
}
......@@ -146,8 +157,8 @@ void CuDNNConvTranspose2dOp<Context>::DoRunWithType() {
// Find the appropriate algorithm if necessary
if (exhaustive_search_) {
scratch =
ws()->template data<Context>({CUDNN_CONV_WORKSPACE_LIMIT_BYTES})[0];
scratch = ctx()->workspace()->template data<Context>(
{CUDNN_CONV_WORKSPACE_LIMIT_BYTES})[0];
auto algo = algo_cache_.get(X.dims(), W.dims(), compute_type_, [&]() {
int num_valid_algos;
constexpr int num_algos = CUDNN_CONV_NUM_BWD_DATA_ALGOS;
......@@ -186,7 +197,7 @@ void CuDNNConvTranspose2dOp<Context>::DoRunWithType() {
// Alloc the memory for workspace data
if (cudnn_ws_nbytes_ > 0) {
scratch = ws()->template data<Context>({cudnn_ws_nbytes_})[0];
scratch = ctx()->workspace()->template data<Context>({cudnn_ws_nbytes_})[0];
}
for (int g = 0; g < cudnn_group_; g++) {
......@@ -277,8 +288,19 @@ void CuDNNConvTranspose2dGradientOp<Context>::SetConvDesc() {
#endif
#if CUDNN_VERSION_MIN(7, 0, 0)
CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc_, group_));
if (enable_tensor_core_) {
CUDNN_CHECK(cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH));
if (enable_tensor_core_ > 0) {
cudnnMathType_t math_type;
if (input_type == TypeMeta::Id<float16>()) {
math_type = CUDNN_TENSOR_OP_MATH;
} else {
math_type = CUDNN_DEFAULT_MATH;
#if CUDNN_VERSION_MIN(8, 0, 0)
if (!CUDAContext::objects().cudnn_allow_tf32_) {
math_type = CUDNN_FMA_MATH;
}
#endif
}
CUDNN_CHECK(cudnnSetConvolutionMathType(conv_desc_, math_type));
}
#endif
}
......@@ -413,8 +435,8 @@ void CuDNNConvTranspose2dGradientOp<Context>::DoRunWithType() {
// Find the appropriate algorithm if necessary
if (dW->has_name() && exhaustive_search_filter_) {
scratch =
ws()->template data<Context>({CUDNN_CONV_WORKSPACE_LIMIT_BYTES})[0];
scratch = ctx()->workspace()->template data<Context>(
{CUDNN_CONV_WORKSPACE_LIMIT_BYTES})[0];
x = X.template data<T, Context>();
dw = dW->template mutable_data<T, Context>();
auto algo =
......@@ -443,8 +465,8 @@ void CuDNNConvTranspose2dGradientOp<Context>::DoRunWithType() {
}
if (dX->has_name() && exhaustive_search_data_) {
scratch =
ws()->template data<Context>({CUDNN_CONV_WORKSPACE_LIMIT_BYTES})[0];
scratch = ctx()->workspace()->template data<Context>(
{CUDNN_CONV_WORKSPACE_LIMIT_BYTES})[0];
w = W.template data<T, Context>();
dx = dX->template mutable_data<T, Context>();
auto algo = data_algo_cache_.get(X.dims(), W.dims(), compute_type_, [&]() {
......@@ -495,7 +517,7 @@ void CuDNNConvTranspose2dGradientOp<Context>::DoRunWithType() {
// Alloc the memory for workspace data
if (cudnn_ws_nbytes_ > 0) {
scratch = ws()->template data<Context>({cudnn_ws_nbytes_})[0];
scratch = ctx()->workspace()->template data<Context>({cudnn_ws_nbytes_})[0];
}
if (Output(2)->has_name()) {
......
......@@ -79,10 +79,11 @@ class CuDNNConv2dOp final : public Conv2dOp<Context> {
CuDNNCreateTensorDesc(&output2b_desc_);
CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_));
if (data_format() == "NCHW")
if (data_format() == "NCHW") {
format_ = CUDNN_TENSOR_NCHW;
else if (data_format() == "NHWC")
} else if (data_format() == "NHWC") {
format_ = CUDNN_TENSOR_NHWC;
}
}
USE_OPERATOR_FUNCTIONS;
USE_CONVOLUTION_FUNCTIONS;
......@@ -140,10 +141,11 @@ class CuDNNConv2dGradientOp final : public Conv2dGradientOp<Context> {
CuDNNCreateTensorDesc(&input2b_desc_);
CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_));
if (data_format() == "NCHW")
if (data_format() == "NCHW") {
format_ = CUDNN_TENSOR_NCHW;
else if (data_format() == "NHWC")
} else if (data_format() == "NHWC") {
format_ = CUDNN_TENSOR_NHWC;
}
}
USE_OPERATOR_FUNCTIONS;
USE_CONVOLUTION_FUNCTIONS;
......
......@@ -77,7 +77,8 @@ template <typename T>
void ConvOpBase<Context>::Wx(const T* x, const T* w, T* y, bool skip) {
auto* col = x;
if (!is_1x1_) {
auto* scratch = ws()->template data<T, Context>({col_dim_})[0];
auto* scratch =
ctx()->workspace()->template data<T, Context>({col_dim_})[0];
if (!skip) Im2Col(x, scratch);
col = scratch;
}
......@@ -127,7 +128,9 @@ void ConvOpBase<Context>::Pb(const T* bias, T* y) {
template <class Context>
template <typename T>
void ConvOpBase<Context>::Dx(const T* dy, const T* w, T* dx) {
auto* col = is_1x1_ ? dx : ws()->template data<T, Context>({col_dim_})[0];
auto* col = is_1x1_
? dx
: ctx()->workspace()->template data<T, Context>({col_dim_})[0];
for (int g = 0; g < group_; g++) {
if (data_format() == "NCHW") {
math::Gemm(
......@@ -165,7 +168,8 @@ template <typename T>
void ConvOpBase<Context>::Dw(const T* dy, const T* x, T* dw, bool accum) {
auto* col = x;
if (!is_1x1_) {
auto* scratch = ws()->template data<T, Context>({col_dim_})[0];
auto* scratch =
ctx()->workspace()->template data<T, Context>({col_dim_})[0];
Im2Col(x, scratch);
col = scratch;
}
......
......@@ -142,10 +142,11 @@ class CuDNNConvTranspose2dGradientOp final
CuDNNCreateTensorDesc(&input2b_desc_);
CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_));
if (data_format() == "NCHW")
if (data_format() == "NCHW") {
format_ = CUDNN_TENSOR_NCHW;
else if (data_format() == "NHWC")
} else if (data_format() == "NHWC") {
format_ = CUDNN_TENSOR_NHWC;
}
}
USE_OPERATOR_FUNCTIONS;
USE_CONVOLUTION_FUNCTIONS;
......
#include "dragon/operators/vision/resize_op.h"
#include "dragon/core/workspace.h"
#include "dragon/utils/math_functions.h"
#include "dragon/utils/op_kernels.h"
namespace dragon {
......@@ -175,7 +176,8 @@ template <typename T>
void ResizeGradientOp<Context>::DoRunWithTypeAndCast() {
auto* dy = Input(0).template data<T, Context>();
auto* dx = Output(0)->template mutable_data<T, Context>();
auto* scratch = ws()->template data<float, Context>({Output(0)->count()})[0];
auto* scratch = ctx()->workspace()->template data<float, Context>(
{Output(0)->count()})[0];
if (mode_ == "NEAREST") {
NearestImpl(dy, scratch);
} else if (mode_ == "LINEAR") {
......@@ -183,7 +185,7 @@ void ResizeGradientOp<Context>::DoRunWithTypeAndCast() {
} else {
LOG(FATAL) << "Unknown interpolation mode: " << mode_;
}
kernel::Cast(Output(0)->count(), scratch, dx, ctx());
math::Cast(Output(0)->count(), scratch, dx, ctx());
}
template <class Context>
......
......@@ -67,7 +67,8 @@ void RoiAlignGradientOp<Context>::DoRunWithTypeAndCast() {
auto &RoI = Input(0), &dY = Input(1);
auto* dX = Output(0)->ReshapeLike(RESTORE_INPUT_SPEC(0));
auto* scratch = ws()->template data<float, Context>({dX->count()})[0];
auto* scratch =
ctx()->workspace()->template data<float, Context>({dX->count()})[0];
math::Set(dX->count(), 0.f, scratch, ctx());
kernel::RoiAlignGrad(
dX->dim(1),
......@@ -82,7 +83,7 @@ void RoiAlignGradientOp<Context>::DoRunWithTypeAndCast() {
RoI.template data<float, Context>(),
scratch,
ctx());
kernel::Cast(
math::Cast(
dX->count(), scratch, dX->template mutable_data<T, Context>(), ctx());
}
......
......@@ -68,7 +68,8 @@ void RoiPoolGradientOp<Context>::DoRunWithTypeAndCast() {
auto &RoI = Input(0), &dY = Input(1);
auto* dX = Output(0)->ReshapeLike(RESTORE_INPUT_SPEC(0));
auto* scratch = ws()->template data<float, Context>({dX->count()})[0];
auto* scratch =
ctx()->workspace()->template data<float, Context>({dX->count()})[0];
math::Set(dX->count(), 0.f, scratch, ctx());
kernel::RoiPoolGrad(
......@@ -85,7 +86,7 @@ void RoiPoolGradientOp<Context>::DoRunWithTypeAndCast() {
scratch,
ctx());
kernel::Cast(
math::Cast(
dX->count(), scratch, dX->template mutable_data<T, Context>(), ctx());
}
......
......@@ -56,6 +56,7 @@ from dragon.core.ops import tensorbind_eager as _
from dragon.core.ops import tensorbind_symbol as _
from dragon.core.ops.array_ops import broadcast_to
from dragon.core.ops.array_ops import cast
from dragon.core.ops.array_ops import channel_affine
from dragon.core.ops.array_ops import channel_normalize
from dragon.core.ops.array_ops import channel_shuffle
from dragon.core.ops.array_ops import concat
......
......@@ -26,7 +26,6 @@ from dragon.core.ops.array_ops import sum
from dragon.core.ops.array_ops import top_k
from dragon.core.ops.math_ops import abs
from dragon.core.ops.math_ops import add
from dragon.core.ops.math_ops import affine
from dragon.core.ops.math_ops import axpby
from dragon.core.ops.math_ops import ceil
from dragon.core.ops.math_ops import clip
......
......@@ -62,7 +62,7 @@ def current_device():
return backend.cudaGetDevice()
def enable_cudnn(enabled=True, benchmark=False):
def enable_cudnn(enabled=True, benchmark=False, allow_tf32=False):
"""Enable backend to use the cuDNN library.
Parameters
......@@ -71,9 +71,11 @@ def enable_cudnn(enabled=True, benchmark=False):
Use cuDNN library or not.
benchmark : bool, optional, default=False
Select algorithms according to the benchmark or not.
allow_tf32 : bool, optional, default=False
Allow TF32 tensor core operations or not.
"""
return backend.cudaEnableDNN(enabled, benchmark)
return backend.cudaEnableDNN(enabled, benchmark, allow_tf32)
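With the extra argument wired through to `cudaEnableDNN`, TF32 can now be toggled from Python. A usage sketch, assuming this module is exposed as `dragon.cuda` like the other device helpers:

```python
import dragon

# Keep cuDNN enabled, skip exhaustive benchmarking, and allow TF32 tensor
# cores for float32 convolutions/RNNs on Ampere GPUs (requires cuDNN >= 8).
dragon.cuda.enable_cudnn(enabled=True, benchmark=False, allow_tf32=True)
```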
def get_device_capability(device_index=None):
......
......@@ -14,6 +14,8 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import atexit
from dragon import backend as _b
from dragon.core.util import nest
from dragon.core.util import six
......@@ -278,8 +280,10 @@ def _maybe_initialize():
class _MPIContext(object):
"""Context to finalize mpi under destruction."""
def __del__(self):
_b.mpiFinalize()
def __init__(self):
# Register a callback to finalize MPI
# on program exit.
atexit.register(lambda: _b.mpiFinalize())
_GLOBAL_MPI_CONTEXT = None
......
......@@ -204,6 +204,46 @@ def cast(inputs, dtype, **kwargs):
return op_lib.blend(**args)
@OpSchema.num_inputs(2, 3)
def channel_affine(inputs, axis=1, num_axes=1, **kwargs):
r"""Apply affine transformation along the channels.
.. math:: \text{out} = \text{weight} * \text{input} + \text{bias}
The range of channels to transform is given by:
.. math:: [\text{axis}, \text{axis} + \text{num\_axes})
Set ``axis`` to specify the start axis.
Setting ``num_axes`` to -1 will transform all remaining axes.
Parameters
----------
inputs : Sequence[dragon.Tensor]
The input, weight and optional bias tensor.
axis : int, optional, default=1
The start axis, can be negative.
num_axes : int, optional, default=1
The number of axes to transform.
Returns
-------
dragon.Tensor
The output tensor.
"""
args = parse_args(locals())
inplace = args.pop('inplace') if 'inplace' in args else False
op_lib = array_ops_lib.ChannelAffine
if context.executing_eagerly():
return op_lib \
.instantiate(axis=axis, num_axes=num_axes) \
.apply(inputs, inplace=inplace)
else:
return op_lib.blend(**args)
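For reference, the transformation is just a per-channel scale and shift broadcast over the trailing axes. A NumPy sketch of the default case (`axis=1`, `num_axes=1`, NCHW layout), equivalent to passing `[x, w, b]` to `channel_affine`; the shapes are illustrative:

```python
import numpy as np

x = np.random.randn(2, 3, 4, 4).astype("float32")  # NCHW input
w = np.random.randn(3).astype("float32")            # one weight per channel
b = np.random.randn(3).astype("float32")            # one bias per channel

# Equivalent to channel_affine([x, w, b], axis=1, num_axes=1):
out = w.reshape(1, 3, 1, 1) * x + b.reshape(1, 3, 1, 1)
```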
@OpSchema.num_inputs(1)
@ArgHelper.repeated_desc('perm')
def channel_normalize(
......
......@@ -57,6 +57,26 @@ class Cast(Operator):
return self.dispatch(inputs, [self.alloc()])
class ChannelAffine(Operator):
def __init__(self, key, dev, **kwargs):
super(ChannelAffine, self).__init__(key, dev, **kwargs)
self.axis = kwargs.get('axis', 1)
self.num_axes = kwargs.get('num_axes', 1)
def attributes(self):
return {
'op_type': 'ChannelAffine',
'arguments': {
'axis': self.axis,
'num_axes': self.num_axes,
}
}
def forward(self, inputs, inplace=False):
outputs = [self.alloc(inputs[0]) if inplace else self.alloc()]
return self.dispatch(inputs, outputs)
class ChannelNormalize(Operator):
def __init__(self, key, dev, **kwargs):
super(ChannelNormalize, self).__init__(key, dev, **kwargs)
......
......@@ -88,45 +88,6 @@ def add(inputs, **kwargs):
return op_lib.blend('Add', **args)
@OpSchema.num_inputs(2, 3)
def affine(inputs, axis=1, num_axes=1, **kwargs):
r"""Compute the affine transformation along the given axes.
.. math:: y = Wx + b
The range of axes is defined as:
.. math:: [\text{Axis}, \text{Axis} + \text{NumAxes})
Set ``axis`` to specific the start axis.
Set ``num_axes`` to -1 will scale all remained axes.
Parameters
----------
inputs : Sequence[dragon.Tensor]
The tensor **x**, **W** and **b**.
axis : int, optional, default=1
The start axis, can be negative.
num_axes : int, optional, default=1
The number of axes to compute.
Returns
-------
dragon.Tensor
The output tensor.
"""
args = parse_args(locals())
op_lib = math_ops_lib.Affine
if context.executing_eagerly():
return op_lib \
.instantiate(axis=axis, num_axes=num_axes) \
.apply(inputs)
else:
return op_lib.blend(**args)
@OpSchema.num_inputs(1)
def axpby(inputs, outputs=None, alpha=1., beta=1., **kwargs):
r"""Compute the element-wise addition from input to output.
......
......@@ -17,25 +17,6 @@ from __future__ import print_function
from dragon.core.framework.ops import Operator
class Affine(Operator):
def __init__(self, key, dev, **kwargs):
super(Affine, self).__init__(key, dev, **kwargs)
self.axis = kwargs.get('axis', 1)
self.num_axes = kwargs.get('num_axes', 1)
def attributes(self):
return {
'op_type': 'Affine',
'arguments': {
'axis': self.axis,
'num_axes': self.num_axes,
}
}
def forward(self, inputs):
return self.dispatch(inputs, [self.alloc()])
class Axpby(Operator):
def __init__(self, key, dev, **kwargs):
super(Axpby, self).__init__(key, dev, **kwargs)
......
......@@ -51,6 +51,21 @@ def cast_exporter(op_def, shape_dict, ws):
return node, const_tensors
@exporter.register('ChannelAffine')
def channel_affine_exporter(op_def, shape_dict, ws):
node, const_tensors = exporter.translate(**locals())
node.op_type = 'ATen' # Currently not supported in ai.onnx
helper.add_attribute(node, 'op_type', 'ChannelAffine')
for arg in op_def.arg:
if arg.name == 'axis':
helper.add_attribute(node, 'axis', arg.i)
elif arg.name == 'num_axes':
helper.add_attribute(node, 'num_axes', arg.i)
# Weights and biases
const_tensors = [helper.from_tensor(e, ws) for e in op_def.input[1:]]
return node, const_tensors
@exporter.register('ChannelNormalize')
def channel_normalize_exporter(op_def, shape_dict, ws):
node, const_tensors = exporter.translate(**locals())
......
......@@ -31,21 +31,6 @@ def add_exporter(op_def, shape_dict, ws):
return node, const_tensors
@exporter.register('Affine')
def affine_exporter(op_def, shape_dict, ws):
node, const_tensors = exporter.translate(**locals())
node.op_type = 'ATen' # Currently not supported in ai.onnx
helper.add_attribute(node, 'op_type', 'Affine')
for arg in op_def.arg:
if arg.name == 'axis':
helper.add_attribute(node, 'axis', arg.i)
elif arg.name == 'num_axes':
helper.add_attribute(node, 'num_axes', arg.i)
# Weights and biases
const_tensors = [helper.from_tensor(e, ws) for e in op_def.input[1:]]
return node, const_tensors
@exporter.register('Div')
def div_exporter(op_def, shape_dict, ws):
node, const_tensors = exporter.translate(**locals())
......
......@@ -4,11 +4,46 @@
#ifdef USE_CUDA
#include <cub/block/block_reduce.cuh>
#include <cub/device/device_reduce.cuh>
#include <cub/device/device_select.cuh>
#include <cub/iterator/counting_input_iterator.cuh>
#include "dragon/utils/device/common_cuda.h"
namespace cub {
struct SumHalf {
inline __device__ half operator()(const half& a, const half& b) const {
#if __CUDA_ARCH__ >= 530
return __hadd(a, b);
#else
return __float2half(__half2float(a) + __half2float(b));
#endif
}
};
struct MinHalf {
inline __device__ half operator()(const half& a, const half& b) const {
#if __CUDA_ARCH__ >= 530
return __hlt(a, b) ? a : b;
#else
return __half2float(a) < __half2float(b) ? a : b;
#endif
}
};
struct MaxHalf {
inline __device__ half operator()(const half& a, const half& b) const {
#if __CUDA_ARCH__ >= 530
return __hgt(a, b) ? a : b;
#else
return __half2float(a) > __half2float(b) ? a : b;
#endif
}
};
} // namespace cub
namespace dragon {
template <typename T>
......
......@@ -6,6 +6,7 @@
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/reduce.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
......