Commit 79a52211 by Ting PAN

Dismantle FP16 & TensorCore Support

1 parent bab9931e
Showing with 1886 additions and 1292 deletions
...@@ -133,6 +133,9 @@ class CUDAContext { ...@@ -133,6 +133,9 @@ class CUDAContext {
} else { } else {
DeviceGuard gurad(gpu_id_); DeviceGuard gurad(gpu_id_);
CUBLAS_CHECK(cublasCreate_v2(&handle)); CUBLAS_CHECK(cublasCreate_v2(&handle));
#if CUDA_VERSION >= 9000
if (TENSOR_CORE_AVAILABLE()) cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);
#endif
return handle; return handle;
} }
} }
......
...@@ -9,17 +9,17 @@ ...@@ -9,17 +9,17 @@
// //
// ------------------------------------------------------------ // ------------------------------------------------------------
#ifndef DRAGON_OPERATORS_ARITHMETIC_SCALE_OP_H_ #ifndef DRAGON_OPERATORS_ARITHMETIC_AFFINE_OP_H_
#define DRAGON_OPERATORS_ARITHMETIC_SCALE_OP_H_ #define DRAGON_OPERATORS_ARITHMETIC_AFFINE_OP_H_
#include "core/operator.h" #include "core/operator.h"
namespace dragon { namespace dragon {
template <class Context> template <class Context>
class ScaleOp : public Operator<Context> { class AffineOp : public Operator<Context> {
public: public:
ScaleOp(const OperatorDef& op_def, Workspace* ws) AffineOp(const OperatorDef& op_def, Workspace* ws)
: Operator<Context>(op_def, ws), : Operator<Context>(op_def, ws),
axis(OperatorBase::GetSingleArg<int>("axis", 1)), axis(OperatorBase::GetSingleArg<int>("axis", 1)),
num_axes(OperatorBase::GetSingleArg<int>("num_axes", 1)) {} num_axes(OperatorBase::GetSingleArg<int>("num_axes", 1)) {}
...@@ -30,14 +30,14 @@ class ScaleOp : public Operator<Context> { ...@@ -30,14 +30,14 @@ class ScaleOp : public Operator<Context> {
protected: protected:
TIndex axis, start_axis, num_axes; TIndex axis, start_axis, num_axes;
TIndex inner_dim; TIndex outer_dim, scale_dim, inner_dim;
Tensor* bias_multiplier; Tensor* bias_multiplier;
}; };
template <class Context> template <class Context>
class ScaleGradientOp final : public Operator<Context> { class AffineGradientOp final : public Operator<Context> {
public: public:
ScaleGradientOp(const OperatorDef& op_def, Workspace* ws) AffineGradientOp(const OperatorDef& op_def, Workspace* ws)
: Operator<Context>(op_def, ws), : Operator<Context>(op_def, ws),
axis(OperatorBase::GetSingleArg<int>("axis", 1)), axis(OperatorBase::GetSingleArg<int>("axis", 1)),
num_axes(OperatorBase::GetSingleArg<int>("num_axes", -1)) {} num_axes(OperatorBase::GetSingleArg<int>("num_axes", -1)) {}
...@@ -57,4 +57,4 @@ class ScaleGradientOp final : public Operator<Context> { ...@@ -57,4 +57,4 @@ class ScaleGradientOp final : public Operator<Context> {
} // namespace dragon } // namespace dragon
#endif // DRAGON_OPERATORS_ARITHMETIC_SCALE_OP_H_ #endif // DRAGON_OPERATORS_ARITHMETIC_AFFINE_OP_H_
\ No newline at end of file \ No newline at end of file
...@@ -33,7 +33,7 @@ class AccuracyOp final: public Operator<Context> { ...@@ -33,7 +33,7 @@ class AccuracyOp final: public Operator<Context> {
USE_OPERATOR_FUNCTIONS(Context); USE_OPERATOR_FUNCTIONS(Context);
void RunOnDevice() override; void RunOnDevice() override;
template <typename T> void RunWithType(); template <typename Tx, typename Ty> void RunWithType();
protected: protected:
TIndex top_k, axis, outer_dim, inner_dim, num_classes; TIndex top_k, axis, outer_dim, inner_dim, num_classes;
......
...@@ -9,8 +9,8 @@ ...@@ -9,8 +9,8 @@
// //
// ------------------------------------------------------------- // -------------------------------------------------------------
#ifndef DRAGON_OPERATORS_MISC_GRADIENT_GENERATE_OP_H_ #ifndef DRAGON_OPERATORS_MISC_GRADIENT_OP_H_
#define DRAGON_OPERATORS_MISC_GRADIENT_GENERATE_OP_H_ #define DRAGON_OPERATORS_MISC_GRADIENT_OP_H_
#include "core/operator.h" #include "core/operator.h"
...@@ -62,4 +62,4 @@ class StopGradientOp final : public Operator<Context> { ...@@ -62,4 +62,4 @@ class StopGradientOp final : public Operator<Context> {
} // namespace dragon } // namespace dragon
#endif // DRAGON_OPERATORS_MISC_GRADIENT_GENERATE_OP_H_ #endif // DRAGON_OPERATORS_MISC_GRADIENT_OP_H_
\ No newline at end of file \ No newline at end of file
...@@ -56,11 +56,13 @@ template <class Context> ...@@ -56,11 +56,13 @@ template <class Context>
class CuDNNConv2dOp : public Conv2dOp<Context> { class CuDNNConv2dOp : public Conv2dOp<Context> {
public: public:
CuDNNConv2dOp(const OperatorDef& def, Workspace* ws) CuDNNConv2dOp(const OperatorDef& def, Workspace* ws)
: Conv2dOp<Context>(def, ws) { : Conv2dOp<Context>(def, ws), enable_tensor_core(true) {
#if CUDNN_VERSION_MIN(7, 0, 0) #if CUDNN_VERSION_MIN(7, 0, 0)
cudnn_group = 1; cudnn_group = 1;
enable_tensor_core &= TENSOR_CORE_AVAILABLE();
#else #else
cudnn_group = this->group; cudnn_group = this->group;
enable_tensor_core = false;
#endif #endif
handle = new cudnnHandle_t[cudnn_group]; handle = new cudnnHandle_t[cudnn_group];
stream = new cudaStream_t[cudnn_group]; stream = new cudaStream_t[cudnn_group];
...@@ -109,17 +111,20 @@ class CuDNNConv2dOp : public Conv2dOp<Context> { ...@@ -109,17 +111,20 @@ class CuDNNConv2dOp : public Conv2dOp<Context> {
size_t workspace_fwd_data_size; size_t workspace_fwd_data_size;
TIndex bias_offset, cudnn_group; TIndex bias_offset, cudnn_group;
vector<TIndex> input_dims; vector<TIndex> input_dims;
bool enable_tensor_core;
}; };
template <class Context> template <class Context>
class CuDNNConv2dGradientOp : public Conv2dGradientOp<Context> { class CuDNNConv2dGradientOp : public Conv2dGradientOp<Context> {
public: public:
CuDNNConv2dGradientOp(const OperatorDef& def, Workspace* ws) CuDNNConv2dGradientOp(const OperatorDef& def, Workspace* ws)
: Conv2dGradientOp<Context>(def, ws) { : Conv2dGradientOp<Context>(def, ws), enable_tensor_core(true) {
#if CUDNN_VERSION_MIN(7, 0, 0) #if CUDNN_VERSION_MIN(7, 0, 0)
cudnn_group = 1; cudnn_group = 1;
enable_tensor_core &= TENSOR_CORE_AVAILABLE();
#else #else
cudnn_group = this->group; cudnn_group = this->group;
enable_tensor_core = false;
#endif #endif
handle = new cudnnHandle_t[cudnn_group * 3]; handle = new cudnnHandle_t[cudnn_group * 3];
stream = new cudaStream_t[cudnn_group * 3]; stream = new cudaStream_t[cudnn_group * 3];
...@@ -168,6 +173,7 @@ class CuDNNConv2dGradientOp : public Conv2dGradientOp<Context> { ...@@ -168,6 +173,7 @@ class CuDNNConv2dGradientOp : public Conv2dGradientOp<Context> {
size_t workspace_bwd_filter_size, workspace_bwd_data_size; size_t workspace_bwd_filter_size, workspace_bwd_data_size;
TIndex bias_offset, cudnn_group; TIndex bias_offset, cudnn_group;
vector<TIndex> input_dims; vector<TIndex> input_dims;
bool enable_tensor_core;
}; };
#endif // WITH_CUDNN #endif // WITH_CUDNN
......
...@@ -60,11 +60,13 @@ template <class Context> ...@@ -60,11 +60,13 @@ template <class Context>
class CuDNNConv2dTransposeOp : public Conv2dTransposeOp<Context> { class CuDNNConv2dTransposeOp : public Conv2dTransposeOp<Context> {
public: public:
CuDNNConv2dTransposeOp(const OperatorDef& def, Workspace* ws) CuDNNConv2dTransposeOp(const OperatorDef& def, Workspace* ws)
: Conv2dTransposeOp<Context>(def, ws) { : Conv2dTransposeOp<Context>(def, ws), enable_tensor_core(true) {
#if CUDNN_VERSION_MIN(7, 0, 0) #if CUDNN_VERSION_MIN(7, 0, 0)
cudnn_group = 1; cudnn_group = 1;
enable_tensor_core &= TENSOR_CORE_AVAILABLE();
#else #else
cudnn_group = this->group; cudnn_group = this->group;
enable_tensor_core = false;
#endif #endif
handle = new cudnnHandle_t[cudnn_group]; handle = new cudnnHandle_t[cudnn_group];
stream = new cudaStream_t[cudnn_group]; stream = new cudaStream_t[cudnn_group];
...@@ -112,17 +114,20 @@ class CuDNNConv2dTransposeOp : public Conv2dTransposeOp<Context> { ...@@ -112,17 +114,20 @@ class CuDNNConv2dTransposeOp : public Conv2dTransposeOp<Context> {
size_t workspace_fwd_data_size; size_t workspace_fwd_data_size;
TIndex bias_offset, cudnn_group; TIndex bias_offset, cudnn_group;
vector<TIndex> input_dims; vector<TIndex> input_dims;
bool enable_tensor_core;
}; };
template <class Context> template <class Context>
class CuDNNConv2dTransposeGradientOp : public Conv2dTransposeGradientOp<Context> { class CuDNNConv2dTransposeGradientOp : public Conv2dTransposeGradientOp<Context> {
public: public:
CuDNNConv2dTransposeGradientOp(const OperatorDef& def, Workspace* ws) CuDNNConv2dTransposeGradientOp(const OperatorDef& def, Workspace* ws)
: Conv2dTransposeGradientOp<Context>(def, ws) { : Conv2dTransposeGradientOp<Context>(def, ws), enable_tensor_core(true) {
#if CUDNN_VERSION_MIN(7, 0, 0) #if CUDNN_VERSION_MIN(7, 0, 0)
cudnn_group = 1; cudnn_group = 1;
enable_tensor_core &= TENSOR_CORE_AVAILABLE();
#else #else
cudnn_group = this->group; cudnn_group = this->group;
enable_tensor_core = false;
#endif #endif
handle = new cudnnHandle_t[cudnn_group * 3]; handle = new cudnnHandle_t[cudnn_group * 3];
stream = new cudaStream_t[cudnn_group * 3]; stream = new cudaStream_t[cudnn_group * 3];
...@@ -171,6 +176,7 @@ public: ...@@ -171,6 +176,7 @@ public:
size_t workspace_bwd_filter_size, workspace_bwd_data_size; size_t workspace_bwd_filter_size, workspace_bwd_data_size;
TIndex bias_offset, cudnn_group; TIndex bias_offset, cudnn_group;
vector<TIndex> input_dims; vector<TIndex> input_dims;
bool enable_tensor_core;
}; };
#endif // WITH_CUDNN #endif // WITH_CUDNN
......
...@@ -26,9 +26,9 @@ template<> inline int dragon_cast<int, float>(float val) { ...@@ -26,9 +26,9 @@ template<> inline int dragon_cast<int, float>(float val) {
return static_cast<int>(val); return static_cast<int>(val);
} }
template<> inline float dragon_cast<float, float>(float val) { template<> inline float dragon_cast<float, float>(float val) { return val; }
return val;
} template<> inline float16 dragon_cast<float16, float16>(float16 val) { return val; }
template<> inline float16 dragon_cast<float16, float>(float val) { template<> inline float16 dragon_cast<float16, float>(float val) {
float16 ret; float16 ret;
......
...@@ -77,7 +77,7 @@ inline int GET_BLOCKS(const int N) { ...@@ -77,7 +77,7 @@ inline int GET_BLOCKS(const int N) {
#define __hdiv hdiv #define __hdiv hdiv
#endif #endif
inline int NUM_DEVICES() { inline int CUDA_NUM_DEVICES() {
static int count = -1; static int count = -1;
if (count < 0) { if (count < 0) {
auto err = cudaGetDeviceCount(&count); auto err = cudaGetDeviceCount(&count);
...@@ -86,21 +86,47 @@ inline int NUM_DEVICES() { ...@@ -86,21 +86,47 @@ inline int NUM_DEVICES() {
return count; return count;
} }
inline int CURRENT_DEVICE() { inline int CUDA_CURRENT_DEVICE() {
int gpu_id; int gpu_id;
cudaGetDevice(&gpu_id); cudaGetDevice(&gpu_id);
return gpu_id; return gpu_id;
} }
inline int POINTER_DEVICE(const void* ptr) { inline int CUDA_POINTER_DEVICE(const void* ptr) {
cudaPointerAttributes attr; cudaPointerAttributes attr;
CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr)); CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr));
return attr.device; return attr.device;
} }
struct CUDADeviceProps {
CUDADeviceProps() : props(CUDA_NUM_DEVICES()) {
for (int i = 0; i < CUDA_NUM_DEVICES(); ++i)
CUDA_CHECK(cudaGetDeviceProperties(&props[i], i));
}
vector<cudaDeviceProp> props;
};
inline const cudaDeviceProp& GetDeviceProperty(const int device_id) {
static CUDADeviceProps props;
CHECK_LT(device_id, (int)props.props.size())
<< "Invalid device id: " << device_id
<< "\nDetected " << props.props.size() << " eligible cuda devices.";
return props.props[device_id];
}
inline bool TENSOR_CORE_AVAILABLE() {
#if CUDA_VERSION < 9000
return false;
#else
int device = CUDA_CURRENT_DEVICE();
auto& prop = GetDeviceProperty(device);
return prop.major >= 7;
#endif
}
class DeviceGuard { class DeviceGuard {
public: public:
DeviceGuard(int newDevice) : previous_(CURRENT_DEVICE()) { DeviceGuard(int newDevice) : previous_(CUDA_CURRENT_DEVICE()) {
if (previous_ != newDevice) if (previous_ != newDevice)
CUDA_CHECK(cudaSetDevice(newDevice)); CUDA_CHECK(cudaSetDevice(newDevice));
} }
......
...@@ -150,6 +150,28 @@ void Tanh(const int count, const T* x, T* y); ...@@ -150,6 +150,28 @@ void Tanh(const int count, const T* x, T* y);
template <typename T, class Context> template <typename T, class Context>
void TanhGrad(const int count, const T* dy, const T* y, T* dx); void TanhGrad(const int count, const T* dy, const T* y, T* dx);
/******************** arithmetic.affine ********************/
template <typename T, class Context>
void Affine(const int count,
const int outer_dim,
const int scale_dim,
const int inner_dim,
const T* x,
const T* alpha,
const T* beta,
const T* beta_multiplier,
T* y);
template <typename T, class Context>
void AffineGrad(const int count,
const int outer_dim,
const int scale_dim,
const int inner_dim,
const T* dy,
const T* alpha,
T* dx);
/******************** arithmetic.bias_add ********************/ /******************** arithmetic.bias_add ********************/
template <typename T, class Context> template <typename T, class Context>
...@@ -172,24 +194,6 @@ void Clip(const int count, ...@@ -172,24 +194,6 @@ void Clip(const int count,
T* mask, T* mask,
T* y); T* y);
/******************** arithmetic.scale ********************/
template <typename T, class Context>
void Scale(const int axis,
Tensor* x,
Tensor* gamma,
Tensor* beta,
Tensor* BMul,
Tensor* y);
template <typename T, class Context>
void ScaleGrad(const int axis, Tensor* dy, Tensor* gamma, Tensor* dx);
/******************** cast.float2half ********************/
template <typename T, class Context>
void Float2Half(const int count, const float* x, float16* y);
/******************** control_flow.compare ********************/ /******************** control_flow.compare ********************/
template <typename T, class Context> template <typename T, class Context>
...@@ -286,7 +290,7 @@ void SparseSoftmaxFocalLossGrad(const int count, ...@@ -286,7 +290,7 @@ void SparseSoftmaxFocalLossGrad(const int count,
Tensor* ignore, Tensor* ignore,
T* dx); T* dx);
/******************** misc.dtype ********************/ /******************** misc.astype ********************/
template <typename Ta, typename Tb, class Context> template <typename Ta, typename Tb, class Context>
void TypeA2B(const int count, const Ta* a, Tb* b); void TypeA2B(const int count, const Ta* a, Tb* b);
......
...@@ -88,7 +88,7 @@ std::string CreateGraph(const std::string& graph_file, const Device& device, Wor ...@@ -88,7 +88,7 @@ std::string CreateGraph(const std::string& graph_file, const Device& device, Wor
// overwritten device options // overwritten device options
DeviceOption* device_option = meta_graph.mutable_device_option(); DeviceOption* device_option = meta_graph.mutable_device_option();
device_option->set_device_type((DeviceType)device.device_type()); device_option->set_device_type((DeviceType)device.device_type());
device_option->set_gpu_id(device.device_id()); device_option->set_device_id(device.device_id());
device_option->set_engine("CUDNN"); device_option->set_engine("CUDNN");
dragon::GraphBase* graph = ws->CreateGraph(meta_graph); dragon::GraphBase* graph = ws->CreateGraph(meta_graph);
if (!graph) LOG(FATAL) << "Can not create the graph."; if (!graph) LOG(FATAL) << "Can not create the graph.";
......
...@@ -80,7 +80,8 @@ def FromShape(shape, dtype='float32', ctx=None, name=None): ...@@ -80,7 +80,8 @@ def FromShape(shape, dtype='float32', ctx=None, name=None):
The wrapper of ``TensorFromShapeCC``. The wrapper of ``TensorFromShapeCC``.
""" """
tensor = Tensor(name) if name is None: tensor = Tensor(name=name)
else: tensor = Tensor(_name=name)
if not isinstance(shape, (tuple, list)): if not isinstance(shape, (tuple, list)):
raise TypeError('The shape should be a tuple or list.') raise TypeError('The shape should be a tuple or list.')
if ctx is None: ctx = MakeDeviceOption(0, 0) # CPUContext if ctx is None: ctx = MakeDeviceOption(0, 0) # CPUContext
...@@ -134,7 +135,8 @@ def FromPyArray(array, name=None): ...@@ -134,7 +135,8 @@ def FromPyArray(array, name=None):
The wrapper of ``TensorFromPyArrayCC``. The wrapper of ``TensorFromPyArrayCC``.
""" """
tensor = Tensor(name) if name is None: tensor = Tensor(name=name)
else: tensor = Tensor(_name=name)
if not isinstance(array, np.ndarray): if not isinstance(array, np.ndarray):
raise TypeError('The given nd-array should be numpy.ndarray.') raise TypeError('The given nd-array should be numpy.ndarray.')
TensorFromPyArrayCC(_stringify_tensor(tensor), array) TensorFromPyArrayCC(_stringify_tensor(tensor), array)
...@@ -258,15 +260,18 @@ def GetTensorInfo(tensor, stream=1): ...@@ -258,15 +260,18 @@ def GetTensorInfo(tensor, stream=1):
The string info contains following fields: The string info contains following fields:
stream #1: ``dtype``, ``from_numpy``, ``init`` stream #1: ``dtype``, ``from_numpy``, ``init``, ``mem``, ``mem_at``, ``device_id``
``mem``, ``mem_at``, ``device_id``
stream #2: ``shape`` stream #2: ``shape``
stream #3: #1 + #2 stream #3: #1 + #2
Parameters Parameters
---------- ----------
tensor : Tensor or str tensor : Tensor or str
The input tensor. The input tensor.
stream : int
The stream id.
Returns Returns
------- -------
......
...@@ -14,7 +14,6 @@ Common ...@@ -14,7 +14,6 @@ Common
operators/ndarray operators/ndarray
operators/control_flow operators/control_flow
operators/misc operators/misc
operators/cast
operators/mpi operators/mpi
=================================== ===================================================================== =================================== =====================================================================
...@@ -26,7 +25,6 @@ List Brief ...@@ -26,7 +25,6 @@ List Brief
`dragon.operators.ndarray`_ The ndarray operators. `dragon.operators.ndarray`_ The ndarray operators.
`dragon.operators.control_flow`_ The control flow operators. `dragon.operators.control_flow`_ The control flow operators.
`dragon.operators.misc`_ The misc operators. `dragon.operators.misc`_ The misc operators.
`dragon.operators.cast`_ The cast operators.
`dragon.operators.mpi`_ The MPI operators. `dragon.operators.mpi`_ The MPI operators.
=================================== ===================================================================== =================================== =====================================================================
...@@ -95,7 +93,6 @@ List Brief ...@@ -95,7 +93,6 @@ List Brief
.. _dragon.operators.ndarray: operators/ndarray.html .. _dragon.operators.ndarray: operators/ndarray.html
.. _dragon.operators.control_flow: operators/control_flow.html .. _dragon.operators.control_flow: operators/control_flow.html
.. _dragon.operators.misc: operators/misc.html .. _dragon.operators.misc: operators/misc.html
.. _dragon.operators.cast: operators/cast.html
.. _dragon.operators.mpi: operators/mpi.html .. _dragon.operators.mpi: operators/mpi.html
.. _dragon.operators.activation: operators/activation.html .. _dragon.operators.activation: operators/activation.html
.. _dragon.operators.vision: operators/vision.html .. _dragon.operators.vision: operators/vision.html
......
...@@ -101,7 +101,7 @@ List Brief ...@@ -101,7 +101,7 @@ List Brief
`Matmul`_ Matrix Multiplication. `Matmul`_ Matrix Multiplication.
`InnerProduct`_ InnerProduct Function. `InnerProduct`_ InnerProduct Function.
`Eltwise`_ Eltwise Sum/Product Function. `Eltwise`_ Eltwise Sum/Product Function.
`Scale`_ Scale Function. `Affine`_ Calculate ``y = Ax + b`` along the given range of axes.
`GramMatrix`_ Calculate the gram matrix, introduced by `[Gatys et.al, 2016] <https://www.cv-foundation.org/openaccess/content_cvpr_2016/papers/Gatys_Image_Style_Transfer_CVPR_2016_paper.pdf>`_. `GramMatrix`_ Calculate the gram matrix, introduced by `[Gatys et.al, 2016] <https://www.cv-foundation.org/openaccess/content_cvpr_2016/papers/Gatys_Image_Style_Transfer_CVPR_2016_paper.pdf>`_.
=============== ====================================================================== =============== ======================================================================
...@@ -244,7 +244,7 @@ List Brief ...@@ -244,7 +244,7 @@ List Brief
.. _Dot: operators/arithmetic.html#dragon.operators.arithmetic.Dot .. _Dot: operators/arithmetic.html#dragon.operators.arithmetic.Dot
.. _InnerProduct: operators/arithmetic.html#dragon.operators.arithmetic.InnerProduct .. _InnerProduct: operators/arithmetic.html#dragon.operators.arithmetic.InnerProduct
.. _Eltwise: operators/arithmetic.html#dragon.operators.arithmetic.Eltwise .. _Eltwise: operators/arithmetic.html#dragon.operators.arithmetic.Eltwise
.. _Scale: operators/arithmetic.html#dragon.operators.arithmetic.Scale .. _Affine: operators/arithmetic.html#dragon.operators.arithmetic.Affine
.. _GramMatrix: operators/arithmetic.html#dragon.operators.arithmetic.GramMatrix .. _GramMatrix: operators/arithmetic.html#dragon.operators.arithmetic.GramMatrix
.. _BatchNorm: operators/norm.html#dragon.operators.norm.BatchNorm .. _BatchNorm: operators/norm.html#dragon.operators.norm.BatchNorm
......
...@@ -39,9 +39,9 @@ class DataBatch(object): ...@@ -39,9 +39,9 @@ class DataBatch(object):
source : str source : str
The path of database. The path of database.
multiple_nodes: boolean multiple_nodes: boolean
Whether to split data for multiple parallel nodes. Whether to split data for multiple parallel nodes. Default is ``False``.
shuffle : boolean shuffle : boolean
Whether to shuffle the data. Whether to shuffle the data. Default is ``False``.
num_chunks : int num_chunks : int
The number of chunks to split. Default is ``2048``. The number of chunks to split. Default is ``2048``.
chunk_size : int chunk_size : int
......
...@@ -36,11 +36,9 @@ class DataReader(Process): ...@@ -36,11 +36,9 @@ class DataReader(Process):
source : str source : str
The path of database. The path of database.
multiple_nodes: boolean multiple_nodes: boolean
Whether to split data for multiple parallel nodes. Whether to split data for multiple parallel nodes. Default is ``False``.
shuffle : boolean shuffle : boolean
Whether to shuffle the data. Whether to shuffle the data. Default is ``False``.
instance_chunk : boolean
Whether to limit each chunk with at most 1 instance.
num_chunks : int num_chunks : int
The number of chunks to split. Default is ``2048``. The number of chunks to split. Default is ``2048``.
chunk_size : int chunk_size : int
...@@ -51,7 +49,6 @@ class DataReader(Process): ...@@ -51,7 +49,6 @@ class DataReader(Process):
self._source = kwargs.get('source', '') self._source = kwargs.get('source', '')
self._multiple_nodes = kwargs.get('multiple_nodes', False) self._multiple_nodes = kwargs.get('multiple_nodes', False)
self._use_shuffle = kwargs.get('shuffle', False) self._use_shuffle = kwargs.get('shuffle', False)
self._use_instance_chunk = kwargs.get('instance_chunk', False)
self._num_chunks = kwargs.get('num_chunks', 2048) self._num_chunks = kwargs.get('num_chunks', 2048)
self._chunk_size = kwargs.get('chunk_size', -1) self._chunk_size = kwargs.get('chunk_size', -1)
...@@ -172,12 +169,13 @@ class DataReader(Process): ...@@ -172,12 +169,13 @@ class DataReader(Process):
self._db_zfill = int(self._db.get('zfill')) self._db_zfill = int(self._db.get('zfill'))
self._epoch_size = int(self._db_size / self._num_parts + 1) self._epoch_size = int(self._db_size / self._num_parts + 1)
if self._use_instance_chunk: if self._use_shuffle:
self._chunk_size = 1 if self._chunk_size == 1:
# each chunk has at most 1 record [For Fully Shuffle]
self._num_shuffle_parts = int(self._db_size / self._chunk_size / self._num_parts) + 1 self._num_shuffle_parts = int(self._db_size / self._chunk_size / self._num_parts) + 1
else: else:
# search a optimal chunk size by chunks if self._use_shuffle and self._chunk_size == -1:
if self._chunk_size == -1: # search a optimal chunk size by chunks [For Chunk Shuffle]
max_chunk_size = self._db._total_size / ((self._num_chunks * (1 << 20))) max_chunk_size = self._db._total_size / ((self._num_chunks * (1 << 20)))
min_chunk_size = 1 min_chunk_size = 1
while min_chunk_size * 2 < max_chunk_size: min_chunk_size *= 2 while min_chunk_size * 2 < max_chunk_size: min_chunk_size *= 2
...@@ -185,6 +183,13 @@ class DataReader(Process): ...@@ -185,6 +183,13 @@ class DataReader(Process):
self._num_shuffle_parts = int(math.ceil(self._db._total_size * 1.1 / self._num_shuffle_parts = int(math.ceil(self._db._total_size * 1.1 /
(self._num_parts * self._chunk_size << 20))) (self._num_parts * self._chunk_size << 20)))
self._chunk_size = int(self._db_size / self._num_shuffle_parts / self._num_parts + 1) self._chunk_size = int(self._db_size / self._num_shuffle_parts / self._num_parts + 1)
else:
# each chunk has at most K records [For Multiple Nodes]
# note that if ``shuffle`` and ``multiple_nodes`` are all ``False``,
# ``chunk_size`` and ``num_shuffle_parts`` are meaningless
self._chunk_size = int(self._db_size / self._num_parts) + 1
self._num_shuffle_parts = 1
self._perm = np.arange(self._num_shuffle_parts) self._perm = np.arange(self._num_shuffle_parts)
# init env # init env
......
...@@ -428,8 +428,8 @@ def Sqrt(inputs, **kwargs): ...@@ -428,8 +428,8 @@ def Sqrt(inputs, **kwargs):
return output return output
def Scale(inputs, axis=1, num_axes=1, **kwargs): def Affine(inputs, axis=1, num_axes=1, **kwargs):
"""Scale Function. """Calculate ``y = Ax + b`` along the given range of axes.
The number of inputs vary from ``2`` to ``3`` (Without or With ``bias``). The number of inputs vary from ``2`` to ``3`` (Without or With ``bias``).
...@@ -457,7 +457,7 @@ def Scale(inputs, axis=1, num_axes=1, **kwargs): ...@@ -457,7 +457,7 @@ def Scale(inputs, axis=1, num_axes=1, **kwargs):
CheckInputs(inputs, 2, 3) CheckInputs(inputs, 2, 3)
arguments = ParseArguments(locals()) arguments = ParseArguments(locals())
output = Tensor.CreateOperator(nout=1, op_type='Scale', **arguments) output = Tensor.CreateOperator(nout=1, op_type='Affine', **arguments)
if inputs[0].shape is not None: if inputs[0].shape is not None:
output.shape = inputs[0].shape[:] output.shape = inputs[0].shape[:]
......
...@@ -90,7 +90,7 @@ Square = math.Square ...@@ -90,7 +90,7 @@ Square = math.Square
Sqrt = math.Sqrt Sqrt = math.Sqrt
InnerProduct = math.InnerProduct InnerProduct = math.InnerProduct
Eltwise = math.Eltwise Eltwise = math.Eltwise
Scale = math.Scale Affine = math.Affine
GramMatrix = math.GramMatrix GramMatrix = math.GramMatrix
# normalization # normalization
......
...@@ -494,7 +494,7 @@ class ScaleLayer(Layer): ...@@ -494,7 +494,7 @@ class ScaleLayer(Layer):
def Setup(self, bottom): def Setup(self, bottom):
super(ScaleLayer, self).Setup(bottom) super(ScaleLayer, self).Setup(bottom)
return ops.Scale(bottom + [blob['data'] for blob in self._blobs], **self._param) return ops.Affine(bottom + [blob['data'] for blob in self._blobs], **self._param)
class BNLayer(Layer): class BNLayer(Layer):
...@@ -606,10 +606,12 @@ class NormalizeLayer(Layer): ...@@ -606,10 +606,12 @@ class NormalizeLayer(Layer):
def __init__(self, LayerParameter): def __init__(self, LayerParameter):
super(NormalizeLayer, self).__init__(LayerParameter) super(NormalizeLayer, self).__init__(LayerParameter)
param = LayerParameter.normalize_param param = LayerParameter.normalize_param
self._l2norm_param = {'axis': 1, self._l2norm_param = {
'axis': 1,
'num_axes': -1 if param.across_spatial else 1, 'num_axes': -1 if param.across_spatial else 1,
'eps': param.eps} 'eps': param.eps}
self._scale_param = {'axis': 1, self._scale_param = {
'axis': 1,
'num_axes': 0 if param.channel_shared else 1} 'num_axes': 0 if param.channel_shared else 1}
scope = LayerParameter.name scope = LayerParameter.name
scale = Tensor(scope + '/param:0') scale = Tensor(scope + '/param:0')
...@@ -622,7 +624,7 @@ class NormalizeLayer(Layer): ...@@ -622,7 +624,7 @@ class NormalizeLayer(Layer):
def Setup(self, bottom): def Setup(self, bottom):
super(NormalizeLayer, self).Setup(bottom) super(NormalizeLayer, self).Setup(bottom)
norm_out = [ops.L2Norm(bottom[0], **self._l2norm_param)] norm_out = [ops.L2Norm(bottom[0], **self._l2norm_param)]
scale_out = ops.Scale(norm_out + [blob['data'] for blob in self.scale_blobs], scale_out = ops.Affine(norm_out + [blob['data'] for blob in self.scale_blobs],
**self._scale_param) **self._scale_param)
return scale_out return scale_out
......
...@@ -71,7 +71,7 @@ class DataLayer(Layer): ...@@ -71,7 +71,7 @@ class DataLayer(Layer):
'min_random_scale': transform_param.min_random_scale, 'min_random_scale': transform_param.min_random_scale,
'max_random_scale': transform_param.max_random_scale, 'max_random_scale': transform_param.max_random_scale,
'shuffle': parallel_param.shuffle, 'shuffle': parallel_param.shuffle,
'node_step': parallel_param.node_step, 'multiple_nodes': parallel_param.multiple_nodes,
'partition': parallel_param.partition} 'partition': parallel_param.partition}
def Setup(self, bottom): def Setup(self, bottom):
......
...@@ -112,6 +112,9 @@ class Module(object): ...@@ -112,6 +112,9 @@ class Module(object):
module.state_dict(destination, prefix + name + '.', to_numpy=to_numpy) module.state_dict(destination, prefix + name + '.', to_numpy=to_numpy)
return destination return destination
def _load_state_dict_key_mismatch(self, full_name, name, is_missing):
pass
def load_state_dict(self, state_dict, strict=True): def load_state_dict(self, state_dict, strict=True):
logger.info('Load the state dict from numpy arrays.') logger.info('Load the state dict from numpy arrays.')
def submodule_key_mismatch(full_name, is_missing): def submodule_key_mismatch(full_name, is_missing):
...@@ -159,7 +162,7 @@ class Module(object): ...@@ -159,7 +162,7 @@ class Module(object):
', '.join('"{}"'.format(k) for k in unexpected)) ', '.join('"{}"'.format(k) for k in unexpected))
if len(missing) > 0: if len(missing) > 0:
error_msg += 'Missing key(s) in state_dict: {}. '.format( error_msg += 'Missing key(s) in state_dict: {}. '.format(
', '.join('"{}"'.format(k) for k in unexpected)) ', '.join('"{}"'.format(k) for k in missing))
if len(error_msg) > 0: if len(error_msg) > 0:
raise KeyError(error_msg) raise KeyError(error_msg)
...@@ -210,9 +213,9 @@ class Module(object): ...@@ -210,9 +213,9 @@ class Module(object):
def __call__(self, *args, **kwargs): def __call__(self, *args, **kwargs):
with dg.name_scope(get_module_name(self)): with dg.name_scope(get_module_name(self)):
return self.forward(*args) return self.forward(*args, **kwargs)
def forward(self, *inputs): def forward(self, *inputs, **kwargs):
raise NotImplementedError('The base module can not be called.') raise NotImplementedError('The base module can not be called.')
def name_scope(self, remove_separator=True): def name_scope(self, remove_separator=True):
......
...@@ -16,12 +16,13 @@ as it will be reused by the ``torch.ops``. ...@@ -16,12 +16,13 @@ as it will be reused by the ``torch.ops``.
from dragon.vm.torch.module import Module from dragon.vm.torch.module import Module
from dragon.vm.torch.tensor import Parameter from dragon.vm.torch.tensor import Parameter
from .modules.conv import Conv2d from .modules.conv import Conv2d, ConvTranspose2d
from .modules.pooling import MaxPool2d, AvgPool2d from .modules.pooling import MaxPool2d, AvgPool2d
from .modules.activation import ReLU, Softmax from .modules.activation import ReLU, Sigmoid, Softmax
from .modules.linear import Linear from .modules.linear import Linear
from .modules.loss import CrossEntropyLoss from .modules.loss import CrossEntropyLoss
from .modules.container import Container, Sequential, ModuleList from .modules.container import Container, Sequential, ModuleList
from .modules.batchnorm import BatchNorm1d, BatchNorm2d, BatchNorm3d from .modules.batchnorm import BatchNorm1d, BatchNorm2d, BatchNorm3d
from .modules.affine import Affine
from .modules.dropout import Dropout, Dropout2d, Dropout3d from .modules.dropout import Dropout, Dropout2d, Dropout3d
from . import init from . import init
\ No newline at end of file
...@@ -35,6 +35,25 @@ class ReLU(Module): ...@@ -35,6 +35,25 @@ class ReLU(Module):
return self.run(inputs, outputs) return self.run(inputs, outputs)
class Sigmoid(Module):
def __init__(self, inplace=False):
super(Sigmoid, self).__init__()
self._inplace = inplace
self.register_op()
def register_op(self):
self.op_meta = {
'op_type': 'Sigmoid',
'n_inputs': 1, 'n_outputs': 1,
'arguments': {}
}
def forward(self, x):
inputs = [x]; self.unify_devices(inputs)
outputs = [x if self._inplace else self.register_output(x.dtype)]
return self.run(inputs, outputs)
class Softmax(Module): class Softmax(Module):
def __init__(self, dim=None): def __init__(self, dim=None):
super(Softmax, self).__init__() super(Softmax, self).__init__()
......
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch.nn import Module, Parameter
from dragon.vm.torch.ops.creation import zeros, ones
class Affine(Module):
def __init__(self, num_features, bias=True, fix_weight=False, fix_bias=False):
super(Affine, self).__init__()
self.num_features = num_features
self.weight = Parameter(ones(num_features), requires_grad=not fix_weight)
if bias:
self.bias = Parameter(zeros(num_features), requires_grad=not fix_bias)
else:
self.bias = None
self.inputs = [self.weight, self.bias] if bias else [self.weight]
self.register_op()
def register_op(self):
self.op_meta = {
'op_type': 'Affine',
'n_inputs': 3 if self.bias else 2, 'n_outputs': 1,
'arguments': {
'axis': 1, # Data format: NCHW
'num_axes': 1,
}
}
def forward(self, input):
inputs = [input] + self.inputs
self.unify_devices(inputs)
outputs = [self.register_output(input.dtype)]
return self.run(inputs, outputs)
\ No newline at end of file
...@@ -20,8 +20,8 @@ from dragon.vm.torch.module import RunOperator ...@@ -20,8 +20,8 @@ from dragon.vm.torch.module import RunOperator
class _BatchNorm(Module): class _BatchNorm(Module):
def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, def __init__(self, num_features, eps=1e-5, momentum=0.1,
track_running_stats=True): affine=True, track_running_stats=True):
super(_BatchNorm, self).__init__() super(_BatchNorm, self).__init__()
self.num_features = num_features self.num_features = num_features
self.eps = eps self.eps = eps
...@@ -103,6 +103,10 @@ class _BatchNorm(Module): ...@@ -103,6 +103,10 @@ class _BatchNorm(Module):
self.unify_devices(inputs) self.unify_devices(inputs)
outputs = [self.register_output(input.dtype)] outputs = [self.register_output(input.dtype)]
phase = 'TRAIN' if input.requires_grad else 'TEST' phase = 'TRAIN' if input.requires_grad else 'TEST'
# Normalize the input by using batch stats ALWAYS
# Note that the update of moving average is meaningless(
# Because we can not remove it. Why? Ask nvidia and cuDNN -:)
if not self.track_running_stats: phase = 'TRAIN'
meta = ['PERSISTENT',] + self.make_meta_from_phase(phase) meta = ['PERSISTENT',] + self.make_meta_from_phase(phase)
return RunOperator(inputs, outputs, meta) return RunOperator(inputs, outputs, meta)
......
...@@ -52,7 +52,8 @@ class _ConvNd(Module): ...@@ -52,7 +52,8 @@ class _ConvNd(Module):
def register_op(self): def register_op(self):
self.op_meta = { self.op_meta = {
'op_type': 'Conv2d', 'op_type': 'Conv{}d{}'.format(len(self.kernel_size),
'Transpose' if self.transposed else ''),
'n_inputs': 3 if self.bias else 2, 'n_outputs': 1, 'n_inputs': 3 if self.bias else 2, 'n_outputs': 1,
'arguments': { 'arguments': {
'num_output': self.weight.shape[0], 'num_output': self.weight.shape[0],
...@@ -106,3 +107,21 @@ class Conv2d(_ConvNd): ...@@ -106,3 +107,21 @@ class Conv2d(_ConvNd):
self.unify_devices(inputs) self.unify_devices(inputs)
outputs = [self.register_output(input.dtype)] outputs = [self.register_output(input.dtype)]
return self.run(inputs, outputs) return self.run(inputs, outputs)
class ConvTranspose2d(_ConvNd):
def __init__(self, in_channels, out_channels, kernel_size, stride=1,
padding=0, output_padding=0, groups=1, bias=True, dilation=1):
kernel_size = _pair(kernel_size)
stride = _pair(stride)
padding = _pair(padding)
dilation = _pair(dilation)
super(ConvTranspose2d, self).__init__(
in_channels, out_channels, kernel_size, stride, padding, dilation,
True, _pair(0), groups, bias)
def forward(self, input):
inputs = [input, self.weight] + ([self.bias] if self.bias else [])
self.unify_devices(inputs)
outputs = [self.register_output(input.dtype)]
return self.run(inputs, outputs)
\ No newline at end of file
...@@ -21,3 +21,7 @@ from .arithmetic import ( ...@@ -21,3 +21,7 @@ from .arithmetic import (
from .ndarray import ( from .ndarray import (
sum, mean, argmin, argmax, max, topk, cat sum, mean, argmin, argmax, max, topk, cat
) )
from .vision import (
nn_resize, bilinear_resize, roi_pool, roi_align
)
\ No newline at end of file
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch.ops.modules.base import BaseModule
class Resize2d(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(Resize2d, self).__init__(key, ctx, **kwargs)
self.op_type = kwargs.get('op_type', 'NNResize')
self.dsize = kwargs.get('dsize', None)
self.fx = kwargs.get('fx', None)
self.fy = kwargs.get('fy', None)
self.register_arguments()
self.register_op()
def register_arguments(self):
if self.dsize:
self.dsize = [self.register_argument('dsize[{}]'.format(i))
for i in range(2)]
def register_op(self):
self.op_meta = {
'op_type': self.op_type,
'n_inputs': 1, 'n_outputs': 1,
'arguments': {
'dsize_desc': [d for d in self.dsize] if self.dsize else None,
'fx': self.fx, 'fy': self.fy,
'data_format': 'NCHW',
}
}
def forward(self, input, dsize=None):
inputs = [input]; self.unify_devices(inputs)
outputs = [self.register_output(input.dtype)]
if dsize is not None:
for ix, d in enumerate(dsize):
self.set_argument_i(self.dsize[ix], d)
return self.run(inputs, outputs)
class RoIPool(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(RoIPool, self).__init__(key, ctx, **kwargs)
self.pool_h = kwargs.get('pooled_h', 0)
self.pool_w = kwargs.get('pooled_w', 0)
self.spatial_scale = kwargs.get('spatial_scale', 1.0)
self.register_arguments()
self.register_op()
def register_arguments(self):
"""No arguments for roi-pool op."""
pass
def register_op(self):
self.op_meta = {
'op_type': 'ROIPooling',
'n_inputs': 2, 'n_outputs': 1,
'arguments': {
'pool_h': self.pool_h, 'pool_w': self.pool_w,
'spatial_scale': self.spatial_scale,
}
}
def forward(self, feature, rois, dsize=None):
inputs = [feature, rois]; self.unify_devices(inputs)
outputs = [self.register_output(feature.dtype)]
return self.run(inputs, outputs)
class RoIAlign(BaseModule):
def __init__(self, key, ctx, **kwargs):
super(RoIAlign, self).__init__(key, ctx, **kwargs)
self.pool_h = kwargs.get('pooled_h', 0)
self.pool_w = kwargs.get('pooled_w', 0)
self.spatial_scale = kwargs.get('spatial_scale', 1.0)
self.sampling_ratio = kwargs.get('sampling_ratio', 2)
self.register_arguments()
self.register_op()
def register_arguments(self):
"""No arguments for roi-pool op."""
pass
def register_op(self):
self.op_meta = {
'op_type': 'ROIAlign',
'n_inputs': 2, 'n_outputs': 1,
'arguments': {
'pool_h': self.pool_h, 'pool_w': self.pool_w,
'spatial_scale': self.spatial_scale,
'sampling_ratio': self.sampling_ratio,
}
}
def forward(self, feature, rois, dsize=None):
inputs = [feature, rois]; self.unify_devices(inputs)
outputs = [self.register_output(feature.dtype)]
return self.run(inputs, outputs)
\ No newline at end of file
# ------------------------------------------------------------
# Copyright (c) 2017-present, SeetaTech, Co.,Ltd.
#
# Licensed under the BSD 2-Clause License.
# You should have received a copy of the BSD 2-Clause License
# along with the software. If not, See,
#
# <https://opensource.org/licenses/BSD-2-Clause>
#
# ------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from dragon.vm.torch.ops.primitive import MakeContext
from dragon.vm.torch.ops.factory import get_module
from dragon.vm.torch.ops.modules.vision import Resize2d
from dragon.vm.torch.ops.modules.vision import RoIPool, RoIAlign
def _resize_2d(input, op_type, dsize, fx, fy):
if dsize is None:
if fx < 0 or fy < 0:
raise ValueError('Set fx and fy if dsize is None.')
else:
if len(dsize) != 2:
raise ValueError('The dsize should be a list with 2 elements.')
if dsize is None and (fy == -1.0 or fx == -1.0):
raise RuntimeError('The dsize, fx/fy should be specified either.')
ctx = MakeContext(inputs=[input])
key = 'torch/ops/{}/{}:{}/dsize:{}/fx:{}/fy:{}'.format(
op_type.lower(), ctx[0].lower(), ctx[1], '2' if dsize else 'none', fx, fy)
module = get_module(Resize2d, key, ctx,
op_type=op_type, dsize=dsize, fx=fx, fy=fy)
return module.forward(input, dsize)
def nn_resize(input, dsize, fx=-1.0, fy=-1.0):
return _resize_2d(input, 'NNResize', dsize, fx, fy)
def bilinear_resize(input, dsize, fx=-1.0, fy=-1.0):
return _resize_2d(input, 'BilinearResize', dsize, fx, fy)
def roi_pool(feature, rois, pooled_h, pooled_w, spatial_scale):
ctx = MakeContext(inputs=[feature])
key = 'torch/ops/roi_pool/{}:{}/pool_h:{}/pool_w:{}/spatial_scale:{}'.format(
ctx[0].lower(), ctx[1], pooled_h, pooled_w, spatial_scale)
module = get_module(RoIPool, key, ctx, pooled_h=pooled_h,
pooled_w=pooled_w, spatial_scale=spatial_scale)
return module.forward(feature, rois)
def roi_align(feature, rois, pooled_h, pooled_w,
spatial_scale, sampling_ratio=2):
ctx = MakeContext(inputs=[feature])
key = 'torch/ops/roi_align/{}:{}/pool_h:{}/pool_w:{}/' \
'spatial_scale:{}/sampling_ratio:{}'.format(
ctx[0].lower(), ctx[1], pooled_h, pooled_w, spatial_scale, sampling_ratio)
module = get_module(RoIAlign, key, ctx, pooled_h=pooled_h,
pooled_w=pooled_w, spatial_scale=spatial_scale, sampling_ratio=sampling_ratio)
return module.forward(feature, rois)
\ No newline at end of file
...@@ -110,7 +110,7 @@ class Optimizer(object): ...@@ -110,7 +110,7 @@ class Optimizer(object):
_update(p, g, op_type=self._update_type, _update(p, g, op_type=self._update_type,
slot=group['slot'], slot=group['slot'],
lr_mult=group.get('lr_mult', 1.0), lr_mult=group.get('lr_mult', 1.0),
decay_mult=group.get('lr_mult', 1.0)) decay_mult=group.get('decay_mult', 1.0))
def zero_grad(self): def zero_grad(self):
"""Set all gradients to zeros. """Set all gradients to zeros.
......
...@@ -144,6 +144,17 @@ class Tensor(object): ...@@ -144,6 +144,17 @@ class Tensor(object):
"Use .cpu() to move the tensor to host memory first.") "Use .cpu() to move the tensor to host memory first.")
return tensor_utils.ToPyArray(self._dg_tensor) return tensor_utils.ToPyArray(self._dg_tensor)
def numpy_ex(self):
"""Create a numpy const nd-array sharing this tensor.
Returns
-------
numpy.ndarray
The numpy nd-array.
"""
return tensor_utils.ToPyArrayEx(self._dg_tensor)
def dragon(self): def dragon(self):
"""Create a dragon tensor sharing this tensor. """Create a dragon tensor sharing this tensor.
...@@ -533,7 +544,7 @@ class Tensor(object): ...@@ -533,7 +544,7 @@ class Tensor(object):
Parameters Parameters
---------- ----------
args : tuple args : tuple or int
The new size. The new size.
Returns Returns
...@@ -606,6 +617,21 @@ class Tensor(object): ...@@ -606,6 +617,21 @@ class Tensor(object):
""" """
self.fill_(0.) self.fill_(0.)
def one_(self):
"""Fills self tensor with ones.
Parameters
----------
value : numerical type
Returns
-------
vm.torch.Tensor
The self.
"""
self.fill_(1.)
def uniform_(self, low=0, high=1): def uniform_(self, low=0, high=1):
"""Fill self tensor with the specified uniform distribution. """Fill self tensor with the specified uniform distribution.
......
...@@ -34,7 +34,7 @@ class _DataLoaderIter(object): ...@@ -34,7 +34,7 @@ class _DataLoaderIter(object):
class DataLoader(object): class DataLoader(object):
def __init__(self, dataset, batch_size=1, shuffle=False, def __init__(self, dataset, batch_size=1, shuffle=False,
partition=False, multiple_nodes=False, instance_chunk=False): partition=False, multiple_nodes=False, num_chunks=2048, chunk_size=-1):
"""A MPI-Aware DataLoader. Forked from ``dragon.io``. """A MPI-Aware DataLoader. Forked from ``dragon.io``.
Parameters Parameters
...@@ -43,16 +43,16 @@ class DataLoader(object): ...@@ -43,16 +43,16 @@ class DataLoader(object):
The dataset. The dataset.
batch_size : int batch_size : int
The batch size. Divided by n mpi-nodes if ``partition`` is True. The batch size. Divided by n mpi-nodes if ``partition`` is True.
instance_chunk : boolean
Whether to limit each chunk with at most 1 instance.
shuffle : boolean shuffle : boolean
Whether to shuffle the data. Whether to shuffle the data.
partition : boolean partition : boolean
Whether to partition batch. Default is ``False``. Whether to partition batch. Default is ``False``.
multiple_nodes: boolean multiple_nodes: boolean
Whether to split data for multiple parallel nodes. Whether to split data for multiple parallel nodes.
instance_chunk : boolean num_chunks : int
Whether to limit each chunk with at most 1 instance. The number of chunks to split. Default is ``2048``.
chunk_size : int
The size(MB) of each chunk. Default is -1 (Refer ``num_chunks``).
""" """
self.dataset = dataset self.dataset = dataset
...@@ -65,7 +65,8 @@ class DataLoader(object): ...@@ -65,7 +65,8 @@ class DataLoader(object):
'source': dataset.database, 'source': dataset.database,
'multiple_nodes': multiple_nodes, 'multiple_nodes': multiple_nodes,
'shuffle': shuffle, 'shuffle': shuffle,
'instance_chunk': instance_chunk, 'num_chunks': num_chunks,
'chunk_size': chunk_size,
'batch_size': batch_size, 'batch_size': batch_size,
'partition': partition, 'partition': partition,
'transform': dataset.transform, 'transform': dataset.transform,
......
...@@ -39,11 +39,9 @@ class DataBatch(object): ...@@ -39,11 +39,9 @@ class DataBatch(object):
source : str source : str
The path of database. The path of database.
multiple_nodes: boolean multiple_nodes: boolean
Whether to split data for multiple parallel nodes. Whether to split data for multiple parallel nodes. Default is ``False``.
shuffle : boolean shuffle : boolean
Whether to shuffle the data. Whether to shuffle the data. Default is ``False``.
instance_chunk : boolean
Whether to limit each chunk with at most 1 instance.
num_chunks : int num_chunks : int
The number of chunks to split. Default is ``2048``. The number of chunks to split. Default is ``2048``.
chunk_size : int chunk_size : int
......
...@@ -36,11 +36,9 @@ class DataReader(Process): ...@@ -36,11 +36,9 @@ class DataReader(Process):
source : str source : str
The path of database. The path of database.
multiple_nodes: boolean multiple_nodes: boolean
Whether to split data for multiple parallel nodes. Whether to split data for multiple parallel nodes. Default is ``False``.
shuffle : boolean shuffle : boolean
Whether to shuffle the data. Whether to shuffle the data. Default is ``False``.
instance_chunk : boolean
Whether to limit each chunk with at most 1 instance.
num_chunks : int num_chunks : int
The number of chunks to split. Default is ``2048``. The number of chunks to split. Default is ``2048``.
chunk_size : int chunk_size : int
...@@ -51,7 +49,6 @@ class DataReader(Process): ...@@ -51,7 +49,6 @@ class DataReader(Process):
self._source = kwargs.get('source', '') self._source = kwargs.get('source', '')
self._multiple_nodes = kwargs.get('multiple_nodes', False) self._multiple_nodes = kwargs.get('multiple_nodes', False)
self._use_shuffle = kwargs.get('shuffle', False) self._use_shuffle = kwargs.get('shuffle', False)
self._use_instance_chunk = kwargs.get('instance_chunk', False)
self._num_chunks = kwargs.get('num_chunks', 2048) self._num_chunks = kwargs.get('num_chunks', 2048)
self._chunk_size = kwargs.get('chunk_size', -1) self._chunk_size = kwargs.get('chunk_size', -1)
...@@ -172,12 +169,13 @@ class DataReader(Process): ...@@ -172,12 +169,13 @@ class DataReader(Process):
self._db_zfill = int(self._db.get('zfill')) self._db_zfill = int(self._db.get('zfill'))
self._epoch_size = int(self._db_size / self._num_parts + 1) self._epoch_size = int(self._db_size / self._num_parts + 1)
if self._use_instance_chunk: if self._use_shuffle:
self._chunk_size = 1 if self._chunk_size == 1:
# each chunk has at most 1 record [For Fully Shuffle]
self._num_shuffle_parts = int(self._db_size / self._chunk_size / self._num_parts) + 1 self._num_shuffle_parts = int(self._db_size / self._chunk_size / self._num_parts) + 1
else: else:
# search a optimal chunk size by chunks if self._use_shuffle and self._chunk_size == -1:
if self._chunk_size == -1: # search a optimal chunk size by chunks [For Chunk Shuffle]
max_chunk_size = self._db._total_size / ((self._num_chunks * (1 << 20))) max_chunk_size = self._db._total_size / ((self._num_chunks * (1 << 20)))
min_chunk_size = 1 min_chunk_size = 1
while min_chunk_size * 2 < max_chunk_size: min_chunk_size *= 2 while min_chunk_size * 2 < max_chunk_size: min_chunk_size *= 2
...@@ -185,6 +183,13 @@ class DataReader(Process): ...@@ -185,6 +183,13 @@ class DataReader(Process):
self._num_shuffle_parts = int(math.ceil(self._db._total_size * 1.1 / self._num_shuffle_parts = int(math.ceil(self._db._total_size * 1.1 /
(self._num_parts * self._chunk_size << 20))) (self._num_parts * self._chunk_size << 20)))
self._chunk_size = int(self._db_size / self._num_shuffle_parts / self._num_parts + 1) self._chunk_size = int(self._db_size / self._num_shuffle_parts / self._num_parts + 1)
else:
# each chunk has at most K records [For Multiple Nodes]
# note that if ``shuffle`` and ``multiple_nodes`` are all ``False``,
# ``chunk_size`` and ``num_shuffle_parts`` are meaningless
self._chunk_size = int(self._db_size / self._num_parts) + 1
self._num_shuffle_parts = 1
self._perm = np.arange(self._num_shuffle_parts) self._perm = np.arange(self._num_shuffle_parts)
# init env # init env
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
# #
# ------------------------------------------------------------ # ------------------------------------------------------------
import torch import dragon.vm.torch as torch
import hashlib import hashlib
import os import os
...@@ -44,28 +44,6 @@ HASH_REGEX = re.compile(r'-([a-f0-9]*)\.') ...@@ -44,28 +44,6 @@ HASH_REGEX = re.compile(r'-([a-f0-9]*)\.')
def load_url(url, model_dir=None, map_location=None, progress=True): def load_url(url, model_dir=None, map_location=None, progress=True):
r"""Loads the Torch serialized object at the given URL.
If the object is already present in `model_dir`, it's deserialized and
returned. The filename part of the URL should follow the naming convention
``filename-<sha256>.ext`` where ``<sha256>`` is the first eight or more
digits of the SHA256 hash of the contents of the file. The hash is used to
ensure unique names and to verify the contents of the file.
The default value of `model_dir` is ``$TORCH_HOME/models`` where
``$TORCH_HOME`` defaults to ``~/.torch``. The default directory can be
overriden with the ``$TORCH_MODEL_ZOO`` environment variable.
Args:
url (string): URL of the object to download
model_dir (string, optional): directory in which to save the object
map_location (optional): a function or a dict specifying how to remap storage locations (see torch.load)
progress (bool, optional): whether or not to display a progress bar to stderr
Example:
>>> state_dict = torch.utils.model_zoo.load_url('https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth')
"""
if model_dir is None: if model_dir is None:
torch_home = os.path.expanduser(os.getenv('TORCH_HOME', '~/.torch')) torch_home = os.path.expanduser(os.getenv('TORCH_HOME', '~/.torch'))
model_dir = os.getenv('TORCH_MODEL_ZOO', os.path.join(torch_home, 'models')) model_dir = os.getenv('TORCH_MODEL_ZOO', os.path.join(torch_home, 'models'))
......
...@@ -36,7 +36,7 @@ find_packages('dragon') ...@@ -36,7 +36,7 @@ find_packages('dragon')
find_modules() find_modules()
setup(name = 'dragon', setup(name = 'dragon',
version='0.2.2.1', version='0.2.2.2',
description = 'Dragon: A Computation Graph Virtual Machine Based Deep Learning Framework', description = 'Dragon: A Computation Graph Virtual Machine Based Deep Learning Framework',
url='https://github.com/seetaresearch/Dragon', url='https://github.com/seetaresearch/Dragon',
author='Ting Pan', author='Ting Pan',
......
...@@ -12,7 +12,7 @@ CUDAObject CUDAContext::cuda_object_; ...@@ -12,7 +12,7 @@ CUDAObject CUDAContext::cuda_object_;
template<> void CPUContext::Memcpy<CPUContext, CUDAContext>( template<> void CPUContext::Memcpy<CPUContext, CUDAContext>(
size_t nbytes, void* dst, const void* src) { size_t nbytes, void* dst, const void* src) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
CUDAContext ctx(POINTER_DEVICE(src)); CUDAContext ctx(CUDA_POINTER_DEVICE(src));
ctx.Memcpy<CPUContext, CUDAContext>(nbytes, dst, src); ctx.Memcpy<CPUContext, CUDAContext>(nbytes, dst, src);
#else #else
LOG(FATAL) << "CUDA was not compiled."; LOG(FATAL) << "CUDA was not compiled.";
...@@ -23,7 +23,7 @@ template<> void CPUContext::Memcpy<CPUContext, CUDAContext>( ...@@ -23,7 +23,7 @@ template<> void CPUContext::Memcpy<CPUContext, CUDAContext>(
template<> void CPUContext::Memcpy<CUDAContext, CPUContext>( template<> void CPUContext::Memcpy<CUDAContext, CPUContext>(
size_t nbytes, void* dst, const void* src) { size_t nbytes, void* dst, const void* src) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
CUDAContext ctx(POINTER_DEVICE(dst)); CUDAContext ctx(CUDA_POINTER_DEVICE(dst));
ctx.Memcpy<CUDAContext, CPUContext>(nbytes, dst, src); ctx.Memcpy<CUDAContext, CPUContext>(nbytes, dst, src);
#else #else
LOG(FATAL) << "CUDA was not compiled."; LOG(FATAL) << "CUDA was not compiled.";
......
...@@ -128,8 +128,8 @@ MixedMemory::~MixedMemory() { ...@@ -128,8 +128,8 @@ MixedMemory::~MixedMemory() {
void MixedMemory::SwitchToDevice() { void MixedMemory::SwitchToDevice() {
if (cuda_ptr_) { if (cuda_ptr_) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
int ptr_device = POINTER_DEVICE(cuda_ptr_); int ptr_device = CUDA_POINTER_DEVICE(cuda_ptr_);
int cur_device = CURRENT_DEVICE(); int cur_device = CUDA_CURRENT_DEVICE();
if (ptr_device != cur_device) state_ = SWITCHED; if (ptr_device != cur_device) state_ = SWITCHED;
#endif #endif
} }
...@@ -139,7 +139,7 @@ void MixedMemory::SwitchToCUDADevice(int device_id) { ...@@ -139,7 +139,7 @@ void MixedMemory::SwitchToCUDADevice(int device_id) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
DeviceGuard gurad(device_id); DeviceGuard gurad(device_id);
if (cuda_ptr_) { if (cuda_ptr_) {
int ptr_device = POINTER_DEVICE(cuda_ptr_); int ptr_device = CUDA_POINTER_DEVICE(cuda_ptr_);
if (ptr_device != device_id) state_ = SWITCHED; if (ptr_device != device_id) state_ = SWITCHED;
} }
ToCUDA(); ToCUDA();
...@@ -164,7 +164,7 @@ const Map<string, string> MixedMemory::info() const { ...@@ -164,7 +164,7 @@ const Map<string, string> MixedMemory::info() const {
} }
s2s["mem_at"] = _state_; s2s["mem_at"] = _state_;
if (cpu_ptr_) s2s["CPU"] = "0"; if (cpu_ptr_) s2s["CPU"] = "0";
if (cuda_ptr_) s2s["CUDA"] = dragon_cast<string, int>(POINTER_DEVICE(cuda_ptr_)); if (cuda_ptr_) s2s["CUDA"] = dragon_cast<string, int>(CUDA_POINTER_DEVICE(cuda_ptr_));
return s2s; return s2s;
} }
......
#include "operators/arithmetic/scale_op.h" #include "operators/arithmetic/affine_op.h"
#include "core/workspace.h" #include "core/workspace.h"
#include "utils/filler.h" #include "utils/filler.h"
#include "utils/op_kernel.h" #include "utils/op_kernel.h"
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
namespace dragon { namespace dragon {
template <class Context> template <typename T> template <class Context> template <typename T>
void ScaleOp<Context>::RunWithType() { void AffineOp<Context>::RunWithType() {
start_axis = axis; start_axis = axis;
if (start_axis < 0) start_axis += (int)Input(0).ndim(); if (start_axis < 0) start_axis += (int)Input(0).ndim();
if (num_axes == -1) num_axes = (int)Input(0).ndim() - start_axis; if (num_axes == -1) num_axes = (int)Input(0).ndim() - start_axis;
...@@ -18,27 +18,29 @@ void ScaleOp<Context>::RunWithType() { ...@@ -18,27 +18,29 @@ void ScaleOp<Context>::RunWithType() {
const vector<TIndex>::const_iterator& dim_start = Input(0).dims().begin() + start_axis; const vector<TIndex>::const_iterator& dim_start = Input(0).dims().begin() + start_axis;
const vector<TIndex>::const_iterator& dim_end = dim_start + num_axes; const vector<TIndex>::const_iterator& dim_end = dim_start + num_axes;
vector<TIndex> param_dims(dim_start, dim_end); vector<TIndex> param_dims(dim_start, dim_end);
TENSOR_FILL(Input(1), param_dims);;
TENSOR_FILL(Input(1), param_dims); outer_dim = Input(0).count(0, start_axis);
inner_dim = Input(0).count(start_axis + num_axes);
scale_dim = Input(1).count();
if (InputSize() > 2) { if (InputSize() > 2) {
TENSOR_FILL(Input(2), param_dims); TENSOR_FILL(Input(2), param_dims);
inner_dim = Input(0).count(start_axis + num_axes);
INIT_MULTIPLIER(bias_multiplier, inner_dim); INIT_MULTIPLIER(bias_multiplier, inner_dim);
} }
if (InputSize() > 2) { auto* Xdata = Input(0).template data<T, Context>();
kernel::Scale<T, Context>(start_axis, &Input(0), &Input(1), auto* Adata = Input(1).template data<T, Context>();
&Input(2), bias_multiplier, auto* Bdata = InputSize() > 2 ? Input(2).template data<T, Context>() : nullptr;
Output(0)); auto* BMdata = InputSize() > 2 ? bias_multiplier->template data<T, Context>() : nullptr;
} else { auto* Ydata = Output(0)->template mutable_data<T, Context>();
kernel::Scale<T, Context>(start_axis, &Input(0), &Input(1),
nullptr, nullptr, kernel::Affine<T, Context>(Output(0)->count(),
Output(0)); outer_dim, scale_dim, inner_dim,
} Xdata, Adata, Bdata, BMdata,
Ydata);
} }
template <class Context> template <class Context>
void ScaleOp<Context>::RunOnDevice() { void AffineOp<Context>::RunOnDevice() {
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
if (XIsType(Input(0), float)) RunWithType<float>(); if (XIsType(Input(0), float)) RunWithType<float>();
...@@ -46,14 +48,14 @@ void ScaleOp<Context>::RunOnDevice() { ...@@ -46,14 +48,14 @@ void ScaleOp<Context>::RunOnDevice() {
else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" }); else LOG(FATAL) << DTypeHelper(Input(0), { "float32", "float16" });
} }
DEPLOY_CPU(Scale); DEPLOY_CPU(Affine);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(Scale); DEPLOY_CUDA(Affine);
#endif #endif
OPERATOR_SCHEMA(Scale).NumInputs(2, 3).NumOutputs(1); OPERATOR_SCHEMA(Affine).NumInputs(2, 3).NumOutputs(1);
template <class Context> template <typename T> template <class Context> template <typename T>
void ScaleGradientOp<Context>::BiasRunWithType() { void AffineGradientOp<Context>::BiasRunWithType() {
Output(2)->ReshapeLike(Input(1)); Output(2)->ReshapeLike(Input(1));
INIT_MULTIPLIER(bias_multiplier, inner_dim); INIT_MULTIPLIER(bias_multiplier, inner_dim);
auto* BMul_data = this->bias_multiplier->template data<T, Context>(); auto* BMul_data = this->bias_multiplier->template data<T, Context>();
...@@ -71,7 +73,7 @@ void ScaleGradientOp<Context>::BiasRunWithType() { ...@@ -71,7 +73,7 @@ void ScaleGradientOp<Context>::BiasRunWithType() {
} }
template <class Context> template <typename T> template <class Context> template <typename T>
void ScaleGradientOp<Context>::ScaleRunWithType() { void AffineGradientOp<Context>::ScaleRunWithType() {
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
Output(1)->ReshapeLike(Input(1)); Output(1)->ReshapeLike(Input(1));
INIT_MULTIPLIER(sum_multiplier, sum_dim); INIT_MULTIPLIER(sum_multiplier, sum_dim);
...@@ -119,15 +121,20 @@ void ScaleGradientOp<Context>::ScaleRunWithType() { ...@@ -119,15 +121,20 @@ void ScaleGradientOp<Context>::ScaleRunWithType() {
} }
template <class Context> template <typename T> template <class Context> template <typename T>
void ScaleGradientOp<Context>::RunWithType() { void AffineGradientOp<Context>::RunWithType() {
Output(0)->ReshapeLike(Input(0)); Output(0)->ReshapeLike(Input(0));
kernel::ScaleGrad<T, Context>(start_axis, auto* dYdata = Input(-1).template data<T, Context>();
&Input(-1), &Input(1), Output(0)); auto* Adata = Input(1).template data<T, Context>();
auto* dXdata = Output(0)->template mutable_data<T, Context>();
kernel::AffineGrad<T, Context>(Output(0)->count(),
outer_dim, scale_dim, inner_dim,
dYdata, Adata, dXdata);
} }
template <class Context> template <class Context>
void ScaleGradientOp<Context>::RunOnDevice() { void AffineGradientOp<Context>::RunOnDevice() {
start_axis = axis; start_axis = axis;
if (start_axis < 0) start_axis += (int)Input(0).ndim(); if (start_axis < 0) start_axis += (int)Input(0).ndim();
if (num_axes == -1) num_axes = (int)Input(0).ndim() - start_axis; if (num_axes == -1) num_axes = (int)Input(0).ndim() - start_axis;
...@@ -151,21 +158,21 @@ void ScaleGradientOp<Context>::RunOnDevice() { ...@@ -151,21 +158,21 @@ void ScaleGradientOp<Context>::RunOnDevice() {
} }
} }
DEPLOY_CPU(ScaleGradient); DEPLOY_CPU(AffineGradient);
#ifdef WITH_CUDA #ifdef WITH_CUDA
DEPLOY_CUDA(ScaleGradient); DEPLOY_CUDA(AffineGradient);
#endif #endif
OPERATOR_SCHEMA(ScaleGradient).NumInputs(3).NumOutputs(3); OPERATOR_SCHEMA(AffineGradient).NumInputs(3).NumOutputs(3);
class GetScaleGradient final : public GradientMakerBase { class GetAffineGradient final : public GradientMakerBase {
public: public:
GRADIENT_MAKER_CTOR(GetScaleGradient); GRADIENT_MAKER_CTOR(GetAffineGradient);
vector<OperatorDef> MakeDefs() override { vector<OperatorDef> MakeDefs() override {
return SingleDef(def.type() + "Gradient", "", return SingleDef(def.type() + "Gradient", "",
vector<string> {I(0), I(1), GO(0)}, vector<string> {I(0), I(1), GO(0)},
vector<string> {GI(0), GI(1), GI(2)}); vector<string> {GI(0), GI(1), GI(2)});
} }
}; };
REGISTER_GRADIENT(Scale, GetScaleGradient); REGISTER_GRADIENT(Affine, GetAffineGradient);
} // namespace dragon } // namespace dragon
\ No newline at end of file
...@@ -3,17 +3,18 @@ ...@@ -3,17 +3,18 @@
#include "utils/math_functions.h" #include "utils/math_functions.h"
namespace dragon { namespace dragon {
template <class Context> template <typename T>
template <class Context> template <typename Tx, typename Ty>
void AccuracyOp<Context>::RunWithType() { void AccuracyOp<Context>::RunWithType() {
if (OutputSize() > 1) { if (OutputSize() > 1) {
math::Set<T, CPUContext>(num_classes, 0, math::Set<float, CPUContext>(num_classes, 0,
Output(1)->template mutable_data<T, CPUContext>()); Output(1)->template mutable_data<float, CPUContext>());
} }
Map<int, int> num_per_class; Map<int, TIndex> num_per_class;
T acc = 0, count = 0; TIndex acc = 0, count = 0;
auto* Xdata = Input(0).template data<T, CPUContext>(); auto* Xdata = Input(0).template data<Tx, CPUContext>();
auto* labels = Input(1).template data<T, CPUContext>(); auto* labels = Input(1).template data<Ty, CPUContext>();
auto* ignores = ignore_labels.count() > 0 ? auto* ignores = ignore_labels.count() > 0 ?
ignore_labels.data<int, CPUContext>() : nullptr; ignore_labels.data<int, CPUContext>() : nullptr;
const TIndex dim = Input(0).count() / outer_dim; const TIndex dim = Input(0).count() / outer_dim;
...@@ -23,14 +24,14 @@ void AccuracyOp<Context>::RunWithType() { ...@@ -23,14 +24,14 @@ void AccuracyOp<Context>::RunWithType() {
for (int k = 0; k < ignore_labels.count(); k++) for (int k = 0; k < ignore_labels.count(); k++)
if (label == ignores[k]) continue; if (label == ignores[k]) continue;
if (OutputSize() > 1) num_per_class[label]++; if (OutputSize() > 1) num_per_class[label]++;
vector<pair<T, int> > vec; vector<pair<Tx, int> > vec;
for (int k = 0; k < num_classes; k++) for (int k = 0; k < num_classes; k++)
vec.push_back(std::make_pair(Xdata[i * dim + k * inner_dim + j], k)); vec.push_back(std::make_pair(Xdata[i * dim + k * inner_dim + j], k));
std::partial_sort(vec.begin(), vec.begin() + top_k, vec.end(), std::greater<pair<T, int> >()); std::partial_sort(vec.begin(), vec.begin() + top_k, vec.end(), std::greater<pair<Tx, int> >());
for (int k = 0; k < top_k; k++) { for (int k = 0; k < top_k; k++) {
if (vec[k].second == label) { if (vec[k].second == label) {
if (OutputSize() > 1) if (OutputSize() > 1)
Output(1)->template mutable_data<T, CPUContext>()[label]++; Output(1)->template mutable_data<float, CPUContext>()[label]++;
acc++; acc++;
break; break;
} }
...@@ -39,11 +40,12 @@ void AccuracyOp<Context>::RunWithType() { ...@@ -39,11 +40,12 @@ void AccuracyOp<Context>::RunWithType() {
} // end inner_dim } // end inner_dim
} // end outer_dim } // end outer_dim
Output(0)->template mutable_data<T, CPUContext>()[0] = acc / count; Output(0)->template mutable_data<float, CPUContext>()[0] = (float)acc / count;
if (OutputSize() > 1) { if (OutputSize() > 1) {
auto* acc_per_class = Output(1)->template mutable_data<T, CPUContext>(); auto* acc_per_class = Output(1)->template mutable_data<float, CPUContext>();
for (int i = 0; i < num_classes; i++) for (int i = 0; i < num_classes; i++)
acc_per_class[i] = num_per_class[i] == 0 ? 0 : acc_per_class[i] / acc_per_class[i]; acc_per_class[i] = num_per_class[i] == 0 ?
0 : acc_per_class[i] / num_per_class[i];
} }
} }
...@@ -58,8 +60,11 @@ void AccuracyOp<Context>::RunOnDevice() { ...@@ -58,8 +60,11 @@ void AccuracyOp<Context>::RunOnDevice() {
Output(0)->Reshape(vector<TIndex>(1, 1)); Output(0)->Reshape(vector<TIndex>(1, 1));
if (OutputSize() > 1) Output(1)->Reshape(vector<TIndex>(1, num_classes)); if (OutputSize() > 1) Output(1)->Reshape(vector<TIndex>(1, num_classes));
if (XIsType(Input(0), float)) RunWithType<float>(); if (XIsType(Input(0), float)) {
else LOG(FATAL) << DTypeHelper(Input(0), { "float32" }); if (XIsType(Input(1), float)) RunWithType<float, float>();
else if(XIsType(Input(1), int64_t)) RunWithType<float, int64_t>();
else LOG(FATAL) << DTypeHelper(Input(1), { "float32", "int64" });
} else LOG(FATAL) << DTypeHelper(Input(0), { "float32" });
} }
DEPLOY_CPU(Accuracy); DEPLOY_CPU(Accuracy);
......
...@@ -128,6 +128,8 @@ void CuDNNConv2dOp<Context>::RunOnDevice() { ...@@ -128,6 +128,8 @@ void CuDNNConv2dOp<Context>::RunOnDevice() {
#endif #endif
#if CUDNN_VERSION_MIN(7, 0, 0) #if CUDNN_VERSION_MIN(7, 0, 0)
CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc, this->group)); CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc, this->group));
if (enable_tensor_core)
CUDNN_CHECK(cudnnSetConvolutionMathType(conv_desc, CUDNN_TENSOR_OP_MATH));
#endif #endif
RunWithType<float>(); RunWithType<float>();
} else if (XIsType(Input(0), float16)) { } else if (XIsType(Input(0), float16)) {
...@@ -148,6 +150,8 @@ void CuDNNConv2dOp<Context>::RunOnDevice() { ...@@ -148,6 +150,8 @@ void CuDNNConv2dOp<Context>::RunOnDevice() {
#endif #endif
#if CUDNN_VERSION_MIN(7, 0, 0) #if CUDNN_VERSION_MIN(7, 0, 0)
CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc, this->group)); CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc, this->group));
if (enable_tensor_core)
CUDNN_CHECK(cudnnSetConvolutionMathType(conv_desc, CUDNN_TENSOR_OP_MATH));
#endif #endif
RunWithType<float16>(); RunWithType<float16>();
#endif // WITH_CUDA_FP16 #endif // WITH_CUDA_FP16
...@@ -306,6 +310,8 @@ void CuDNNConv2dGradientOp<Context>::RunOnDevice() { ...@@ -306,6 +310,8 @@ void CuDNNConv2dGradientOp<Context>::RunOnDevice() {
#endif #endif
#if CUDNN_VERSION_MIN(7, 0, 0) #if CUDNN_VERSION_MIN(7, 0, 0)
CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc, this->group)); CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc, this->group));
if (enable_tensor_core)
CUDNN_CHECK(cudnnSetConvolutionMathType(conv_desc, CUDNN_TENSOR_OP_MATH));
#endif #endif
RunWithType<float>(); RunWithType<float>();
} else if (XIsType(Input(0), float16)) { } else if (XIsType(Input(0), float16)) {
...@@ -325,6 +331,8 @@ void CuDNNConv2dGradientOp<Context>::RunOnDevice() { ...@@ -325,6 +331,8 @@ void CuDNNConv2dGradientOp<Context>::RunOnDevice() {
#endif #endif
#if CUDNN_VERSION_MIN(7, 0, 0) #if CUDNN_VERSION_MIN(7, 0, 0)
CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc, this->group)); CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc, this->group));
if (enable_tensor_core)
CUDNN_CHECK(cudnnSetConvolutionMathType(conv_desc, CUDNN_TENSOR_OP_MATH));
#endif #endif
RunWithType<float16>(); RunWithType<float16>();
#endif // WITH_CUDA_FP16 #endif // WITH_CUDA_FP16
......
...@@ -130,6 +130,8 @@ void CuDNNConv2dTransposeOp<Context>::RunOnDevice() { ...@@ -130,6 +130,8 @@ void CuDNNConv2dTransposeOp<Context>::RunOnDevice() {
#endif #endif
#if CUDNN_VERSION_MIN(7, 0, 0) #if CUDNN_VERSION_MIN(7, 0, 0)
CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc, this->group)); CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc, this->group));
if (enable_tensor_core)
CUDNN_CHECK(cudnnSetConvolutionMathType(conv_desc, CUDNN_TENSOR_OP_MATH));
#endif #endif
RunWithType<float>(); RunWithType<float>();
} else if (XIsType(Input(0), float16)) { } else if (XIsType(Input(0), float16)) {
...@@ -150,6 +152,8 @@ void CuDNNConv2dTransposeOp<Context>::RunOnDevice() { ...@@ -150,6 +152,8 @@ void CuDNNConv2dTransposeOp<Context>::RunOnDevice() {
#endif #endif
#if CUDNN_VERSION_MIN(7, 0, 0) #if CUDNN_VERSION_MIN(7, 0, 0)
CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc, this->group)); CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc, this->group));
if (enable_tensor_core)
CUDNN_CHECK(cudnnSetConvolutionMathType(conv_desc, CUDNN_TENSOR_OP_MATH));
#endif #endif
RunWithType<float16>(); RunWithType<float16>();
#endif // WITH_CUDA_FP16 #endif // WITH_CUDA_FP16
...@@ -310,6 +314,8 @@ void CuDNNConv2dTransposeGradientOp<Context>::RunOnDevice() { ...@@ -310,6 +314,8 @@ void CuDNNConv2dTransposeGradientOp<Context>::RunOnDevice() {
#endif #endif
#if CUDNN_VERSION_MIN(7, 0, 0) #if CUDNN_VERSION_MIN(7, 0, 0)
CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc, this->group)); CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc, this->group));
if (enable_tensor_core)
CUDNN_CHECK(cudnnSetConvolutionMathType(conv_desc, CUDNN_TENSOR_OP_MATH));
#endif #endif
RunWithType<float>(); RunWithType<float>();
} else if (XIsType(Input(0), float16)) { } else if (XIsType(Input(0), float16)) {
...@@ -330,6 +336,8 @@ void CuDNNConv2dTransposeGradientOp<Context>::RunOnDevice() { ...@@ -330,6 +336,8 @@ void CuDNNConv2dTransposeGradientOp<Context>::RunOnDevice() {
#endif #endif
#if CUDNN_VERSION_MIN(7, 0, 0) #if CUDNN_VERSION_MIN(7, 0, 0)
CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc, this->group)); CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc, this->group));
if (enable_tensor_core)
CUDNN_CHECK(cudnnSetConvolutionMathType(conv_desc, CUDNN_TENSOR_OP_MATH));
#endif #endif
RunWithType<float16>(); RunWithType<float16>();
#endif // WITH_CUDA_FP16 #endif // WITH_CUDA_FP16
......
...@@ -42,22 +42,6 @@ template <> void Set<int, CUDAContext>(const int n, ...@@ -42,22 +42,6 @@ template <> void Set<int, CUDAContext>(const int n,
CUDA_POST_KERNEL_CHECK; CUDA_POST_KERNEL_CHECK;
} }
template <> void Set<float16, CUDAContext>(const int n,
const float16 alpha,
float16* x) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0)
_Set<half2> << <GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
dragon_cast<half2, float16>(alpha),
reinterpret_cast<half2*>(x));
else
_Set<float16> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, alpha, x);
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <> void RandomUniform<uint32_t, CUDAContext>(const int n, template <> void RandomUniform<uint32_t, CUDAContext>(const int n,
const float low, const float low,
const float high, const float high,
...@@ -67,13 +51,6 @@ template <> void RandomUniform<uint32_t, CUDAContext>(const int n, ...@@ -67,13 +51,6 @@ template <> void RandomUniform<uint32_t, CUDAContext>(const int n,
CURAND_CHECK(curandGenerate(curand_generator(), x, n)); CURAND_CHECK(curandGenerate(curand_generator(), x, n));
} }
template <> void RandomUniform<float16, CUDAContext>(const int n,
const float low,
const float high,
float16* x) {
NOT_IMPLEMENTED;
}
template <> void RandomNormal<float, CUDAContext>(const int n, template <> void RandomNormal<float, CUDAContext>(const int n,
const float mu, const float mu,
const float sigma, const float sigma,
...@@ -81,13 +58,6 @@ template <> void RandomNormal<float, CUDAContext>(const int n, ...@@ -81,13 +58,6 @@ template <> void RandomNormal<float, CUDAContext>(const int n,
CURAND_CHECK(curandGenerateNormal(curand_generator(), x, n, mu, sigma)); CURAND_CHECK(curandGenerateNormal(curand_generator(), x, n, mu, sigma));
} }
template <> void RandomNormal<float16, CUDAContext>(const int n,
const float mu,
const float sigma,
float16* x) {
NOT_IMPLEMENTED;
}
template <> void RandomBernoulli<float, CUDAContext>(const int n, template <> void RandomBernoulli<float, CUDAContext>(const int n,
const float p, const float p,
unsigned int* x) { unsigned int* x) {
...@@ -113,46 +83,6 @@ template <> void Add<float, CUDAContext>(int n, ...@@ -113,46 +83,6 @@ template <> void Add<float, CUDAContext>(int n,
CUDA_POST_KERNEL_CHECK; CUDA_POST_KERNEL_CHECK;
} }
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _AddHalf(const int n, const half* a, const half* b, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hadd(a[idx], b[idx]);
#endif
}
}
template <typename T>
__global__ void _AddHalf2(const int n, const half2* a, const half2* b, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hadd2(a[idx], b[idx]);
#endif
}
}
#endif
template <> void Add<float16, CUDAContext>(int n,
const float16* a,
const float16* b,
float16* y) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0)
_AddHalf2<half2> << <GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
reinterpret_cast<const half2*>(a),
reinterpret_cast<const half2*>(b),
reinterpret_cast<half2*>(y));
else _AddHalf<half> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <typename T> template <typename T>
__global__ void _Sub(const int n, const T* a, const T *b, T* y) { __global__ void _Sub(const int n, const T* a, const T *b, T* y) {
CUDA_KERNEL_LOOP(idx, n) { CUDA_KERNEL_LOOP(idx, n) {
...@@ -168,46 +98,6 @@ template <> void Sub<float, CUDAContext>(int n, ...@@ -168,46 +98,6 @@ template <> void Sub<float, CUDAContext>(int n,
CUDA_POST_KERNEL_CHECK; CUDA_POST_KERNEL_CHECK;
} }
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _SubHalf(const int n, const half* a, const half* b, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hsub(a[idx], b[idx]);
#endif
}
}
template <typename T>
__global__ void _SubHalf2(const int n, const half2* a, const half2* b, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hsub2(a[idx], b[idx]);
#endif
}
}
#endif
template <> void Sub<float16, CUDAContext>(int n,
const float16* a,
const float16* b,
float16* y) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0)
_SubHalf2<half2> << <GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
reinterpret_cast<const half2*>(a),
reinterpret_cast<const half2*>(b),
reinterpret_cast<half2*>(y));
else _SubHalf<half> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <typename T> template <typename T>
__global__ void _Mul(const int n, const T* a, const T* b, T* y) { __global__ void _Mul(const int n, const T* a, const T* b, T* y) {
CUDA_KERNEL_LOOP(idx, n) { CUDA_KERNEL_LOOP(idx, n) {
...@@ -223,46 +113,6 @@ template <> void Mul<float, CUDAContext>(int n, ...@@ -223,46 +113,6 @@ template <> void Mul<float, CUDAContext>(int n,
CUDA_POST_KERNEL_CHECK; CUDA_POST_KERNEL_CHECK;
} }
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _MulHalf(const int n, const half* a, const half* b, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul(a[idx], b[idx]);
#endif
}
}
template <typename T>
__global__ void _MulHalf2(const int n, const half2* a, const half2* b, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul2(a[idx], b[idx]);
#endif
}
}
#endif
template <> void Mul<float16, CUDAContext>(int n,
const float16* a,
const float16* b,
float16* y) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0)
_MulHalf2<half2> << <GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
reinterpret_cast<const half2*>(a),
reinterpret_cast<const half2*>(b),
reinterpret_cast<half2*>(y));
else _MulHalf<half> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <typename T> template <typename T>
__global__ void _Div(const int n, const T* a, const T* b, T* y) { __global__ void _Div(const int n, const T* a, const T* b, T* y) {
CUDA_KERNEL_LOOP(idx, n) { CUDA_KERNEL_LOOP(idx, n) {
...@@ -278,32 +128,6 @@ template <> void Div<float, CUDAContext>(int n, ...@@ -278,32 +128,6 @@ template <> void Div<float, CUDAContext>(int n,
CUDA_POST_KERNEL_CHECK; CUDA_POST_KERNEL_CHECK;
} }
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _DivHalf(const int n, const half* a, const half* b, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hdiv(a[idx], b[idx]);
#endif
}
}
#endif
template <> void Div<float16, CUDAContext>(int n,
const float16* a,
const float16* b,
float16* y) {
#ifdef WITH_CUDA_FP16
_DivHalf<half> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <typename T> template <typename T>
__global__ void _Clip(const int n, const T low, const T high, T* x) { __global__ void _Clip(const int n, const T low, const T high, T* x) {
CUDA_KERNEL_LOOP(idx, n) { CUDA_KERNEL_LOOP(idx, n) {
...@@ -358,44 +182,6 @@ template <> void Square<float, CUDAContext>(int n, ...@@ -358,44 +182,6 @@ template <> void Square<float, CUDAContext>(int n,
CUDA_POST_KERNEL_CHECK; CUDA_POST_KERNEL_CHECK;
} }
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _SquareHalf(const int n, const half* x, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul(x[idx], x[idx]);
#endif
}
}
template <typename T>
__global__ void _SquareHalf2(const int n, const half2* x, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul2(x[idx], x[idx]);
#endif
}
}
#endif
template <> void Square<float16, CUDAContext>(int n,
const float16* x,
float16* y) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0)
_SquareHalf2<half2> << < GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
else _SquareHalf<half> << < GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <typename T> template <typename T>
__global__ void _Sqrt(const int n, const T* x, T* y) { __global__ void _Sqrt(const int n, const T* x, T* y) {
CUDA_KERNEL_LOOP(idx, n) { CUDA_KERNEL_LOOP(idx, n) {
...@@ -410,43 +196,6 @@ template <> void Sqrt<float, CUDAContext>(int n, ...@@ -410,43 +196,6 @@ template <> void Sqrt<float, CUDAContext>(int n,
CUDA_POST_KERNEL_CHECK; CUDA_POST_KERNEL_CHECK;
} }
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _SqrtHalf(const int n, const half* x, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = hsqrt(x[idx]);
#endif
}
}
template <typename T>
__global__ void _SqrtHalf2(const int n, const half2* x, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = h2sqrt(x[idx]);
#endif
}
}
#endif
template <> void Sqrt<float16, CUDAContext>(int n,
const float16* x,
float16* y) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0)
_SqrtHalf2<half2> << < GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
else _SqrtHalf<half> << < GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <typename T> template <typename T>
__global__ void _Pow(const int n, const T alpha, const T* a, T* y) { __global__ void _Pow(const int n, const T alpha, const T* a, T* y) {
CUDA_KERNEL_LOOP(idx, n) { CUDA_KERNEL_LOOP(idx, n) {
...@@ -462,48 +211,6 @@ template <> void Pow<float, CUDAContext>(int n, ...@@ -462,48 +211,6 @@ template <> void Pow<float, CUDAContext>(int n,
CUDA_POST_KERNEL_CHECK; CUDA_POST_KERNEL_CHECK;
} }
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _PowHalf(const int n, const float alpha, const half* a, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul(a[idx], a[idx]);
#endif
}
}
template <typename T>
__global__ void _PowHalf2(const int n, const float alpha, const half2* a, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul2(a[idx], a[idx]);
#endif
}
}
#endif
template <> void Pow<float16, CUDAContext>(int n,
const float alpha,
const float16* x,
float16* y) {
#ifdef WITH_CUDA_FP16
CHECK(alpha == float(2)) << "fp16 only support the power of 2";
if (n % 2 == 0)
_PowHalf2<half2> << < GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
alpha,
reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
else _PowHalf<half> << < GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
alpha,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <typename T> template <typename T>
__global__ void _Inv(const int n, const float numerator, const T* x, T* y) { __global__ void _Inv(const int n, const float numerator, const T* x, T* y) {
CUDA_KERNEL_LOOP(idx, n) { CUDA_KERNEL_LOOP(idx, n) {
...@@ -518,47 +225,6 @@ template <> void Inv<float, CUDAContext>(const int n, ...@@ -518,47 +225,6 @@ template <> void Inv<float, CUDAContext>(const int n,
_Inv<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, numerator, x, y); _Inv<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, numerator, x, y);
} }
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _InvHalf(const int n, const half numerator, const half* x, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul(hrcp(x[idx]), numerator);
#endif
}
}
template <typename T>
__global__ void _InvHalf2(const int n, const half2 numerator, const half2* x, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul2(h2rcp(x[idx]), numerator);
#endif
}
}
#endif
template <> void Inv<float16, CUDAContext>(const int n,
const float numerator,
const float16* x,
float16* y) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0)
_InvHalf2<half2> << < GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
dragon_cast<half2, float>(numerator),
reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
else
_InvHalf<half> << < GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
dragon_cast<half, float>(numerator),
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
/******************** Level-2 ********************/ /******************** Level-2 ********************/
template <> void Scal<float, CUDAContext>(const int n, const float alpha, float* y) { template <> void Scal<float, CUDAContext>(const int n, const float alpha, float* y) {
...@@ -568,18 +234,6 @@ template <> void Scal<float, CUDAContext>(const int n, const float alpha, float* ...@@ -568,18 +234,6 @@ template <> void Scal<float, CUDAContext>(const int n, const float alpha, float*
y, 1)); y, 1));
} }
template <> void Scal<float16, CUDAContext>(const int n, const float alpha, float16* y) {
#ifdef WITH_CUDA_FP16
CUBLAS_CHECK(cublasScalEx(cublas_handle(),
n,
&alpha, CUDA_R_32F,
y, CUDA_R_16F, 1,
CUDA_R_32F));
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <> void Scale<float, CUDAContext>(const int n, template <> void Scale<float, CUDAContext>(const int n,
const float alpha, const float alpha,
const float* x, const float* x,
...@@ -588,23 +242,13 @@ template <> void Scale<float, CUDAContext>(const int n, ...@@ -588,23 +242,13 @@ template <> void Scale<float, CUDAContext>(const int n,
CUBLAS_CHECK(cublasSscal_v2(cublas_handle(), n, &alpha, y, 1)); CUBLAS_CHECK(cublasSscal_v2(cublas_handle(), n, &alpha, y, 1));
} }
template <> void Scale<float16, CUDAContext>(const int n,
const float alpha,
const float16* x,
float16* y) {
CUDAContext ctx;
ctx.Copy<float16, CUDAContext, CUDAContext>(n, y, x);
Scal<float16, CUDAContext>(n, alpha, y);
}
template <> float StridedDot<float, CUDAContext>(const int n, template <> float StridedDot<float, CUDAContext>(const int n,
const float* a, const float* a,
const int incx, const int incx,
const float* b, const float* b,
const int incy) { const int incy) {
float result; float result;
CUBLAS_CHECK(cublasSdot_v2(cublas_handle(), CUBLAS_CHECK(cublasSdot_v2(cublas_handle(), n,
n,
a, incx, a, incx,
b, incy, b, incy,
&result)); &result));
...@@ -615,22 +259,6 @@ template <> float Dot<float, CUDAContext>(int n, const float* a, const float* b) ...@@ -615,22 +259,6 @@ template <> float Dot<float, CUDAContext>(int n, const float* a, const float* b)
return StridedDot<float, CUDAContext>(n, a, 1, b, 1); return StridedDot<float, CUDAContext>(n, a, 1, b, 1);
} }
template <> float Dot<float16, CUDAContext>(int n, const float16* a, const float16* b) {
#ifdef WITH_CUDA_FP16
float16 result;
CUBLAS_CHECK(cublasDotEx(cublas_handle(),
n,
&a, CUDA_R_16F, 1,
&b, CUDA_R_16F, 1,
&result, CUDA_R_16F,
CUDA_R_32F));
return dragon_cast<float, float16>(result);
#else
CUDA_FP16_NOT_COMPILED;
return 0.;
#endif
}
template <> float ASum<float, CUDAContext>(const int n, const float* x) { template <> float ASum<float, CUDAContext>(const int n, const float* x) {
return cublasSasum(n, x, 1); return cublasSasum(n, x, 1);
} }
...@@ -646,43 +274,6 @@ template <> void AddScalar<float, CUDAContext>(const int n, const float alpha, f ...@@ -646,43 +274,6 @@ template <> void AddScalar<float, CUDAContext>(const int n, const float alpha, f
_AddScalar<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, alpha, y); _AddScalar<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, alpha, y);
} }
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _AddScalarHalf(const int n, half alpha, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hadd(y[idx], alpha);
#endif
}
}
template <typename T>
__global__ void _AddScalarHalf2(const int n, half2 alpha, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hadd2(y[idx], alpha);
#endif
}
}
#endif
template <> void AddScalar<float16, CUDAContext>(const int n, const float alpha, float16* y) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0) {
_AddScalarHalf2<half2> << <GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
dragon_cast<half2, float>(alpha),
reinterpret_cast<half2*>(y));
} else {
_AddScalarHalf<half> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
dragon_cast<half, float>(alpha),
reinterpret_cast<half*>(y));
}
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <typename T> template <typename T>
__global__ void _MulScalar(const int n, T alpha, T* y) { __global__ void _MulScalar(const int n, T alpha, T* y) {
CUDA_KERNEL_LOOP(idx, n) { CUDA_KERNEL_LOOP(idx, n) {
...@@ -694,69 +285,15 @@ template <> void MulScalar<float, CUDAContext>(const int n, const float alpha, f ...@@ -694,69 +285,15 @@ template <> void MulScalar<float, CUDAContext>(const int n, const float alpha, f
_MulScalar<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, alpha, y); _MulScalar<float> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, alpha, y);
} }
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _MulScalarHalf(const int n, half alpha, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul(y[idx], alpha);
#endif
}
}
template <typename T>
__global__ void _MulScalarHalf2(const int n, half2 alpha, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul2(y[idx], alpha);
#endif
}
}
#endif
template <> void MulScalar<float16, CUDAContext>(const int n, const float alpha, float16* y) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0) {
_MulScalarHalf2<half2> << <GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
dragon_cast<half2, float>(alpha),
reinterpret_cast<half2*>(y));
} else {
_MulScalarHalf<half> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
dragon_cast<half, float>(alpha),
reinterpret_cast<half*>(y));
}
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <> void Axpy<float, CUDAContext>(const int n, template <> void Axpy<float, CUDAContext>(const int n,
float alpha, float alpha,
const float* x, const float* x,
float* y) { float* y) {
CUBLAS_CHECK(cublasSaxpy_v2(cublas_handle(), CUBLAS_CHECK(cublasSaxpy_v2(cublas_handle(), n,
n,
&alpha, x, 1, &alpha, x, 1,
y, 1)); y, 1));
} }
template <> void Axpy<float16, CUDAContext>(const int n,
float alpha,
const float16* x,
float16* y) {
#ifdef WITH_CUDA_FP16
CUBLAS_CHECK(cublasAxpyEx(cublas_handle(),
n,
&alpha, CUDA_R_32F,
x, CUDA_R_16F, 1,
y, CUDA_R_16F, 1,
CUDA_R_32F));
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <> void Axpby<float, CUDAContext>(const int n, template <> void Axpby<float, CUDAContext>(const int n,
float alpha, float alpha,
const float* x, const float* x,
...@@ -766,17 +303,6 @@ template <> void Axpby<float, CUDAContext>(const int n, ...@@ -766,17 +303,6 @@ template <> void Axpby<float, CUDAContext>(const int n,
Axpy<float, CUDAContext>(n, alpha, x, y); Axpy<float, CUDAContext>(n, alpha, x, y);
} }
template <> void Axpby<float16, CUDAContext>(const int n,
float alpha,
const float16* x,
float beta,
float16* y) {
Scal<float16, CUDAContext>(n, beta, y);
Axpy<float16, CUDAContext>(n, alpha, x, y);
}
/******************** Level-3 ********************/
template <> void RandomUniform<float, CUDAContext>(const int n, template <> void RandomUniform<float, CUDAContext>(const int n,
const float low, const float low,
const float high, const float high,
...@@ -787,6 +313,8 @@ template <> void RandomUniform<float, CUDAContext>(const int n, ...@@ -787,6 +313,8 @@ template <> void RandomUniform<float, CUDAContext>(const int n,
if (low != float(0)) AddScalar<float, CUDAContext>(n, low, x); if (low != float(0)) AddScalar<float, CUDAContext>(n, low, x);
} }
/******************** Level-3 ********************/
template <> void Gemm<float, CUDAContext>(const CBLAS_TRANSPOSE transA, template <> void Gemm<float, CUDAContext>(const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const CBLAS_TRANSPOSE transB,
const int M, const int M,
...@@ -813,51 +341,6 @@ template <> void Gemm<float, CUDAContext>(const CBLAS_TRANSPOSE transA, ...@@ -813,51 +341,6 @@ template <> void Gemm<float, CUDAContext>(const CBLAS_TRANSPOSE transA,
C, N)); C, N));
} }
template <> void Gemm<float16, CUDAContext>(const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB,
const int M,
const int N,
const int K,
const float alpha,
const float16* A,
const float16* B,
const float beta,
float16 *C,
TensorProto_DataType math_type) {
#ifdef WITH_CUDA_FP16
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
cublasOperation_t cuTransA = (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB = (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
if (math_type == TensorProto_DataType_FLOAT) {
const float _alpha_ = alpha, _beta_ = beta;
CUBLAS_CHECK(cublasSgemmEx(cublas_handle(),
cuTransB, cuTransA,
N, M, K,
&_alpha_,
B, CUDA_R_16F, ldb,
A, CUDA_R_16F, lda,
&_beta_,
C, CUDA_R_16F, N));
} else if (math_type == TensorProto_DataType_FLOAT16) {
const half _alpha_ = dragon_cast<half, float>(alpha);
const half _beta_ = dragon_cast<half, float>(beta);
CUBLAS_CHECK(cublasHgemm(cublas_handle(),
cuTransB, cuTransA,
N, M, K,
&_alpha_,
reinterpret_cast<const half*>(B), ldb,
reinterpret_cast<const half*>(A), lda,
&_beta_,
reinterpret_cast<half*>(C), N));
} else {
LOG(FATAL) << "Unsupported math type";
}
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <> void Gemv<float, CUDAContext>(const CBLAS_TRANSPOSE transA, template <> void Gemv<float, CUDAContext>(const CBLAS_TRANSPOSE transA,
const int M, const int N, const int M, const int N,
const float alpha, const float alpha,
...@@ -878,50 +361,6 @@ template <> void Gemv<float, CUDAContext>(const CBLAS_TRANSPOSE transA, ...@@ -878,50 +361,6 @@ template <> void Gemv<float, CUDAContext>(const CBLAS_TRANSPOSE transA,
y, 1)); y, 1));
} }
template <> void Gemv<float16, CUDAContext>(const CBLAS_TRANSPOSE transA,
const int M,
const int N,
const float alpha,
const float16* A,
const float16* x,
const float beta,
float16* y,
TensorProto_DataType math_type) {
#ifdef WITH_CUDA_FP16
cublasOperation_t cuTransA = (transA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
int m = (cuTransA == CUBLAS_OP_N) ? N : M;
int k = (cuTransA == CUBLAS_OP_N) ? M : N;
int LDA = (cuTransA == CUBLAS_OP_N) ? m : k;
int LDC = m;
const float _alpha_ = alpha, _beta_ = beta;
if (math_type == TensorProto_DataType_FLOAT) {
CUBLAS_CHECK(cublasSgemmEx(cublas_handle(),
cuTransA, CUBLAS_OP_N,
m, 1, k,
&_alpha_,
A, CUDA_R_16F, LDA,
x, CUDA_R_16F, k,
&_beta_,
y, CUDA_R_16F, LDC));
} else if (math_type == TensorProto_DataType_FLOAT16) {
const half _alpha_ = dragon_cast<half, float>(alpha);
const half _beta_ = dragon_cast<half, float>(beta);
CUBLAS_CHECK(cublasHgemm(cublas_handle(),
cuTransA, CUBLAS_OP_N,
m, 1, k,
&_alpha_,
reinterpret_cast<const half*>(A), LDA,
reinterpret_cast<const half*>(x), k,
&_beta_,
reinterpret_cast<half*>(y), LDC));
} else {
LOG(FATAL) << "Unsupported math type";
}
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
} // namespace math } // namespace math
} // namespace dragon } // namespace dragon
......
#ifdef WITH_CUDA
#include <cmath>
#include "core/context_cuda.h"
#include "utils/cuda_device.h"
#include "utils/math_functions.h"
#include "utils/cast.h"
namespace dragon {
namespace math {
/******************** Level-0 ********************/
template <typename T>
__global__ void _SetHalf(const int n, const T alpha, T* x) {
CUDA_KERNEL_LOOP(idx, n) {
x[idx] = alpha;
}
}
template <> void Set<float16, CUDAContext>(const int n,
const float16 alpha,
float16* x) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0)
_SetHalf<half2> << <GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
dragon_cast<half2, float16>(alpha),
reinterpret_cast<half2*>(x));
else
_SetHalf<float16> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n, alpha, x);
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <> void RandomUniform<float16, CUDAContext>(const int n,
const float low,
const float high,
float16* x) {
NOT_IMPLEMENTED;
}
template <> void RandomNormal<float16, CUDAContext>(const int n,
const float mu,
const float sigma,
float16* x) {
NOT_IMPLEMENTED;
}
/******************** Level-1 ********************/
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _AddHalf(const int n, const half* a, const half* b, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hadd(a[idx], b[idx]);
#endif
}
}
template <typename T>
__global__ void _AddHalf2(const int n, const half2* a, const half2* b, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hadd2(a[idx], b[idx]);
#endif
}
}
#endif
template <> void Add<float16, CUDAContext>(int n,
const float16* a,
const float16* b,
float16* y) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0)
_AddHalf2<half2> << <GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
reinterpret_cast<const half2*>(a),
reinterpret_cast<const half2*>(b),
reinterpret_cast<half2*>(y));
else _AddHalf<half> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _SubHalf(const int n, const half* a, const half* b, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hsub(a[idx], b[idx]);
#endif
}
}
template <typename T>
__global__ void _SubHalf2(const int n, const half2* a, const half2* b, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hsub2(a[idx], b[idx]);
#endif
}
}
#endif
template <> void Sub<float16, CUDAContext>(int n,
const float16* a,
const float16* b,
float16* y) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0)
_SubHalf2<half2> << <GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
reinterpret_cast<const half2*>(a),
reinterpret_cast<const half2*>(b),
reinterpret_cast<half2*>(y));
else _SubHalf<half> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _MulHalf(const int n, const half* a, const half* b, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul(a[idx], b[idx]);
#endif
}
}
template <typename T>
__global__ void _MulHalf2(const int n, const half2* a, const half2* b, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul2(a[idx], b[idx]);
#endif
}
}
#endif
template <> void Mul<float16, CUDAContext>(int n,
const float16* a,
const float16* b,
float16* y) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0)
_MulHalf2<half2> << <GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
reinterpret_cast<const half2*>(a),
reinterpret_cast<const half2*>(b),
reinterpret_cast<half2*>(y));
else _MulHalf<half> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _DivHalf(const int n, const half* a, const half* b, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hdiv(a[idx], b[idx]);
#endif
}
}
#endif
template <> void Div<float16, CUDAContext>(int n,
const float16* a,
const float16* b,
float16* y) {
#ifdef WITH_CUDA_FP16
_DivHalf<half> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
reinterpret_cast<const half*>(a),
reinterpret_cast<const half*>(b),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _SquareHalf(const int n, const half* x, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul(x[idx], x[idx]);
#endif
}
}
template <typename T>
__global__ void _SquareHalf2(const int n, const half2* x, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul2(x[idx], x[idx]);
#endif
}
}
#endif
template <> void Square<float16, CUDAContext>(int n,
const float16* x,
float16* y) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0)
_SquareHalf2<half2> << < GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
else _SquareHalf<half> << < GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _SqrtHalf(const int n, const half* x, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = hsqrt(x[idx]);
#endif
}
}
template <typename T>
__global__ void _SqrtHalf2(const int n, const half2* x, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = h2sqrt(x[idx]);
#endif
}
}
#endif
template <> void Sqrt<float16, CUDAContext>(int n,
const float16* x,
float16* y) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0)
_SqrtHalf2<half2> << < GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
else _SqrtHalf<half> << < GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _PowHalf(const int n, const float alpha, const half* a, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul(a[idx], a[idx]);
#endif
}
}
template <typename T>
__global__ void _PowHalf2(const int n, const float alpha, const half2* a, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul2(a[idx], a[idx]);
#endif
}
}
#endif
template <> void Pow<float16, CUDAContext>(int n,
const float alpha,
const float16* x,
float16* y) {
#ifdef WITH_CUDA_FP16
CHECK(alpha == float(2)) << "fp16 only support the power of 2";
if (n % 2 == 0)
_PowHalf2<half2> << < GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
alpha,
reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
else _PowHalf<half> << < GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
alpha,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _InvHalf(const int n, const half numerator, const half* x, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul(hrcp(x[idx]), numerator);
#endif
}
}
template <typename T>
__global__ void _InvHalf2(const int n, const half2 numerator, const half2* x, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul2(h2rcp(x[idx]), numerator);
#endif
}
}
#endif
template <> void Inv<float16, CUDAContext>(const int n,
const float numerator,
const float16* x,
float16* y) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0)
_InvHalf2<half2> << < GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
dragon_cast<half2, float>(numerator),
reinterpret_cast<const half2*>(x),
reinterpret_cast<half2*>(y));
else
_InvHalf<half> << < GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
dragon_cast<half, float>(numerator),
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
/******************** Level-2 ********************/
template <> void Scal<float16, CUDAContext>(const int n, const float alpha, float16* y) {
#ifdef WITH_CUDA_FP16
CUBLAS_CHECK(cublasScalEx(cublas_handle(), n,
&alpha, CUDA_R_32F,
y, CUDA_R_16F, 1,
CUDA_R_32F));
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <> void Scale<float16, CUDAContext>(const int n,
const float alpha,
const float16* x,
float16* y) {
CUDAContext ctx;
ctx.Copy<float16, CUDAContext, CUDAContext>(n, y, x);
Scal<float16, CUDAContext>(n, alpha, y);
}
template <> float Dot<float16, CUDAContext>(int n, const float16* a, const float16* b) {
#ifdef WITH_CUDA_FP16
float16 result;
CUBLAS_CHECK(cublasDotEx(cublas_handle(),
n,
&a, CUDA_R_16F, 1,
&b, CUDA_R_16F, 1,
&result, CUDA_R_16F,
CUDA_R_32F));
return dragon_cast<float, float16>(result);
#else
CUDA_FP16_NOT_COMPILED;
return 0.;
#endif
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _AddScalarHalf(const int n, half alpha, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hadd(y[idx], alpha);
#endif
}
}
template <typename T>
__global__ void _AddScalarHalf2(const int n, half2 alpha, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hadd2(y[idx], alpha);
#endif
}
}
#endif
template <> void AddScalar<float16, CUDAContext>(const int n, const float alpha, float16* y) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0) {
_AddScalarHalf2<half2> << <GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
dragon_cast<half2, float>(alpha),
reinterpret_cast<half2*>(y));
} else {
_AddScalarHalf<half> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
dragon_cast<half, float>(alpha),
reinterpret_cast<half*>(y));
}
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _MulScalarHalf(const int n, half alpha, half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul(y[idx], alpha);
#endif
}
}
template <typename T>
__global__ void _MulScalarHalf2(const int n, half2 alpha, half2* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hmul2(y[idx], alpha);
#endif
}
}
#endif
template <> void MulScalar<float16, CUDAContext>(const int n, const float alpha, float16* y) {
#ifdef WITH_CUDA_FP16
if (n % 2 == 0) {
_MulScalarHalf2<half2> << <GET_BLOCKS(n / 2), CUDA_NUM_THREADS >> >(n / 2,
dragon_cast<half2, float>(alpha),
reinterpret_cast<half2*>(y));
} else {
_MulScalarHalf<half> << <GET_BLOCKS(n), CUDA_NUM_THREADS >> >(n,
dragon_cast<half, float>(alpha),
reinterpret_cast<half*>(y));
}
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <> void Axpy<float16, CUDAContext>(const int n,
float alpha,
const float16* x,
float16* y) {
#ifdef WITH_CUDA_FP16
CUBLAS_CHECK(cublasAxpyEx(cublas_handle(), n,
&alpha, CUDA_R_32F,
x, CUDA_R_16F, 1,
y, CUDA_R_16F, 1,
CUDA_R_32F));
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <> void Axpby<float16, CUDAContext>(const int n,
float alpha,
const float16* x,
float beta,
float16* y) {
Scal<float16, CUDAContext>(n, beta, y);
Axpy<float16, CUDAContext>(n, alpha, x, y);
}
/******************** Level-3 ********************/
template <> void Gemm<float16, CUDAContext>(const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB,
const int M,
const int N,
const int K,
const float alpha,
const float16* A,
const float16* B,
const float beta,
float16 *C,
TensorProto_DataType math_type) {
#ifdef WITH_CUDA_FP16
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
cublasOperation_t cuTransA = (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB = (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
if (math_type == TensorProto_DataType_FLOAT) {
const float _alpha_ = alpha, _beta_ = beta;
#if CUDA_VERSION >= 9000
if (TENSOR_CORE_AVAILABLE()) {
// GEMM + MATH32 + TENSOR-CORE
CUBLAS_CHECK(cublasGemmEx(cublas_handle(),
cuTransB, cuTransA,
N, M, K,
&_alpha_,
B, CUDA_R_16F, ldb,
A, CUDA_R_16F, lda,
&_beta_,
C, CUDA_R_16F, N,
CUDA_R_32F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
} else {
// GEMM + MATH32 + DEFAULT
CUBLAS_CHECK(cublasSgemmEx(cublas_handle(),
cuTransB, cuTransA,
N, M, K,
&_alpha_,
B, CUDA_R_16F, ldb,
A, CUDA_R_16F, lda,
&_beta_,
C, CUDA_R_16F, N));
}
#else
CUBLAS_CHECK(cublasSgemmEx(cublas_handle(),
cuTransB, cuTransA,
N, M, K,
&_alpha_,
B, CUDA_R_16F, ldb,
A, CUDA_R_16F, lda,
&_beta_,
C, CUDA_R_16F, N));
#endif
} else if (math_type == TensorProto_DataType_FLOAT16) {
const half _alpha_ = dragon_cast<half, float>(alpha);
const half _beta_ = dragon_cast<half, float>(beta);
#if CUDA_VERSION >= 9000
if (TENSOR_CORE_AVAILABLE()) {
// GEMM + MATH16 + TENSOR-CORE
CUBLAS_CHECK(cublasGemmEx(cublas_handle(),
cuTransB, cuTransA,
N, M, K,
&_alpha_,
B, CUDA_R_16F, ldb,
A, CUDA_R_16F, lda,
&_beta_,
C, CUDA_R_16F, N,
CUDA_R_16F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
} else {
// GEMM + MATH16 + DEFAULT
CUBLAS_CHECK(cublasHgemm(cublas_handle(),
cuTransB, cuTransA,
N, M, K,
&_alpha_,
reinterpret_cast<const half*>(B), ldb,
reinterpret_cast<const half*>(A), lda,
&_beta_,
reinterpret_cast<half*>(C), N));
}
#else
CUBLAS_CHECK(cublasHgemm(cublas_handle(),
cuTransB, cuTransA,
N, M, K,
&_alpha_,
reinterpret_cast<const half*>(B), ldb,
reinterpret_cast<const half*>(A), lda,
&_beta_,
reinterpret_cast<half*>(C), N));
#endif
} else {
LOG(FATAL) << "Unsupported math type";
}
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <> void Gemv<float16, CUDAContext>(const CBLAS_TRANSPOSE transA,
const int M,
const int N,
const float alpha,
const float16* A,
const float16* x,
const float beta,
float16* y,
TensorProto_DataType math_type) {
#ifdef WITH_CUDA_FP16
cublasOperation_t cuTransA = (transA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
int m = (cuTransA == CUBLAS_OP_N) ? N : M;
int k = (cuTransA == CUBLAS_OP_N) ? M : N;
int LDA = (cuTransA == CUBLAS_OP_N) ? m : k;
int LDC = m;
const float _alpha_ = alpha, _beta_ = beta;
if (math_type == TensorProto_DataType_FLOAT) {
#if CUDA_VERSION >= 9000
if (TENSOR_CORE_AVAILABLE()) {
// GEMV + MATH32 + TENSOR-CORE
CUBLAS_CHECK(cublasGemmEx(cublas_handle(),
cuTransA, CUBLAS_OP_N,
m, 1, k,
&_alpha_,
A, CUDA_R_16F, LDA,
x, CUDA_R_16F, k,
&_beta_,
y, CUDA_R_16F, LDC,
CUDA_R_32F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
} else {
// GEMV + MATH32 + DEFAULT
CUBLAS_CHECK(cublasSgemmEx(cublas_handle(),
cuTransA, CUBLAS_OP_N,
m, 1, k,
&_alpha_,
A, CUDA_R_16F, LDA,
x, CUDA_R_16F, k,
&_beta_,
y, CUDA_R_16F, LDC));
}
#else
CUBLAS_CHECK(cublasSgemmEx(cublas_handle(),
cuTransA, CUBLAS_OP_N,
m, 1, k,
&_alpha_,
A, CUDA_R_16F, LDA,
x, CUDA_R_16F, k,
&_beta_,
y, CUDA_R_16F, LDC));
#endif
} else if (math_type == TensorProto_DataType_FLOAT16) {
const half _alpha_ = dragon_cast<half, float>(alpha);
const half _beta_ = dragon_cast<half, float>(beta);
#if CUDA_VERSION >= 9000
if (TENSOR_CORE_AVAILABLE()) {
// GEMV + MATH16 + TENSOR-CORE
CUBLAS_CHECK(cublasGemmEx(cublas_handle(),
cuTransA, CUBLAS_OP_N,
m, 1, k,
&_alpha_,
A, CUDA_R_16F, LDA,
x, CUDA_R_16F, k,
&_beta_,
y, CUDA_R_16F, LDC,
CUDA_R_16F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
} else {
// GEMV + MATH16 + DEFAULT
CUBLAS_CHECK(cublasHgemm(cublas_handle(),
cuTransA, CUBLAS_OP_N,
m, 1, k,
&_alpha_,
reinterpret_cast<const half*>(A), LDA,
reinterpret_cast<const half*>(x), k,
&_beta_,
reinterpret_cast<half*>(y), LDC));
}
#else
CUBLAS_CHECK(cublasHgemm(cublas_handle(),
cuTransA, CUBLAS_OP_N,
m, 1, k,
&_alpha_,
reinterpret_cast<const half*>(A), LDA,
reinterpret_cast<const half*>(x), k,
&_beta_,
reinterpret_cast<half*>(y), LDC));
#endif
} else {
LOG(FATAL) << "Unsupported math type";
}
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
} // namespace math
} // namespace dragon
#endif // WITH_CUDA
\ No newline at end of file
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
#include "utils/omp_alternative.h" #include "utils/omp_alternative.h"
#include "utils/sse_alternative.h" #include "utils/sse_alternative.h"
#include "utils/math_functions.h" #include "utils/math_functions.h"
#include "utils/cast.h"
bool judge(int a, int b) { return unsigned(a) < unsigned(b); } bool judge(int a, int b) { return unsigned(a) < unsigned(b); }
...@@ -359,6 +360,70 @@ template<> void TanhGrad<float, CPUContext>(const int count, ...@@ -359,6 +360,70 @@ template<> void TanhGrad<float, CPUContext>(const int count,
} }
} }
/******************** arithmetic.affine ********************/
template<> void Affine<float, CPUContext>(const int count,
const int outer_dim,
const int scale_dim,
const int inner_dim,
const float* x,
const float* alpha,
const float* beta,
const float* beta_multiplier,
float* y) {
// Ax
auto* Xdata = x; auto* Ydata = y;
for (int n = 0; n < outer_dim; ++n) {
for (int d = 0; d < scale_dim; ++d) {
math::Scale<float, CPUContext>(inner_dim, alpha[d], Xdata, Ydata);
Xdata += inner_dim;
Ydata += inner_dim;
}
}
// Pb
if (beta != nullptr && beta_multiplier != nullptr) {
int dim = scale_dim * inner_dim;
Ydata = y;
for (int n = 0; n < outer_dim; ++n) {
math::Gemm<float, CPUContext>(CblasNoTrans, CblasNoTrans,
scale_dim, inner_dim, 1,
1.0,
beta, beta_multiplier,
1.0,
Ydata);
Ydata += dim;
}
}
}
template<> void Affine<float16, CPUContext>(const int count,
const int outer_dim,
const int scale_dim,
const int inner_dim,
const float16* x,
const float16* alpha,
const float16* beta,
const float16* beta_multiplier,
float16* y) {
LOG(FATAL) << "float16 is unsupported for CPUContext.";
}
template <> void AffineGrad<float, CPUContext>(const int count,
const int outer_dim,
const int scale_dim,
const int inner_dim,
const float* dy,
const float* alpha,
float* dx) {
auto* dYdata = dy; auto* dXdata = dx;
for (int n = 0; n < outer_dim; ++n) {
for (int d = 0; d < scale_dim; ++d) {
math::Scale<float, CPUContext>(inner_dim, alpha[d], dYdata, dXdata);
dYdata += inner_dim; dXdata += inner_dim;
}
}
}
/******************** arithmetic.bias_add ********************/ /******************** arithmetic.bias_add ********************/
template<> void BiasAdd<float, CPUContext>(const int count, template<> void BiasAdd<float, CPUContext>(const int count,
...@@ -408,77 +473,6 @@ template <> void Clip<float, CPUContext>(const int count, ...@@ -408,77 +473,6 @@ template <> void Clip<float, CPUContext>(const int count,
} }
} }
/******************** arithmetic.scale ********************/
template<> void Scale<float, CPUContext>(const int axis,
Tensor* x,
Tensor* gamma,
Tensor* beta,
Tensor* BMul,
Tensor* y) {
int outer_dim = x->count(0, axis);
int inner_dim = x->count(axis + gamma->ndim());
int scale_dim = gamma->count();
auto* Xdata = x->data<float, CPUContext>();
auto* Ydata = y->mutable_data<float, CPUContext>();
auto* Sdata = gamma->data<float, CPUContext>();
auto* Bdata = beta != nullptr ?
beta->data<float, CPUContext>() :
nullptr;
auto* BMul_data = BMul != nullptr ?
BMul->data<float, CPUContext>() :
nullptr;
for (int n = 0; n < outer_dim; ++n) {
for (int d = 0; d < scale_dim; ++d) {
const float factor = Sdata[d];
math::Scale<float, CPUContext>(inner_dim, factor, Xdata, Ydata);
Xdata += inner_dim;
Ydata += inner_dim;
}
}
if (Bdata != nullptr) {
int dim = scale_dim * inner_dim;
Ydata = y->mutable_data<float, CPUContext>();
for (int n = 0; n < outer_dim; ++n) {
math::Gemm<float, CPUContext>(CblasNoTrans, CblasNoTrans,
scale_dim, inner_dim, 1,
1.0,
Bdata, BMul_data,
1.0,
Ydata);
Ydata += dim;
}
}
}
template<> void Scale<float16, CPUContext>(const int axis,
Tensor* x,
Tensor* gamma,
Tensor* beta,
Tensor* BMul,
Tensor* y) {
LOG(FATAL) << "float16 is unsupported for CPUContext.";
}
template <> void ScaleGrad<float, CPUContext>(const int axis,
Tensor* dy,
Tensor* gamma,
Tensor* dx) {
int outer_dim = dx->count(0, axis);
int inner_dim = dx->count(axis + gamma->ndim());
int scale_dim = gamma->count();
auto* dYdata = dy->data<float, CPUContext>();
auto* dXdata = dx->mutable_data<float, CPUContext>();
auto* Sdata = gamma->data<float, CPUContext>();
for (int n = 0; n < outer_dim; ++n) {
for (int d = 0; d < scale_dim; ++d) {
const float factor = Sdata[d];
math::Scale<float, CPUContext>(inner_dim, factor, dYdata, dXdata);
dYdata += inner_dim; dXdata += inner_dim;
}
}
}
/******************** control_flow.compare ********************/ /******************** control_flow.compare ********************/
template <> void Equal<float, CPUContext>(const int count, template <> void Equal<float, CPUContext>(const int count,
...@@ -809,7 +803,7 @@ template<> void SparseSoftmaxFocalLossGrad<float, CPUContext>(const int count, ...@@ -809,7 +803,7 @@ template<> void SparseSoftmaxFocalLossGrad<float, CPUContext>(const int count,
} }
} }
/******************** misc.dtype ********************/ /******************** misc.astype ********************/
template <typename Ta, typename Tb> template <typename Ta, typename Tb>
void _TypeA2B(const int count, const Ta* a, Tb* b) { void _TypeA2B(const int count, const Ta* a, Tb* b) {
...@@ -819,6 +813,14 @@ void _TypeA2B(const int count, const Ta* a, Tb* b) { ...@@ -819,6 +813,14 @@ void _TypeA2B(const int count, const Ta* a, Tb* b) {
for (int i = 0; i < count; ++i) b[i] = a[i]; for (int i = 0; i < count; ++i) b[i] = a[i];
} }
template <typename Ta, typename Tb>
void _TypeA2B_v2(const int count, const Ta* a, Tb* b) {
#ifdef WITH_OMP
#pragma omp parallel for num_threads(GET_OMP_THREADS(count))
#endif
for (int i = 0; i < count; ++i) b[i] = dragon_cast<Tb, Ta>(a[i]);
}
#define DEFINE_TYPE_A2B(type_a, type_b) \ #define DEFINE_TYPE_A2B(type_a, type_b) \
template <> void TypeA2B<type_a, type_b, CPUContext>(const int count, \ template <> void TypeA2B<type_a, type_b, CPUContext>(const int count, \
const type_a* a, \ const type_a* a, \
...@@ -826,6 +828,13 @@ void _TypeA2B(const int count, const Ta* a, Tb* b) { ...@@ -826,6 +828,13 @@ void _TypeA2B(const int count, const Ta* a, Tb* b) {
_TypeA2B<type_a, type_b>(count, a, b); \ _TypeA2B<type_a, type_b>(count, a, b); \
} }
#define DEFINE_TYPE_A2B_V2(type_a, type_b) \
template <> void TypeA2B<type_a, type_b, CPUContext>(const int count, \
const type_a* a, \
type_b* b) { \
_TypeA2B_v2<type_a, type_b>(count, a, b); \
}
#define DEFINE_TYPE_DISABLE_FP16(type) \ #define DEFINE_TYPE_DISABLE_FP16(type) \
template <> void TypeA2B<float16, type, CPUContext>(const int count, \ template <> void TypeA2B<float16, type, CPUContext>(const int count, \
const float16* a, \ const float16* a, \
...@@ -845,13 +854,10 @@ void _TypeA2B(const int count, const Ta* a, Tb* b) { ...@@ -845,13 +854,10 @@ void _TypeA2B(const int count, const Ta* a, Tb* b) {
DEFINE_TYPE_A2B(type_a, int64_t); \ DEFINE_TYPE_A2B(type_a, int64_t); \
DEFINE_TYPE_A2B(type_a, uint8_t); DEFINE_TYPE_A2B(type_a, uint8_t);
template <> void TypeA2B<float16, float16, CPUContext>(const int count, DEFINE_TYPE_A2B_V2(float16, float);
const float16* a, DEFINE_TYPE_A2B_V2(float, float16);
float16* b) { DEFINE_TYPE_A2B_V2(float16, float16);
LOG(FATAL) << "float16 is unsupported for CPUContext."; DEFINE_TYPE_A2ALL(float);
}
DEFINE_TYPE_A2ALL(float); DEFINE_TYPE_DISABLE_FP16(float);
DEFINE_TYPE_A2ALL(double); DEFINE_TYPE_DISABLE_FP16(double); DEFINE_TYPE_A2ALL(double); DEFINE_TYPE_DISABLE_FP16(double);
DEFINE_TYPE_A2ALL(int); DEFINE_TYPE_DISABLE_FP16(int); DEFINE_TYPE_A2ALL(int); DEFINE_TYPE_DISABLE_FP16(int);
DEFINE_TYPE_A2ALL(int64_t); DEFINE_TYPE_DISABLE_FP16(int64_t); DEFINE_TYPE_A2ALL(int64_t); DEFINE_TYPE_DISABLE_FP16(int64_t);
......
...@@ -14,18 +14,13 @@ namespace dragon { ...@@ -14,18 +14,13 @@ namespace dragon {
namespace kernel { namespace kernel {
template <typename T> template <typename T>
__global__ void _Empty() { } __global__ void _Empty() {}
template<> void Empty<float, CUDAContext>() { template<> void Empty<float, CUDAContext>() {
_Empty<float> << <1, 1 >> >(); _Empty<float> << <1, 1 >> >();
CUDA_POST_KERNEL_CHECK; CUDA_POST_KERNEL_CHECK;
} }
template<> void Empty<float16, CUDAContext>() {
_Empty<float16> << <1, 1 >> >();
CUDA_POST_KERNEL_CHECK;
}
/******************** activation.dropout ********************/ /******************** activation.dropout ********************/
template<typename T> template<typename T>
...@@ -357,49 +352,6 @@ template<> void Relu<float, CUDAContext>(const int count, ...@@ -357,49 +352,6 @@ template<> void Relu<float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK; CUDA_POST_KERNEL_CHECK;
} }
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _ReluHalf(const int count, const half* x, const half slope, half* y) {
const half kZero = __float2half(0.f);
CUDA_KERNEL_LOOP(idx, count) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hgt(x[idx], kZero) ? x[idx] : __hmul(x[idx], slope);
#endif
}
}
template <typename T>
__global__ void _ReluHalf2(const int count, const half2* x, const half2 slope, half2* y) {
const half2 kZero = __float2half2_rn(0.f);
CUDA_KERNEL_LOOP(idx, count) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hbgt2(x[idx], kZero) ? x[idx] : __hmul2(x[idx], slope);
#endif
}
}
#endif
template<> void Relu<float16, CUDAContext>(const int count,
const float16* x,
const float slope,
float16* y) {
#ifdef WITH_CUDA_FP16
if (count % 2 == 0)
_ReluHalf2<half2> << < GET_BLOCKS(count), CUDA_NUM_THREADS >> > (count / 2,
reinterpret_cast<const half2*>(x),
dragon_cast<half2, float>(slope),
reinterpret_cast<half2*>(y));
else
_ReluHalf<half> << < GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
reinterpret_cast<const half*>(x),
dragon_cast<half, float>(slope),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <typename T> template <typename T>
__global__ void _ReluGrad(const int count, __global__ void _ReluGrad(const int count,
const T* dy, const T* dy,
...@@ -668,6 +620,69 @@ template<> void TanhGrad<float, CUDAContext>(const int count, ...@@ -668,6 +620,69 @@ template<> void TanhGrad<float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK; CUDA_POST_KERNEL_CHECK;
} }
/******************** arithmetic.scale ********************/
template <typename T>
__global__ void _AffineWithOBias(const int count,
const int scale_dim,
const int inner_dim,
const T* x,
const T* alpha,
T* y) {
CUDA_KERNEL_LOOP(idx, count) {
const int scale_idx = (idx / inner_dim) % scale_dim;
y[idx] = alpha[scale_idx] * x[idx];
}
}
template <typename T>
__global__ void _AffineWithBias(const int count,
const int scale_dim,
const int inner_dim,
const T* x,
const T* alpha,
const T* beta,
T* y) {
CUDA_KERNEL_LOOP(idx, count) {
const int scale_idx = (idx / inner_dim) % scale_dim;
y[idx] = alpha[scale_idx] * x[idx] + beta[scale_idx];
}
}
template<> void Affine<float, CUDAContext>(const int count,
const int outer_dim,
const int scale_dim,
const int inner_dim,
const float* x,
const float* alpha,
const float* beta,
const float* beta_multiplier,
float* y) {
if (beta != nullptr) {
_AffineWithBias<float> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
scale_dim, inner_dim,
x, alpha, beta, y);
} else {
_AffineWithOBias<float> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
scale_dim, inner_dim,
x, alpha, y);
}
CUDA_POST_KERNEL_CHECK;
}
template <> void AffineGrad<float, CUDAContext>(const int count,
const int outer_dim,
const int scale_dim,
const int inner_dim,
const float* dy,
const float* alpha,
float* dx) {
_AffineWithOBias<float> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
scale_dim, inner_dim,
dy, alpha, dx);
CUDA_POST_KERNEL_CHECK;
}
/******************** arithmetic.bias_add ********************/ /******************** arithmetic.bias_add ********************/
template <typename T> template <typename T>
...@@ -747,172 +762,6 @@ template <> void Clip<float, CUDAContext>(const int count, ...@@ -747,172 +762,6 @@ template <> void Clip<float, CUDAContext>(const int count,
y); y);
} }
/******************** arithmetic.scale ********************/
template <typename T>
__global__ void _ScaleWithoutBias(const int n,
const T* x,
const T* scale,
const int scale_dim,
const int inner_dim,
T* y) {
CUDA_KERNEL_LOOP(idx, n) {
const int scale_idx = (idx / inner_dim) % scale_dim;
y[idx] = x[idx] * scale[scale_idx];
}
}
template <typename T>
__global__ void _ScaleWithBias(const int n,
const T* x,
const T* scale,
const T* bias,
const int scale_dim,
const int inner_dim,
T* y) {
CUDA_KERNEL_LOOP(idx, n) {
const int scale_idx = (idx / inner_dim) % scale_dim;
y[idx] = x[idx] * scale[scale_idx] + bias[scale_idx];
}
}
template<> void Scale<float, CUDAContext>(const int axis,
Tensor* x,
Tensor* gamma,
Tensor* beta,
Tensor* BMul,
Tensor* y) {
const int count = x->count();
const int inner_dim = x->count(axis + gamma->ndim());
const int scale_dim = gamma->count();
auto* Xdata = x->data<float, CUDAContext>();
auto* Ydata = y->mutable_data<float, CUDAContext>();
auto* Sdata = gamma->data<float, CUDAContext>();
auto* Bdata = beta != nullptr ?
beta->data<float, CUDAContext>() :
nullptr;
if (Bdata != nullptr)
_ScaleWithBias<float> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
Xdata,
Sdata,
Bdata,
scale_dim,
inner_dim,
Ydata);
else _ScaleWithoutBias<float> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
Xdata,
Sdata,
scale_dim,
inner_dim,
Ydata);
}
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _ScaleWithoutBiasHalf(const int n,
const half* x,
const half* scale,
const int scale_dim,
const int inner_dim,
half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
const int scale_idx = (idx / inner_dim) % scale_dim;
y[idx] = __hmul(x[idx], scale[scale_idx]);
#endif
}
}
template <typename T>
__global__ void _ScaleWithBiasHalf(const int n,
const half* x,
const half* scale,
const half* bias,
const int scale_dim,
const int inner_dim,
half* y) {
CUDA_KERNEL_LOOP(idx, n) {
#if __CUDA_ARCH__ >= 530
const int scale_idx = (idx / inner_dim) % scale_dim;
y[idx] = __hadd(__hmul(x[idx], scale[scale_idx]), bias[scale_idx]);
#endif
}
}
#endif
template<> void Scale<float16, CUDAContext>(const int axis,
Tensor* x,
Tensor* gamma,
Tensor* beta,
Tensor* BMul,
Tensor* y) {
#ifdef WITH_CUDA_FP16
const int count = x->count();
const int inner_dim = x->count(axis + gamma->ndim());
const int scale_dim = gamma->count();
auto* Xdata = x->data<float16, CUDAContext>();
auto* Ydata = y->mutable_data<float16, CUDAContext>();
auto* Sdata = gamma->data<float16, CUDAContext>();
auto* Bdata = beta != nullptr ?
beta->data<float16, CUDAContext>() :
nullptr;
if (Bdata != nullptr)
_ScaleWithBiasHalf<half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
reinterpret_cast<const half*>(Xdata),
reinterpret_cast<const half*>(Sdata),
reinterpret_cast<const half*>(Bdata),
scale_dim,
inner_dim,
reinterpret_cast<half*>(Ydata));
else _ScaleWithoutBiasHalf<half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
reinterpret_cast<const half*>(Xdata),
reinterpret_cast<const half*>(Sdata),
scale_dim,
inner_dim,
reinterpret_cast<half*>(Ydata));
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <> void ScaleGrad<float, CUDAContext>(const int axis,
Tensor* dy,
Tensor* gamma,
Tensor* dx) {
const int count = dx->count();
const int inner_dim = dx->count(axis + gamma->ndim());
const int scale_dim = gamma->count();
auto* dYdata = dy->data<float, CUDAContext>();
auto* dXdata = dx->mutable_data<float, CUDAContext>();
auto* Sdata = gamma->data<float, CUDAContext>();
_ScaleWithoutBias<float> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
dYdata,
Sdata,
scale_dim,
inner_dim,
dXdata);
}
/******************** cast.float2half ********************/
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _FloatToHalfKernel(const int count, const float* x, half* y) {
CUDA_KERNEL_LOOP(idx, count) {
y[idx] = __float2half(x[idx]);
}
}
template <> void Float2Half<float, CUDAContext>(const int count,
const float* x,
float16* y) {
_FloatToHalfKernel<float> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
x,
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
}
#endif
/******************** control_flow.compare ********************/ /******************** control_flow.compare ********************/
template <typename T> template <typename T>
...@@ -1387,7 +1236,7 @@ template<> void SparseSoftmaxFocalLossGrad<float, CUDAContext>(const int count, ...@@ -1387,7 +1236,7 @@ template<> void SparseSoftmaxFocalLossGrad<float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK; CUDA_POST_KERNEL_CHECK;
} }
/******************** misc.dtype ********************/ /******************** misc.astype ********************/
template <typename Ta, typename Tb> template <typename Ta, typename Tb>
__global__ void _TypeA2B(const int count, const Ta* a, Tb* b) { __global__ void _TypeA2B(const int count, const Ta* a, Tb* b) {
...@@ -1396,19 +1245,6 @@ __global__ void _TypeA2B(const int count, const Ta* a, Tb* b) { ...@@ -1396,19 +1245,6 @@ __global__ void _TypeA2B(const int count, const Ta* a, Tb* b) {
} }
} }
#ifdef WITH_CUDA_FP16
__global__ void _TypeHalf2Float(const int count, const half* a, float* b) {
CUDA_KERNEL_LOOP(idx, count) {
b[idx] = __half2float(a[idx]);
}
}
__global__ void _TypeFloat2Half(const int count, const float* a, half* b) {
CUDA_KERNEL_LOOP(idx, count) {
b[idx] = __float2half(a[idx]);
}
}
#endif
#define DEFINE_TYPE_A2B(type_a, type_b) \ #define DEFINE_TYPE_A2B(type_a, type_b) \
template <> void TypeA2B<type_a, type_b, CUDAContext>(const int count, \ template <> void TypeA2B<type_a, type_b, CUDAContext>(const int count, \
const type_a* a, \ const type_a* a, \
...@@ -1416,34 +1252,6 @@ __global__ void _TypeFloat2Half(const int count, const float* a, half* b) { ...@@ -1416,34 +1252,6 @@ __global__ void _TypeFloat2Half(const int count, const float* a, half* b) {
_TypeA2B<type_a, type_b> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count, a, b); \ _TypeA2B<type_a, type_b> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count, a, b); \
} }
#define DEFINE_TYPE_DISABLE_FP16(type) \
template <> void TypeA2B<float16, type, CUDAContext>(const int count, \
const float16* a, \
type* b) { \
LOG(FATAL) << "CUDAContext has not implemented: float16 -> " \
<< TypeMetaToString(TypeMeta::Make<type>()); \
} \
template <> void TypeA2B<type, float16, CUDAContext>(const int count, \
const type* a, \
float16* b) { \
LOG(FATAL) << "CUDAContext has not implemented: " \
<< TypeMetaToString(TypeMeta::Make<type>()) << " -> float16"; \
}
#define DEFINE_TYPE_ENABLE_FP16_FP32 \
template <> void TypeA2B<float16, float, CUDAContext>(const int count, \
const float16* a, \
float* b) { \
_TypeHalf2Float << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count, \
reinterpret_cast<const half*>(a), b); \
} \
template <> void TypeA2B<float, float16, CUDAContext>(const int count, \
const float* a, \
float16* b) { \
_TypeFloat2Half << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count, \
a, reinterpret_cast<half*>(b)); \
}
#define DEFINE_TYPE_A2ALL(type_a) \ #define DEFINE_TYPE_A2ALL(type_a) \
DEFINE_TYPE_A2B(type_a, float); \ DEFINE_TYPE_A2B(type_a, float); \
DEFINE_TYPE_A2B(type_a, double); \ DEFINE_TYPE_A2B(type_a, double); \
...@@ -1457,32 +1265,6 @@ DEFINE_TYPE_A2ALL(int); ...@@ -1457,32 +1265,6 @@ DEFINE_TYPE_A2ALL(int);
DEFINE_TYPE_A2ALL(int64_t); DEFINE_TYPE_A2ALL(int64_t);
DEFINE_TYPE_A2ALL(uint8_t); DEFINE_TYPE_A2ALL(uint8_t);
#ifdef WITH_CUDA_FP16
template <> void TypeA2B<float16, float16, CUDAContext>(const int count,
const float16* a,
float16* b) {
_TypeA2B<half, half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
reinterpret_cast<const half*>(a),
reinterpret_cast<half*>(b));
}
DEFINE_TYPE_ENABLE_FP16_FP32;
DEFINE_TYPE_DISABLE_FP16(double);
DEFINE_TYPE_DISABLE_FP16(int);
DEFINE_TYPE_DISABLE_FP16(int64_t);
DEFINE_TYPE_DISABLE_FP16(uint8_t);
#else
template <> void TypeA2B<float16, float16, CUDAContext>(const int count,
const float16* a,
float16* b) {
LOG(FATAL) << "CUDAContext has not implemented: float16 -> float16";
}
DEFINE_TYPE_DISABLE_FP16(float);
DEFINE_TYPE_DISABLE_FP16(double);
DEFINE_TYPE_DISABLE_FP16(int);
DEFINE_TYPE_DISABLE_FP16(int64_t);
DEFINE_TYPE_DISABLE_FP16(uint8_t);
#endif
/******************** misc.image_data ********************/ /******************** misc.image_data ********************/
template <typename Tx, typename Ty> template <typename Tx, typename Ty>
...@@ -1522,43 +1304,6 @@ __global__ void _ImageData_NHWC(const int count, ...@@ -1522,43 +1304,6 @@ __global__ void _ImageData_NHWC(const int count,
} }
} }
template <typename Tx, typename Ty>
__global__ void _ImageDataHalf_NCHW(const int count,
const int N, const int C,
const int H, const int W,
const float* mean_values,
const float* std_values,
const Tx* x,
Ty* y) {
CUDA_KERNEL_LOOP(idx, count) {
const int w = idx % W;
const int h = (idx / W) % H;
const int c = (idx / W / H) % C;
const int n = idx / W / H / C;
float raw_value = x[((n * H + h) * W + w) * C + c];
if (mean_values != nullptr) raw_value -= mean_values[c];
if (std_values != nullptr) raw_value /= std_values[c];
y[idx] = __float2half(raw_value);
}
}
template <typename Tx, typename Ty>
__global__ void _ImageDataHalf_NHWC(const int count,
const int N, const int C,
const int H, const int W,
const float* mean_values,
const float* std_values,
const Tx* x,
Ty* y) {
CUDA_KERNEL_LOOP(idx, count) {
const int c = idx % C;
float raw_value = x[idx];
if (mean_values != nullptr) raw_value -= mean_values[c];
if (std_values != nullptr) raw_value /= std_values[c];
y[idx] = __float2half(raw_value);
}
}
template <> void ImageData<float, float, CUDAContext>(const int count, template <> void ImageData<float, float, CUDAContext>(const int count,
const int N, const int C, const int N, const int C,
const int H, const int W, const int H, const int W,
...@@ -1613,67 +1358,7 @@ template <> void ImageData<uint8_t, float, CUDAContext>(const int count, ...@@ -1613,67 +1358,7 @@ template <> void ImageData<uint8_t, float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK; CUDA_POST_KERNEL_CHECK;
} }
template <> void ImageData<float, float16, CUDAContext>(const int count,
const int N, const int C,
const int H, const int W,
const float* mean_values,
const float* std_values,
const string& data_format,
const float* x,
float16* y) {
#ifdef WITH_CUDA_FP16
if (data_format == "NCHW") {
_ImageDataHalf_NCHW<float, half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
N, C, H, W,
mean_values,
std_values,
x,
reinterpret_cast<half*>(y));
} else if (data_format == "NHWC") {
_ImageDataHalf_NHWC<float, half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
N, C, H, W,
mean_values,
std_values,
x,
reinterpret_cast<half*>(y));
} else LOG(FATAL) << "Unknown data format: " << data_format;
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <> void ImageData<uint8_t, float16, CUDAContext>(const int count,
const int N, const int C,
const int H, const int W,
const float* mean_values,
const float* std_values,
const string& data_format,
const uint8_t* x,
float16* y) {
#ifdef WITH_CUDA_FP16
if (data_format == "NCHW") {
_ImageDataHalf_NCHW<uint8_t, half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
N, C, H, W,
mean_values,
std_values,
x,
reinterpret_cast<half*>(y));
} else if (data_format == "NHWC") {
_ImageDataHalf_NHWC<uint8_t, half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
N, C, H, W,
mean_values,
std_values,
x,
reinterpret_cast<half*>(y));
} else LOG(FATAL) << "Unknown data format: " << data_format;
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
/******************** ndarray.argmax ********************/ /******************** ndarray.argmax ********************/
...@@ -1998,30 +1683,6 @@ template <> void Concat<float, CUDAContext>(const int count, ...@@ -1998,30 +1683,6 @@ template <> void Concat<float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK; CUDA_POST_KERNEL_CHECK;
} }
template <> void Concat<float16, CUDAContext>(const int count,
const int outer_dim,
const int inner_dim,
const int x_concat_dim,
const int y_concat_dim,
const int concat_offset,
const float16* x,
float16* y,
CUDAContext* context) {
#ifdef WITH_CUDA_FP16
_Concat<half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
outer_dim,
inner_dim,
x_concat_dim,
y_concat_dim,
concat_offset,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <typename T> template <typename T>
__global__ void _ConcatGrad(const int count, __global__ void _ConcatGrad(const int count,
const int outer_dim, const int outer_dim,
...@@ -2061,29 +1722,7 @@ template <> void ConcatGrad<float, CUDAContext>(const int count, ...@@ -2061,29 +1722,7 @@ template <> void ConcatGrad<float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK; CUDA_POST_KERNEL_CHECK;
} }
template <> void ConcatGrad<float16, CUDAContext>(const int count,
const int outer_dim,
const int inner_dim,
const int x_concat_dim,
const int y_concat_dim,
const int concat_offset,
const float16* dy,
float16* dx,
CUDAContext* context) {
#ifdef WITH_CUDA_FP16
_ConcatGrad<half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
outer_dim,
inner_dim,
x_concat_dim,
y_concat_dim,
concat_offset,
reinterpret_cast<const half*>(dy),
reinterpret_cast<half*>(dx));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
/******************** ndarray.crop ********************/ /******************** ndarray.crop ********************/
...@@ -2746,28 +2385,6 @@ template <> void Transpose<float, CUDAContext>(const int count, ...@@ -2746,28 +2385,6 @@ template <> void Transpose<float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK; CUDA_POST_KERNEL_CHECK;
} }
template <> void Transpose<float16, CUDAContext>(const int count,
const int ndim,
const int* order,
const int* old_steps,
const int* new_steps,
const float16* x,
float16* y) {
#ifdef WITH_CUDA_FP16
_Transpose<half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
ndim,
order,
old_steps,
new_steps,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <typename T> template <typename T>
__global__ void _TransposeGrad(const int count, __global__ void _TransposeGrad(const int count,
const int ndim, const int ndim,
...@@ -2804,27 +2421,6 @@ template <> void TransposeGrad<float, CUDAContext>(const int count, ...@@ -2804,27 +2421,6 @@ template <> void TransposeGrad<float, CUDAContext>(const int count,
CUDA_POST_KERNEL_CHECK; CUDA_POST_KERNEL_CHECK;
} }
template <> void TransposeGrad<float16, CUDAContext>(const int count,
const int ndim,
const int* order,
const int* old_steps,
const int* new_steps,
const float16* dy,
float16* dx) {
#ifdef WITH_CUDA_FP16
_TransposeGrad<half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
ndim,
order,
old_steps,
new_steps,
reinterpret_cast<const half*>(dy),
reinterpret_cast<half*>(dx));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
/******************** recurrent.lstm_uint ********************/ /******************** recurrent.lstm_uint ********************/
template <typename T> template <typename T>
......
#ifdef WITH_CUDA
#include <cmath>
#include "core/context_cuda.h"
#include "core/tensor.h"
#include "utils/cuda_device.h"
#include "utils/op_kernel.h"
#include "utils/math_functions.h"
#include "utils/cast.h"
namespace dragon {
namespace kernel {
template <typename T>
__global__ void _EmptyHalf() {}
template<> void Empty<float16, CUDAContext>() {
_EmptyHalf<float16> << <1, 1 >> >();
CUDA_POST_KERNEL_CHECK;
}
/******************** activation.relu ********************/
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _ReluHalf(const int count, const half* x, const half slope, half* y) {
const half kZero = __float2half(0.f);
CUDA_KERNEL_LOOP(idx, count) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hgt(x[idx], kZero) ? x[idx] : __hmul(x[idx], slope);
#endif
}
}
template <typename T>
__global__ void _ReluHalf2(const int count, const half2* x, const half2 slope, half2* y) {
const half2 kZero = __float2half2_rn(0.f);
CUDA_KERNEL_LOOP(idx, count) {
#if __CUDA_ARCH__ >= 530
y[idx] = __hbgt2(x[idx], kZero) ? x[idx] : __hmul2(x[idx], slope);
#endif
}
}
#endif
template<> void Relu<float16, CUDAContext>(const int count,
const float16* x,
const float slope,
float16* y) {
#ifdef WITH_CUDA_FP16
if (count % 2 == 0)
_ReluHalf2<half2> << < GET_BLOCKS(count), CUDA_NUM_THREADS >> > (count / 2,
reinterpret_cast<const half2*>(x),
dragon_cast<half2, float>(slope),
reinterpret_cast<half2*>(y));
else
_ReluHalf<half> << < GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
reinterpret_cast<const half*>(x),
dragon_cast<half, float>(slope),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
/******************** arithmetic.affine ********************/
#ifdef WITH_CUDA_FP16
template <typename T>
__global__ void _AffineWithOBiasHalf(const int count,
const int scale_dim,
const int inner_dim,
const half* x,
const half* alpha,
half* y) {
CUDA_KERNEL_LOOP(idx, count) {
#if __CUDA_ARCH__ >= 530
const int scale_idx = (idx / inner_dim) % scale_dim;
y[idx] = __hmul(alpha[scale_idx], x[idx]);
#endif
}
}
template <typename T>
__global__ void _AffineWithBiasHalf(const int count,
const int scale_dim,
const int inner_dim,
const half* x,
const half* alpha,
const half* beta,
half* y) {
CUDA_KERNEL_LOOP(idx, count) {
#if __CUDA_ARCH__ >= 530
const int scale_idx = (idx / inner_dim) % scale_dim;
y[idx] = __hadd(__hmul(alpha[scale_idx], x[idx]), beta[scale_idx]);
#endif
}
}
#endif
template<> void Affine<float16, CUDAContext>(const int count,
const int outer_dim,
const int scale_dim,
const int inner_dim,
const float16* x,
const float16* alpha,
const float16* beta,
const float16* beta_multiplier,
float16* y) {
#ifdef WITH_CUDA_FP16
if (beta != nullptr) {
_AffineWithBiasHalf<float> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
scale_dim, inner_dim,
reinterpret_cast<const half*>(x),
reinterpret_cast<const half*>(alpha),
reinterpret_cast<const half*>(beta),
reinterpret_cast<half*>(y));
} else {
_AffineWithOBiasHalf<float> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
scale_dim, inner_dim,
reinterpret_cast<const half*>(x),
reinterpret_cast<const half*>(alpha),
reinterpret_cast<half*>(y));
}
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
/******************** misc.astype ********************/
#ifdef WITH_CUDA_FP16
__global__ void _TypeHalf2Float(const int count, const half* a, float* b) {
CUDA_KERNEL_LOOP(idx, count) {
b[idx] = __half2float(a[idx]);
}
}
__global__ void _TypeFloat2Half(const int count, const float* a, half* b) {
CUDA_KERNEL_LOOP(idx, count) {
b[idx] = __float2half(a[idx]);
}
}
__global__ void _TypeHalf2Half(const int count, const half* a, half* b) {
CUDA_KERNEL_LOOP(idx, count) {
b[idx] = a[idx];
}
}
#endif
#define DEFINE_TYPE_DISABLE_FP16(type) \
template <> void TypeA2B<float16, type, CUDAContext>(const int count, \
const float16* a, \
type* b) { \
LOG(FATAL) << "CUDAContext has not implemented: float16 -> " \
<< TypeMetaToString(TypeMeta::Make<type>()); \
} \
template <> void TypeA2B<type, float16, CUDAContext>(const int count, \
const type* a, \
float16* b) { \
LOG(FATAL) << "CUDAContext has not implemented: " \
<< TypeMetaToString(TypeMeta::Make<type>()) << " -> float16"; \
}
#define DEFINE_TYPE_ENABLE_FP16_FP32 \
template <> void TypeA2B<float16, float, CUDAContext>(const int count, \
const float16* a, \
float* b) { \
_TypeHalf2Float << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count, \
reinterpret_cast<const half*>(a), b); \
CUDA_POST_KERNEL_CHECK; \
} \
template <> void TypeA2B<float, float16, CUDAContext>(const int count, \
const float* a, \
float16* b) { \
_TypeFloat2Half << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count, \
a, reinterpret_cast<half*>(b)); \
CUDA_POST_KERNEL_CHECK; \
}
#ifdef WITH_CUDA_FP16
template <> void TypeA2B<float16, float16, CUDAContext>(const int count,
const float16* a,
float16* b) {
_TypeHalf2Half << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
reinterpret_cast<const half*>(a),
reinterpret_cast<half*>(b));
CUDA_POST_KERNEL_CHECK;
}
DEFINE_TYPE_ENABLE_FP16_FP32;
DEFINE_TYPE_DISABLE_FP16(double);
DEFINE_TYPE_DISABLE_FP16(int);
DEFINE_TYPE_DISABLE_FP16(int64_t);
DEFINE_TYPE_DISABLE_FP16(uint8_t);
#else
template <> void TypeA2B<float16, float16, CUDAContext>(const int count,
const float16* a,
float16* b) {
LOG(FATAL) << "CUDAContext has not implemented: float16 -> float16";
}
DEFINE_TYPE_DISABLE_FP16(float);
DEFINE_TYPE_DISABLE_FP16(double);
DEFINE_TYPE_DISABLE_FP16(int);
DEFINE_TYPE_DISABLE_FP16(int64_t);
DEFINE_TYPE_DISABLE_FP16(uint8_t);
#endif
/******************** misc.image_data ********************/
#ifdef WITH_CUDA_FP16
template <typename Tx, typename Ty>
__global__ void _ImageDataHalf_NCHW(const int count,
const int N, const int C,
const int H, const int W,
const float* mean_values,
const float* std_values,
const Tx* x,
Ty* y) {
CUDA_KERNEL_LOOP(idx, count) {
const int w = idx % W;
const int h = (idx / W) % H;
const int c = (idx / W / H) % C;
const int n = idx / W / H / C;
float raw_value = x[((n * H + h) * W + w) * C + c];
if (mean_values != nullptr) raw_value -= mean_values[c];
if (std_values != nullptr) raw_value /= std_values[c];
y[idx] = __float2half(raw_value);
}
}
template <typename Tx, typename Ty>
__global__ void _ImageDataHalf_NHWC(const int count,
const int N, const int C,
const int H, const int W,
const float* mean_values,
const float* std_values,
const Tx* x,
Ty* y) {
CUDA_KERNEL_LOOP(idx, count) {
const int c = idx % C;
float raw_value = x[idx];
if (mean_values != nullptr) raw_value -= mean_values[c];
if (std_values != nullptr) raw_value /= std_values[c];
y[idx] = __float2half(raw_value);
}
}
#endif
template <> void ImageData<float, float16, CUDAContext>(const int count,
const int N, const int C,
const int H, const int W,
const float* mean_values,
const float* std_values,
const string& data_format,
const float* x,
float16* y) {
#ifdef WITH_CUDA_FP16
if (data_format == "NCHW") {
_ImageDataHalf_NCHW<float, half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
N, C, H, W,
mean_values,
std_values,
x,
reinterpret_cast<half*>(y));
} else if (data_format == "NHWC") {
_ImageDataHalf_NHWC<float, half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
N, C, H, W,
mean_values,
std_values,
x,
reinterpret_cast<half*>(y));
} else LOG(FATAL) << "Unknown data format: " << data_format;
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <> void ImageData<uint8_t, float16, CUDAContext>(const int count,
const int N, const int C,
const int H, const int W,
const float* mean_values,
const float* std_values,
const string& data_format,
const uint8_t* x,
float16* y) {
#ifdef WITH_CUDA_FP16
if (data_format == "NCHW") {
_ImageDataHalf_NCHW<uint8_t, half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
N, C, H, W,
mean_values,
std_values,
x,
reinterpret_cast<half*>(y));
} else if (data_format == "NHWC") {
_ImageDataHalf_NHWC<uint8_t, half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
N, C, H, W,
mean_values,
std_values,
x,
reinterpret_cast<half*>(y));
} else LOG(FATAL) << "Unknown data format: " << data_format;
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
/******************** ndarray.concat ********************/
template <typename T>
__global__ void _ConcatHalf(const int count,
const int outer_dim,
const int inner_dim,
const int x_concat_dim,
const int y_concat_dim,
const int concat_offset,
const T* x,
T* y) {
CUDA_KERNEL_LOOP(idx, count) {
const int tmp = x_concat_dim * inner_dim;
const int outer_idx = idx / tmp;
const int concat_idx = idx % tmp;
const int y_idx = (outer_idx * y_concat_dim + concat_offset)
* inner_dim + concat_idx;
y[y_idx] = x[idx];
}
}
template <> void Concat<float16, CUDAContext>(const int count,
const int outer_dim,
const int inner_dim,
const int x_concat_dim,
const int y_concat_dim,
const int concat_offset,
const float16* x,
float16* y,
CUDAContext* context) {
#ifdef WITH_CUDA_FP16
_ConcatHalf<half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
outer_dim,
inner_dim,
x_concat_dim,
y_concat_dim,
concat_offset,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <typename T>
__global__ void _ConcatGradHalf(const int count,
const int outer_dim,
const int inner_dim,
const int x_concat_dim,
const int y_concat_dim,
const int concat_offset,
const T* dy,
T* dx) {
CUDA_KERNEL_LOOP(idx, count) {
const int tmp = x_concat_dim * inner_dim;
const int outer_idx = idx / tmp;
const int concat_idx = idx % tmp;
const int y_idx = (outer_idx * y_concat_dim + concat_offset)
* inner_dim + concat_idx;
dx[idx] = dy[y_idx];
}
}
template <> void ConcatGrad<float16, CUDAContext>(const int count,
const int outer_dim,
const int inner_dim,
const int x_concat_dim,
const int y_concat_dim,
const int concat_offset,
const float16* dy,
float16* dx,
CUDAContext* context) {
#ifdef WITH_CUDA_FP16
_ConcatGradHalf<half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
outer_dim,
inner_dim,
x_concat_dim,
y_concat_dim,
concat_offset,
reinterpret_cast<const half*>(dy),
reinterpret_cast<half*>(dx));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
/******************** ndarray.transpose ********************/
template <typename T>
__global__ void _TransposeHalf(const int count,
const int ndim,
const int* order,
const int* old_steps,
const int* new_steps,
const T* x,
T* y) {
CUDA_KERNEL_LOOP(idx, count) {
int x_idx = 0, y_idx = idx;
for (int j = 0; j < ndim; ++j) {
int k = order[j];
x_idx += (y_idx / new_steps[j]) * old_steps[k];
y_idx %= new_steps[j];
}
y[idx] = x[x_idx];
}
}
template <> void Transpose<float16, CUDAContext>(const int count,
const int ndim,
const int* order,
const int* old_steps,
const int* new_steps,
const float16* x,
float16* y) {
#ifdef WITH_CUDA_FP16
_TransposeHalf<half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
ndim,
order,
old_steps,
new_steps,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
template <typename T>
__global__ void _TransposeGradHalf(const int count,
const int ndim,
const int* order,
const int* old_steps,
const int* new_steps,
const T* dy,
T* dx) {
CUDA_KERNEL_LOOP(idx, count) {
int x_idx = 0, y_idx = idx;
for (int j = 0; j < ndim; ++j) {
int k = order[j];
x_idx += (y_idx / new_steps[j]) * old_steps[k];
y_idx %= new_steps[j];
}
dx[x_idx] = dy[idx];
}
}
template <> void TransposeGrad<float16, CUDAContext>(const int count,
const int ndim,
const int* order,
const int* old_steps,
const int* new_steps,
const float16* dy,
float16* dx) {
#ifdef WITH_CUDA_FP16
_TransposeGradHalf<half> << <GET_BLOCKS(count), CUDA_NUM_THREADS >> >(count,
ndim,
order,
old_steps,
new_steps,
reinterpret_cast<const half*>(dy),
reinterpret_cast<half*>(dx));
CUDA_POST_KERNEL_CHECK;
#else
CUDA_FP16_NOT_COMPILED;
#endif
}
} // namespace kernel
} // namespace dragon
#endif // WITH_CUDA
\ No newline at end of file
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!