SeetaResearch / Dragon
Commit d1f714ea, authored May 15, 2019 by Ting PAN
Apply the dispatcher to RunImpl
1 parent bd84b7fd
Showing 159 changed files with 1208 additions and 2363 deletions
Dragon/include/core/common.h
Dragon/include/core/graph.h
Dragon/include/core/graph_gradient.h
Dragon/include/core/operator.h
Dragon/include/core/operator_schema.h
Dragon/include/core/types.h
Dragon/include/core/workspace.h
Dragon/include/operators/arithmetic/fully_connected_op.h
Dragon/include/operators/array/multinomial_op.h
Dragon/include/utils/caffemodel.h
Dragon/python/dragon/operators/array.py
Dragon/python/dragon/vm/torch/ops/builtin.py
Dragon/python/dragon/vm/torch/ops/modules/array.py
Dragon/python/dragon/vm/torch/tensor.py
Dragon/src/contrib/rcnn/bbox_utils.cu
Dragon/src/contrib/rcnn/bbox_utils.h
Dragon/src/core/graph.cc
Dragon/src/core/graph_gradient.cc
Dragon/src/core/graph_optimizer.cc
Dragon/src/core/operator_schema.cc
Dragon/src/kernels/activation/dropout_op_kernel.cu
Dragon/src/kernels/activation/droppath_op_kernel.cu
Dragon/src/kernels/activation/elu_op_kernel.cu
Dragon/src/kernels/activation/prelu_op_kernel.cu
Dragon/src/kernels/activation/relu_op_kernel.cu
Dragon/src/kernels/activation/selu_op_kernel.cu
Dragon/src/kernels/activation/sigmoid_op_kernel.cu
Dragon/src/kernels/activation/softmax_op_kernel.cu
Dragon/src/kernels/activation/tanh_op_kernel.cu
Dragon/src/kernels/arithmetic/affine_op_kernel.cu
Dragon/src/kernels/arithmetic/clip_op_kernel.cu
Dragon/src/kernels/arithmetic/maximum_op_kernel.cu
Dragon/src/kernels/arithmetic/minimum_op_kernel.cu
Dragon/src/kernels/arithmetic/moments_op_kernel.cu
Dragon/src/kernels/array/arange_op_kernel.cu
Dragon/src/kernels/array/argreduce_op_kernel.cc
Dragon/src/kernels/array/argreduce_op_kernel.cu
Dragon/src/kernels/array/concat_op_kernel.cu
Dragon/src/kernels/array/crop_op_kernel.cu
Dragon/src/kernels/array/index_select_op_kernel.cu
Dragon/src/kernels/array/one_hot_op_kernel.cu
Dragon/src/kernels/array/pad_op_kernel.cu
Dragon/src/kernels/array/reduce_sum_op_kernel.cu
Dragon/src/kernels/array/repeat_op_kernel.cu
Dragon/src/kernels/array/slice_op_kernel.cu
Dragon/src/kernels/array/tile_op_kernel.cu
Dragon/src/kernels/array/transpose_op_kernel.cu
Dragon/src/kernels/control_flow/assign_op_kernel.cu
Dragon/src/kernels/control_flow/compare_op_kernel.cu
Dragon/src/kernels/control_flow/masked_assign_op_kernel.cu
Dragon/src/kernels/loss/l1_loss_op_kernel.cu
Dragon/src/kernels/loss/nll_loss_op_kernel.cu
Dragon/src/kernels/loss/sigmoid_ce_loss_op_kernel.cu
Dragon/src/kernels/loss/sigmoid_focal_loss_op_kernel.cu
Dragon/src/kernels/loss/smooth_l1_loss_op_kernel.cu
Dragon/src/kernels/loss/softmax_ce_loss_op_kernel.cu
Dragon/src/kernels/loss/softmax_focal_loss_op_kernel.cu
Dragon/src/kernels/loss/sparse_softmax_ce_loss_op_kernel.cu
Dragon/src/kernels/misc/astype_op_kernel.cu
Dragon/src/kernels/misc/gradient_op_kernel.cu
Dragon/src/kernels/misc/image_data_op_kernel.cu
Dragon/src/kernels/norm/batch_norm_op_kernel.cu
Dragon/src/kernels/norm/group_norm_op_kernel.cu
Dragon/src/kernels/recurrent/lstm_cell_op_kernel.cu
Dragon/src/kernels/update/adam_update_op_kernel.cu
Dragon/src/kernels/update/mprec_update_op_kerne.cu
Dragon/src/kernels/update/nesterov_update_op_kernel.cu
Dragon/src/kernels/update/rmsprop_update_op_kernel.cu
Dragon/src/kernels/update/sgd_update_op_kernel.cu
Dragon/src/kernels/vision/bias_add_op_kernel.cu
Dragon/src/kernels/vision/bilinear_resize_op_kernel.cu
Dragon/src/kernels/vision/conv_op_kernel.cu
Dragon/src/kernels/vision/depthwise_conv_op_kernel.cu
Dragon/src/kernels/vision/drop_block_op_kernel.cu
Dragon/src/kernels/vision/nn_resize_op_kernel.cu
Dragon/src/kernels/vision/pool_op_kernel.cu
Dragon/src/kernels/vision/roi_align_op_kernel.cu
Dragon/src/kernels/vision/roi_align_op_kernel.fp16.cu
Dragon/src/kernels/vision/roi_pool_op_kernel.cu
Dragon/src/onnx/onnx_backend.cc
Dragon/src/onnx/onnx_backend.h
Dragon/src/operators/activation/cudnn_dropout_op.cc
Dragon/src/operators/activation/cudnn_elu_op.cc
Dragon/src/operators/activation/cudnn_relu_op.cc
Dragon/src/operators/activation/cudnn_sigmoid_op.cc
Dragon/src/operators/activation/cudnn_softmax_op.cc
Dragon/src/operators/activation/cudnn_tanh_op.cc
Dragon/src/operators/activation/dropout_op.cc
Dragon/src/operators/activation/droppath_op.cc
Dragon/src/operators/activation/elu_op.cc
Dragon/src/operators/activation/prelu_op.cc
Dragon/src/operators/activation/relu_op.cc
Dragon/src/operators/activation/selu_op.cc
Dragon/src/operators/activation/sigmoid_op.cc
Dragon/src/operators/activation/softmax_op.cc
Dragon/src/operators/activation/tanh_op.cc
Dragon/src/operators/arithmetic/affine_op.cc
Dragon/src/operators/arithmetic/cudnn_affine_op.cc
Dragon/src/operators/arithmetic/eltwise_op.cc
Dragon/src/operators/arithmetic/exp_op.cc
Dragon/src/operators/arithmetic/fully_connected_op.cc
Dragon/src/operators/arithmetic/gram_matrix_op.cc
Dragon/src/operators/arithmetic/log_op.cc
Dragon/src/operators/arithmetic/matmul_op.cc
Dragon/src/operators/arithmetic/maximum_op.cc
Dragon/src/operators/arithmetic/minimum_op.cc
Dragon/src/operators/arithmetic/moments_op.cc
Dragon/src/operators/arithmetic/pow_op.cc
Dragon/src/operators/arithmetic/sqrt_op.cc
Dragon/src/operators/arithmetic/square_op.cc
Dragon/src/operators/array/arange_op.cc
Dragon/src/operators/array/argreduce_op.cc
Dragon/src/operators/array/concat_op.cc
Dragon/src/operators/array/crop_op.cc
Dragon/src/operators/array/index_select_op.cc
Dragon/src/operators/array/multinomial_op.cc
Dragon/src/operators/array/one_hot_op.cc
Dragon/src/operators/array/pad_op.cc
Dragon/src/operators/array/reduce_op.cc
Dragon/src/operators/array/repeat_op.cc
Dragon/src/operators/array/slice_op.cc
Dragon/src/operators/array/stack_op.cc
Dragon/src/operators/array/tile_op.cc
Dragon/src/operators/array/transpose_op.cc
Dragon/src/operators/control_flow/assign_op.cc
Dragon/src/operators/control_flow/copy_op.cc
Dragon/src/operators/control_flow/masked_assign_op.cc
Dragon/src/operators/loss/ctc_loss_op.cc
Dragon/src/operators/loss/l1_loss_op.cc
Dragon/src/operators/loss/l2_loss_op.cc
Dragon/src/operators/loss/sigmoid_ce_loss_op.cc
Dragon/src/operators/loss/smooth_l1_loss_op.cc
Dragon/src/operators/loss/softmax_ce_loss_op.cc
Dragon/src/operators/misc/accuracy_op.cc
Dragon/src/operators/misc/gradient_op.cc
Dragon/src/operators/misc/initialize_op.cc
Dragon/src/operators/misc/python_op.cc
Dragon/src/operators/mpi/mpi_broadcast_op.cc
Dragon/src/operators/mpi/mpi_gather_op.cc
Dragon/src/operators/norm/l2_norm_op.cc
Dragon/src/operators/recurrent/cudnn_recurrent_op.cc
Dragon/src/operators/recurrent/rnn_param_op.cc
Dragon/src/operators/vision/bias_add_op.cc
Dragon/src/operators/vision/bilinear_resize_op.cc
Dragon/src/operators/vision/conv2d_op.cc
Dragon/src/operators/vision/conv2d_transpose_op.cc
Dragon/src/operators/vision/cudnn_bias_add_op.cc
Dragon/src/operators/vision/cudnn_conv2d_op.cc
Dragon/src/operators/vision/cudnn_conv2d_transpose_op.cc
Dragon/src/operators/vision/cudnn_depthwise_conv2d_op.cc
Dragon/src/operators/vision/cudnn_lrn_op.cc
Dragon/src/operators/vision/cudnn_pool2d_op.cc
Dragon/src/operators/vision/depthwise_conv2d_op.cc
Dragon/src/operators/vision/drop_block2d_op.cc
Dragon/src/operators/vision/nn_resize_op.cc
Dragon/src/operators/vision/roi_align_op.cc
Dragon/src/operators/vision/roi_pool_op.cc
Dragon/src/utils/math_functions.cu
Dragon/src/utils/math_functions.fp16.cu

Dragon/include/core/common.h
@@ -35,6 +35,7 @@
 #include "core/types.h"
 #include "proto/dragon.pb.h"
 #include "utils/string.h"
+#include "utils/logging.h"

 namespace dragon {

Dragon/include/core/graph.h
@@ -85,6 +85,8 @@ GraphBase* NewGraph(
     const GraphDef& def,
     Workspace* ws);

+/* Macros */
+
 DECLARE_REGISTRY(
     GraphRegistry,
     GraphBase,

Dragon/include/core/graph_gradient.h
@@ -43,7 +43,7 @@ class GraphGradientMaker {
     bool CheckGrad(
         const OperatorDef& forward_op,
         const Set<string>& targets,
-        vector<pair<string, int> >& gen_grads);
+        vector<pair<string, int>>& gen_grads);

     string GetOperatorName();
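
Most of the mechanical churn in this commit is the same C++11 cleanup applied file by file: before C++11, the closing ">>" of a nested template-id lexed as the right-shift operator, so a separating space was mandatory; C++11 removed the ambiguity. A minimal illustration:

    // Both declarations are valid C++11; the spaced form below was the
    // only legal spelling before C++11, which is why it appears on the
    // removed side of these hunks.
    #include <string>
    #include <utility>
    #include <vector>

    int main() {
        std::vector<std::pair<std::string, int> > old_style;  // pre-C++11 spelling
        std::vector<std::pair<std::string, int>> new_style;   // C++11 spelling
        old_style.swap(new_style);
        return 0;
    }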

Dragon/include/core/operator.h
@@ -100,7 +100,7 @@ class OperatorBase {
     /*! \brief Return the specified argument */
     const Argument& arg(const string& name) { return *(args_[name]); }

-    typedef Map<string, vector<OperatorBase*> > SubGraph;
+    typedef Map<string, vector<OperatorBase*>> SubGraph;

     /*! \brief Return the recomputing subgraph of this operator */
     SubGraph& subgraph() { return subgraph_; }

@@ -221,7 +221,7 @@ OperatorBase* NewOperator(
     const OperatorDef& def,
     Workspace* ws);

-/*! Macros */
+/* Macros */

 #define OpArg OperatorBase::Arg
 #define OpArgs OperatorBase::Args

@@ -266,7 +266,7 @@ DECLARE_REGISTRY(
     const OperatorDef&,
     Workspace*);

-/*! NVIDIA's Accelerated Library - CUDNN */
+/* NVIDIA's Accelerated Library - CUDNN */

 DECLARE_REGISTRY(
     CUDNNOperatorRegistry,

@@ -274,7 +274,7 @@ DECLARE_REGISTRY(
     const OperatorDef&,
     Workspace*);

-/*! CAMBRICON's Accelerated Library - CNML */
+/* CAMBRICON's Accelerated Library - CNML */

 DECLARE_REGISTRY(
     CNMLOperatorRegistry,

@@ -282,13 +282,60 @@ DECLARE_REGISTRY(
     const OperatorDef&,
     Workspace*);

+/* Dispatcher for Runtime Typed-Implementation */
+
+#define XIsType(x, dtype) \
+    x.template IsType<dtype>()
+
+template <typename... Types>
+struct TensorTypes {};
+
+template <typename Sizes, typename... Args>
+struct DispatchHelper;
+
+#define DEFINE_TENSOR_TYPES_DISPATCHER(TensorTypes, Impl) \
+    template <typename T, typename... Types, typename... Args> \
+    struct DispatchHelper<TensorTypes<T, Types...>, Args...> { \
+        template <typename Op> \
+        static void Call(Op* op, const TypeMeta& meta, string& types) { \
+            if (meta.Match<T>()) return op->template Impl<T, Args...>(); \
+            types += " * " + TypeToString<T>() + ",\n"; \
+            return DispatchHelper<TensorTypes<Types...>, Args...> \
+                ::Call(op, meta, types); \
+        } \
+        template <typename Op> \
+        static void Call(Op* op, const Tensor& tensor) { \
+            string types; return Call(op, tensor.meta(), types); \
+        } \
+    }; \
+    template <typename... Args> \
+    struct DispatchHelper<TensorTypes<>, Args...> { \
+        template <typename Op> \
+        static void Call(Op* op, const TypeMeta& meta, string& types) { \
+            LOG(FATAL) << "Unsupported DType: " \
+                << TypeMetaToString(meta) << "\n" \
+                << "<" << op->type() << "Op>" \
+                << " supports the following dtypes: {\n" \
+                << types << "}"; \
+        } \
+        template <typename Op> \
+        static void Call(Op* op, const Tensor& tensor) { \
+            return Call(op, tensor.meta(), ""); \
+        } \
+    };
+
+DEFINE_TENSOR_TYPES_DISPATCHER(TensorTypes, RunImpl);
+#undef DEFINE_TENSOR_TYPES_DISPATCHER
+
 /* TensorFiller */

 #define TENSOR_FILL_WITH_TYPE(tensor, shape, type) \
     if (tensor.count() == 0) { \
         CHECK(ws()->GetFiller(tensor.name())) \
             << "\nTensor(" << tensor.name() << ") is empty. \n" \
             << "may be specify a filler for it ?"; \
         tensor.Reshape(shape); \
-        unique_ptr< Filler<type, Context> > filler( \
+        unique_ptr<Filler<type, Context>> filler( \
             CreateFiller<type, Context>(*ws()->GetFiller(tensor.name()))); \
         filler->Fill(&tensor, ctx()); \
     } else { \

@@ -308,7 +355,7 @@ DECLARE_REGISTRY(
             << "\nTensor(" << tensor.name() << ") is empty. \n" \
             << "may be specify a filler for it ?"; \
         tensor.Reshape(shape); \
-        unique_ptr< Filler<T, Context> > filler( \
+        unique_ptr<Filler<T, Context>> filler( \
             CreateFiller<T, Context>(*ws()->GetFiller(tensor.name()))); \
         filler->Fill(&tensor, ctx()); \
     } else { \

@@ -322,6 +369,8 @@ DECLARE_REGISTRY(
         tensor.Reshape(shape); \
     }

+/* Shared Multiplier */
+
 #define DECLARE_MULTIPLIER(name, size) \
     const T* name; \
     { \

@@ -335,6 +384,8 @@ DECLARE_REGISTRY(
         name = mp->template data<T, Context>(); \
     }

+/* Dynamic Arguments */
+
 #define DECLARE_ARG_WITH_DESC(type, arg) \
     type arg##_; \
     string arg##_desc_; \

@@ -393,8 +444,7 @@ DECLARE_REGISTRY(
 #define GET_ARGS_SIZE(arg) \
     (int)std::max(arg##_.size(), arg##_desc_.size())

-#define XIsType(x, dtype) \
-    x.template IsType<dtype>()
+/* Registers */

 #define INSTANTIATE_OPERATOR(name, context) \
     template class name##Op<context>;
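
The commit title, "Apply the dispatcher to RunImpl", refers to this block: DispatchHelper walks a TensorTypes<...> list, matches the tensor's TypeMeta at runtime, and forwards to the operator's RunImpl<T>(); the empty-list specialization turns an unsupported dtype into a fatal log listing the supported ones. A self-contained miniature of the same pattern (names and the type-tag mechanism here are ours, not Dragon's):

    // Miniature of the dispatch pattern added above: walk a compile-time
    // type list, match a runtime type id, call the typed implementation.
    #include <cstdio>
    #include <typeindex>
    #include <typeinfo>

    struct MyOp {
        std::type_index meta;                   // runtime dtype tag
        template <typename T> void RunImpl() {  // typed implementation
            std::printf("RunImpl<%s>\n", typeid(T).name());
        }
    };

    template <typename... Types> struct TensorTypes {};

    template <typename List> struct DispatchHelper;

    template <typename T, typename... Rest>
    struct DispatchHelper<TensorTypes<T, Rest...>> {
        static void Call(MyOp* op) {
            if (op->meta == std::type_index(typeid(T)))
                return op->template RunImpl<T>();
            DispatchHelper<TensorTypes<Rest...>>::Call(op);  // try next type
        }
    };

    template <> struct DispatchHelper<TensorTypes<>> {
        static void Call(MyOp*) { std::printf("unsupported dtype\n"); }
    };

    int main() {
        MyOp op{std::type_index(typeid(float))};
        DispatchHelper<TensorTypes<int, float, double>>::Call(&op);  // -> RunImpl<float>
        return 0;
    }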

Dragon/include/core/operator_schema.h
@@ -42,7 +42,7 @@ class OpSchema {
         return *this;
     }

-    OpSchema& Inplace(set<pair<int, int> > inplace);
+    OpSchema& Inplace(set<pair<int, int>> inplace);
     std::function<bool(int, int)> CheckInplace;
     bool AllowInplace() const { return allow_inplace_; }

Dragon/include/core/types.h
@@ -73,6 +73,11 @@ inline const std::string TypeMetaToString(
         m2s_type_map[meta.id()] : "unknown";
 }

+template <typename T>
+inline const std::string TypeToString() {
+    return TypeMetaToString(TypeMeta::Make<T>());
+}
+
 }  // namespace dragon

 #endif  // DRAGON_CORE_TYPES_H_
\ No newline at end of file

Dragon/include/core/workspace.h
@@ -13,22 +13,18 @@
 #ifndef DRAGON_CORE_WORKSPACE_H_
 #define DRAGON_CORE_WORKSPACE_H_

-#include "core/common.h"
 #include "core/graph.h"
-#include "utils/string.h"

 namespace dragon {

 class Workspace {
  public:
-    typedef Map<string, Map<string, int64_t> > DummyNameMap;
-    typedef Map<string, unique_ptr<Tensor> > TensorMap;
+    typedef Map<string, Map<string, int64_t>> DummyNameMap;
+    typedef Map<string, unique_ptr<Tensor>> TensorMap;
     typedef Map<string, string> TensorAliasMap;
     typedef Map<string, TensorFillerProto> TensorFillerMap;
-    typedef Map<string, unique_ptr<OperatorBase> > OperatorMap;
-    typedef Map<string, unique_ptr<GraphBase> > GraphMap;
+    typedef Map<string, unique_ptr<OperatorBase>> OperatorMap;
+    typedef Map<string, unique_ptr<GraphBase>> GraphMap;

     /*! \brief Constructor */
     Workspace(const string& name) : name_(name) { Initialize(); }

Dragon/include/operators/arithmetic/fully_connected_op.h
@@ -28,6 +28,7 @@ class FullyConnectedOp final : public Operator<Context> {
     USE_OPERATOR_FUNCTIONS;

     void RunOnDevice();
+    template <typename T> void RunImpl();
     template <typename T> void TransRunImpl();
     template <typename T> void NoTransRunImpl();

Dragon/include/operators/array/multinomial_op.h
@@ -22,6 +22,7 @@ class MultinomialOp final : public Operator<Context> {
 public:
     MultinomialOp(const OperatorDef& def, Workspace* ws)
         : Operator<Context>(def, ws),
+          eps_(OpArg<float>("eps", 0.f)),
           normalize_(OpArg<int64_t>("normalize", 0)),
           num_samples_(OpArg<int64_t>("num_samples", 1)) {}
     USE_OPERATOR_FUNCTIONS;

@@ -32,6 +33,7 @@ class MultinomialOp final : public Operator<Context> {
     template <typename T> void RunImpl();

 protected:
+    float eps_;
     int64_t outer_dim_, axis_;
     int64_t normalize_, num_samples_;
     unique_ptr<OperatorBase> softmax_op_;
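
Per the new docstrings ("The prob to a uniform sampling"), eps_ reads as the probability of replacing a multinomial draw with a uniform one. The operator's kernel is not part of this view, so the following is only a sketch of that contract, not the committed implementation:

    // Sketch under the stated assumption: with probability eps, sample a
    // class uniformly; otherwise sample from the given distribution.
    #include <random>
    #include <vector>

    int SampleOnce(const std::vector<double>& prob, float eps, std::mt19937& rng) {
        std::uniform_real_distribution<double> coin(0.0, 1.0);
        if (coin(rng) < eps) {
            // Uniform branch: every class equally likely.
            std::uniform_int_distribution<int> u(0, (int)prob.size() - 1);
            return u(rng);
        }
        // Multinomial branch: draw according to prob.
        std::discrete_distribution<int> d(prob.begin(), prob.end());
        return d(rng);
    }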

Dragon/include/utils/caffemodel.h
@@ -26,22 +26,24 @@ inline void LoadCaffeModel(
     LOG(INFO) << "Restore From Model @: " << file << "......";
     LOG(INFO) << "Model Format: CaffeModel";
     for (int i = 0; i < net_param.layer_size(); i++) {
-        const LayerParameter& layer = net_param.layer(i);
-        const string& layer_name = layer.name();
-        string prefix = layer_name + "/param:";
+        const auto& layer = net_param.layer(i);
+        const auto& layer_name = layer.name();
+        auto prefix = layer_name + "/param:";
         for (int j = 0; j < layer.blobs_size(); j++) {
-            string tensor_name = prefix + std::to_string(j);
-            if (!ws->HasTensor(tensor_name))
+            auto tensor_name = prefix + std::to_string(j);
+            if (!ws->HasTensor(tensor_name)) {
                 LOG(WARNING) << "Tensor(" << tensor_name << ") "
                     << "does not exist in any Graphs, skip.";
-            else {
-                BlobProto blob = layer.blobs(j);
-                vector<int64_t> dims;
-                for (auto dim : blob.shape().dim()) dims.push_back(dim);
-                Tensor* tensor = ws->GetTensor(tensor_name);
+            } else {
+                auto blob = layer.blobs(j);
+                vec64_t tensor_shape;
+                for (auto dim : blob.shape().dim()) tensor_shape.push_back(dim);
+                auto* tensor = ws->GetTensor(tensor_name);
                 std::stringstream DimString;
-                if (dims.size() > 0) {
-                    tensor->Reshape(dims);
+                if (tensor_shape.size() > 0) {
+                    tensor->Reshape(tensor_shape);
                     CHECK_EQ(tensor->count(), blob.data_size())
                         << "\nTensor(" << tensor_name << ") "
                         << "failed to load, except size: "

@@ -52,9 +54,9 @@ inline void LoadCaffeModel(
                     tensor->Reshape({ blob.data_size() });
                     DimString << "(missing)";
                 }
-                float* Xdata = tensor->mutable_data<float, CPUContext>();
-                for (int idx = 0; idx < blob.data_size(); idx++)
-                    Xdata[idx] = blob.data(idx);
+                auto* x = tensor->mutable_data<float, CPUContext>();
+                for (int xi = 0; xi < blob.data_size(); ++xi)
+                    x[xi] = blob.data(xi);
                 LOG(INFO) << "Tensor(" << tensor_name << ") "
                     << "loaded, shape: " << DimString.str()
                     << ", size: " << blob.data_size();

@@ -66,32 +68,33 @@ inline void LoadCaffeModel(
 inline void SavaCaffeModel(
     string file,
     const vector<Tensor*>& tensors) {
-    NetParameter net_param;
+    int j = -1;
+    NetParameter net;
     Map<string, int> layer_hash;
-    int layer_idx = -1;
     for (int i = 0; i < tensors.size(); i++) {
         if (tensors[i]->count() <= 0) continue;
-        vector<string> splits = str::split(
+        auto splits = str::split(
             tensors[i]->name(), "/param:");
         if (layer_hash.count(splits[0]) == 0) {
-            layer_hash[splits[0]] = ++layer_idx;
-            LayerParameter* layer = net_param.add_layer();
+            layer_hash[splits[0]] = ++j;
+            auto* layer = net.add_layer();
             layer->set_name(splits[0]);
         }
-        BlobProto* blob = net_param.mutable_layer(layer_idx)->add_blobs();
-        for (auto dim : tensors[i]->dims()) blob->mutable_shape()->add_dim(dim);
+        auto* blob = net.mutable_layer(j)->add_blobs();
+        for (auto dim : tensors[i]->dims()) blob->mutable_shape()->add_dim(dim);
         if (XIsType((*tensors[i]), float)) {
-            auto* Xdata = tensors[i]->data<float, CPUContext>();
-            for (int id = 0; id < tensors[i]->count(); id++)
-                blob->mutable_data()->Add(Xdata[id]);
+            auto* x = tensors[i]->data<float, CPUContext>();
+            for (int xi = 0; xi < tensors[i]->count(); ++xi)
+                blob->mutable_data()->Add(x[xi]);
         } else if (XIsType((*tensors[i]), float16)) {
-            auto* Xdata = tensors[i]->data<float16, CPUContext>();
-            for (int id = 0; id < tensors[i]->count(); id++)
-                blob->mutable_data()->Add(cast::to<float>(Xdata[id]));
+            auto* x = tensors[i]->data<float16, CPUContext>();
+            for (int xi = 0; xi < tensors[i]->count(); ++xi)
+                blob->mutable_data()->Add(cast::to<float>(x[xi]));
         }
     }
-    WriteProtoToBinaryFile(net_param, file.c_str());
+    WriteProtoToBinaryFile(net, file.c_str());
     LOG(INFO) << "Save the model @: " << file << "......";
     LOG(INFO) << "Model format: Caffe";
 }
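
The renames above (Xdata to x, net_param to net, dims to tensor_shape) do not change the persistence scheme: each blob is keyed by its layer name plus a "/param:" suffix and blob index, which is what the loader builds and the saver splits on. A minimal sketch of that key construction (the helper name is ours, not Dragon's):

    // The key scheme shared by LoadCaffeModel and SavaCaffeModel:
    // "<layer_name>/param:<blob_index>".
    #include <string>

    std::string BlobKey(const std::string& layer_name, int blob_index) {
        return layer_name + "/param:" + std::to_string(blob_index);
    }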

Dragon/python/dragon/operators/array.py
@@ -748,7 +748,7 @@ def Arange(start, stop=None, step=1, dtype='float32', **kwargs):

 @OpSchema.Inputs(1)
-def Multinomial(inputs, num_samples=1, normalize=False, **kwargs):
+def Multinomial(inputs, num_samples=1, eps=0., normalize=False, **kwargs):
     """Return a tensor where each row contains ``num_samples``,
     sampled from the multinomial distribution.

@@ -765,6 +765,8 @@ def Multinomial(inputs, num_samples=1, normalize=False, **kwargs):
         The input tensor.
     num_samples : int, optional, default=1
         The number of samples.
+    eps : float, optional, default=0.
+        The prob to a uniform sampling.
     normalize : boolean, optional, default=False
         Whether to normalize the inputs.

Dragon/python/dragon/vm/torch/ops/builtin.py
@@ -987,7 +987,7 @@ def one_hot(input, depth):
     return module.forward(input)

-def multinomial(input, num_samples, out=None):
+def multinomial(input, num_samples, eps=0., out=None):
     """Return a tensor where each row contains ``num_samples``,
     sampled from the multinomial distribution.

@@ -997,8 +997,8 @@ def multinomial(input, num_samples, out=None):
         The input tensor.
     num_samples : int
         The number of samples.
-    normalize : boolean, optional, default=False
-        Whether to normalize the inputs.
+    eps : float, optional, default=0.
+        The prob to a uniform sampling.

     Returns
     -------

@@ -1008,9 +1008,11 @@ def multinomial(input, num_samples, out=None):
     """
     dev = MakeDevice(inputs=[input])
     key = 'Multinomial/{}' \
-          '/num_samples:{}'.format(dev, num_samples)
+          '/num_samples:{}' \
+          '/eps:{}'.format(dev, num_samples, eps)
     module = get_module(
         Multinomial, key, dev,
+        eps=eps,
         num_samples=num_samples,
     )
     return module.forward(input, out)

Dragon/python/dragon/vm/torch/ops/modules/array.py
@@ -377,6 +377,7 @@ class Cast(BaseModule):
 class Multinomial(BaseModule):
     def __init__(self, key, dev, **kwargs):
         super(Multinomial, self).__init__(key, dev, **kwargs)
+        self.eps = kwargs.get('eps', 0)
         self.num_samples = kwargs.get('num_samples', 1)
         self.register_op()

@@ -384,6 +385,7 @@ class Multinomial(BaseModule):
         self.op_meta = {
             'op_type': 'Multinomial',
             'arguments': {
+                'eps': float(self.eps),
                 'num_samples': self.num_samples,
                 'normalize': False,
             },

Dragon/python/dragon/vm/torch/tensor.py
@@ -980,7 +980,7 @@ class Tensor(object):
         """
         raise NotImplementedError('Refer torch.ops.tensor.normal_')

-    def multinomial(self, num_samples, normalize=False):
+    def multinomial(self, num_samples, eps=0.):
         """Return a tensor where each row contains ``num_samples``,
         sampled from the multinomial distribution.

@@ -988,8 +988,8 @@ class Tensor(object):
         ----------
         num_samples : int
             The number of samples.
-        normalize : boolean, optional, default=False
-            Whether to normalize the inputs.
+        eps : float, optional, default=0.
+            The prob to a uniform sampling.

         Returns
         -------

Dragon/src/contrib/rcnn/bbox_utils.cu
@@ -81,8 +81,8 @@ void _ApplyNMS(
     CUDA_CHECK(cudaMemcpy(boxes_dev, boxes,
         boxes_nbytes, cudaMemcpyHostToDevice));
     nms_mask<T>
-        << < blocks, NMS_BLOCK_SIZE,
-            0, ctx->cuda_stream() >> > (num_boxes,
+        <<< blocks, NMS_BLOCK_SIZE,
+            0, ctx->cuda_stream() >>> (num_boxes,
                 thresh, (T*)boxes_dev, (uint64_t*)mask_dev);
     ctx->FinishDeviceCompution();

Dragon/src/contrib/rcnn/bbox_utils.h
@@ -347,7 +347,7 @@ inline void CollectRoIs(
     const int canonical_level,
     const int canonical_scale,
     const T* rois,
-    vector<vector<int64_t> >& roi_bins) {
+    vector<vec64_t>& roi_bins) {
     const T* roi = rois;
     for (int i = 0; i < num_rois; ++i) {
         int bin_idx = roi_level(min_level, max_level,

@@ -360,7 +360,7 @@ inline void CollectRoIs(
 template <typename T>
 inline void DistributeRoIs(
-    const vector<vector<int64_t> >& roi_bins,
+    const vector<vec64_t>& roi_bins,
     const T* rois,
     vector<T*> outputs) {
     for (int i = 0; i < roi_bins.size(); i++) {
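
vec64_t here stands in for the nested vector<vector<int64_t>> element type; the alias itself is defined elsewhere in Dragon's headers, so the following reconstruction is from usage, not from this diff:

    // Reconstructed from usage in this commit; vec64_t abbreviates the
    // int64_t dimension/index vector used across the codebase (vec32_t
    // appears analogously in graph_optimizer.cc below).
    #include <cstdint>
    #include <vector>

    using vec64_t = std::vector<int64_t>;

    // So the two CollectRoIs signatures are equivalent:
    //   vector<vector<int64_t> >& roi_bins   (old)
    //   vector<vec64_t>& roi_bins            (new)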

Dragon/src/core/graph.cc
@@ -123,7 +123,7 @@ Graph::Graph(const GraphDef& def, Workspace* ws)
     // Recomputing-aware
     if (subgraph_indices.size() > 0) {
-        Map<string, vector<OperatorBase*> > subgraph;
+        Map<string, vector<OperatorBase*>> subgraph;
         for (const auto& it : subgraph_indices) {
             subgraph[it.first] = vector<OperatorBase*>();
             for (const auto& idx : subgraph_indices[it.first])

Dragon/src/core/graph_gradient.cc
@@ -7,7 +7,7 @@ namespace dragon {
 bool GraphGradientMaker::CheckGrad(
     const OperatorDef& forward_op,
     const Set<string>& targets,
-    vector<pair<string, int> >& gen_grads) {
+    vector<pair<string, int>>& gen_grads) {
     if (NoGradientRegistry()->Has(forward_op.type())) {
         for (auto& input : forward_op.input())
             blacklist_set_.insert(input);

@@ -81,7 +81,7 @@ void GraphGradientMaker::Make(
     for (int i = (int)forward_def.size() - 1; i >= 0; --i) {
         // Collect inputs & outputs, generate RAW grad ops
         const OperatorDef& op = *forward_def[i];
-        vector<pair<string, int> > gen_grads;
+        vector<pair<string, int>> gen_grads;
         bool is_skip = CheckGrad(op, targets_set, gen_grads);
         vector<string> g_outputs;
         for (auto& output : op.output()) {

@@ -214,7 +214,7 @@ GraphDef
 GraphDef GraphGradientMaker::Share(const GraphDef& input_def) {
     Set<int> invalid_ops;
     Map<string, int> ref_count;
-    Map<string, pair<int, string> > ssa_map;
+    Map<string, pair<int, string>> ssa_map;
     // Count the refs for detecting leaf nodes
     for (int i = 0; i < input_def.op_size(); ++i) {
         const OperatorDef& op = input_def.op(i);

Dragon/src/core/graph_optimizer.cc
@@ -174,7 +174,7 @@ GraphDef GraphOptimizer::MirrorStage(
     const GraphDef& input_def,
     Map<string, vec32_t>& op_indices) {
     GraphDef output_def(input_def);
-    Map<string, set<int> > fake_op_indices;
+    Map<string, set<int>> fake_op_indices;
     Map<string, string> rename_map;
     Map<string, int> versions;

Dragon/src/core/operator_schema.cc
@@ -54,7 +54,7 @@ OpSchema& OpSchema::NumOutputs(int n) {
     return NumOutputs(n, n);
 }

-OpSchema& OpSchema::Inplace(set<pair<int, int> > inplace) {
+OpSchema& OpSchema::Inplace(set<pair<int, int>> inplace) {
     CheckInplace = [inplace](int in, int out) -> bool {
         return (inplace.count(std::make_pair(in, out)) > 0);
     };

Dragon/src/kernels/activation/dropout_op_kernel.cu
@@ -37,14 +37,10 @@ template<> void Dropout<float, CUDAContext>(
     float* y,
     CUDAContext* ctx) {
     auto thresh = (uint32_t)(UINT_MAX * prob);
-    math::RandomUniform(
-        count,
-        0.f, (float)UINT_MAX,
-        mask32, ctx
-    );
+    math::RandomUniform(count, 0.f, 1.f, mask32, ctx);
     _Dropout
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count,
         thresh,
         scale,

@@ -85,14 +81,10 @@ template<> void Dropout<float16, CUDAContext>(
     float16* y,
     CUDAContext* ctx) {
     auto thresh = (uint32_t)(UINT_MAX * prob);
-    math::RandomUniform(
-        count,
-        0.f, (float)UINT_MAX,
-        mask32, ctx
-    );
+    math::RandomUniform(count, 0.f, 1.f, mask32, ctx);
     _Dropout
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count,
         thresh,
         cast::to<half>(scale),

@@ -124,8 +116,8 @@ template <> void ApplyMask<float, uint8_t, CUDAContext>(
     float* y,
     CUDAContext* ctx) {
     _ApplyMask
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count, scale, x, mask, y
     );
 }

@@ -157,8 +149,8 @@ template <> void ApplyMask<float16, uint8_t, CUDAContext>(
     float16* y,
     CUDAContext* ctx) {
     _ApplyMaskHalf
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count,
         cast::to<half>(scale),
         reinterpret_cast<const half*>(x),
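
Every kernel file below gets the same mechanical fix: the spaced launch form << < ... >> > (a workaround from the pre-C++11 ">>" lexing era) becomes the standard <<< ... >>>. A self-contained illustration of the corrected syntax (kernel and sizes are ours, not from this commit):

    // Minimal CUDA program using the standard triple-chevron launch
    // that these hunks restore; _Scale is illustrative.
    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void _Scale(const int n, const float alpha, float* y) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) y[i] *= alpha;
    }

    int main() {
        const int n = 1024;
        float* y = nullptr;
        cudaMalloc(&y, n * sizeof(float));
        cudaMemset(y, 0, n * sizeof(float));
        _Scale<<<(n + 255) / 256, 256, 0, 0>>>(n, 2.f, y);  // grid, block, smem, stream
        cudaDeviceSynchronize();
        cudaFree(y);
        std::printf("ok\n");
        return 0;
    }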

Dragon/src/kernels/activation/droppath_op_kernel.cu
@@ -44,8 +44,8 @@ template<> void DropPath<float, CUDAContext>(
     auto nthreads = rows * cols;
     auto thresh = 1.f - (1.f / scale);
     _DropPath
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         nthreads, cols, thresh, scale, x, mask, y
     );
 }

@@ -85,8 +85,8 @@ template<> void DropPath<float16, CUDAContext>(
     auto nthreads = rows * cols;
     auto thresh = 1.f - (1.f / scale);
     _DropPath
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         nthreads, cols,
         thresh,
         cast::to<half>(scale),

Dragon/src/kernels/activation/elu_op_kernel.cu
@@ -28,8 +28,8 @@ template<> void Elu<float, CUDAContext>(
     float* y,
     CUDAContext* ctx) {
     _Elu
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count, x, alpha, y
     );
 }

@@ -58,8 +58,8 @@ template<> void EluGrad<float, CUDAContext>(
     float* dx,
     CUDAContext* ctx) {
     _EluGrad
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count, alpha, dy, y, dx
     );
 }

Dragon/src/kernels/activation/prelu_op_kernel.cu
@@ -66,21 +66,21 @@ template<> void PRelu<float, CUDAContext>(
     CUDAContext* ctx) {
     if (channel_shared) {
         _PRelu
-            << < CUDA_BLOCKS(count), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(count), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             count, channels, dim, x, w, y
         );
     } else {
         if (data_format == "NCHW") {
             _PReluNCHW
-                << < CUDA_BLOCKS(count), CUDA_THREADS,
-                    0, ctx->cuda_stream() >> >(
+                <<< CUDA_BLOCKS(count), CUDA_THREADS,
+                    0, ctx->cuda_stream() >>>(
                 count, channels, dim, x, w, y
             );
         } else if (data_format == "NHWC") {
             _PReluNHWC
-                << < CUDA_BLOCKS(count), CUDA_THREADS,
-                    0, ctx->cuda_stream() >> >(
+                <<< CUDA_BLOCKS(count), CUDA_THREADS,
+                    0, ctx->cuda_stream() >>>(
                 count, channels, dim, x, w, y
             );
         } else {

@@ -152,21 +152,21 @@ template<> void PReluGrad<float, CUDAContext>(
     CUDAContext* ctx) {
     if (channel_shared) {
         _PReluGrad
-            << < CUDA_BLOCKS(count), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(count), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             count, channels, dim, dy, x, w, dx
         );
     } else {
         if (data_format == "NCHW") {
             _PReluGradNCHW
-                << < CUDA_BLOCKS(count), CUDA_THREADS,
-                    0, ctx->cuda_stream() >> >(
+                <<< CUDA_BLOCKS(count), CUDA_THREADS,
+                    0, ctx->cuda_stream() >>>(
                 count, channels, dim, dy, x, w, dx
            );
         } else if (data_format == "NHWC") {
             _PReluGradNHWC
-                << < CUDA_BLOCKS(count), CUDA_THREADS,
-                    0, ctx->cuda_stream() >> >(
+                <<< CUDA_BLOCKS(count), CUDA_THREADS,
+                    0, ctx->cuda_stream() >>>(
                 count, channels, dim, dy, x, w, dx
             );
         } else {

@@ -210,8 +210,8 @@ template<> void PReluWGrad<float, CUDAContext>(
     CUDAContext* ctx) {
     auto cdim = channels * dim;
     _PReluWGradBcast
-        << < CUDA_BLOCKS(cdim), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(cdim), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         cdim, rows, row_offset, dy, x, bcast_dw
     );
     if (channel_shared) {

Dragon/src/kernels/activation/relu_op_kernel.cu
@@ -35,8 +35,8 @@ template<> void Relu<float, CUDAContext>(
     float* y,
     CUDAContext* ctx) {
     _Relu
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count, slope, x, y
     );
 }

@@ -83,8 +83,8 @@ template<> void Relu<float16, CUDAContext>(
     CUDAContext* ctx) {
     if ((count & 1) == 0) {
         _Relu
-            << < CUDA_BLOCKS(count >> 1), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(count >> 1), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             count >> 1,
             cast::to<half2>(slope),
             reinterpret_cast<const half2*>(x),

@@ -92,8 +92,8 @@ template<> void Relu<float16, CUDAContext>(
         );
     } else {
         _Relu
-            << < CUDA_BLOCKS(count), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(count), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             count,
             cast::to<half>(slope),
             reinterpret_cast<const half*>(x),

@@ -134,8 +134,8 @@ template<> void ReluGrad<float, CUDAContext>(
     float* dx,
     CUDAContext* ctx) {
     _ReluGrad
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count, slope, dy, y, dx
     );
 }

@@ -170,8 +170,8 @@ template<> void ReluGrad<float16, CUDAContext>(
     float16* dx,
     CUDAContext* ctx) {
     _ReluGrad
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count, slope,
         reinterpret_cast<const half*>(dy),
         reinterpret_cast<const half*>(y),

Dragon/src/kernels/activation/selu_op_kernel.cu
@@ -34,8 +34,8 @@ template<> void SElu<float, CUDAContext>(
     float* y,
     CUDAContext* ctx) {
     _SElu
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count, x, y
     );
 }

@@ -63,8 +63,8 @@ template<> void SElu<float16, CUDAContext>(
     float16* y,
     CUDAContext* ctx) {
     _SElu
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count,
         reinterpret_cast<const half*>(x),
         reinterpret_cast<half*>(y)

@@ -99,8 +99,8 @@ template<> void SEluGrad<float, CUDAContext>(
     float* dx,
     CUDAContext* ctx) {
     _SEluGrad
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count, dy, y, dx
     );
 }

@@ -131,8 +131,8 @@ template<> void SEluGrad<float16, CUDAContext>(
     float16* dx,
     CUDAContext* ctx) {
     _SEluGrad
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count,
         reinterpret_cast<const half*>(dy),
         reinterpret_cast<const half*>(y),

Dragon/src/kernels/activation/sigmoid_op_kernel.cu
@@ -25,8 +25,8 @@ template<> void Sigmoid<float, CUDAContext>(
     float* y,
     CUDAContext* ctx) {
     _Sigmoid
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count, x, y
     );
 }

@@ -51,8 +51,8 @@ template<> void SigmoidGrad<float, CUDAContext>(
     float* dx,
     CUDAContext* ctx) {
     _SigmoidGrad
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count, dy, y, dx
     );
 }

Dragon/src/kernels/activation/softmax_op_kernel.cu
@@ -96,26 +96,26 @@ template<> void Softmax<float, CUDAContext>(
     auto num_preds = outer_dim * inner_dim;
     auto nelements = num_preds * axis_dim;
     _SoftmaxReduceMax
-        << < CUDA_BLOCKS(num_preds), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(num_preds), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         num_preds, axis_dim, inner_dim, x, scale
     );
     _SoftmaxSub
-        << < CUDA_BLOCKS(nelements), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(nelements), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         nelements, axis_dim, inner_dim, scale, y
     );
     math::Exp(nelements, y, y, ctx);
     _SoftmaxReduceSum
-        << < CUDA_BLOCKS(num_preds), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(num_preds), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         num_preds, axis_dim, inner_dim, y, scale
     );
     _SoftmaxDiv
-        << < CUDA_BLOCKS(nelements), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(nelements), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         nelements, axis_dim, inner_dim, scale, y
     );
 }

@@ -159,13 +159,13 @@ template<> void SoftmaxGrad<float, CUDAContext>(
     auto num_preds = outer_dim * inner_dim;
     auto nelements = num_preds * axis_dim;
     _SoftmaxDot
-        << < CUDA_BLOCKS(num_preds), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(num_preds), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         num_preds, axis_dim, inner_dim, dy, y, scale
     );
     _SoftmaxSub
-        << < CUDA_BLOCKS(nelements), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(nelements), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         nelements, axis_dim, inner_dim, scale, dx
     );
     math::Mul(nelements, dx, y, dx, ctx);
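
The four launches above are the usual numerically stable softmax, staged as reduce-max, subtract, exp, reduce-sum, divide. A single-row CPU sketch of the same pipeline:

    // CPU analogue of the _SoftmaxReduceMax / _SoftmaxSub / Exp /
    // _SoftmaxReduceSum / _SoftmaxDiv sequence, for one row.
    #include <algorithm>
    #include <cmath>
    #include <vector>

    void SoftmaxRow(std::vector<float>& v) {
        const float m = *std::max_element(v.begin(), v.end());  // reduce max
        float sum = 0.f;
        for (auto& x : v) { x = std::exp(x - m); sum += x; }    // sub + exp + reduce sum
        for (auto& x : v) x /= sum;                             // div
    }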

Dragon/src/kernels/activation/tanh_op_kernel.cu
@@ -25,8 +25,8 @@ template<> void Tanh<float, CUDAContext>(
     float* y,
     CUDAContext* ctx) {
     _Tanh
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count, x, y
     );
 }

@@ -51,8 +51,8 @@ template<> void TanhGrad<float, CUDAContext>(
     float* dx,
     CUDAContext* ctx) {
     _TanhGrad
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count, dy, y, dx
     );
 }

Dragon/src/kernels/arithmetic/affine_op_kernel.cu
@@ -60,15 +60,15 @@ template<> void Affine<float, CUDAContext>(
     auto nthreads = outer_dim * axis_dim * inner_dim;
     if (beta != nullptr) {
         _Affine
-            << < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             nthreads, axis_dim, inner_dim,
             x, alpha, beta, y
         );
     } else {
         _AffineNoBias
-            << < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             nthreads, axis_dim, inner_dim, x, alpha, y
         );
     }

@@ -124,8 +124,8 @@ template<> void Affine<float16, CUDAContext>(
     auto nthreads = outer_dim * axis_dim * inner_dim;
     if (beta != nullptr) {
         _Affine
-            << < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             nthreads, axis_dim, inner_dim,
             reinterpret_cast<const half*>(x),
             reinterpret_cast<const half*>(alpha),

@@ -134,8 +134,8 @@ template<> void Affine<float16, CUDAContext>(
         );
     } else {
         _AffineNoBias
-            << < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             nthreads, axis_dim, inner_dim,
             reinterpret_cast<const half*>(x),
             reinterpret_cast<const half*>(alpha),

@@ -156,8 +156,8 @@ template <> void AffineGrad<float, CUDAContext>(
     CUDAContext* ctx) {
     auto nthreads = outer_dim * axis_dim * inner_dim;
     _AffineNoBias
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         nthreads, axis_dim, inner_dim, dy, alpha, dx
     );
 }

@@ -174,8 +174,8 @@ template <> void AffineGrad<float16, CUDAContext>(
     CUDAContext* ctx) {
     auto nthreads = outer_dim * axis_dim * inner_dim;
     _AffineNoBias
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         nthreads, axis_dim, inner_dim,
         reinterpret_cast<const half*>(dy),
         reinterpret_cast<const half*>(alpha),

Dragon/src/kernels/arithmetic/clip_op_kernel.cu
@@ -83,8 +83,8 @@ template<> __global__ void _ClipGrad<half>(
     T* y, \
     CUDAContext* ctx) { \
     _Clip<T> \
-        << < CUDA_BLOCKS(count), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(count), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         count, \
         cast::to<T>(low), \
         cast::to<T>(high), \

@@ -102,8 +102,8 @@ template<> __global__ void _ClipGrad<half>(
     T* dx, \
     CUDAContext* ctx) { \
     _ClipGrad<T> \
-        << < CUDA_BLOCKS(count), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(count), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         count, \
         cast::to<T>(low), \
         cast::to<T>(high), \

@@ -133,8 +133,8 @@ template <> void Clip<float16, CUDAContext>(
     float16* y,
     CUDAContext* ctx) {
     _Clip
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count,
         cast::to<half>(low),
         cast::to<half>(high),

@@ -152,8 +152,8 @@ template <> void ClipGrad<float16, CUDAContext>(
     float16* dx,
     CUDAContext* ctx) {
     _ClipGrad
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count,
         cast::to<half>(low),
         cast::to<half>(high),

Dragon/src/kernels/arithmetic/maximum_op_kernel.cu
@@ -139,8 +139,8 @@ template<> __global__ void _BroadcastMaximumGrad<half>(
     T* y, \
     CUDAContext* ctx) { \
     _##name \
-        << < CUDA_BLOCKS(count), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(count), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         count, x1, x2, y \
     ); \
 }

@@ -155,8 +155,8 @@ template<> __global__ void _BroadcastMaximumGrad<half>(
     T* dx2, \
     CUDAContext* ctx) { \
     _##name \
-        << < CUDA_BLOCKS(count), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(count), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         count, x1, x2, dy, dx1, dx2 \
     ); \
 }

@@ -196,8 +196,8 @@ template <> void Maximum<float16, CUDAContext>(
     float16* y,
     CUDAContext* ctx) {
     _Maximum \
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count,
         reinterpret_cast<const half*>(x1),
         reinterpret_cast<const half*>(x2),

@@ -212,8 +212,8 @@ template <> void BroadcastMaximum<float16, CUDAContext>(
     float16* y,
     CUDAContext* ctx) {
     _BroadcastMaximum \
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count,
         reinterpret_cast<const half*>(x1),
         cast::to<half>(x2),

@@ -230,8 +230,8 @@ template <> void MaximumGrad<float16, CUDAContext>(
     float16* dx2,
     CUDAContext* ctx) {
     _MaximumGrad \
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count,
         reinterpret_cast<const half*>(x1),
         reinterpret_cast<const half*>(x2),

@@ -250,8 +250,8 @@ template <> void BroadcastMaximumGrad<float16, CUDAContext>(
     float16* dx2,
     CUDAContext* ctx) {
     _BroadcastMaximumGrad \
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count,
         reinterpret_cast<const half*>(x1),
         cast::to<half>(x2),

Dragon/src/kernels/arithmetic/minimum_op_kernel.cu
@@ -139,8 +139,8 @@ template<> __global__ void _BroadcastMinimumGrad<half>(
     T* y, \
     CUDAContext* ctx) { \
     _##name \
-        << < CUDA_BLOCKS(count), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(count), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         count, x1, x2, y \
     ); \
 }

@@ -155,8 +155,8 @@ template<> __global__ void _BroadcastMinimumGrad<half>(
     T* dx2, \
     CUDAContext* ctx) { \
     _##name \
-        << < CUDA_BLOCKS(count), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(count), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         count, x1, x2, dy, dx1, dx2 \
     ); \
 }

@@ -196,8 +196,8 @@ template <> void Minimum<float16, CUDAContext>(
     float16* y,
     CUDAContext* ctx) {
     _Minimum \
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count,
         reinterpret_cast<const half*>(x1),
         reinterpret_cast<const half*>(x2),

@@ -212,8 +212,8 @@ template <> void BroadcastMinimum<float16, CUDAContext>(
     float16* y,
     CUDAContext* ctx) {
     _BroadcastMinimum \
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count,
         reinterpret_cast<const half*>(x1),
         cast::to<half>(x2),

@@ -230,8 +230,8 @@ template <> void MinimumGrad<float16, CUDAContext>(
     float16* dx2,
     CUDAContext* ctx) {
     _MinimumGrad \
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count,
         reinterpret_cast<const half*>(x1),
         reinterpret_cast<const half*>(x2),

@@ -250,8 +250,8 @@ template <> void BroadcastMinimumGrad<float16, CUDAContext>(
     float16* dx2,
     CUDAContext* ctx) {
     _BroadcastMinimumGrad \
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count,
         reinterpret_cast<const half*>(x1),
         cast::to<half>(x2),

Dragon/src/kernels/arithmetic/moments_op_kernel.cu
@@ -251,8 +251,8 @@ void _Moments(
         ndims, x_dims, y_dims,
         &rows, &cols)) {
         _ColwiseMoments
-            << < CUDA_2D_BLOCKS(rows), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_2D_BLOCKS(rows), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             rows, cols, x, mean, var
         ); return;
     }

@@ -262,8 +262,8 @@ void _Moments(
         ndims, x_dims, y_dims,
         &rows, &cols)) {
         _RowwiseMoments
-            << < CUDA_2D_BLOCKS(cols), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_2D_BLOCKS(cols), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             rows, cols, x, mean, var
         ); return;
     }

@@ -294,8 +294,8 @@ void _Moments(
     ctx->Memcpy<CUDAContext, CPUContext>(dbytes, YDS, dimsT.data());
     _GenericMoments
-        << < CUDA_2D_BLOCKS(outer_dim), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_2D_BLOCKS(outer_dim), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         ndims, outer_dim, inner_dim,
         XSS, YDS, x, mean, var
     );

Dragon/src/kernels/array/arange_op_kernel.cu
@@ -30,8 +30,8 @@ __global__ void _Arange(
     T* y, \
     CUDAContext* ctx) { \
     _Arange \
-        << < CUDA_BLOCKS(count), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(count), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         count, start, step, y \
     ); \
 }

@@ -64,8 +64,8 @@ template <> void Arange<float16, CUDAContext>(
     float16* y,
     CUDAContext* ctx) {
     _Arange
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count, start, step,
         reinterpret_cast<half*>(y)
     );

Dragon/src/kernels/array/argreduce_op_kernel.cc
@@ -20,12 +20,12 @@ void _ArgMax(
     for (int iix = 0; iix < inner_dim; ++iix) {
         const T* X = x + (oix * axis_dim * inner_dim + iix);
         const int y_offset = oix * top_k * inner_dim + iix;
-        vector<pair<T, int64_t> > vec(axis_dim);
+        vector<pair<T, int64_t>> vec(axis_dim);
         for (int j = 0; j < axis_dim; ++j)
             vec[j] = std::make_pair(X[j * inner_dim], j);
         std::partial_sort(
             vec.begin(), vec.begin() + top_k, vec.end(),
-            std::greater<pair<T, int64_t> >());
+            std::greater<pair<T, int64_t>>());
         for (int j = 0; j < top_k; ++j) {
             indices[y_offset + j * inner_dim] = vec[j].second;
             if (values) values[y_offset + j * inner_dim] = vec[j].first;

@@ -49,7 +49,7 @@ void _ArgMin(
     for (int iix = 0; iix < inner_dim; ++iix) {
         const T* X = x + (oix * axis_dim * inner_dim + iix);
         const int y_offset = oix * top_k * inner_dim + iix;
-        vector<pair<T, int64_t> > vec(axis_dim);
+        vector<pair<T, int64_t>> vec(axis_dim);
         for (int j = 0; j < axis_dim; ++j)
             vec[j] = std::make_pair(X[j * inner_dim], j);
         std::partial_sort(
             vec.begin(), vec.begin() + top_k, vec.end());

Dragon/src/kernels/array/argreduce_op_kernel.cu
@@ -133,8 +133,8 @@ template<> __global__ void _ArgMin<half>(
     CHECK_EQ(top_k, 1) << "\nRequired top_k == 1."; \
     auto nthreads = outer_dim * inner_dim; \
     _##name \
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         nthreads, inner_dim, axis_dim, \
         x, indices, values \
     ); \

@@ -168,8 +168,8 @@ template<> void ArgMax<float16, CUDAContext>(
     CHECK_EQ(top_k, 1) << "\nRequired top_k == 1.";
     auto nthreads = outer_dim * inner_dim;
     _ArgMax
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         nthreads, inner_dim, axis_dim,
         reinterpret_cast<const half*>(x),
         indices,

@@ -189,8 +189,8 @@ template<> void ArgMin<float16, CUDAContext>(
     CHECK_EQ(top_k, 1) << "\nRequired top_k == 1.";
     auto nthreads = outer_dim * inner_dim;
     _ArgMin
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         nthreads, inner_dim, axis_dim,
         reinterpret_cast<const half*>(x),
         indices,

Dragon/src/kernels/array/concat_op_kernel.cu
@@ -43,8 +43,8 @@ __global__ void _Concat(
     auto cols = axis_dim * inner_dim; \
     auto nthreads = outer_dim * axis_dim * inner_dim; \
     _##name \
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         nthreads, \
         inner_dim, \
         cols, \

Dragon/src/kernels/array/crop_op_kernel.cu
@@ -83,8 +83,8 @@ __global__ void _CropGrad(
     T* y, \
     CUDAContext* ctx) { \
     _##name \
-        << < CUDA_BLOCKS(count), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(count), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         count, ndims, \
         x_strides, y_dims, \
         starts, x, y \

Dragon/src/kernels/array/index_select_op_kernel.cu
@@ -115,8 +115,8 @@ template <> __global__ void _IndexSelectGrad<half>(
     CUDAContext* ctx) { \
     auto nthreads = outer_dim * num_indices * inner_dim; \
     _IndexSelect \
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         nthreads, inner_dim, \
         axis_dim, num_indices, \
         indices, x, y \

@@ -135,8 +135,8 @@ template <> __global__ void _IndexSelectGrad<half>(
     CUDAContext* ctx) { \
     auto nthreads = outer_dim * inner_dim; \
     _IndexSelectGrad \
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         nthreads, inner_dim, \
         axis_dim, num_indices, \
         indices, dy, dx \

@@ -170,8 +170,8 @@ template <> void IndexSelectGrad<float16, CUDAContext>(
     CUDAContext* ctx) {
     auto nthreads = outer_dim * inner_dim;
     _IndexSelectGrad
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         nthreads, inner_dim,
         axis_dim, num_indices,
         indices,

Dragon/src/kernels/array/one_hot_op_kernel.cu
@@ -32,8 +32,8 @@ template <> void OneHot<float, CUDAContext>(
     float* y,
     CUDAContext* ctx) {
     _OneHot
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count, depth, on_value, x, y
     );
 }

@@ -48,8 +48,8 @@ template <> void OneHot<int, CUDAContext>(
     int* y,
     CUDAContext* ctx) {
     _OneHot
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count, depth, on_value, x, y
     );
 }

@@ -64,8 +64,8 @@ template <> void OneHot<int64_t, CUDAContext>(
     int64_t* y,
     CUDAContext* ctx) {
     _OneHot
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count, depth, on_value, x, y
     );
 }

Dragon/src/kernels/array/pad_op_kernel.cu
@@ -130,8 +130,8 @@ __global__ void _EdgePad(
     T* y, \
     CUDAContext* ctx) { \
     _ConstPad \
-        << < CUDA_BLOCKS(count), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(count), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         count, ndims, \
         x_dims, x_strides, \
         y_dims, l_pads, \

@@ -152,8 +152,8 @@ __global__ void _EdgePad(
     T* y, \
     CUDAContext* ctx) { \
     _##name \
-        << < CUDA_BLOCKS(count), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(count), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         count, ndims, \
         x_dims, x_strides, \
         y_dims, l_pads, \

Dragon/src/kernels/array/reduce_sum_op_kernel.cu
@@ -202,8 +202,8 @@ void _ReduceSum(
         ndims, x_dims, y_dims,
         &rows, &cols)) {
         _ColwiseReduceSum
-            << < CUDA_2D_BLOCKS(rows), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_2D_BLOCKS(rows), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             rows, cols, scale, x, y
         ); return;
     }

@@ -213,8 +213,8 @@ void _ReduceSum(
         ndims, x_dims, y_dims,
         &rows, &cols)) {
         _RowwiseReduceSum
-            << < CUDA_2D_BLOCKS(cols), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_2D_BLOCKS(cols), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             rows, cols, scale, x, y
         ); return;
     }

@@ -245,8 +245,8 @@ void _ReduceSum(
     ctx->Memcpy<CUDAContext, CPUContext>(dbytes, YDS, dimsT.data());
     _GenericReduceSum
-        << < CUDA_2D_BLOCKS(outer_dim), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_2D_BLOCKS(outer_dim), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         ndims, outer_dim, inner_dim,
         XSS, YDS, scale, x, y
     );

@@ -372,8 +372,8 @@ template <> __global__ void _ReduceSumGrad<half>(
     T* dx, \
     CUDAContext* ctx) { \
     _ReduceSumGrad \
-        << < CUDA_BLOCKS(count), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(count), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         count, ndim, x_dims, \
         y_dims, y_strides, \
         scale, dy, dx \

@@ -398,8 +398,8 @@ template<> void ReduceSumGrad<float16, CUDAContext>(
     float16* dx,
     CUDAContext* ctx) {
     _ReduceSumGrad
-        << < CUDA_BLOCKS(count), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(count), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         count, ndim, x_dims,
         y_dims, y_strides,
         scale,

Dragon/src/kernels/array/repeat_op_kernel.cu
@@ -93,8 +93,8 @@ template<> __global__ void _RepeatGrad<half>(
     auto y_inner_dim = inner_dim * repeats; \
     auto nthreads = outer_dim * axis_dim * y_inner_dim; \
     _Repeat \
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         nthreads, axis_dim, \
         inner_dim, y_inner_dim, \
         x, y \

@@ -113,8 +113,8 @@ template<> __global__ void _RepeatGrad<half>(
     auto y_inner_dim = inner_dim * repeats; \
     auto nthreads = outer_dim * axis_dim * inner_dim; \
     _RepeatGrad \
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         nthreads, \
         axis_dim, \
         inner_dim, \

@@ -151,8 +151,8 @@ template<> void RepeatGrad<float16, CUDAContext>(
     auto y_inner_dim = inner_dim * repeats;
     auto nthreads = outer_dim * axis_dim * inner_dim;
     _RepeatGrad
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>(
         nthreads,
         axis_dim,
         inner_dim,

Dragon/src/kernels/array/slice_op_kernel.cu
@@ -64,8 +64,8 @@ __global__ void _SliceGrad(
     auto cols = slice_dim * inner_dim; \
     auto nthreads = outer_dim * cols; \
     _##name \
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         nthreads, \
         inner_dim, \
         axis_dim, \

@@ -126,8 +126,8 @@ template <> void SliceGrad<float16, CUDAContext>(
     auto cols = slice_dim * inner_dim;
     auto nthreads = outer_dim * cols;
     _SliceGrad
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         nthreads,
         inner_dim,
         axis_dim,

Dragon/src/kernels/array/tile_op_kernel.cu
@@ -98,8 +98,8 @@ template<> __global__ void _TileGrad<half>(
     T* y, \
     CUDAContext* ctx) { \
     _Tile \
-        << < CUDA_BLOCKS(count), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(count), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         count, \
         ndims, \
         x_dims, \

@@ -120,8 +120,8 @@ template<> __global__ void _TileGrad<half>(
     auto nthreads = rows * cols; \
     auto tiled_cols = multiple * cols; \
     _TileGrad \
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         nthreads, \
         cols, \
         tiled_cols, \

@@ -156,8 +156,8 @@ template<> void TileGrad<float16, CUDAContext>(
     auto nthreads = rows * cols;
     auto tiled_cols = multiple * cols;
     _TileGrad
-        << < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         nthreads,
         cols,
         tiled_cols,

Dragon/src/kernels/array/transpose_op_kernel.cu
@@ -80,8 +80,8 @@ __global__ void _TransposeGrad(
     T* y, \
     CUDAContext* ctx) { \
     _##name \
-        << < CUDA_BLOCKS(count), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(count), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         count, ndims, x_strides, y_dims, x, y \
     ); \
 }
Dragon/src/kernels/control_flow/assign_op_kernel.cu
View file @
d1f714e
...
...
@@ -55,8 +55,8 @@ __global__ void _Assign(
T* y, \
CUDAContext* ctx) { \
_Assign \
-<< < CUDA_BLOCKS(count), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_BLOCKS(count), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
count, \
ndims, \
x_dims, \
...
...
Dragon/src/kernels/control_flow/compare_op_kernel.cu
...
...
@@ -153,8 +153,8 @@ __global__ void _GreaterEqualHalf(
bool* y, \
CUDAContext* ctx) { \
IMPL \
-<< < CUDA_BLOCKS(count), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_BLOCKS(count), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
count, a, b, y \
); \
}
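The `IMPL` and `_##OP##Half` parameters above show how these launchers are stamped out by the preprocessor, one per operator. A hedged sketch of that token-pasting pattern (the macro and kernel names here are hypothetical, reusing the helpers sketched earlier):

// Hypothetical DEFINE_LAUNCHER: pastes the kernel name from the
// operator name and emits the host-side wrapper.
#define DEFINE_LAUNCHER(name)                                      \
    void name(int count, const float* a, const float* b, bool* y,  \
              cudaStream_t stream) {                               \
        _##name<<<NumBlocks(count), kThreads, 0, stream>>>(        \
            count, a, b, y);                                       \
    }

__global__ void _Equal(
    int count, const float* a, const float* b, bool* y) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < count) y[i] = (a[i] == b[i]);
}

DEFINE_LAUNCHER(Equal)  // expands to void Equal(...) calling _Equal
#undef DEFINE_LAUNCHER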
...
...
@@ -167,8 +167,8 @@ __global__ void _GreaterEqualHalf(
bool* y, \
CUDAContext* ctx) { \
_##OP##Half \
-<< < CUDA_BLOCKS(count), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_BLOCKS(count), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
count, \
reinterpret_cast<const half*>(a), \
reinterpret_cast<const half*>(b), \
...
...
Dragon/src/kernels/control_flow/masked_assign_op_kernel.cu
...
...
@@ -30,8 +30,8 @@ __global__ void _MaskedAssign(
T* y, \
CUDAContext* ctx) { \
_MaskedAssign \
-<< < CUDA_BLOCKS(count), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_BLOCKS(count), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
count, mask, x, y \
); \
}
...
...
Dragon/src/kernels/loss/l1_loss_op_kernel.cu
...
...
@@ -27,8 +27,8 @@ template<> void AbsGrad<float, CUDAContext>(
float* dx,
CUDAContext* ctx) {
_AbsGrad
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count, dy, dx
);
}
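`_AbsGrad` is launched over `CUDA_BLOCKS(count)` one-dimensional blocks; a common companion idiom in such kernels is the grid-stride loop, which stays correct even if the grid is clamped. A sketch, assuming (as the L1-loss usage suggests) that `dx` carries the forward residual on entry:

// Grid-stride sketch of an abs-gradient kernel. Assumption: dx
// holds the forward residual, whose sign scales the incoming dy.
__global__ void _AbsGradSketch(int count, const float* dy, float* dx) {
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < count;
         i += blockDim.x * gridDim.x) {
        float sign = dx[i] > 0.f ? 1.f : (dx[i] < 0.f ? -1.f : 0.f);
        dx[i] = sign * dy[i];  // d|x|/dx = sign(x)
    }
}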
...
...
Dragon/src/kernels/loss/nll_loss_op_kernel.cu
...
...
@@ -55,8 +55,8 @@ template <> void NLLLoss<float, float, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim;
_NLLLoss
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nignores,
ignore, log_prob, target, loss, flag
);
...
...
@@ -77,8 +77,8 @@ template <> void NLLLoss<float, int64_t, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim;
_NLLLoss
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nignores,
ignore, log_prob, target, loss, flag
);
...
...
@@ -129,8 +129,8 @@ template<> void NLLLossGrad<float, float, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim;
_NLLLossGrad
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nignores,
ignore, log_prob, target, dx, flag
);
...
...
@@ -151,8 +151,8 @@ template<> void NLLLossGrad<float, int64_t, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim;
_NLLLossGrad
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nignores,
ignore, log_prob, target, dx, flag
);
...
...
Dragon/src/kernels/loss/sigmoid_ce_loss_op_kernel.cu
...
...
@@ -42,8 +42,8 @@ template <> void SigmoidCrossEntropy<float, CUDAContext>(
int* flag,
CUDAContext* ctx) {
_SigmoidCrossEntropy
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count, logit, target, loss, flag
);
}
...
...
@@ -77,8 +77,8 @@ template <> void SigmoidCrossEntropyGrad<float, CUDAContext>(
int* flag,
CUDAContext* ctx) {
_SigmoidCrossEntropyGrad
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count, logit, target, dlogit, flag
);
}
...
...
Dragon/src/kernels/loss/sigmoid_focal_loss_op_kernel.cu
...
...
@@ -71,8 +71,8 @@ template <> void SigmoidFocalLoss<float, float, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = outer_dim * axis_dim * inner_dim;
_SigmoidFocalLoss
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
logits, targets, losses, flags
...
...
@@ -96,8 +96,8 @@ template <> void SigmoidFocalLoss<float, int64_t, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = outer_dim * axis_dim * inner_dim;
_SigmoidFocalLoss
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
logits, targets, losses, flags
...
...
@@ -171,8 +171,8 @@ template <> void SigmoidFocalLossGrad<float, float, CUDAContext>(
CUDAContext* ctx) {
auto count = outer_dim * axis_dim * inner_dim;
_SigmoidFocalLossGrad
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
logits, targets, dlogits, flags
...
...
@@ -196,8 +196,8 @@ template <> void SigmoidFocalLossGrad<float, int64_t, CUDAContext>(
CUDAContext* ctx) {
auto count = outer_dim * axis_dim * inner_dim;
_SigmoidFocalLossGrad
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
logits, targets, dlogits, flags
...
...
Dragon/src/kernels/loss/smooth_l1_loss_op_kernel.cu
...
...
@@ -33,8 +33,8 @@ template<> void SmoothL1<float, CUDAContext>(
float* y,
CUDAContext* ctx) {
_SmoothL1
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count, beta, x, y
);
}
...
...
@@ -63,8 +63,8 @@ template<> void SmoothL1Grad<float, CUDAContext>(
float* dx,
CUDAContext* ctx) {
_SmoothL1Grad
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count, beta, dy, dx
);
}
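For reference, the conventional smooth-L1 definition behind the `(count, beta, x, y)` signature, quadratic inside `|x| < beta` and linear outside; a sketch, not Dragon's verbatim kernel body:

// Smooth-L1 value and gradient with transition point beta.
__device__ float SmoothL1Val(float x, float beta) {
    float ax = fabsf(x);
    return ax < beta ? 0.5f * x * x / beta : ax - 0.5f * beta;
}

__device__ float SmoothL1Dx(float x, float beta) {
    // x / beta in the quadratic zone, sign(x) in the linear zone.
    return fabsf(x) < beta ? x / beta : (x > 0.f ? 1.f : -1.f);
}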
...
...
Dragon/src/kernels/loss/softmax_ce_loss_op_kernel.cu
...
...
@@ -29,8 +29,8 @@ template <> void SoftmaxCrossEntropy<float, CUDAContext>(
float* losses,
CUDAContext* ctx) {
_SoftmaxCrossEntropy
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count, prob, targets, losses
);
}
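The `(count, prob, targets, losses)` signature reads as the elementwise cross-entropy term, with the per-class sums reduced elsewhere; a sketch under that reading:

#include <cfloat>  // FLT_MIN, to keep logf finite

// One cross-entropy term per (prob, target) pair; a separate
// reduction sums them. Assumes dense soft targets.
__global__ void _SoftmaxCESketch(
    int count, const float* prob, const float* target, float* loss) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < count) loss[i] = -target[i] * logf(fmaxf(prob[i], FLT_MIN));
}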
...
...
Dragon/src/kernels/loss/softmax_focal_loss_op_kernel.cu
...
...
@@ -67,8 +67,8 @@ template <> void SoftmaxFocalLoss<float, float, CUDAContext>(
CUDAContext* ctx) {
auto num_preds = outer_dim * inner_dim;
_SoftmaxFocalLoss
-<< < CUDA_BLOCKS(num_preds), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(num_preds), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
num_preds, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
nignores, ignores,
...
...
@@ -95,8 +95,8 @@ template <> void SoftmaxFocalLoss<float, int64_t, CUDAContext>(
CUDAContext* ctx) {
auto num_preds = outer_dim * inner_dim;
_SoftmaxFocalLoss
-<< < CUDA_BLOCKS(num_preds), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(num_preds), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
num_preds, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
nignores, ignores,
...
...
@@ -179,8 +179,8 @@ template<> void SoftmaxFocalLossGrad<float, float, CUDAContext>(
CUDAContext* ctx) {
auto num_preds = outer_dim * inner_dim;
_SoftmaxFocalLossGrad
-<< < CUDA_BLOCKS(num_preds), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(num_preds), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
num_preds, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
nignores, ignores,
...
...
@@ -207,8 +207,8 @@ template<> void SoftmaxFocalLossGrad<float, int64_t, CUDAContext>(
CUDAContext* ctx) {
auto num_preds = outer_dim * inner_dim;
_SoftmaxFocalLossGrad
-<< < CUDA_BLOCKS(num_preds), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(num_preds), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
num_preds, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
nignores, ignores,
...
...
Dragon/src/kernels/loss/sparse_softmax_ce_loss_op_kernel.cu
...
...
@@ -59,8 +59,8 @@ template <> void SparseSoftmaxCrossEntropy<float, float, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim;
_SparseSoftmaxCrossEntropy
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nignores,
ignore, prob, target, loss, flag
);
...
...
@@ -81,8 +81,8 @@ template <> void SparseSoftmaxCrossEntropy<float, int64_t, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim;
_SparseSoftmaxCrossEntropy
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nignores,
ignore, prob, target, loss, flag
);
...
...
@@ -136,8 +136,8 @@ template<> void SparseSoftmaxCrossEntropyGrad<float, float, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim;
_SparseSoftmaxCrossEntropyGrad
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nignores,
ignore, prob, target, dx, flag
);
...
...
@@ -158,8 +158,8 @@ template<> void SparseSoftmaxCrossEntropyGrad<float, int64_t, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim;
_SparseSoftmaxCrossEntropyGrad
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nignores,
ignore, prob, target, dx, flag
);
...
...
Dragon/src/kernels/misc/astype_op_kernel.cu
...
...
@@ -26,8 +26,8 @@ __global__ void _TypeA2B(
Tb* b, \
CUDAContext* ctx) { \
_TypeA2B \
-<< < CUDA_BLOCKS(count), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_BLOCKS(count), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
count, a, b \
); \
}
...
...
@@ -66,8 +66,8 @@ template <> void TypeA2B<float16, float, CUDAContext>(
float* b,
CUDAContext* ctx) {
_TypeA2B
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count, reinterpret_cast<const half*>(a), b
);
}
...
...
@@ -89,8 +89,8 @@ template <> void TypeA2B<float, float16, CUDAContext>(
float16* b,
CUDAContext* ctx) {
_TypeA2B
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count, a, reinterpret_cast<half*>(b)
);
}
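The recurring `reinterpret_cast<const half*>(...)` exists because the public `float16` type is a plain 16-bit wrapper with the same layout as CUDA's `half`; the cast re-types the pointer without converting anything. A sketch (the `float16` struct here is an assumed stand-in, not Dragon's definition):

#include <cuda_fp16.h>

struct float16 { unsigned short x; };  // assumed: same layout as __half

__global__ void _HalfToFloat(int n, const half* a, float* b) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) b[i] = __half2float(a[i]);
}

void HalfToFloat(int n, const float16* a, float* b, cudaStream_t s) {
    // No data conversion here: only the pointer type changes.
    _HalfToFloat<<<(n + 255) / 256, 256, 0, s>>>(
        n, reinterpret_cast<const half*>(a), b);
}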
...
...
@@ -112,8 +112,8 @@ template <> void TypeA2B<float16, float16, CUDAContext>(
float16* b,
CUDAContext* ctx) {
_TypeA2B
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count,
reinterpret_cast<const half*>(a),
reinterpret_cast<half*>(b)
...
...
Dragon/src/kernels/misc/gradient_op_kernel.cu
...
...
@@ -62,8 +62,8 @@ template <> __global__ void _GradientTwoSum<half2>(
T* dx, \
CUDAContext* ctx) { \
_GradientTwoSum \
-<< < CUDA_BLOCKS(count), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_BLOCKS(count), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
count, dy1, dy2, dx \
); \
}
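The float16 specialization just below branches on `(count & 1) == 0` so that even-length buffers can be processed as packed `half2` pairs, two fp16 lanes per 32-bit register. A sketch of that idea with the straightforward count/2 pairing (the indexing in the file itself may differ):

#include <cuda_fp16.h>

// Packed path: one __hadd2 adds both fp16 lanes at once.
__global__ void _TwoSumHalf2(
    int n2, const half2* a, const half2* b, half2* y) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n2) y[i] = __hadd2(a[i], b[i]);
}

void TwoSumFp16(int count, const half* a, const half* b, half* y,
                cudaStream_t s) {
    if ((count & 1) == 0) {  // even: reinterpret as half2 pairs
        int n2 = count >> 1;
        _TwoSumHalf2<<<(n2 + 255) / 256, 256, 0, s>>>(
            n2, reinterpret_cast<const half2*>(a),
            reinterpret_cast<const half2*>(b),
            reinterpret_cast<half2*>(y));
    }
    // Odd counts would fall back to a scalar half kernel (omitted).
}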
...
...
@@ -83,8 +83,8 @@ template <> void GradientTwoSum<float16, CUDAContext>(
CUDAContext* ctx) {
if ((count & 1) == 0) {
_GradientTwoSum
-<< < CUDA_BLOCKS(count >> 2), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count >> 2), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count >> 2,
reinterpret_cast<const half2*>(dy1),
reinterpret_cast<const half2*>(dy2),
...
...
@@ -92,8 +92,8 @@ template <> void GradientTwoSum<float16, CUDAContext>(
);
} else {
_GradientTwoSum
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count,
reinterpret_cast<const half*>(dy1),
reinterpret_cast<const half*>(dy2),
...
...
Dragon/src/kernels/misc/image_data_op_kernel.cu
...
...
@@ -76,14 +76,14 @@ template <> void ImageData<float, float, CUDAContext>(
auto nthreads = N * C * H * W;
if (data_format == "NCHW") {
_ImageDataNCHW
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W, mean, std, x, y
);
} else if (data_format == "NHWC") {
_ImageDataNHWC
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W, mean, std, x, y
);
} else {
...
...
@@ -107,14 +107,14 @@ template <> void ImageData<uint8_t, float, CUDAContext>(
auto nthreads = N * C * H * W;
if (data_format == "NCHW") {
_ImageDataNCHW
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W, mean, std, x, y
);
} else if (data_format == "NHWC") {
_ImageDataNHWC
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W, mean, std, x, y
);
} else {
...
...
@@ -191,15 +191,15 @@ template <> void ImageData<float, float16, CUDAContext>(
auto nthreads = N * C * H * W;
if (data_format == "NCHW") {
_ImageDataHalfNCHW
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W, mean, std,
x, reinterpret_cast<half*>(y)
);
} else if (data_format == "NHWC") {
_ImageDataHalfNHWC
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W, mean, std,
x, reinterpret_cast<half*>(y)
);
...
...
@@ -222,15 +222,15 @@ template <> void ImageData<uint8_t, float16, CUDAContext>(
auto nthreads = N * C * H * W;
if (data_format == "NCHW") {
_ImageDataHalfNCHW
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W, mean, std,
x, reinterpret_cast<half*>(y)
);
} else if (data_format == "NHWC") {
_ImageDataHalfNHWC
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W, mean, std,
x, reinterpret_cast<half*>(y)
);
...
...
Dragon/src/kernels/norm/batch_norm_op_kernel.cu
...
...
@@ -190,27 +190,27 @@ __global__ void _BatchNormInferenceGrad(
auto nthreads = N * C * S; \
if (data_format == "NCHW") { \
_BatchNormInternalGrad<Tx, Tp, StorageOrder::NCHW> \
-<< < CUDA_2D_BLOCKS(C), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_2D_BLOCKS(C), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
N, C, S, x, mu, rsig, gamma, \
dy, ds, db, dgamma, dbeta \
); \
_BatchNormTrainingGrad<Tx, Tp, StorageOrder::NCHW> \
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
nthreads, N, C, S, x, mu, \
rsig, gamma, ds, db, dy, dx \
); \
} else if (data_format == "NHWC") { \
_BatchNormInternalGrad<Tx, Tp, StorageOrder::NHWC> \
-<< < CUDA_2D_BLOCKS(C), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_2D_BLOCKS(C), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
N, C, S, x, mu, rsig, gamma, \
dy, ds, db, dgamma, dbeta \
); \
_BatchNormTrainingGrad<Tx, Tp, StorageOrder::NHWC> \
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
nthreads, N, C, S, x, mu, \
rsig, gamma, ds, db, dy, dx \
); \
...
...
@@ -234,24 +234,24 @@ __global__ void _BatchNormInferenceGrad(
if (data_format == "NCHW") { \
if (dgamma != nullptr) { \
_BatchNormWGrad<Tx, Tp, StorageOrder::NCHW> \
-<< < CUDA_2D_BLOCKS(C), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> > \
+<<< CUDA_2D_BLOCKS(C), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>> \
(N, C, S, x, mu, rsig, dy, dgamma, dbeta); \
} \
_BatchNormInferenceGrad<Tx, Tp, StorageOrder::NCHW> \
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> > \
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>> \
(nthreads, C, S, rsig, gamma, dy, dx); \
} else if (data_format == "NHWC") { \
if (dgamma != nullptr) { \
_BatchNormWGrad<Tx, Tp, StorageOrder::NHWC> \
-<< < CUDA_2D_BLOCKS(C), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> > \
+<<< CUDA_2D_BLOCKS(C), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>> \
(N, C, S, x, mu, rsig, dy, dgamma, dbeta); \
} \
_BatchNormInferenceGrad<Tx, Tp, StorageOrder::NHWC> \
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> > \
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>> \
(nthreads, C, S, rsig, gamma, dy, dx); \
} \
}
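The batch-norm backward here is two launches: a per-channel pass over `CUDA_2D_BLOCKS(C)` blocks that reduces the `ds`/`db` (and `dgamma`/`dbeta`) statistics, then an elementwise pass over all N*C*S values. A sketch of the per-channel shape using CUB's block reduce; the reduction strategy is an assumption, not necessarily Dragon's:

#include <cub/block/block_reduce.cuh>

// One block per channel c (NCHW); threads stride the N*S positions
// of that channel, then a block-wide reduction finishes the sum.
template <int kBlockThreads>
__global__ void _ChannelSum(
    int N, int C, int S, const float* dy, float* db) {
    typedef cub::BlockReduce<float, kBlockThreads> BlockReduce;
    __shared__ typename BlockReduce::TempStorage storage;
    const int c = blockIdx.x;
    float sum = 0.f;
    for (int j = threadIdx.x; j < N * S; j += blockDim.x)
        sum += dy[(j / S * C + c) * S + j % S];
    sum = BlockReduce(storage).Sum(sum);
    if (threadIdx.x == 0) db[c] = sum;
}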
...
...
Dragon/src/kernels/norm/group_norm_op_kernel.cu
...
...
@@ -408,20 +408,20 @@ __global__ void _GroupNormGradHalf(
CUDAContext* ctx) { \
const int C = G * D; \
_GroupNormFusedParams<Tp> \
-<< < CUDA_2D_BLOCKS(N * G), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_2D_BLOCKS(N * G), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
N, G, D, mu, rsig, gamma, beta, scale, bias \
); \
if (data_format == "NCHW") { \
_GroupNormForwardNCHW<Tx, Tp> \
-<< < CUDA_2D_BLOCKS(N * C), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_2D_BLOCKS(N * C), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
N, C, S, x, scale, bias, y \
); \
} else if (data_format == "NHWC") { \
_GroupNormForwardNHWC<Tx, Tp> \
-<< < CUDA_2D_BLOCKS(N * C), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_2D_BLOCKS(N * C), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
N, C, S, x, scale, bias, y \
); \
} \
...
...
@@ -448,35 +448,35 @@ __global__ void _GroupNormGradHalf(
auto nthreads = N * G * D * S; \
if (data_format == "NCHW") { \
_GroupNormWGrad<Tx, Tp, StorageOrder::NCHW> \
-<< < CUDA_2D_BLOCKS(G * D), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_2D_BLOCKS(G * D), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
N, G, D, S, x, mu, rsig, dy, dgamma, dbeta \
); \
_GroupNormInternalGrad<Tx, Tp, StorageOrder::NCHW> \
-<< < CUDA_2D_BLOCKS(N * G), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_2D_BLOCKS(N * G), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
N, G, D, S, x, gamma, dy, ds, db \
); \
_GroupNormGrad<Tx, Tp, StorageOrder::NCHW> \
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
nthreads, G, D, S, x, mu, rsig, \
gamma, ds, db, dy, dx \
); \
} else if (data_format == "NHWC") { \
_GroupNormWGrad<Tx, Tp, StorageOrder::NHWC> \
-<< < CUDA_2D_BLOCKS(G * D), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_2D_BLOCKS(G * D), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
N, G, D, S, x, mu, rsig, dy, dgamma, dbeta \
); \
_GroupNormInternalGrad<Tx, Tp, StorageOrder::NHWC> \
-<< < CUDA_2D_BLOCKS(N * G), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> >( \
+<<< CUDA_2D_BLOCKS(N * G), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>>( \
N, G, D, S, x, gamma, dy, ds, db \
); \
_GroupNormGrad<Tx, Tp, StorageOrder::NHWC> \
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
-    0, ctx->cuda_stream() >> > ( \
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
+    0, ctx->cuda_stream() >>> ( \
nthreads, G, D, S, x, mu, rsig, \
gamma, ds, db, dy, dx \
); \
...
...
@@ -503,14 +503,14 @@ template <> void GroupNormForward<float16, float, CUDAContext>(
CUDAContext* ctx) {
const int C = G * D;
_GroupNormFusedParams<float>
-<< < CUDA_2D_BLOCKS(N * G), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_2D_BLOCKS(N * G), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
N, G, D, mu, rsig, gamma, beta, scale, bias
);
if (data_format == "NCHW") {
_GroupNormForwardNCHW<half, float>
-<< < CUDA_2D_BLOCKS(N * C), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_2D_BLOCKS(N * C), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
N, C, S,
reinterpret_cast<const half*>(x),
scale, bias,
...
...
@@ -518,8 +518,8 @@ template <> void GroupNormForward<float16, float, CUDAContext>(
);
} else if (data_format == "NHWC") {
_GroupNormForwardNHWC<half, float>
-<< < CUDA_2D_BLOCKS(N * C), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_2D_BLOCKS(N * C), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
N, C, S,
reinterpret_cast<const half*>(x),
scale, bias,
...
...
@@ -548,8 +548,8 @@ template <> void GroupNormBackward<float16, float, CUDAContext>(
auto nthreads = N * G * D * S;
if (data_format == "NCHW") {
_GroupNormWGradHalf<StorageOrder::NCHW>
-<< < CUDA_2D_BLOCKS(G * D), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_2D_BLOCKS(G * D), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
N, G, D, S,
reinterpret_cast<const half*>(x),
mu, rsig,
...
...
@@ -557,8 +557,8 @@ template <> void GroupNormBackward<float16, float, CUDAContext>(
dgamma, dbeta
);
_GroupNormInternalGradHalf<StorageOrder::NCHW>
-<< < CUDA_2D_BLOCKS(N * G), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_2D_BLOCKS(N * G), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
N, G, D, S,
reinterpret_cast<const half*>(x),
gamma,
...
...
@@ -566,8 +566,8 @@ template <> void GroupNormBackward<float16, float, CUDAContext>(
ds, db
);
_GroupNormGradHalf<StorageOrder::NCHW>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, G, D, S,
reinterpret_cast<const half*>(x),
mu, rsig, gamma, ds, db,
...
...
@@ -576,8 +576,8 @@ template <> void GroupNormBackward<float16, float, CUDAContext>(
);
} else if (data_format == "NHWC") { \
_GroupNormWGradHalf<StorageOrder::NHWC>
-<< < CUDA_2D_BLOCKS(G * D), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_2D_BLOCKS(G * D), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
N, G, D, S,
reinterpret_cast<const half*>(x),
mu, rsig,
...
...
@@ -585,8 +585,8 @@ template <> void GroupNormBackward<float16, float, CUDAContext>(
dgamma, dbeta
);
_GroupNormInternalGradHalf<StorageOrder::NHWC>
-<< < CUDA_2D_BLOCKS(N * G), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_2D_BLOCKS(N * G), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
N, G, D, S,
reinterpret_cast<const half*>(x),
gamma,
...
...
@@ -594,8 +594,8 @@ template <> void GroupNormBackward<float16, float, CUDAContext>(
ds, db
);
_GroupNormGradHalf<StorageOrder::NHWC>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, G, D, S,
reinterpret_cast<const half*>(x),
mu, rsig, gamma, ds, db,
...
...
Dragon/src/kernels/recurrent/lstm_cell_op_kernel.cu
...
...
@@ -58,13 +58,13 @@ template <> void LSTMCell<float, CUDAContext>(
auto o_offset = 2 * C, c_offset = 3 * C,
x_offset = 4 * C, NC = N * C;
_LSTMCellAct
-<< < CUDA_BLOCKS(NC * 4), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(NC * 4), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
NC * 4, c_offset, x_offset, actx
);
_LSTMCellGate
-<< < CUDA_BLOCKS(NC), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(NC), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
NC, C, o_offset, c_offset,
x_offset, cx, actx, c, h
);
...
...
@@ -138,14 +138,14 @@ template <> void LSTMCellGrad<float, CUDAContext>(
auto o_offset = 2 * C, c_offset = 3 * C,
x_offset = 4 * C, NC = N * C;
_LSTMCellGateGrad
-<< < CUDA_BLOCKS(NC), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(NC), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
NC, C, o_offset, c_offset, x_offset,
cx, actx, c, dc, dh, dcx, dx
);
_LSTMCellActGrad
-<< < CUDA_BLOCKS(NC * 4), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(NC * 4), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
NC * 4, c_offset, x_offset, actx, dx
);
}
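The offsets above (`o_offset = 2 * C`, `c_offset = 3 * C`, `x_offset = 4 * C`) imply a pre-activation buffer of four concatenated C-wide gates per sample. For orientation, the conventional cell math under the gate order those offsets suggest; a sketch, not Dragon's kernel:

// Conventional LSTM cell, assuming gate order [i, f, o, g] and an
// already-activated buffer act of shape N x 4C.
__global__ void _LSTMCellSketch(
    int NC, int C, const float* cx, const float* act,
    float* c, float* h) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= NC) return;
    const float* g4 = act + (idx / C) * 4 * C;
    int j = idx % C;
    float i = g4[j], f = g4[C + j], o = g4[2 * C + j], g = g4[3 * C + j];
    c[idx] = f * cx[idx] + i * g;   // new cell state
    h[idx] = o * tanhf(c[idx]);     // exposed hidden state
}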
...
...
Dragon/src/kernels/update/adam_update_op_kernel.cu
...
...
@@ -39,8 +39,8 @@ template <> void AdamUpdate<float, CUDAContext>(
float* v,
CUDAContext* ctx) {
_AdamUpdate
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count, lr, beta1, beta2, eps, g, m, v
);
}
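The `(count, lr, beta1, beta2, eps, g, m, v)` signature matches the textbook Adam step; a sketch assuming bias correction is folded into `lr` and that `g` is rewritten in place with the final update, as in-place updaters typically do:

// Textbook Adam step. Assumptions: lr already bias-corrected,
// g overwritten with the step to apply.
__global__ void _AdamUpdateSketch(
    int n, float lr, float beta1, float beta2, float eps,
    float* g, float* m, float* v) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    float gi = g[i];
    m[i] = beta1 * m[i] + (1.f - beta1) * gi;       // 1st moment
    v[i] = beta2 * v[i] + (1.f - beta2) * gi * gi;  // 2nd moment
    g[i] = lr * m[i] / (sqrtf(v[i]) + eps);
}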
...
...
Dragon/src/kernels/update/mprec_update_op_kerne.cu
...
...
@@ -29,8 +29,8 @@ template <> void MixedPrecL2Decay<float16, CUDAContext>(
float* dx,
CUDAContext* ctx) {
_MixedPrecL2DecayHalf
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count,
alpha,
reinterpret_cast<const half*>(w),
...
...
@@ -58,8 +58,8 @@ template <> void MixedPrecUpdate<float16, CUDAContext>(
float16* w,
CUDAContext* ctx) {
_MixedPrecUpdateHalf
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count,
updates,
reinterpret_cast<half*>(w)
...
...
Dragon/src/kernels/update/nesterov_update_op_kernel.cu
...
...
@@ -32,8 +32,8 @@ template <> void NesterovUpdate<float, CUDAContext>(
float* h,
CUDAContext* ctx) {
_NesterovUpdate
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count, lr, momentum, g, h
);
}
...
...
Dragon/src/kernels/update/rmsprop_update_op_kernel.cu
...
...
@@ -34,8 +34,8 @@ template <> void RMSPropUpdate<float, CUDAContext>(
float* h,
CUDAContext* ctx) {
_RMSPropUpdate
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count, lr, decay, eps, g, h
);
}
...
...
Dragon/src/kernels/update/sgd_update_op_kernel.cu
...
...
@@ -31,8 +31,8 @@ template <> void SGDUpdate<float, CUDAContext>(
float* h,
CUDAContext* ctx) {
_SGDUpdate
-<< < CUDA_BLOCKS(count), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(count), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
count, lr, momentum, g, h
);
}
...
...
Dragon/src/kernels/vision/bias_add_op_kernel.cu
...
...
@@ -52,14 +52,14 @@ template<> void BiasAdd<float, CUDAContext>(
auto nthreads = outer_dim * axis_dim * inner_dim;
if (data_format == "NCHW") {
_BiasAddNCHW
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, bias, y
);
} else if (data_format == "NHWC") {
_BiasAddNHWC
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, axis_dim, bias, y
);
} else {
...
...
Dragon/src/kernels/vision/bilinear_resize_op_kernel.cu
...
...
@@ -109,15 +109,15 @@ template <> void BilinearResize<float, CUDAContext>(
auto scale_w = (float)W / (float)out_w;
if (data_format == "NCHW") {
_BilinearResizeNCHW
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W, out_h, out_w,
scale_h, scale_w, x, y
);
} else if(data_format == "NHWC") {
_BilinearResizeNHWC
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W, out_h, out_w,
scale_h, scale_w, x, y
);
...
...
@@ -224,15 +224,15 @@ template <> void BilinearResizeGrad<float, CUDAContext>(
auto scale_w = (float)W / (float)out_w;
if (data_format == "NCHW") {
_BilinearResizeGradNCHW
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W, out_h, out_w,
scale_h, scale_w, dy, dx
);
} else if(data_format == "NHWC") {
_BilinearResizeGradNHWC
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W, out_h, out_w,
scale_h, scale_w, dy, dx
);
...
...
Dragon/src/kernels/vision/conv_op_kernel.cu
...
...
@@ -123,8 +123,8 @@ template <> void Im2Col2d<float, CUDAContext>(
auto nthreads = C * out_h * out_w;
if (data_format == "NCHW") {
_Im2Col2dNCHW
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
H, W,
out_h, out_w,
...
...
@@ -136,8 +136,8 @@ template <> void Im2Col2d<float, CUDAContext>(
);
} else if (data_format == "NHWC") {
_Im2Col2dNHWC
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
@@ -286,8 +286,8 @@ template <> void Col2Im2d<float, CUDAContext>(
const int nthreads = C * H * W;
if (data_format == "NCHW") {
_Col2Im2dNCHW
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
H, W,
out_h, out_w,
...
...
@@ -299,8 +299,8 @@ template <> void Col2Im2d<float, CUDAContext>(
);
} else if (data_format == "NHWC") {
_Col2Im2dNHWC
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
Dragon/src/kernels/vision/depthwise_conv_op_kernel.cu
...
...
@@ -144,8 +144,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>(
if (data_format == "NCHW") {
if (kernel_h == 3 && kernel_w == 3) {
_DepthwiseConv2dNCHW<float, 3, 3>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
@@ -157,8 +157,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>(
);
} else if (kernel_h == 5 && kernel_w == 5) {
_DepthwiseConv2dNCHW<float, 5, 5>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
@@ -170,8 +170,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>(
);
} else if (kernel_h == 7 && kernel_w == 7) {
_DepthwiseConv2dNCHW<float, 7, 7>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
@@ -183,8 +183,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>(
);
} else {
_DepthwiseConv2dNCHW<float, -1, -1>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
@@ -198,8 +198,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>(
} else if (data_format == "NHWC") {
if (kernel_h == 3 && kernel_w == 3) {
_DepthwiseConv2dNHWC<float, 3, 3>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
@@ -211,8 +211,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>(
);
} else if (kernel_h == 5 && kernel_w == 5) {
_DepthwiseConv2dNHWC<float, 5, 5>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
@@ -224,8 +224,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>(
);
} else if (kernel_h == 7 && kernel_w == 7) {
_DepthwiseConv2dNHWC<float, 7, 7>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
@@ -237,8 +237,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>(
);
} else {
_DepthwiseConv2dNHWC<float, -1, -1>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
@@ -394,8 +394,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>(
if (data_format == "NCHW") {
if (kernel_h == 3 && kernel_w == 3) {
_DepthwiseConv2dGradNCHW<float, 3, 3>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
@@ -407,8 +407,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>(
);
} else if (kernel_h == 5 && kernel_w == 5) {
_DepthwiseConv2dGradNCHW<float, 5, 5>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
@@ -420,8 +420,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>(
);
} else if (kernel_h == 7 && kernel_w == 7) {
_DepthwiseConv2dGradNCHW<float, 7, 7>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
@@ -433,8 +433,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>(
);
} else {
_DepthwiseConv2dGradNCHW<float, -1, -1>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
@@ -448,8 +448,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>(
} else if (data_format == "NHWC") {
if (kernel_h == 3 && kernel_w == 3) {
_DepthwiseConv2dGradNHWC<float, 3, 3>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
@@ -461,8 +461,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>(
);
} else if (kernel_h == 5 && kernel_w == 5) {
_DepthwiseConv2dGradNHWC<float, 5, 5>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
@@ -474,8 +474,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>(
);
} else if (kernel_h == 7 && kernel_w == 7) {
_DepthwiseConv2dGradNHWC<float, 7, 7>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
@@ -487,8 +487,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>(
);
} else {
_DepthwiseConv2dGradNHWC<float, -1, -1>
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
...
...
@@ -634,8 +634,8 @@ template <> void DepthwiseConv2dWGrad<float, CUDAContext>(
auto nblocks = C * kernel_h * kernel_w;
if (data_format == "NCHW") {
_DepthwiseConv2dWGradNCHW
-<< < nblocks, nthreads,
-    0, ctx->cuda_stream() >> >(
+<<< nblocks, nthreads,
+    0, ctx->cuda_stream() >>>(
N, C, H, W,
out_h, out_w,
kernel_h, kernel_w,
...
...
@@ -646,8 +646,8 @@ template <> void DepthwiseConv2dWGrad<float, CUDAContext>(
);
} else if (data_format == "NHWC") {
_DepthwiseConv2dWGradNHWC
-<< < nblocks, nthreads,
-    0, ctx->cuda_stream() >> >(
+<<< nblocks, nthreads,
+    0, ctx->cuda_stream() >>>(
N, C, H, W,
out_h, out_w,
kernel_h, kernel_w,
...
...
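The depthwise kernels above are instantiated as `_DepthwiseConv2dNCHW<float, 3, 3>`, `<float, 5, 5>`, `<float, 7, 7>`, with `<float, -1, -1>` as the runtime-sized fallback: a static filter size lets the tap loops unroll completely. A 1-D sketch of that dispatch (names hypothetical; assumes `x` has `n + k - 1` valid entries, i.e. no padding handling):

// Compile-time tap count K: the loop bound is a constant when
// K > 0 and unrolls fully; K == -1 uses the runtime value.
template <int K>
__global__ void _Conv1dSketch(
    int n, int runtime_k, const float* x, const float* w, float* y) {
    const int k = K > 0 ? K : runtime_k;
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    float acc = 0.f;
    for (int t = 0; t < k; ++t) acc += w[t] * x[i + t];
    y[i] = acc;
}

// Host dispatch mirrors the kernel_h == 3 && kernel_w == 3 tests:
//   if (k == 3) _Conv1dSketch<3><<<g, b, 0, s>>>(n, k, x, w, y);
//   else        _Conv1dSketch<-1><<<g, b, 0, s>>>(n, k, x, w, y);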
Dragon/src/kernels/vision/drop_block_op_kernel.cu
...
...
@@ -77,16 +77,12 @@ template <> void DropBlock2d<CUDAContext>(
int* mask,
CUDAContext* ctx) {
auto nthreads = N * C * seed_h * seed_w;
-math::RandomUniform(
-    nthreads,
-    0.f, float(UINT_MAX),
-    seed, ctx
-);
+math::RandomUniform(nthreads, 0.f, 1.f, seed, ctx);
auto mask_thresh = (uint32_t)(UINT_MAX * gamma);
if (data_format == "NCHW") {
_DropBlock2dNCHW
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
seed_h, seed_w,
...
...
@@ -96,8 +92,8 @@ template <> void DropBlock2d<CUDAContext>(
);
} else if(data_format == "NHWC") {
_DropBlock2dNHWC
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
seed_h, seed_w,
...
...
Dragon/src/kernels/vision/nn_resize_op_kernel.cu
...
...
@@ -81,15 +81,15 @@ template <> void NNResize<float, CUDAContext>(
auto scale_w = (float)W / (float)out_w;
if (data_format == "NCHW") {
_NNResizeNCHW
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W, out_h, out_w,
scale_h, scale_w, x, y
);
} else if(data_format == "NHWC") {
_NNResizeNHWC
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W, out_h, out_w,
scale_h, scale_w, x, y
);
...
...
@@ -116,8 +116,8 @@ template <> void NNResize<float16, CUDAContext>(
auto scale_w = (float)W / (float)out_w;
if (data_format == "NCHW") {
_NNResizeNCHW
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W,
out_h, out_w, scale_h, scale_w,
reinterpret_cast<const half*>(x),
...
...
@@ -125,8 +125,8 @@ template <> void NNResize<float16, CUDAContext>(
);
} else if(data_format == "NHWC") {
_NNResizeNHWC
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W,
out_h, out_w, scale_h, scale_w,
reinterpret_cast<const half*>(x),
...
...
@@ -209,15 +209,15 @@ template <> void NNResizeGrad<float, CUDAContext>(
auto scale_w = (float)W / (float)out_w;
if (data_format == "NCHW") {
_NNResizeGradNCHW
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W, out_h, out_w,
scale_h, scale_w, dy, dx
);
} else if(data_format == "NHWC") {
_NNResizeGradNHWC
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads, C, H, W, out_h, out_w,
scale_h, scale_w, dy, dx
);
...
...
Dragon/src/kernels/vision/pool_op_kernel.cu
...
...
@@ -120,8 +120,8 @@ template<> void MaxPool2d<float, CUDAContext>(
auto nthreads = N * C * pool_h * pool_w;
if (data_format == "NCHW") {
_MaxPool2dNCHW
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
...
...
@@ -132,8 +132,8 @@ template<> void MaxPool2d<float, CUDAContext>(
);
} else if (data_format == "NHWC") {
_MaxPool2dNHWC
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
...
...
@@ -256,8 +256,8 @@ template<> void AvgPool2d<float, CUDAContext>(
auto nthreads = N * C * pool_h * pool_w;
if (data_format == "NCHW") {
_AvgPool2dNCHW
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
...
...
@@ -268,8 +268,8 @@ template<> void AvgPool2d<float, CUDAContext>(
);
} else if (data_format == "NHWC") {
_AvgPool2dNHWC
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
...
...
@@ -392,8 +392,8 @@ template<> void MaxPool2dGrad<float, CUDAContext>(
auto nthreads = N * C * H * W;
if (data_format == "NCHW") {
_MaxPool2dGrad_NCHW
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
...
...
@@ -404,8 +404,8 @@ template<> void MaxPool2dGrad<float, CUDAContext>(
);
} else if (data_format == "NHWC") {
_MaxPool2dGradNHWC
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
...
...
@@ -531,8 +531,8 @@ template<> void AvgPool2dGrad<float, CUDAContext>(
auto nthreads = N * C * H * W;
if (data_format == "NCHW") {
_AvgPool2dGradNCHW
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
...
...
@@ -543,8 +543,8 @@ template<> void AvgPool2dGrad<float, CUDAContext>(
);
} else if (data_format == "NHWC") {
_AvgPool2dGradNHWC
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
...
...
Dragon/src/kernels/vision/roi_align_op_kernel.cu
...
...
@@ -132,8 +132,8 @@ template<> void ROIAlign<float, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = num_rois * C * pool_h * pool_w;
_ROIAlign
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
...
...
@@ -283,8 +283,8 @@ template<> void ROIAlignGrad<float, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = num_rois * C * pool_h * pool_w;
_ROIAlignGrad
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
...
...
Dragon/src/kernels/vision/roi_align_op_kernel.fp16.cu
...
...
@@ -134,8 +134,8 @@ template<> void ROIAlign<float16, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = num_rois * C * pool_h * pool_w;
_ROIAlignHalf
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>
(nthreads, C, H, W, pool_h, pool_w,
sampling_ratio, spatial_scale,
reinterpret_cast<const half*>(x), rois,
...
...
Dragon/src/kernels/vision/roi_pool_op_kernel.cu
...
...
@@ -92,8 +92,8 @@ template<> void ROIPool<float, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = num_rois * C * pool_h * pool_w;
_ROIPool
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
...
...
@@ -185,8 +185,8 @@ template<> void ROIPool<float16, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = num_rois * C * pool_h * pool_w;
_ROIPoolHalf
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
...
...
@@ -286,8 +286,8 @@ template<> void ROIPoolGrad<float, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = N * C * H * W;
_ROIPoolGrad
-<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
-    0, ctx->cuda_stream() >> >(
+<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
+    0, ctx->cuda_stream() >>>(
nthreads,
num_rois,
C, H, W,
...
...
Dragon/src/onnx/onnx_backend.cc
...
...
@@ -180,9 +180,9 @@ ONNXBackend::get_special_nodes() const {
    };
    return kSpecialNodes;
}

const Map<string, Map<string, string>>&
ONNXBackend::get_node_renamed_attrs() const {
    const static Map<string, Map<string, string>>
        kPerNodeRenamedAttrs = {
            { "Gemm", { { "transB", "transW" } } },
            { "BatchNormalization", { { "epsilon", "eps" } } },
...
...
Dragon/src/onnx/onnx_backend.h
...
...
@@ -221,7 +221,7 @@ class ONNXBackend {
    const Map<string, SpecialNodeConverter>& get_special_nodes() const;
    const Map<string, string>& get_renamed_attrs() const;
    const Map<string, Map<string, string>>& get_node_renamed_attrs() const;
};

} // namespace onnx
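`get_node_renamed_attrs()` layers per-op-type attribute renames (e.g. Gemm's `transB` -> `transW`) over the global table from `get_renamed_attrs()`. A hedged sketch of how a converter might consult the two maps; the helper and the `Map` alias are hypothetical:

#include <string>
#include <unordered_map>
using std::string;
template <class K, class V>
using Map = std::unordered_map<K, V>;  // assumed alias

// Per-node renames win; otherwise fall back to the global table,
// then to the attribute name itself.
string RenameAttr(const string& op_type, const string& attr,
                  const Map<string, Map<string, string>>& per_node,
                  const Map<string, string>& global) {
    auto it = per_node.find(op_type);
    if (it != per_node.end()) {
        auto jt = it->second.find(attr);
        if (jt != it->second.end()) return jt->second;
    }
    auto kt = global.find(attr);
    return kt != global.end() ? kt->second : attr;
}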
...
...
Dragon/src/operators/activation/cudnn_dropout_op.cc
...
...
@@ -77,15 +77,8 @@ template <class Context>
void CuDNNDropoutOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

template <class Context>
template <typename T>
...
...
@@ -147,15 +140,8 @@ template <class Context>
void CuDNNDropoutGradientOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

DEPLOY_CUDNN(Dropout);
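This hunk is the commit's actual subject: every hand-rolled `if (XIsType(...)) RunImpl<T>(); ... else LOG(FATAL)` chain collapses into one `DispatchHelper<TensorTypes<...>>::Call(this, X(0))`. A minimal sketch of how such a dispatcher can be built by recursing over the type list; Dragon's real implementation surely differs in detail (the `meta()` accessor here is an assumption):

#include <stdexcept>
#include <typeindex>

template <typename... Types> struct TensorTypes {};

// Try each T in order; on a runtime type match, call
// op->template RunImpl<T>() and stop.
template <typename Sizes> struct DispatchHelper;

template <typename T, typename... Rest>
struct DispatchHelper<TensorTypes<T, Rest...>> {
    template <typename Op, typename Tensor>
    static void Call(Op* op, const Tensor& x) {
        if (x.meta() == std::type_index(typeid(T)))
            return op->template RunImpl<T>();
        DispatchHelper<TensorTypes<Rest...>>::Call(op, x);
    }
};

template <>
struct DispatchHelper<TensorTypes<>> {
    template <typename Op, typename Tensor>
    static void Call(Op*, const Tensor&) {  // exhausted: unsupported
        throw std::runtime_error("Unsupported tensor type.");
    }
};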
...
...
Dragon/src/operators/activation/cudnn_elu_op.cc
...
...
@@ -26,15 +26,8 @@ template <class Context>
void CuDNNEluOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

template <class Context>
template <typename T>
...
...
@@ -60,15 +53,8 @@ template <class Context>
void CuDNNEluGradientOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

DEPLOY_CUDNN(Elu);
...
...
Dragon/src/operators/activation/cudnn_relu_op.cc
...
...
@@ -40,15 +40,8 @@ void CuDNNReluOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

template <class Context>
template <typename T>
...
...
@@ -92,15 +85,8 @@ void CuDNNReluGradientOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

DEPLOY_CUDNN(Relu);
...
...
Dragon/src/operators/activation/cudnn_sigmoid_op.cc
...
...
@@ -35,15 +35,8 @@ template <class Context>
void CuDNNSigmoidOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

template <class Context>
template <typename T>
...
...
@@ -82,15 +75,8 @@ template <class Context>
void CuDNNSigmoidGradientOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

DEPLOY_CUDNN(Sigmoid);
...
...
Dragon/src/operators/activation/cudnn_softmax_op.cc
...
...
@@ -45,15 +45,8 @@ void CuDNNSoftmaxOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

template <class Context>
template <typename T>
...
...
@@ -91,15 +84,8 @@ void CuDNNSoftmaxGradientOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

DEPLOY_CUDNN(Softmax);
...
...
Dragon/src/operators/activation/cudnn_tanh_op.cc
...
...
@@ -35,15 +35,8 @@ template <class Context>
void CuDNNTanhOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

template <class Context>
template <typename T>
...
...
@@ -82,15 +75,8 @@ template <class Context>
void CuDNNTanhGradientOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

DEPLOY_CUDNN(Tanh);
...
...
Dragon/src/operators/activation/dropout_op.cc
...
...
@@ -44,15 +44,8 @@ template <class Context>
void DropoutOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

template <class Context>
template <typename T>
...
...
@@ -83,15 +76,8 @@ template <class Context>
void DropoutGradientOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

DEPLOY_CPU(Dropout);
...
...
Dragon/src/operators/activation/droppath_op.cc
...
...
@@ -52,15 +52,8 @@ void DropPathOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

template <class Context>
template <typename T>
...
...
@@ -97,15 +90,8 @@ void DropPathGradientOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

DEPLOY_CPU(DropPath);
...
...
Dragon/src/operators/activation/elu_op.cc
...
...
@@ -20,13 +20,8 @@ template <class Context>
void EluOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32" });
-    }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
}

template <class Context>
template <typename T>
...
...
@@ -46,13 +41,8 @@ template <class Context>
void EluGradientOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32" });
-    }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
}

DEPLOY_CPU(Elu);
...
...
Dragon/src/operators/activation/prelu_op.cc
...
...
@@ -40,13 +40,8 @@ void PReluOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32" });
-    }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
}

template <class Context>
template <typename T>
...
...
@@ -98,13 +93,8 @@ void PReluGradientOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
    Y(1)->ReshapeLike(X(1));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32" });
-    }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
}

DEPLOY_CPU(PRelu);
...
...
Dragon/src/operators/activation/relu_op.cc
...
...
@@ -20,15 +20,8 @@ template <class Context>
void ReluOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

template <class Context>
template <typename T>
...
...
@@ -48,15 +41,8 @@ template <class Context>
void ReluGradientOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

DEPLOY_CPU(Relu);
...
...
Dragon/src/operators/activation/selu_op.cc
...
...
@@ -19,15 +19,8 @@ template <class Context>
void SEluOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

template <class Context>
template <typename T>
...
...
@@ -47,15 +40,8 @@ template <class Context>
void SEluGradientOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else if (XIsType(X(0), float16)) {
-        RunImpl<float16>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

DEPLOY_CPU(SElu);
...
...
Dragon/src/operators/activation/sigmoid_op.cc
...
...
@@ -15,13 +15,8 @@ template <class Context>
void SigmoidOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32" });
-    }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
}

template <class Context>
template <typename T>
...
...
@@ -41,13 +36,8 @@ template <class Context>
void SigmoidGradientOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32" });
-    }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
}

DEPLOY_CPU(Sigmoid);
...
...
Dragon/src/operators/activation/softmax_op.cc
...
...
@@ -43,13 +43,8 @@ void SoftmaxOp<Context>::RunOnDevice() {
    Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) {
-        RunImpl<float>();
-    } else {
-        LOG(FATAL) << DTypeString(X(0), { "float32" });
-    }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
}

template <class Context>
template <typename T>
...
...
@@ -86,13 +81,8 @@ void SoftmaxGradientOp<Context>::RunOnDevice() {
Y
(
0
)
->
ReshapeLike
(
X
(
0
));
if
(
XIsType
(
X
(
0
),
float
))
{
RunImpl
<
float
>
();
}
else
{
LOG
(
FATAL
)
<<
DTypeString
(
X
(
0
),
{
"float32"
}
);
}
DispatchHelper
<
TensorTypes
<
float
>>::
Call
(
this
,
X
(
0
));
}
DEPLOY_CPU
(
Softmax
);
...
...
Dragon/src/operators/activation/tanh_op.cc
...
@@ -15,13 +15,8 @@ template <class Context>
 void TanhOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0), { "float32" });
-    }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -41,13 +36,8 @@ template <class Context>
 void TanhGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0), { "float32" });
-    }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 DEPLOY_CPU(Tanh);
...
Dragon/src/operators/arithmetic/affine_op.cc
...
@@ -46,15 +46,8 @@ void AffineOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -111,9 +104,7 @@ void AffineGradientOp<Context>::RunImpl() {
 }

 template <class Context> template <typename T>
-void AffineGradientOp<Context>::Reduce(
-    T* x, T* y) {
+void AffineGradientOp<Context>::Reduce(T* x, T* y) {
     vec32_t dims = {
         (int)outer_dim_,
         (int)scale_dim_,
...
@@ -138,15 +129,8 @@ void AffineGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(-1));
-    if (XIsType(X(-1), float)) { RunImpl<float>(); }
-    else if (XIsType(X(-1), float16)) { RunImpl<float16>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(-1),
-            { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(-1));
 }

 DEPLOY_CPU(Affine);
...
Dragon/src/operators/arithmetic/cudnn_affine_op.cc
...
@@ -108,13 +108,6 @@ void CuDNNAffineOp<Context>::RunOnDevice() {
 template <class Context> template <typename DT, typename CT>
 void CuDNNAffineGradientOp<Context>::RunImpl() {
     this->template ResetDesc<DT>(X(-1));
-    scale_dim_ = X(1).count();
-    outer_dim_ = X(-1).count(0, axis_);
-    inner_dim_ = X(-1).count(axis_ + num_axes_);
-    dim_ = scale_dim_ * inner_dim_;
-    reduce_dim_ = std::max(outer_dim_, inner_dim_);
-    Y(0)->ReshapeLike(X(-1));
     auto* alpha = X(1).template data<DT, Context>();
     auto* dy = X(-1).template mutable_data<DT, Context>();
...
@@ -230,9 +223,7 @@ void CuDNNAffineGradientOp<Context>::CuDNNReduce(
 }

 template <class Context> template <typename T>
-void CuDNNAffineGradientOp<Context>::Reduce(
-    T* x, T* y) {
+void CuDNNAffineGradientOp<Context>::Reduce(T* x, T* y) {
     vec32_t dims = {
         (int)outer_dim_,
         (int)scale_dim_,
...
@@ -248,6 +239,14 @@ void CuDNNAffineGradientOp<Context>::Reduce(
 template <class Context>
 void CuDNNAffineGradientOp<Context>::RunOnDevice() {
+    scale_dim_ = X(1).count();
+    outer_dim_ = X(-1).count(0, axis_);
+    inner_dim_ = X(-1).count(axis_ + num_axes_);
+    dim_ = scale_dim_ * inner_dim_;
+    reduce_dim_ = std::max(outer_dim_, inner_dim_);
+    Y(0)->ReshapeLike(X(-1));
     if (XIsType(X(-1), float)) {
         RunImpl<float, float>();
     } else if (XIsType(X(-1), float16)) {
...
Dragon/src/operators/arithmetic/eltwise_op.cc
...
@@ -36,6 +36,13 @@ void EltwiseOp<Context>::ProdRunImpl() {
+template <class Context> template <typename T>
+void EltwiseOp<Context>::RunImpl() {
+    if (operation_ == "SUM") SumRunImpl<T>();
+    else if (operation_ == "PROD") ProdRunImpl<T>();
+    else LOG(FATAL) << "Unknwon Operation: " << operation_;
+}
+
 template <class Context>
 void EltwiseOp<Context>::RunOnDevice() {
     for (int i = 1; i < XSize(); i++) {
         CHECK(X(i).dims() == X(0).dims())
             << "\nExcepted Input(" << i << ")'s dims as "
...
@@ -45,33 +52,10 @@ void EltwiseOp<Context>::RunImpl() {
     Y(0)->ReshapeLike(X(0));
-    if (operation_ == "SUM") SumRunImpl<T>();
-    else if (operation_ == "PROD") ProdRunImpl<T>();
-    else LOG(FATAL) << "Unknwon Operation: " << operation_;
-}
-
-template <class Context>
-void EltwiseOp<Context>::RunOnDevice() {
-    if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -133,26 +117,10 @@ void EltwiseGradientOp<Context>::RunImpl() {
 template <class Context>
 void EltwiseGradientOp<Context>::RunOnDevice() {
-    if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Eltwise);
...
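Note the design move in the first eltwise hunk: the runtime operation_ branch is hoisted into the templated RunImpl<T>, so the type dispatcher only needs one entry point per element type. A toy illustration of the shape of that pattern (a stand-in class, not Dragon's code):

    #include <cstdio>
    #include <string>

    struct ToyEltwiseOp {
        std::string operation_ = "SUM";

        template <typename T> void SumRunImpl()  { std::puts("sum");  }
        template <typename T> void ProdRunImpl() { std::puts("prod"); }

        // One typed entry point; the string branch lives inside it.
        template <typename T>
        void RunImpl() {
            if (operation_ == "SUM") SumRunImpl<T>();
            else if (operation_ == "PROD") ProdRunImpl<T>();
            else std::printf("Unknown operation: %s\n", operation_.c_str());
        }
    };

    int main() {
        ToyEltwiseOp op;
        op.RunImpl<float>();  // dispatched by type once, branched by name inside
    }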
Dragon/src/operators/arithmetic/exp_op.cc
...
@@ -15,17 +15,9 @@ template <class Context>
 void ExpOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float16", "float32", "float64" });
-    }
+    DispatchHelper<TensorTypes<float, float16, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -40,17 +32,9 @@ template <class Context>
 void ExpGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float16", "float32", "float64" });
-    }
+    DispatchHelper<TensorTypes<float, float16, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Exp);
...
Dragon/src/operators/arithmetic/fully_connected_op.cc
...
@@ -84,6 +84,12 @@ void FullyConnectedOp<Context>::NoTransRunImpl() {
     }
 }

+template <class Context> template <typename T>
+void FullyConnectedOp<Context>::RunImpl() {
+    if (transW_) TransRunImpl<T>();
+    else NoTransRunImpl<T>();
+}
+
 template <class Context>
 void FullyConnectedOp<Context>::RunOnDevice() {
     DETERMINE_RUNTIME_ARGS(X(0));
...
@@ -101,31 +107,12 @@ void FullyConnectedOp<Context>::RunOnDevice() {
     for (int i = 0; i < axis_ + 1; i++) {
         out_shape[i] = i < axis_ ? X(0).dim(i) : N_;
     }
     Y(0)->Reshape(out_shape);
-    if (XIsType(X(0), float16)) {
-        if (transW_) { TransRunImpl<float16>(); }
-        else { NoTransRunImpl<float16>(); }
-    } else if (XIsType(X(0), float)) {
-        if (transW_) { TransRunImpl<float>(); }
-        else { NoTransRunImpl<float>(); }
-    } else if (XIsType(X(0), double)) {
-        if (transW_) { TransRunImpl<double>(); }
-        else { NoTransRunImpl<double>(); }
-    } else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float16", "float32", "float64" });
-    }
+    DispatchHelper<TensorTypes<float, float16, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -209,17 +196,9 @@ void FullyConnectedGradientOp<Context>::RunOnDevice() {
             << X(1).DimString();
     }
-    if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float16", "float32", "float64" });
-    }
+    DispatchHelper<TensorTypes<float, float16, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(FullyConnected);
...
Dragon/src/operators/arithmetic/gram_matrix_op.cc
...
@@ -35,17 +35,9 @@ void GramMatrixOp<Context>::RunOnDevice() {
         { outer_dim_, axis_dim_, axis_dim_ });
-    if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float16", "float32", "float64" });
-    }
+    DispatchHelper<TensorTypes<float, float16, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -79,17 +71,9 @@ void GramMatrixGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float16", "float32", "float64" });
-    }
+    DispatchHelper<TensorTypes<float, float16, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(GramMatrix);
...
Dragon/src/operators/arithmetic/log_op.cc
...
@@ -14,17 +14,9 @@ template <class Context>
 void LogOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float16", "float32", "float64" });
-    }
+    DispatchHelper<TensorTypes<float, float16, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -39,17 +31,9 @@ template <class Context>
 void LogGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float16", "float32", "float64" });
-    }
+    DispatchHelper<TensorTypes<float, float16, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Log);
...
Dragon/src/operators/arithmetic/matmul_op.cc
...
@@ -65,17 +65,9 @@ void MatmulOp<Context>::RunOnDevice() {
     Y(0)->Reshape(out_shape);
-    if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float16", "float32", "float64" });
-    }
+    DispatchHelper<TensorTypes<float, float16, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -182,17 +174,9 @@ void MatmulGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
     Y(1)->ReshapeLike(X(1));
-    if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float16", "float32", "float64" });
-    }
+    DispatchHelper<TensorTypes<float, float16, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Matmul);
...
Dragon/src/operators/arithmetic/maximum_op.cc
...
@@ -58,26 +58,10 @@ void MaximumOp<Context>::RunImpl() {
 template <class Context>
 void MaximumOp<Context>::RunOnDevice() {
-    if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -135,9 +119,6 @@ void MaximumGradientOp<Context>::BroadcastRunImpl() {
 template <class Context> template <typename T>
 void MaximumGradientOp<Context>::RunImpl() {
-    Y(0)->ReshapeLike(X(0));
-    Y(1)->ReshapeLike(X(1));
     if (X(0).dims() == X(1).dims()) {
         EltwiseRunImpl<T>();
     } else {
...
@@ -147,26 +128,13 @@ void MaximumGradientOp<Context>::RunImpl() {
 template <class Context>
 void MaximumGradientOp<Context>::RunOnDevice() {
-    if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    Y(0)->ReshapeLike(X(0));
+    Y(1)->ReshapeLike(X(1));
+    DispatchHelper<TensorTypes
+        <int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Maximum);
...
Dragon/src/operators/arithmetic/minimum_op.cc
...
@@ -58,26 +58,10 @@ void MinimumOp<Context>::RunImpl() {
 template <class Context>
 void MinimumOp<Context>::RunOnDevice() {
-    if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -135,9 +119,6 @@ void MinimumGradientOp<Context>::BroadcastRunImpl() {
 template <class Context> template <typename T>
 void MinimumGradientOp<Context>::RunImpl() {
-    Y(0)->ReshapeLike(X(0));
-    Y(1)->ReshapeLike(X(1));
     if (X(0).dims() == X(1).dims()) {
         EltwiseRunImpl<T>();
     } else {
...
@@ -147,26 +128,13 @@ void MinimumGradientOp<Context>::RunImpl() {
 template <class Context>
 void MinimumGradientOp<Context>::RunOnDevice() {
-    if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    Y(0)->ReshapeLike(X(0));
+    Y(1)->ReshapeLike(X(1));
+    DispatchHelper<TensorTypes
+        <int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Minimum);
...
Dragon/src/operators/arithmetic/moments_op.cc
...
@@ -7,6 +7,31 @@ namespace dragon {
 template <class Context> template <typename Tx, typename Ty>
+void MomentsOp<Context>::RunImpl() {
+    auto* x = X(0).template data<Tx, Context>();
+    auto* mean = Y(0)->template mutable_data<Ty, Context>();
+    auto* var = Y(1)->template mutable_data<Ty, Context>();
+    if (X(0).count() == 1) {
+        kernel::TypeA2B(Y(0)->count(), x, mean, ctx());
+        math::Set(Y(0)->count(),
+            cast::to<Ty>(0.f), var, ctx());
+    } else {
+        kernel::Moments(
+            (int)dims32_.size(), dims32_.data(),
+            (int)axes32_.size(), axes32_.data(),
+            x, mean, var, ctx());
+    }
+}
+
+template <class Context>
 void MomentsOp<Context>::RunOnDevice() {
+    dims_ = X(0).dims();
+    axes32_.clear();
+    dims32_.assign(dims_.begin(), dims_.end());
+    axes32_.assign(axes_.begin(), axes_.end());
...
@@ -35,31 +60,6 @@ void MomentsOp<Context>::RunImpl() {
     Y(0)->Reshape(out_shape);
     Y(1)->Reshape(out_shape);
-    auto* x = X(0).template data<Tx, Context>();
-    auto* mean = Y(0)->template mutable_data<Ty, Context>();
-    auto* var = Y(1)->template mutable_data<Ty, Context>();
-    if (X(0).count() == 1) {
-        kernel::TypeA2B(Y(0)->count(), x, mean, ctx());
-        math::Set(Y(0)->count(),
-            cast::to<Ty>(0.f), var, ctx());
-    } else {
-        kernel::Moments(
-            (int)dims32_.size(), dims32_.data(),
-            (int)axes32_.size(), axes32_.data(),
-            x, mean, var, ctx());
-    }
-}
-
-template <class Context>
-void MomentsOp<Context>::RunOnDevice() {
-    if (XIsType(X(0), int8_t)) {
-        RunImpl<int8_t, float>();
-    } else if (XIsType(X(0), uint8_t)) {
...
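The count() == 1 early-out above computes the moments of a single element without launching a reduction kernel: the mean is the value itself and the variance is zero. A minimal scalar sketch of that semantics (plain C++, not Dragon's kernel::Moments):

    #include <cstdio>
    #include <vector>

    // Mean and (population) variance over all elements, with the
    // single-element shortcut from the hunk above.
    void Moments(const std::vector<float>& x, float* mean, float* var) {
        if (x.size() == 1) {
            *mean = x[0];   // plays the role of kernel::TypeA2B (copy/cast)
            *var = 0.f;     // plays the role of math::Set (zero the variance)
            return;
        }
        float sum = 0.f, sq = 0.f;
        for (float v : x) { sum += v; sq += v * v; }
        *mean = sum / x.size();
        *var = sq / x.size() - (*mean) * (*mean);
    }

    int main() {
        std::vector<float> x{2.f};
        float m, v;
        Moments(x, &m, &v);
        std::printf("mean=%.1f var=%.1f\n", m, v);  // mean=2.0 var=0.0
    }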
Dragon/src/operators/arithmetic/pow_op.cc
...
@@ -32,17 +32,9 @@ template <class Context>
 void PowOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float16", "float32", "float64" });
-    }
+    DispatchHelper<TensorTypes<float, float16, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -88,17 +80,9 @@ template <class Context>
 void PowGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float16", "float32", "float64" });
-    }
+    DispatchHelper<TensorTypes<float, float16, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Pow);
...
Dragon/src/operators/arithmetic/sqrt_op.cc
...
@@ -14,17 +14,9 @@ template <class Context>
 void SqrtOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float16", "float32", "float64" });
-    }
+    DispatchHelper<TensorTypes<float, float16, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -40,17 +32,9 @@ template <class Context>
 void SqrtGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float16", "float32", "float64" });
-    }
+    DispatchHelper<TensorTypes<float, float16, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Sqrt);
...
Dragon/src/operators/arithmetic/square_op.cc
...
@@ -14,26 +14,10 @@ template <class Context>
 void SquareOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -49,26 +33,10 @@ template <class Context>
 void SquareGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Square);
...
Dragon/src/operators/array/arange_op.cc
...
@@ -6,20 +6,24 @@ namespace dragon {
 template <class Context> template <typename T>
 void ArangeOp<Context>::RunImpl() {
-    astart_ = start(), astop_ = stop(), astep_ = step();
-    if (astop_ == 0) { astop_ = astart_; astart_ = 0; }
-    dim_ = (astop_ - astart_ - 1) / astep_ + 1;
-    CHECK_GT(dim_, 0) << "\nInvalid arguments:\n"
-        << "start = " << start() << ", "
-        << "stop = " << stop() << ", "
-        << "step = " << step() << ".";
-    Y(0)->Reshape({ dim_ });
     auto* y = Y(0)->template mutable_data<T, Context>();
     kernel::Arange(dim_, astart_, astep_, y, ctx());
 }

 template <class Context>
 void ArangeOp<Context>::RunOnDevice() {
+    astart_ = start(), astop_ = stop(), astep_ = step();
+    if (astop_ == 0) { astop_ = astart_; astart_ = 0; }
+    dim_ = (astop_ - astart_ - 1) / astep_ + 1;
+    CHECK_GT(dim_, 0) << "\nInvalid arguments:\n"
+        << "start = " << start() << ", "
+        << "stop = " << stop() << ", "
+        << "step = " << step() << ".";
+    Y(0)->Reshape({ dim_ });
     if (dtype() == "int8") { RunImpl<int8_t>(); }
     else if (dtype() == "uint8") {
...
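A quick standalone check of the length formula dim_ = (astop_ - astart_ - 1) / astep_ + 1 used in both functions above (assuming a positive step, as the CHECK_GT enforces):

    #include <cassert>
    #include <cstdio>

    // For positive step, this counts the values start, start+step, ... < stop
    // using integer division.
    int ArangeDim(int start, int stop, int step) {
        return (stop - start - 1) / step + 1;
    }

    int main() {
        assert(ArangeDim(0, 10, 3) == 4);  // 0, 3, 6, 9
        assert(ArangeDim(2, 3, 1) == 1);   // 2
        std::printf("ok\n");
    }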
Dragon/src/operators/array/argreduce_op.cc
...
@@ -101,28 +101,10 @@ void ArgReduceOp<Context>::RunOnDevice() {
     Y(0)->Reshape(out_shape);
     Y(1)->Reshape(out_shape);
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(ArgReduce);
...
Dragon/src/operators/array/concat_op.cc
...
@@ -56,28 +56,10 @@ void ConcatOp<Context>::RunOnDevice() {
     Y(0)->Reshape(out_shape);
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -116,28 +98,10 @@ void ConcatGradientOp<Context>::RunOnDevice() {
     for (int i = 0; i < YSize(); i++)
         Y(i)->ReshapeLike(X(i));
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Concat);
...
Dragon/src/operators/array/crop_op.cc
...
@@ -145,28 +145,10 @@ void CropOp<Context>::RunOnDevice() {
     TENSOR_FROM_VEC(X_strides_, X(0).strides(), int);
     TENSOR_FROM_VEC(Y_dims_, Y_dims, int);
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -209,28 +191,10 @@ void CropGradientOp<Context>::RunOnDevice() {
     TENSOR_FROM_VEC(X_strides_, X(0).strides(), int);
     TENSOR_FROM_VEC(Y_dims_, Y_dims, int);
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Crop);
...
Dragon/src/operators/array/index_select_op.cc
...
@@ -55,28 +55,10 @@ void IndexSelectOp<Context>::RunOnDevice() {
     CHECK(X(1).template IsType<int64_t>())
         << "\nThe type of indices should be int64.";
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -115,26 +97,10 @@ void IndexSelectGradientOp<Context>::RunOnDevice() {
     CHECK(X(1).template IsType<int64_t>())
         << "\nThe type of indices should be int64.";
-    if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(IndexSelect);
...
Dragon/src/operators/array/multinomial_op.cc
...
@@ -35,17 +35,26 @@ void MultinomialOp<Context>::RunImpl() {
     double running_total, r;
     int yi = 0, num_classes = X(0).dim(axis_);
+    double uniform_p = 1. / (double)num_classes;
+    auto* rng = ctx()->rand_generator();
+    std::uniform_real_distribution<float> eps_dist;
     for (int i = 0; i < outer_dim_; ++i) {
         running_total = 0.;
-        for (int j = 0; j < num_classes; ++j) {
-            running_total += (double)x[j];
-            cdf[j] = running_total;
+        if (eps_ > 0.f && eps_dist(*rng) < eps_) {
+            for (int j = 0; j < num_classes; ++j) {
+                running_total += uniform_p;
+                cdf[j] = running_total;
+            }
+        } else {
+            for (int j = 0; j < num_classes; ++j) {
+                running_total += (double)x[j];
+                cdf[j] = running_total;
+            }
         }
         std::uniform_real_distribution<double>
-            dist(0.f, running_total);
+            dist(0., running_total);
         for (int j = 0; j < (int)num_samples_; ++j) {
             r = dist(*rng);
             auto found_iter = std::upper_bound(
...
@@ -75,24 +84,10 @@ void MultinomialOp<Context>::RunOnDevice() {
     // Normalize the logits if necessary
     if (normalize_) SoftmaxRun();
-    if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "int8", "uint8", "int32", "int64",
-              "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <int8_t, uint8_t, int, int64_t,
+         float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Multinomial);
...
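For context, the new sampling path above mixes in a uniform CDF with probability eps_ before drawing with std::upper_bound. A standalone, CPU-only sketch of that logic, using a toy probability row and a fixed seed (not Dragon's kernel):

    #include <algorithm>
    #include <cstdio>
    #include <random>
    #include <vector>

    int main() {
        std::vector<double> x = {0.1, 0.7, 0.2};  // one row of probabilities
        const double eps = 0.1;
        std::mt19937 rng(0);
        std::uniform_real_distribution<float> eps_dist;

        // Build the cumulative distribution, occasionally substituting a
        // uniform one (the exploration step added by this commit).
        std::vector<double> cdf(x.size());
        double running_total = 0.;
        if (eps > 0. && eps_dist(rng) < eps) {
            const double uniform_p = 1. / (double)x.size();
            for (size_t j = 0; j < x.size(); ++j)
                cdf[j] = (running_total += uniform_p);
        } else {
            for (size_t j = 0; j < x.size(); ++j)
                cdf[j] = (running_total += x[j]);
        }

        // Draw samples by inverting the CDF with a binary search.
        std::uniform_real_distribution<double> dist(0., running_total);
        for (int s = 0; s < 5; ++s) {
            double r = dist(rng);
            auto it = std::upper_bound(cdf.begin(), cdf.end(), r);
            std::printf("sample %d -> class %ld\n",
                        s, (long)(it - cdf.begin()));
        }
    }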
Dragon/src/operators/array/one_hot_op.cc
...
@@ -29,17 +29,9 @@ void OneHotOp<Context>::RunOnDevice() {
     Y(0)->Reshape(out_shape);
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float32", "int32", "int64" });
-    }
+    DispatchHelper<TensorTypes<float, int, int64_t>>::Call(this, X(0));
 }

 DEPLOY_CPU(OneHot);
...
Dragon/src/operators/array/pad_op.cc
...
@@ -112,28 +112,10 @@ void PadOp<Context>::RunOnDevice() {
     TENSOR_FROM_VEC(X_strides_, X(0).strides(), int);
     TENSOR_FROM_VEC(Y_dims_, Y_dims, int);
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -213,28 +195,10 @@ void PadGradientOp<Context>::RunOnDevice() {
     TENSOR_FROM_VEC(Y_strides_, X(0).strides(), int);
     TENSOR_FROM_VEC(X_dims_, X_dims, int);
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Pad);
...
Dragon/src/operators/array/reduce_op.cc
...
@@ -14,33 +14,6 @@ namespace dragon {
 template <class Context> template <typename T>
 void ReduceOp<Context>::RunImpl() {
-    dims_ = X(0).dims();
-    dims32_.assign(dims_.begin(), dims_.end());
-    axes32_.assign(axes_.begin(), axes_.end());
-    if (axes32_.empty()) {
-        // Reduce to a Scalar if missing axes
-        for (int i = 0; i < X(0).ndim(); ++i)
-            axes32_.push_back(i);
-    }
-    for (int i = 0; i < axes32_.size(); i++) {
-        int axis = axes32_[i];
-        axes32_[i] = axis < 0 ? axis + X(0).ndim() : axis;
-        CHECK(axes32_[i] >= 0 && axes32_[i] < X(0).ndim())
-            << "\nExcepted the axis in [-" << X(0).ndim()
-            << ", " << X(0).ndim() << "), got " << axis << ".";
-        dims_[axes32_[i]] = 1;
-    }
-    vec64_t out_shape;
-    for (const auto& dim : dims_) {
-        if (dim != 1 || keep_dims_)
-            out_shape.emplace_back(dim);
-    }
-    Y(0)->Reshape(out_shape);
     auto* x = X(0).template data<T, Context>();
     auto* y = Y(0)->template mutable_data<T, Context>();
...
@@ -64,31 +37,8 @@ void ReduceOp<Context>::RunImpl() {
 template <class Context>
 void ReduceOp<Context>::RunOnDevice() {
-    if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
-}
-
-template <class Context> template <typename T>
-void ReduceGradientOp<Context>::RunImpl() {
-    y_dims_ = X(0).dims();
     dims_ = X(0).dims();
     dims32_.assign(dims_.begin(), dims_.end());
     axes32_.assign(axes_.begin(), axes_.end());
     if (axes32_.empty()) {
...
@@ -103,11 +53,25 @@ void ReduceGradientOp<Context>::RunImpl() {
         CHECK(axes32_[i] >= 0 && axes32_[i] < X(0).ndim())
             << "\nExcepted the axis in [-" << X(0).ndim()
             << ", " << X(0).ndim() << "), got " << axis << ".";
-        y_dims_[axes32_[i]] = 1;
+        dims_[axes32_[i]] = 1;
     }
-    Y(0)->ReshapeLike(X(0));
+    vec64_t out_shape;
+    for (const auto& dim : dims_) {
+        if (dim != 1 || keep_dims_)
+            out_shape.emplace_back(dim);
+    }
+    Y(0)->Reshape(out_shape);
+    DispatchHelper<TensorTypes
+        <int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
+}
+
+template <class Context> template <typename T>
+void ReduceGradientOp<Context>::RunImpl() {
+    auto* dy = X(1).template data<T, Context>();
+    auto* dx = Y(0)->template mutable_data<T, Context>();
...
@@ -153,26 +117,30 @@ void ReduceGradientOp<Context>::RunImpl() {
 template <class Context>
 void ReduceGradientOp<Context>::RunOnDevice() {
-    if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    y_dims_ = X(0).dims();
+    axes32_.assign(axes_.begin(), axes_.end());
+    if (axes32_.empty()) {
+        // Reduce to a Scalar if missing axes
+        for (int i = 0; i < X(0).ndim(); ++i)
+            axes32_.push_back(i);
+    }
+    for (int i = 0; i < axes32_.size(); i++) {
+        int axis = axes32_[i];
+        axes32_[i] = axis < 0 ? axis + X(0).ndim() : axis;
+        CHECK(axes32_[i] >= 0 && axes32_[i] < X(0).ndim())
+            << "\nExcepted the axis in [-" << X(0).ndim()
+            << ", " << X(0).ndim() << "), got " << axis << ".";
+        y_dims_[axes32_[i]] = 1;
+    }
+    Y(0)->ReshapeLike(X(0));
+    DispatchHelper<TensorTypes
+        <int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Reduce);
...
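Both reduce hunks normalize negative axes with axis < 0 ? axis + ndim : axis before the CHECK clamps the result. That convention in isolation (a tiny standalone check, not Dragon's code):

    #include <cassert>

    // A negative axis counts from the end, so -1 on a 4-d tensor means 3.
    int NormalizeAxis(int axis, int ndim) {
        return axis < 0 ? axis + ndim : axis;
    }

    int main() {
        assert(NormalizeAxis(-1, 4) == 3);
        assert(NormalizeAxis(2, 4) == 2);
    }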
Dragon/src/operators/array/repeat_op.cc
...
@@ -45,28 +45,10 @@ void RepeatOp<Context>::RunOnDevice() {
         Y(0)->Reshape(out_shape);
     }
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -98,26 +80,10 @@ void RepeatGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Repeat);
...
Dragon/src/operators/array/slice_op.cc
...
@@ -65,28 +65,10 @@ void SliceOp<Context>::RunOnDevice() {
     outer_dim_ = X(0).count(0, axis_);
     inner_dim_ = X(0).count(axis_ + 1);
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -136,28 +118,10 @@ void SliceGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Slice);
...
Dragon/src/operators/array/stack_op.cc
...
@@ -51,28 +51,10 @@ void StackOp<Context>::RunOnDevice() {
     Y(0)->Reshape(out_shape);
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -104,28 +86,10 @@ void StackGradientOp<Context>::RunOnDevice() {
     for (int i = 0; i < YSize(); i++)
         Y(i)->ReshapeLike(X(i));
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Stack);
...
Dragon/src/operators/array/tile_op.cc
...
@@ -60,28 +60,10 @@ void TileOp<Context>::RunOnDevice() {
     TENSOR_FROM_VEC(X_dims_, X(0).dims(), int);
     TENSOR_FROM_VEC(Y_dims_, Y_dims, int);
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -99,7 +81,7 @@ void TileGradientOp<Context>::RunImpl() {
 template <class Context>
 void TileGradientOp<Context>::RunOnDevice() {
     // Add the axes
-    vector< pair<int, int> > dispatch_axes;
+    vector<pair<int, int>> dispatch_axes;
     for (int i = 0; i < X(0).ndim(); i++) {
         auto m = multiples(i);
         if (m > 1) { dispatch_axes.push_back({ m, i }); }
...
@@ -128,26 +110,11 @@ void TileGradientOp<Context>::RunOnDevice() {
         rows_ = dst_->count(0, axis_);
         cols_ = dst_->count(axis_);
-        if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-        else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-        else if (XIsType(X(0), int)) { RunImpl<int>(); }
-        else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-        else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-        else if (XIsType(X(0), float)) { RunImpl<float>(); }
-        else if (XIsType(X(0), double)) { RunImpl<double>(); }
-        else {
-            LOG(FATAL) << DTypeString(X(0),
-                { "int8", "uint8", "int32", "int64",
-                  "float16", "float32", "float64", });
-        }
-        ctx()->FinishDeviceCompution();
+        DispatchHelper<TensorTypes
+            <int8_t, uint8_t, int, int64_t,
+             float16, float, double>>::Call(this, X(0));
+        ctx()->FinishDeviceCompution();
         // Protect X if num_axes >= 2
         std::swap(src_, dst_);
...
Dragon/src/operators/array/transpose_op.cc
...
@@ -54,28 +54,10 @@ void TransposeOp<Context>::RunOnDevice() {
     Y(0)->Reshape(out_shape);
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -125,28 +107,10 @@ void TransposeGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Transpose);
...
Dragon/src/operators/control_flow/assign_op.cc
...
@@ -116,28 +116,10 @@ void AssignOp<Context>::RunOnDevice() {
     TENSOR_FROM_VECTOR(X_dims_, X_dims, int);
     TENSOR_FROM_VECTOR(Y_strides_, Y(0)->strides(), int);
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Assign);
...
Dragon/src/operators/control_flow/copy_op.cc
...
@@ -14,28 +14,10 @@ template <class Context>
 void CopyOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(Copy);
...
Dragon/src/operators/control_flow/masked_assign_op.cc
...
@@ -65,28 +65,10 @@ void MaskedAssignOp<Context>::RunOnDevice() {
     CHECK(XIsType(X(1), bool) || XIsType(X(1), uint8_t))
         << "\nExcepted bool or uint8 mask.";
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "bool", "int8", "uint8", "int32", "int64",
-              "float16", "float32", "float64", });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>>::Call(this, X(0));
 }

 DEPLOY_CPU(MaskedAssign);
...
Dragon/src/operators/loss/ctc_loss_op.cc
...
@@ -23,13 +23,8 @@ void CTCLossGradientOp<Context>::RunImpl() {
 template <class Context>
 void CTCLossGradientOp<Context>::RunOnDevice() {
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0), { "float32" });
-    }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 DEPLOY_CPU(CTCLoss);
...
Dragon/src/operators/loss/l1_loss_op.cc
...
@@ -15,9 +15,7 @@ void L1LossOp<Context>::RunImpl() {
         ->ReshapeLike(X(0))
         ->template mutable_data<T, Context>();
-    auto* y = Y(0)->Reshape({})
-        ->template mutable_data<T, Context>();
+    auto* y = Y(0)->template mutable_data<T, Context>();
     if (XSize() > 1) {
         auto* target = X(1).template data<T, Context>();
...
@@ -53,13 +51,10 @@ void L1LossOp<Context>::RunOnDevice() {
             << "while " << X(0).DimString()
            << " is required.";
     }
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0), { "float32" });
-    }
+    Y(0)->Reshape({});
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -104,13 +99,8 @@ void L1LossGradientOp<Context>::RunImpl() {
 template <class Context>
 void L1LossGradientOp<Context>::RunOnDevice() {
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0), { "float32" });
-    }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 DEPLOY_CPU(L1Loss);
...
Dragon/src/operators/loss/l2_loss_op.cc
...
@@ -14,9 +14,7 @@ void L2LossOp<Context>::RunImpl() {
         ->ReshapeLike(X(0))
         ->template mutable_data<T, Context>();
-    auto* y = Y(0)->Reshape({})
-        ->template mutable_data<float, Context>();
+    auto* y = Y(0)->template mutable_data<float, Context>();
     if (XSize() > 1) {
         auto* target = X(1).template data<T, Context>();
...
@@ -56,15 +54,10 @@ void L2LossOp<Context>::RunOnDevice() {
             << "while " << X(0).DimString()
             << " is required.";
     }
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float32", "float16" });
-    }
+    Y(0)->Reshape({});
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -107,15 +100,8 @@ void L2LossGradientOp<Context>::RunImpl() {
 template <class Context>
 void L2LossGradientOp<Context>::RunOnDevice() {
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float32", "float16" });
-    }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 DEPLOY_CPU(L2Loss);
...
Dragon/src/operators/loss/sigmoid_ce_loss_op.cc
...
@@ -50,13 +50,8 @@ void SigmoidCrossEntropyOp<Context>::RunOnDevice() {
     loss_.ReshapeLike(X(0));
     flag_.ReshapeLike(X(0));
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0), { "float32" });
-    }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -113,13 +108,8 @@ void SigmoidCrossEntropyGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
     flag_.ReshapeLike(X(0));
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0), { "float32" });
-    }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 DEPLOY_CPU(SigmoidCrossEntropy);
...
Dragon/src/operators/loss/smooth_l1_loss_op.cc
...
@@ -44,7 +44,6 @@ void SmoothL1LossOp<Context>::RunImpl() {
         normalizer = X(0).count();
     }
-    Y(0)->Reshape({});
     auto* y = Y(0)->template mutable_data<T, Context>();
     math::Sum(nelements, 1. / normalizer, err, y, ctx());
 }
...
@@ -53,13 +52,10 @@ template <class Context>
 void SmoothL1LossOp<Context>::RunOnDevice() {
     CHECK(X(0).count() == X(1).count());
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32" }); }
+    Y(0)->Reshape({});
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -110,13 +106,8 @@ void SmoothL1LossGradientOp<Context>::RunImpl() {
 template <class Context>
 void SmoothL1LossGradientOp<Context>::RunOnDevice() {
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32" }); }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 DEPLOY_CPU(SmoothL1Loss);
...
...
Dragon/src/operators/loss/softmax_ce_loss_op.cc
...
@@ -89,13 +89,8 @@ void SoftmaxCrossEntropyOp<Context>::RunOnDevice() {
     SoftmaxRun();
     loss_.ReshapeLike(X(0));
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32" }); }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -161,13 +156,8 @@ void SoftmaxCrossEntropyGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32" }); }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 DEPLOY_CPU(SoftmaxCrossEntropy);
...
...
Dragon/src/operators/misc/accuracy_op.cc
...
@@ -24,7 +24,7 @@ void AccuracyOp<Context>::RunImpl() {
             const int label = target[i * inner_dim_ + j];
             for (int k = 0; k < ignore_.count(); k++)
                 if (label == ignore[k]) continue;
-            vector<pair<Tx, int> > vec;
+            vector<pair<Tx, int>> vec;
             for (int k = 0; k < axis_dim_; k++)
                 vec.push_back(std::make_pair(
...
@@ -35,7 +35,7 @@ void AccuracyOp<Context>::RunImpl() {
                 vec.begin(), vec.begin() + top_k_, vec.end(),
-                std::greater<pair<Tx, int> >()
+                std::greater<pair<Tx, int>>()
             );
             for (int k = 0; k < top_k_; k++) {
                 if (vec[k].second == label) { acc++; break; }
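For reference, the top-k test these accuracy hunks touch works by partially sorting (score, index) pairs so only the k largest are ordered, which is O(n log k) rather than a full sort. A standalone C++ sketch with hypothetical names, not Dragon's operator code:

#include <algorithm>
#include <functional>
#include <utility>
#include <vector>

// Returns true if `label` is among the top_k highest scores.
bool HitTopK(const std::vector<float>& scores, int label, int top_k) {
    std::vector<std::pair<float, int>> vec;
    vec.reserve(scores.size());
    for (int k = 0; k < (int)scores.size(); k++)
        vec.push_back(std::make_pair(scores[k], k));
    std::partial_sort(
        vec.begin(), vec.begin() + top_k, vec.end(),
        std::greater<std::pair<float, int>>());  // highest scores first
    for (int k = 0; k < top_k; k++)
        if (vec[k].second == label) return true;
    return false;
}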
...
...
Dragon/src/operators/misc/gradient_op.cc
...
@@ -18,43 +18,19 @@ void GradientGenerateOp<Context>::RunImpl() {
 template <class Context>
 void GradientGenerateOp<Context>::RunOnDevice() {
-    if (XIsType(X(0), bool)) { RunImpl<bool>(); }
-    else if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0), {
-            "bool", "int8", "uint8", "int32", "int64",
-            "float16", "float32", "float64",
-        });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>
+    >::Call(this, X(0));
 }

-DEPLOY_CPU(GradientGenerate);
-#ifdef WITH_CUDA
-DEPLOY_CUDA(GradientGenerate);
-#endif
-OPERATOR_SCHEMA(GradientGenerate);
-
 template <class Context> template <typename T>
 void GradientGatherOp<Context>::RunImpl() {
     int64_t count = Y(0)->count();
     auto* y = Y(0)->template mutable_data<T, Context>();
     if (indices.size() == 1) {
         auto* x = X(indices[0]).template data<T, Context>();
-        ctx()->template Copy<T, Context, Context>(count, y, x);
+        math::Copy(count, x, y, ctx());
     } else if (indices.size() == 2) {
         CHECK_EQ(count, X(indices[1]).count());
         auto* a = X(indices[0]).template data<T, Context>();
...
@@ -63,7 +39,7 @@ void GradientGatherOp<Context>::RunImpl() {
     } else {
         size_t i = 1;
         auto* x = X(indices[0]).template data<T, Context>();
-        ctx()->template Copy<T, Context, Context>(count, y, x);
+        math::Copy(count, x, y, ctx());
         while (i < indices.size()) {
             if (indices.size() - i >= 2) {
                 auto* a = X(indices[i]).template data<T, Context>();
...
@@ -84,34 +60,12 @@ void GradientGatherOp<Context>::RunOnDevice() {
     auto& Xi = X(indices[0]);
     Y(0)->ReshapeLike(Xi);
-    if (XIsType(Xi, int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(Xi, uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(Xi, int)) { RunImpl<int>(); }
-    else if (XIsType(Xi, int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(Xi, float16)) { RunImpl<float16>(); }
-    else if (XIsType(Xi, float)) { RunImpl<float>(); }
-    else if (XIsType(Xi, double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(Xi, {
-            "int8", "uint8", "int32", "int64",
-            "float16", "float32", "float64",
-        });
-    }
+    DispatchHelper<TensorTypes
+        <int8_t, uint8_t, int, int64_t,
+         float16, float, double>
+    >::Call(this, Xi);
 }

-DEPLOY_CPU(GradientGather);
-#ifdef WITH_CUDA
-DEPLOY_CUDA(GradientGather);
-#endif
-OPERATOR_SCHEMA(GradientGather).NumOutputs(1);
-
 template <class Context> template <typename T>
 void GradientAddOp<Context>::RunImpl() {
     auto* x = X(1).template data<T, Context>();
...
@@ -124,37 +78,12 @@ void GradientAddOp<Context>::RunOnDevice() {
     CHECK_EQ(X(0).name(), Y(0)->name())
         << "\nRequires X(0) == Y(0).";
-    if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), uint8_t)) { RunImpl<uint8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0), {
-            "int8", "uint8", "int32", "int64",
-            "float16", "float32", "float64",
-        });
-    }
+    DispatchHelper<TensorTypes
+        <int8_t, uint8_t, int, int64_t,
+         float16, float, double>
+    >::Call(this, X(0));
 }

-DEPLOY_CPU(GradientAdd);
-#ifdef WITH_CUDA
-DEPLOY_CUDA(GradientAdd);
-#endif
-OPERATOR_SCHEMA(GradientAdd)
-    .NumInputs(2).NumOutputs(1)
-    .Inplace({ { 0, 0 } });
-
 template <class Context>
 void StopGradientOp<Context>::RunOnDevice() {
     if (Y(0)->name() != X(0).name()) {
...
@@ -163,14 +92,53 @@ void StopGradientOp<Context>::RunOnDevice() {
     }
 }

+DEPLOY_CPU(GradientGenerate);
+#ifdef WITH_CUDA
+DEPLOY_CUDA(GradientGenerate);
+#endif
+
+DEPLOY_CPU(GradientGather);
+#ifdef WITH_CUDA
+DEPLOY_CUDA(GradientGather);
+#endif
+
+DEPLOY_CPU(GradientAdd);
+#ifdef WITH_CUDA
+DEPLOY_CUDA(GradientAdd);
+#endif
+
 DEPLOY_CPU(StopGradient);
 #ifdef WITH_CUDA
 DEPLOY_CUDA(StopGradient);
 #endif

+OPERATOR_SCHEMA(GradientGenerate)
+     /* X(0), ... */
+    .NumInputs(1, INT_MAX)
+     /* Y(0), ... */
+    .NumOutputs(1, INT_MAX);
+
+OPERATOR_SCHEMA(GradientGather)
+     /* X(0), ... */
+    .NumInputs(1, INT_MAX)
+     /* Y */
+    .NumOutputs(1);
+
+OPERATOR_SCHEMA(GradientAdd)
+     /* X(0), X(1) */
+    .NumInputs(2)
+     /* Y */
+    .NumOutputs(1)
+     /* X(0) => Y */
+    .Inplace({ { 0, 0 } });
+
 OPERATOR_SCHEMA(StopGradient)
-    .NumInputs(1).NumOutputs(1)
-    .Inplace({ { 0, 0 } });;
+     /* X */
+    .NumInputs(1)
+     /* Y */
+    .NumOutputs(1)
+     /* X => Y */
+    .Inplace({ { 0, 0 } });

 NO_GRADIENT(StopGradient);
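Beyond the dispatcher, the gather hunks above also swap the copy primitive: the old context method ctx()->Copy<T, Context, Context>(count, y, x) took the destination before the source, while the new math::Copy(count, x, y, ctx()) reads source-first. A hedged CPU-only stand-in for the free-function form (the real routine is templated over Context and would issue a device memcpy or kernel instead):

#include <algorithm>
#include <cstdint>

namespace math {

// Source-first signature: copies n elements from x into y.
// A real Context would select memcpy, cudaMemcpyAsync, etc.
template <typename T, class Context>
void Copy(int64_t n, const T* x, T* y, Context* /* ctx */) {
    std::copy(x, x + n, y);
}

}  // namespace math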
...
...
Dragon/src/operators/misc/initialize_op.cc
...
@@ -5,7 +5,7 @@ namespace dragon {
 template <class Context> template <typename T>
 void InitializeOp<Context>::RunImpl() {
-    unique_ptr<Filler<T, Context> > f;
+    unique_ptr<Filler<T, Context>> f;
     f.reset(CreateFiller<T, Context>(proto_));
     f->Fill(Y(0), ctx());
 }
...
...
Dragon/src/operators/misc/python_op.cc
...
@@ -152,9 +152,11 @@ DEPLOY_CUDA(TemplateGradient);
 #endif

 OPERATOR_SCHEMA(TemplateGradient);

-class GetTemplateGradient final : public GradientMakerBase {
+namespace {
+
+class GradientMaker final : public GradientMakerBase {
  public:
-    GRADIENT_MAKER_CTOR(GetTemplateGradient);
+    GRADIENT_MAKER_CTOR(GradientMaker);
     vector<OperatorDef> MakeDef() override {
         vector<string> inputs, outputs;
         for (auto input : def.input())
             inputs.push_back(input);
...
@@ -164,7 +166,9 @@ class GetTemplateGradient final : public GradientMakerBase {
     }
 };

-REGISTER_GRADIENT(Template, GetTemplateGradient);
+}  // namespace
+
+REGISTER_GRADIENT(Template, GradientMaker);

 }  // namespace dragon
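The python_op change is a small but repeatable cleanup: the gradient maker loses its per-op class name and moves into an anonymous namespace, so every .cc file can reuse the same local name GradientMaker without violating the one-definition rule. A minimal sketch of the pattern (GradientMakerBase, GRADIENT_MAKER_CTOR, and REGISTER_GRADIENT are Dragon's; the body here is illustrative only):

namespace {

class GradientMaker final : public GradientMakerBase {
 public:
    GRADIENT_MAKER_CTOR(GradientMaker);
    vector<OperatorDef> MakeDef() override {
        // ... assemble the gradient OperatorDefs from `def`, as in the diff ...
        return vector<OperatorDef>();
    }
};

}  // namespace

REGISTER_GRADIENT(Template, GradientMaker);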
...
...
Dragon/src/operators/mpi/mpi_broadcast_op.cc
...
@@ -35,22 +35,10 @@ void MPIBroadcastOp<Context>::RunOnDevice() {
     BCast(dims.data(), ndim);
     Y(0)->Reshape(dims);
-    if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0), {
-            "int8", "int32", "int64",
-            "float16", "float32",
-        });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>
+    >::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -74,22 +62,10 @@ template <class Context>
 void MPIBroadcastGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(-1));
-    if (XIsType(X(-1), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(-1), int)) { RunImpl<int>(); }
-    else if (XIsType(X(-1), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(-1), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(-1), float)) { RunImpl<float>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(-1), {
-            "int8", "int32", "int64",
-            "float16", "float32",
-        });
-    }
+    DispatchHelper<TensorTypes
+        <int8_t, uint8_t, int, int64_t,
+         float16, float, double>
+    >::Call(this, X(-1));
 }

 DEPLOY_CPU(MPIBroadcast);
...
...
Dragon/src/operators/mpi/mpi_gather_op.cc
...
@@ -50,22 +50,10 @@ void MPIGatherOp<Context>::RunOnDevice() {
         }
     }
-    if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0), {
-            "int8", "int32", "int64",
-            "float16", "float32",
-        });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>
+    >::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -88,22 +76,10 @@ template <class Context>
 void MPIGatherGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), int8_t)) { RunImpl<int8_t>(); }
-    else if (XIsType(X(0), int)) { RunImpl<int>(); }
-    else if (XIsType(X(0), int64_t)) { RunImpl<int64_t>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0), {
-            "int8", "int32", "int64",
-            "float16", "float32",
-        });
-    }
+    DispatchHelper<TensorTypes
+        <bool, int8_t, uint8_t, int, int64_t,
+         float16, float, double>
+    >::Call(this, X(0));
 }

 DEPLOY_CPU(MPIGather);
...
...
Dragon/src/operators/norm/l2_norm_op.cc
...
@@ -73,17 +73,9 @@ void L2NormOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float16", "float32", "float64" });
-    }
+    DispatchHelper<TensorTypes
+        <float, float16, double>
+    >::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -177,17 +169,9 @@ void L2NormGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), double)) { RunImpl<double>(); }
-    else {
-        LOG(FATAL) << DTypeString(X(0),
-            { "float16", "float32", "float64" });
-    }
+    DispatchHelper<TensorTypes
+        <float, float16, double>
+    >::Call(this, X(0));
 }

 DEPLOY_CPU(L2Norm);
...
...
Dragon/src/operators/recurrent/cudnn_recurrent_op.cc
...
@@ -211,15 +211,8 @@ void CuDNNRecurrentOp<Context>::RunImpl() {
 template <class Context>
 void CuDNNRecurrentOp<Context>::RunOnDevice() {
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32", "float16" }); }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -313,15 +306,8 @@ void CuDNNRecurrentGradientOp<Context>::RunOnDevice() {
     Y(2)->ReshapeLike(X(2));  // dHx
     Y(3)->ReshapeLike(X(3));  // dCx
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32", "float16" }); }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 DEPLOY_CUDNN(Recurrent);
...
...
Dragon/src/operators/recurrent/rnn_param_op.cc
...
@@ -52,15 +52,8 @@ void RNNParamSetOp<Context>::RunImpl() {
 template <class Context>
 void RNNParamSetOp<Context>::RunOnDevice() {
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32", "float16" }); }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 DEPLOY_CPU(RNNParamSet);
...
...
Dragon/src/operators/vision/bias_add_op.cc
...
@@ -43,13 +43,8 @@ void BiasAddOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32" }); }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -102,13 +97,8 @@ void BiasAddGradientOp<Context>::RunOnDevice() {
     Y(1)->ReshapeLike(X(0));
-    if (XIsType(X(-1), float)) { RunImpl<float>(); }
-    else { LOG(FATAL) << DTypeString(X(-1), { "float32" }); }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(-1));
 }

 DEPLOY_CPU(BiasAdd);
...
...
Dragon/src/operators/vision/bilinear_resize_op.cc
...
@@ -48,14 +48,9 @@ void BilinearResizeOp<Context>::RunOnDevice() {
     }
     Y(0)->Reshape(out_shape);
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32" }); }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -90,14 +85,9 @@ void BilinearResizeGradientOp<Context>::RunImpl() {
 template <class Context>
 void BilinearResizeGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32" }); }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 DEPLOY_CPU(BilinearResize);
...
...
Dragon/src/operators/vision/conv2d_op.cc
...
@@ -27,13 +27,8 @@ void Conv2dOp<Context>::RunOnDevice() {
     if (data_format() == "NHWC" && group_ != 1)
         LOG(FATAL) << "GroupConv(NHWC) is not supported.";
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32" }); }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -66,13 +61,8 @@ void Conv2dGradientOp<Context>::RunOnDevice() {
     if (data_format() == "NHWC" && group_ != 1)
         LOG(FATAL) << "GroupConv(NHWC) is not supported.";
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32" }); }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 DEPLOY_CPU(Conv2d);
...
...
Dragon/src/operators/vision/conv2d_transpose_op.cc
...
@@ -32,13 +32,8 @@ void ConvTranspose2dOp<Context>::RunOnDevice() {
     for (int i = 0; i < num_axes_; i++)
         out_shape_[i] = X(0).dim(axis_ + i);
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32" }); }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -76,13 +71,8 @@ void ConvTranspose2dGradientOp<Context>::RunOnDevice() {
     for (int i = 0; i < num_axes_; i++)
         out_shape_[i] = X(0).dim(axis_ + i);
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32" }); }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 DEPLOY_CPU(ConvTranspose2d);
...
...
Dragon/src/operators/vision/cudnn_bias_add_op.cc
...
@@ -62,15 +62,8 @@ void CuDNNBiasAddOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32", "float16" }); }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -129,15 +122,8 @@ void CuDNNBiasAddGradientOp<Context>::RunOnDevice() {
     Y(1)->ReshapeLike(X(0));
-    if (XIsType(X(-1), float)) { RunImpl<float>(); }
-    else if (XIsType(X(-1), float16)) { RunImpl<float16>(); }
-    else { LOG(FATAL) << DTypeString(X(-1), { "float32", "float16" }); }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(-1));
 }

 DEPLOY_CUDNN(BiasAdd);
...
...
Dragon/src/operators/vision/cudnn_conv2d_op.cc
...
@@ -216,15 +216,8 @@ void CuDNNConv2dOp<Context>::RunOnDevice() {
 #endif
     ConvOpBase<Context>::Reshape();
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32", "float16" }); }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 template <class Context>
...
@@ -474,15 +467,8 @@ void CuDNNConv2dGradientOp<Context>::RunOnDevice() {
 #endif
     ConvOpBase<Context>::Reshape(true);
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32", "float16" }); }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 DEPLOY_CUDNN(Conv2d);
...
...
Dragon/src/operators/vision/cudnn_conv2d_transpose_op.cc
...
@@ -214,15 +214,8 @@ void CuDNNConvTranspose2dOp<Context>::RunOnDevice() {
 #endif
     ConvOpBase<Context>::Reshape();
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32", "float16" }); }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 DEPLOY_CUDNN(ConvTranspose2d);
...
@@ -471,15 +464,8 @@ void CuDNNConvTranspose2dGradientOp<Context>::RunOnDevice() {
 #endif
     ConvOpBase<Context>::Reshape(true);
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32", "float16" }); }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 DEPLOY_CUDNN(ConvTranspose2dGradient);
...
...
Dragon/src/operators/vision/cudnn_depthwise_conv2d_op.cc
...
@@ -68,13 +68,8 @@ void CuDNNDepthwiseConv2dOp<Context>::RunOnDevice() {
         << "\nExcepted in/out channels unchanged.";
     ConvOpBase<Context>::Reshape();
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32" }); }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -152,13 +147,8 @@ void CuDNNDepthwiseConv2dGradientOp<Context>::RunOnDevice() {
         == "NCHW" ? X(0).dim(1) : X(0).dim(-1);
     ConvOpBase<Context>::Reshape(true);
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32" }); }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 DEPLOY_CUDNN(DepthwiseConv2d);
...
...
Dragon/src/operators/vision/cudnn_lrn_op.cc
...
@@ -14,10 +14,13 @@ void CuDNNLRNOp<Context>::RunImpl() {
         auto* y = Y(0)->template mutable_data<T, Context>();
         CUDNN_CHECK(cudnnLRNCrossChannelForward(
-            ctx()->cudnn_handle(), lrn_desc_,
+            ctx()->cudnn_handle(),
+            lrn_desc_,
             CUDNN_LRN_CROSS_CHANNEL_DIM1,
-            CuDNNType<T>::one, input_desc_, x,
-            CuDNNType<T>::zero, output_desc_, y
+            CuDNNType<T>::one,
+            input_desc_, x,
+            CuDNNType<T>::zero,
+            output_desc_, y
         ));
     } else {
         LOG(FATAL) << "Unknown DataFormat: " << data_format();
...
@@ -29,15 +32,8 @@ void CuDNNLRNOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
     if (this->mode_ == "ACROSS_CHANNELS") {
-        if (XIsType(X(0), float)) { RunImpl<float>(); }
-        else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-        else { LOG(FATAL) << DTypeString(X(0), { "float32", "float16" }); }
+        DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
     } else if (this->mode_ == "WITHIN_CHANNEL") {
         LRNOp<Context>::RunOnDevice();
     } else {
...
@@ -57,7 +53,8 @@ void CuDNNLRNGradientOp<Context>::RunImpl() {
         auto* dx = Y(0)->template mutable_data<T, Context>();
         CUDNN_CHECK(cudnnLRNCrossChannelBackward(
-            ctx()->cudnn_handle(), lrn_desc_,
+            ctx()->cudnn_handle(),
+            lrn_desc_,
             CUDNN_LRN_CROSS_CHANNEL_DIM1,
             CuDNNType<T>::one, input_desc_, y,
             input_desc_, dy,
...
@@ -76,15 +73,8 @@ void CuDNNLRNGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
     if (this->mode_ == "ACROSS_CHANNELS") {
-        if (XIsType(X(0), float)) { RunImpl<float>(); }
-        else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-        else { LOG(FATAL) << DTypeString(X(0), { "float32", "float16" }); }
+        DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
     } else if (this->mode_ == "WITHIN_CHANNEL") {
         LRNGradientOp<Context>::RunOnDevice();
     } else {
...
...
Dragon/src/operators/vision/cudnn_pool2d_op.cc
...
@@ -39,9 +39,12 @@ void CuDNNPool2dOp<Context>::RunImpl() {
     auto* y = Y(0)->template mutable_data<T, Context>();
     CUDNN_CHECK(cudnnPoolingForward(
-        ctx()->cudnn_handle(), pool_desc_,
-        CuDNNType<T>::one, input_desc_, x,
-        CuDNNType<T>::zero, output_desc_, y
+        ctx()->cudnn_handle(),
+        pool_desc_,
+        CuDNNType<T>::one,
+        input_desc_, x,
+        CuDNNType<T>::zero,
+        output_desc_, y
     ));
 }
...
@@ -49,15 +52,8 @@ template <class Context>
 void CuDNNPool2dOp<Context>::RunOnDevice() {
     Pool2dOp<Context>::Reshape();
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32", "float16" }); }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -97,7 +93,8 @@ void CuDNNPool2dGradientOp<Context>::RunImpl() {
     auto* dx = Y(0)->template mutable_data<T, Context>();
     CUDNN_CHECK(cudnnPoolingBackward(
-        ctx()->cudnn_handle(), pool_desc_,
+        ctx()->cudnn_handle(),
+        pool_desc_,
         CuDNNType<T>::one, input_desc_, y,
         input_desc_, dy,
...
@@ -111,15 +108,8 @@ template <class Context>
 void CuDNNPool2dGradientOp<Context>::RunOnDevice() {
     Pool2dGradientOp<Context>::Reshape();
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32", "float16" }); }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 DEPLOY_CUDNN(Pool2d);
...
...
Dragon/src/operators/vision/depthwise_conv2d_op.cc
...
@@ -40,13 +40,8 @@ void DepthwiseConv2dOp<Context>::RunOnDevice() {
         << "\nExcepted in/out channels unchanged.";
     ConvOpBase<Context>::Reshape();
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32" }); }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -98,13 +93,8 @@ void DepthwiseConv2dGradientOp<Context>::RunOnDevice() {
         X(0).dim(1) : X(0).dim(-1);
     ConvOpBase<Context>::Reshape(true);
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32" }); }
+    DispatchHelper<TensorTypes<float>>::Call(this, X(0));
 }

 DEPLOY_CPU(DepthwiseConv2d);
...
...
Dragon/src/operators/vision/drop_block2d_op.cc
...
@@ -102,15 +102,8 @@ template <class Context>
 void DropBlock2dOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32", "float16" }); }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
@@ -144,15 +137,8 @@ template <class Context>
 void DropBlock2dGradientOp<Context>::RunOnDevice() {
     Y(0)->ReshapeLike(X(0));
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32", "float16" }); }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 DEPLOY_CPU(DropBlock2d);
...
...
Dragon/src/operators/vision/nn_resize_op.cc
...
@@ -49,15 +49,8 @@ void NNResizeOp<Context>::RunOnDevice() {
     Y(0)->Reshape(out_shape);
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32", "float16" }); }
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
...
Dragon/src/operators/vision/roi_align_op.cc
...
@@ -33,15 +33,8 @@ void ROIAlignOp<Context>::RunOnDevice() {
         pool_w_  /* feature_w */
     });
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32", "float16" }); };
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
...
Dragon/src/operators/vision/roi_pool_op.cc
...
@@ -37,15 +37,8 @@ void ROIPoolOp<Context>::RunOnDevice() {
         pool_w_  /* feature_w */
     });
-    if (XIsType(X(0), float)) { RunImpl<float>(); }
-    else if (XIsType(X(0), float16)) { RunImpl<float16>(); }
-    else { LOG(FATAL) << DTypeString(X(0), { "float32", "float16" }); };
+    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
 }

 template <class Context> template <typename T>
...
...
Dragon/src/utils/math_functions.cu
...
@@ -111,8 +111,8 @@ DEFINE_COPY_FUNC(double);
         T* y, \
         CUDAContext* ctx) { \
         _##name \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, x, y \
         ); \
     }
...
@@ -245,8 +245,8 @@ __global__ void _AddScalar(
             sizeof(T) * n, ctx->cuda_stream())); \
     } else { \
         _Set \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, alpha, y \
         ); \
     } \
...
@@ -273,15 +273,15 @@ DEFINE_SET_FUNC(double);
     if (type == 0) { \
         /*! Row - BroadcastX */ \
         _RowBroadcastSet \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, cols, x, y \
         ); \
     } else if (type == 1) { \
         /*! Col - BroadcastX */ \
         _ColBroadcastSet \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, cols, x, y \
         ); \
     } \
...
@@ -307,8 +307,8 @@ DEFINE_BROADCAST_SET_FUNC(double);
         T* y, \
         CUDAContext* ctx) { \
         _Pow \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, cast::to<T>(exp), x, y \
         ); \
     }
...
@@ -337,8 +337,8 @@ DEFINE_POWX_FUNC(double);
         } return; \
     } \
     _Scale \
-        << < CUDA_BLOCKS(n), CUDA_THREADS, \
-            0, ctx->cuda_stream() >> >( \
+        <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+            0, ctx->cuda_stream() >>>( \
         n, _alpha_, x, y \
     ); \
 }
...
@@ -386,8 +386,8 @@ DEFINE_CUBLAS_SCALE_FUNC(double, cublasDscal_v2);
         T* y, \
         CUDAContext* ctx) { \
         _Axpy \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, cast::to<T>(alpha), x, y \
         ); \
     }
...
@@ -434,8 +434,8 @@ DEFINE_AXPY_FUNC(int64_t);
         T* y, \
         CUDAContext* ctx) { \
         _Axpby \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, \
             cast::to<T>(alpha), x, \
             cast::to<T>(beta), y \
...
@@ -461,8 +461,8 @@ DEFINE_AXPBY_FUNC(double);
         T _alpha_ = (T)alpha; \
         if (_alpha_ == T(0)) return; \
         _AddScalar \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, _alpha_, y \
         ); \
     }
...
@@ -506,8 +506,8 @@ __global__ void _InvStd(
         T* y, \
         CUDAContext* ctx) { \
         _InvStd \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, cast::to<T>(eps), x, y \
         ); \
     }
...
@@ -598,8 +598,8 @@ template <> float ASum<float, CUDAContext>(
         T* y, \
         CUDAContext* ctx) { \
         _##name \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, a, b, y \
         ); \
     }
...
@@ -829,29 +829,29 @@ __global__ void _ColBroadcastDiv(
     if (type == 0) { \
         /*! Row - BroadcastB */ \
         _RowBroadcast##name<T, false> \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, cols, a, b, y \
         ); \
     } else if (type == 1) { \
         /*! Col - BroadcastB */ \
         _ColBroadcast##name<T, false> \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, cols, a, b, y \
         ); \
     } else if (type == 2) { \
         /*! Row - BroadcastA */ \
         _RowBroadcast##name<T, true> \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, cols, a, b, y \
         ); \
     } else if (type == 3) { \
         /*! Col - BroadcastA */ \
         _ColBroadcast##name<T, true> \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, cols, a, b, y \
         ); \
     } else { \
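These math_functions.cu hunks are more than cosmetic: the CUDA launch chevrons must appear as the single tokens <<< and >>>, so a wrapped macro line that leaves "<< <" or ">> >" behind is tokenized as a shift operator plus a comparison and fails to parse as a kernel launch. A self-contained sketch of the corrected form, with a hypothetical kernel and launch sizes standing in for Dragon's macro-generated ones:

#include <cuda_runtime.h>

__global__ void _Scale(int n, float alpha, const float* x, float* y) {
    // One thread per element, grid-strided indexing omitted for brevity.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) y[i] = alpha * x[i];
}

void Scale(int n, float alpha, const float* x, float* y, cudaStream_t stream) {
    const int kThreads = 256;
    const int kBlocks = (n + kThreads - 1) / kThreads;
    // The launch configuration must use the single <<< ... >>> tokens.
    _Scale<<<kBlocks, kThreads, 0, stream>>>(n, alpha, x, y);
}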
...
...
Dragon/src/utils/math_functions.fp16.cu
...
@@ -48,16 +48,16 @@ template <> void Exp<float16, CUDAContext>(
     CUDAContext* ctx) {
     if ((n & 1) == 0) {
         _ExpHalf2
-            << < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n >> 1), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n >> 1,
             reinterpret_cast<const half2*>(x),
             reinterpret_cast<half2*>(y)
         );
     } else {
         _ExpHalf
-            << < CUDA_BLOCKS(n), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n,
             reinterpret_cast<const half*>(x),
             reinterpret_cast<half*>(y)
...
@@ -94,16 +94,16 @@ template <> void Log<float16, CUDAContext>(
     CUDAContext* ctx) {
     if ((n & 1) == 0) {
         _LogHalf2
-            << < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n >> 1), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n >> 1,
             reinterpret_cast<const half2*>(x),
             reinterpret_cast<half2*>(y)
         );
     } else {
         _LogHalf
-            << < CUDA_BLOCKS(n), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n,
             reinterpret_cast<const half*>(x),
             reinterpret_cast<half*>(y)
...
@@ -140,16 +140,16 @@ template <> void Inv<float16, CUDAContext>(
     CUDAContext* ctx) {
     if ((n & 1) == 0) {
         _InvHalf2
-            << < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n >> 1), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n >> 1,
             reinterpret_cast<const half2*>(x),
             reinterpret_cast<half2*>(y)
         );
     } else {
         _InvHalf
-            << < CUDA_BLOCKS(n), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n,
             reinterpret_cast<const half*>(x),
             reinterpret_cast<half*>(y)
...
@@ -186,16 +186,16 @@ template <> void Sqrt<float16, CUDAContext>(
     CUDAContext* ctx) {
     if ((n & 1) == 0) {
         _SqrtHalf2
-            << < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n >> 1), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n >> 1,
             reinterpret_cast<const half2*>(x),
             reinterpret_cast<half2*>(y)
         );
     } else {
         _SqrtHalf
-            << < CUDA_BLOCKS(n), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n,
             reinterpret_cast<const half*>(x),
             reinterpret_cast<half*>(y)
...
@@ -232,16 +232,16 @@ template <> void RSqrt<float16, CUDAContext>(
     CUDAContext* ctx) {
     if ((n & 1) == 0) {
         _RSqrtHalf2
-            << < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n >> 1), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n >> 1,
             reinterpret_cast<const half2*>(x),
             reinterpret_cast<half2*>(y)
         );
     } else {
         _RSqrtHalf
-            << < CUDA_BLOCKS(n), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n,
             reinterpret_cast<const half*>(x),
             reinterpret_cast<half*>(y)
...
@@ -278,16 +278,16 @@ template <> void Square<float16, CUDAContext>(
     CUDAContext* ctx) {
     if ((n & 1) == 0) {
         _SquareHalf2
-            << < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n >> 1), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n >> 1,
             reinterpret_cast<const half2*>(x),
             reinterpret_cast<half2*>(y)
         );
     } else {
         _SquareHalf
-            << < CUDA_BLOCKS(n), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n,
             reinterpret_cast<const half*>(x),
             reinterpret_cast<half*>(y)
...
@@ -330,16 +330,16 @@ template <> void Set<float16, CUDAContext>(
     }
     if ((n & 1) == 0) {
         _SetHalf<half2>
-            << < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n >> 1), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n >> 1,
             cast::to<half2>(alpha),
             reinterpret_cast<half2*>(y)
         );
     } else {
         _SetHalf<float16>
-            << < CUDA_BLOCKS(n), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n, alpha, y
         );
     }
...
@@ -380,8 +380,8 @@ template <> void Pow<float16, CUDAContext>(
     CHECK(alpha == 2.f) << "\nRequired power = 2";
     if ((n & 1) == 0) {
         _PowHalf2
-            << < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n >> 1), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n >> 1,
             alpha,
             reinterpret_cast<const half2*>(x),
...
@@ -389,8 +389,8 @@ template <> void Pow<float16, CUDAContext>(
         );
     } else {
         _PowHalf
-            << < CUDA_BLOCKS(n), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n,
             alpha,
             reinterpret_cast<const half*>(x),
...
@@ -487,16 +487,16 @@ template <> void AddScalar<float16, CUDAContext>(
     CUDAContext* ctx) {
     if ((n & 1) == 0) {
         _AddScalarHalf2
-            << < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n >> 1), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n >> 1,
             cast::to<half2>(alpha),
             reinterpret_cast<half2*>(y)
         );
     } else {
         _AddScalarHalf
-            << < CUDA_BLOCKS(n), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n,
             cast::to<half>(alpha),
             reinterpret_cast<half*>(y)
...
@@ -546,8 +546,8 @@ template <> void InvStd<float16, CUDAContext>(
     CUDAContext* ctx) {
     if ((n & 1) == 0) {
         _InvStdHalf2
-            << < CUDA_BLOCKS(n >> 1), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n >> 1), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n >> 1,
             cast::to<half2>(eps),
             reinterpret_cast<const half2*>(x),
...
@@ -555,8 +555,8 @@ template <> void InvStd<float16, CUDAContext>(
         );
     } else {
         _InvStdHalf
-            << < CUDA_BLOCKS(n), CUDA_THREADS,
-                0, ctx->cuda_stream() >> >(
+            <<< CUDA_BLOCKS(n), CUDA_THREADS,
+                0, ctx->cuda_stream() >>>(
             n,
             cast::to<half>(eps),
             reinterpret_cast<const half*>(x),
...
@@ -668,8 +668,8 @@ __global__ void _DivHalf(
     CUDAContext* ctx) { \
     if ((n & 1) == 0) { \
         _##name##Half2 \
-            << < CUDA_BLOCKS(n >> 1), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n >> 1), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n >> 1, \
             reinterpret_cast<const half2*>(a), \
             reinterpret_cast<const half2*>(b), \
...
@@ -677,8 +677,8 @@ __global__ void _DivHalf(
         ); \
     } else { \
         _##name##Half \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, \
             reinterpret_cast<const half*>(a), \
             reinterpret_cast<const half*>(b), \
...
@@ -699,8 +699,8 @@ template <> void Div<float16, CUDAContext>(
     float16* y,
     CUDAContext* ctx) {
     _DivHalf
-        << < CUDA_BLOCKS(n), CUDA_THREADS,
-            0, ctx->cuda_stream() >> >(
+        <<< CUDA_BLOCKS(n), CUDA_THREADS,
+            0, ctx->cuda_stream() >>>(
         n,
         reinterpret_cast<const half*>(a),
         reinterpret_cast<const half*>(b),
...
@@ -884,8 +884,8 @@ __global__ void _ColBroadcastDivHalf(
     if (type == 0) { \
         /*! Row - BroadcastB */ \
         _RowBroadcast##name##Half<false> \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, cols, \
             reinterpret_cast<const half*>(a), \
             reinterpret_cast<const half*>(b), \
...
@@ -894,8 +894,8 @@ __global__ void _ColBroadcastDivHalf(
     } else if (type == 1) { \
         /*! Col - BroadcastB */ \
         _ColBroadcast##name##Half<false> \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, cols, \
             reinterpret_cast<const half*>(a), \
             reinterpret_cast<const half*>(b), \
...
@@ -904,8 +904,8 @@ __global__ void _ColBroadcastDivHalf(
     } else if (type == 2) { \
         /*! Row - BroadcastA */ \
         _RowBroadcast##name##Half<true> \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, cols, \
             reinterpret_cast<const half*>(a), \
             reinterpret_cast<const half*>(b), \
...
@@ -914,8 +914,8 @@ __global__ void _ColBroadcastDivHalf(
     } else if (type == 3) { \
         /*! Col - BroadcastA */ \
         _ColBroadcast##name##Half<true> \
-            << < CUDA_BLOCKS(n), CUDA_THREADS, \
-                0, ctx->cuda_stream() >> >( \
+            <<< CUDA_BLOCKS(n), CUDA_THREADS, \
+                0, ctx->cuda_stream() >>>( \
             n, cols, \
             reinterpret_cast<const half*>(a), \
             reinterpret_cast<const half*>(b), \
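The fp16 file follows one template throughout: when n is even, the buffers are reinterpreted as packed half2 pairs so each thread handles two elements per instruction, otherwise a scalar half kernel runs. A hedged sketch of that even/odd split (kernel names and the 256-thread block size are illustrative; __hmul2/__hmul require compute capability 5.3+):

#include <cuda_fp16.h>

__global__ void _SquareHalf2(int m, const half2* x, half2* y) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < m) y[i] = __hmul2(x[i], x[i]);  // two fp16 squares at once
}

__global__ void _SquareHalf(int n, const half* x, half* y) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) y[i] = __hmul(x[i], x[i]);
}

void Square(int n, const half* x, half* y, cudaStream_t stream) {
    const int kThreads = 256;
    if ((n & 1) == 0) {  // even: process n/2 packed half2 lanes
        int m = n >> 1;
        _SquareHalf2<<<(m + kThreads - 1) / kThreads, kThreads, 0, stream>>>(
            m, reinterpret_cast<const half2*>(x), reinterpret_cast<half2*>(y));
    } else {             // odd: fall back to one element per thread
        _SquareHalf<<<(n + kThreads - 1) / kThreads, kThreads, 0, stream>>>(
            n, x, y);
    }
}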