Commit d1f714ea by Ting PAN

Apply the dispatcher to RunImpl

1 parent bd84b7fd
Showing with 668 additions and 868 deletions
......@@ -35,6 +35,7 @@
#include "core/types.h"
#include "proto/dragon.pb.h"
#include "utils/string.h"
#include "utils/logging.h"
namespace dragon {
......
......@@ -85,6 +85,8 @@ GraphBase* NewGraph(
const GraphDef& def,
Workspace* ws);
/* Macros */
DECLARE_REGISTRY(
GraphRegistry,
GraphBase,
......
......@@ -43,7 +43,7 @@ class GraphGradientMaker {
bool CheckGrad(
const OperatorDef& forward_op,
const Set<string>& targets,
vector< pair<string, int> >& gen_grads);
vector<pair<string, int>>& gen_grads);
string GetOperatorName();
......
......@@ -100,7 +100,7 @@ class OperatorBase {
/*! \brief Return the specified argument */
const Argument& arg(const string& name) { return *(args_[name]); }
typedef Map<string, vector<OperatorBase*> > SubGraph;
typedef Map<string, vector<OperatorBase*>> SubGraph;
/*! \brief Return the recomputing subgraph of this operator */
SubGraph& subgraph() { return subgraph_; }
......@@ -221,7 +221,7 @@ OperatorBase* NewOperator(
const OperatorDef& def,
Workspace* ws);
/*! Macros */
/* Macros */
#define OpArg OperatorBase::Arg
#define OpArgs OperatorBase::Args
......@@ -266,7 +266,7 @@ DECLARE_REGISTRY(
const OperatorDef&,
Workspace*);
/*! NVIDIA's Accelerated Library - CUDNN */
/* NVIDIA's Accelerated Library - CUDNN */
DECLARE_REGISTRY(
CUDNNOperatorRegistry,
......@@ -274,7 +274,7 @@ DECLARE_REGISTRY(
const OperatorDef&,
Workspace*);
/*! CAMBRICON's Accelerated Library - CNML */
/* CAMBRICON's Accelerated Library - CNML */
DECLARE_REGISTRY(
CNMLOperatorRegistry,
......@@ -282,13 +282,60 @@ DECLARE_REGISTRY(
const OperatorDef&,
Workspace*);
/* Dispatcher for Runtime Typed-Implementation */
#define XIsType(x, dtype) \
x.template IsType<dtype>()
template <typename... Types>
struct TensorTypes {};
template <typename Sizes, typename... Args>
struct DispatchHelper;
#define DEFINE_TENSOR_TYPES_DISPATCHER(TensorTypes, Impl) \
template <typename T, typename... Types, typename... Args> \
struct DispatchHelper<TensorTypes<T, Types...>, Args...> { \
template <typename Op> \
static void Call(Op* op, const TypeMeta& meta, string& types) { \
if (meta.Match<T>()) return op->template Impl<T, Args...>(); \
types += " * " + TypeToString<T>() + ",\n"; \
return DispatchHelper<TensorTypes<Types...>, Args...> \
::Call(op, meta, types); \
} \
template <typename Op> \
static void Call(Op* op, const Tensor& tensor) { \
string types; return Call(op, tensor.meta(), types); \
} \
}; \
template <typename... Args> \
struct DispatchHelper<TensorTypes<>, Args...> { \
template <typename Op> \
static void Call(Op* op, const TypeMeta& meta, string& types) { \
LOG(FATAL) << "Unsupported DType: " \
<< TypeMetaToString(meta) << "\n" \
<< "<" << op->type() << "Op>" \
<< " supports the following dtypes: {\n" \
<< types << "}"; \
} \
template <typename Op> \
static void Call(Op* op, const Tensor& tensor) { \
return Call(op, tensor.meta(), ""); \
} \
};
DEFINE_TENSOR_TYPES_DISPATCHER(TensorTypes, RunImpl);
#undef DEFINE_TENSOR_TYPES_DISPATCHER
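For context, a minimal sketch of how an operator might invoke this dispatcher from RunOnDevice(); it assumes the Dragon operator scaffolding, and the operator name "MyOp" and the input accessor "X(0)" are illustrative placeholders, not part of this commit:

template <class Context>
void MyOp<Context>::RunOnDevice() {
    // Matches the TypeMeta of the first input against {float, float16};
    // the first match calls the corresponding RunImpl<T>(), while an
    // unsupported dtype reaches the terminal specialization and reports
    // the collected list of supported dtypes via LOG(FATAL).
    DispatchHelper<TensorTypes<float, float16>>::Call(this, X(0));
}

The two-argument Call(op, tensor) overload seeds the recursion with an empty type list, and each non-matching step appends its TypeToString<T>() name before forwarding to the next candidate type.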
/* TensorFiller */
#define TENSOR_FILL_WITH_TYPE(tensor, shape, type) \
if (tensor.count() == 0) { \
CHECK(ws()->GetFiller(tensor.name())) \
<< "\nTensor(" << tensor.name() << ") is empty. \n" \
<< "may be specify a filler for it ?"; \
tensor.Reshape(shape); \
unique_ptr< Filler<type, Context> > filler( \
unique_ptr<Filler<type, Context>> filler( \
CreateFiller<type, Context>(*ws()->GetFiller(tensor.name()))); \
filler->Fill(&tensor, ctx()); \
} else { \
......@@ -308,7 +355,7 @@ DECLARE_REGISTRY(
<< "\nTensor(" << tensor.name() << ") is empty. \n" \
<< "may be specify a filler for it ?"; \
tensor.Reshape(shape); \
unique_ptr< Filler<T, Context> > filler( \
unique_ptr<Filler<T, Context>> filler( \
CreateFiller<T, Context>(*ws()->GetFiller(tensor.name()))); \
filler->Fill(&tensor, ctx()); \
} else { \
......@@ -322,6 +369,8 @@ DECLARE_REGISTRY(
tensor.Reshape(shape); \
}
/* Shared Multiplier */
#define DECLARE_MULTIPLIER(name, size) \
const T* name; \
{ \
......@@ -335,6 +384,8 @@ DECLARE_REGISTRY(
name = mp->template data<T, Context>(); \
}
/* Dynamic Arguments */
#define DECLARE_ARG_WITH_DESC(type, arg) \
type arg##_; \
string arg##_desc_; \
......@@ -393,8 +444,7 @@ DECLARE_REGISTRY(
#define GET_ARGS_SIZE(arg) \
(int)std::max(arg##_.size(), arg##_desc_.size())
#define XIsType(x, dtype) \
x.template IsType<dtype>()
/* Registers */
#define INSTANTIATE_OPERATOR(name, context) \
template class name##Op<context>;
......
......@@ -42,7 +42,7 @@ class OpSchema {
return *this;
}
OpSchema& Inplace(set<pair<int, int> > inplace);
OpSchema& Inplace(set<pair<int, int>> inplace);
std::function<bool(int, int)> CheckInplace;
bool AllowInplace() const { return allow_inplace_; }
......
......@@ -73,6 +73,11 @@ inline const std::string TypeMetaToString(
m2s_type_map[meta.id()] : "unknown";
}
template<typename T>
inline const std::string TypeToString() {
return TypeMetaToString(TypeMeta::Make<T>());
}
} // namespace dragon
#endif // DRAGON_CORE_TYPES_H_
\ No newline at end of file
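A minimal usage sketch of the new TypeToString<T>() helper (assuming core/types.h is on the include path); types without an entry in m2s_type_map fall back to "unknown":

#include <iostream>
#include "core/types.h"

int main() {
    // Prints the registered dtype name for float, or "unknown"
    // if the type has no entry in m2s_type_map.
    std::cout << dragon::TypeToString<float>() << std::endl;
    return 0;
}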
......@@ -13,22 +13,18 @@
#ifndef DRAGON_CORE_WORKSPACE_H_
#define DRAGON_CORE_WORKSPACE_H_
#include "core/common.h"
#include "core/graph.h"
#include "utils/string.h"
namespace dragon {
class Workspace {
public:
typedef Map<string, Map<string, int64_t> > DummyNameMap;
typedef Map<string, unique_ptr<Tensor> > TensorMap;
typedef Map<string, Map<string, int64_t>> DummyNameMap;
typedef Map<string, unique_ptr<Tensor>> TensorMap;
typedef Map<string, string> TensorAliasMap;
typedef Map<string, TensorFillerProto> TensorFillerMap;
typedef Map<string, unique_ptr<OperatorBase> > OperatorMap;
typedef Map<string, unique_ptr<GraphBase> > GraphMap;
typedef Map<string, unique_ptr<OperatorBase>> OperatorMap;
typedef Map<string, unique_ptr<GraphBase>> GraphMap;
/*! \brief Constructor */
Workspace(const string& name) : name_(name) { Initialize(); }
......
......@@ -28,6 +28,7 @@ class FullyConnectedOp final : public Operator<Context> {
USE_OPERATOR_FUNCTIONS;
void RunOnDevice();
template <typename T> void RunImpl();
template <typename T> void TransRunImpl();
template <typename T> void NoTransRunImpl();
......
......@@ -22,6 +22,7 @@ class MultinomialOp final : public Operator<Context> {
public:
MultinomialOp(const OperatorDef& def, Workspace* ws)
: Operator<Context>(def, ws),
eps_(OpArg<float>("eps", 0.f)),
normalize_(OpArg<int64_t>("normalize", 0)),
num_samples_(OpArg<int64_t>("num_samples", 1)) {}
USE_OPERATOR_FUNCTIONS;
......@@ -32,6 +33,7 @@ class MultinomialOp final : public Operator<Context> {
template <typename T> void RunImpl();
protected:
float eps_;
int64_t outer_dim_, axis_;
int64_t normalize_, num_samples_;
unique_ptr<OperatorBase> softmax_op_;
......
......@@ -26,22 +26,24 @@ inline void LoadCaffeModel(
LOG(INFO) << "Restore From Model @: " << file << "......";
LOG(INFO) << "Model Format: CaffeModel";
for (int i = 0; i < net_param.layer_size(); i++) {
const LayerParameter& layer = net_param.layer(i);
const string& layer_name = layer.name();
string prefix = layer_name + "/param:";
const auto& layer = net_param.layer(i);
const auto& layer_name = layer.name();
auto prefix = layer_name + "/param:";
for (int j = 0; j < layer.blobs_size(); j++) {
string tensor_name = prefix + std::to_string(j);
if (!ws->HasTensor(tensor_name))
LOG(WARNING) << "Tensor(" << tensor_name << ") "
<< "does not exist in any Graphs, skip.";
else{
BlobProto blob = layer.blobs(j);
vector<int64_t> dims;
for (auto dim : blob.shape().dim()) dims.push_back(dim);
Tensor* tensor = ws->GetTensor(tensor_name);
auto tensor_name = prefix + std::to_string(j);
if (!ws->HasTensor(tensor_name)) {
LOG(WARNING)
<< "Tensor(" << tensor_name << ") "
<< "does not exist in any Graphs, skip.";
} else {
auto blob = layer.blobs(j);
vec64_t tensor_shape;
for (auto dim : blob.shape().dim())
tensor_shape.push_back(dim);
auto* tensor = ws->GetTensor(tensor_name);
std::stringstream DimString;
if (dims.size() > 0) {
tensor->Reshape(dims);
if (tensor_shape.size() > 0) {
tensor->Reshape(tensor_shape);
CHECK_EQ(tensor->count(), blob.data_size())
<< "\nTensor(" << tensor_name << ") "
<< "failed to load, except size: "
......@@ -52,9 +54,9 @@ inline void LoadCaffeModel(
tensor->Reshape({ blob.data_size() });
DimString << "(missing)";
}
float* Xdata = tensor->mutable_data<float, CPUContext>();
for (int idx = 0; idx < blob.data_size(); idx++)
Xdata[idx] = blob.data(idx);
auto* x = tensor->mutable_data<float, CPUContext>();
for (int xi = 0; xi < blob.data_size(); ++xi)
x[xi] = blob.data(xi);
LOG(INFO) << "Tensor(" << tensor_name << ") "
<< "loaded, shape: " << DimString.str()
<< ", size: " << blob.data_size();
......@@ -66,32 +68,33 @@ inline void LoadCaffeModel(
inline void SavaCaffeModel(
string file,
const vector<Tensor*>& tensors) {
NetParameter net_param;
int j = -1;
NetParameter net;
Map<string, int> layer_hash;
int layer_idx = -1;
for (int i = 0; i < tensors.size(); i++) {
if (tensors[i]->count() <= 0) continue;
vector<string> splits = str::split(
auto splits = str::split(
tensors[i]->name(), "/param:");
if (layer_hash.count(splits[0]) == 0) {
layer_hash[splits[0]] = ++layer_idx;
LayerParameter* layer = net_param.add_layer();
layer_hash[splits[0]] = ++j;
auto* layer = net.add_layer();
layer->set_name(splits[0]);
}
BlobProto* blob = net_param.mutable_layer(layer_idx)->add_blobs();
for (auto dim : tensors[i]->dims()) blob->mutable_shape()->add_dim(dim);
auto* blob = net.mutable_layer(j)->add_blobs();
for (auto dim : tensors[i]->dims())
blob->mutable_shape()->add_dim(dim);
if (XIsType((*tensors[i]), float)) {
auto* Xdata = tensors[i]->data<float, CPUContext>();
for (int id = 0; id < tensors[i]->count(); id++)
blob->mutable_data()->Add(Xdata[id]);
auto* x = tensors[i]->data<float, CPUContext>();
for (int xi = 0; xi < tensors[i]->count(); ++xi)
blob->mutable_data()->Add(x[xi]);
} else if (XIsType((*tensors[i]), float16)) {
auto* Xdata = tensors[i]->data<float16, CPUContext>();
for (int id = 0; id < tensors[i]->count(); id++)
auto* x = tensors[i]->data<float16, CPUContext>();
for (int xi = 0; xi < tensors[i]->count(); ++xi)
blob->mutable_data()->Add(
cast::to<float>(Xdata[id]));
cast::to<float>(x[xi]));
}
}
WriteProtoToBinaryFile(net_param, file.c_str());
WriteProtoToBinaryFile(net, file.c_str());
LOG(INFO) << "Save the model @: " << file << "......";
LOG(INFO) << "Model format: Caffe";
}
......
......@@ -748,7 +748,7 @@ def Arange(start, stop=None, step=1, dtype='float32', **kwargs):
@OpSchema.Inputs(1)
def Multinomial(inputs, num_samples=1, normalize=False, **kwargs):
def Multinomial(inputs, num_samples=1, eps=0., normalize=False, **kwargs):
"""Return a tensor where each row contains ``num_samples``,
sampled from the multinomial distribution.
......@@ -765,6 +765,8 @@ def Multinomial(inputs, num_samples=1, normalize=False, **kwargs):
The input tensor.
num_samples : int, optional, default=1
The number of samples.
eps : float, optional, default=0.
        The probability of uniform sampling.
normalize : boolean, optional, default=False
Whether to normalize the inputs.
......
......@@ -987,7 +987,7 @@ def one_hot(input, depth):
return module.forward(input)
def multinomial(input, num_samples, out=None):
def multinomial(input, num_samples, eps=0., out=None):
"""Return a tensor where each row contains ``num_samples``,
sampled from the multinomial distribution.
......@@ -997,8 +997,8 @@ def multinomial(input, num_samples, out=None):
The input tensor.
num_samples : int
The number of samples.
normalize : boolean, optional, default=False
Whether to normalize the inputs.
eps : float, optional, default=0.
        The probability of uniform sampling.
Returns
-------
......@@ -1008,9 +1008,11 @@ def multinomial(input, num_samples, out=None):
"""
dev = MakeDevice(inputs=[input])
key = 'Multinomial/{}' \
'/num_samples:{}'.format(dev, num_samples)
'/num_samples:{}' \
'/eps:{}'.format(dev, num_samples, eps)
module = get_module(
Multinomial, key, dev,
eps=eps,
num_samples=num_samples,
)
return module.forward(input, out)
......
......@@ -377,6 +377,7 @@ class Cast(BaseModule):
class Multinomial(BaseModule):
def __init__(self, key, dev, **kwargs):
super(Multinomial, self).__init__(key, dev, **kwargs)
self.eps = kwargs.get('eps', 0)
self.num_samples = kwargs.get('num_samples', 1)
self.register_op()
......@@ -384,6 +385,7 @@ class Multinomial(BaseModule):
self.op_meta = {
'op_type': 'Multinomial',
'arguments': {
'eps': float(self.eps),
'num_samples': self.num_samples,
'normalize': False,
},
......
......@@ -980,7 +980,7 @@ class Tensor(object):
"""
raise NotImplementedError('Refer torch.ops.tensor.normal_')
def multinomial(self, num_samples, normalize=False):
def multinomial(self, num_samples, eps=0.):
"""Return a tensor where each row contains ``num_samples``,
sampled from the multinomial distribution.
......@@ -988,8 +988,8 @@ class Tensor(object):
----------
num_samples : int
The number of samples.
normalize : boolean, optional, default=False
Whether to normalize the inputs.
eps : float, optional, default=0.
        The probability of uniform sampling.
Returns
-------
......
......@@ -81,8 +81,8 @@ void _ApplyNMS(
CUDA_CHECK(cudaMemcpy(boxes_dev, boxes,
boxes_nbytes, cudaMemcpyHostToDevice));
nms_mask<T>
<< < blocks, NMS_BLOCK_SIZE,
0, ctx->cuda_stream() >> > (num_boxes,
<<< blocks, NMS_BLOCK_SIZE,
0, ctx->cuda_stream() >>> (num_boxes,
thresh, (T*)boxes_dev, (uint64_t*)mask_dev);
ctx->FinishDeviceCompution();
......
......@@ -347,7 +347,7 @@ inline void CollectRoIs(
const int canonical_level,
const int canonical_scale,
const T* rois,
vector< vector<int64_t> >& roi_bins) {
vector<vec64_t>& roi_bins) {
const T* roi = rois;
for (int i = 0; i < num_rois; ++i) {
int bin_idx = roi_level(min_level, max_level,
......@@ -360,7 +360,7 @@ inline void CollectRoIs(
template <typename T>
inline void DistributeRoIs(
const vector< vector<int64_t> >& roi_bins,
const vector<vec64_t>& roi_bins,
const T* rois,
vector<T*> outputs) {
for (int i = 0; i < roi_bins.size(); i++) {
......
......@@ -123,7 +123,7 @@ Graph::Graph(const GraphDef& def, Workspace* ws)
// Recomputing-aware
if (subgraph_indices.size() > 0) {
Map< string, vector<OperatorBase*> > subgraph;
Map<string, vector<OperatorBase*>> subgraph;
for (const auto& it : subgraph_indices) {
subgraph[it.first] = vector<OperatorBase*>();
for (const auto& idx : subgraph_indices[it.first])
......
......@@ -7,7 +7,7 @@ namespace dragon {
bool GraphGradientMaker::CheckGrad(
const OperatorDef& forward_op,
const Set<string>& targets,
vector< pair<string, int> >& gen_grads) {
vector<pair<string, int>>& gen_grads) {
if (NoGradientRegistry()->Has(forward_op.type())) {
for (auto& input : forward_op.input())
blacklist_set_.insert(input);
......@@ -81,7 +81,7 @@ void GraphGradientMaker::Make(
for (int i = (int)forward_def.size() - 1; i >= 0; --i) {
// Collect inputs & outputs, generate RAW grad ops
const OperatorDef& op = *forward_def[i];
vector< pair<string, int> > gen_grads;
vector<pair<string, int>> gen_grads;
bool is_skip = CheckGrad(op, targets_set, gen_grads);
vector<string> g_outputs;
for (auto& output : op.output()) {
......@@ -214,7 +214,7 @@ void GraphGradientMaker::Make(
GraphDef GraphGradientMaker::Share(const GraphDef& input_def) {
Set<int> invalid_ops;
Map<string, int> ref_count;
Map< string, pair<int, string> > ssa_map;
Map<string, pair<int, string>> ssa_map;
// Count the refs for detecting leaf nodes
for (int i = 0; i < input_def.op_size(); ++i) {
const OperatorDef& op = input_def.op(i);
......
......@@ -174,7 +174,7 @@ GraphDef GraphOptimizer::MirrorStage(
const GraphDef& input_def,
Map<string, vec32_t >& op_indices) {
GraphDef output_def(input_def);
Map<string, set<int> > fake_op_indices;
Map<string, set<int>> fake_op_indices;
Map<string, string> rename_map;
Map<string, int> versions;
......
......@@ -54,7 +54,7 @@ OpSchema& OpSchema::NumOutputs(int n) {
return NumOutputs(n, n);
}
OpSchema& OpSchema::Inplace(set< pair<int, int> > inplace) {
OpSchema& OpSchema::Inplace(set<pair<int, int>> inplace) {
CheckInplace = [inplace](int in, int out)->bool {
return (inplace.count(std::make_pair(in, out)) > 0);
};
......
......@@ -37,14 +37,10 @@ template<> void Dropout<float, CUDAContext>(
float* y,
CUDAContext* ctx) {
auto thresh = (uint32_t)(UINT_MAX * prob);
math::RandomUniform(
count,
0.f, (float)UINT_MAX,
mask32, ctx
);
math::RandomUniform(count, 0.f, 1.f, mask32, ctx);
_Dropout
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count,
thresh,
scale,
......@@ -85,14 +81,10 @@ template<> void Dropout<float16, CUDAContext>(
float16* y,
CUDAContext* ctx) {
auto thresh = (uint32_t)(UINT_MAX * prob);
math::RandomUniform(
count,
0.f, (float)UINT_MAX,
mask32, ctx
);
math::RandomUniform(count, 0.f, 1.f, mask32, ctx);
_Dropout
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count,
thresh,
cast::to<half>(scale),
......@@ -124,8 +116,8 @@ template <> void ApplyMask<float, uint8_t, CUDAContext>(
float* y,
CUDAContext* ctx) {
_ApplyMask
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, scale, x, mask, y
);
}
......@@ -157,8 +149,8 @@ template <> void ApplyMask<float16, uint8_t, CUDAContext>(
float16* y,
CUDAContext* ctx) {
_ApplyMaskHalf
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count,
cast::to<half>(scale),
reinterpret_cast<const half*>(x),
......
......@@ -44,8 +44,8 @@ template<> void DropPath<float, CUDAContext>(
auto nthreads = rows * cols;
auto thresh = 1.f - (1.f / scale);
_DropPath
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, cols, thresh, scale, x, mask, y
);
}
......@@ -85,8 +85,8 @@ template<> void DropPath<float16, CUDAContext>(
auto nthreads = rows * cols;
auto thresh = 1.f - (1.f / scale);
_DropPath
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, cols,
thresh,
cast::to<half>(scale),
......
......@@ -28,8 +28,8 @@ template<> void Elu<float, CUDAContext>(
float* y,
CUDAContext* ctx) {
_Elu
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, x, alpha, y
);
}
......@@ -58,8 +58,8 @@ template<> void EluGrad<float, CUDAContext>(
float* dx,
CUDAContext* ctx) {
_EluGrad
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, alpha, dy, y, dx
);
}
......
......@@ -66,21 +66,21 @@ template<> void PRelu<float, CUDAContext>(
CUDAContext* ctx) {
if (channel_shared) {
_PRelu
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, channels, dim, x, w, y
);
} else {
if (data_format == "NCHW") {
_PReluNCHW
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, channels, dim, x, w, y
);
} else if (data_format == "NHWC") {
_PReluNHWC
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, channels, dim, x, w, y
);
} else {
......@@ -152,21 +152,21 @@ template<> void PReluGrad<float, CUDAContext>(
CUDAContext* ctx) {
if (channel_shared) {
_PReluGrad
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, channels, dim, dy, x, w, dx
);
} else {
if (data_format == "NCHW") {
_PReluGradNCHW
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, channels, dim, dy, x, w, dx
);
} else if (data_format == "NHWC") {
_PReluGradNHWC
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, channels, dim, dy, x, w, dx
);
} else {
......@@ -210,8 +210,8 @@ template<> void PReluWGrad<float, CUDAContext>(
CUDAContext* ctx) {
auto cdim = channels * dim;
_PReluWGradBcast
<< < CUDA_BLOCKS(cdim), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(cdim), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
cdim, rows, row_offset, dy, x, bcast_dw
);
if (channel_shared) {
......
......@@ -35,8 +35,8 @@ template<> void Relu<float, CUDAContext>(
float* y,
CUDAContext* ctx) {
_Relu
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, slope, x, y
);
}
......@@ -83,8 +83,8 @@ template<> void Relu<float16, CUDAContext>(
CUDAContext* ctx) {
if ((count & 1) == 0) {
_Relu
<< < CUDA_BLOCKS(count >> 1), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count >> 1), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count >> 1,
cast::to<half2>(slope),
reinterpret_cast<const half2*>(x),
......@@ -92,8 +92,8 @@ template<> void Relu<float16, CUDAContext>(
);
} else {
_Relu
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count,
cast::to<half>(slope),
reinterpret_cast<const half*>(x),
......@@ -134,8 +134,8 @@ template<> void ReluGrad<float, CUDAContext>(
float* dx,
CUDAContext* ctx) {
_ReluGrad
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, slope, dy, y, dx
);
}
......@@ -170,8 +170,8 @@ template<> void ReluGrad<float16, CUDAContext>(
float16* dx,
CUDAContext* ctx) {
_ReluGrad
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, slope,
reinterpret_cast<const half*>(dy),
reinterpret_cast<const half*>(y),
......
......@@ -34,8 +34,8 @@ template<> void SElu<float, CUDAContext>(
float* y,
CUDAContext* ctx) {
_SElu
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, x, y
);
}
......@@ -63,8 +63,8 @@ template<> void SElu<float16, CUDAContext>(
float16* y,
CUDAContext* ctx) {
_SElu
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count,
reinterpret_cast<const half*>(x),
reinterpret_cast<half*>(y)
......@@ -99,8 +99,8 @@ template<> void SEluGrad<float, CUDAContext>(
float* dx,
CUDAContext* ctx) {
_SEluGrad
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, dy, y, dx
);
}
......@@ -131,8 +131,8 @@ template<> void SEluGrad<float16, CUDAContext>(
float16* dx,
CUDAContext* ctx) {
_SEluGrad
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count,
reinterpret_cast<const half*>(dy),
reinterpret_cast<const half*>(y),
......
......@@ -25,8 +25,8 @@ template<> void Sigmoid<float, CUDAContext>(
float* y,
CUDAContext* ctx) {
_Sigmoid
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, x, y
);
}
......@@ -51,8 +51,8 @@ template<> void SigmoidGrad<float, CUDAContext>(
float* dx,
CUDAContext* ctx) {
_SigmoidGrad
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, dy, y, dx
);
}
......
......@@ -96,26 +96,26 @@ template<> void Softmax<float, CUDAContext>(
auto num_preds = outer_dim * inner_dim;
auto nelements = num_preds * axis_dim;
_SoftmaxReduceMax
<< < CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
num_preds, axis_dim, inner_dim, x, scale
);
_SoftmaxSub
<< < CUDA_BLOCKS(nelements), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nelements), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nelements, axis_dim, inner_dim, scale, y
);
math::Exp(nelements, y, y, ctx);
_SoftmaxReduceSum
<< < CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
num_preds, axis_dim, inner_dim, y, scale
);
_SoftmaxDiv
<< < CUDA_BLOCKS(nelements), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nelements), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nelements, axis_dim, inner_dim, scale, y
);
}
......@@ -159,13 +159,13 @@ template<> void SoftmaxGrad<float, CUDAContext>(
auto num_preds = outer_dim * inner_dim;
auto nelements = num_preds * axis_dim;
_SoftmaxDot
<< < CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
num_preds, axis_dim, inner_dim, dy, y, scale
);
_SoftmaxSub
<< < CUDA_BLOCKS(nelements), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nelements), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nelements, axis_dim, inner_dim, scale, dx
);
math::Mul(nelements, dx, y, dx, ctx);
......
......@@ -25,8 +25,8 @@ template<> void Tanh<float, CUDAContext>(
float* y,
CUDAContext* ctx) {
_Tanh
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, x, y
);
}
......@@ -51,8 +51,8 @@ template<> void TanhGrad<float, CUDAContext>(
float* dx,
CUDAContext* ctx) {
_TanhGrad
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, dy, y, dx
);
}
......
......@@ -60,15 +60,15 @@ template<> void Affine<float, CUDAContext>(
auto nthreads = outer_dim * axis_dim * inner_dim;
if (beta != nullptr) {
_Affine
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim,
x, alpha, beta, y
);
} else {
_AffineNoBias
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, x, alpha, y
);
}
......@@ -124,8 +124,8 @@ template<> void Affine<float16, CUDAContext>(
auto nthreads = outer_dim * axis_dim * inner_dim;
if (beta != nullptr) {
_Affine
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim,
reinterpret_cast<const half*>(x),
reinterpret_cast<const half*>(alpha),
......@@ -134,8 +134,8 @@ template<> void Affine<float16, CUDAContext>(
);
} else {
_AffineNoBias
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim,
reinterpret_cast<const half*>(x),
reinterpret_cast<const half*>(alpha),
......@@ -156,8 +156,8 @@ template <> void AffineGrad<float, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = outer_dim * axis_dim * inner_dim;
_AffineNoBias
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, dy, alpha, dx
);
}
......@@ -174,8 +174,8 @@ template <> void AffineGrad<float16, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = outer_dim * axis_dim * inner_dim;
_AffineNoBias
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim,
reinterpret_cast<const half*>(dy),
reinterpret_cast<const half*>(alpha),
......
......@@ -83,8 +83,8 @@ template<> __global__ void _ClipGrad<half>(
T* y, \
CUDAContext* ctx) { \
_Clip<T> \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
count, \
cast::to<T>(low), \
cast::to<T>(high), \
......@@ -102,8 +102,8 @@ template<> __global__ void _ClipGrad<half>(
T* dx, \
CUDAContext* ctx) { \
_ClipGrad<T> \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
count, \
cast::to<T>(low), \
cast::to<T>(high), \
......@@ -133,8 +133,8 @@ template <> void Clip<float16, CUDAContext>(
float16* y,
CUDAContext* ctx) {
_Clip
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count,
cast::to<half>(low),
cast::to<half>(high),
......@@ -152,8 +152,8 @@ template <> void ClipGrad<float16, CUDAContext>(
float16* dx,
CUDAContext* ctx) {
_ClipGrad
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count,
cast::to<half>(low),
cast::to<half>(high),
......
......@@ -139,8 +139,8 @@ template<> __global__ void _BroadcastMaximumGrad<half>(
T* y, \
CUDAContext* ctx) { \
_##name \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
count, x1, x2, y \
); \
}
......@@ -155,8 +155,8 @@ template<> __global__ void _BroadcastMaximumGrad<half>(
T* dx2, \
CUDAContext* ctx) { \
_##name \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
count, x1, x2, dy, dx1, dx2 \
); \
}
......@@ -196,8 +196,8 @@ template <> void Maximum<float16, CUDAContext>(
float16* y,
CUDAContext* ctx) {
_Maximum \
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count,
reinterpret_cast<const half*>(x1),
reinterpret_cast<const half*>(x2),
......@@ -212,8 +212,8 @@ template <> void BroadcastMaximum<float16, CUDAContext>(
float16* y,
CUDAContext* ctx) {
_BroadcastMaximum \
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count,
reinterpret_cast<const half*>(x1),
cast::to<half>(x2),
......@@ -230,8 +230,8 @@ template <> void MaximumGrad<float16, CUDAContext>(
float16* dx2,
CUDAContext* ctx) {
_MaximumGrad \
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count,
reinterpret_cast<const half*>(x1),
reinterpret_cast<const half*>(x2),
......@@ -250,8 +250,8 @@ template <> void BroadcastMaximumGrad<float16, CUDAContext>(
float16* dx2,
CUDAContext* ctx) {
_BroadcastMaximumGrad \
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count,
reinterpret_cast<const half*>(x1),
cast::to<half>(x2),
......
......@@ -139,8 +139,8 @@ template<> __global__ void _BroadcastMinimumGrad<half>(
T* y, \
CUDAContext* ctx) { \
_##name \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
count, x1, x2, y \
); \
}
......@@ -155,8 +155,8 @@ template<> __global__ void _BroadcastMinimumGrad<half>(
T* dx2, \
CUDAContext* ctx) { \
_##name \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
count, x1, x2, dy, dx1, dx2 \
); \
}
......@@ -196,8 +196,8 @@ template <> void Minimum<float16, CUDAContext>(
float16* y,
CUDAContext* ctx) {
_Minimum \
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count,
reinterpret_cast<const half*>(x1),
reinterpret_cast<const half*>(x2),
......@@ -212,8 +212,8 @@ template <> void BroadcastMinimum<float16, CUDAContext>(
float16* y,
CUDAContext* ctx) {
_BroadcastMinimum \
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count,
reinterpret_cast<const half*>(x1),
cast::to<half>(x2),
......@@ -230,8 +230,8 @@ template <> void MinimumGrad<float16, CUDAContext>(
float16* dx2,
CUDAContext* ctx) {
_MinimumGrad \
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count,
reinterpret_cast<const half*>(x1),
reinterpret_cast<const half*>(x2),
......@@ -250,8 +250,8 @@ template <> void BroadcastMinimumGrad<float16, CUDAContext>(
float16* dx2,
CUDAContext* ctx) {
_BroadcastMinimumGrad \
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count,
reinterpret_cast<const half*>(x1),
cast::to<half>(x2),
......
......@@ -251,8 +251,8 @@ void _Moments(
ndims, x_dims, y_dims,
&rows, &cols)) {
_ColwiseMoments
<< < CUDA_2D_BLOCKS(rows), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_2D_BLOCKS(rows), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
rows, cols, x, mean, var
); return;
}
......@@ -262,8 +262,8 @@ void _Moments(
ndims, x_dims, y_dims,
&rows, &cols)) {
_RowwiseMoments
<< < CUDA_2D_BLOCKS(cols), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_2D_BLOCKS(cols), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
rows, cols, x, mean, var
); return;
}
......@@ -294,8 +294,8 @@ void _Moments(
ctx->Memcpy<CUDAContext, CPUContext>(dbytes, YDS, dimsT.data());
_GenericMoments
<< < CUDA_2D_BLOCKS(outer_dim), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_2D_BLOCKS(outer_dim), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
ndims, outer_dim, inner_dim,
XSS, YDS, x, mean, var
);
......
......@@ -30,8 +30,8 @@ __global__ void _Arange(
T* y, \
CUDAContext* ctx) { \
_Arange \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
count, start, step, y \
); \
}
......@@ -64,8 +64,8 @@ template <> void Arange<float16, CUDAContext>(
float16* y,
CUDAContext* ctx) {
_Arange
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, start, step,
reinterpret_cast<half*>(y)
);
......
......@@ -20,12 +20,12 @@ void _ArgMax(
for (int iix = 0; iix < inner_dim; ++iix) {
const T* X = x + (oix * axis_dim * inner_dim + iix);
const int y_offset = oix * top_k * inner_dim + iix;
vector< pair<T, int64_t> > vec(axis_dim);
vector<pair<T, int64_t>> vec(axis_dim);
for (int j = 0; j < axis_dim; ++j)
vec[j] = std::make_pair(X[j * inner_dim], j);
std::partial_sort(
vec.begin(), vec.begin() + top_k, vec.end(),
std::greater< pair<T, int64_t> >());
std::greater<pair<T, int64_t>>());
for (int j = 0; j < top_k; ++j) {
indices[y_offset + j * inner_dim] = vec[j].second;
if (values) values[y_offset + j * inner_dim] = vec[j].first;
......@@ -49,7 +49,7 @@ void _ArgMin(
for (int iix = 0; iix < inner_dim; ++iix) {
const T* X = x + (oix * axis_dim * inner_dim + iix);
const int y_offset = oix * top_k * inner_dim + iix;
vector< pair<T, int64_t> > vec(axis_dim);
vector<pair<T, int64_t>> vec(axis_dim);
for (int j = 0; j < axis_dim; ++j)
vec[j] = std::make_pair(X[j * inner_dim], j);
std::partial_sort(vec.begin(), vec.begin() + top_k, vec.end());
......
......@@ -133,8 +133,8 @@ template<> __global__ void _ArgMin<half>(
CHECK_EQ(top_k, 1) << "\nRequired top_k == 1."; \
auto nthreads = outer_dim * inner_dim; \
_##name \
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
nthreads, inner_dim, axis_dim, \
x, indices, values \
); \
......@@ -168,8 +168,8 @@ template<> void ArgMax<float16, CUDAContext>(
CHECK_EQ(top_k, 1) << "\nRequired top_k == 1.";
auto nthreads = outer_dim * inner_dim;
_ArgMax
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, inner_dim, axis_dim,
reinterpret_cast<const half*>(x),
indices,
......@@ -189,8 +189,8 @@ template<> void ArgMin<float16, CUDAContext>(
CHECK_EQ(top_k, 1) << "\nRequired top_k == 1.";
auto nthreads = outer_dim * inner_dim;
_ArgMin
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, inner_dim, axis_dim,
reinterpret_cast<const half*>(x),
indices,
......
......@@ -43,8 +43,8 @@ __global__ void _Concat(
auto cols = axis_dim * inner_dim; \
auto nthreads = outer_dim * axis_dim * inner_dim; \
_##name \
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
nthreads, \
inner_dim, \
cols, \
......
......@@ -83,8 +83,8 @@ __global__ void _CropGrad(
T* y, \
CUDAContext* ctx) { \
_##name \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
count, ndims, \
x_strides, y_dims, \
starts, x, y \
......
......@@ -115,8 +115,8 @@ template <> __global__ void _IndexSelectGrad<half>(
CUDAContext* ctx) { \
auto nthreads = outer_dim * num_indices * inner_dim; \
_IndexSelect \
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
nthreads, inner_dim, \
axis_dim, num_indices, \
indices, x, y \
......@@ -135,8 +135,8 @@ template <> __global__ void _IndexSelectGrad<half>(
CUDAContext* ctx) { \
auto nthreads = outer_dim * inner_dim; \
_IndexSelectGrad \
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
nthreads, inner_dim, \
axis_dim, num_indices, \
indices, dy, dx \
......@@ -170,8 +170,8 @@ template <> void IndexSelectGrad<float16, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim;
_IndexSelectGrad
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, inner_dim,
axis_dim, num_indices,
indices,
......
......@@ -32,8 +32,8 @@ template <> void OneHot<float, CUDAContext>(
float* y,
CUDAContext* ctx) {
_OneHot
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, depth, on_value, x, y
);
}
......@@ -48,8 +48,8 @@ template <> void OneHot<int, CUDAContext>(
int* y,
CUDAContext* ctx) {
_OneHot
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, depth, on_value, x, y
);
}
......@@ -64,8 +64,8 @@ template <> void OneHot<int64_t, CUDAContext>(
int64_t* y,
CUDAContext* ctx) {
_OneHot
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, depth, on_value, x, y
);
}
......
......@@ -130,8 +130,8 @@ __global__ void _EdgePad(
T* y, \
CUDAContext* ctx) { \
_ConstPad \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
count, ndims, \
x_dims, x_strides, \
y_dims, l_pads, \
......@@ -152,8 +152,8 @@ __global__ void _EdgePad(
T* y, \
CUDAContext* ctx) { \
_##name \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
count, ndims, \
x_dims, x_strides, \
y_dims, l_pads, \
......
......@@ -202,8 +202,8 @@ void _ReduceSum(
ndims, x_dims, y_dims,
&rows, &cols)) {
_ColwiseReduceSum
<< < CUDA_2D_BLOCKS(rows), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_2D_BLOCKS(rows), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
rows, cols, scale, x, y
); return;
}
......@@ -213,8 +213,8 @@ void _ReduceSum(
ndims, x_dims, y_dims,
&rows, &cols)) {
_RowwiseReduceSum
<< < CUDA_2D_BLOCKS(cols), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_2D_BLOCKS(cols), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
rows, cols, scale, x, y
); return;
}
......@@ -245,8 +245,8 @@ void _ReduceSum(
ctx->Memcpy<CUDAContext, CPUContext>(dbytes, YDS, dimsT.data());
_GenericReduceSum
<< < CUDA_2D_BLOCKS(outer_dim), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_2D_BLOCKS(outer_dim), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
ndims, outer_dim, inner_dim,
XSS, YDS, scale, x, y
);
......@@ -372,8 +372,8 @@ template <> __global__ void _ReduceSumGrad<half>(
T* dx, \
CUDAContext* ctx) { \
_ReduceSumGrad \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
count, ndim, x_dims, \
y_dims, y_strides, \
scale, dy, dx \
......@@ -398,8 +398,8 @@ template<> void ReduceSumGrad<float16, CUDAContext>(
float16* dx,
CUDAContext* ctx) {
_ReduceSumGrad
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, ndim, x_dims,
y_dims, y_strides,
scale,
......
......@@ -93,8 +93,8 @@ template<> __global__ void _RepeatGrad<half>(
auto y_inner_dim = inner_dim * repeats; \
auto nthreads = outer_dim * axis_dim * y_inner_dim; \
_Repeat \
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
nthreads, axis_dim, \
inner_dim, y_inner_dim, \
x, y \
......@@ -113,8 +113,8 @@ template<> __global__ void _RepeatGrad<half>(
auto y_inner_dim = inner_dim * repeats; \
auto nthreads = outer_dim * axis_dim * inner_dim; \
_RepeatGrad \
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
nthreads, \
axis_dim, \
inner_dim, \
......@@ -151,8 +151,8 @@ template<> void RepeatGrad<float16, CUDAContext>(
auto y_inner_dim = inner_dim * repeats;
auto nthreads = outer_dim * axis_dim * inner_dim;
_RepeatGrad
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >>>(
nthreads,
axis_dim,
inner_dim,
......
......@@ -64,8 +64,8 @@ __global__ void _SliceGrad(
auto cols = slice_dim * inner_dim; \
auto nthreads = outer_dim * cols; \
_##name \
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
nthreads, \
inner_dim, \
axis_dim, \
......@@ -126,8 +126,8 @@ template <> void SliceGrad<float16, CUDAContext>(
auto cols = slice_dim * inner_dim;
auto nthreads = outer_dim * cols;
_SliceGrad
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
inner_dim,
axis_dim,
......
......@@ -98,8 +98,8 @@ template<> __global__ void _TileGrad<half>(
T* y, \
CUDAContext* ctx) { \
_Tile \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
count, \
ndims, \
x_dims, \
......@@ -120,8 +120,8 @@ template<> __global__ void _TileGrad<half>(
auto nthreads = rows * cols; \
auto tiled_cols = multiple * cols; \
_TileGrad \
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
nthreads, \
cols, \
tiled_cols, \
......@@ -156,8 +156,8 @@ template<> void TileGrad<float16, CUDAContext>(
auto nthreads = rows * cols;
auto tiled_cols = multiple * cols;
_TileGrad
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
cols,
tiled_cols,
......
......@@ -80,8 +80,8 @@ __global__ void _TransposeGrad(
T* y, \
CUDAContext* ctx) { \
_##name \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
count, ndims, x_strides, y_dims, x, y \
); \
}
......
......@@ -55,8 +55,8 @@ __global__ void _Assign(
T* y, \
CUDAContext* ctx) { \
_Assign \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
count, \
ndims, \
x_dims, \
......
......@@ -153,8 +153,8 @@ __global__ void _GreaterEqualHalf(
bool* y, \
CUDAContext* ctx) { \
IMPL \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
count, a, b, y \
); \
}
......@@ -167,8 +167,8 @@ __global__ void _GreaterEqualHalf(
bool* y, \
CUDAContext* ctx) { \
_##OP##Half \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
count, \
reinterpret_cast<const half*>(a), \
reinterpret_cast<const half*>(b), \
......
......@@ -30,8 +30,8 @@ __global__ void _MaskedAssign(
T* y, \
CUDAContext* ctx) { \
_MaskedAssign \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
count, mask, x, y \
); \
}
......
......@@ -27,8 +27,8 @@ template<> void AbsGrad<float, CUDAContext>(
float* dx,
CUDAContext* ctx) {
_AbsGrad
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, dy, dx
);
}
......
......@@ -55,8 +55,8 @@ template <> void NLLLoss<float, float, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim;
_NLLLoss
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nignores,
ignore, log_prob, target, loss, flag
);
......@@ -77,8 +77,8 @@ template <> void NLLLoss<float, int64_t, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim;
_NLLLoss
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nignores,
ignore, log_prob, target, loss, flag
);
......@@ -129,8 +129,8 @@ template<> void NLLLossGrad<float, float, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim;
_NLLLossGrad
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nignores,
ignore, log_prob, target, dx, flag
);
......@@ -151,8 +151,8 @@ template<> void NLLLossGrad<float, int64_t, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim;
_NLLLossGrad
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nignores,
ignore, log_prob, target, dx, flag
);
......
......@@ -42,8 +42,8 @@ template <> void SigmoidCrossEntropy<float, CUDAContext>(
int* flag,
CUDAContext* ctx) {
_SigmoidCrossEntropy
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, logit, target, loss, flag
);
}
......@@ -77,8 +77,8 @@ template <> void SigmoidCrossEntropyGrad<float, CUDAContext>(
int* flag,
CUDAContext* ctx) {
_SigmoidCrossEntropyGrad
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, logit, target, dlogit, flag
);
}
......
......@@ -71,8 +71,8 @@ template <> void SigmoidFocalLoss<float, float, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = outer_dim * axis_dim * inner_dim;
_SigmoidFocalLoss
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
logits, targets, losses, flags
......@@ -96,8 +96,8 @@ template <> void SigmoidFocalLoss<float, int64_t, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = outer_dim * axis_dim * inner_dim;
_SigmoidFocalLoss
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
logits, targets, losses, flags
......@@ -171,8 +171,8 @@ template <> void SigmoidFocalLossGrad<float, float, CUDAContext>(
CUDAContext* ctx) {
auto count = outer_dim * axis_dim * inner_dim;
_SigmoidFocalLossGrad
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
logits, targets, dlogits, flags
......@@ -196,8 +196,8 @@ template <> void SigmoidFocalLossGrad<float, int64_t, CUDAContext>(
CUDAContext* ctx) {
auto count = outer_dim * axis_dim * inner_dim;
_SigmoidFocalLossGrad
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
logits, targets, dlogits, flags
......
......@@ -33,8 +33,8 @@ template<> void SmoothL1<float, CUDAContext>(
float* y,
CUDAContext* ctx) {
_SmoothL1
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, beta, x, y
);
}
......@@ -63,8 +63,8 @@ template<> void SmoothL1Grad<float, CUDAContext>(
float* dx,
CUDAContext* ctx) {
_SmoothL1Grad
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, beta, dy, dx
);
}
......
......@@ -29,8 +29,8 @@ template <> void SoftmaxCrossEntropy<float, CUDAContext>(
float* losses,
CUDAContext* ctx) {
_SoftmaxCrossEntropy
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, prob, targets, losses
);
}
......
......@@ -67,8 +67,8 @@ template <> void SoftmaxFocalLoss<float, float, CUDAContext>(
CUDAContext* ctx) {
auto num_preds = outer_dim * inner_dim;
_SoftmaxFocalLoss
<< < CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
num_preds, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
nignores, ignores,
......@@ -95,8 +95,8 @@ template <> void SoftmaxFocalLoss<float, int64_t, CUDAContext>(
CUDAContext* ctx) {
auto num_preds = outer_dim * inner_dim;
_SoftmaxFocalLoss
<< < CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
num_preds, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
nignores, ignores,
......@@ -179,8 +179,8 @@ template<> void SoftmaxFocalLossGrad<float, float, CUDAContext>(
CUDAContext* ctx) {
auto num_preds = outer_dim * inner_dim;
_SoftmaxFocalLossGrad
<< < CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
num_preds, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
nignores, ignores,
......@@ -207,8 +207,8 @@ template<> void SoftmaxFocalLossGrad<float, int64_t, CUDAContext>(
CUDAContext* ctx) {
auto num_preds = outer_dim * inner_dim;
_SoftmaxFocalLossGrad
<< < CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(num_preds), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
num_preds, axis_dim, inner_dim,
pos_alpha, neg_alpha, gamma, neg_id,
nignores, ignores,
......
......@@ -59,8 +59,8 @@ template <> void SparseSoftmaxCrossEntropy<float, float, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim;
_SparseSoftmaxCrossEntropy
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nignores,
ignore, prob, target, loss, flag
);
......@@ -81,8 +81,8 @@ template <> void SparseSoftmaxCrossEntropy<float, int64_t, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim;
_SparseSoftmaxCrossEntropy
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nignores,
ignore, prob, target, loss, flag
);
......@@ -136,8 +136,8 @@ template<> void SparseSoftmaxCrossEntropyGrad<float, float, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim;
_SparseSoftmaxCrossEntropyGrad
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nignores,
ignore, prob, target, dx, flag
);
......@@ -158,8 +158,8 @@ template<> void SparseSoftmaxCrossEntropyGrad<float, int64_t, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = outer_dim * inner_dim;
_SparseSoftmaxCrossEntropyGrad
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, nignores,
ignore, prob, target, dx, flag
);
......
......@@ -26,8 +26,8 @@ __global__ void _TypeA2B(
Tb* b, \
CUDAContext* ctx) { \
_TypeA2B \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
count, a, b \
); \
}
......@@ -66,8 +66,8 @@ template <> void TypeA2B<float16, float, CUDAContext>(
float* b,
CUDAContext* ctx) {
_TypeA2B
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, reinterpret_cast<const half*>(a), b
);
}
......@@ -89,8 +89,8 @@ template <> void TypeA2B<float, float16, CUDAContext>(
float16* b,
CUDAContext* ctx) {
_TypeA2B
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, a, reinterpret_cast<half*>(b)
);
}
......@@ -112,8 +112,8 @@ template <> void TypeA2B<float16, float16, CUDAContext>(
float16* b,
CUDAContext* ctx) {
_TypeA2B
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count,
reinterpret_cast<const half*>(a),
reinterpret_cast<half*>(b)
......
......@@ -62,8 +62,8 @@ template <> __global__ void _GradientTwoSum<half2>(
T* dx, \
CUDAContext* ctx) { \
_GradientTwoSum \
<< < CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_BLOCKS(count), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
count, dy1, dy2, dx \
); \
}
......@@ -83,8 +83,8 @@ template <> void GradientTwoSum<float16, CUDAContext>(
CUDAContext* ctx) {
if ((count & 1) == 0) {
_GradientTwoSum
<< < CUDA_BLOCKS(count >> 2), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count >> 2), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count >> 2,
reinterpret_cast<const half2*>(dy1),
reinterpret_cast<const half2*>(dy2),
......@@ -92,8 +92,8 @@ template <> void GradientTwoSum<float16, CUDAContext>(
);
} else {
_GradientTwoSum
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count,
reinterpret_cast<const half*>(dy1),
reinterpret_cast<const half*>(dy2),
......
......@@ -76,14 +76,14 @@ template <> void ImageData<float, float, CUDAContext>(
auto nthreads = N * C * H * W;
if (data_format == "NCHW") {
_ImageDataNCHW
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, C, H, W, mean, std, x, y
);
} else if (data_format == "NHWC") {
_ImageDataNHWC
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, C, H, W, mean, std, x, y
);
} else {
......@@ -107,14 +107,14 @@ template <> void ImageData<uint8_t, float, CUDAContext>(
auto nthreads = N * C * H * W;
if (data_format == "NCHW") {
_ImageDataNCHW
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, C, H, W, mean, std, x, y
);
} else if (data_format == "NHWC") {
_ImageDataNHWC
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, C, H, W, mean, std, x, y
);
} else {
......@@ -191,15 +191,15 @@ template <> void ImageData<float, float16, CUDAContext>(
auto nthreads = N * C * H * W;
if (data_format == "NCHW") {
_ImageDataHalfNCHW
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, C, H, W, mean, std,
x, reinterpret_cast<half*>(y)
);
} else if (data_format == "NHWC") {
_ImageDataHalfNHWC
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, C, H, W, mean, std,
x, reinterpret_cast<half*>(y)
);
......@@ -222,15 +222,15 @@ template <> void ImageData<uint8_t, float16, CUDAContext>(
auto nthreads = N * C * H * W;
if (data_format == "NCHW") {
_ImageDataHalfNCHW
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, C, H, W, mean, std,
x, reinterpret_cast<half*>(y)
);
} else if (data_format == "NHWC") {
_ImageDataHalfNHWC
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, C, H, W, mean, std,
x, reinterpret_cast<half*>(y)
);
......
......@@ -190,27 +190,27 @@ __global__ void _BatchNormInferenceGrad(
auto nthreads = N * C * S; \
if (data_format == "NCHW") { \
_BatchNormInternalGrad<Tx, Tp, StorageOrder::NCHW> \
<< < CUDA_2D_BLOCKS(C), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_2D_BLOCKS(C), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
N, C, S, x, mu, rsig, gamma, \
dy, ds, db, dgamma, dbeta \
); \
_BatchNormTrainingGrad<Tx, Tp, StorageOrder::NCHW> \
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
nthreads, N, C, S, x, mu, \
rsig, gamma, ds, db, dy, dx \
); \
} else if (data_format == "NHWC") { \
_BatchNormInternalGrad<Tx, Tp, StorageOrder::NHWC> \
<< < CUDA_2D_BLOCKS(C), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_2D_BLOCKS(C), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
N, C, S, x, mu, rsig, gamma, \
dy, ds, db, dgamma, dbeta \
); \
_BatchNormTrainingGrad<Tx, Tp, StorageOrder::NHWC> \
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
nthreads, N, C, S, x, mu, \
rsig, gamma, ds, db, dy, dx \
); \
......@@ -234,24 +234,24 @@ __global__ void _BatchNormInferenceGrad(
if (data_format == "NCHW") { \
if (dgamma != nullptr) { \
_BatchNormWGrad<Tx, Tp, StorageOrder::NCHW> \
<< < CUDA_2D_BLOCKS(C), CUDA_THREADS, \
0, ctx->cuda_stream() >> > \
<<< CUDA_2D_BLOCKS(C), CUDA_THREADS, \
0, ctx->cuda_stream() >>> \
(N, C, S, x, mu, rsig, dy, dgamma, dbeta); \
} \
_BatchNormInferenceGrad<Tx, Tp, StorageOrder::NCHW> \
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> > \
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >>> \
(nthreads, C, S, rsig, gamma, dy, dx); \
} else if (data_format == "NHWC") { \
if (dgamma != nullptr) { \
_BatchNormWGrad<Tx, Tp, StorageOrder::NHWC> \
<< < CUDA_2D_BLOCKS(C), CUDA_THREADS, \
0, ctx->cuda_stream() >> > \
<<< CUDA_2D_BLOCKS(C), CUDA_THREADS, \
0, ctx->cuda_stream() >>> \
(N, C, S, x, mu, rsig, dy, dgamma, dbeta); \
} \
_BatchNormInferenceGrad<Tx, Tp, StorageOrder::NHWC> \
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> > \
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >>> \
(nthreads, C, S, rsig, gamma, dy, dx); \
} \
}
......
......@@ -408,20 +408,20 @@ __global__ void _GroupNormGradHalf(
CUDAContext* ctx) { \
const int C = G * D; \
_GroupNormFusedParams<Tp> \
<< < CUDA_2D_BLOCKS(N * G), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_2D_BLOCKS(N * G), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
N, G, D, mu, rsig, gamma, beta, scale, bias \
); \
if (data_format == "NCHW") { \
_GroupNormForwardNCHW<Tx, Tp> \
<< < CUDA_2D_BLOCKS(N * C), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_2D_BLOCKS(N * C), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
N, C, S, x, scale, bias, y \
); \
} else if (data_format == "NHWC") { \
_GroupNormForwardNHWC<Tx, Tp> \
<< < CUDA_2D_BLOCKS(N * C), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_2D_BLOCKS(N * C), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
N, C, S, x, scale, bias, y \
); \
} \
......@@ -448,35 +448,35 @@ __global__ void _GroupNormGradHalf(
auto nthreads = N * G * D * S; \
if (data_format == "NCHW") { \
_GroupNormWGrad<Tx, Tp, StorageOrder::NCHW> \
<< < CUDA_2D_BLOCKS(G * D), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_2D_BLOCKS(G * D), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
N, G, D, S, x, mu, rsig, dy, dgamma, dbeta \
); \
_GroupNormInternalGrad<Tx, Tp, StorageOrder::NCHW> \
<< < CUDA_2D_BLOCKS(N * G), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_2D_BLOCKS(N * G), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
N, G, D, S, x, gamma, dy, ds, db \
); \
_GroupNormGrad<Tx, Tp, StorageOrder::NCHW> \
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
nthreads, G, D, S, x, mu, rsig, \
gamma, ds, db, dy, dx \
); \
} else if (data_format == "NHWC") { \
_GroupNormWGrad<Tx, Tp, StorageOrder::NHWC> \
<< < CUDA_2D_BLOCKS(G * D), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_2D_BLOCKS(G * D), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
N, G, D, S, x, mu, rsig, dy, dgamma, dbeta \
); \
_GroupNormInternalGrad<Tx, Tp, StorageOrder::NHWC> \
<< < CUDA_2D_BLOCKS(N * G), CUDA_THREADS, \
0, ctx->cuda_stream() >> >( \
<<< CUDA_2D_BLOCKS(N * G), CUDA_THREADS, \
0, ctx->cuda_stream() >>>( \
N, G, D, S, x, gamma, dy, ds, db \
); \
_GroupNormGrad<Tx, Tp, StorageOrder::NHWC> \
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >> > ( \
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS, \
0, ctx->cuda_stream() >>> ( \
nthreads, G, D, S, x, mu, rsig, \
gamma, ds, db, dy, dx \
); \
......@@ -503,14 +503,14 @@ template <> void GroupNormForward<float16, float, CUDAContext>(
CUDAContext* ctx) {
const int C = G * D;
_GroupNormFusedParams<float>
<< < CUDA_2D_BLOCKS(N * G), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_2D_BLOCKS(N * G), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
N, G, D, mu, rsig, gamma, beta, scale, bias
);
if (data_format == "NCHW") {
_GroupNormForwardNCHW<half, float>
<< < CUDA_2D_BLOCKS(N * C), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_2D_BLOCKS(N * C), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
N, C, S,
reinterpret_cast<const half*>(x),
scale, bias,
......@@ -518,8 +518,8 @@ template <> void GroupNormForward<float16, float, CUDAContext>(
);
} else if (data_format == "NHWC") {
_GroupNormForwardNHWC<half, float>
<< < CUDA_2D_BLOCKS(N * C), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_2D_BLOCKS(N * C), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
N, C, S,
reinterpret_cast<const half*>(x),
scale, bias,
......@@ -548,8 +548,8 @@ template <> void GroupNormBackward<float16, float, CUDAContext>(
auto nthreads = N * G * D * S;
if (data_format == "NCHW") {
_GroupNormWGradHalf<StorageOrder::NCHW>
<< < CUDA_2D_BLOCKS(G * D), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_2D_BLOCKS(G * D), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
N, G, D, S,
reinterpret_cast<const half*>(x),
mu, rsig,
......@@ -557,8 +557,8 @@ template <> void GroupNormBackward<float16, float, CUDAContext>(
dgamma, dbeta
);
_GroupNormInternalGradHalf<StorageOrder::NCHW>
<< < CUDA_2D_BLOCKS(N * G), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_2D_BLOCKS(N * G), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
N, G, D, S,
reinterpret_cast<const half*>(x),
gamma,
......@@ -566,8 +566,8 @@ template <> void GroupNormBackward<float16, float, CUDAContext>(
ds, db
);
_GroupNormGradHalf<StorageOrder::NCHW>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, G, D, S,
reinterpret_cast<const half*>(x),
mu, rsig, gamma, ds, db,
......@@ -576,8 +576,8 @@ template <> void GroupNormBackward<float16, float, CUDAContext>(
);
} else if (data_format == "NHWC") { \
_GroupNormWGradHalf<StorageOrder::NHWC>
<< < CUDA_2D_BLOCKS(G * D), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_2D_BLOCKS(G * D), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
N, G, D, S,
reinterpret_cast<const half*>(x),
mu, rsig,
......@@ -585,8 +585,8 @@ template <> void GroupNormBackward<float16, float, CUDAContext>(
dgamma, dbeta
);
_GroupNormInternalGradHalf<StorageOrder::NHWC>
<< < CUDA_2D_BLOCKS(N * G), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_2D_BLOCKS(N * G), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
N, G, D, S,
reinterpret_cast<const half*>(x),
gamma,
......@@ -594,8 +594,8 @@ template <> void GroupNormBackward<float16, float, CUDAContext>(
ds, db
);
_GroupNormGradHalf<StorageOrder::NHWC>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, G, D, S,
reinterpret_cast<const half*>(x),
mu, rsig, gamma, ds, db,
......
......@@ -58,13 +58,13 @@ template <> void LSTMCell<float, CUDAContext>(
auto o_offset = 2 * C, c_offset = 3 * C,
x_offset = 4 * C, NC = N * C;
_LSTMCellAct
<< < CUDA_BLOCKS(NC * 4), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(NC * 4), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
NC * 4, c_offset, x_offset, actx
);
_LSTMCellGate
<< < CUDA_BLOCKS(NC), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(NC), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
NC, C, o_offset, c_offset,
x_offset, cx, actx, c, h
);
......@@ -138,14 +138,14 @@ template <> void LSTMCellGrad<float, CUDAContext>(
auto o_offset = 2 * C, c_offset = 3 * C,
x_offset = 4 * C, NC = N * C;
_LSTMCellGateGrad
<< < CUDA_BLOCKS(NC), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(NC), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
NC, C, o_offset, c_offset, x_offset,
cx, actx, c, dc, dh, dcx, dx
);
_LSTMCellActGrad
<< < CUDA_BLOCKS(NC * 4), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(NC * 4), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
NC * 4, c_offset, x_offset, actx, dx
);
}
......
......@@ -39,8 +39,8 @@ template <> void AdamUpdate<float, CUDAContext>(
float* v,
CUDAContext* ctx) {
_AdamUpdate
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, lr, beta1, beta2, eps, g, m, v
);
}
......
......@@ -29,8 +29,8 @@ template <> void MixedPrecL2Decay<float16, CUDAContext>(
float* dx,
CUDAContext* ctx) {
_MixedPrecL2DecayHalf
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count,
alpha,
reinterpret_cast<const half*>(w),
......@@ -58,8 +58,8 @@ template <> void MixedPrecUpdate<float16, CUDAContext>(
float16* w,
CUDAContext* ctx) {
_MixedPrecUpdateHalf
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count,
updates,
reinterpret_cast<half*>(w)
......
......@@ -32,8 +32,8 @@ template <> void NesterovUpdate<float, CUDAContext>(
float* h,
CUDAContext* ctx) {
_NesterovUpdate
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, lr, momentum, g, h
);
}
......
......@@ -34,8 +34,8 @@ template <> void RMSPropUpdate<float, CUDAContext>(
float* h,
CUDAContext* ctx) {
_RMSPropUpdate
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, lr, decay, eps, g, h
);
}
......
......@@ -31,8 +31,8 @@ template <> void SGDUpdate<float, CUDAContext>(
float* h,
CUDAContext* ctx) {
_SGDUpdate
<< < CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(count), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
count, lr, momentum, g, h
);
}
......
......@@ -52,14 +52,14 @@ template<> void BiasAdd<float, CUDAContext>(
auto nthreads = outer_dim * axis_dim * inner_dim;
if (data_format == "NCHW") {
_BiasAddNCHW
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, axis_dim, inner_dim, bias, y
);
} else if (data_format == "NHWC") {
_BiasAddNHWC
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, axis_dim, bias, y
);
} else {
......
......@@ -109,15 +109,15 @@ template <> void BilinearResize<float, CUDAContext>(
auto scale_w = (float)W / (float)out_w;
if (data_format == "NCHW") {
_BilinearResizeNCHW
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, C, H, W, out_h, out_w,
scale_h, scale_w, x, y
);
} else if(data_format == "NHWC") {
_BilinearResizeNHWC
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, C, H, W, out_h, out_w,
scale_h, scale_w, x, y
);
......@@ -224,15 +224,15 @@ template <> void BilinearResizeGrad<float, CUDAContext>(
auto scale_w = (float)W / (float)out_w;
if (data_format == "NCHW") {
_BilinearResizeGradNCHW
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, C, H, W, out_h, out_w,
scale_h, scale_w, dy, dx
);
} else if(data_format == "NHWC") {
_BilinearResizeGradNHWC
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, C, H, W, out_h, out_w,
scale_h, scale_w, dy, dx
);
......
......@@ -123,8 +123,8 @@ template <> void Im2Col2d<float, CUDAContext>(
auto nthreads = C * out_h * out_w;
if (data_format == "NCHW") {
_Im2Col2dNCHW
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
H, W,
out_h, out_w,
......@@ -136,8 +136,8 @@ template <> void Im2Col2d<float, CUDAContext>(
);
} else if (data_format == "NHWC") {
_Im2Col2dNHWC
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
......@@ -286,8 +286,8 @@ template <> void Col2Im2d<float, CUDAContext>(
const int nthreads = C * H * W;
if (data_format == "NCHW") {
_Col2Im2dNCHW
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
H, W,
out_h, out_w,
......@@ -299,8 +299,8 @@ template <> void Col2Im2d<float, CUDAContext>(
);
} else if (data_format == "NHWC") {
_Col2Im2dNHWC
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
......
......@@ -144,8 +144,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>(
if (data_format == "NCHW") {
if (kernel_h == 3 && kernel_w == 3) {
_DepthwiseConv2dNCHW<float, 3, 3>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
......@@ -157,8 +157,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>(
);
} else if (kernel_h == 5 && kernel_w == 5) {
_DepthwiseConv2dNCHW<float, 5, 5>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
......@@ -170,8 +170,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>(
);
} else if (kernel_h == 7 && kernel_w == 7) {
_DepthwiseConv2dNCHW<float, 7, 7>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
......@@ -183,8 +183,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>(
);
} else {
_DepthwiseConv2dNCHW<float, -1, -1>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
......@@ -198,8 +198,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>(
} else if (data_format == "NHWC") {
if (kernel_h == 3 && kernel_w == 3) {
_DepthwiseConv2dNHWC<float, 3, 3>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
......@@ -211,8 +211,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>(
);
} else if (kernel_h == 5 && kernel_w == 5) {
_DepthwiseConv2dNHWC<float, 5, 5>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
......@@ -224,8 +224,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>(
);
} else if (kernel_h == 7 && kernel_w == 7) {
_DepthwiseConv2dNHWC<float, 7, 7>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
......@@ -237,8 +237,8 @@ template <> void DepthwiseConv2d<float, CUDAContext>(
);
} else {
_DepthwiseConv2dNHWC<float, -1, -1>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
......@@ -394,8 +394,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>(
if (data_format == "NCHW") {
if (kernel_h == 3 && kernel_w == 3) {
_DepthwiseConv2dGradNCHW<float, 3, 3>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
......@@ -407,8 +407,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>(
);
} else if (kernel_h == 5 && kernel_w == 5) {
_DepthwiseConv2dGradNCHW<float, 5, 5>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
......@@ -420,8 +420,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>(
);
} else if (kernel_h == 7 && kernel_w == 7) {
_DepthwiseConv2dGradNCHW<float, 7, 7>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
......@@ -433,8 +433,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>(
);
} else {
_DepthwiseConv2dGradNCHW<float, -1, -1>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
......@@ -448,8 +448,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>(
} else if (data_format == "NHWC") {
if (kernel_h == 3 && kernel_w == 3) {
_DepthwiseConv2dGradNHWC<float, 3, 3>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
......@@ -461,8 +461,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>(
);
} else if (kernel_h == 5 && kernel_w == 5) {
_DepthwiseConv2dGradNHWC<float, 5, 5>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
......@@ -474,8 +474,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>(
);
} else if (kernel_h == 7 && kernel_w == 7) {
_DepthwiseConv2dGradNHWC<float, 7, 7>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
......@@ -487,8 +487,8 @@ template <> void DepthwiseConv2dGrad<float, CUDAContext>(
);
} else {
_DepthwiseConv2dGradNHWC<float, -1, -1>
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
out_h, out_w,
......@@ -634,8 +634,8 @@ template <> void DepthwiseConv2dWGrad<float, CUDAContext>(
auto nblocks = C * kernel_h * kernel_w;
if (data_format == "NCHW") {
_DepthwiseConv2dWGradNCHW
<< < nblocks, nthreads,
0, ctx->cuda_stream() >> >(
<<< nblocks, nthreads,
0, ctx->cuda_stream() >>>(
N, C, H, W,
out_h, out_w,
kernel_h, kernel_w,
......@@ -646,8 +646,8 @@ template <> void DepthwiseConv2dWGrad<float, CUDAContext>(
);
} else if (data_format == "NHWC") {
_DepthwiseConv2dWGradNHWC
<< < nblocks, nthreads,
0, ctx->cuda_stream() >> >(
<<< nblocks, nthreads,
0, ctx->cuda_stream() >>>(
N, C, H, W,
out_h, out_w,
kernel_h, kernel_w,
......
......@@ -77,16 +77,12 @@ template <> void DropBlock2d<CUDAContext>(
int* mask,
CUDAContext* ctx) {
auto nthreads = N * C * seed_h * seed_w;
math::RandomUniform(
nthreads,
0.f, float(UINT_MAX),
seed, ctx
);
math::RandomUniform(nthreads, 0.f, 1.f, seed, ctx);
auto mask_thresh = (uint32_t)(UINT_MAX * gamma);
if (data_format == "NCHW") {
_DropBlock2dNCHW
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
seed_h, seed_w,
......@@ -96,8 +92,8 @@ template <> void DropBlock2d<CUDAContext>(
);
} else if(data_format == "NHWC") {
_DropBlock2dNHWC
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
seed_h, seed_w,
......
......@@ -81,15 +81,15 @@ template <> void NNResize<float, CUDAContext>(
auto scale_w = (float)W / (float)out_w;
if (data_format == "NCHW") {
_NNResizeNCHW
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, C, H, W, out_h, out_w,
scale_h, scale_w, x, y
);
} else if(data_format == "NHWC") {
_NNResizeNHWC
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, C, H, W, out_h, out_w,
scale_h, scale_w, x, y
);
......@@ -116,8 +116,8 @@ template <> void NNResize<float16, CUDAContext>(
auto scale_w = (float)W / (float)out_w;
if (data_format == "NCHW") {
_NNResizeNCHW
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, C, H, W,
out_h, out_w, scale_h, scale_w,
reinterpret_cast<const half*>(x),
......@@ -125,8 +125,8 @@ template <> void NNResize<float16, CUDAContext>(
);
} else if(data_format == "NHWC") {
_NNResizeNHWC
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, C, H, W,
out_h, out_w, scale_h, scale_w,
reinterpret_cast<const half*>(x),
......@@ -209,15 +209,15 @@ template <> void NNResizeGrad<float, CUDAContext>(
auto scale_w = (float)W / (float)out_w;
if (data_format == "NCHW") {
_NNResizeGradNCHW
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, C, H, W, out_h, out_w,
scale_h, scale_w, dy, dx
);
} else if(data_format == "NHWC") {
_NNResizeGradNHWC
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads, C, H, W, out_h, out_w,
scale_h, scale_w, dy, dx
);
......
......@@ -120,8 +120,8 @@ template<> void MaxPool2d<float, CUDAContext>(
auto nthreads = N * C * pool_h * pool_w;
if (data_format == "NCHW") {
_MaxPool2dNCHW
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
......@@ -132,8 +132,8 @@ template<> void MaxPool2d<float, CUDAContext>(
);
} else if (data_format == "NHWC") {
_MaxPool2dNHWC
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
......@@ -256,8 +256,8 @@ template<> void AvgPool2d<float, CUDAContext>(
auto nthreads = N * C * pool_h * pool_w;
if (data_format == "NCHW") {
_AvgPool2dNCHW
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
......@@ -268,8 +268,8 @@ template<> void AvgPool2d<float, CUDAContext>(
);
} else if (data_format == "NHWC") {
_AvgPool2dNHWC
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
......@@ -392,8 +392,8 @@ template<> void MaxPool2dGrad<float, CUDAContext>(
auto nthreads = N * C * H * W;
if (data_format == "NCHW") {
_MaxPool2dGrad_NCHW
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
......@@ -404,8 +404,8 @@ template<> void MaxPool2dGrad<float, CUDAContext>(
);
} else if (data_format == "NHWC") {
_MaxPool2dGradNHWC
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
......@@ -531,8 +531,8 @@ template<> void AvgPool2dGrad<float, CUDAContext>(
auto nthreads = N * C * H * W;
if (data_format == "NCHW") {
_AvgPool2dGradNCHW
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
......@@ -543,8 +543,8 @@ template<> void AvgPool2dGrad<float, CUDAContext>(
);
} else if (data_format == "NHWC") {
_AvgPool2dGradNHWC
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
......
......@@ -132,8 +132,8 @@ template<> void ROIAlign<float, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = num_rois * C * pool_h * pool_w;
_ROIAlign
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
......@@ -283,8 +283,8 @@ template<> void ROIAlignGrad<float, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = num_rois * C * pool_h * pool_w;
_ROIAlignGrad
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
......
......@@ -134,8 +134,8 @@ template<> void ROIAlign<float16, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = num_rois * C * pool_h * pool_w;
_ROIAlignHalf
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>
(nthreads, C, H, W, pool_h, pool_w,
sampling_ratio, spatial_scale,
reinterpret_cast<const half*>(x), rois,
......
......@@ -92,8 +92,8 @@ template<> void ROIPool<float, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = num_rois * C * pool_h * pool_w;
_ROIPool
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
......@@ -185,8 +185,8 @@ template<> void ROIPool<float16, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = num_rois * C * pool_h * pool_w;
_ROIPoolHalf
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
C, H, W,
pool_h, pool_w,
......@@ -286,8 +286,8 @@ template<> void ROIPoolGrad<float, CUDAContext>(
CUDAContext* ctx) {
auto nthreads = N * C * H * W;
_ROIPoolGrad
<< < CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >> >(
<<< CUDA_BLOCKS(nthreads), CUDA_THREADS,
0, ctx->cuda_stream() >>>(
nthreads,
num_rois,
C, H, W,
......
......@@ -180,9 +180,9 @@ ONNXBackend::get_special_nodes() const {
}; return kSpecialNodes;
}
const Map< string, Map<string, string> >&
const Map<string, Map<string, string>>&
ONNXBackend::get_node_renamed_attrs() const {
const static Map< string, Map<string, string> >
const static Map<string, Map<string, string>>
kPerNodeRenamedAttrs = {
{ "Gemm", { { "transB", "transW" } } },
{ "BatchNormalization", { { "epsilon", "eps" } } },
......
......@@ -221,7 +221,7 @@ class ONNXBackend {
const Map<string, SpecialNodeConverter>& get_special_nodes() const;
const Map<string, string>& get_renamed_attrs() const;
const Map< string, Map<string, string> >& get_node_renamed_attrs() const;
const Map<string, Map<string, string>>& get_node_renamed_attrs() const;
};
} // namespace onnx
......
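The `Map< string, Map<string, string> >` spellings above are tightened to `Map<string, Map<string, string>>`: since C++11, two consecutive `>` tokens close nested template-argument lists instead of being parsed as a right-shift operator, so the inner space is no longer required. A small stand-alone illustration (standard-library types only, not taken from this commit):

    #include <map>
    #include <string>
    #include <vector>

    // Pre-C++11 style: the space in "> >" was mandatory to avoid the ">>" token.
    std::map<std::string, std::vector<std::string> > legacy_table;

    // C++11 and later: ">>" closes both argument lists, matching the new spelling.
    std::map<std::string, std::vector<std::string>> table;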
......@@ -77,15 +77,8 @@ template <class Context>
void CuDNNDropoutOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) {
RunImpl<float>();
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
DispatchHelper<TensorTypes
<float, float16>>::Call(this, X(0));
}
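From this hunk onward, the per-operator if/else dtype chains are collapsed into a single call to the DispatchHelper introduced by this commit. A hedged sketch of the call pattern only (the operator below is hypothetical; the call mirrors the lines above and assumes the surrounding Dragon operator machinery):

    // Hypothetical operator, shown only to illustrate the call pattern.
    template <class Context>
    void MyUnaryOp<Context>::RunOnDevice() {
        Y(0)->ReshapeLike(X(0));
        // Inspects X(0)'s runtime dtype and forwards to RunImpl<float>() or
        // RunImpl<float16>(); any other dtype triggers a FATAL log listing
        // the supported types, replacing the hand-written else-branches.
        DispatchHelper<TensorTypes
            <float, float16>>::Call(this, X(0));
    }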
template <class Context> template <typename T>
......@@ -147,15 +140,8 @@ template <class Context>
void CuDNNDropoutGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) {
RunImpl<float>();
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
DispatchHelper<TensorTypes
<float, float16>>::Call(this, X(0));
}
DEPLOY_CUDNN(Dropout);
......
......@@ -26,15 +26,8 @@ template <class Context>
void CuDNNEluOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) {
RunImpl<float>();
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
DispatchHelper<TensorTypes
<float, float16>>::Call(this, X(0));
}
template <class Context> template <typename T>
......@@ -60,15 +53,8 @@ template <class Context>
void CuDNNEluGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) {
RunImpl<float>();
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
DispatchHelper<TensorTypes
<float, float16>>::Call(this, X(0));
}
DEPLOY_CUDNN(Elu);
......
......@@ -40,15 +40,8 @@ void CuDNNReluOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) {
RunImpl<float>();
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
DispatchHelper<TensorTypes
<float, float16>>::Call(this, X(0));
}
template <class Context> template <typename T>
......@@ -92,15 +85,8 @@ void CuDNNReluGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) {
RunImpl<float>();
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
DispatchHelper<TensorTypes
<float, float16>>::Call(this, X(0));
}
DEPLOY_CUDNN(Relu);
......
......@@ -35,15 +35,8 @@ template <class Context>
void CuDNNSigmoidOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) {
RunImpl<float>();
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
DispatchHelper<TensorTypes
<float, float16>>::Call(this, X(0));
}
template <class Context> template <typename T>
......@@ -82,15 +75,8 @@ template <class Context>
void CuDNNSigmoidGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) {
RunImpl<float>();
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
DispatchHelper<TensorTypes
<float, float16>>::Call(this, X(0));
}
DEPLOY_CUDNN(Sigmoid);
......
......@@ -45,15 +45,8 @@ void CuDNNSoftmaxOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) {
RunImpl<float>();
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
DispatchHelper<TensorTypes
<float, float16>>::Call(this, X(0));
}
template <class Context> template <typename T>
......@@ -91,15 +84,8 @@ void CuDNNSoftmaxGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) {
RunImpl<float>();
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
DispatchHelper<TensorTypes
<float, float16>>::Call(this, X(0));
}
DEPLOY_CUDNN(Softmax);
......
......@@ -35,15 +35,8 @@ template <class Context>
void CuDNNTanhOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) {
RunImpl<float>();
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
DispatchHelper<TensorTypes
<float, float16>>::Call(this, X(0));
}
template <class Context> template <typename T>
......@@ -82,15 +75,8 @@ template <class Context>
void CuDNNTanhGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) {
RunImpl<float>();
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
DispatchHelper<TensorTypes
<float, float16>>::Call(this, X(0));
}
DEPLOY_CUDNN(Tanh);
......
......@@ -44,15 +44,8 @@ template <class Context>
void DropoutOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) {
RunImpl<float>();
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
DispatchHelper<TensorTypes
<float, float16>>::Call(this, X(0));
}
template <class Context> template <typename T>
......@@ -83,15 +76,8 @@ template <class Context>
void DropoutGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) {
RunImpl<float>();
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
DispatchHelper<TensorTypes
<float, float16>>::Call(this, X(0));
}
DEPLOY_CPU(Dropout);
......
......@@ -52,15 +52,8 @@ void DropPathOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) {
RunImpl<float>();
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
DispatchHelper<TensorTypes
<float, float16>>::Call(this, X(0));
}
template <class Context> template <typename T>
......@@ -97,15 +90,8 @@ void DropPathGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) {
RunImpl<float>();
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
DispatchHelper<TensorTypes
<float, float16>>::Call(this, X(0));
}
DEPLOY_CPU(DropPath);
......
......@@ -20,13 +20,8 @@ template <class Context>
void EluOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) {
RunImpl<float>();
} else {
LOG(FATAL) << DTypeString(
X(0), { "float32" }
);
}
DispatchHelper<TensorTypes
<float>>::Call(this, X(0));
}
template <class Context> template <typename T>
......@@ -46,13 +41,8 @@ template <class Context>
void EluGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) {
RunImpl<float>();
} else {
LOG(FATAL) << DTypeString(
X(0), { "float32" }
);
}
DispatchHelper<TensorTypes
<float>>::Call(this, X(0));
}
DEPLOY_CPU(Elu);
......
......@@ -40,13 +40,8 @@ void PReluOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) {
RunImpl<float>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32" }
);
}
DispatchHelper<TensorTypes
<float>>::Call(this, X(0));
}
template <class Context> template <typename T>
......@@ -98,13 +93,8 @@ void PReluGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0));
Y(1)->ReshapeLike(X(1));
if (XIsType(X(0), float)) {
RunImpl<float>();
} else {
LOG(FATAL) << DTypeString(
X(0), { "float32" }
);
}
DispatchHelper<TensorTypes
<float>>::Call(this, X(0));
}
DEPLOY_CPU(PRelu);
......
......@@ -20,15 +20,8 @@ template <class Context>
void ReluOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) {
RunImpl<float>();
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
DispatchHelper<TensorTypes
<float, float16>>::Call(this, X(0));
}
template <class Context> template <typename T>
......@@ -48,15 +41,8 @@ template <class Context>
void ReluGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) {
RunImpl<float>();
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
DispatchHelper<TensorTypes
<float, float16>>::Call(this, X(0));
}
DEPLOY_CPU(Relu);
......
......@@ -19,15 +19,8 @@ template <class Context>
void SEluOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) {
RunImpl<float>();
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
DispatchHelper<TensorTypes
<float, float16>>::Call(this, X(0));
}
template <class Context> template <typename T>
......@@ -47,15 +40,8 @@ template <class Context>
void SEluGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) {
RunImpl<float>();
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
DispatchHelper<TensorTypes
<float, float16>>::Call(this, X(0));
}
DEPLOY_CPU(SElu);
......
......@@ -15,13 +15,8 @@ template <class Context>
void SigmoidOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) {
RunImpl<float>();
} else {
LOG(FATAL) << DTypeString(
X(0), { "float32" }
);
}
DispatchHelper<TensorTypes
<float>>::Call(this, X(0));
}
template <class Context> template <typename T>
......@@ -41,13 +36,8 @@ template <class Context>
void SigmoidGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) {
RunImpl<float>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32" }
);
}
DispatchHelper<TensorTypes
<float>>::Call(this, X(0));
}
DEPLOY_CPU(Sigmoid);
......
......@@ -43,13 +43,8 @@ void SoftmaxOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) {
RunImpl<float>();
} else {
LOG(FATAL) << DTypeString(
X(0), { "float32" }
);
}
DispatchHelper<TensorTypes
<float>>::Call(this, X(0));
}
template <class Context> template <typename T>
......@@ -86,13 +81,8 @@ void SoftmaxGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) {
RunImpl<float>();
} else {
LOG(FATAL) << DTypeString(
X(0), { "float32" }
);
}
DispatchHelper<TensorTypes
<float>>::Call(this, X(0));
}
DEPLOY_CPU(Softmax);
......
......@@ -15,13 +15,8 @@ template <class Context>
void TanhOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) {
RunImpl<float>();
} else {
LOG(FATAL) << DTypeString(
X(0), { "float32" }
);
}
DispatchHelper<TensorTypes
<float>>::Call(this, X(0));
}
template <class Context> template <typename T>
......@@ -41,13 +36,8 @@ template <class Context>
void TanhGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) {
RunImpl<float>();
} else {
LOG(FATAL) << DTypeString(
X(0), { "float32" }
);
}
DispatchHelper<TensorTypes
<float>>::Call(this, X(0));
}
DEPLOY_CPU(Tanh);
......
......@@ -46,15 +46,8 @@ void AffineOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float)) {
RunImpl<float>();
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float32", "float16" }
);
}
DispatchHelper<TensorTypes
<float, float16>>::Call(this, X(0));
}
template <class Context> template <typename T>
......@@ -111,9 +104,7 @@ void AffineGradientOp<Context>::RunImpl() {
}
template <class Context> template <typename T>
void AffineGradientOp<Context>::Reduce(
T* x,
T* y) {
void AffineGradientOp<Context>::Reduce(T* x, T* y) {
vec32_t dims = {
(int)outer_dim_,
(int)scale_dim_,
......@@ -138,15 +129,8 @@ void AffineGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(-1));
if (XIsType(X(-1), float)) {
RunImpl<float>();
} else if (XIsType(X(-1), float16)) {
RunImpl<float16>();
} else {
LOG(FATAL) << DTypeString(X(-1),
{ "float32", "float16" }
);
}
DispatchHelper<TensorTypes
<float, float16>>::Call(this, X(-1));
}
DEPLOY_CPU(Affine);
......
......@@ -108,13 +108,6 @@ void CuDNNAffineOp<Context>::RunOnDevice() {
template <class Context> template <typename DT, typename CT>
void CuDNNAffineGradientOp<Context>::RunImpl() {
this->template ResetDesc<DT>(X(-1));
scale_dim_ = X(1).count();
outer_dim_ = X(-1).count(0, axis_);
inner_dim_ = X(-1).count(axis_ + num_axes_);
dim_ = scale_dim_ * inner_dim_;
reduce_dim_ = std::max(outer_dim_, inner_dim_);
Y(0)->ReshapeLike(X(-1));
auto* alpha = X(1).template data<DT, Context>();
auto* dy = X(-1).template mutable_data<DT, Context>();
......@@ -230,9 +223,7 @@ void CuDNNAffineGradientOp<Context>::CuDNNReduce(
}
template <class Context> template <typename T>
void CuDNNAffineGradientOp<Context>::Reduce(
T* x,
T* y) {
void CuDNNAffineGradientOp<Context>::Reduce(T* x, T* y) {
vec32_t dims = {
(int)outer_dim_,
(int)scale_dim_,
......@@ -248,6 +239,14 @@ void CuDNNAffineGradientOp<Context>::Reduce(
template <class Context>
void CuDNNAffineGradientOp<Context>::RunOnDevice() {
scale_dim_ = X(1).count();
outer_dim_ = X(-1).count(0, axis_);
inner_dim_ = X(-1).count(axis_ + num_axes_);
dim_ = scale_dim_ * inner_dim_;
reduce_dim_ = std::max(outer_dim_, inner_dim_);
Y(0)->ReshapeLike(X(-1));
if (XIsType(X(-1), float)) {
RunImpl<float, float>();
} else if (XIsType(X(-1), float16)) {
......
......@@ -36,6 +36,13 @@ void EltwiseOp<Context>::ProdRunImpl() {
template <class Context> template <typename T>
void EltwiseOp<Context>::RunImpl() {
if (operation_ == "SUM") SumRunImpl<T>();
else if (operation_ == "PROD") ProdRunImpl<T>();
    else LOG(FATAL) << "Unknown Operation: " << operation_;
}
template <class Context>
void EltwiseOp<Context>::RunOnDevice() {
for (int i = 1; i < XSize(); i++) {
CHECK(X(i).dims() == X(0).dims())
<< "\nExcepted Input(" << i << ")'s dims as "
......@@ -45,33 +52,10 @@ void EltwiseOp<Context>::RunImpl() {
Y(0)->ReshapeLike(X(0));
if (operation_ == "SUM") SumRunImpl<T>();
else if (operation_ == "PROD") ProdRunImpl<T>();
    else LOG(FATAL) << "Unknown Operation: " << operation_;
}
template <class Context>
void EltwiseOp<Context>::RunOnDevice() {
if (XIsType(X(0), int8_t)) {
RunImpl<int8_t>();
} else if (XIsType(X(0), uint8_t)) {
RunImpl<uint8_t>();
} else if (XIsType(X(0), int)) {
RunImpl<int>();
} else if (XIsType(X(0), int64_t)) {
RunImpl<int64_t>();
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else if (XIsType(X(0), float)) {
RunImpl<float>();
} else if (XIsType(X(0), double)) {
RunImpl<double>();
} else {
LOG(FATAL) << DTypeString(X(0), {
"int8", "uint8", "int32", "int64",
"float16", "float32", "float64",
});
}
DispatchHelper<TensorTypes
<int8_t, uint8_t, int, int64_t,
float16, float, double>
>::Call(this, X(0));
}
template <class Context> template <typename T>
......@@ -133,26 +117,10 @@ void EltwiseGradientOp<Context>::RunImpl() {
template <class Context>
void EltwiseGradientOp<Context>::RunOnDevice() {
if (XIsType(X(0), int8_t)) {
RunImpl<int8_t>();
} else if (XIsType(X(0), uint8_t)) {
RunImpl<uint8_t>();
} else if (XIsType(X(0), int)) {
RunImpl<int>();
} else if (XIsType(X(0), int64_t)) {
RunImpl<int64_t>();
} else if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else if (XIsType(X(0), float)) {
RunImpl<float>();
} else if (XIsType(X(0), double)) {
RunImpl<double>();
} else {
LOG(FATAL) << DTypeString(X(0), {
"int8", "uint8", "int32", "int64",
"float16", "float32", "float64",
});
}
DispatchHelper<TensorTypes
<int8_t, uint8_t, int, int64_t,
float16, float, double>
>::Call(this, X(0));
}
DEPLOY_CPU(Eltwise);
......
......@@ -15,17 +15,9 @@ template <class Context>
void ExpOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else if (XIsType(X(0), float)) {
RunImpl<float>();
} else if (XIsType(X(0), double)) {
RunImpl<double>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float16", "float32", "float64" }
);
}
DispatchHelper<TensorTypes
<float, float16, double>
>::Call(this, X(0));
}
template <class Context> template <typename T>
......@@ -40,17 +32,9 @@ template <class Context>
void ExpGradientOp<Context>::RunOnDevice() {
Y(0)->ReshapeLike(X(0));
if (XIsType(X(0), float16)) {
RunImpl<float16>();
} else if (XIsType(X(0), float)) {
RunImpl<float>();
} else if (XIsType(X(0), double)) {
RunImpl<double>();
} else {
LOG(FATAL) << DTypeString(X(0),
{ "float16", "float32", "float64" }
);
}
DispatchHelper<TensorTypes
<float, float16, double>
>::Call(this, X(0));
}
DEPLOY_CPU(Exp);
......